def _recommendation_record(recommendation, resource, tasks, jobs, input_space):
    """Build the document that main() saves under 'recommendations'.

    Args:
        recommendation: mapping returned by ``chooser.best()``; the keys read
            here are ``model_model_*``, ``obser_obser_*`` and ``obser_model_*``
            (``_input`` / ``_value`` for each).
        resource: resource whose ``numComplete(jobs)`` is recorded.
        tasks: mapping of task name -> task object (each exposing
            ``numComplete(jobs)``).
        jobs: job list forwarded to the ``numComplete`` calls.
        input_space: object whose ``paramify`` converts an input point into
            the stored parameter representation.

    Returns:
        dict ready to be passed to ``db.save``.
    """
    def paramify_or_none(point):
        # The observation-based inputs may be None; those are stored as None
        # instead of being passed to paramify.
        return None if point is None else input_space.paramify(point)

    num_complete_by_task = {
        task_name: task.numComplete(jobs)
        for task_name, task in tasks.iteritems()
    }
    return {
        'num_complete': resource.numComplete(jobs),
        'num_complete_tasks': num_complete_by_task,
        'params': input_space.paramify(recommendation['model_model_input']),
        'objective': recommendation['model_model_value'],
        'params_o': paramify_or_none(recommendation['obser_obser_input']),
        'obj_o': recommendation['obser_obser_value'],
        'params_om': paramify_or_none(recommendation['obser_model_input']),
        'obj_om': recommendation['obser_model_value'],
    }


def main():
    """Command-line entry point: run the suggest/dispatch loop for an experiment.

    Parses ``[options] directory`` from the command line, reads the config
    file in that directory, then loops forever: for every resource that is
    accepting jobs it fits the chooser, stores a recommendation, asks the
    chooser for a suggestion, saves it as a job and dispatches it.  The
    function returns once every resource reports ``maxCompleteReached`` or
    any task does; otherwise it sleeps ``options['polling_time']`` between
    passes while no resource is free.

    Raises:
        Exception: if the experiment directory does not exist.
        SystemExit: (via ``parser.error``) if no directory argument is given.
    """
    parser = optparse.OptionParser(usage="usage: %prog [options] directory")
    parser.add_option("--config", dest="config_file",
                      help="Configuration file name.",
                      type="string", default="config.json")
    parser.add_option("--no-output", action="store_true",
                      help="Do not create output files.")
    parser.add_option("--repeat", dest="repeat",
                      help="Used for repeating the same experiment many times.",
                      type="int", default=-1)
    (commandline_kwargs, args) = parser.parse_args()

    # Fail with a usage message instead of an IndexError on args[0].
    if not args:
        parser.error("missing required argument: directory")

    # Read in the config file
    expt_dir = os.path.realpath(args[0])
    if not os.path.isdir(expt_dir):
        raise Exception("Cannot find directory %s" % expt_dir)

    options = parse_config_file(expt_dir, commandline_kwargs.config_file)
    experiment_name = options["experiment-name"]

    # Special advanced feature for repeating the same experiment many times
    if commandline_kwargs.repeat >= 0:
        experiment_name = repeat_experiment_name(experiment_name,
                                                 commandline_kwargs.repeat)

    if not commandline_kwargs.no_output:  # if we want output
        if commandline_kwargs.repeat >= 0:
            output_directory = repeat_output_dir(expt_dir,
                                                 commandline_kwargs.repeat)
        else:
            output_directory = os.path.join(expt_dir, 'output',
                                            options["experiment-name"])
        if not os.path.isdir(output_directory):
            # makedirs, not mkdir: the intermediate 'output' directory may
            # not exist yet.
            os.makedirs(output_directory)

        # Only non-repeated runs get a main.log file handler.
        if commandline_kwargs.repeat < 0:
            rootLogger = logging.getLogger()
            fileHandler = logging.FileHandler(
                os.path.join(output_directory, 'main.log'))
            fileHandler.setFormatter(logFormatter)
            fileHandler.setLevel(logLevel)
            rootLogger.addHandler(fileHandler)
    else:
        output_directory = None

    input_space = InputSpace(options["variables"])
    resources = parse_resources_from_config(options)

    # Load up the chooser.
    chooser_module = importlib.import_module('spearmint.choosers.'
                                             + options['chooser'])
    chooser = chooser_module.init(input_space, options)

    # Connect to the database
    db_address = options['database']['address']
    db = MongoDB(database_address=db_address)

    # The environment variable takes precedence over the config entry.
    # NOTE(review): maxiterations is not read anywhere else in this function.
    env_max_iterations = os.getenv('SPEARMINT_MAX_ITERATIONS')
    if env_max_iterations is not None:
        maxiterations = int(env_max_iterations)
    elif 'max_iterations' in options:
        maxiterations = options['max_iterations']
    else:
        maxiterations = DEFAULT_MAX_ITERATIONS

    # Set random seed
    if 'random_seed' in options:
        np.random.seed(int(options['random_seed']))
        seed(int(options['random_seed']))

    waiting_for_results = False  # for printing purposes only

    # `tasks` is otherwise only bound once a resource accepts a job; give the
    # termination check below something defined to look at until then.
    tasks = {}

    while True:
        for resource_name, resource in resources.iteritems():
            jobs = load_jobs(db, experiment_name)

            # TODO: here cost will eventually also be considered: even if the
            # resource is not full, we might wait because of cost incurred
            # Note: I could chose to fill up one resource and them move on to
            # the next ("if"). You could also do it the other way, by changing
            # "if" to "while" here

            # Remove any broken jobs from pending
            # note: make sure to do this before the acceptingJobs() condition
            # is checked
            remove_broken_jobs(db, jobs, experiment_name, resources)

            if resource.acceptingJobs(jobs):
                if waiting_for_results:
                    logging.info('\n')
                waiting_for_results = False

                optim_start_time = time.time()

                # Load jobs from DB
                # (move out of one or both loops?) would need to pass into
                # load_tasks
                jobs = load_jobs(db, experiment_name)

                # Print out a list of broken jobs
                print_broken_jobs(jobs)

                # Get a suggestion for the next job
                tasks = parse_tasks_from_jobs(jobs, experiment_name, options,
                                              input_space)

                # Special case when coupled and there is a NaN task-- what to
                # do with NaN task when decoupled??
                if 'NaN' in tasks and 'NaN' not in resource.tasks:
                    resource.tasks.append('NaN')

                # Load the model hypers from the database.
                hypers = db.load(experiment_name, 'hypers')

                # "Fit" the chooser - give the chooser data and let it fit the
                # model(s).
                # NOTE: even if we are only suggesting for 1 task, we need to
                # fit all of them because the acquisition function for one
                # task depends on all the tasks
                hypers = chooser.fit(tasks, hypers)

                if hypers:
                    logging.debug('GP covariance hyperparameters:')
                    print_hypers(hypers)
                    # Save the hyperparameters to the database.
                    db.save(hypers, experiment_name, 'hypers')

                # Compute the best value so far, a.k.a. the "recommendation",
                # and save it in the DB
                recommendation = chooser.best()
                db.save(_recommendation_record(recommendation, resource,
                                               tasks, jobs, input_space),
                        experiment_name, 'recommendations',
                        {'id': len(jobs)})

                # Get the decoupling groups
                task_couplings = {
                    task_name: tasks[task_name].options["group"]
                    for task_name in resource.tasks
                }

                logging.info('\nGetting suggestion for %s...\n'
                             % (', '.join(task_couplings.keys())))

                # Get the next suggested experiment from the chooser.
                suggested_input, suggested_tasks = chooser.suggest(
                    task_couplings, optim_start_time)
                suggested_task = suggested_tasks[0]  # hack, deal with later

                suggested_job = {
                    'id': len(jobs) + 1,
                    'params': input_space.paramify(suggested_input),
                    'expt_dir': options['main_file_path'],
                    'tasks': suggested_tasks,
                    'resource': resource_name,
                    'main-file': resource.main_file,
                    'language': options['tasks'][suggested_task]['language'],
                    'status': 'new',
                    'submit time': time.time(),
                    'start time': None,
                    'end time': None,
                }

                save_job(suggested_job, db, experiment_name)

                # Submit the job to the appropriate resource
                process_id = resource.attemptDispatch(
                    experiment_name, suggested_job, db_address, expt_dir,
                    output_directory)

                # Print the current time
                logging.info(
                    'Current time: %s'
                    % datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

                # Set the status of the job appropriately (successfully
                # submitted or not)
                if process_id is None:
                    suggested_job['status'] = 'broken'
                    # Report the id of the job that was just built; there is
                    # no separate `job` variable in this scope.
                    logging.info(
                        'Job %s failed -- check output file for details.'
                        % suggested_job['id'])
                    save_job(suggested_job, db, experiment_name)
                else:
                    suggested_job['status'] = 'pending'
                    suggested_job['proc_id'] = process_id
                    save_job(suggested_job, db, experiment_name)

                jobs = load_jobs(db, experiment_name)

                # Print out the status of the resources
                print_resources_status(resources.values(), jobs)

                if len(set(task_couplings.values())) > 1:  # if decoupled
                    print_tasks_status(tasks.values(), jobs)

                # For debug - print pending jobs
                print_pending_jobs(jobs)

        # Terminate the optimization if all resources are finished (run max
        # number of jobs) or ANY task is finished (just my weird convention).
        # Lists (not generators) so every maxCompleteReached call still runs;
        # the loop names differ from `resource` because Python 2 list
        # comprehensions leak their loop variable into this scope.
        all_resources_done = all(
            [res.maxCompleteReached(jobs) for res in resources.values()])
        if all_resources_done or any(
                [tsk.maxCompleteReached(jobs) for tsk in tasks.values()]):
            # Do all this extra work just to save the final recommendation --
            # would be ok to delete everything in here and just "return"
            sys.stdout.write('\n')

            jobs = load_jobs(db, experiment_name)
            tasks = parse_tasks_from_jobs(jobs, experiment_name, options,
                                          input_space)
            hypers = db.load(experiment_name, 'hypers')
            hypers = chooser.fit(tasks, hypers)
            if hypers:
                db.save(hypers, experiment_name, 'hypers')

            recommendation = chooser.best()

            # NOTE: `resource` is whatever the for-loop above left bound,
            # i.e. the last resource iterated.
            db.save(_recommendation_record(recommendation, resource, tasks,
                                           jobs, input_space),
                    experiment_name, 'recommendations', {'id': len(jobs)})

            logging.info('Maximum number of jobs completed. Have a nice day.')
            return

        # If no resources are accepting jobs, sleep
        if no_free_resources(db, experiment_name, resources):
            # Don't use logging here because it's too much effort to use
            # logging without a newline at the end
            sys.stdout.write('Waiting for results...'
                             if not waiting_for_results else '.')
            sys.stdout.flush()
            waiting_for_results = True
            time.sleep(options['polling_time'])
        else:
            sys.stdout.write('\n')
def main(expt_dir, config_file="config.json", no_output=False, repeat=-1):
    """Run the suggest/dispatch loop for the experiment in ``expt_dir``.

    Loops forever: for every resource that is accepting jobs it fits the
    chooser, optionally stores a recommendation, asks the chooser for a
    suggestion, saves it as a job and dispatches it.  The function returns
    once all resources report ``maxCompleteReached``, any task does, or
    ``options['max_time_mins']`` minutes have elapsed since start-up; before
    returning it stores final recommendation(s) according to
    ``options['recommendations']`` ("during", "end-one" or "end-all").

    Args:
        expt_dir: experiment directory containing the config file.
        config_file: name of the configuration file inside ``expt_dir``.
        no_output: if true, no output directory or log file is created and
            ``None`` is passed as the output directory on dispatch.
        repeat: when >= 0, the experiment name and output directory are
            derived via ``repeat_experiment_name`` / ``repeat_output_dir``
            and no ``main.log`` handler is installed.

    Raises:
        Exception: if ``expt_dir`` is not an existing directory.
    """
    if not os.path.isdir(expt_dir):
        raise Exception("Cannot find directory %s" % expt_dir)

    options = parse_config_file(expt_dir, config_file)
    experiment_name = options["experiment_name"]

    # Special advanced feature for repeating the same experiment many times
    if repeat >= 0:
        experiment_name = repeat_experiment_name(experiment_name, repeat)

    if no_output:
        output_directory = None
    else:
        if repeat >= 0:
            output_directory = repeat_output_dir(expt_dir, repeat)
        else:
            output_directory = os.path.join(expt_dir, 'output')
        if not os.path.isdir(output_directory):
            # makedirs also covers a repeat directory whose parent is missing.
            os.makedirs(output_directory)

        # Only non-repeated runs get a main.log file handler.
        if repeat < 0:
            rootLogger = logging.getLogger()
            fileHandler = logging.FileHandler(
                os.path.join(output_directory, 'main.log'))
            fileHandler.setFormatter(logFormatter)
            fileHandler.setLevel(logLevel)
            rootLogger.addHandler(fileHandler)

    input_space = InputSpace(options["variables"])
    resources = parse_resources_from_config(options)

    # Load up the chooser.
    chooser_module = importlib.import_module('spearmint.choosers.'
                                             + options['chooser'])
    chooser = chooser_module.init(input_space, options)

    # Connect to the database
    db_address = options['database']['address']
    db = MongoDB(database_address=db_address)

    overall_start_time = time.time()
    db.save({'start-time': overall_start_time}, experiment_name, 'start-time')

    waiting_for_results = False  # for printing purposes only

    # Seed `hypers` the same way the main loop does before each fit, so the
    # "end-all" replay at termination has a defined starting value even when
    # no fit ran in this process.
    hypers = db.load(experiment_name, 'hypers')

    while True:
        for resource_name, resource in resources.iteritems():
            jobs = load_jobs(db, experiment_name)

            # TODO: here cost will eventually also be considered: even if the
            # resource is not full, we might wait because of cost incurred
            # Note: I could chose to fill up one resource and them move on to
            # the next ("if"). You could also do it the other way, by changing
            # "if" to "while" here

            # Remove any broken jobs from pending
            # note: make sure to do this before the acceptingJobs() condition
            # is checked
            remove_broken_jobs(db, jobs, experiment_name, resources)

            if resource.acceptingJobs(jobs):
                if waiting_for_results:
                    logging.info('\n')
                waiting_for_results = False

                # Load jobs from DB
                # (move out of one or both loops?) would need to pass into
                # load_tasks
                jobs = load_jobs(db, experiment_name)

                # Print out a list of broken jobs
                print_broken_jobs(jobs)

                # Get a suggestion for the next job
                tasks = parse_tasks_from_jobs(jobs, experiment_name, options,
                                              input_space)

                # Special case when coupled and there is a NaN task-- what to
                # do with NaN task when decoupled??
                if 'NaN' in tasks and 'NaN' not in resource.tasks:
                    resource.tasks.append('NaN')

                # Load the model hypers from the database.
                hypers = db.load(experiment_name, 'hypers')

                # "Fit" the chooser - give the chooser data and let it fit the
                # model(s).
                # NOTE: even if we are only suggesting for 1 task, we need to
                # fit all of them because the acquisition function for one
                # task depends on all the tasks
                hypers = chooser.fit(tasks, hypers)

                if hypers:
                    logging.debug('GP covariance hyperparameters:')
                    print_hypers(hypers, input_space, options)
                    # Save the hyperparameters to the database.
                    db.save(hypers, experiment_name, 'hypers')

                if options['recommendations'] == "during":
                    # Compute the best value so far, a.k.a. the
                    # "recommendation", and hand it to store_recommendation
                    # together with the elapsed wall-clock time.
                    recommendation = chooser.best()
                    store_recommendation(recommendation, db, experiment_name,
                                         tasks, jobs, input_space,
                                         time.time() - overall_start_time)

                # Get the decoupling groups
                task_couplings = {
                    task_name: tasks[task_name].options["group"]
                    for task_name in resource.tasks
                }

                logging.info('\nGetting suggestion for %s...\n'
                             % (', '.join(task_couplings.keys())))

                # Get the next suggested experiment from the chooser.
                suggested_input, suggested_tasks = chooser.suggest(
                    task_couplings)
                suggested_task = suggested_tasks[0]  # hack, deal with later

                suggested_job = {
                    'id': len(jobs) + 1,
                    'params': input_space.paramify(suggested_input),
                    'expt_dir': options['main_file_path'],
                    'tasks': suggested_tasks,
                    'resource': resource_name,
                    'main-file': options['tasks'][suggested_task]['main_file'],
                    'language': options['tasks'][suggested_task]['language'],
                    'status': 'new',
                    'submit time': time.time(),
                    'start time': None,
                    'end time': None,
                    # just for plotting - not important
                    'fast update': chooser.fast_update,
                }

                save_job(suggested_job, db, experiment_name)

                # Submit the job to the appropriate resource
                process_id = resource.attemptDispatch(
                    experiment_name, suggested_job, db_address, expt_dir,
                    output_directory)

                # Print the current time
                logging.info(
                    'Current time: %s'
                    % datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

                # Set the status of the job appropriately (successfully
                # submitted or not)
                if process_id is None:
                    suggested_job['status'] = 'broken'
                    # Report the id of the job that was just built; there is
                    # no separate `job` variable in this scope.
                    logging.info(
                        'Job %s failed -- check output file for details.'
                        % suggested_job['id'])
                    save_job(suggested_job, db, experiment_name)
                else:
                    suggested_job['status'] = 'pending'
                    suggested_job['proc_id'] = process_id
                    save_job(suggested_job, db, experiment_name)

                jobs = load_jobs(db, experiment_name)

                # Print out the status of the resources
                print_resources_status(resources.values(), jobs)

                if len(set(task_couplings.values())) > 1:  # if decoupled
                    print_tasks_status(tasks.values(), jobs)

                # For debug - print pending jobs
                print_pending_jobs(jobs)

        # Terminate the optimization if all resources are finished (run max
        # number of jobs) or ANY task is finished (just my weird convention)
        jobs = load_jobs(db, experiment_name)
        tasks = parse_tasks_from_jobs(jobs, experiment_name, options,
                                      input_space)
        # Lists (not generators) so every maxCompleteReached call still runs;
        # the loop names differ from `resource` because Python 2 list
        # comprehensions leak their loop variable into this scope.
        terminate_resources = all(
            [res.maxCompleteReached(jobs) for res in resources.values()])
        terminate_tasks = any(
            [tsk.maxCompleteReached(jobs) for tsk in tasks.values()])
        terminate_maxtime = (time.time() - overall_start_time) >= (
            options['max_time_mins'] * 60.0)

        if terminate_resources or terminate_tasks or terminate_maxtime:
            if terminate_resources:
                logging.info(
                    'Maximum number of jobs completed on all resources.')
            if terminate_tasks:
                logging.info(
                    'Maximum number of jobs reached for at least one task.')
            if terminate_maxtime:
                logging.info(
                    'Maximum total experiment time of %f minutes reached.'
                    % options['max_time_mins'])

            # save rec in DB
            if options['recommendations'] in ("during", "end-one"):
                logging.info('Making final recommendation:')
                recommendation = chooser.best()
                store_recommendation(recommendation, db, experiment_name,
                                     tasks, jobs, input_space,
                                     time.time() - overall_start_time,
                                     final=True)
            elif options['recommendations'] == "end-all":
                # Replay the run: refit on each growing prefix of the job
                # list and store one recommendation per prefix.
                logging.info('Making recommendations...')
                all_jobs = jobs
                for i in xrange(len(all_jobs)):
                    logging.info('')
                    logging.info(
                        '-------------------------------------------------')
                    logging.info(
                        ' Getting recommendations for iter %d/%d '
                        % (i, len(all_jobs)))
                    logging.info(
                        '-------------------------------------------------')
                    logging.info('')

                    jobs = all_jobs[:i + 1]
                    tasks = parse_tasks_from_jobs(jobs, experiment_name,
                                                  options, input_space)
                    hypers = chooser.fit(tasks, hypers)
                    print_hypers(hypers, input_space, options)

                    # get the biggest end time of the jobs
                    end_time = max([job['end time'] for job in jobs])
                    elapsed_time = end_time - overall_start_time

                    recommendation = chooser.best()
                    store_recommendation(recommendation, db, experiment_name,
                                         tasks, jobs, input_space,
                                         elapsed_time)

            logging.info('Have a nice day.')
            return

        # If no resources are accepting jobs, sleep
        if no_free_resources(db, experiment_name, resources):
            # Don't use logging here because it's too much effort to use
            # logging without a newline at the end
            sys.stdout.write('Waiting for results...'
                             if not waiting_for_results else '.')
            sys.stdout.flush()
            waiting_for_results = True
            time.sleep(options['polling_time'])
        else:
            sys.stdout.write('\n')