def run(self, experiment):
    """Execute an experiment as a local child process.

    Accepts either an experiment key (looked up via ``self.db``) or a
    ``model.Experiment`` instance; raises ``ValueError`` otherwise.
    Starts the experiment in the db, spawns ``python <filename> <args>``
    with its stdout/stderr redirected to the output artifact cache,
    periodically checkpoints the workspace, and kills the child if the
    experiment is marked 'stopped' in the database.

    NOTE(review): uses `basestring` and `dict.iteritems` — Python 2 only.
    """
    if isinstance(experiment, basestring):
        # A bare key was passed - resolve it to a full experiment record.
        experiment = self.db.get_experiment(experiment)
    elif not isinstance(experiment, model.Experiment):
        raise ValueError("Unknown type of experiment: " +
                         str(type(experiment)))
    self.logger.info("Experiment key: " + experiment.key)
    with model.get_db_provider(self.config) as db:
        db.start_experiment(experiment)

        """ Override env variables with those inside the queued message """
        env = dict(os.environ)
        if 'env' in self.config.keys():
            for k, v in self.config['env'].iteritems():
                if v is not None:
                    env[str(k)] = str(v)

        fs_tracker.setup_experiment(env, experiment, clean=True)
        # Child output goes into the 'output' artifact cache for this key.
        log_path = fs_tracker.get_artifact_cache('output', experiment.key)
        # log_path = os.path.join(model_dir, self.config['log']['name'])
        self.logger.debug('Child process environment:')
        self.logger.debug(str(env))
        sched = BackgroundScheduler()
        sched.start()
        with open(log_path, 'w') as output_file:
            p = subprocess.Popen(
                ["python", experiment.filename] + experiment.args,
                stdout=output_file,
                stderr=subprocess.STDOUT,
                env=env,
                cwd=experiment.artifacts['workspace']['local'])
            # simple hack to show what's in the log file
            ptail = subprocess.Popen(["tail", "-f", log_path])
            # Periodic workspace checkpoint while the child runs.
            sched.add_job(
                lambda: db.checkpoint_experiment(experiment),
                'interval',
                minutes=self.config['saveWorkspaceFrequencyMinutes'])

            def kill_if_stopped():
                # Poll the db; a remote 'stop' request kills the child.
                if db.get_experiment(
                        experiment.key,
                        getinfo=False).status == 'stopped':
                    p.kill()

            sched.add_job(kill_if_stopped, 'interval', seconds=10)
            try:
                p.wait()
            finally:
                # Always tear down the tailer/scheduler and mark finished,
                # even if waiting was interrupted.
                ptail.kill()
                db.finish_experiment(experiment)
                sched.shutdown()
def add_experiment(args):
    """Register one experiment in the database.

    *args* is a ``(config, python_pkg, experiment)`` tuple (packed so the
    function can be mapped over a pool). Extra python packages are merged
    into the experiment's environment before it is stored.

    Returns the (mutated) experiment. Any failure is printed and re-raised.
    """
    try:
        config, python_pkg, experiment = args
        experiment.pythonenv = add_packages(experiment.pythonenv, python_pkg)
        with model.get_db_provider(config) as db:
            db.add_experiment(experiment)
    except BaseException:
        # Surface the traceback (useful under multiprocessing) but still
        # propagate the error to the caller.
        traceback.print_exc()
        raise
    return experiment
def get_db():
    """Return the cached db provider, rebuilding it once it expires.

    The provider and its creation timestamp live in module globals; a
    provider older than ``DB_PROVIDER_EXPIRATION`` seconds is replaced.
    """
    global _db_provider
    global _db_provider_timestamp

    stale = (
        not _db_provider or
        not _db_provider_timestamp or
        time.time() - _db_provider_timestamp > DB_PROVIDER_EXPIRATION
    )
    if stale:
        _db_provider = model.get_db_provider(blocking_auth=False)
        _db_provider_timestamp = time.time()

    return _db_provider
def get_experiment_fitnesses(experiments, optimizer, config, logger):
    """Poll the database until every experiment reports a fitness.

    Scans each experiment's log tail for a line starting with
    "Fitness"/"fitness" formatted as ``Fitness: <float>`` and collects the
    (non-negative) value. If ``skip_gen_thres`` of the population has
    reported and ``skip_gen_timeout`` seconds have passed since the last
    result, the remaining experiments are skipped (fitness stays 0.0).

    Returns a list of floats, one per experiment.
    """
    db_provider = model.get_db_provider()
    has_result = [False] * len(experiments)
    fitnesses = [0.0] * len(experiments)
    try:
        term_criterion = config['optimizer']['termination_criterion']
    except BaseException:
        # FIX: the original implicit concatenation read "lookingin" -
        # a space was missing between the two string fragments.
        logger.warn("Cannot find termination criterion in config.yaml, "
                    "looking in optimizer source code instead")
        term_criterion = optimizer.get_configs()['termination_criterion']
    skip_gen_thres = term_criterion['skip_gen_thres']
    skip_gen_timeout = term_criterion['skip_gen_timeout']
    result_timestamp = time.time()
    while sum(has_result) < len(experiments):
        for i, experiment in enumerate(experiments):
            # Give up on stragglers once enough of the generation is done
            # and no new result arrived within the timeout.
            if float(sum(has_result)) / len(experiments) > skip_gen_thres \
                    and time.time() - result_timestamp > skip_gen_timeout:
                logger.warn(
                    "Skipping to next gen with %s of solutions evaled" %
                    (float(sum(has_result)) / len(experiments)))
                has_result = [True] * len(experiments)
                break
            if has_result[i]:
                continue
            returned_experiment = db_provider.get_experiment(experiment.key,
                                                             getinfo=True)
            output = db_provider._get_experiment_logtail(returned_experiment)
            for line in output:
                if line.startswith("Fitness") or line.startswith("fitness"):
                    try:
                        fitness = float(line.rstrip().split(':')[1])
                        # Negative fitness is treated as a parse error.
                        assert fitness >= 0.0
                    except BaseException:
                        logger.warn('Error parsing or invalid fitness (%s)' %
                                    line)
                    else:
                        fitnesses[i] = fitness
                        has_result[i] = True
                        result_timestamp = time.time()
                        break
        time.sleep(config['sleep_time'])
    return fitnesses
def _list(args, cli_args):
    """Print experiments selected by the listing sub-command.

    ``args`` forms: [] (current user), ['project', name],
    ['users'], ['user', email], ['all']. With ``cli_args.short`` only the
    keys are printed; otherwise a table sorted by time added (newest
    first) is rendered with AsciiTable.
    """
    with model.get_db_provider(cli_args.config) as db:
        if len(args) == 0:
            experiments = db.get_user_experiments()
        elif args[0] == 'project':
            assert len(args) == 2
            experiments = db.get_project_experiments(args[1])
        elif args[0] == 'users':
            assert len(args) == 1
            users = db.get_users()
            for u in users:
                print(users[u].get('email'))
            return
        elif args[0] == 'user':
            assert len(args) == 2
            users = db.get_users()
            user_ids = [u for u in users if users[u].get('email') == args[1]]
            # FIX: original message lacked a space ("<email>not found!").
            assert len(user_ids) == 1, \
                'The user with email ' + args[1] + \
                ' not found!'
            experiments = db.get_user_experiments(user_ids[0])
        elif args[0] == 'all':
            assert len(args) == 1
            users = db.get_users()
            experiments = []
            for u in users:
                experiments += db.get_user_experiments(u)
        else:
            get_logger().critical('Unknown command ' + args[0])
            return

        if cli_args.short:
            for e in experiments:
                print(e)
            return

        # Keys -> full records, newest first.
        experiments = [db.get_experiment(e) for e in experiments]
        experiments.sort(key=lambda e: -e.time_added)
        table = [['Time added', 'Key', 'Project', 'Status']]
        for e in experiments:
            table.append([
                time.strftime('%Y-%m-%d %H:%M:%S',
                              time.localtime(e.time_added)),
                e.key, e.project, e.status])
        print(AsciiTable(table).table)
def main(args=sys.argv[1:]):
    """Parse CLI arguments, load config, and start the Studio WebUI.

    Side effects: assigns module globals ``_config``, ``_db_provider`` and
    ``_save_auth_cookie``, then blocks in ``app.run()``.
    """
    parser = argparse.ArgumentParser(description='Studio WebUI server. \
        Usage: studio \
        <arguments>')
    parser.add_argument('--config', help='configuration file', default=None)
    # parser.add_argument('--guest',
    #                     help='Guest mode (does not require db credentials)',
    #                     action='store_true')
    parser.add_argument('--port',
                        help='port to run Flask server on',
                        type=int,
                        default=5000)
    parser.add_argument('--host',
                        help='host name.',
                        default='0.0.0.0')
    parser.add_argument(
        '--verbose', '-v',
        # FIX: typo "vaules" -> "values" in the help text.
        help='Verbosity level. Allowed values: ' +
             'debug, info, warn, error, crit ' +
             'or numerical value of logger levels.',
        default=None)

    args = parser.parse_args(args)
    config = model.get_config()
    if args.config:
        with open(args.config) as f:
            # NOTE(review): yaml.load without an explicit Loader executes
            # arbitrary tags on untrusted files; consider yaml.safe_load.
            config = yaml.load(f)
    if args.verbose:
        config['verbose'] = args.verbose
    # if args.guest:
    #     config['database']['guest'] = True

    global _config
    global _db_provider
    _config = config
    _db_provider = model.get_db_provider(_config, blocking_auth=False)
    getlogger().setLevel(model.parse_verbosity(config.get('verbose')))

    global _save_auth_cookie
    _save_auth_cookie = True

    print('Starting Studio UI on port {0}'.format(args.port))
    app.run(host=args.host, port=args.port)
def __init__(self, args):
    """Build a LocalExecutor from parsed CLI args.

    Loads the base config via ``model.get_config()``, overlays the
    ``args.config`` file (YAML path) or dict, optionally enables guest
    mode, and opens a db provider plus a logger at the configured
    verbosity.

    NOTE(review): `basestring` makes this Python 2 only; `yaml.load`
    without a Loader is unsafe on untrusted config files.
    """
    self.config = model.get_config()
    if args.config:
        if isinstance(args.config, basestring):
            # A path was given - read and merge the YAML file.
            with open(args.config) as f:
                self.config.update(yaml.load(f))
        else:
            # Already a dict-like config - merge directly.
            self.config.update(args.config)

    if args.guest:
        self.config['database']['guest'] = True

    # Database provider used to resolve experiment keys in run().
    self.db = model.get_db_provider(self.config)
    self.logger = logging.getLogger('LocalExecutor')
    self.logger.setLevel(model.parse_verbosity(self.config.get('verbose')))
    self.logger.debug("Config: ")
    self.logger.debug(self.config)
def submit_experiments(
        experiments,
        config,
        runner_args,
        logger,
        resources_needed):
    """Register experiments, enqueue them, and (optionally) start workers.

    Queue selection: 'local' by default, overridden by config['queue'],
    then by --queue. With --cloud a fresh uniquely-named pubsub/sqs queue
    is created and cloud workers (on-demand or spot) are started. With a
    'local' queue the experiments are executed in-process via
    local_worker.main().
    """
    db = model.get_db_provider(config)
    verbose = model.parse_verbosity(config['verbose'])

    queue_name = 'local'
    if 'queue' in config.keys():
        queue_name = config['queue']
    if runner_args.queue:
        queue_name = runner_args.queue

    # Persist every experiment (with any extra pip packages) first.
    for e in experiments:
        e.pythonenv = add_packages(e.pythonenv, runner_args.python_pkg)
        db.add_experiment(e)
        logger.info("Added experiment " + e.key)

    if runner_args.cloud is not None:
        assert runner_args.cloud in ['gcloud', 'gcspot', 'ec2', 'ec2spot']
        assert runner_args.queue is None, \
            '--queue argument cannot be provided with --cloud argument'
        # Guest mode has no auth cookie to forward to workers.
        auth_cookie = None if config['database'].get('guest') \
            else os.path.join(
                auth.token_dir,
                config['database']['apiKey']
            )
        if runner_args.cloud in ['gcloud', 'gcspot']:
            queue_name = 'pubsub_' + str(uuid.uuid4())
            queue = PubsubQueue(queue_name, verbose=verbose)
            worker_manager = GCloudWorkerManager(
                auth_cookie=auth_cookie,
                zone=config['cloud']['zone']
            )
        if runner_args.cloud in ['ec2', 'ec2spot']:
            queue_name = 'sqs_' + str(uuid.uuid4())
            queue = SQSQueue(queue_name, verbose=verbose)
            worker_manager = EC2WorkerManager(
                auth_cookie=auth_cookie
            )

        if runner_args.cloud == 'gcloud' or \
           runner_args.cloud == 'ec2':
            # On-demand instances: one worker per --num-workers (default 1).
            num_workers = int(
                runner_args.num_workers) if runner_args.num_workers else 1
            for i in range(num_workers):
                worker_manager.start_worker(
                    queue_name, resources_needed,
                    ssh_keypair=runner_args.ssh_keypair,
                    timeout=runner_args.cloud_timeout)
        else:
            # Spot instances: fixed pool if --num-workers given, otherwise
            # start one worker and let the pool upscale with the queue.
            assert runner_args.bid is not None
            if runner_args.num_workers:
                start_workers = runner_args.num_workers
                queue_upscaling = False
            else:
                start_workers = 1
                queue_upscaling = True

            worker_manager.start_spot_workers(
                queue_name,
                runner_args.bid,
                resources_needed,
                start_workers=start_workers,
                queue_upscaling=queue_upscaling,
                ssh_keypair=runner_args.ssh_keypair,
                timeout=runner_args.cloud_timeout)
    else:
        # No cloud: pick the queue implementation from the name.
        if queue_name == 'local':
            queue = LocalQueue()
            queue.clean()
        elif queue_name.startswith('sqs_'):
            queue = SQSQueue(queue_name, verbose=verbose)
        else:
            queue = PubsubQueue(
                queue_name,
                config['database']['projectId'],
                verbose=verbose)

    # Each message carries the experiment dict plus the full config.
    for e in experiments:
        queue.enqueue(json.dumps({
            'experiment': e.__dict__,
            'config': config}))

    if queue_name == 'local':
        worker_args = ['studio-local-worker']
        if runner_args.config:
            worker_args += ['--config=' + runner_args.config]
        if runner_args.guest:
            worker_args += ['--guest']
        logger.info('worker args: {}'.format(worker_args))
        if not runner_args.num_workers or int(runner_args.num_workers) == 1:
            # Run the worker loop inline; blocks until the queue drains.
            local_worker.main(worker_args)
        else:
            raise NotImplementedError("Multiple local workers are not " +
                                      "implemented yet")
    return
def main(args=sys.argv):
    """Entry point of the studio runner CLI.

    Parses runner arguments up to the first ``*.py`` argument (everything
    after it belongs to the user script), builds experiment(s) - a single
    one, or a population when --hyperparam is given (grid search or a
    pluggable optimizer) - and submits them via submit_experiments().
    """
    logger = logging.getLogger('studio-runner')
    parser = argparse.ArgumentParser(
        description='Studio runner. \
                     Usage: studio run <runner_arguments> \
                     script <script_arguments>')
    parser.add_argument('--config', help='configuration file', default=None)
    parser.add_argument('--project', help='name of the project', default=None)
    parser.add_argument(
        '--experiment', '-e',
        help='Name of the experiment. If none provided, ' +
             'random uuid will be generated',
        default=None)
    parser.add_argument(
        '--guest',
        help='Guest mode (does not require db credentials)',
        action='store_true')
    parser.add_argument(
        '--force-git',
        help='If run in a git directory, force running the experiment ' +
             'even if changes are not commited',
        action='store_true')
    parser.add_argument(
        '--gpus',
        help='Number of gpus needed to run the experiment',
        default=None)
    parser.add_argument(
        '--cpus',
        help='Number of cpus needed to run the experiment' +
             ' (used to configure cloud instance)',
        default=None)
    parser.add_argument(
        '--ram',
        help='Amount of RAM needed to run the experiment' +
             ' (used to configure cloud instance)',
        default=None)
    parser.add_argument(
        '--hdd',
        help='Amount of hard drive space needed to run the experiment' +
             ' (used to configure cloud instance)',
        default=None)
    parser.add_argument(
        '--queue', '-q',
        help='Name of the remote execution queue',
        default=None)
    parser.add_argument(
        '--cloud',
        help='Cloud execution mode. Could be gcloud, ec2 or ec2spot',
        default=None)
    parser.add_argument(
        '--bid',
        help='Spot instance price bid, specified in USD or in percentage ' +
             'of on-demand instance price. Default is %(default)s',
        default='100%')
    parser.add_argument(
        '--capture-once', '-co',
        help='Name of the immutable artifact to be captured. ' +
             'It will be captured once before the experiment is run',
        default=[], action='append')
    parser.add_argument(
        '--capture', '-c',
        help='Name of the mutable artifact to be captured continuously',
        default=[], action='append')
    parser.add_argument(
        '--reuse', '-r',
        help='Name of the artifact from another experiment to use',
        default=[], action='append')
    parser.add_argument(
        '--verbose', '-v',
        help='Verbosity level. Allowed values: ' +
             'debug, info, warn, error, crit ' +
             'or numerical value of logger levels.',
        default=None)
    parser.add_argument(
        '--metric', '-m',
        help='Metric to show in the summary of the experiment, ' +
             'and to base hyperparameter search on. ' +
             'Refers a scalar value in tensorboard log ' +
             'example: --metric=val_loss[:final | :min | :max] to report ' +
             'validation loss in the end of the keras experiment ' +
             '(or smallest or largest throughout the experiment for :min ' +
             'and :max respectively)',
        default=None)
    parser.add_argument(
        '--hyperparam', '-hp',
        help='Try out multiple values of a certain parameter. ' +
             'For example, --hyperparam=learning_rate:0.01:0.1:l10 ' +
             'will instantiate 10 versions of the script, replace ' +
             'learning_rate with a one of the 10 values for learning ' +
             'rate that lies on a log grid from 0.01 to 0.1, create '
             'experiments and place them in the queue.',
        default=[], action='append')
    parser.add_argument(
        '--num-workers',
        help='Number of local or cloud workers to spin up',
        default=None)
    parser.add_argument(
        '--python-pkg',
        help='Python package not present in the current environment ' +
             'that is needed for experiment. Only compatible with ' +
             'remote and cloud workers for now',
        default=[], action='append')
    parser.add_argument(
        '--ssh-keypair',
        help='Name of the SSH keypair used to access the EC2 ' +
             'instances directly',
        default=None)
    parser.add_argument(
        '--optimizer', '-opt',
        help='Name of optimizer to use, by default is grid search. ' +
             'The name of the optimizer must either be in ' +
             'studio/optimizer_plugins ' +
             'directory or the path to the optimizer source file ' +
             'must be supplied. ',
        default='grid')
    parser.add_argument(
        '--cloud-timeout',
        # FIX: the original help ended with a bare "%(default)" which makes
        # argparse raise when rendering --help; "%(default)s" is correct.
        help="Time (in seconds) that cloud workers wait for messages. " +
             "If negative, " +
             "wait for the first message in the queue indefinitely " +
             "and shut down " +
             "as soon as no new messages are available. " +
             "If zero, don't wait at all." +
             "Default value is %(default)s",
        type=int,
        default=300)

    # detect which argument is the script filename
    # and attribute all arguments past that index as related to the script
    py_suffix_args = [i for i, arg in enumerate(args) if arg.endswith('.py')]
    if len(py_suffix_args) < 1:
        print('At least one argument should be a python script ' +
              '(end with *.py)')
        parser.print_help()
        # FIX: sys.exit instead of the interactive-only builtin exit().
        sys.exit()

    script_index = py_suffix_args[0]
    runner_args = parser.parse_args(args[1:script_index])
    exec_filename, other_args = args[script_index], args[script_index + 1:]
    # TODO: Queue the job based on arguments and only then execute.

    config = model.get_config(runner_args.config)
    if runner_args.verbose:
        config['verbose'] = runner_args.verbose
    verbose = model.parse_verbosity(config['verbose'])
    logger.setLevel(verbose)

    db = model.get_db_provider(config)

    if git_util.is_git() and not git_util.is_clean():
        logger.warn('Running from dirty git repo')
        if not runner_args.force_git:
            logger.error(
                'Specify --force-git to run experiment from dirty git repo')
            sys.exit(1)

    resources_needed = parse_hardware(runner_args, config['cloud'])
    logger.debug('resources requested: ')
    logger.debug(str(resources_needed))

    artifacts = {}
    artifacts.update(parse_artifacts(runner_args.capture, mutable=True))
    artifacts.update(parse_artifacts(runner_args.capture_once, mutable=False))
    artifacts.update(parse_external_artifacts(runner_args.reuse, db))

    if any(runner_args.hyperparam):
        # FIX: was `runner_args.optimizer is "grid"` - identity comparison
        # against a string literal is implementation-dependent; use ==.
        if runner_args.optimizer == "grid":
            experiments = add_hyperparam_experiments(
                exec_filename, other_args, runner_args,
                artifacts, resources_needed)
            submit_experiments(
                experiments, config, runner_args, logger, resources_needed)
        else:
            # Load the optimizer either from the bundled plugins directory
            # or from an arbitrary user-supplied path.
            opt_modulepath = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "optimizer_plugins",
                runner_args.optimizer + ".py")
            # logger.info('optimizer path: %s' % opt_modulepath)
            if not os.path.exists(opt_modulepath):
                opt_modulepath = os.path.abspath(
                    os.path.expanduser(runner_args.optimizer))
            logger.info('optimizer path: %s' % opt_modulepath)
            assert os.path.exists(opt_modulepath)
            sys.path.append(os.path.dirname(opt_modulepath))
            opt_module = importlib.import_module(
                os.path.basename(opt_modulepath.replace(".py", '')))

            hyperparam_values, log_scale_dict = get_hyperparam_values(
                runner_args)
            optimizer = getattr(opt_module, "Optimizer")(hyperparam_values,
                                                         log_scale_dict)
            # Ask/tell loop: submit a generation, wait for fitnesses,
            # feed them back until the optimizer decides to stop.
            while not optimizer.stop():
                hyperparam_tuples = optimizer.ask()
                experiments = add_hyperparam_experiments(
                    exec_filename, other_args, runner_args,
                    artifacts, resources_needed,
                    optimizer=optimizer,
                    hyperparam_tuples=hyperparam_tuples)
                submit_experiments(
                    experiments, config, runner_args, logger,
                    resources_needed)
                fitnesses = get_experiment_fitnesses(experiments,
                                                     optimizer, config,
                                                     logger)
                optimizer.tell(hyperparam_tuples, fitnesses)
                # if config['verbose'] == "info" or config['verbose'] ==
                # "debug":
                try:
                    optimizer.disp()
                except BaseException:
                    logger.warn('Optimizer has no disp() method')
    else:
        experiments = [model.create_experiment(
            filename=exec_filename,
            args=other_args,
            experiment_name=runner_args.experiment,
            project=runner_args.project,
            artifacts=artifacts,
            resources_needed=resources_needed,
            metric=runner_args.metric)]
        submit_experiments(
            experiments, config, runner_args, logger, resources_needed)

    db = None
    return
def _kill(args, cli_args):
    """Delete every experiment whose key is listed in *args*."""
    with model.get_db_provider(cli_args.config) as db:
        for experiment_key in args:
            get_logger().info('Deleting experiment ' + experiment_key)
            db.delete_experiment(experiment_key)
def run(self, experiment):
    """Execute an experiment as a local (optionally containerized) process.

    Accepts an experiment key (resolved via ``self.db``) or an Experiment
    instance. Runs the script with python/python3 (or inside a
    singularity container when a '_singularity' artifact is present),
    tails the log, periodically checkpoints workspace and metrics, and
    kills the child when the experiment is stopped remotely or exceeds
    its max_duration. Returns the child's exit code.
    """
    if isinstance(experiment, six.string_types):
        experiment = self.db.get_experiment(experiment)
    elif not isinstance(experiment, Experiment):
        raise ValueError("Unknown type of experiment: " +
                         str(type(experiment)))
    self.logger.info("Experiment key: " + experiment.key)
    with model.get_db_provider(self.config) as db:
        db.start_experiment(experiment)

        """ Override env variables with those inside the queued message """
        env = dict(os.environ)
        if 'env' in self.config.keys():
            for k, v in six.iteritems(self.config['env']):
                if v is not None:
                    env[str(k)] = str(v)
        # Unbuffered child stdout so the log tail shows progress promptly.
        env['PYTHONUNBUFFERED'] = 'TRUE'

        fs_tracker.setup_experiment(env, experiment, clean=False)
        log_path = fs_tracker.get_artifact_cache('output', experiment.key)
        # log_path = os.path.join(model_dir, self.config['log']['name'])
        self.logger.debug('Child process environment:')
        self.logger.debug(str(env))
        sched = BackgroundScheduler()
        sched.start()
        with open(log_path, 'w') as output_file:
            # Pick the interpreter matching the experiment's python version.
            python = 'python'
            if experiment.pythonver == 3:
                python = 'python3'
            cmd = [python, experiment.filename] + experiment.args
            cwd = experiment.artifacts['workspace']['local']
            container_artifact = experiment.artifacts.get('_singularity')
            if container_artifact:
                # Prefer a locally cached container image, fall back to
                # the qualified (remote) reference.
                container = container_artifact.get('local')
                if not container:
                    container = container_artifact.get('qualified')
                cwd = fs_tracker.get_artifact_cache(
                    'workspace', experiment.key)
                # Expose immutable artifacts next to the workspace via
                # symlinks so the containerized script can reach them.
                for tag, art in six.iteritems(experiment.artifacts):
                    local_path = art.get('local')
                    if not art['mutable'] and os.path.exists(local_path):
                        os.symlink(art['local'],
                                   os.path.join(os.path.dirname(cwd), tag))
                if experiment.filename is not None:
                    cmd = [
                        'singularity',
                        'exec',
                        container,
                    ] + cmd
                else:
                    # No script: run the container's own entry point.
                    cmd = ['singularity', 'run', container]
            self.logger.info('Running cmd: \n {} '.format(cmd))
            p = subprocess.Popen(cmd,
                                 stdout=output_file,
                                 stderr=subprocess.STDOUT,
                                 env=env,
                                 cwd=cwd)
            # simple hack to show what's in the log file
            # ptail = subprocess.Popen(["tail", "-f", log_path])
            logtail = Pygtail(log_path)

            def tail_func():
                # Echo new log lines until the enclosing scope clears
                # `logtail` (read through the closure) in the finally block.
                while logtail:
                    for line in logtail:
                        print(line)
                    time.sleep(0.1)

            tail_thread = threading.Thread(target=tail_func)
            tail_thread.start()
            # Workspace checkpoint interval (0 disables meaningful spacing
            # - NOTE(review): minutes=0 schedules very frequent runs).
            minutes = 0
            if self.config.get('saveWorkspaceFrequency'):
                minutes = int(
                    str2duration(self.config['saveWorkspaceFrequency']).
                    total_seconds() / 60)

            def checkpoint():
                # Best-effort checkpoint; failures are logged, not fatal.
                try:
                    db.checkpoint_experiment(experiment)
                except BaseException as e:
                    self.logger.info(e)

            sched.add_job(checkpoint, 'interval', minutes=minutes)
            metrics_path = fs_tracker.get_artifact_cache(
                '_metrics', experiment.key)
            minutes = 0
            if self.config.get('saveMetricsFrequency'):
                minutes = int(
                    str2duration(self.config['saveMetricsFrequency']).
                    total_seconds() / 60)
            sched.add_job(lambda: save_metrics(metrics_path), 'interval',
                          minutes=minutes)

            def kill_if_stopped():
                # Remote stop request.
                if db.get_experiment(
                        experiment.key,
                        getinfo=False).status == 'stopped':
                    p.kill()
                # Hard runtime cap.
                if experiment.max_duration is not None and \
                        time.time() > experiment.time_started + \
                        int(str2duration(experiment.max_duration)
                            .total_seconds()):
                    p.kill()

            sched.add_job(kill_if_stopped, 'interval', seconds=10)
            try:
                p.wait()
            finally:
                save_metrics(metrics_path)
                sched.shutdown()
                # Clearing the closure variable makes tail_func exit.
                logtail = None
                db.checkpoint_experiment(experiment)
                db.finish_experiment(experiment)
                # NOTE(review): `return` inside `finally` swallows any
                # in-flight exception from p.wait() - confirm intended.
                return p.returncode
def worker_loop(queue, parsed_args,
                single_experiment=False,
                timeout=0,
                verbose=None):
    """Consume experiment messages from *queue* and execute them locally.

    Each message carries an experiment key plus a full config. The loop
    skips expired experiments, waits for resources, installs missing pip
    packages (unless running in a singularity container), fetches
    artifacts, runs the experiment via LocalExecutor, and acknowledges
    the message. Returns the last non-zero experiment return code, else 0.
    """
    fetch_artifacts = True
    logger = logs.getLogger('worker_loop')
    # Messages are re-held on the queue every hold_period/2 minutes so
    # they are not redelivered while the experiment is still running.
    hold_period = 4
    retval = 0
    while True:
        msg = queue.dequeue(acknowledge=False, timeout=timeout)
        if not msg:
            break
        # first_exp, ack_key = queue.dequeue(acknowledge=False)
        first_exp, ack_key = msg

        data_dict = json.loads(sixdecode(first_exp))
        experiment_key = data_dict['experiment']['key']
        config = data_dict['config']
        parsed_args.config = config
        if verbose:
            # CLI verbosity overrides the message's config...
            config['verbose'] = verbose
        else:
            # ...otherwise adopt (and keep) the message's verbosity.
            verbose = model.parse_verbosity(config.get('verbose'))
        logger.setLevel(verbose)

        logger.debug('Received message: \n{}'.format(data_dict))
        executor = LocalExecutor(parsed_args)
        with model.get_db_provider(config) as db:
            # experiment = experiment_from_dict(data_dict['experiment'])
            def try_get_experiment():
                experiment = db.get_experiment(experiment_key)
                if experiment is None:
                    raise ValueError(
                        'experiment is not found - indicates storage failure')
                return experiment

            experiment = retry(try_get_experiment,
                               sleep_time=10, logger=logger)

            # Drop experiments older than their configured lifetime.
            if config.get('experimentLifetime') and \
                    int(str2duration(config['experimentLifetime'])
                        .total_seconds()) + \
                    experiment.time_added < time.time():
                logger.info(
                    'Experiment expired (max lifetime of {} was exceeded)'
                    .format(config.get('experimentLifetime')))
                queue.acknowledge(ack_key)
                continue

            if allocate_resources(experiment, config, verbose=verbose):
                def hold_job():
                    queue.hold(ack_key, hold_period)

                hold_job()
                sched = BackgroundScheduler()
                sched.add_job(hold_job, 'interval', minutes=hold_period / 2)
                sched.start()

                try:
                    python = 'python'
                    if experiment.pythonver == 3:
                        python = 'python3'
                    # Containerized experiments bring their own deps;
                    # otherwise install whatever the experiment env needs.
                    if '_singularity' not in experiment.artifacts.keys():
                        pip_diff = pip_needed_packages(experiment.pythonenv,
                                                       python)
                        if any(pip_diff):
                            logger.info(
                                'Setting up python packages for experiment')
                            if pip_install_packages(
                                    pip_diff, python, logger) != 0:
                                # Fall back to per-package installs so one
                                # bad package does not block the rest.
                                logger.info(
                                    "Installation of all packages together " +
                                    " failed, "
                                    "trying one package at a time")
                                for pkg in pip_diff:
                                    pip_install_packages([pkg], python, logger)

                    for tag, art in six.iteritems(experiment.artifacts):
                        if fetch_artifacts or 'local' not in art.keys():
                            logger.info('Fetching artifact ' + tag)
                            if tag == 'workspace':
                                # Workspace is always refreshed in full.
                                art['local'] = retry(
                                    lambda: db.get_artifact(
                                        art, only_newer=False),
                                    sleep_time=10, logger=logger)
                            else:
                                art['local'] = retry(
                                    lambda: db.get_artifact(art),
                                    sleep_time=10, logger=logger)
                    returncode = executor.run(experiment)
                    if returncode != 0:
                        retval = returncode
                finally:
                    # Always stop re-holding and ack, even on failure.
                    sched.shutdown()
                    queue.acknowledge(ack_key)
                if single_experiment:
                    logger.info('single_experiment is True, quitting')
                    return retval
            else:
                logger.info('Cannot run experiment ' + experiment.key +
                            ' due lack of resources. Will retry')
                time.sleep(config['sleep_time'])
    # wait_for_messages(queue, timeout, logger)

    # queue = glob.glob(fs_tracker.get_queue_directory() + "/*")
    logger.info("Queue in {} is empty, quitting".format(
        fs_tracker.get_queue_directory()))
    return retval
def get_db():
    """Return the module-level db provider, creating it on first use."""
    global _db_provider
    provider = _db_provider
    if not provider:
        provider = model.get_db_provider(blocking_auth=False)
        _db_provider = provider
    return provider
def main(args=sys.argv[1:]):
    """Entry point of the studio runner CLI (container/rerun-aware version).

    Detects the user script (first ``*.py`` or ``::``-style argument); if
    none is present, either runs a container job (--container) or re-runs
    an existing experiment key. Builds experiment(s) - a single one, or a
    population under --hyperparam (grid or pluggable optimizer) - then
    submits them and spins up workers.
    """
    logger = logs.getLogger('studio-runner')
    parser = argparse.ArgumentParser(
        description='Studio runner. \
                     Usage: studio run <runner_arguments> \
                     script <script_arguments>')
    parser.add_argument('--config', help='configuration file', default=None)
    parser.add_argument('--project', help='name of the project', default=None)
    parser.add_argument(
        '--experiment', '-e',
        help='Name of the experiment. If none provided, ' +
             'random uuid will be generated',
        default=None)
    parser.add_argument(
        '--guest',
        help='Guest mode (does not require db credentials)',
        action='store_true')
    parser.add_argument(
        '--force-git',
        help='If run in a git directory, force running the experiment ' +
             'even if changes are not commited',
        action='store_true')
    parser.add_argument(
        '--gpus',
        help='Number of gpus needed to run the experiment',
        type=int,
        default=None)
    parser.add_argument(
        '--cpus',
        help='Number of cpus needed to run the experiment' +
             ' (used to configure cloud instance)',
        type=int,
        default=None)
    parser.add_argument(
        '--ram',
        help='Amount of RAM needed to run the experiment' +
             ' (used to configure cloud instance), ex: 10G, 10GB',
        default=None)
    parser.add_argument(
        '--gpuMem',
        help='Amount of GPU RAM needed to run the experiment',
        default=None)
    parser.add_argument(
        '--hdd',
        help='Amount of hard drive space needed to run the experiment' +
             ' (used to configure cloud instance), ex: 10G, 10GB',
        default=None)
    parser.add_argument(
        '--queue', '-q',
        help='Name of the remote execution queue',
        default=None)
    parser.add_argument(
        '--cloud',
        help='Cloud execution mode. Could be gcloud, gcspot, ec2 or ec2spot',
        default=None)
    parser.add_argument(
        '--bid',
        help='Spot instance price bid, specified in USD or in percentage ' +
             'of on-demand instance price. Default is %(default)s',
        default='100%')
    parser.add_argument(
        '--capture-once', '-co',
        help='Name of the immutable artifact to be captured. ' +
             'It will be captured once before the experiment is run',
        default=[], action='append')
    parser.add_argument(
        '--capture', '-c',
        help='Name of the mutable artifact to be captured continuously',
        default=[], action='append')
    parser.add_argument(
        '--reuse', '-r',
        help='Name of the artifact from another experiment to use',
        default=[], action='append')
    parser.add_argument(
        '--verbose', '-v',
        help='Verbosity level. Allowed values: ' +
             'debug, info, warn, error, crit ' +
             'or numerical value of logger levels.',
        default=None)
    parser.add_argument(
        '--metric',
        help='Metric to show in the summary of the experiment, ' +
             'and to base hyperparameter search on. ' +
             'Refers a scalar value in tensorboard log ' +
             'example: --metric=val_loss[:final | :min | :max] to report ' +
             'validation loss in the end of the keras experiment ' +
             '(or smallest or largest throughout the experiment for :min ' +
             'and :max respectively)',
        default=None)
    parser.add_argument(
        '--hyperparam', '-hp',
        help='Try out multiple values of a certain parameter. ' +
             'For example, --hyperparam=learning_rate:0.01:0.1:l10 ' +
             'will instantiate 10 versions of the script, replace ' +
             'learning_rate with a one of the 10 values for learning ' +
             'rate that lies on a log grid from 0.01 to 0.1, create '
             'experiments and place them in the queue.',
        default=[], action='append')
    parser.add_argument(
        '--num-workers',
        help='Number of local or cloud workers to spin up',
        type=int,
        default=None)
    parser.add_argument(
        '--python-pkg',
        help='Python package not present in the current environment ' +
             'that is needed for experiment. Only compatible with ' +
             'remote and cloud workers for now',
        default=[], action='append')
    parser.add_argument(
        '--ssh-keypair',
        help='Name of the SSH keypair used to access the EC2 ' +
             'instances directly',
        default=None)
    parser.add_argument(
        '--optimizer', '-opt',
        help='Name of optimizer to use, by default is grid search. ' +
             'The name of the optimizer must either be in ' +
             'studio/optimizer_plugins ' +
             'directory or the path to the optimizer source file ' +
             'must be supplied. ',
        default='grid')
    parser.add_argument(
        '--cloud-timeout',
        help="Time (in seconds) that cloud workers wait for messages. " +
             "If negative, " +
             "wait for the first message in the queue indefinitely " +
             "and shut down " +
             "as soon as no new messages are available. " +
             "If zero, don't wait at all." +
             "Default value is %(default)d",
        type=int,
        default=300)
    parser.add_argument(
        '--user-startup-script',
        help='Path of script to run immediately ' +
             'before running the remote worker',
        default=None)
    parser.add_argument(
        '--branch',
        help='Branch of studioml to use when running remote worker, useful ' +
             'for debugging pull requests. Default is current',
        default=None)
    parser.add_argument(
        '--max-duration',
        help='Max experiment runtime (i.e. time after which experiment ' +
             'should be killed no matter what.). Examples of values ' +
             'might include 5h, 48h2m10s',
        default=None)
    parser.add_argument(
        '--lifetime',
        help='Max experiment lifetime (i.e. wait time after which ' +
             'experiment loses relevance and should not be started)' +
             ' Examples include 240h30m10s',
        default=None)
    parser.add_argument(
        '--container',
        help='Singularity container in which experiment should be run. ' +
             'Assumes that container has all dependencies installed',
        default=None
    )
    parser.add_argument(
        '--port',
        help='Ports to open on a cloud instance',
        default=[], action='append'
    )

    # detect which argument is the script filename
    # and attribute all arguments past that index as related to the script
    (runner_args, other_args) = parser.parse_known_args(args)
    py_suffix_args = [i for i, arg in enumerate(args) if arg.endswith('.py')
                      or '::' in arg]

    rerun = False
    if len(py_suffix_args) < 1:
        # FIX: typo "arugments" -> "arguments" in the user-facing message.
        print('None of the arguments end with .py')
        if len(other_args) == 0:
            print("Trying to run a container job")
            assert runner_args.container is not None
            exec_filename = None
        elif len(other_args) == 1:
            print("Treating last argument as experiment key to rerun")
            rerun = True
            experiment_key = args[-1]
        else:
            print("Too many extra arguments - should be either none " +
                  "for container job or one for experiment re-run")
            sys.exit(1)
    else:
        script_index = py_suffix_args[0]
        exec_filename, other_args = args[script_index], \
            args[script_index + 1:]
        runner_args = parser.parse_args(args[:script_index])

    # TODO: Queue the job based on arguments and only then execute.

    config = model.get_config(runner_args.config)

    if runner_args.verbose:
        config['verbose'] = runner_args.verbose

    if runner_args.guest:
        config['database']['guest'] = True

    if runner_args.container:
        # Ship the container itself as an immutable artifact.
        runner_args.capture_once.append(
            runner_args.container + ':_singularity')

    verbose = model.parse_verbosity(config['verbose'])
    logger.setLevel(verbose)

    if git_util.is_git() and not git_util.is_clean() and not rerun:
        logger.warn('Running from dirty git repo')
        if not runner_args.force_git:
            logger.error(
                'Specify --force-git to run experiment from dirty git repo')
            sys.exit(1)

    resources_needed = parse_hardware(runner_args, config['resources_needed'])
    logger.debug('resources requested: ')
    logger.debug(str(resources_needed))

    artifacts = {}
    artifacts.update(parse_artifacts(runner_args.capture, mutable=True))
    artifacts.update(parse_artifacts(runner_args.capture_once, mutable=False))
    with model.get_db_provider(config) as db:
        artifacts.update(parse_external_artifacts(runner_args.reuse, db))

    if runner_args.branch:
        config['cloud']['branch'] = runner_args.branch

    if runner_args.user_startup_script:
        config['cloud']['user_startup_script'] = \
            runner_args.user_startup_script

    if runner_args.lifetime:
        config['experimentLifetime'] = runner_args.lifetime

    if any(runner_args.hyperparam):
        # FIX: was `runner_args.optimizer is "grid"` - identity comparison
        # against a string literal is implementation-dependent; use ==.
        if runner_args.optimizer == "grid":
            experiments = add_hyperparam_experiments(
                exec_filename, other_args, runner_args,
                artifacts, resources_needed, logger)
            queue_name = submit_experiments(
                experiments,
                config=config,
                logger=logger,
                queue_name=runner_args.queue,
                cloud=runner_args.cloud)
            spin_up_workers(
                runner_args,
                config,
                resources_needed,
                queue_name=queue_name,
                verbose=verbose)
        else:
            # Load the optimizer either from the bundled plugins directory
            # or from an arbitrary user-supplied path.
            opt_modulepath = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "optimizer_plugins",
                runner_args.optimizer + ".py")
            if not os.path.exists(opt_modulepath):
                opt_modulepath = os.path.abspath(
                    os.path.expanduser(runner_args.optimizer))
            logger.info('optimizer path: %s' % opt_modulepath)
            assert os.path.exists(opt_modulepath)
            sys.path.append(os.path.dirname(opt_modulepath))
            opt_module = importlib.import_module(
                os.path.basename(opt_modulepath.replace(".py", '')))

            h = HyperparameterParser(runner_args, logger)
            hyperparams = h.parse()
            optimizer = getattr(
                opt_module,
                "Optimizer")(hyperparams, config['optimizer'], logger)

            workers_started = False
            queue_name = runner_args.queue
            # Ask/tell loop: submit a generation, wait for fitnesses (and
            # optional behaviors), feed them back until termination.
            while not optimizer.stop():
                hyperparam_pop = optimizer.ask()
                hyperparam_tuples = h.convert_to_tuples(hyperparam_pop)
                experiments = add_hyperparam_experiments(
                    exec_filename, other_args, runner_args,
                    artifacts, resources_needed, logger,
                    optimizer=optimizer,
                    hyperparam_tuples=hyperparam_tuples)
                queue_name = submit_experiments(
                    experiments,
                    config=config,
                    logger=logger,
                    cloud=runner_args.cloud,
                    queue_name=queue_name)
                if not workers_started:
                    spin_up_workers(
                        runner_args,
                        config,
                        resources_needed,
                        queue_name=queue_name,
                        verbose=verbose)
                    workers_started = True
                fitnesses, behaviors = get_experiment_fitnesses(
                    experiments, optimizer, config, logger)
                # Older optimizers do not accept behaviors; fall back.
                try:
                    optimizer.tell(hyperparam_pop, fitnesses, behaviors)
                except BaseException:
                    optimizer.tell(hyperparam_pop, fitnesses)
                try:
                    optimizer.disp()
                except BaseException:
                    logger.warn('Optimizer has no disp() method')
    else:
        if rerun:
            with model.get_db_provider(config) as db:
                experiment = db.get_experiment(experiment_key)
                new_key = runner_args.experiment if runner_args.experiment \
                    else experiment_key + '_rerun' + str(uuid.uuid4())
                experiment.key = new_key
                # Freeze the copied artifacts so the rerun reproduces the
                # original inputs.
                for _, art in six.iteritems(experiment.artifacts):
                    art['mutable'] = False
                experiments = [experiment]
        else:
            experiments = [create_experiment(
                filename=exec_filename,
                args=other_args,
                experiment_name=runner_args.experiment,
                project=runner_args.project,
                artifacts=artifacts,
                resources_needed=resources_needed,
                metric=runner_args.metric,
                max_duration=runner_args.max_duration,
            )]
        queue_name = submit_experiments(
            experiments,
            config=config,
            logger=logger,
            cloud=runner_args.cloud,
            queue_name=runner_args.queue)
        spin_up_workers(
            runner_args,
            config,
            resources_needed,
            queue_name=queue_name,
            verbose=verbose)
    return
def _stop(args, cli_args):
    """Mark each experiment key listed in ``args`` as stopped.

    Opens the db provider configured via ``cli_args.config`` and issues a
    stop for every key, logging each one as it goes.
    """
    with model.get_db_provider(cli_args.config) as db:
        for experiment_key in args:
            get_logger().info('Stopping experiment ' + experiment_key)
            db.stop_experiment(experiment_key)
def get_experiment_fitnesses(experiments, optimizer, config, logger):
    """Poll the database until every experiment reports a fitness.

    Scans each experiment's log tail for lines of the form
    ``fitness: <float>`` and ``behavior: <python-literal>``, accumulating
    one fitness (and optionally one behavior array) per experiment.

    Parameters:
        experiments: list of experiment objects to wait on (only ``.key``
            is read here).
        optimizer: accepted for interface symmetry; not used in this body.
        config: dict; reads ``config['optimizer']['termination_criterion']``
            (``skip_gen_thres``, ``skip_gen_timeout``) and
            ``config['sleep_time']``.
        logger: logger used for progress and parse warnings.

    Returns:
        (fitnesses, behaviors): parallel lists, one entry per experiment.
        Fitness defaults to 0.0; behavior defaults to None.
    """
    with model.get_db_provider() as db:
        progbar = Progbar(len(experiments), interval=0.0)
        logger.info("Waiting for fitnesses from %s experiments" %
                    len(experiments))

        # Per-experiment bookkeeping; bad_line_dicts records log-line indices
        # already reported as malformed so each is warned about only once.
        bad_line_dicts = [dict() for x in range(len(experiments))]
        has_result = [False for i in range(len(experiments))]
        fitnesses = [0.0 for i in range(len(experiments))]
        behaviors = [None for i in range(len(experiments))]
        term_criterion = config['optimizer']['termination_criterion']
        skip_gen_thres = term_criterion['skip_gen_thres']
        skip_gen_timeout = term_criterion['skip_gen_timeout']
        # Timestamp of the most recent result; drives the skip-gen timeout.
        result_timestamp = time.time()

        while sum(has_result) < len(experiments):
            for i, experiment in enumerate(experiments):
                # Give up on stragglers: once the fraction of finished
                # experiments reaches skip_gen_thres and no new result has
                # arrived within skip_gen_timeout seconds, declare the whole
                # generation done.
                if float(sum(has_result)) / len(experiments) >= skip_gen_thres\
                        and time.time() - result_timestamp > skip_gen_timeout:
                    logger.warn(
                        "Skipping to next gen with %s of solutions evaled" %
                        (float(sum(has_result)) / len(experiments)))
                    has_result = [True] * len(experiments)
                    break
                if has_result[i]:
                    continue
                returned_experiment = db.get_experiment(experiment.key,
                                                        getinfo=True)
                # try:
                #     experiment_output = returned_experiment.info['logtail']
                # except BaseException:
                #     logger.warn('Cannot access "logtail" in experiment.info')
                # NOTE(review): private provider API — presumably returns the
                # tail of the experiment log as a list of lines, or None when
                # no log is available yet; confirm against the db provider.
                output = db._get_experiment_logtail(returned_experiment)
                if output is None:
                    continue

                for j, line in enumerate(output):

                    # Surface tracebacks from the experiment's output once.
                    if line.startswith(
                            "Traceback (most recent call last):") and \
                            j not in bad_line_dicts[i]:
                        logger.warn("Experiment %s: error"
                                    " discovered in output" %
                                    returned_experiment.key)
                        logger.warn("".join(output[j:]))
                        bad_line_dicts[i][j] = True

                    if line.startswith("Behavior") or \
                            line.startswith("behavior"):
                        try:
                            # SECURITY: eval() on text read from experiment
                            # logs — this executes arbitrary expressions from
                            # a source that may not be trusted. Consider
                            # ast.literal_eval instead.
                            behavior = eval(line.rstrip().split(':')[1])
                            if isinstance(behavior, np.ndarray):
                                pass
                            elif isinstance(behavior, list):
                                behavior = np.array(behavior)
                            else:
                                # Any other type is rejected; the bare raise
                                # re-uses the surrounding try/except to fall
                                # into the warning branch below.
                                raise

                        except BaseException:
                            if j not in bad_line_dicts[i]:
                                logger.warn(
                                    'Experiment %s: error parsing or invalid'
                                    ' behavior' %
                                    returned_experiment.key)
                                logger.warn(line)
                                bad_line_dicts[i][j] = True
                        else:
                            behaviors[i] = behavior

                    if line.startswith("Fitness") or \
                            line.startswith("fitness"):
                        try:
                            fitness = float(line.rstrip().split(':')[1])
                            # assert fitness >= 0.0
                        except BaseException:
                            if j not in bad_line_dicts[i]:
                                logger.warn(
                                    'Experiment %s: error parsing or invalid'
                                    ' fitness' % returned_experiment.key)
                                logger.warn(line)
                                bad_line_dicts[i][j] = True
                        else:
                            # Negative fitnesses are clamped to zero rather
                            # than rejected.
                            if fitness < 0.0:
                                logger.warn('Experiment %s: returned'
                                            ' fitness is less than zero,'
                                            ' setting it to zero' %
                                            returned_experiment.key)
                                fitness = 0.0

                            fitnesses[i] = fitness
                            has_result[i] = True
                            progbar.add(1)
                            result_timestamp = time.time()
                            # First valid fitness wins; stop scanning this
                            # experiment's log.
                            break

            time.sleep(config['sleep_time'])
        # Python 2 bare print: emits a newline after the progress bar.
        print
        return fitnesses, behaviors
def worker_loop(queue, parsed_args,
                setup_pyenv=False,
                single_experiment=False,
                fetch_artifacts=False,
                timeout=0):
    """Consume experiments from ``queue`` and run them locally until empty.

    For each queued message: parse the experiment key and config, check
    resource availability, fetch artifacts, optionally pip-install the
    experiment's python packages, then execute it via LocalExecutor.

    Parameters:
        queue: message queue providing ``has_next``/``dequeue``/``hold``/
            ``acknowledge``.
        parsed_args: runner args object; its ``config`` attribute is
            overwritten with the config embedded in each message.
        setup_pyenv: when True, ``pip install`` each entry of the
            experiment's ``pythonenv`` before running.
        single_experiment: when True, return after the first successfully
            scheduled experiment.
        fetch_artifacts: when True, (re-)download all artifacts even if a
            local copy is already recorded.
        timeout: passed to ``wait_for_messages`` between iterations.
    """
    logger = logging.getLogger('worker_loop')

    # Message visibility hold, in minutes (see queue.hold call below —
    # TODO confirm the unit against the queue implementation).
    hold_period = 4
    while queue.has_next():
        # Dequeue without acknowledging: the message is only acked after the
        # experiment has run (or failed) in the finally block below.
        first_exp, ack_key = queue.dequeue(acknowledge=False)

        experiment_key = json.loads(first_exp)['experiment']['key']
        config = json.loads(first_exp)['config']
        parsed_args.config = config
        verbose = model.parse_verbosity(config.get('verbose'))
        logger.setLevel(verbose)

        logger.debug(
            'Received experiment {} with config {} from the queue'.format(
                experiment_key, config))

        executor = LocalExecutor(parsed_args)
        with model.get_db_provider(config) as db:
            experiment = db.get_experiment(experiment_key)

            if allocate_resources(experiment, config, verbose=verbose):

                # Keep the un-acked message invisible to other workers while
                # this one runs, by re-holding it at half the hold period.
                def hold_job():
                    queue.hold(ack_key, hold_period)

                hold_job()
                sched = BackgroundScheduler()
                sched.add_job(hold_job, 'interval', minutes=hold_period / 2)
                sched.start()

                try:
                    if setup_pyenv:
                        logger.info(
                            'Setting up python packages for experiment')
                        for pkg in experiment.pythonenv:
                            pipp = subprocess.Popen(
                                ['pip', 'install', pkg],
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)
                            pipout, _ = pipp.communicate()
                            logger.info("pip output: \n" + pipout)

                    # Download artifacts that have no local copy yet (or all
                    # of them when fetch_artifacts is set). The workspace is
                    # fetched unconditionally fresh (only_newer=False).
                    for tag, art in experiment.artifacts.iteritems():
                        if fetch_artifacts or 'local' not in art.keys():
                            logger.info('Fetching artifact ' + tag)
                            if tag == 'workspace':
                                art['local'] = db.get_artifact(
                                    art, only_newer=False)
                            else:
                                art['local'] = db.get_artifact(art)
                    executor.run(experiment)
                finally:
                    # Always stop the hold-refresher and ack the message,
                    # even if the run raised.
                    sched.shutdown()
                    queue.acknowledge(ack_key)

                if single_experiment:
                    logger.info('single_experiment is True, quitting')
                    return
            else:
                # Resources unavailable: leave the message un-acked so it is
                # redelivered, and back off before polling again.
                logger.info('Cannot run experiment ' + experiment.key +
                            ' due lack of resources. Will retry')
                time.sleep(config['sleep_time'])
        # Block until the queue has more work or the timeout elapses.
        wait_for_messages(queue, timeout, logger)

        # queue = glob.glob(fs_tracker.get_queue_directory() + "/*")

    logger.info("Queue in {} is empty, quitting".format(
        fs_tracker.get_queue_directory()))