Ejemplo n.º 1
0
    def run(self, experiment):
        """Run *experiment* as a child python process and track its output.

        `experiment` may be an experiment key (string) or a
        model.Experiment instance; a key is resolved through self.db.

        NOTE(review): this variant uses Python 2-only names
        (`basestring`, `dict.iteritems`) and will fail on Python 3.
        """
        if isinstance(experiment, basestring):
            experiment = self.db.get_experiment(experiment)
        elif not isinstance(experiment, model.Experiment):
            raise ValueError("Unknown type of experiment: " +
                             str(type(experiment)))

        self.logger.info("Experiment key: " + experiment.key)

        with model.get_db_provider(self.config) as db:
            db.start_experiment(experiment)
            """ Override env variables with those inside the queued message
            """
            env = dict(os.environ)
            if 'env' in self.config.keys():
                for k, v in self.config['env'].iteritems():
                    if v is not None:
                        env[str(k)] = str(v)

            # Prepare the artifact cache for this experiment and resolve
            # where the child's combined stdout/stderr log will live.
            fs_tracker.setup_experiment(env, experiment, clean=True)
            log_path = fs_tracker.get_artifact_cache('output', experiment.key)

            # log_path = os.path.join(model_dir, self.config['log']['name'])

            self.logger.debug('Child process environment:')
            self.logger.debug(str(env))

            # Background scheduler drives periodic checkpointing and
            # stop-detection while the child process runs.
            sched = BackgroundScheduler()
            sched.start()

            with open(log_path, 'w') as output_file:
                p = subprocess.Popen(
                    ["python", experiment.filename] + experiment.args,
                    stdout=output_file,
                    stderr=subprocess.STDOUT,
                    env=env,
                    cwd=experiment.artifacts['workspace']['local'])
                # simple hack to show what's in the log file
                ptail = subprocess.Popen(["tail", "-f", log_path])

                # Periodically checkpoint the experiment (saves mutable
                # artifacts such as the workspace).
                sched.add_job(
                    lambda: db.checkpoint_experiment(experiment),
                    'interval',
                    minutes=self.config['saveWorkspaceFrequencyMinutes'])

                def kill_if_stopped():
                    # Poll the db every 10s; kill the child if the
                    # experiment was stopped externally (e.g. via the UI).
                    if db.get_experiment(experiment.key,
                                         getinfo=False).status == 'stopped':
                        p.kill()

                sched.add_job(kill_if_stopped, 'interval', seconds=10)

                try:
                    p.wait()
                finally:
                    # Always stop the tail process, mark the experiment
                    # finished, and shut the scheduler down.
                    ptail.kill()
                    db.finish_experiment(experiment)
                    sched.shutdown()
Ejemplo n.º 2
0
def add_experiment(args):
    """Register one experiment from an (config, python_pkg, experiment) tuple.

    Extends the experiment's python environment with the extra packages,
    persists it through the db provider, and returns the experiment.
    Any failure is printed with a traceback and re-raised.
    """
    try:
        config, extra_pkgs, experiment = args
        experiment.pythonenv = add_packages(experiment.pythonenv, extra_pkgs)
        with model.get_db_provider(config) as provider:
            provider.add_experiment(experiment)
    except BaseException:
        traceback.print_exc()
        raise
    return experiment
Ejemplo n.º 3
0
def get_db():
    """Return a cached db provider, recreating it once it has expired.

    The provider and its creation timestamp are stored in module globals;
    a new provider is built when none exists yet or when the cached one is
    older than DB_PROVIDER_EXPIRATION seconds.
    """
    global _db_provider
    global _db_provider_timestamp

    stale = (
        not _db_provider or
        not _db_provider_timestamp or
        time.time() - _db_provider_timestamp > DB_PROVIDER_EXPIRATION
    )
    if stale:
        _db_provider = model.get_db_provider(blocking_auth=False)
        _db_provider_timestamp = time.time()

    return _db_provider
Ejemplo n.º 4
0
def get_experiment_fitnesses(experiments, optimizer, config, logger):
    """Poll the database until every experiment reports a fitness value.

    Each experiment's log tail is scanned for a line starting with
    "Fitness" or "fitness"; the value after the first ':' is parsed as a
    non-negative float. Once more than ``skip_gen_thres`` of the
    experiments have results and ``skip_gen_timeout`` seconds have passed
    since the last new result, the remaining experiments are skipped
    (their fitness stays 0.0).

    Returns a list of fitness floats aligned with ``experiments``.
    """
    db_provider = model.get_db_provider()
    has_result = [False] * len(experiments)
    fitnesses = [0.0] * len(experiments)
    try:
        term_criterion = config['optimizer']['termination_criterion']
    except BaseException:
        # Fixed message: the original concatenation was missing a space
        # and printed "lookingin optimizer source code instead".
        logger.warn("Cannot find termination criterion in config.yaml, "
                    "looking in optimizer source code instead")
        term_criterion = optimizer.get_configs()['termination_criterion']

    skip_gen_thres = term_criterion['skip_gen_thres']
    skip_gen_timeout = term_criterion['skip_gen_timeout']

    result_timestamp = time.time()
    while sum(has_result) < len(experiments):
        for i, experiment in enumerate(experiments):
            frac_done = float(sum(has_result)) / len(experiments)
            if frac_done > skip_gen_thres and \
                    time.time() - result_timestamp > skip_gen_timeout:
                logger.warn(
                    "Skipping to next gen with %s of solutions evaled" %
                    frac_done)
                has_result = [True] * len(experiments)
                break
            if has_result[i]:
                continue
            returned_experiment = db_provider.get_experiment(experiment.key,
                                                             getinfo=True)
            output = db_provider._get_experiment_logtail(returned_experiment)

            for line in output:
                if line.startswith("Fitness") or line.startswith("fitness"):
                    try:
                        fitness = float(line.rstrip().split(':')[1])
                        # A negative fitness is treated as a parse error.
                        assert fitness >= 0.0
                    except BaseException:
                        logger.warn('Error parsing or invalid fitness (%s)'
                                    % line)
                    else:
                        fitnesses[i] = fitness
                        has_result[i] = True
                        result_timestamp = time.time()
                        break

        time.sleep(config['sleep_time'])
    return fitnesses
Ejemplo n.º 5
0
def _list(args, cli_args):
    """Print experiments (or users) according to the listing sub-command.

    args:
      []                -> current user's experiments
      ['project', name] -> experiments of the named project
      ['users']         -> print all user emails and return
      ['user', email]   -> experiments of the user with that email
      ['all']           -> experiments of every user

    With cli_args.short, experiment keys are printed one per line;
    otherwise a table sorted newest-first is printed.
    """
    with model.get_db_provider(cli_args.config) as db:
        if len(args) == 0:
            experiments = db.get_user_experiments()
        elif args[0] == 'project':
            assert len(args) == 2
            experiments = db.get_project_experiments(args[1])
        elif args[0] == 'users':
            assert len(args) == 1
            users = db.get_users()
            for u in users:
                print(users[u].get('email'))
            return
        elif args[0] == 'user':
            assert len(args) == 2
            users = db.get_users()
            user_ids = [u for u in users if users[u].get('email') == args[1]]
            # Fixed message: the original was missing the space before
            # "not found!" ("...email foo@barnot found!").
            assert len(user_ids) == 1, \
                'The user with email ' + args[1] + \
                ' not found!'
            experiments = db.get_user_experiments(user_ids[0])
        elif args[0] == 'all':
            assert len(args) == 1
            users = db.get_users()
            experiments = []
            for u in users:
                experiments += db.get_user_experiments(u)
        else:
            get_logger().critical('Unknown command ' + args[0])
            return

        if cli_args.short:
            for e in experiments:
                print(e)
            return

        experiments = [db.get_experiment(e) for e in experiments]

    # Newest experiments first.
    experiments.sort(key=lambda e: -e.time_added)
    table = [['Time added', 'Key', 'Project', 'Status']]

    for e in experiments:
        table.append([
            time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(e.time_added)),
            e.key,
            e.project,
            e.status])

    print(AsciiTable(table).table)
Ejemplo n.º 6
0
def main(args=sys.argv[1:]):
    """Entry point for the Studio WebUI Flask server.

    Parses CLI arguments, loads configuration (optionally overridden by a
    YAML file), sets up the module-level config and db provider globals,
    and starts the Flask app.
    """
    parser = argparse.ArgumentParser(description='Studio WebUI server. \
                     Usage: studio \
                     <arguments>')

    parser.add_argument('--config', help='configuration file', default=None)
    #    parser.add_argument('--guest',
    #                        help='Guest mode (does not require db credentials)',
    #                        action='store_true')

    parser.add_argument('--port',
                        help='port to run Flask server on',
                        type=int,
                        default=5000)

    parser.add_argument('--host', help='host name.', default='0.0.0.0')

    parser.add_argument('--verbose',
                        '-v',
                        help='Verbosity level. Allowed vaules: ' +
                        'debug, info, warn, error, crit ' +
                        'or numerical value of logger levels.',
                        default=None)

    args = parser.parse_args(args)
    config = model.get_config()
    if args.config:
        with open(args.config) as f:
            # safe_load: the config file needs only plain YAML, and
            # yaml.load without an explicit Loader can construct arbitrary
            # python objects (and is deprecated in newer PyYAML).
            config = yaml.safe_load(f)

    if args.verbose:
        config['verbose'] = args.verbose


#    if args.guest:
#        config['database']['guest'] = True
    global _config
    global _db_provider
    _config = config
    _db_provider = model.get_db_provider(_config, blocking_auth=False)

    getlogger().setLevel(model.parse_verbosity(config.get('verbose')))

    global _save_auth_cookie
    _save_auth_cookie = True

    print('Starting Studio UI on port {0}'.format(args.port))
    app.run(host=args.host, port=args.port)
Ejemplo n.º 7
0
    def __init__(self, args):
        """Build the executor config and db provider from CLI arguments.

        args.config may be a path to a YAML file or an already-parsed
        mapping; args.guest enables credential-free database access.

        NOTE(review): `basestring` is Python 2-only — this variant will
        raise NameError on Python 3.
        """
        self.config = model.get_config()
        if args.config:
            if isinstance(args.config, basestring):
                with open(args.config) as f:
                    # NOTE(review): yaml.load without an explicit Loader is
                    # unsafe on untrusted files; the config file is
                    # presumably trusted here — confirm.
                    self.config.update(yaml.load(f))
            else:
                self.config.update(args.config)

        if args.guest:
            # Guest mode: database access without credentials.
            self.config['database']['guest'] = True

        self.db = model.get_db_provider(self.config)
        self.logger = logging.getLogger('LocalExecutor')
        self.logger.setLevel(model.parse_verbosity(self.config.get('verbose')))
        self.logger.debug("Config: ")
        self.logger.debug(self.config)
Ejemplo n.º 8
0
def submit_experiments(
        experiments,
        config,
        runner_args,
        logger,
        resources_needed):
    """Persist experiments and enqueue them for local or cloud execution.

    Every experiment is added to the db first. Then, depending on
    runner_args.cloud:
      * gcloud/gcspot -> fresh pubsub queue + GCloudWorkerManager
      * ec2/ec2spot   -> fresh SQS queue + EC2WorkerManager
        (spot variants start spot workers with a bid; on-demand variants
        start fixed workers)
      * None          -> queue named by config/--queue; 'local' runs a
        studio-local-worker in-process.
    """
    db = model.get_db_provider(config)
    verbose = model.parse_verbosity(config['verbose'])

    # Queue name resolution: config 'queue' overrides the 'local' default,
    # and the --queue CLI argument overrides both.
    queue_name = 'local'
    if 'queue' in config.keys():
        queue_name = config['queue']
    if runner_args.queue:
        queue_name = runner_args.queue

    for e in experiments:
        e.pythonenv = add_packages(e.pythonenv, runner_args.python_pkg)
        db.add_experiment(e)
        logger.info("Added experiment " + e.key)

    if runner_args.cloud is not None:
        assert runner_args.cloud in ['gcloud', 'gcspot', 'ec2', 'ec2spot']

        assert runner_args.queue is None, \
            '--queue argument cannot be provided with --cloud argument'
        # Guest mode gets no auth cookie; otherwise point the workers at
        # the stored API-key token file.
        auth_cookie = None if config['database'].get('guest') \
            else os.path.join(
            auth.token_dir,
            config['database']['apiKey']
        )

        if runner_args.cloud in ['gcloud', 'gcspot']:

            # Fresh uniquely-named queue per submission.
            queue_name = 'pubsub_' + str(uuid.uuid4())

            queue = PubsubQueue(queue_name, verbose=verbose)
            worker_manager = GCloudWorkerManager(
                auth_cookie=auth_cookie,
                zone=config['cloud']['zone']
            )

        if runner_args.cloud in ['ec2', 'ec2spot']:

            # Fresh uniquely-named queue per submission.
            queue_name = 'sqs_' + str(uuid.uuid4())

            queue = SQSQueue(queue_name, verbose=verbose)
            worker_manager = EC2WorkerManager(
                auth_cookie=auth_cookie
            )

        if runner_args.cloud == 'gcloud' or \
           runner_args.cloud == 'ec2':

            # On-demand instances: start a fixed number of workers.
            num_workers = int(
                runner_args.num_workers) if runner_args.num_workers else 1
            for i in range(num_workers):
                worker_manager.start_worker(
                    queue_name, resources_needed,
                    ssh_keypair=runner_args.ssh_keypair,
                    timeout=runner_args.cloud_timeout)
        else:
            # Spot instances: a bid is required; with no explicit worker
            # count, start one worker and let the queue upscale.
            assert runner_args.bid is not None
            if runner_args.num_workers:
                start_workers = runner_args.num_workers
                queue_upscaling = False
            else:
                start_workers = 1
                queue_upscaling = True

            worker_manager.start_spot_workers(
                queue_name,
                runner_args.bid,
                resources_needed,
                start_workers=start_workers,
                queue_upscaling=queue_upscaling,
                ssh_keypair=runner_args.ssh_keypair,
                timeout=runner_args.cloud_timeout)

    else:
        # No cloud: pick the queue implementation from the name prefix.
        if queue_name == 'local':
            queue = LocalQueue()
            queue.clean()
        elif queue_name.startswith('sqs_'):
            queue = SQSQueue(queue_name, verbose=verbose)
        else:
            queue = PubsubQueue(
                queue_name,
                config['database']['projectId'],
                verbose=verbose)

    # Enqueue each experiment together with the full config so workers can
    # reconstruct the execution context.
    for e in experiments:
        queue.enqueue(json.dumps({
            'experiment': e.__dict__,
            'config': config}))

    if queue_name == 'local':
        # Run the worker in this process; only a single local worker is
        # supported.
        worker_args = ['studio-local-worker']

        if runner_args.config:
            worker_args += ['--config=' + runner_args.config]
        if runner_args.guest:
            worker_args += ['--guest']

        logger.info('worker args: {}'.format(worker_args))
        if not runner_args.num_workers or int(runner_args.num_workers) == 1:
            local_worker.main(worker_args)
        else:
            raise NotImplementedError("Multiple local workers are not " +
                                      "implemented yet")
    return
Ejemplo n.º 9
0
def main(args=sys.argv):
    """CLI entry point for ``studio run``.

    Treats the first ``*.py`` argument as the script to execute: arguments
    before it are runner arguments, arguments after it belong to the
    script. Builds one or more experiments (optionally driving a
    hyperparameter-optimizer ask/tell loop) and submits them via
    submit_experiments().
    """
    logger = logging.getLogger('studio-runner')
    parser = argparse.ArgumentParser(
        description='Studio runner. \
                     Usage: studio run <runner_arguments> \
                     script <script_arguments>')
    parser.add_argument('--config', help='configuration file', default=None)
    parser.add_argument('--project', help='name of the project', default=None)
    parser.add_argument(
        '--experiment', '-e',
        help='Name of the experiment. If none provided, ' +
             'random uuid will be generated',
        default=None)

    parser.add_argument(
        '--guest',
        help='Guest mode (does not require db credentials)',
        action='store_true')

    parser.add_argument(
        '--force-git',
        help='If run in a git directory, force running the experiment ' +
             'even if changes are not commited',
        action='store_true')

    parser.add_argument(
        '--gpus',
        help='Number of gpus needed to run the experiment',
        default=None)

    parser.add_argument(
        '--cpus',
        help='Number of cpus needed to run the experiment' +
             ' (used to configure cloud instance)',
        default=None)

    parser.add_argument(
        '--ram',
        help='Amount of RAM needed to run the experiment' +
             ' (used to configure cloud instance)',
        default=None)

    parser.add_argument(
        '--hdd',
        help='Amount of hard drive space needed to run the experiment' +
             ' (used to configure cloud instance)',
        default=None)

    parser.add_argument(
        '--queue', '-q',
        help='Name of the remote execution queue',
        default=None)

    parser.add_argument(
        '--cloud',
        help='Cloud execution mode. Could be gcloud, ec2 or ec2spot',
        default=None)

    parser.add_argument(
        '--bid',
        help='Spot instance price bid, specified in USD or in percentage ' +
             'of on-demand instance price. Default is %(default)s',
        default='100%')

    parser.add_argument(
        '--capture-once', '-co',
        help='Name of the immutable artifact to be captured. ' +
        'It will be captured once before the experiment is run',
        default=[], action='append')

    parser.add_argument(
        '--capture', '-c',
        help='Name of the mutable artifact to be captured continuously',
        default=[], action='append')

    parser.add_argument(
        '--reuse', '-r',
        help='Name of the artifact from another experiment to use',
        default=[], action='append')

    parser.add_argument(
        '--verbose', '-v',
        help='Verbosity level. Allowed values: ' +
             'debug, info, warn, error, crit ' +
             'or numerical value of logger levels.',
        default=None)

    parser.add_argument(
        '--metric', '-m',
        help='Metric to show in the summary of the experiment, ' +
             'and to base hyperparameter search on. ' +
             'Refers a scalar value in tensorboard log ' +
             'example: --metric=val_loss[:final | :min | :max] to report ' +
             'validation loss in the end of the keras experiment ' +
             '(or smallest or largest throughout the experiment for :min ' +
             'and :max respectively)',
        default=None)

    parser.add_argument(
        '--hyperparam', '-hp',
        help='Try out multiple values of a certain parameter. ' +
             'For example, --hyperparam=learning_rate:0.01:0.1:l10 ' +
             'will instantiate 10 versions of the script, replace ' +
             'learning_rate with a one of the 10 values for learning ' +
             'rate that lies on a log grid from 0.01 to 0.1, create '
             'experiments and place them in the queue.',
             default=[], action='append')

    parser.add_argument(
        '--num-workers',
        help='Number of local or cloud workers to spin up',
        default=None)

    parser.add_argument(
        '--python-pkg',
        help='Python package not present in the current environment ' +
             'that is needed for experiment. Only compatible with ' +
             'remote and cloud workers for now',
        default=[], action='append')

    parser.add_argument(
        '--ssh-keypair',
        help='Name of the SSH keypair used to access the EC2 ' +
             'instances directly',
        default=None)

    parser.add_argument(
        '--optimizer', '-opt',
        help='Name of optimizer to use, by default is grid search. ' +
        'The name of the optimizer must either be in ' +
        'studio/optimizer_plugins ' +
        'directory or the path to the optimizer source file ' +
        'must be supplied. ',
        default='grid')

    parser.add_argument(
        '--cloud-timeout',
        help="Time (in seconds) that cloud workers wait for messages. " +
             "If negative, " +
             "wait for the first message in the queue indefinitely " +
             "and shut down " +
             "as soon as no new messages are available. " +
             "If zero, don't wait at all." +
             "Default value is %(default)",
        type=int,
        default=300)

    # detect which argument is the script filename
    # and attribute all arguments past that index as related to the script
    py_suffix_args = [i for i, arg in enumerate(args) if arg.endswith('.py')]
    if len(py_suffix_args) < 1:
        print('At least one argument should be a python script ' +
              '(end with *.py)')
        parser.print_help()
        exit()

    script_index = py_suffix_args[0]
    runner_args = parser.parse_args(args[1:script_index])

    exec_filename, other_args = args[script_index], args[script_index + 1:]
    # TODO: Queue the job based on arguments and only then execute.

    config = model.get_config(runner_args.config)

    if runner_args.verbose:
        config['verbose'] = runner_args.verbose

    verbose = model.parse_verbosity(config['verbose'])
    logger.setLevel(verbose)

    db = model.get_db_provider(config)

    if git_util.is_git() and not git_util.is_clean():
        logger.warn('Running from dirty git repo')
        if not runner_args.force_git:
            logger.error(
                'Specify --force-git to run experiment from dirty git repo')
            sys.exit(1)

    resources_needed = parse_hardware(runner_args, config['cloud'])
    logger.debug('resources requested: ')
    logger.debug(str(resources_needed))

    artifacts = {}
    artifacts.update(parse_artifacts(runner_args.capture, mutable=True))
    artifacts.update(parse_artifacts(runner_args.capture_once, mutable=False))
    artifacts.update(parse_external_artifacts(runner_args.reuse, db))

    if any(runner_args.hyperparam):
        # Fixed: was `runner_args.optimizer is "grid"` — identity
        # comparison against a string literal is implementation-dependent
        # and can silently take the wrong branch; use equality.
        if runner_args.optimizer == "grid":
            experiments = add_hyperparam_experiments(
                exec_filename,
                other_args,
                runner_args,
                artifacts,
                resources_needed)
            submit_experiments(
                experiments,
                config,
                runner_args,
                logger,
                resources_needed)
        else:
            # Resolve the optimizer module: first look in the bundled
            # optimizer_plugins directory, then treat the argument as a
            # path to a source file.
            opt_modulepath = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "optimizer_plugins",
                runner_args.optimizer + ".py")
            # logger.info('optimizer path: %s' % opt_modulepath)
            if not os.path.exists(opt_modulepath):
                opt_modulepath = os.path.abspath(
                    os.path.expanduser(runner_args.optimizer))
            logger.info('optimizer path: %s' % opt_modulepath)
            assert os.path.exists(opt_modulepath)
            sys.path.append(os.path.dirname(opt_modulepath))
            opt_module = importlib.import_module(
                os.path.basename(opt_modulepath.replace(".py", '')))

            hyperparam_values, log_scale_dict = get_hyperparam_values(
                runner_args)

            optimizer = getattr(opt_module, "Optimizer")(hyperparam_values,
                                                         log_scale_dict)

            # Ask/tell loop: generate candidate hyperparameter tuples,
            # run them, feed fitnesses back until the optimizer converges.
            while not optimizer.stop():
                hyperparam_tuples = optimizer.ask()

                experiments = add_hyperparam_experiments(
                    exec_filename,
                    other_args,
                    runner_args,
                    artifacts,
                    resources_needed,
                    optimizer=optimizer,
                    hyperparam_tuples=hyperparam_tuples)
                submit_experiments(
                    experiments,
                    config,
                    runner_args,
                    logger,
                    resources_needed)

                fitnesses = get_experiment_fitnesses(experiments,
                                                     optimizer, config, logger)

                optimizer.tell(hyperparam_tuples, fitnesses)
                # if config['verbose'] == "info" or config['verbose'] ==
                # "debug":
                try:
                    optimizer.disp()
                except BaseException:
                    logger.warn('Optimizer has no disp() method')
    else:
        # No hyperparameter sweep: a single experiment.
        experiments = [model.create_experiment(
            filename=exec_filename,
            args=other_args,
            experiment_name=runner_args.experiment,
            project=runner_args.project,
            artifacts=artifacts,
            resources_needed=resources_needed,
            metric=runner_args.metric)]
        submit_experiments(
            experiments,
            config,
            runner_args,
            logger,
            resources_needed)

    db = None
    return
Ejemplo n.º 10
0
def _kill(args, cli_args):
    """Delete every experiment whose key is listed in *args*."""
    with model.get_db_provider(cli_args.config) as db:
        for experiment_key in args:
            get_logger().info('Deleting experiment ' + experiment_key)
            db.delete_experiment(experiment_key)
Ejemplo n.º 11
0
    def run(self, experiment):
        """Run *experiment* as a child process and return its exit code.

        `experiment` may be an experiment key (string) or an Experiment
        instance; a key is resolved through self.db. If the experiment
        carries a '_singularity' artifact, the command is wrapped in a
        singularity exec/run invocation. While the child runs, background
        jobs checkpoint the workspace, save metrics, and kill the child if
        the experiment is stopped externally or exceeds max_duration.
        """
        if isinstance(experiment, six.string_types):
            experiment = self.db.get_experiment(experiment)
        elif not isinstance(experiment, Experiment):
            raise ValueError("Unknown type of experiment: " +
                             str(type(experiment)))

        self.logger.info("Experiment key: " + experiment.key)

        with model.get_db_provider(self.config) as db:
            db.start_experiment(experiment)
            """ Override env variables with those inside the queued message
            """
            env = dict(os.environ)
            if 'env' in self.config.keys():
                for k, v in six.iteritems(self.config['env']):
                    if v is not None:
                        env[str(k)] = str(v)

            # Unbuffered output so the log tail below sees lines promptly.
            env['PYTHONUNBUFFERED'] = 'TRUE'

            fs_tracker.setup_experiment(env, experiment, clean=False)
            log_path = fs_tracker.get_artifact_cache('output', experiment.key)

            # log_path = os.path.join(model_dir, self.config['log']['name'])

            self.logger.debug('Child process environment:')
            self.logger.debug(str(env))

            # Scheduler drives the periodic checkpoint/metrics/stop jobs.
            sched = BackgroundScheduler()
            sched.start()

            with open(log_path, 'w') as output_file:
                # Pick the interpreter matching the experiment's python
                # version.
                python = 'python'
                if experiment.pythonver == 3:
                    python = 'python3'

                cmd = [python, experiment.filename] + experiment.args
                cwd = experiment.artifacts['workspace']['local']
                container_artifact = experiment.artifacts.get('_singularity')
                if container_artifact:
                    # Containerized run: prefer a local image, fall back to
                    # the qualified (remote) reference.
                    container = container_artifact.get('local')
                    if not container:
                        container = container_artifact.get('qualified')

                    cwd = fs_tracker.get_artifact_cache(
                        'workspace', experiment.key)

                    # Symlink immutable artifacts next to the workspace so
                    # the containerized process can reach them by tag.
                    for tag, art in six.iteritems(experiment.artifacts):
                        local_path = art.get('local')
                        if not art['mutable'] and os.path.exists(local_path):
                            os.symlink(art['local'],
                                       os.path.join(os.path.dirname(cwd), tag))

                    if experiment.filename is not None:
                        cmd = [
                            'singularity',
                            'exec',
                            container,
                        ] + cmd
                    else:
                        cmd = ['singularity', 'run', container]

                self.logger.info('Running cmd: \n {} '.format(cmd))

                p = subprocess.Popen(cmd,
                                     stdout=output_file,
                                     stderr=subprocess.STDOUT,
                                     env=env,
                                     cwd=cwd)
                # simple hack to show what's in the log file
                # ptail = subprocess.Popen(["tail", "-f", log_path])

                # Tail the log in a thread until logtail is cleared to
                # None in the finally block below.
                logtail = Pygtail(log_path)

                def tail_func():
                    while logtail:
                        for line in logtail:
                            print(line)

                        time.sleep(0.1)

                tail_thread = threading.Thread(target=tail_func)
                tail_thread.start()

                # NOTE(review): minutes=0 (when the frequency option is
                # absent) is passed straight to add_job — presumably
                # APScheduler tolerates it; confirm.
                minutes = 0
                if self.config.get('saveWorkspaceFrequency'):
                    minutes = int(
                        str2duration(self.config['saveWorkspaceFrequency']).
                        total_seconds() / 60)

                def checkpoint():
                    # Best-effort checkpoint; failures are logged, not
                    # fatal.
                    try:
                        db.checkpoint_experiment(experiment)
                    except BaseException as e:
                        self.logger.info(e)

                sched.add_job(checkpoint, 'interval', minutes=minutes)

                metrics_path = fs_tracker.get_artifact_cache(
                    '_metrics', experiment.key)

                minutes = 0
                if self.config.get('saveMetricsFrequency'):
                    minutes = int(
                        str2duration(self.config['saveMetricsFrequency']).
                        total_seconds() / 60)

                sched.add_job(lambda: save_metrics(metrics_path),
                              'interval',
                              minutes=minutes)

                def kill_if_stopped():
                    # Kill the child if the experiment was stopped
                    # externally...
                    if db.get_experiment(experiment.key,
                                         getinfo=False).status == 'stopped':
                        p.kill()

                    # ...or if it has outlived its max_duration budget.
                    if experiment.max_duration is not None and \
                            time.time() > experiment.time_started + \
                            int(str2duration(experiment.max_duration)
                                .total_seconds()):

                        p.kill()

                sched.add_job(kill_if_stopped, 'interval', seconds=10)

                try:
                    p.wait()
                finally:
                    # Final metrics/checkpoint flush and teardown; setting
                    # logtail to None stops the tail thread's loop.
                    # NOTE(review): `return` inside `finally` swallows any
                    # in-flight exception from the try block.
                    save_metrics(metrics_path)
                    sched.shutdown()
                    logtail = None
                    db.checkpoint_experiment(experiment)
                    db.finish_experiment(experiment)
                    return p.returncode
Ejemplo n.º 12
0
def worker_loop(queue,
                parsed_args,
                single_experiment=False,
                timeout=0,
                verbose=None):
    """Consume experiment messages from *queue* and execute them.

    Each message carries an experiment key plus a full config. The loop
    fetches the experiment from the db, skips it if its configured
    lifetime has expired, and — when resources allow — installs missing
    pip packages, downloads artifacts, and runs it via LocalExecutor
    while periodically extending the queue hold on the message.

    Returns the last nonzero executor return code, or 0.
    """

    fetch_artifacts = True

    logger = logs.getLogger('worker_loop')

    # Queue messages are held for hold_period minutes and re-held at half
    # that interval while an experiment is running.
    hold_period = 4
    retval = 0
    while True:
        msg = queue.dequeue(acknowledge=False, timeout=timeout)
        if not msg:
            break

        # first_exp, ack_key = queue.dequeue(acknowledge=False)
        first_exp, ack_key = msg

        data_dict = json.loads(sixdecode(first_exp))
        experiment_key = data_dict['experiment']['key']
        config = data_dict['config']

        parsed_args.config = config
        if verbose:
            config['verbose'] = verbose
        else:
            verbose = model.parse_verbosity(config.get('verbose'))

        logger.setLevel(verbose)

        logger.debug('Received message: \n{}'.format(data_dict))

        executor = LocalExecutor(parsed_args)

        with model.get_db_provider(config) as db:
            # experiment = experiment_from_dict(data_dict['experiment'])
            def try_get_experiment():
                # Storage may lag behind the queue; treat a missing
                # experiment as a retryable failure.
                experiment = db.get_experiment(experiment_key)
                if experiment is None:
                    raise ValueError(
                        'experiment is not found - indicates storage failure')
                return experiment

            experiment = retry(try_get_experiment,
                               sleep_time=10,
                               logger=logger)

            # Drop (and acknowledge) experiments older than the configured
            # lifetime instead of running them.
            if config.get('experimentLifetime') and \
                int(str2duration(config['experimentLifetime'])
                    .total_seconds()) + experiment.time_added < time.time():
                logger.info(
                    'Experiment expired (max lifetime of {} was exceeded)'.
                    format(config.get('experimentLifetime')))
                queue.acknowledge(ack_key)
                continue

            if allocate_resources(experiment, config, verbose=verbose):

                def hold_job():
                    queue.hold(ack_key, hold_period)

                # Hold immediately, then keep re-holding on a schedule so
                # the message is not redelivered mid-run.
                hold_job()
                sched = BackgroundScheduler()
                sched.add_job(hold_job, 'interval', minutes=hold_period / 2)
                sched.start()

                try:
                    python = 'python'
                    if experiment.pythonver == 3:
                        python = 'python3'
                    # Containerized experiments bring their own
                    # environment; only bare experiments need pip setup.
                    if '_singularity' not in experiment.artifacts.keys():
                        pip_diff = pip_needed_packages(experiment.pythonenv,
                                                       python)
                        if any(pip_diff):
                            logger.info(
                                'Setting up python packages for experiment')
                            if pip_install_packages(pip_diff, python,
                                                    logger) != 0:

                                # Bulk install failed; fall back to
                                # installing one package at a time.
                                logger.info(
                                    "Installation of all packages together " +
                                    " failed, "
                                    "trying one package at a time")

                                for pkg in pip_diff:
                                    pip_install_packages([pkg], python, logger)

                    for tag, art in six.iteritems(experiment.artifacts):
                        if fetch_artifacts or 'local' not in art.keys():
                            logger.info('Fetching artifact ' + tag)
                            if tag == 'workspace':
                                # Always refetch the workspace in full.
                                art['local'] = retry(lambda: db.get_artifact(
                                    art, only_newer=False),
                                                     sleep_time=10,
                                                     logger=logger)
                            else:
                                art['local'] = retry(
                                    lambda: db.get_artifact(art),
                                    sleep_time=10,
                                    logger=logger)

                    returncode = executor.run(experiment)
                    if returncode != 0:
                        retval = returncode
                finally:
                    # Stop re-holding and acknowledge regardless of
                    # success or failure.
                    sched.shutdown()
                    queue.acknowledge(ack_key)

                if single_experiment:
                    logger.info('single_experiment is True, quitting')
                    return retval
            else:
                # Not enough resources right now; leave the message
                # unacknowledged so it is retried later.
                logger.info('Cannot run experiment ' + experiment.key +
                            ' due lack of resources. Will retry')
                time.sleep(config['sleep_time'])

        # wait_for_messages(queue, timeout, logger)

        # queue = glob.glob(fs_tracker.get_queue_directory() + "/*")

    logger.info("Queue in {} is empty, quitting".format(
        fs_tracker.get_queue_directory()))

    return retval
Ejemplo n.º 13
0
def get_db():
    """Return the module-wide database provider, creating it on first use."""
    global _db_provider
    # Fast path: provider already constructed by a previous call.
    if _db_provider:
        return _db_provider
    _db_provider = model.get_db_provider(blocking_auth=False)
    return _db_provider
Ejemplo n.º 14
0
def main(args=None):
    """Entry point for ``studio run``.

    Parses runner arguments, builds one or more experiments (optionally a
    hyperparameter sweep driven by a grid search or an optimizer plugin),
    submits them to an execution queue and spins up local or cloud workers.

    Args:
        args: list of command-line arguments. Defaults to ``sys.argv[1:]``,
            resolved at call time (the original ``args=sys.argv[1:]``
            default was evaluated once at import time, freezing argv).
    """
    if args is None:
        args = sys.argv[1:]

    logger = logs.getLogger('studio-runner')
    parser = argparse.ArgumentParser(
        description='Studio runner. \
                     Usage: studio run <runner_arguments> \
                     script <script_arguments>')
    parser.add_argument('--config', help='configuration file', default=None)
    parser.add_argument('--project', help='name of the project', default=None)
    parser.add_argument(
        '--experiment', '-e',
        help='Name of the experiment. If none provided, ' +
             'random uuid will be generated',
        default=None)

    parser.add_argument(
        '--guest',
        help='Guest mode (does not require db credentials)',
        action='store_true')

    parser.add_argument(
        '--force-git',
        help='If run in a git directory, force running the experiment ' +
             'even if changes are not commited',
        action='store_true')

    parser.add_argument(
        '--gpus',
        help='Number of gpus needed to run the experiment',
        type=int,
        default=None)

    parser.add_argument(
        '--cpus',
        help='Number of cpus needed to run the experiment' +
             ' (used to configure cloud instance)',
        type=int,
        default=None)

    parser.add_argument(
        '--ram',
        help='Amount of RAM needed to run the experiment' +
             ' (used to configure cloud instance), ex: 10G, 10GB',
        default=None)

    parser.add_argument(
        '--gpuMem',
        help='Amount of GPU RAM needed to run the experiment',
        default=None)

    parser.add_argument(
        '--hdd',
        help='Amount of hard drive space needed to run the experiment' +
             ' (used to configure cloud instance), ex: 10G, 10GB',
        default=None)

    parser.add_argument(
        '--queue', '-q',
        help='Name of the remote execution queue',
        default=None)

    parser.add_argument(
        '--cloud',
        help='Cloud execution mode. Could be gcloud, gcspot, ec2 or ec2spot',
        default=None)

    parser.add_argument(
        '--bid',
        help='Spot instance price bid, specified in USD or in percentage ' +
             'of on-demand instance price. Default is %(default)s',
        default='100%')

    parser.add_argument(
        '--capture-once', '-co',
        help='Name of the immutable artifact to be captured. ' +
        'It will be captured once before the experiment is run',
        default=[], action='append')

    parser.add_argument(
        '--capture', '-c',
        help='Name of the mutable artifact to be captured continuously',
        default=[], action='append')

    parser.add_argument(
        '--reuse', '-r',
        help='Name of the artifact from another experiment to use',
        default=[], action='append')

    parser.add_argument(
        '--verbose', '-v',
        help='Verbosity level. Allowed values: ' +
             'debug, info, warn, error, crit ' +
             'or numerical value of logger levels.',
        default=None)

    parser.add_argument(
        '--metric',
        help='Metric to show in the summary of the experiment, ' +
             'and to base hyperparameter search on. ' +
             'Refers a scalar value in tensorboard log ' +
             'example: --metric=val_loss[:final | :min | :max] to report ' +
             'validation loss in the end of the keras experiment ' +
             '(or smallest or largest throughout the experiment for :min ' +
             'and :max respectively)',
        default=None)

    parser.add_argument(
        '--hyperparam', '-hp',
        help='Try out multiple values of a certain parameter. ' +
             'For example, --hyperparam=learning_rate:0.01:0.1:l10 ' +
             'will instantiate 10 versions of the script, replace ' +
             'learning_rate with a one of the 10 values for learning ' +
             'rate that lies on a log grid from 0.01 to 0.1, create '
             'experiments and place them in the queue.',
             default=[], action='append')

    parser.add_argument(
        '--num-workers',
        help='Number of local or cloud workers to spin up',
        type=int,
        default=None)

    parser.add_argument(
        '--python-pkg',
        help='Python package not present in the current environment ' +
             'that is needed for experiment. Only compatible with ' +
             'remote and cloud workers for now',
        default=[], action='append')

    parser.add_argument(
        '--ssh-keypair',
        help='Name of the SSH keypair used to access the EC2 ' +
             'instances directly',
        default=None)

    parser.add_argument(
        '--optimizer', '-opt',
        help='Name of optimizer to use, by default is grid search. ' +
        'The name of the optimizer must either be in ' +
        'studio/optimizer_plugins ' +
        'directory or the path to the optimizer source file ' +
        'must be supplied. ',
        default='grid')

    parser.add_argument(
        '--cloud-timeout',
        help="Time (in seconds) that cloud workers wait for messages. " +
             "If negative, " +
             "wait for the first message in the queue indefinitely " +
             "and shut down " +
             "as soon as no new messages are available. " +
             "If zero, don't wait at all." +
             "Default value is %(default)d",
        type=int,
        default=300)

    parser.add_argument(
        '--user-startup-script',
        help='Path of script to run immediately ' +
             'before running the remote worker',
        default=None)

    parser.add_argument(
        '--branch',
        help='Branch of studioml to use when running remote worker, useful ' +
             'for debugging pull requests. Default is current',
        default=None)

    parser.add_argument(
        '--max-duration',
        help='Max experiment runtime (i.e. time after which experiment ' +
             'should be killed no matter what.).  Examples of values ' +
             'might include 5h, 48h2m10s',
        default=None)

    parser.add_argument(
        '--lifetime',
        help='Max experiment lifetime (i.e. wait time after which ' +
             'experiment loses relevance and should not be started)' +
             '  Examples include 240h30m10s',
        default=None)

    parser.add_argument(
        '--container',
        help='Singularity container in which experiment should be run. ' +
             'Assumes that container has all dependencies installed',
        default=None
    )

    parser.add_argument(
        '--port',
        help='Ports to open on a cloud instance',
        default=[], action='append'
    )

    # detect which argument is the script filename
    # and attribute all arguments past that index as related to the script
    (runner_args, other_args) = parser.parse_known_args(args)
    py_suffix_args = [i for i, arg in enumerate(args) if arg.endswith('.py')
                      or '::' in arg]

    rerun = False
    if len(py_suffix_args) < 1:
        # No script among the arguments: either a container job (no extra
        # args) or a re-run of an existing experiment (one extra arg).
        print('None of the arguments end with .py')
        if len(other_args) == 0:
            print("Trying to run a container job")
            assert runner_args.container is not None
            exec_filename = None
        elif len(other_args) == 1:
            print("Treating last argument as experiment key to rerun")
            rerun = True
            experiment_key = args[-1]
        else:
            print("Too many extra arguments - should be either none " +
                  "for container job or one for experiment re-run")
            sys.exit(1)
    else:
        # Everything after the script filename belongs to the script itself.
        script_index = py_suffix_args[0]
        exec_filename, other_args = args[script_index], args[script_index + 1:]
        runner_args = parser.parse_args(args[:script_index])

    # TODO: Queue the job based on arguments and only then execute.

    config = model.get_config(runner_args.config)

    if runner_args.verbose:
        config['verbose'] = runner_args.verbose

    if runner_args.guest:
        config['database']['guest'] = True

    if runner_args.container:
        # Ship the container itself as an immutable artifact.
        runner_args.capture_once.append(
            runner_args.container + ':_singularity')

    verbose = model.parse_verbosity(config['verbose'])
    logger.setLevel(verbose)

    if git_util.is_git() and not git_util.is_clean() and not rerun:
        logger.warn('Running from dirty git repo')
        if not runner_args.force_git:
            logger.error(
                'Specify --force-git to run experiment from dirty git repo')
            sys.exit(1)

    resources_needed = parse_hardware(runner_args, config['resources_needed'])
    logger.debug('resources requested: ')
    logger.debug(str(resources_needed))

    artifacts = {}
    artifacts.update(parse_artifacts(runner_args.capture, mutable=True))
    artifacts.update(parse_artifacts(runner_args.capture_once, mutable=False))
    with model.get_db_provider(config) as db:
        artifacts.update(parse_external_artifacts(runner_args.reuse, db))

    if runner_args.branch:
        config['cloud']['branch'] = runner_args.branch

    if runner_args.user_startup_script:
        config['cloud']['user_startup_script'] = \
            runner_args.user_startup_script

    if runner_args.lifetime:
        config['experimentLifetime'] = runner_args.lifetime

    if any(runner_args.hyperparam):
        # Equality (not identity) comparison: the original `is "grid"`
        # only worked via CPython small-string interning.
        if runner_args.optimizer == "grid":
            experiments = add_hyperparam_experiments(
                exec_filename,
                other_args,
                runner_args,
                artifacts,
                resources_needed,
                logger)

            queue_name = submit_experiments(
                experiments,
                config=config,
                logger=logger,
                queue_name=runner_args.queue,
                cloud=runner_args.cloud)

            spin_up_workers(
                runner_args,
                config,
                resources_needed,
                queue_name=queue_name,
                verbose=verbose)
        else:
            # Resolve the optimizer either from the bundled plugin
            # directory or from a user-supplied path.
            opt_modulepath = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "optimizer_plugins",
                runner_args.optimizer + ".py")
            if not os.path.exists(opt_modulepath):
                opt_modulepath = os.path.abspath(
                    os.path.expanduser(runner_args.optimizer))
            logger.info('optimizer path: %s' % opt_modulepath)

            assert os.path.exists(opt_modulepath)
            sys.path.append(os.path.dirname(opt_modulepath))
            opt_module = importlib.import_module(
                os.path.basename(opt_modulepath.replace(".py", '')))

            h = HyperparameterParser(runner_args, logger)
            hyperparams = h.parse()
            optimizer = getattr(
                opt_module,
                "Optimizer")(
                hyperparams,
                config['optimizer'],
                logger)

            # Ask/tell loop: generate candidate hyperparameters, run them
            # as experiments, feed fitnesses back to the optimizer.
            workers_started = False
            queue_name = runner_args.queue
            while not optimizer.stop():
                hyperparam_pop = optimizer.ask()
                hyperparam_tuples = h.convert_to_tuples(hyperparam_pop)

                experiments = add_hyperparam_experiments(
                    exec_filename,
                    other_args,
                    runner_args,
                    artifacts,
                    resources_needed,
                    logger,
                    optimizer=optimizer,
                    hyperparam_tuples=hyperparam_tuples)

                queue_name = submit_experiments(
                    experiments,
                    config=config,
                    logger=logger,
                    cloud=runner_args.cloud,
                    queue_name=queue_name)

                if not workers_started:
                    spin_up_workers(
                        runner_args,
                        config,
                        resources_needed,
                        queue_name=queue_name,
                        verbose=verbose)
                    workers_started = True

                fitnesses, behaviors = get_experiment_fitnesses(
                    experiments, optimizer, config, logger)

                # Optimizers may or may not support behaviors / disp();
                # fall back gracefully for older plugin interfaces.
                try:
                    optimizer.tell(hyperparam_pop, fitnesses, behaviors)
                except BaseException:
                    optimizer.tell(hyperparam_pop, fitnesses)

                try:
                    optimizer.disp()
                except BaseException:
                    logger.warn('Optimizer has no disp() method')
    else:
        if rerun:
            with model.get_db_provider(config) as db:
                experiment = db.get_experiment(experiment_key)
                new_key = runner_args.experiment if runner_args.experiment \
                    else experiment_key + '_rerun' + str(uuid.uuid4())
                experiment.key = new_key
                # Freeze artifacts so the re-run sees the same inputs.
                for _, art in six.iteritems(experiment.artifacts):
                    art['mutable'] = False

                experiments = [experiment]

        else:
            experiments = [create_experiment(
                filename=exec_filename,
                args=other_args,
                experiment_name=runner_args.experiment,
                project=runner_args.project,
                artifacts=artifacts,
                resources_needed=resources_needed,
                metric=runner_args.metric,
                max_duration=runner_args.max_duration,
            )]

        queue_name = submit_experiments(
            experiments,
            config=config,
            logger=logger,
            cloud=runner_args.cloud,
            queue_name=runner_args.queue)

        spin_up_workers(
            runner_args,
            config,
            resources_needed,
            queue_name=queue_name,
            verbose=verbose)

    return
Ejemplo n.º 15
0
def _stop(args, cli_args):
    """Mark every experiment key listed in ``args`` as stopped in the db."""
    with model.get_db_provider(cli_args.config) as db:
        for experiment_key in args:
            get_logger().info('Stopping experiment ' + experiment_key)
            db.stop_experiment(experiment_key)
Ejemplo n.º 16
0
def get_experiment_fitnesses(experiments, optimizer, config, logger):
    """Poll the database until every experiment reports a fitness value.

    Scans each experiment's log tail for lines of the form
    ``Fitness: <float>`` (and optionally ``Behavior: <list>``). Once a
    configured fraction of experiments has reported and no new result
    arrives within a timeout, the remaining stragglers are skipped.

    Args:
        experiments: list of experiment objects to wait on.
        optimizer: optimizer instance (unused here; kept for interface
            compatibility with callers).
        config: runner config dict; reads ``optimizer.termination_criterion``
            and ``sleep_time``.
        logger: logger for progress and parse-error reporting.

    Returns:
        tuple: (fitnesses, behaviors) lists aligned with ``experiments``.
    """
    with model.get_db_provider() as db:
        progbar = Progbar(len(experiments), interval=0.0)
        logger.info("Waiting for fitnesses from %s experiments" %
                    len(experiments))

        # Per-experiment record of log lines already reported as bad, so
        # each parse error is only logged once.
        bad_line_dicts = [dict() for _ in range(len(experiments))]
        has_result = [False] * len(experiments)
        fitnesses = [0.0] * len(experiments)
        behaviors = [None] * len(experiments)
        term_criterion = config['optimizer']['termination_criterion']
        skip_gen_thres = term_criterion['skip_gen_thres']
        skip_gen_timeout = term_criterion['skip_gen_timeout']
        result_timestamp = time.time()

        while sum(has_result) < len(experiments):
            for i, experiment in enumerate(experiments):
                # Escape hatch: enough results in and nothing new within
                # the timeout -> mark everything done and move on.
                if float(sum(has_result)) / len(experiments) >= skip_gen_thres\
                        and time.time() - result_timestamp > skip_gen_timeout:
                    logger.warn(
                        "Skipping to next gen with %s of solutions evaled" %
                        (float(
                            sum(has_result)) /
                            len(experiments)))
                    has_result = [True] * len(experiments)
                    break
                if has_result[i]:
                    continue
                returned_experiment = db.get_experiment(experiment.key,
                                                        getinfo=True)
                output = db._get_experiment_logtail(
                    returned_experiment)
                if output is None:
                    continue

                for j, line in enumerate(output):

                    if line.startswith(
                            "Traceback (most recent call last):") and \
                            j not in bad_line_dicts[i]:
                        logger.warn("Experiment %s: error"
                                    " discovered in output" %
                                    returned_experiment.key)
                        logger.warn("".join(output[j:]))
                        bad_line_dicts[i][j] = True

                    if line.startswith("Behavior") or \
                            line.startswith("behavior"):
                        try:
                            # SECURITY: eval() on experiment log output runs
                            # arbitrary code from the worker's stdout;
                            # consider ast.literal_eval here.
                            behavior = eval(line.rstrip().split(':')[1])
                            if isinstance(behavior, np.ndarray):
                                pass
                            elif isinstance(behavior, list):
                                behavior = np.array(behavior)
                            else:
                                # Unsupported type: raising routes control
                                # to the parse-error handler below (the
                                # original bare `raise` did so only via an
                                # accidental RuntimeError).
                                raise ValueError(
                                    'unsupported behavior type')

                        except BaseException:
                            if j not in bad_line_dicts[i]:
                                logger.warn(
                                    'Experiment %s: error parsing or invalid'
                                    ' behavior' %
                                    returned_experiment.key)
                                logger.warn(line)
                                bad_line_dicts[i][j] = True
                        else:
                            behaviors[i] = behavior

                    if line.startswith("Fitness") or \
                            line.startswith("fitness"):
                        try:
                            fitness = float(line.rstrip().split(':')[1])
                            # assert fitness >= 0.0
                        except BaseException:
                            if j not in bad_line_dicts[i]:
                                logger.warn(
                                    'Experiment %s: error parsing or invalid'
                                    ' fitness' %
                                    returned_experiment.key)
                                logger.warn(line)
                                bad_line_dicts[i][j] = True
                        else:
                            # Negative fitnesses are clamped to zero.
                            if fitness < 0.0:
                                logger.warn('Experiment %s: returned'
                                            ' fitness is less than zero,'
                                            ' setting it to zero' %
                                            returned_experiment.key)
                                fitness = 0.0

                            fitnesses[i] = fitness
                            has_result[i] = True
                            progbar.add(1)
                            result_timestamp = time.time()
                            break

            time.sleep(config['sleep_time'])
        # Terminate the progress-bar line. The original bare `print`
        # statement was a py2 leftover (a no-op expression under py3).
        sys.stdout.write('\n')
        return fitnesses, behaviors
Ejemplo n.º 17
0
def worker_loop(queue,
                parsed_args,
                setup_pyenv=False,
                single_experiment=False,
                fetch_artifacts=False,
                timeout=0):
    """Dequeue experiments from the queue and execute them locally.

    For each message: load the experiment and its embedded config, check
    resource availability, optionally pip-install the experiment's python
    packages and fetch its artifacts, then run it with a LocalExecutor
    while a background job keeps the queue message on hold.

    Args:
        queue: queue object with has_next/dequeue/hold/acknowledge.
        parsed_args: parsed runner arguments; its ``.config`` is
            overwritten with the config carried by each queue message.
        setup_pyenv: when True, pip-install ``experiment.pythonenv``.
        single_experiment: when True, return after the first experiment.
        fetch_artifacts: when True, fetch artifacts even if a 'local'
            copy is already recorded.
        timeout: passed to wait_for_messages between queue polls.
    """

    logger = logging.getLogger('worker_loop')

    # Period (minutes) for which a dequeued message stays on hold.
    hold_period = 4
    while queue.has_next():

        # Dequeue WITHOUT acknowledging: the message is acknowledged only
        # after the run finishes, so a crashed worker lets the job be
        # re-dispatched to another worker.
        first_exp, ack_key = queue.dequeue(acknowledge=False)

        experiment_key = json.loads(first_exp)['experiment']['key']
        config = json.loads(first_exp)['config']
        parsed_args.config = config
        verbose = model.parse_verbosity(config.get('verbose'))
        logger.setLevel(verbose)

        logger.debug(
            'Received experiment {} with config {} from the queue'.format(
                experiment_key, config))

        executor = LocalExecutor(parsed_args)
        with model.get_db_provider(config) as db:
            experiment = db.get_experiment(experiment_key)

            if allocate_resources(experiment, config, verbose=verbose):

                def hold_job():
                    queue.hold(ack_key, hold_period)

                # Re-hold at half the hold period so the hold never lapses
                # while the experiment is still running.
                hold_job()
                sched = BackgroundScheduler()
                sched.add_job(hold_job, 'interval', minutes=hold_period / 2)
                sched.start()

                try:
                    if setup_pyenv:
                        logger.info(
                            'Setting up python packages for experiment')
                        for pkg in experiment.pythonenv:
                            pipp = subprocess.Popen(['pip', 'install', pkg],
                                                    stdout=subprocess.PIPE,
                                                    stderr=subprocess.STDOUT)

                            # NOTE(review): under py3 communicate() returns
                            # bytes, so this str concatenation would raise;
                            # confirm the intended python version.
                            pipout, _ = pipp.communicate()
                            logger.info("pip output: \n" + pipout)

                    # NOTE(review): dict.iteritems() is py2-only; sibling
                    # code in this file uses six.iteritems — confirm target.
                    for tag, art in experiment.artifacts.iteritems():
                        if fetch_artifacts or 'local' not in art.keys():
                            logger.info('Fetching artifact ' + tag)
                            if tag == 'workspace':
                                # Workspace is always refreshed, even if an
                                # older copy exists locally.
                                art['local'] = db.get_artifact(
                                    art, only_newer=False)
                            else:
                                art['local'] = db.get_artifact(art)
                    executor.run(experiment)
                finally:
                    # Always stop the hold-refresher and acknowledge the
                    # message, even if the run raised.
                    sched.shutdown()
                    queue.acknowledge(ack_key)

                if single_experiment:
                    logger.info('single_experiment is True, quitting')
                    return
            else:
                # Not enough resources right now; message was not
                # acknowledged, so it will be retried.
                logger.info('Cannot run experiment ' + experiment.key +
                            ' due lack of resources. Will retry')
                time.sleep(config['sleep_time'])

        wait_for_messages(queue, timeout, logger)

        # queue = glob.glob(fs_tracker.get_queue_directory() + "/*")

    logger.info("Queue in {} is empty, quitting".format(
        fs_tracker.get_queue_directory()))