Beispiel #1
0
    def _update_user(self):
        """Load the cached user record and bring the auth state up to date.

        When the token file does not exist, either sign in by email (if
        ``use_email_auth`` is set) and clear ``expired``, or mark the
        session as expired.  When it exists, its JSON content is read
        (retrying while fewer than MAX_NUM_RETRIES failures occurred) and
        stored in ``self.user``; if the record's 'expiration' timestamp
        has passed, the token is refreshed with the same retry budget,
        otherwise ``expired`` is cleared.
        """
        if not os.path.exists(self.token_file):
            # Refresh tokens do not expire, so a token file that has been
            # obtained once can be reused indefinitely.
            if self.use_email_auth:
                self.sign_in_with_email()
                self.expired = False
            else:
                self.expired = True
            return

        # Reading the JSON file may fail; keep trying until it loads or
        # the failure budget is used up.
        record = None
        failures = 0
        while record is None and failures < MAX_NUM_RETRIES:
            try:
                with open(self.token_file) as token_fh:
                    record = json.loads(token_fh.read())
            except BaseException as err:
                check_for_kb_interrupt()
                self.logger.info(err)
                time.sleep(SLEEP_TIME)
                failures += 1

        if record is None:
            return

        self.user = record
        is_stale = time.time() > self.user.get('expiration', 0)
        if not is_stale:
            self.expired = False
            return

        failures = 0
        while failures < MAX_NUM_RETRIES:
            try:
                self.refresh_token(record['email'], record['refreshToken'])
            except BaseException as err:
                check_for_kb_interrupt()
                self.logger.info(err)
                time.sleep(SLEEP_TIME)
                failures += 1
            else:
                # Refresh succeeded; no further attempts needed.
                break
Beispiel #2
0
    def get_experiment(self, experiment, getinfo='True'):
        """Fetch an experiment from the remote API.

        Args:
            experiment: either the experiment key as a string, or an
                object exposing the key as ``.key``.
            getinfo: accepted for interface compatibility; not used by
                this implementation.

        Returns:
            The result of ``experiment_from_dict`` applied to the
            'experiment' entry of the JSON response, or None if anything
            in the request/decoding step raised (the error is logged).
        """
        key = experiment if isinstance(experiment, str) else experiment.key
        headers = self._get_headers()

        try:
            response = requests.post(
                self.url + '/api/get_experiment',
                headers=headers,
                data=json.dumps({"key": key}))
            self._raise_detailed_error(response)
            payload = response.json()
            return experiment_from_dict(payload['experiment'])
        except BaseException as err:
            check_for_kb_interrupt()
            self.logger.info('error getting experiment {}'.format(key))
            self.logger.info(err)
            return None
Beispiel #3
0
    def __init__(self, queue_name, sub_name=None, verbose=10):
        """Attach to a Pub/Sub topic and subscription, creating them if needed.

        The project id is read from the JSON credentials file named by the
        GOOGLE_APPLICATION_CREDENTIALS environment variable.

        Args:
            queue_name: name of the topic inside the project.
            sub_name: name of the subscription; when falsy,
                ``queue_name + "_sub"`` is used.
            verbose: verbosity handed to ``parse_verbosity`` to set the
                logger level; None leaves the logger level untouched.
        """
        from google.cloud import pubsub

        assert 'GOOGLE_APPLICATION_CREDENTIALS' in os.environ
        with open(os.environ['GOOGLE_APPLICATION_CREDENTIALS']) as f:
            credentials = json.loads(f.read())

        project_name = credentials['project_id']
        self.logger = logs.get_logger(self.__class__.__name__)
        if verbose is not None:
            self.logger.setLevel(parse_verbosity(verbose))

        self.pubclient = pubsub.PublisherClient()
        self.subclient = pubsub.SubscriberClient()

        self.project = project_name
        self.topic_name = self.pubclient.topic_path(project_name, queue_name)
        self.logger.info("Topic name = {}".format(self.topic_name))
        try:
            self.pubtopic = self.pubclient.get_topic(self.topic_name)
        except BaseException:
            # Any failure to fetch the topic is treated as "does not exist".
            check_for_kb_interrupt()
            self.pubtopic = self.pubclient.create_topic(self.topic_name)
            self.logger.info('topic {} created'.format(self.topic_name))

        sub_name = sub_name if sub_name else queue_name + "_sub"
        self.logger.info("Topic name = {}".format(queue_name))
        self.logger.info("Subscription name = {}".format(sub_name))

        self.sub_name = self.subclient.subscription_path(
            project_name, sub_name)
        try:
            self.subclient.get_subscription(self.sub_name)
        except BaseException as e:
            # Any failure to fetch the subscription is treated as
            # "does not exist".
            check_for_kb_interrupt()
            self.logger.warn(e)
            self.subclient.create_subscription(self.sub_name, self.topic_name,
                                               ack_deadline_seconds=20)
            # Report creation only when it actually happened, mirroring
            # the topic branch above.
            self.logger.info('subscription {} created'.format(sub_name))
Beispiel #4
0
    def get_hash(self, local_path=None):
        """Return a SHA-256 checksum of the tarred artifact.

        Args:
            local_path: path to hash; when None, ``self.local_path`` is
                used instead.

        Returns:
            ``self._generate_key()`` when there is no path or the path
            does not exist; otherwise the checksum of the tar file built
            from the path, or None if hashing or removing the tar file
            raised (the error is logged).
        """
        path = self.local_path if local_path is None else local_path

        if path is None or not os.path.exists(path):
            return self._generate_key()

        tar_filename = tar_artifact(
            path, self.key, self.get_compression(), self.logger)

        try:
            checksum = util.sha256_checksum(tar_filename)
            # The tar file is only needed for hashing; drop it afterwards.
            os.remove(tar_filename)
            self.logger.debug('deleted local artifact file %s', tar_filename)
        except BaseException as exc:
            util.check_for_kb_interrupt()
            self.logger.error('error generating a hash for %s: %s',
                              tar_filename, repr(exc))
            return None
        return checksum
Beispiel #5
0
 def checkpoint():
     """Checkpoint ``experiment`` through ``db``.

     Any exception is logged at info level, after which
     check_for_kb_interrupt() is called.
     """
     try:
         db.checkpoint_experiment(experiment)
     except BaseException as err:
         self.logger.info(err)
         check_for_kb_interrupt()
Beispiel #6
0
def get_experiment_fitnesses(experiments, optimizer, config, logger):
    """Poll experiment log tails until every experiment has a fitness.

    Args:
        experiments: sequence of experiment objects; each must expose
            ``.key``.
        optimizer: not used inside this function.
        config: mapping; reads
            ``config['optimizer']['termination_criterion']`` (keys
            'skip_gen_thres' and 'skip_gen_timeout') and
            ``config['sleep_time']``.
        logger: logger used for progress messages and warnings.

    Returns:
        A tuple ``(fitnesses, behaviors)`` of lists index-aligned with
        ``experiments``.  Entries keep their initial values (0.0 and
        None) for experiments whose output never yielded a parsable
        value, e.g. when the generation was skipped.
    """
    with model.get_db_provider() as db:
        progbar = Progbar(len(experiments), interval=0.0)
        logger.info("Waiting for fitnesses from %s experiments" %
                    len(experiments))

        # Per experiment: indices of output lines already warned about,
        # so repeated polls of the same log tail do not re-log them.
        bad_line_dicts = [dict() for x in range(len(experiments))]
        has_result = [False for i in range(len(experiments))]
        fitnesses = [0.0 for i in range(len(experiments))]
        behaviors = [None for i in range(len(experiments))]
        term_criterion = config['optimizer']['termination_criterion']
        skip_gen_thres = term_criterion['skip_gen_thres']
        skip_gen_timeout = term_criterion['skip_gen_timeout']
        # Time of the most recent fitness result (or of the start).
        result_timestamp = time.time()

        while sum(has_result) < len(experiments):
            for i, experiment in enumerate(experiments):
                # Give up on the stragglers once a large enough fraction
                # has reported and no new result arrived for too long.
                if float(sum(has_result)) / len(experiments) >= skip_gen_thres\
                        and time.time() - result_timestamp > skip_gen_timeout:
                    logger.warn(
                        "Skipping to next gen with %s of solutions evaled" %
                        (float(sum(has_result)) / len(experiments)))
                    # Marking everything done terminates the outer loop.
                    has_result = [True] * len(experiments)
                    break
                if has_result[i]:
                    continue
                returned_experiment = db.get_experiment(experiment.key,
                                                        getinfo=True)
                output = db._get_experiment_logtail(returned_experiment)
                if output is None:
                    continue

                for j, line in enumerate(output):

                    if line.startswith(
                            "Traceback (most recent call last):") and \
                            j not in bad_line_dicts[i]:
                        logger.warn("Experiment %s: error"
                                    " discovered in output" %
                                    returned_experiment.key)
                        logger.warn("".join(output[j:]))
                        bad_line_dicts[i][j] = True

                    if line.startswith("Behavior") or \
                            line.startswith("behavior"):
                        try:
                            # NOTE(review): eval() executes whatever
                            # expression the experiment printed, with
                            # access to this function's names; this is
                            # unsafe if experiment output is not trusted.
                            behavior = eval(line.rstrip().split(':')[1])
                            if isinstance(behavior, np.ndarray):
                                pass
                            elif isinstance(behavior, list):
                                behavior = np.array(behavior)
                            else:
                                # No exception is active here, so the bare
                                # raise itself errors out; the handler
                                # below then reports an invalid behavior.
                                raise

                        except BaseException:
                            util.check_for_kb_interrupt()
                            if j not in bad_line_dicts[i]:
                                logger.warn(
                                    'Experiment %s: error parsing or invalid'
                                    ' behavior' % returned_experiment.key)
                                logger.warn(line)
                                bad_line_dicts[i][j] = True
                        else:
                            behaviors[i] = behavior

                    if line.startswith("Fitness") or \
                            line.startswith("fitness"):
                        try:
                            fitness = float(line.rstrip().split(':')[1])
                            # assert fitness >= 0.0
                        except BaseException:
                            util.check_for_kb_interrupt()
                            if j not in bad_line_dicts[i]:
                                logger.warn(
                                    'Experiment %s: error parsing or invalid'
                                    ' fitness' % returned_experiment.key)
                                logger.warn(line)
                                bad_line_dicts[i][j] = True
                        else:
                            if fitness < 0.0:
                                logger.warn('Experiment %s: returned'
                                            ' fitness is less than zero,'
                                            ' setting it to zero' %
                                            returned_experiment.key)
                                fitness = 0.0

                            fitnesses[i] = fitness
                            has_result[i] = True
                            progbar.add(1)
                            result_timestamp = time.time()
                            # First valid fitness line wins; stop scanning
                            # the rest of this experiment's output.
                            break

            # Pause between polling rounds.
            time.sleep(config['sleep_time'])
        return fitnesses, behaviors
Beispiel #7
0
def main(args=sys.argv[1:]):
    """Entry point of the studio runner.

    Parses the runner's command-line options, builds one or more
    experiments (a single run, a re-run of a stored experiment, or a
    hyperparameter search), submits them to a queue and spins up workers.

    Args:
        args: list of command-line arguments.  Note that the default,
            ``sys.argv[1:]``, is evaluated once when the function is
            defined, not on each call.
    """
    logger = logs.get_logger('studio-runner')
    parser = argparse.ArgumentParser(description='Studio runner. \
                     Usage: studio run <runner_arguments> \
                     script <script_arguments>')
    parser.add_argument('--config', help='configuration file', default=None)
    parser.add_argument('--project', help='name of the project', default=None)
    parser.add_argument('--experiment',
                        '-e',
                        help='Name of the experiment. If none provided, ' +
                        'random uuid will be generated',
                        default=None)

    parser.add_argument('--guest',
                        help='Guest mode (does not require db credentials)',
                        action='store_true')

    parser.add_argument(
        '--force-git',
        help='If run in a git directory, force running the experiment ' +
        'even if changes are not commited',
        action='store_true')

    parser.add_argument('--gpus',
                        help='Number of gpus needed to run the experiment',
                        type=int,
                        default=None)

    parser.add_argument('--cpus',
                        help='Number of cpus needed to run the experiment' +
                        ' (used to configure cloud instance)',
                        type=int,
                        default=None)

    parser.add_argument('--ram',
                        help='Amount of RAM needed to run the experiment' +
                        ' (used to configure cloud instance), ex: 10G, 10GB',
                        default=None)

    parser.add_argument('--gpuMem',
                        help='Amount of GPU RAM needed to run the experiment',
                        default=None)

    parser.add_argument(
        '--hdd',
        help='Amount of hard drive space needed to run the experiment' +
        ' (used to configure cloud instance), ex: 10G, 10GB',
        default=None)

    parser.add_argument('--queue',
                        '-q',
                        help='Name of the remote execution queue',
                        default=None)

    parser.add_argument(
        '--cloud',
        help='Cloud execution mode. Could be gcloud, gcspot, ec2 or ec2spot',
        default=None)

    parser.add_argument(
        '--bid',
        help='Spot instance price bid, specified in USD or in percentage ' +
        'of on-demand instance price. Default is %(default)s',
        default='100%')

    parser.add_argument(
        '--capture-once',
        '-co',
        help='Name of the immutable artifact to be captured. ' +
        'It will be captured once before the experiment is run',
        default=[],
        action='append')

    parser.add_argument(
        '--capture',
        '-c',
        help='Name of the mutable artifact to be captured continuously',
        default=[],
        action='append')

    parser.add_argument(
        '--reuse',
        '-r',
        help='Name of the artifact from another experiment to use',
        default=[],
        action='append')

    parser.add_argument('--verbose',
                        '-v',
                        help='Verbosity level. Allowed values: ' +
                        'debug, info, warn, error, crit ' +
                        'or numerical value of logger levels.',
                        default=None)

    parser.add_argument(
        '--metric',
        help='Metric to show in the summary of the experiment, ' +
        'and to base hyperparameter search on. ' +
        'Refers a scalar value in tensorboard log ' +
        'example: --metric=val_loss[:final | :min | :max] to report ' +
        'validation loss in the end of the keras experiment ' +
        '(or smallest or largest throughout the experiment for :min ' +
        'and :max respectively)',
        default=None)

    parser.add_argument(
        '--hyperparam',
        '-hp',
        help='Try out multiple values of a certain parameter. ' +
        'For example, --hyperparam=learning_rate:0.01:0.1:l10 ' +
        'will instantiate 10 versions of the script, replace ' +
        'learning_rate with a one of the 10 values for learning ' +
        'rate that lies on a log grid from 0.01 to 0.1, create '
        'experiments and place them in the queue.',
        default=[],
        action='append')

    parser.add_argument('--num-workers',
                        help='Number of local or cloud workers to spin up',
                        type=int,
                        default=None)

    parser.add_argument(
        '--python-pkg',
        help='Python package not present in the current environment ' +
        'that is needed for experiment. Only compatible with ' +
        'remote and cloud workers for now',
        default=[],
        action='append')

    parser.add_argument(
        '--ssh-keypair',
        help='Name of the SSH keypair used to access the EC2 ' +
        'instances directly',
        default=None)

    parser.add_argument(
        '--optimizer',
        '-opt',
        help='Name of optimizer to use, by default is grid search. ' +
        'The name of the optimizer must either be in ' +
        'studio/optimizer_plugins ' +
        'directory or the path to the optimizer source file ' +
        'must be supplied. ',
        default='grid')

    parser.add_argument(
        '--cloud-timeout',
        help="Time (in seconds) that cloud workers wait for messages. " +
        "If negative, " +
        "wait for the first message in the queue indefinitely " +
        "and shut down " + "as soon as no new messages are available. " +
        "If zero, don't wait at all." + "Default value is %(default)d",
        type=int,
        default=300)

    parser.add_argument('--user-startup-script',
                        help='Path of script to run immediately ' +
                        'before running the remote worker',
                        default=None)

    parser.add_argument(
        '--branch',
        help='Branch of studioml to use when running remote worker, useful ' +
        'for debugging pull requests. Default is current',
        default=None)

    parser.add_argument(
        '--max-duration',
        help='Max experiment runtime (i.e. time after which experiment ' +
        'should be killed no matter what.).  Examples of values ' +
        'might include 5h, 48h2m10s',
        default=None)

    parser.add_argument(
        '--lifetime',
        help='Max experiment lifetime (i.e. wait time after which ' +
        'experiment loses relevance and should not be started)' +
        '  Examples include 240h30m10s',
        default=None)

    parser.add_argument(
        '--container',
        help='Singularity container in which experiment should be run. ' +
        'Assumes that container has all dependencies installed',
        default=None)

    parser.add_argument('--port',
                        help='Ports to open on a cloud instance',
                        default=[],
                        action='append')

    # detect which argument is the script filename
    # and attribute all arguments past that index as related to the script
    (runner_args, other_args) = parser.parse_known_args(args)
    py_suffix_args = [
        i for i, arg in enumerate(args) if arg.endswith('.py') or '::' in arg
    ]

    rerun = False
    # No argument looks like a script ('.py' suffix or containing '::'):
    # this is either a container job or a re-run of a stored experiment.
    if len(py_suffix_args) < 1:
        print('None of the arugments end with .py')
        if len(other_args) == 0:
            print("Trying to run a container job")
            assert runner_args.container is not None
            exec_filename = None
        elif len(other_args) == 1:
            # NOTE(review): exec_filename is not assigned on this path; a
            # re-run combined with --hyperparam would read an unbound
            # name further down -- confirm whether that combination is
            # meant to be supported.
            print("Treating last argument as experiment key to rerun")
            rerun = True
            experiment_key = args[-1]
        else:
            print("Too many extra arguments - should be either none " +
                  "for container job or one for experiment re-run")
            sys.exit(1)
    else:
        # Everything after the first script-like argument belongs to the
        # script; everything before it is re-parsed strictly as runner
        # options (parse_args rejects unknown arguments).
        script_index = py_suffix_args[0]
        exec_filename, other_args = args[script_index], args[script_index + 1:]
        runner_args = parser.parse_args(args[:script_index])

    # TODO: Queue the job based on arguments and only then execute.

    config = model.get_config(runner_args.config)

    # Command-line options override values from the configuration.
    if runner_args.verbose:
        config['verbose'] = runner_args.verbose

    if runner_args.guest:
        config['database']['guest'] = True

    # The container is captured as an immutable artifact tagged
    # '_singularity'.
    if runner_args.container:
        runner_args.capture_once.append(runner_args.container +
                                        ':_singularity')

    verbose = model.parse_verbosity(config['verbose'])
    logger.setLevel(verbose)

    # Refuse to run from a dirty git repo unless --force-git is given;
    # re-runs are exempt from this check.
    if git_util.is_git() and not git_util.is_clean() and not rerun:
        logger.warn('Running from dirty git repo')
        if not runner_args.force_git:
            logger.error(
                'Specify --force-git to run experiment from dirty git repo')
            sys.exit(1)

    resources_needed = _parse_hardware(runner_args, config['resources_needed'])
    logger.debug('resources requested: ')
    logger.debug(str(resources_needed))

    # Set up default artifacts:
    # note that their "local" paths will be updated
    # on Experiment creation,
    # but they must have "local" field defined
    # to have storage credentials set up properly.
    artifacts = {
        'workspace': {
            'mutable': False,
            'local': os.getcwd(),
            'unpack': True
        },
        'modeldir': {
            'mutable': True,
            'local': '',
            'unpack': True
        },
        'retval': {
            'mutable': True,
            'local': '',
            'unpack': True
        },
        'output': {
            'mutable': True,
            'local': '',
            'unpack': True
        },
        'tb': {
            'mutable': True,
            'local': '',
            'unpack': True
        },
        '_metrics': {
            'mutable': True,
            'local': '',
            'unpack': True
        },
        '_metadata': {
            'mutable': True,
            'local': '',
            'unpack': True
        }
    }

    # User-specified artifacts are merged over the defaults; later updates
    # win on key collisions.
    artifacts.update(_parse_artifacts(runner_args.capture, mutable=True))
    artifacts.update(_parse_artifacts(runner_args.capture_once, mutable=False))
    with model.get_db_provider(config) as db:
        artifacts.update(_parse_external_artifacts(runner_args.reuse, db))

    logger.debug("Task artifacts: %s", repr(artifacts))
    storage_creds = config.get('storage', {}).get(KEY_CREDENTIALS, None)
    _setup_artifacts_creds(artifacts, storage_creds)

    if runner_args.branch:
        config['cloud']['branch'] = runner_args.branch

    if runner_args.user_startup_script:
        config['cloud']['user_startup_script'] = \
            runner_args.user_startup_script

    if runner_args.lifetime:
        config['experimentLifetime'] = runner_args.lifetime

    # Always None in this function; passed as close_after to every
    # get_queue call below.
    queueLifetime = None

    if any(runner_args.hyperparam):
        if runner_args.optimizer == "grid":
            # Grid search: all experiments are generated up front and
            # submitted in a single batch.
            experiments = _add_hyperparam_experiments(exec_filename,
                                                      other_args, runner_args,
                                                      artifacts,
                                                      resources_needed, logger)

            queue = model.get_queue(queue_name=runner_args.queue,
                                    cloud=runner_args.cloud,
                                    config=config,
                                    close_after=queueLifetime,
                                    logger=logger,
                                    verbose=verbose)

            queue_name = submit_experiments(experiments,
                                            config=config,
                                            logger=logger,
                                            queue=queue)

            spin_up_workers(runner_args,
                            config,
                            resources_needed,
                            queue_name=queue_name,
                            verbose=verbose)
        else:
            # Plugin optimizer: look for <name>.py in the
            # optimizer_plugins directory next to this file first, then
            # fall back to treating the value as a path to a source file.
            opt_modulepath = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "optimizer_plugins", runner_args.optimizer + ".py")
            if not os.path.exists(opt_modulepath):
                opt_modulepath = os.path.abspath(
                    os.path.expanduser(runner_args.optimizer))
            logger.info('optimizer path: %s' % opt_modulepath)

            assert os.path.exists(opt_modulepath)
            # Make the plugin importable by its bare module name.
            sys.path.append(os.path.dirname(opt_modulepath))
            opt_module = importlib.import_module(
                os.path.basename(opt_modulepath.replace(".py", '')))

            h = HyperparameterParser(runner_args, logger)
            hyperparams = h.parse()
            optimizer = getattr(opt_module,
                                "Optimizer")(hyperparams, config['optimizer'],
                                             logger)

            workers_started = False
            queue_name = runner_args.queue
            # Ask/tell loop: each iteration submits one population of
            # experiments, waits for their fitnesses and feeds them back.
            while not optimizer.stop():
                hyperparam_pop = optimizer.ask()
                hyperparam_tuples = h.convert_to_tuples(hyperparam_pop)

                experiments = _add_hyperparam_experiments(
                    exec_filename,
                    other_args,
                    runner_args,
                    artifacts,
                    resources_needed,
                    logger,
                    optimizer=optimizer,
                    hyperparam_tuples=hyperparam_tuples)

                # The queue name returned by the previous submission is
                # reused for subsequent iterations.
                queue = model.get_queue(queue_name=queue_name,
                                        cloud=runner_args.cloud,
                                        config=config,
                                        close_after=queueLifetime,
                                        logger=logger,
                                        verbose=verbose)

                queue_name = submit_experiments(experiments,
                                                config=config,
                                                logger=logger,
                                                queue=queue)

                # Workers are started only once, on the first iteration.
                if not workers_started:
                    spin_up_workers(runner_args,
                                    config,
                                    resources_needed,
                                    queue_name=queue_name,
                                    verbose=verbose)
                    workers_started = True

                fitnesses, behaviors = get_experiment_fitnesses(
                    experiments, optimizer, config, logger)

                # Fall back to the two-argument tell() when the call with
                # behaviors raises.
                try:
                    optimizer.tell(hyperparam_pop, fitnesses, behaviors)
                except BaseException:
                    util.check_for_kb_interrupt()
                    optimizer.tell(hyperparam_pop, fitnesses)

                try:
                    optimizer.disp()
                except BaseException:
                    util.check_for_kb_interrupt()
                    logger.warn('Optimizer has no disp() method')
    else:
        if rerun:
            # Re-run: take the stored experiment, give it a new key and
            # mark all of its artifacts immutable.
            with model.get_db_provider(config) as db:
                # NOTE(review): if db.get_experiment can return None for
                # an unknown key, the attribute assignment below raises
                # -- confirm against the provider.
                experiment = db.get_experiment(experiment_key)
                new_key = runner_args.experiment if runner_args.experiment \
                    else experiment_key + '_rerun' + str(uuid.uuid4())
                experiment.key = new_key
                for _, art in six.iteritems(experiment.artifacts):
                    art['mutable'] = False

                experiments = [experiment]

        else:
            experiments = [
                create_experiment(filename=exec_filename,
                                  args=other_args,
                                  experiment_name=runner_args.experiment,
                                  project=runner_args.project,
                                  artifacts=artifacts,
                                  resources_needed=resources_needed,
                                  metric=runner_args.metric,
                                  max_duration=runner_args.max_duration,
                                  dependency_policy=StudioDependencyPolicy())
            ]

        queue = model.get_queue(queue_name=runner_args.queue,
                                cloud=runner_args.cloud,
                                config=config,
                                close_after=queueLifetime,
                                logger=logger,
                                verbose=verbose)

        queue_name = submit_experiments(experiments,
                                        config=config,
                                        logger=logger,
                                        queue=queue)

        spin_up_workers(runner_args,
                        config,
                        resources_needed,
                        queue_name=queue_name,
                        verbose=verbose)

    return
Beispiel #8
0
 def acknowledge(self, key):
     """Delete the file at path ``key``.

     If the deletion raises, check_for_kb_interrupt() is called and the
     error is otherwise ignored.
     """
     try:
         os.unlink(key)
     except BaseException:
         check_for_kb_interrupt()