Example #1
def test_public_ip_existing(requests_mock):
    # Set-up
    utilities.public_ip = '1.2.3.4'

    # run
    ip = utilities.get_public_ip()

    # asserts
    assert ip == utilities.public_ip
    requests_mock.get.assert_not_called()
Example #2
def test_public_ip_fail(mock_get):
    # Set-up
    utilities.public_ip = None
    mock_get.side_effect = Exception  # force the request to fail

    # run
    ip = utilities.get_public_ip()

    # asserts
    assert ip == utilities.public_ip
    assert ip == 'localhost'
    mock_get.assert_called_once_with(utilities.PUBLIC_IP_URL)
Example #3
def test_public_ip_fail(requests_mock):
    # Set-up
    utilities.public_ip = None
    requests_mock.get.side_effect = Exception  # Force fail

    # run
    ip = utilities.get_public_ip()

    # asserts
    assert ip == utilities.public_ip
    assert ip == 'localhost'
    requests_mock.get.assert_called_once_with(utilities.PUBLIC_IP_URL)
Example #4
def test_public_ip_success():
    # Set-up (nothing is mocked here, so this test makes a real network request)
    utilities.public_ip = None

    # run
    ip = utilities.get_public_ip()

    # asserts
    assert ip == utilities.public_ip
    try:
        socket.inet_aton(ip)
    except socket.error:
        pytest.fail("Invalid IP address")
Example #5
def work(db, datarun_ids=None, save_files=False, choose_randomly=True,
         cloud_mode=False, aws_config=None, log_config=None, total_time=None,
         wait=True):
    """
    Check the ModelHub database for unfinished dataruns, and spawn workers to
    work on them as they are added. This process will continue to run until it
    exceeds total_time or is broken with ctrl-C.

    db: Database instance with which we can make queries to ModelHub
    datarun_ids (optional): list of IDs of dataruns to compute on. If None,
        this will work on all unfinished dataruns in the database.
    save_files: if True, save the models and metrics generated by each
        classifier to disk.
    choose_randomly: if True, work on all highest-priority dataruns in random
        order. If False, work on them in sequential order (by ID).
    cloud_mode: if True, save processed datasets to AWS. If this option is set,
        aws_config must be supplied.
    aws_config (optional): if cloud_mode is set, this must be an AWSConfig
        object with connection details for an S3 bucket.
    log_config (optional): logging configuration to pass to each Worker.
    total_time (optional): if set to an integer, this worker will only work for
        total_time seconds. Otherwise, it will continue working until all
        dataruns are complete (or indefinitely).
    wait: if True, once all dataruns in the database are complete, keep spinning
        and wait for new runs to be added. If False, exit once all dataruns are
        complete.
    """
    start_time = datetime.datetime.now()
    public_ip = get_public_ip()

    # main loop
    while True:
        # get all pending and running dataruns, or all pending/running dataruns
        # from the list we were given
        dataruns = db.get_dataruns(include_ids=datarun_ids,
                                   ignore_complete=True)
        if not dataruns:
            if wait:
                logger.warning('No dataruns found. Sleeping %d seconds and trying again.'
                               % LOOP_WAIT)
                time.sleep(LOOP_WAIT)
                continue
            else:
                logger.warning('No dataruns found. Exiting.')
                break

        max_priority = max([r.priority for r in dataruns])
        priority_runs = [r for r in dataruns if r.priority == max_priority]

        # either choose a run randomly, or take the run with the lowest ID
        if choose_randomly:
            run = random.choice(priority_runs)
        else:
            run = sorted(priority_runs, key=attrgetter('id'))[0]

        # say we've started working on this datarun, if we haven't already
        db.mark_datarun_running(run.id)

        logger.info('Computing on datarun %d' % run.id)
        # actual work happens here
        worker = Worker(db, run, save_files=save_files,
                        cloud_mode=cloud_mode, aws_config=aws_config,
                        log_config=log_config, public_ip=public_ip)
        try:
            worker.run_classifier()
        except ClassifierError:
            # the exception has already been handled; just wait a sec so we
            # don't go out of control reporting errors
            logger.warning('Something went wrong. Sleeping %d seconds.' % LOOP_WAIT)
            time.sleep(LOOP_WAIT)

        elapsed_time = (datetime.datetime.now() - start_time).total_seconds()
        if total_time is not None and elapsed_time >= total_time:
            logger.warning('Total run time for worker exceeded; exiting.')
            break
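A hypothetical invocation of this loop, for illustration only; make_modelhub_db() is a placeholder for whatever creates the ModelHub Database instance in the surrounding project.

# Hypothetical usage sketch. make_modelhub_db() stands in for the project's
# actual Database construction.
db = make_modelhub_db()

# Work through all unfinished dataruns in ID order, saving each classifier's
# files locally, and exit once everything is complete.
work(db, save_files=True, choose_randomly=False, wait=False)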
Example #6
def get_datarun_steps_info(datarun_id,
                           classifier_start=None,
                           classifier_end=None,
                           nice=False):
    """
    Get the bandit scores of the hyperpartitions/methods at each step.
    :param datarun_id: the ID of the datarun
    :param classifier_start: only return scores from the `classifier_start`-th
        classifier onwards
    :param classifier_end: only return scores of classifiers before the
        `classifier_end`-th one.
        Note that classifier_start and classifier_end are not classifier IDs;
        they are indices starting from 1. (The caller may not know the
        classifier IDs of the datarun.)
    :param nice: if True, return the results in a nicer format, grouped by
        method
    :return:
        if nice is False,
        [
            {"1": 0.2, "2": 0.3, ...},
            ...
        ]
        if nice is True,
        [
            {
                "knn": [0.2, 0.3],
                "logreg": [0.1],
                ...
            },
            ...
        ]
    """
    if classifier_start is None:
        classifier_start = -np.inf
    if classifier_end is None:
        classifier_end = np.inf
    db = get_db()

    datarun = db.get_datarun(datarun_id=datarun_id)
    hyperpartitions = db.get_hyperpartitions(datarun_id=datarun_id)

    # load classifiers and build scores lists
    # make sure all hyperpartitions are present in the dict, even ones that
    # don't have any classifiers. That way the selector can choose hyperpartitions
    # that haven't been scored yet.
    hyperpartition_scores = {fs.id: [] for fs in hyperpartitions}
    classifiers = db.get_classifiers(datarun_id=datarun_id,
                                     status=ClassifierStatus.COMPLETE)
    selected_classifiers = [
        c for c in classifiers if c.hyperpartition_id in hyperpartition_scores
    ]
    # Create a temporary worker
    worker = Worker(db, datarun, public_ip=get_public_ip())
    bandit_scores_of_steps = []
    for i, c in enumerate(selected_classifiers):
        if i >= classifier_end:
            break
        # the cast to float is necessary because the score is a Decimal;
        # doing Decimal-float arithmetic throws errors later on.
        score = float(getattr(c, datarun.score_target) or 0)
        hyperpartition_scores[c.hyperpartition_id].append(score)
        bandit_scores = selector_bandit_scores(worker.selector,
                                               hyperpartition_scores)
        bandit_scores = {
            key: float("%.5f" % val)
            for key, val in bandit_scores.items()
        }
        if i < classifier_start:
            continue
        bandit_scores_of_steps.append(bandit_scores)
    # For a nicer formatted output
    if nice:
        results = []
        hp_id2method = {fs.id: fs.method for fs in hyperpartitions}
        for bandit_scores in bandit_scores_of_steps:
            res = defaultdict(list)
            for hp_id, score in bandit_scores.items():
                res[hp_id2method[hp_id]].append(score)
            results.append(res)
        return results

    return bandit_scores_of_steps
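A hypothetical call, assuming a datarun with ID 1 exists and uses methods such as knn and logreg:

# Bandit scores for a slice of the datarun's steps, keyed by hyperpartition ID:
steps = get_datarun_steps_info(1, classifier_start=1, classifier_end=10)
# e.g. [{"1": 0.2, "2": 0.3, ...}, ...]

# The same steps, grouped by method name:
nice_steps = get_datarun_steps_info(1, classifier_start=1, classifier_end=10,
                                    nice=True)
# e.g. [{"knn": [0.2, 0.3], "logreg": [0.1], ...}, ...]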