Example 1
def run_flow_on_task(flow,
                     task,
                     avoid_duplicate_runs=True,
                     flow_tags=None,
                     seed=None,
                     add_local_measures=True):
    """Run the model provided by the flow on the dataset defined by task.

    Takes the flow and repeat information into account. In case a flow is not
    yet published, it is published after executing the run (requires
    internet connection).

    Parameters
    ----------
    flow : OpenMLFlow
        A flow wraps a machine learning model together with relevant information.
        The model has a function fit(X, y) and predict(X); all supervised
        estimators of scikit-learn follow this definition of a model [1].
        This may be an OpenMLTask instead if the second argument is an
        OpenMLFlow (deprecated argument order).
        [1](http://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html)
    task : OpenMLTask
        Task to perform. This may be an OpenMLFlow instead if the first
        argument is an OpenMLTask (deprecated argument order).
    avoid_duplicate_runs : bool
        If True, the run raises an error if the setup/task combination is
        already present on the server. Works only if the flow is already
        published on the server. This feature requires an internet connection.
    flow_tags : list(str)
        A list of tags that the flow should have at creation.
    seed : int
        Models that are not seeded will be seeded automatically by an RNG,
        which is itself seeded with this value.
    add_local_measures : bool
        Determines whether to calculate a set of evaluation measures locally,
        to later verify server behaviour. Defaults to True.

    Returns
    -------
    run : OpenMLRun
        Result of the run.
    """
    if flow_tags is not None and not isinstance(flow_tags, list):
        raise ValueError("flow_tags should be list")

    # TODO: At some point in the future do not allow for arguments in old order (order changed 6-2018).
    if isinstance(flow, OpenMLTask) and isinstance(task, OpenMLFlow):
        # We want to allow either order of argument (to avoid confusion).
        warnings.warn(
            "The old argument order (Flow, model) is deprecated and will not be supported in the future. "
            "Please use the order (model, Flow).", DeprecationWarning)
        task, flow = flow, task

    flow.model = _get_seeded_model(flow.model, seed=seed)

    # Skip the run if it already exists on the server and the user requested
    # this via avoid_duplicate_runs. If the flow is not present on the server,
    # the check is not needed.
    flow_id = flow_exists(flow.name, flow.external_version)
    if avoid_duplicate_runs and flow_id:
        flow_from_server = get_flow(flow_id)
        flow_from_server.model = flow.model
        setup_id = setup_exists(flow_from_server)
        ids = _run_exists(task.task_id, setup_id)
        if ids:
            raise PyOpenMLError("Run already exists in server. Run id(s): %s" %
                                str(ids))
        _copy_server_fields(flow_from_server, flow)

    dataset = task.get_dataset()

    if task.class_labels is None:
        raise ValueError('The task has no class labels. This method currently '
                         'only works for tasks with class labels.')

    run_environment = _get_version_information()
    tags = ['openml-python', run_environment[1]]

    # execute the run
    res = _run_task_get_arffcontent(flow.model,
                                    task,
                                    add_local_measures=add_local_measures)

    # If the flow does not exist, flow_id will be False (as returned by
    # flow_exists). Also check that flow.flow_id is consistent with the
    # result of openml.flows.flow_exists.
    if flow_id is False:
        if flow.flow_id is not None:
            raise ValueError('flow.flow_id is not None, but the flow does not '
                             'exist on the server according to flow_exists')
        _publish_flow_if_necessary(flow)
        # if the flow was published successfully
        # and has an id
        if flow.flow_id is not None:
            flow_id = flow.flow_id

    data_content, trace, fold_evaluations, sample_evaluations = res
    if not isinstance(flow.flow_id, int):
        # This is the usual behaviour, where the flow object was initialised
        # offline and requires some additional information (flow_id, input_id
        # for each hyperparameter) to be usable by this library.
        server_flow = get_flow(flow_id)
        openml.flows.flow._copy_server_fields(server_flow, flow)
        openml.flows.assert_flows_equal(flow,
                                        server_flow,
                                        ignore_parameter_values=True)
    else:
        # This can only happen when the function is called directly, and not
        # through "run_model_on_task"
        if flow.flow_id != flow_id:
            # This should never happen, unless the user made an error when
            # creating the flow.
            raise ValueError(
                "Result from API call flow_exists and flow.flow_id are not "
                "same: '%s' vs '%s'" % (str(flow.flow_id), str(flow_id)))

    run = OpenMLRun(
        task_id=task.task_id,
        flow_id=flow.flow_id,
        dataset_id=dataset.dataset_id,
        model=flow.model,
        flow_name=flow.name,
        tags=tags,
        trace=trace,
        data_content=data_content,
    )
    # TODO: currently hard-coded sklearn assumption.
    run.parameter_settings = openml.flows.obtain_parameter_values(flow)

    # now we need to attach the detailed evaluations
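    # task_type_id == 3 corresponds to learning-curve tasks
    # (TaskTypeEnum.LEARNING_CURVE in later openml-python versions)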
    if task.task_type_id == 3:
        run.sample_evaluations = sample_evaluations
    else:
        run.fold_evaluations = fold_evaluations

    config.logger.info('Executed Task %d with Flow id: %d' %
                       (task.task_id, run.flow_id))

    return run
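
A minimal usage sketch for this version of the API (hypothetical: helper names
such as openml.flows.sklearn_to_flow and the task id are assumptions about the
installed openml-python release, not guaranteed API):

import openml
from sklearn.tree import DecisionTreeClassifier

task = openml.tasks.get_task(31)                               # a classification task
flow = openml.flows.sklearn_to_flow(DecisionTreeClassifier())  # wrap the model in a flow
run = run_flow_on_task(flow, task, avoid_duplicate_runs=False, seed=42)
run.publish()                                                  # upload the predictions to the server
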
Example 2
def run_flow_on_task(
    flow: OpenMLFlow,
    task: OpenMLTask,
    avoid_duplicate_runs: bool = True,
    flow_tags: Optional[List[str]] = None,
    seed: Optional[int] = None,
    add_local_measures: bool = True,
    upload_flow: bool = False,
) -> OpenMLRun:
    """Run the model provided by the flow on the dataset defined by task.

    Takes the flow and repeat information into account.
    The Flow may optionally be published.

    Parameters
    ----------
    flow : OpenMLFlow
        A flow wraps a machine learning model together with relevant information.
        The model has a function fit(X, y) and predict(X); all supervised
        estimators of scikit-learn follow this definition of a model [1].
        [1](http://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html)
    task : OpenMLTask
        Task to perform. This may be an OpenMLFlow instead if the first argument is an OpenMLTask.
    avoid_duplicate_runs : bool, optional (default=True)
        If True, the run will throw an error if the setup/task combination is already present on
        the server. This feature requires an internet connection.
    flow_tags : List[str], optional (default=None)
        A list of tags that the flow should have at creation.
    seed : int, optional (default=None)
        Models that are not seeded will get this seed.
    add_local_measures : bool, optional (default=True)
        Determines whether to calculate a set of evaluation measures locally,
        to later verify server behaviour.
    upload_flow : bool, optional (default=False)
        If True, upload the flow to OpenML if it does not exist yet.
        If False, do not upload the flow to OpenML.

    Returns
    -------
    run : OpenMLRun
        Result of the run.
    """
    if flow_tags is not None and not isinstance(flow_tags, list):
        raise ValueError("flow_tags should be a list")

    # TODO: At some point in the future do not allow for arguments in old order (changed 6-2018).
    # Flexibility currently still allowed due to code-snippet in OpenML100 paper (3-2019).
    if isinstance(flow, OpenMLTask) and isinstance(task, OpenMLFlow):
        # We want to allow either order of argument (to avoid confusion).
        warnings.warn(
            "The old argument order (Flow, model) is deprecated and "
            "will not be supported in the future. Please use the "
            "order (model, Flow).", DeprecationWarning)
        task, flow = flow, task

    if task.task_id is None:
        raise ValueError("The task should be published at OpenML")

    if flow.model is None:
        flow.model = flow.extension.flow_to_model(flow)
    flow.model = flow.extension.seed_model(flow.model, seed=seed)

    # We only need to sync with the server right now if we want to upload the flow,
    # or ensure no duplicate runs exist. Otherwise it can be synced at upload time.
    flow_id = None
    if upload_flow or avoid_duplicate_runs:
        flow_id = flow_exists(flow.name, flow.external_version)
        if isinstance(flow.flow_id, int) and flow_id != flow.flow_id:
            if flow_id:
                raise PyOpenMLError(
                    "Local flow_id does not match server flow_id: "
                    "'{}' vs '{}'".format(flow.flow_id, flow_id))
            else:
                raise PyOpenMLError("Flow does not exist on the server, "
                                    "but 'flow.flow_id' is not None.")

        if upload_flow and not flow_id:
            flow.publish()
            flow_id = flow.flow_id
        elif flow_id:
            flow_from_server = get_flow(flow_id)
            _copy_server_fields(flow_from_server, flow)
            if avoid_duplicate_runs:
                flow_from_server.model = flow.model
                setup_id = setup_exists(flow_from_server)
                ids = run_exists(task.task_id, setup_id)
                if ids:
                    error_message = ("One or more runs of this setup were "
                                     "already performed on the task.")
                    raise OpenMLRunsExistError(ids, error_message)
        else:
            # Flow does not exist on server and we do not want to upload it.
            # No sync with the server happens.
            flow_id = None

    dataset = task.get_dataset()

    run_environment = flow.extension.get_version_information()
    tags = ['openml-python', run_environment[1]]

    # execute the run
    res = _run_task_get_arffcontent(
        flow=flow,
        model=flow.model,
        task=task,
        extension=flow.extension,
        add_local_measures=add_local_measures,
    )

    data_content, trace, fold_evaluations, sample_evaluations = res

    run = OpenMLRun(
        task_id=task.task_id,
        flow_id=flow_id,
        dataset_id=dataset.dataset_id,
        model=flow.model,
        flow_name=flow.name,
        tags=tags,
        trace=trace,
        data_content=data_content,
        flow=flow,
        setup_string=flow.extension.create_setup_string(flow.model),
    )

    if (upload_flow or avoid_duplicate_runs) and flow.flow_id is not None:
        # We only extract the parameter settings if a sync happened with the server.
        # I.e. when the flow was uploaded or we found it in the avoid_duplicate check.
        # Otherwise, we will do this at upload time.
        run.parameter_settings = flow.extension.obtain_parameter_values(flow)

    # now we need to attach the detailed evaluations
    if task.task_type_id == TaskTypeEnum.LEARNING_CURVE:
        run.sample_evaluations = sample_evaluations
    else:
        run.fold_evaluations = fold_evaluations

    if flow_id:
        message = 'Executed Task {} with Flow id:{}'.format(
            task.task_id, run.flow_id)
    else:
        message = 'Executed Task {} on local Flow with name {}.'.format(
            task.task_id, flow.name)
    config.logger.info(message)

    return run
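
For this extension-based variant, a usage sketch might look as follows
(hypothetical: the SklearnExtension import path and the task id are assumptions
about the installed openml-python release):

import openml
from openml.extensions.sklearn import SklearnExtension
from sklearn.ensemble import RandomForestClassifier

task = openml.tasks.get_task(31)
flow = SklearnExtension().model_to_flow(RandomForestClassifier(n_estimators=10))
run = run_flow_on_task(flow, task, upload_flow=True, seed=1)   # publishes the flow if needed
run.publish()                                                  # upload the run itself
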
Example 3
def run_flow_on_task(flow, task, avoid_duplicate_runs=True, flow_tags=None,
                     seed=None, add_local_measures=True):
    """Run the model provided by the flow on the dataset defined by task.

    Takes the flow and repeat information into account. In case a flow is not
    yet published, it is published after executing the run (requires
    internet connection).

    Parameters
    ----------
    flow : OpenMLFlow
        A flow wraps a machine learning model together with relevant information.
        The model has a function fit(X, y) and predict(X); all supervised
        estimators of scikit-learn follow this definition of a model [1].
        This may be an OpenMLTask instead if the second argument is an
        OpenMLFlow (deprecated argument order).
        [1](http://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html)
    task : OpenMLTask
        Task to perform. This may be an OpenMLFlow instead if the first
        argument is an OpenMLTask (deprecated argument order).
    avoid_duplicate_runs : bool
        If True, the run raises an error if the setup/task combination is
        already present on the server. Works only if the flow is already
        published on the server. This feature requires an internet connection.
    flow_tags : list(str)
        A list of tags that the flow should have at creation.
    seed : int
        Models that are not seeded will be seeded automatically by an RNG,
        which is itself seeded with this value.
    add_local_measures : bool
        Determines whether to calculate a set of evaluation measures locally,
        to later verify server behaviour. Defaults to True.

    Returns
    -------
    run : OpenMLRun
        Result of the run.
    """
    if flow_tags is not None and not isinstance(flow_tags, list):
        raise ValueError("flow_tags should be list")

    # TODO: At some point in the future do not allow for arguments in old order (order changed 6-2018).
    if isinstance(flow, OpenMLTask) and isinstance(task, OpenMLFlow):
        # We want to allow either order of argument (to avoid confusion).
        warnings.warn("The old argument order (Flow, model) is deprecated and will not be supported in the future. "
                      "Please use the order (model, Flow).", DeprecationWarning)
        task, flow = flow, task

    flow.model = _get_seeded_model(flow.model, seed=seed)

    # Skip the run if it already exists on the server and the user requested
    # this via avoid_duplicate_runs. If the flow is not present on the server,
    # the check is not needed.
    flow_id = flow_exists(flow.name, flow.external_version)
    if avoid_duplicate_runs and flow_id:
        flow_from_server = get_flow(flow_id)
        flow_from_server.model = flow.model
        setup_id = setup_exists(flow_from_server)
        ids = _run_exists(task.task_id, setup_id)
        if ids:
            raise PyOpenMLError("Run already exists in server. Run id(s): %s" % str(ids))
        _copy_server_fields(flow_from_server, flow)

    dataset = task.get_dataset()

    if task.class_labels is None:
        raise ValueError('The task has no class labels. This method currently '
                         'only works for tasks with class labels.')

    run_environment = _get_version_information()
    tags = ['openml-python', run_environment[1]]

    # execute the run
    res = _run_task_get_arffcontent(flow.model, task, add_local_measures=add_local_measures)

    # If the flow does not exist, flow_id will be False (as returned by
    # flow_exists). Also check that flow.flow_id is consistent with the
    # result of openml.flows.flow_exists.
    if flow_id is False:
        if flow.flow_id is not None:
            raise ValueError('flow.flow_id is not None, but the flow does not '
                             'exist on the server according to flow_exists')
        _publish_flow_if_necessary(flow)
        # if the flow was published successfully
        # and has an id
        if flow.flow_id is not None:
            flow_id = flow.flow_id


    data_content, trace, fold_evaluations, sample_evaluations = res
    if not isinstance(flow.flow_id, int):
        # This is the usual behaviour, where the flow object was initialised
        # offline and requires some additional information (flow_id, input_id
        # for each hyperparameter) to be usable by this library.
        server_flow = get_flow(flow_id)
        openml.flows.flow._copy_server_fields(server_flow, flow)
        openml.flows.assert_flows_equal(flow, server_flow,
                                        ignore_parameter_values=True)
    else:
        # This can only happen when the function is called directly, and not
        # through "run_model_on_task"
        if flow.flow_id != flow_id:
            # This should never happen, unless the user made an error when
            # creating the flow.
            raise ValueError(
                "Result from API call flow_exists and flow.flow_id are not "
                "same: '%s' vs '%s'" % (str(flow.flow_id), str(flow_id))
            )

    run = OpenMLRun(
        task_id=task.task_id,
        flow_id=flow.flow_id,
        dataset_id=dataset.dataset_id,
        model=flow.model,
        flow_name=flow.name,
        tags=tags,
        trace=trace,
        data_content=data_content,
    )
    # TODO: currently hard-coded sklearn assumption.
    run.parameter_settings = openml.flows.obtain_parameter_values(flow)

    # now we need to attach the detailed evaluations
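    # task_type_id == 3 corresponds to learning-curve tasks
    # (TaskTypeEnum.LEARNING_CURVE in later openml-python versions)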
    if task.task_type_id == 3:
        run.sample_evaluations = sample_evaluations
    else:
        run.fold_evaluations = fold_evaluations

    config.logger.info('Executed Task %d with Flow id: %d' % (task.task_id, run.flow_id))

    return run
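
Because this variant raises PyOpenMLError when a matching run already exists on
the server, a caller can catch that error and skip local execution. A sketch,
assuming PyOpenMLError is importable from openml.exceptions and that flow and
task were constructed as in the first usage sketch:

from openml.exceptions import PyOpenMLError

try:
    run = run_flow_on_task(flow, task, avoid_duplicate_runs=True)
    run.publish()
except PyOpenMLError as err:
    # The server already has runs for this setup/task combination;
    # skip re-executing the model locally.
    print("Run skipped:", err)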