def run_flow_on_task(flow, task, avoid_duplicate_runs=True, flow_tags=None,
                     seed=None, add_local_measures=True):
    """Run the model provided by the flow on the dataset defined by task.

    Takes the flow and repeat information into account. In case a flow is
    not yet published, it is published after executing the run (requires
    internet connection).

    Parameters
    ----------
    flow : OpenMLFlow
        A flow wrapping a model which has a function fit(X,Y) and
        predict(X); all supervised estimators of scikit-learn follow this
        definition of a model [1]
        [1](http://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html)
        This may be an OpenMLTask instead if the second argument is the
        OpenMLFlow (deprecated argument order, see warning below).
    task : OpenMLTask
        Task to perform. This may be an OpenMLFlow instead if the first
        argument is an OpenMLTask (deprecated argument order).
    avoid_duplicate_runs : bool
        If this flag is set to True, the run will throw an error if the
        setup/task combination is already present on the server. Works only
        if the flow is already published on the server. This feature
        requires an internet connection.
    flow_tags : list(str)
        A list of tags that the flow should have at creation.
    seed : int
        Models that are not seeded will be automatically seeded by a RNG.
        The RNG will be seeded with this seed.
    add_local_measures : bool
        Determines whether to calculate a set of evaluation measures
        locally, to later verify server behaviour. Defaults to True.

    Returns
    -------
    run : OpenMLRun
        Result of the run.

    Raises
    ------
    ValueError
        If ``flow_tags`` is not a list, if the task has no class labels, or
        if the local ``flow.flow_id`` is inconsistent with the server.
    PyOpenMLError
        If ``avoid_duplicate_runs`` is True and an equivalent run already
        exists on the server.
    """
    if flow_tags is not None and not isinstance(flow_tags, list):
        raise ValueError("flow_tags should be list")

    # TODO: At some point in the future do not allow for arguments in old
    # order (order changed 6-2018).
    if isinstance(flow, OpenMLTask) and isinstance(task, OpenMLFlow):
        # We want to allow either order of argument (to avoid confusion).
        warnings.warn("The old argument order (Flow, model) is deprecated "
                      "and will not be supported in the future. "
                      "Please use the order (model, Flow).",
                      DeprecationWarning)
        task, flow = flow, task

    flow.model = _get_seeded_model(flow.model, seed=seed)

    # Skips the run if it already exists and the user opts for this in the
    # config file. Also, if the flow is not present on the server, the
    # check is not needed.
    flow_id = flow_exists(flow.name, flow.external_version)
    if avoid_duplicate_runs and flow_id:
        flow_from_server = get_flow(flow_id)
        flow_from_server.model = flow.model
        setup_id = setup_exists(flow_from_server)
        ids = _run_exists(task.task_id, setup_id)
        if ids:
            raise PyOpenMLError("Run already exists in server. "
                                "Run id(s): %s" % str(ids))
        _copy_server_fields(flow_from_server, flow)

    dataset = task.get_dataset()

    if task.class_labels is None:
        raise ValueError('The task has no class labels. This method '
                         'currently only works for tasks with class '
                         'labels.')

    run_environment = _get_version_information()
    tags = ['openml-python', run_environment[1]]

    # Execute the run.
    res = _run_task_get_arffcontent(flow.model, task,
                                    add_local_measures=add_local_measures)

    # In case the flow does not exist, flow_id will be False (as returned
    # by flow_exists). Also check whether there are no illegal
    # flow.flow_id values (compared to result of openml.flows.flow_exists).
    if flow_id is False:
        if flow.flow_id is not None:
            # BUG FIX: the adjacent string literals previously concatenated
            # to "does notexist"; the missing space has been restored.
            raise ValueError('flow.flow_id is not None, but the flow does '
                             'not exist on the server according to '
                             'flow_exists')
        _publish_flow_if_necessary(flow)
        # If the flow was published successfully it now has an id.
        if flow.flow_id is not None:
            flow_id = flow.flow_id

    data_content, trace, fold_evaluations, sample_evaluations = res

    if not isinstance(flow.flow_id, int):
        # This is the usual behaviour, where the flow object was initiated
        # offline and requires some additional information (flow_id,
        # input_id for each hyperparameter) to be usable by this library.
        server_flow = get_flow(flow_id)
        openml.flows.flow._copy_server_fields(server_flow, flow)
        openml.flows.assert_flows_equal(flow, server_flow,
                                        ignore_parameter_values=True)
    else:
        # This can only happen when the function is called directly, and
        # not through "run_model_on_task".
        if flow.flow_id != flow_id:
            # This should never happen, unless the user made a
            # flow-creation fault.
            raise ValueError(
                "Result from API call flow_exists and flow.flow_id are "
                "not same: '%s' vs '%s'" % (str(flow.flow_id),
                                            str(flow_id)))

    run = OpenMLRun(
        task_id=task.task_id,
        flow_id=flow.flow_id,
        dataset_id=dataset.dataset_id,
        model=flow.model,
        flow_name=flow.name,
        tags=tags,
        trace=trace,
        data_content=data_content,
    )

    # TODO: currently hard-coded sklearn assumption.
    run.parameter_settings = openml.flows.obtain_parameter_values(flow)

    # Attach the detailed evaluations; task type 3 is the learning-curve
    # task, which is evaluated per sample instead of per fold.
    if task.task_type_id == 3:
        run.sample_evaluations = sample_evaluations
    else:
        run.fold_evaluations = fold_evaluations

    config.logger.info('Executed Task %d with Flow id: %d'
                       % (task.task_id, run.flow_id))

    return run
def run_flow_on_task(
    flow: OpenMLFlow,
    task: OpenMLTask,
    avoid_duplicate_runs: bool = True,
    flow_tags: List[str] = None,
    seed: int = None,
    add_local_measures: bool = True,
    upload_flow: bool = False,
) -> OpenMLRun:
    """Run the model provided by the flow on the dataset defined by task.

    Takes the flow and repeat information into account.
    The Flow may optionally be published.

    Parameters
    ----------
    flow : OpenMLFlow
        A flow wraps a machine learning model together with relevant
        information. The model has a function fit(X,Y) and predict(X),
        all supervised estimators of scikit learn follow this definition
        of a model [1]
        [1](http://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html)
    task : OpenMLTask
        Task to perform. This may be an OpenMLFlow instead if the first
        argument is an OpenMLTask.
    avoid_duplicate_runs : bool, optional (default=True)
        If True, the run will throw an error if the setup/task combination
        is already present on the server. This feature requires an
        internet connection.
    flow_tags : List[str], optional (default=None)
        A list of tags that the flow should have at creation.
    seed : int, optional (default=None)
        Models that are not seeded will get this seed.
    add_local_measures : bool, optional (default=True)
        Determines whether to calculate a set of evaluation measures
        locally, to later verify server behaviour.
    upload_flow : bool (default=False)
        If True, upload the flow to OpenML if it does not exist yet.
        If False, do not upload the flow to OpenML.

    Returns
    -------
    run : OpenMLRun
        Result of the run.

    Raises
    ------
    ValueError
        If ``flow_tags`` is not a list, or the task is not published.
    PyOpenMLError
        If the local ``flow.flow_id`` disagrees with the server.
    OpenMLRunsExistError
        If ``avoid_duplicate_runs`` is True and an equivalent run exists.
    """
    # NOTE: the original docstring listed the ``avoid_duplicate_runs``
    # section twice verbatim; the duplicate has been removed.
    if flow_tags is not None and not isinstance(flow_tags, list):
        raise ValueError("flow_tags should be a list")

    # TODO: At some point in the future do not allow for arguments in old
    # order (changed 6-2018).
    # Flexibility currently still allowed due to code-snippet in OpenML100
    # paper (3-2019).
    if isinstance(flow, OpenMLTask) and isinstance(task, OpenMLFlow):
        # We want to allow either order of argument (to avoid confusion).
        warnings.warn("The old argument order (Flow, model) is deprecated "
                      "and will not be supported in the future. Please use "
                      "the order (model, Flow).", DeprecationWarning)
        task, flow = flow, task

    if task.task_id is None:
        raise ValueError("The task should be published at OpenML")

    if flow.model is None:
        flow.model = flow.extension.flow_to_model(flow)
    flow.model = flow.extension.seed_model(flow.model, seed=seed)

    # We only need to sync with the server right now if we want to upload
    # the flow, or ensure no duplicate runs exist. Otherwise it can be
    # synced at upload time.
    flow_id = None
    if upload_flow or avoid_duplicate_runs:
        flow_id = flow_exists(flow.name, flow.external_version)
        if isinstance(flow.flow_id, int) and flow_id != flow.flow_id:
            if flow_id:
                raise PyOpenMLError(
                    "Local flow_id does not match server "
                    "flow_id: '{}' vs '{}'".format(flow.flow_id, flow_id))
            else:
                raise PyOpenMLError("Flow does not exist on the server, "
                                    "but 'flow.flow_id' is not None.")

    if upload_flow and not flow_id:
        flow.publish()
        flow_id = flow.flow_id
    elif flow_id:
        flow_from_server = get_flow(flow_id)
        _copy_server_fields(flow_from_server, flow)
        if avoid_duplicate_runs:
            flow_from_server.model = flow.model
            setup_id = setup_exists(flow_from_server)
            ids = run_exists(task.task_id, setup_id)
            if ids:
                error_message = ("One or more runs of this setup were "
                                 "already performed on the task.")
                raise OpenMLRunsExistError(ids, error_message)
    else:
        # Flow does not exist on server and we do not want to upload it.
        # No sync with the server happens. (A redundant ``pass`` statement
        # after this assignment was removed.)
        flow_id = None

    dataset = task.get_dataset()

    run_environment = flow.extension.get_version_information()
    tags = ['openml-python', run_environment[1]]

    # Execute the run.
    res = _run_task_get_arffcontent(
        flow=flow,
        model=flow.model,
        task=task,
        extension=flow.extension,
        add_local_measures=add_local_measures,
    )

    data_content, trace, fold_evaluations, sample_evaluations = res

    run = OpenMLRun(
        task_id=task.task_id,
        flow_id=flow_id,
        dataset_id=dataset.dataset_id,
        model=flow.model,
        flow_name=flow.name,
        tags=tags,
        trace=trace,
        data_content=data_content,
        flow=flow,
        setup_string=flow.extension.create_setup_string(flow.model),
    )

    if (upload_flow or avoid_duplicate_runs) and flow.flow_id is not None:
        # We only extract the parameter settings if a sync happened with
        # the server. I.e. when the flow was uploaded or we found it in
        # the avoid_duplicate check. Otherwise, we will do this at upload
        # time.
        run.parameter_settings = flow.extension.obtain_parameter_values(flow)

    # Now we need to attach the detailed evaluations.
    if task.task_type_id == TaskTypeEnum.LEARNING_CURVE:
        run.sample_evaluations = sample_evaluations
    else:
        run.fold_evaluations = fold_evaluations

    if flow_id:
        message = 'Executed Task {} with Flow id:{}'.format(
            task.task_id, run.flow_id)
    else:
        message = 'Executed Task {} on local Flow with name {}.'.format(
            task.task_id, flow.name)
    config.logger.info(message)

    return run
# NOTE(review): this definition is a byte-for-byte duplicate of the
# run_flow_on_task defined earlier in this file; at import time the later
# definition silently shadows the earlier ones. Confirm which version
# should be kept and delete the others.
def run_flow_on_task(flow, task, avoid_duplicate_runs=True, flow_tags=None,
                     seed=None, add_local_measures=True):
    """Run the model provided by the flow on the dataset defined by task.

    Takes the flow and repeat information into account. In case a flow is
    not yet published, it is published after executing the run (requires
    internet connection).

    Parameters
    ----------
    flow : OpenMLFlow
        A flow wrapping a model which has a function fit(X,Y) and
        predict(X); all supervised estimators of scikit-learn follow this
        definition of a model [1]
        [1](http://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html)
        This may be an OpenMLTask instead if the second argument is the
        OpenMLFlow (deprecated argument order, see warning below).
    task : OpenMLTask
        Task to perform. This may be an OpenMLFlow instead if the first
        argument is an OpenMLTask (deprecated argument order).
    avoid_duplicate_runs : bool
        If this flag is set to True, the run will throw an error if the
        setup/task combination is already present on the server. Works only
        if the flow is already published on the server. This feature
        requires an internet connection.
    flow_tags : list(str)
        A list of tags that the flow should have at creation.
    seed : int
        Models that are not seeded will be automatically seeded by a RNG.
        The RNG will be seeded with this seed.
    add_local_measures : bool
        Determines whether to calculate a set of evaluation measures
        locally, to later verify server behaviour. Defaults to True.

    Returns
    -------
    run : OpenMLRun
        Result of the run.

    Raises
    ------
    ValueError
        If ``flow_tags`` is not a list, if the task has no class labels, or
        if the local ``flow.flow_id`` is inconsistent with the server.
    PyOpenMLError
        If ``avoid_duplicate_runs`` is True and an equivalent run already
        exists on the server.
    """
    if flow_tags is not None and not isinstance(flow_tags, list):
        raise ValueError("flow_tags should be list")

    # TODO: At some point in the future do not allow for arguments in old
    # order (order changed 6-2018).
    if isinstance(flow, OpenMLTask) and isinstance(task, OpenMLFlow):
        # We want to allow either order of argument (to avoid confusion).
        warnings.warn("The old argument order (Flow, model) is deprecated "
                      "and will not be supported in the future. "
                      "Please use the order (model, Flow).",
                      DeprecationWarning)
        task, flow = flow, task

    flow.model = _get_seeded_model(flow.model, seed=seed)

    # Skips the run if it already exists and the user opts for this in the
    # config file. Also, if the flow is not present on the server, the
    # check is not needed.
    flow_id = flow_exists(flow.name, flow.external_version)
    if avoid_duplicate_runs and flow_id:
        flow_from_server = get_flow(flow_id)
        flow_from_server.model = flow.model
        setup_id = setup_exists(flow_from_server)
        ids = _run_exists(task.task_id, setup_id)
        if ids:
            raise PyOpenMLError("Run already exists in server. "
                                "Run id(s): %s" % str(ids))
        _copy_server_fields(flow_from_server, flow)

    dataset = task.get_dataset()

    if task.class_labels is None:
        raise ValueError('The task has no class labels. This method '
                         'currently only works for tasks with class '
                         'labels.')

    run_environment = _get_version_information()
    tags = ['openml-python', run_environment[1]]

    # Execute the run.
    res = _run_task_get_arffcontent(flow.model, task,
                                    add_local_measures=add_local_measures)

    # In case the flow does not exist, flow_id will be False (as returned
    # by flow_exists). Also check whether there are no illegal
    # flow.flow_id values (compared to result of openml.flows.flow_exists).
    if flow_id is False:
        if flow.flow_id is not None:
            # BUG FIX: the adjacent string literals previously concatenated
            # to "does notexist"; the missing space has been restored.
            raise ValueError('flow.flow_id is not None, but the flow does '
                             'not exist on the server according to '
                             'flow_exists')
        _publish_flow_if_necessary(flow)
        # If the flow was published successfully it now has an id.
        if flow.flow_id is not None:
            flow_id = flow.flow_id

    data_content, trace, fold_evaluations, sample_evaluations = res

    if not isinstance(flow.flow_id, int):
        # This is the usual behaviour, where the flow object was initiated
        # offline and requires some additional information (flow_id,
        # input_id for each hyperparameter) to be usable by this library.
        server_flow = get_flow(flow_id)
        openml.flows.flow._copy_server_fields(server_flow, flow)
        openml.flows.assert_flows_equal(flow, server_flow,
                                        ignore_parameter_values=True)
    else:
        # This can only happen when the function is called directly, and
        # not through "run_model_on_task".
        if flow.flow_id != flow_id:
            # This should never happen, unless the user made a
            # flow-creation fault.
            raise ValueError(
                "Result from API call flow_exists and flow.flow_id are "
                "not same: '%s' vs '%s'" % (str(flow.flow_id),
                                            str(flow_id)))

    run = OpenMLRun(
        task_id=task.task_id,
        flow_id=flow.flow_id,
        dataset_id=dataset.dataset_id,
        model=flow.model,
        flow_name=flow.name,
        tags=tags,
        trace=trace,
        data_content=data_content,
    )

    # TODO: currently hard-coded sklearn assumption.
    run.parameter_settings = openml.flows.obtain_parameter_values(flow)

    # Attach the detailed evaluations; task type 3 is the learning-curve
    # task, which is evaluated per sample instead of per fold.
    if task.task_type_id == 3:
        run.sample_evaluations = sample_evaluations
    else:
        run.fold_evaluations = fold_evaluations

    config.logger.info('Executed Task %d with Flow id: %d'
                       % (task.task_id, run.flow_id))

    return run