Example #1
def _apply_binding_to_task(task, binding, allow_object=False):
    """
    Helper method to apply bindings to a given task. This function modifies the given task object and returns
    it modified.

    When specifying bindings, there is no qualification for a binding being an input, output, or param, so when
    trying to apply the binding we need to try each. A TypeError is thrown whenever the name for an input is already
    applied to another type of input (ex. set_input('foo') called, then later set_param('foo','bar') will throw a
    TypeError. So it is expected that the name for each binding throw a TypeError for the other types of slot names
    (so if param named 'foo', then TypeError should be thrown for set_input('foo') and set_output('foo')
    """
    for param_name, param_value in binding.items():
        bound = False

        if param_name in task.get_inputs():
            try:
                # special-case when task.run() is specifying input binding with object
                # this way when _realize_input is called it will find the object dependency
                if allow_object is True and _Task._is_valid_data_structure(param_value):
                    if not hasattr(task, "_local_binding") or task._local_binding is None:
                        task._local_binding = {}
                    task._local_binding[param_name] = param_value
                else:
                    task.set_inputs({param_name: param_value})
                bound = True
                __LOGGER__.debug("Applied binding named: '%s' as input, with value: '%s'" % (param_name, param_value))
            except TypeError:
                pass

        if param_name in task.get_outputs():
            try:
                task.set_outputs({param_name: param_value})
                bound = True
                __LOGGER__.debug("Applied binding named: '%s' as output, with value: '%s'" % (param_name, param_value))
            except TypeError:
                pass

        if param_name in task.get_params():
            try:
                task.set_params({param_name: param_value})
                bound = True
                __LOGGER__.debug("Applied binding named: '%s' as param, with value: '%s'" % (param_name, param_value))
            except TypeError:
                pass

        if not bound:
            __LOGGER__.warning(
                "Binding not applied since not found in input, output, or params. Name: '%s', value: '%s'"
                % (param_name, param_value)
            )

    return task
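As a minimal usage sketch (everything here is hypothetical: `task` is assumed to be an existing GraphLab Task whose declared inputs include 'data' and whose params include 'alpha'), the helper is handed a plain, unqualified dict of bindings and decides per name whether it lands as an input, output, or param, logging a warning for any name it cannot place:

# Hypothetical sketch; 'task' is assumed to be an existing Task object with an
# input slot named 'data' and a param slot named 'alpha'.
bindings = {
    'data': 's3://my-bucket/training-data',  # lands as an input
    'alpha': 0.5,                            # lands as a param
    'typo_name': 42,                         # matches nothing, so a warning is logged
}
task = _apply_binding_to_task(task, bindings)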
Example #2
def create(tasks=None, name=None, environment=None, function=None, function_arguments=None, required_packages=None):
    """
    Creates a Job and begins executing it. The Job can be defined by either
    specifying a list of tasks, with optional bindings, or with a function,
    with arguments defined. Each Job also needs to know where to run, and that
    is specified by the environment.

    By default, this method will kick off asynchronous work, and return a Job
    object to monitor/manage that work.

    Parameters
    ----------
    tasks : list [Task | str | tuple [ str, dict ]] | str
        List of Tasks to run.

    name : str
        Name for this execution (names the returned Job). Default is environment name + timestamp.

    environment : :class:`~graphlab.deploy.environment.EC2` |
                  :class:`~graphlab.deploy.environment.Hadoop` |
                  :class:`~graphlab.deploy.environment.LocalAsync`
        Optional environment for execution. This would commonly hold access
        keys, launch locations etc.  Also included in the environment object is
        a dictionary for associated metadata to pass to the execution. Default
        is 'LocalAsync', which will have the execution occur in the background
        locally.

    function : function
        Function to be executed in this Job, with arguments to pass to this
        function specified by function_arguments. If a function is specified,
        then tasks cannot be specified. Specifying a function makes it easy to
        get code running in a remote environment.

        If the function returns a dict, it will be collated into the results.
        If the function returns something other than a dict, it will be cast
        to a str and that value will be collated into the results. See the
        examples below for more information.

    function_arguments : dict | list [ dict ] | :class:`~graphlab.SFrame`
        Arguments to pass to the specified function as kwargs. To run multiple
        invocations of the function, simply specify a list of arguments or an
        SFrame. Each element in the list will result in invoking the function
        once. Each row of the SFrame will be used to invoke the function.

    required_packages : list [ str ] | set [ str ]
        List of package requirements, in the same format as distutils.requires,
        for packages needed to run this Job. This is most useful for specifying
        any non-standard Python packages required to run the specified
        function.

    Returns
    -------
    job : :py:class:`~graphlab.deploy._job.Job`
        Used for monitoring and managing the execution of the Job.

    Notes
    -----
    - When this method is invoked, each Task specified is cloned and a snapshot
      of it is used for execution. This snapshot can then be queried by
      inspecting the Job object returned.

    Examples
    --------
    Creating a Job using a function instead of Tasks is easy. Just define a
    function and then use it when calling job.create.

    Using a list of dicts to specify arguments:
        >>> def sum_four(one, two, three, four):
        >>>     return {'sum': one + two + three + four}
        >>>
        >>> job = graphlab.deploy.job.create(
        >>>             function=sum_four,
        >>>             function_arguments=[{'one':1, 'two':2,
        >>>                 'three':3, 'four':4}])
        >>>
        >>> results = job.get_results() # SFrame with aggregated results

    Using an SFrame to specify multiple sets of arguments:
        >>> def mult_three(one, two, three):
        >>>     return {'product': one * two * three}
        >>>
        >>> sf = graphlab.SFrame(data={'one':[1,5], 'two':[2,6], 'three':[3,7]})
        >>> job = graphlab.deploy.job.create(function=mult_three,
        >>>                                  function_arguments=sf)
        >>>
        >>> results = job.get_results() # SFrame with aggregated results
        >>> results
        +----+--------------------------------+------------------+-----------+
        | id |             input              |      result      |   status  |
        +----+--------------------------------+------------------+-----------+
        | 0  | {'three': 3, 'two': 2, 'on ... |  {'product': 6}  | Completed |
        | 1  | {'three': 7, 'two': 6, 'on ... | {'product': 210} | Completed |
        +----+--------------------------------+------------------+-----------+
        +---------------------------+---------------------------+---------+
        |         start_time        |          end_time         | message |
        +---------------------------+---------------------------+---------+
        | 2014-11-17 11:06:38+00:00 | 2014-11-17 11:06:38+00:00 |         |
        | 2014-11-17 11:06:40+00:00 | 2014-11-17 11:06:40+00:00 |         |
        +---------------------------+---------------------------+---------+
        [2 rows x 7 columns]

    Each entry in the tasks list could be a pair with a dictionary of bindings
    for that entry. For example:

        >>> tasks = [('task1', {'input':'s3://big-file'}),
        >>>          ('task2', {'input':'/localfile'})]
        >>> graphlab.deploy.job.create(tasks, name='with-bindings')

    """
    tracker = _mt._get_metric_tracker()
    _session = _gl.deploy._default_session

    if tasks is None and function is None:
        raise TypeError("tasks or function needs to be defined")
    if tasks is not None and function is not None:
        raise TypeError("Cannot specify BOTH tasks and function")

    if environment is None:
        environment = _gl.deploy.environments["async"]
        if environment is None:
            environment = _environment.LocalAsync("async")
            environment.save()
    else:
        if isinstance(environment, str):
            environment_name = environment
            __LOGGER__.debug("Loading environment: %s" % environment_name)
            environment = _gl.deploy.environments[environment_name]
            if environment is None:
                raise TypeError(
                    "Environment cannot be loaded with name '%s', please confirm this environment exists by calling graphlab.deploy.environments."
                    % environment_name
                )

        elif not isinstance(environment, _environment.Environment):
            raise Exception("Unknown type of environment")

    # always clone the environment, so not mutating existing
    environment = environment.clone()
    __LOGGER__.info("Preparing using environment: %s" % environment.name)

    if name is not None:
        if not isinstance(name, str):
            raise TypeError("The name you gave for this job is not a string.")

    __LOGGER__.info("Beginning Job Validation.")

    # special handling for function= parameter
    combiner = None
    if function is not None:
        # clobber tasks specified and create a Task for the execution of the function
        tasks = []
        if not _inspect.isfunction(function):
            raise TypeError("Invalid function, must be a Python function.")

        bindings = function_arguments
        if bindings is None:
            bindings = [{}]
        elif not isinstance(bindings, list) and not isinstance(bindings, _gl.SFrame):
            bindings = [bindings]

        # if no name was specified, generate one so the Task names are prefixed with the Job name to ensure uniqueness
        if name is None or name == "":
            name = "job-%s-%s-%s" % (function.__name__, environment.name, _time.time())

        combiner = _Task(name + "-combiner")
        combiner.set_code(_combiner_task)

        for idx, binding in enumerate(bindings):
            task = _Task("%s-%s-%d" % (name, function.__name__, idx))
            task.set_code(_wrapper_task)

            # validate that no GL data structures are being passed in function_arguments
            for key, value in binding.items():
                if _Task._is_valid_data_structure(value):
                    raise RuntimeError(
                        "Validation Failed: Unsupported type for function_arguments. Function arguments must be basic types that can be serialized into JSON. Invalid function_argument: '%s', type: '%s'"
                        % (key, type(value))
                    )

            task.set_params({"params": binding, "function": function})
            task.set_outputs(["output"])

            # create dependency for output from task to combiner task
            combiner.set_inputs({"in-%d" % idx: (task, "output")})
            tasks.append(task)
            _gl.deploy.tasks.delete(task)

        combiner.set_params({"num": len(bindings)})
        tasks.append(combiner)
        _gl.deploy.tasks.delete(combiner)
        tracker.track("deploy.job.create.fn", value=1)

    # now make the artifacts a list of objects
    if not isinstance(tasks, list):
        # not a list, let's turn it into a list
        tasks = [tasks]

    # if Environment object missing num_hosts attribute, set to 1
    if not hasattr(environment, "num_hosts"):
        environment.num_hosts = 1

    # If environment.num_hosts > 1 and not using model_parameter_search or parallel_for_each,
    # reset num_hosts to 1, since multiple hosts will not be used.
    if environment.num_hosts != 1 and all(isinstance(x, _Task) for x in tasks):
        __LOGGER__.warning(
            "Ignoring Environment.num_hosts value since execution will occur only on one host. Using num_hosts=1 for this execution."
        )
        environment.num_hosts = 1

    # add required packages to first task in execution
    # ensures the packages will be present on execution
    if required_packages is not None:
        packages = tasks[0].get_required_packages()
        tasks[0].set_required_packages(packages | set(required_packages))

    if name is None or name == "":
        task = tasks[0]
        if isinstance(task, tuple):
            task = task[0]
        if isinstance(task, str):
            names = task
        else:
            names = task.name
        name = "job-%s-%s-%s" % (names, environment.name, _time.time())

    # if using fn= parameter, we need to wait until name has been determined to
    # set the results_path, so now that the name is settled, set results_path
    if combiner is not None:
        results_path = _get_results_path(environment, name)
        __LOGGER__.info("Job Results SFrame stored: %s" % results_path)
        combiner.set_outputs({"results": results_path})

    validation_msgs = []

    # verify job name is unique
    if _gl.deploy.jobs[name] is not None:
        # found another job same name, fail
        raise RuntimeError(
            "Validation Error: Job already exists with the name '%s', please rename or delete the existing job." % name
        )

    # Create artifact from their names, if necessary. Clone all artifacts. Add any bindings.
    cloned_artifacts = []
    using_pipeline = False
    for steps in tasks:

        # handle a Pipeline differently than a Task
        if isinstance(steps, _Pipeline):
            using_pipeline = True
            binding = None
            if isinstance(steps, tuple):
                (cur_artifact, binding) = steps
            else:
                cur_artifact = steps
            if not isinstance(cur_artifact, _Task) and not isinstance(cur_artifact, _Pipeline):
                cur_artifact = _session._open(cur_artifact, {}, check_cache=True, typename="Task")

            clone = cur_artifact._clone(cur_artifact.name, session_aware=False)

            # apply bindings if paired with task
            if binding is not None:
                _apply_binding_to_task(clone, binding)
            cloned_artifacts.append(clone)
            continue

        if not isinstance(steps, list):
            steps = [steps]

        cloned_step = []
        for step in steps:
            binding = None
            if isinstance(step, tuple):
                (cur_artifact, binding) = step
            else:
                cur_artifact = step
            if not isinstance(cur_artifact, _Task) and not isinstance(cur_artifact, _Pipeline):
                cur_artifact = _session._open(cur_artifact, {}, check_cache=True, typename="Task")

            if cur_artifact is None:
                raise TypeError("Unable to find Task to try to run")

            clone = cur_artifact._clone(cur_artifact.name, session_aware=False)

            # apply bindings if paired with task
            if binding is not None:
                _apply_binding_to_task(clone, binding)

            # if environment is not local then write out any outputs not bound to a location to an
            # intermediate location, so any subsequent steps can find the output
            _validate_output_to_environment(clone, environment, validation_msgs)

            cloned_step.append(clone)

        cloned_artifacts.append(cloned_step)

    num_tasks = len(cloned_artifacts)
    if isinstance(environment, _environment.Local):
        tracker.track("deploy.job.create.local", value=1, properties={"num_tasks": num_tasks})
        env = _env.LocalExecutionEnvironment()
    elif isinstance(environment, _environment.LocalAsync):
        tracker.track("deploy.job.create.localasync", value=1, properties={"num_tasks": num_tasks})
        env = _env.LocalAsynchronousEnvironment()
    elif isinstance(environment, _environment.EC2):
        tracker.track("deploy.job.create.ec2", value=1, properties={"num_tasks": num_tasks})
        # name the ec2 instance the job name
        if not environment.tags:
            environment.tags = {}
        if not "Name" in environment.tags:
            environment.tags["Name"] = name
        environment.tags["Job"] = name
        env = _env.Ec2ExecutionEnvironment()
    elif isinstance(environment, _environment.Hadoop):
        tracker.track("deploy.job.create.hadoop", value=1, properties={"num_tasks": num_tasks})
        env = _env.HadoopExecutionEnvironment()
    else:
        raise Exception("Validation Failed: Unknown execution environment.")

    if len(validation_msgs) > 0:
        for msg in validation_msgs:
            __LOGGER__.error(msg)
        raise RuntimeError(
            "Validation Failed: output(s) not set to appropriate location for execution environment. See logs for more details."
        )

    try:
        __LOGGER__.info("Validation complete. Job: '%s' ready for execution" % name)
        job = env.run(_session, cloned_artifacts, name, environment)
        _session.register(job)
        job.save()  # save the job once prior to returning.

        # add a .get_results() method to this job.
        if function is not None:
            job.get_results = _types.MethodType(_get_results, job)

        return job
    except LicenseValidationException as e:
        # catch exception and print license check hint message here instead of raise
        __LOGGER__.info(e)
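For completeness, a hedged end-to-end sketch of create() exercising the parameters the docstring examples leave out (environment, name, and required_packages). The environment name 'ec2', the function train, and the package pin are assumptions; leaving environment unset falls back to the default LocalAsync environment as described above.

import graphlab

def train(penalty):
    # The returned dict is collated into the results SFrame, one row per invocation.
    return {'penalty': penalty, 'score': 1.0 / (1.0 + penalty)}

job = graphlab.deploy.job.create(
    function=train,
    function_arguments=[{'penalty': 0.1}, {'penalty': 1.0}],
    environment='ec2',             # assumed: a previously saved EC2 environment, looked up by name and cloned
    name='train-sweep',            # must be unique among existing jobs
    required_packages=['scikit-learn==0.15.2'])  # distutils-style requirement strings (version is illustrative)

results = job.get_results()        # SFrame collating each invocation's returned dict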