Example #1
0
def create(function, name=None, environment=None, **kwargs):
    """
    Run an arbitrary function as a job in a remote or background environment.

    The job is specified as a function; every function it calls is captured
    automatically. By default this kicks off asynchronous work and returns a
    :py:class:`~graphlab.deploy.Job` handle for monitoring and management.

    Parameters
    ----------
    function : function
        The function to execute; its arguments are supplied via `kwargs`.

    name : str, optional
        Name for this execution (names the returned Job). If None, the job is
        named after the function plus a timestamp. Valid characters in a job
        name are digits, letters, '-' and '_'.

    environment : :class:`~graphlab.deploy.hadoop_cluster.HadoopCluster` | :class:`~graphlab.deploy.ec2_cluster.Ec2Cluster` | :class:`~graphlab.deploy.LocalAsync`, optional
        Environment for execution. If None, a `LocalAsync` environment named
        `async` is created and used, executing the code in the background on
        the local machine.

    kwargs:
        Keyword arguments forwarded to `function` at execution time.

    Returns
    -------
    job : :py:class:`~graphlab.deploy.Job`
        Handle used for monitoring and managing the execution of the Job.

    See Also
    --------
    graphlab.deploy.map_job.create, graphlab.deploy.Job

    Examples
    --------
    Execute a simple function that adds two numbers:

    .. sourcecode:: python

        def add(x, y):
            return x + y

        job = graphlab.deploy.job.create(add, x=1, y=1)

        # Blocks until the job completes, then returns the result.
        >>> print job.get_results()
        2

    Run the same job remotely by passing a different environment:

    .. sourcecode:: python

        ec2 = graphlab.deploy.Ec2Config()
        c = graphlab.deploy.ec2_cluster.create('my_cluster', 's3://bucket/path', ec2)
        job = graphlab.deploy.job.create(add, environment=c, x=1, y=1)

    Notes
    -----
    - If the deployed function raises an exception,
      :func:`~graphlab.deploy.Job.get_results` returns None; the exception
      details are available from :func:`~graphlab.deploy.Job.get_metrics`.

    - For asynchronous jobs, :func:`~graphlab.deploy.Job.get_results` blocks
      until the job execution completes before returning the results.
    """
    _session = _gl.deploy._default_session

    _raise_error_if_not_function(function)

    _get_metric_tracker().track('jobs.job')

    # Default the job name to "<function>-<timestamp>"; angle brackets are
    # stripped so names like "<lambda>" stay within the valid character set.
    timestamp = _datetime.now().strftime('%b-%d-%Y-%H-%M-%S')
    cleaned_function_name = _sub('[<>]', '', function.__name__)
    if not name:
        name = '%s-%s' % (cleaned_function_name, timestamp)

    # Validate args
    function, name, environment = _validate_job_create_args(
        function, name, environment)

    # Keep appending short random suffixes until the name is unique within
    # the session.
    while _session.exists(name, _job.Job._typename):
        suffix = str(_uuid.uuid4())[:5]
        previous_name = name
        name = "%s-%s" % (name, suffix)
        __LOGGER__.info("A job with name '%s' already exists. "
                        "Renaming the job to '%s'." % (previous_name, name))

    # Wrap the function in a single task and build a one-stage job around it.
    task = _task.Task(function, cleaned_function_name)
    task.set_inputs(kwargs)
    job = _job.Job(name, stages=[[task]], environment=environment,
                   final_stage=task)

    # Hand the job off to the execution environment.
    __LOGGER__.info("Validation complete. Job: '%s' ready for execution." % name)
    exec_env = _env._get_execution_env(environment)
    job = exec_env.run_job(job)

    # Local environments run synchronously; anything else is scheduled.
    if isinstance(environment, _environment.Local):
        __LOGGER__.info("Job: '%s' finished." % name)
    else:
        __LOGGER__.info("Job: '%s' scheduled." % name)

    # Persist the job in the session and return the handle to the caller.
    _session.register(job)
    _session.save(job)
    return job
Example #2
0
def create(function, parameter_set, name=None, environment=None, combiner_function=None):
    """
    Distributed execution of a function once for each entry in the parameter_set.

    Similar to the map() function in python, this method `maps` a single function
    to each provided set of parameters. The results are a list corresponding to each
    parameter in `parameter_set`.

    Parameters
    ----------
    function : function
        Function to be executed, with arguments to pass to this
        function specified by parameter_set.

    parameter_set : iterable of dict
        Each element of the list corresponds to an evaluation of the function
        with the dictionary argument.

    name : str, optional
        Name for the returned Job. If set to None, then the name of the Job is
        set to the name of the function with a timestamp.

    environment : :class:`~graphlab.deploy.hadoop_cluster.HadoopCluster` | :class:`~graphlab.deploy.ec2_cluster.Ec2Cluster` | :class:`~graphlab.deploy.LocalAsync`, optional
        Optional environment for execution. If set to None, then a `LocalAsync`
        by the name `async` is created and used. This will execute the code in
        the background on your local machine.

    combiner_function : function (kwargs -> object), optional
        An optional function that will be run once at end of the map_job. The
        combiner function will have access to all previous map job results. If
        a combiner is provided, only the output of the combiner will be reported.
        The input type of the combiner is `kwargs` where the values (in order)
        correspond to the values of the results from the map.

    Returns
    -------
    job : :py:class:`~graphlab.deploy.Job`
        The job for the map_job, which was run using the `environment`
        parameter. This object can be used to track the progress of
        map_job work.

    See Also
    --------
    graphlab.deploy.job.create, graphlab.deploy.Job

    Notes
    -----
    - The map job achieves the same behavior as `results = map(func, args)`

    Examples
    --------
    Let us start out with a simple example to execute a function that can
    add two numbers over 2 sets of arguments.

    .. sourcecode:: python

      # Define the function.
      def add(x, y):
          return x + y

      # Create a map-job
      params = [{'x': 1, 'y': 2}, {'x': 10, 'y': -1}]
      job = graphlab.deploy.map_job.create(add, params)

      # Get results from the execution when ready. This call waits for the
      # job to complete before retrieving the results.
      >>> print job.get_results()
      [3, 9]

    Exceptions within the function calls can be captured as follows:

    .. sourcecode:: python

        def add(x, y):
            if x and y:
                return x + y
            else:
                raise ValueError('x or y cannot be None')

        params = [{'x': 1, 'y': 2}, {'x': 10, 'y': None}]
        job = graphlab.deploy.map_job.create(add, params)

        # Get results from the execution when ready.
        >>> print job.get_results()
        [3, None]

        # Get the exceptions raised from this execution by calling
        # job.get_metrics()
        >>> print job.get_metrics()
        +-----------+-----------+------------+-------------------+-----------------------+
        | task_name |   status  | start_time |      run_time     |   exception_message   |
        +-----------+-----------+------------+-------------------+-----------------------+
        |  add-0-0  | Completed | 1427931034 | 3.81469726562e-05 |                       |
        |  add-1-0  |   Failed  | 1427931034 |        None       | x or y cannot be None |
        +-----------+-----------+------------+-------------------+-----------------------+
        +-------------------------------+
        |      exception_traceback      |
        +-------------------------------+
        |                               |
        | Traceback (most recent cal... |
        +-------------------------------+
        [2 rows x 6 columns]


    Use the combiner function to perform aggregations on the results.

    .. sourcecode:: python

      # Combiner to combine all results from the map.
      def max_combiner(**kwargs):
          return max(kwargs.values())

      # The function being mapped to the arguments.
      def add(x, y):
           return x + y

      # Create a map-job.
      params = [{'x': 1, 'y': 2}, {'x': 10, 'y': -1}]
      job = graphlab.deploy.map_job.create(add, params,
                                    combiner_function = max_combiner)

      # Get results. (Applies the combiner on the results of the map.)
      >>> print job.get_results()
      9
    """
    # Materialize the parameter set up front: `len()` below would raise
    # TypeError on a generator/iterator, and a one-shot iterable would
    # otherwise be consumed before the map job could evaluate it. This keeps
    # the documented "iterable of dict" contract while remaining fully
    # backward compatible for list inputs.
    parameter_set = list(parameter_set)

    _get_metric_tracker().track(
        "jobs.map_job", properties={"num_tasks": len(parameter_set), "has_combiner": combiner_function is not None}
    )

    _session = _gl.deploy._default_session

    # Build the underlying pipeline job: one task per parameter dict, plus an
    # optional combiner stage that sees all map results.
    job = _create_map_job(function, parameter_set, name, environment, combiner_function, _job_type="PIPELINE")

    # Setup the env.
    __LOGGER__.info("Validation complete. Job: '%s' ready for execution" % job.name)
    exec_env = _env._get_execution_env(environment)
    job = exec_env.run_job(job)

    # Save the job and return to user. Local environments run synchronously,
    # so the job is already finished; anything else is merely scheduled.
    if not isinstance(environment, _environment.Local):
        __LOGGER__.info("Job: '%s' scheduled." % job.name)
    else:
        __LOGGER__.info("Job: '%s' finished." % job.name)

    _session.register(job)
    _session.save(job)
    return job