Exemple #1
0
def _create_map_job(function, parameter_set, name=None, environment=None, combiner_function=None, _job_type="PIPELINE"):
    """
    Build a job that applies `function` once per entry of `parameter_set`,
    optionally followed by a single combiner task.

    The job is only constructed and returned here; nothing in this function
    submits it for execution.

    Parameters
    ----------
    function : function
        Function used as the prototype task for every parameter dictionary.

    parameter_set : iterable of dict
        One dictionary of inputs per task. Any iterable is accepted (it is
        fully expanded into a list); every element must be a `dict`.

    name : str, optional
        Name of the job. When falsy, "<function name>-<timestamp>" is used.
        If the default session already holds a job with that name, a short
        random suffix is appended until the name is unused.

    environment : optional
        Passed through `_job._validate_job_create_args` and handed to the Job.

    combiner_function : function, optional
        When given, a task built from it receives every map task as input and
        becomes the final stage. Otherwise the first stage of map tasks is the
        final stage.

    _job_type : str
        Forwarded unchanged to the Job constructor.

    Returns
    -------
    job : Job
        The newly constructed job.

    Raises
    ------
    RuntimeError
        If `parameter_set` is empty, including an iterable (such as a
        generator) that yields no elements.
    TypeError
        If any element of `parameter_set` is not a dictionary.
    """

    _raise_error_if_not_function(function)

    # Name the job
    now = _datetime.now().strftime("%b-%d-%Y-%H-%M-%S")
    function_name = _sub("[<>]", "", function.__name__)

    name = "%s-%s" % (function_name, now) if not name else name

    # Validate args
    function, name, environment = _job._validate_job_create_args(function, name, environment)
    _session = _gl.deploy._default_session
    while _session.exists(name, __job.Job._typename):
        # Name clash: keep appending a 5-character random suffix until free.
        rand = str(_uuid.uuid4())[:5]
        old_name = name
        name = "%s-%s" % (name, rand)
        __LOGGER__.info("A job with name '%s' already exists. " "Renaming the job to '%s'." % (old_name, name))

    # Reject inputs that are already known to be empty (None, [], ...).
    if not parameter_set:
        raise RuntimeError("An empty parameter_set was given. Nothing to do.")

    # If parameter set is a generator/SFrame, make sure it gets expanded out.
    parameter_set_copy = []
    for i in parameter_set:
        if not isinstance(i, dict):
            raise TypeError(
                "'parameter_set' has to be an iterable of dictionary."
                " For void functions, use an empty dictionary as inputs."
            )
        parameter_set_copy.append(i)

    # A generator is always truthy, so the check above cannot detect an empty
    # one; re-check once the iterable has actually been expanded.
    if not parameter_set_copy:
        raise RuntimeError("An empty parameter_set was given. Nothing to do.")

    # Create the task.
    task_prototype = _task.Task(function, function_name)
    for_each_iterations = _generate_mapjob_tasks(task_prototype, parameter_set_copy)

    # List of outputs for the final step.
    if not combiner_function:
        list_of_tasks = for_each_iterations[0]
    else:
        combiner = _task.Task(combiner_function)

        # The input to this task is all other tasks
        task_name_to_task = {}
        for stage in for_each_iterations:
            for t in stage:
                task_name_to_task[t.name] = t
        combiner.set_inputs_from_task(task_name_to_task)

        # The combiner runs as its own, last stage.
        for_each_iterations.append([combiner])
        list_of_tasks = combiner

    # Create the job
    job = __job.Job(
        name, stages=for_each_iterations, environment=environment, final_stage=list_of_tasks, _job_type=_job_type
    )
    return job
    def __init__(self, func, name=None, description=None):
        """
        Create a new Task from a function, optionally specifying its name and
        a description.

        Parameters
        ----------
        func : function
            Function this Task wraps. Its positional argument names (plus the
            `*args` name, if any) become the Task's inputs, and its default
            values are bound to the corresponding inputs.

        name : str, optional
            Name of the Task. Defaults to `func.__name__` when falsy.

        description : str, optional
            When not None, passed to `set_description`.
        """

        # Must be a function
        _raise_error_if_not_function(func, "func")

        # Set the name
        name = func.__name__ if not name else name
        _raise_error_if_not_of_type(name, str, "name")

        self.name = name
        self._data = dict()
        self._data['code'] = None
        self._data['codestr'] = None
        self._data['inputs'] = dict()
        self._data['output'] = None
        self._data['packages'] = set()
        self._data['description'] = ''
        self._modified_since_last_saved = None

        if description is not None:
            self.set_description(description)

        # Inspect the function. `inspect.getargspec` was removed in
        # Python 3.11, so prefer `getfullargspec` whenever it exists; both
        # results expose `.args`, `.varargs` and `.defaults`.
        get_spec = getattr(_inspect, 'getfullargspec', None) or _inspect.getargspec
        specs = get_spec(func)
        varargs = specs.varargs
        defaults = _copy.copy(specs.defaults)
        args = _copy.copy(specs.args)

        # Set the code to function arguments + *args + **kwargs
        self.set_code(func)

        # Set the inputs
        all_args = _copy.copy(args)
        if varargs:
            all_args.append(varargs)
        self.set_inputs(all_args)

        # Bind default values (defaults line up with the trailing arguments).
        if defaults:
            for index, arg in enumerate(args[-len(defaults):]):
                self.set_inputs({arg: defaults[index]})

        # Set required packages
        if _sys.version_info.major == 3:
            func_dict = func.__dict__
        else:
            func_dict = func.func_dict
        # Without this step the attribute dictionary fetched above was never
        # consulted and the required packages were silently dropped.
        if 'required_packages' in func_dict:
            self.set_required_packages(func_dict['required_packages'])
    def __init__(self, func, name=None, description=None):
        """
        Create a new Task from a function, optionally specifying its name and
        a description.

        Parameters
        ----------
        func : function
            Function this Task wraps. Its positional argument names (plus the
            `*args` name, if any) become the Task's inputs, and its default
            values are bound to the corresponding inputs. A
            `required_packages` entry in the function's attribute dictionary,
            if present, is passed to `set_required_packages`.

        name : str, optional
            Name of the Task. Defaults to `func.__name__` when falsy.

        description : str, optional
            When not None, passed to `set_description`.
        """

        # Must be a function
        _raise_error_if_not_function(func, "func")

        # Set the name
        name = func.__name__ if not name else name
        _raise_error_if_not_of_type(name, str, "name")

        self.name = name
        self._data = dict()
        self._data['code'] = None
        self._data['codestr'] = None
        self._data['inputs'] = dict()
        self._data['output'] = None
        self._data['packages'] = set()
        self._data['description'] = ''
        self._modified_since_last_saved = None

        if description is not None:
            self.set_description(description)

        # Inspect the function. `inspect.getargspec` was removed in
        # Python 3.11, so prefer `getfullargspec` whenever it exists; both
        # results expose `.args`, `.varargs` and `.defaults`.
        get_spec = getattr(_inspect, 'getfullargspec', None) or _inspect.getargspec
        specs = get_spec(func)
        varargs = specs.varargs
        defaults = _copy.copy(specs.defaults)
        args = _copy.copy(specs.args)

        # Set the code to function arguments + *args + **kwargs
        self.set_code(func)

        # Set the inputs
        all_args = _copy.copy(args)
        if varargs:
            all_args.append(varargs)
        self.set_inputs(all_args)

        # Bind default values (defaults line up with the trailing arguments).
        if defaults:
            for index, arg in enumerate(args[-len(defaults):]):
                self.set_inputs({arg: defaults[index]})

        # Set required packages. `func.__dict__` exists on both Python 2 and
        # Python 3, whereas the `func_dict` alias is Python 2 only.
        func_attributes = func.__dict__
        if 'required_packages' in func_attributes:
            self.set_required_packages(func_attributes['required_packages'])
    def set_code(self, code):
        """
        Set the function to run when this Task is executed.

        The function object is stored under `_data['code']` and its source
        text, obtained with `inspect.getsource`, under `_data['codestr']`.
        `_set_dirty_bit` is then called on the Task.

        Parameters
        ----------
        code : function
            Function to be called when this Task is executed. Must not be a
            method object (as reported by `inspect.ismethod`).

        Returns
        -------
        self : Task

        Raises
        ------
        TypeError
            If `code` is an instance method.

        Notes
        -----
        `inspect.getsource` itself raises when the source of `code` cannot be
        retrieved; that error is not handled here.

        Examples
        --------
        Using a defined function:

        >>> def double(x):
        >>>     return x * 2

        >>> t1 = graphlab.deploy._task.Task(double)
        >>> t1.set_code(double)

        """

        # Make sure it is a function.
        _raise_error_if_not_function(code)

        # Cannot work with instance method
        if _inspect.ismethod(code):
            raise TypeError(("Function cannot be an instance method, please"
                             " use a function."))

        # code is callable, so store it as is
        self._data['code'] = code
        self._data['codestr'] = _inspect.getsource(code)
        self._set_dirty_bit()

        # Return the Task itself, as the documented contract promises.
        return self
    def set_code(self, code):
        """
        Set the function to run when this Task is executed.

        The function object is stored under `_data['code']` and its source
        text, obtained with `inspect.getsource`, under `_data['codestr']`.
        `_set_dirty_bit` is then called on the Task.

        Parameters
        ----------
        code : function
            Function to be called when this Task is executed. Must not be a
            method object (as reported by `inspect.ismethod`).

        Returns
        -------
        self : Task

        Raises
        ------
        TypeError
            If `code` is an instance method.

        Notes
        -----
        `inspect.getsource` itself raises when the source of `code` cannot be
        retrieved; that error is not handled here.

        Examples
        --------
        Using a defined function:

        >>> def double(x):
        >>>     return x * 2

        >>> t1 = graphlab.deploy._task.Task(double)
        >>> t1.set_code(double)

        """

        # Make sure it is a function.
        _raise_error_if_not_function(code)

        # Cannot work with instance method
        if _inspect.ismethod(code):
            raise TypeError(("Function cannot be an instance method, please"
                             " use a function."))

        # code is callable, so store it as is
        self._data['code'] = code
        self._data['codestr'] = _inspect.getsource(code)
        self._set_dirty_bit()

        # Return the Task itself, as the documented contract promises.
        return self
def create(function, name=None, environment=None, **kwargs):
    """
    Run an arbitrary function as a Job in a (possibly remote) environment.

    A single Task is built from `function`, with `kwargs` as its inputs, and
    wrapped in a one-stage Job. The Job is handed to the execution
    environment resolved from `environment`, then registered and saved in
    the default session before being returned. By default the work is
    started asynchronously and the returned Job is used to monitor and
    manage it.

    Parameters
    ----------
    function : function
        Function to be executed in this Job. Its arguments are supplied
        through `kwargs`.

    name : str, optional
        Name for this execution (names the returned Job). When not given,
        the function name followed by a time-stamp is used. If a Job with
        the chosen name already exists in the session, a short random suffix
        is appended until the name is unused. Valid characters in a job name
        include: digits, characters, '-' and '_'.

    environment : :class:`~graphlab.deploy.hadoop_cluster.HadoopCluster` | :class:`~graphlab.deploy.ec2_cluster.Ec2Cluster` | :class:`~graphlab.deploy.LocalAsync`, optional
        Environment to execute in. If set to None, a `LocalAsync` by the
        name `async` is created and used, which runs the code in the
        background on your local machine.

    kwargs:
        Keyword arguments passed to `function` when it is executed.

    Returns
    -------
    job : :py:class:`~graphlab.deploy.Job`
        Used for monitoring and managing the execution of the Job.

    See Also
    --------
    graphlab.deploy.map_job.create, graphlab.deploy.Job

    Examples
    --------
    A simple job that adds two numbers:

    .. sourcecode:: python

        def add(x, y):
            return x + y

        job = graphlab.deploy.job.create(add, x=1, y=1)

        # Waits for the job to complete before retrieving the results.
        >>> print(job.get_results())
        2

    Exceptions raised inside the function are captured by the job:

    .. sourcecode:: python

        def add(x, y):
            if x and y:
                return x + y
            else:
                raise ValueError('x or y cannot be None')

        job = graphlab.deploy.job.create(add, x=1, y=None)

        >>> print(job.get_results())
        None

        # The exception message and traceback are reported by
        # job.get_metrics() (columns include task_name, status, start_time,
        # run_time, exception_message and exception_traceback).
        >>> print(job.get_metrics())

    A function that needs a package installed can be annotated with a
    decorator:

    .. sourcecode:: python

        @graphlab.deploy.required_packages(['names == 0.3.0'])
        def my_function(number = 10):
            import names
            people = [names.get_full_name() for i in range(number)]
            sf = graphlab.SFrame({'names':people})
            return sf

        job = graphlab.deploy.job.create(my_function)

        # The result is an SFrame with one `names` column and 10 rows.
        >>> print(job.get_results())

    State required by the function (helper functions, globals, SFrames,
    GraphLab models etc.) is captured automatically:

    .. sourcecode:: python

        GLOBAL_CONSTANT = 10

        def foo(x):
            return x + 1

        def bar(x):
            return x + 2

        def my_function(x, y):
            foo_x = foo(x)
            bar_y = bar(y)
            return foo_x + bar_y + GLOBAL_CONSTANT

        job = graphlab.deploy.job.create(my_function, x = 1, y = 1)

        >>> print(job.get_results())
        15

    The same job can be executed remotely by passing a different
    environment:

    .. sourcecode:: python

        def add(x, y):
            return x + y

        ec2 = graphlab.deploy.Ec2Config()
        c = graphlab.deploy.ec2_cluster.create('my_cluster', 's3://bucket/path', ec2)

        job = graphlab.deploy.job.create(add, environment=c, x=1, y=1)

        >>> print(job.get_results())
        2

    Notes
    -----
    - When an exception is raised within the deployed function,
      :func:`~graphlab.deploy.Job.get_results` returns None.

    - For asynchronous jobs, :func:`~graphlab.deploy.Job.get_results` is a
      blocking call which waits for the job execution to complete before
      returning the results.

    """
    session = _gl.deploy._default_session

    _raise_error_if_not_function(function)
    _get_metric_tracker().track('jobs.job')

    # Derive a default job name from the function name and the current time.
    timestamp = _datetime.now().strftime('%b-%d-%Y-%H-%M-%S')
    function_name = _sub('[<>]', '', function.__name__)
    if not name:
        name = '%s-%s' % (function_name, timestamp)

    # Validate args
    function, name, environment = _validate_job_create_args(
        function, name, environment)

    # Resolve name clashes by appending random suffixes until unused.
    while session.exists(name, _job.Job._typename):
        suffix = str(_uuid.uuid4())[:5]
        taken_name = name
        name = "%s-%s" % (taken_name, suffix)
        rename_notice = ("A job with name '%s' already exists. "
                         "Renaming the job to '%s'." % (taken_name, name))
        __LOGGER__.info(rename_notice)

    # A single task, in a single stage, that is also the final stage.
    job_task = _task.Task(function, function_name)
    job_task.set_inputs(kwargs)
    pending_job = _job.Job(
        name,
        stages=[[job_task]],
        environment=environment,
        final_stage=job_task,
    )

    # Hand the job over to the execution environment.
    __LOGGER__.info("Validation complete. Job: '%s' ready for execution." % name)
    executor = _env._get_execution_env(environment)
    job = executor.run_job(pending_job)

    # Local environments report completion; all others report scheduling.
    if isinstance(environment, _environment.Local):
        __LOGGER__.info("Job: '%s' finished." % name)
    else:
        __LOGGER__.info("Job: '%s' scheduled." % name)

    # Save the job and return to user
    session.register(job)
    session.save(job)
    return job
Exemple #7
0
def create(function, name=None, environment=None, **kwargs):
    """
    Run an arbitrary function as a Job in a (possibly remote) environment.

    A single Task is built from `function`, with `kwargs` as its inputs, and
    wrapped in a one-stage Job. The Job is handed to the execution
    environment resolved from `environment`, then registered and saved in
    the default session before being returned. By default the work is
    started asynchronously and the returned Job is used to monitor and
    manage it.

    Parameters
    ----------
    function : function
        Function to be executed in this Job. Its arguments are supplied
        through `kwargs`.

    name : str, optional
        Name for this execution (names the returned Job). When not given,
        the function name followed by a time-stamp is used. If a Job with
        the chosen name already exists in the session, a short random suffix
        is appended until the name is unused. Valid characters in a job name
        include: digits, characters, '-' and '_'.

    environment : :class:`~graphlab.deploy.hadoop_cluster.HadoopCluster` | :class:`~graphlab.deploy.ec2_cluster.Ec2Cluster` | :class:`~graphlab.deploy.LocalAsync`, optional
        Environment to execute in. If set to None, a `LocalAsync` by the
        name `async` is created and used, which runs the code in the
        background on your local machine.

    kwargs:
        Keyword arguments passed to `function` when it is executed.

    Returns
    -------
    job : :py:class:`~graphlab.deploy.Job`
        Used for monitoring and managing the execution of the Job.

    See Also
    --------
    graphlab.deploy.map_job.create, graphlab.deploy.Job

    Examples
    --------
    A simple job that adds two numbers:

    .. sourcecode:: python

        def add(x, y):
            return x + y

        job = graphlab.deploy.job.create(add, x=1, y=1)

        # Waits for the job to complete before retrieving the results.
        >>> print(job.get_results())
        2

    Exceptions raised inside the function are captured by the job:

    .. sourcecode:: python

        def add(x, y):
            if x and y:
                return x + y
            else:
                raise ValueError('x or y cannot be None')

        job = graphlab.deploy.job.create(add, x=1, y=None)

        >>> print(job.get_results())
        None

        # The exception message and traceback are reported by
        # job.get_metrics() (columns include task_name, status, start_time,
        # run_time, exception_message and exception_traceback).
        >>> print(job.get_metrics())

    A function that needs a package installed can be annotated with a
    decorator:

    .. sourcecode:: python

        @graphlab.deploy.required_packages(['names == 0.3.0'])
        def my_function(number = 10):
            import names
            people = [names.get_full_name() for i in range(number)]
            sf = graphlab.SFrame({'names':people})
            return sf

        job = graphlab.deploy.job.create(my_function)

        # The result is an SFrame with one `names` column and 10 rows.
        >>> print(job.get_results())

    State required by the function (helper functions, globals, SFrames,
    GraphLab models etc.) is captured automatically:

    .. sourcecode:: python

        GLOBAL_CONSTANT = 10

        def foo(x):
            return x + 1

        def bar(x):
            return x + 2

        def my_function(x, y):
            foo_x = foo(x)
            bar_y = bar(y)
            return foo_x + bar_y + GLOBAL_CONSTANT

        job = graphlab.deploy.job.create(my_function, x = 1, y = 1)

        >>> print(job.get_results())
        15

    The same job can be executed remotely by passing a different
    environment:

    .. sourcecode:: python

        def add(x, y):
            return x + y

        ec2 = graphlab.deploy.Ec2Config()
        c = graphlab.deploy.ec2_cluster.create('my_cluster', 's3://bucket/path', ec2)

        job = graphlab.deploy.job.create(add, environment=c, x=1, y=1)

        >>> print(job.get_results())
        2

    Notes
    -----
    - When an exception is raised within the deployed function,
      :func:`~graphlab.deploy.Job.get_results` returns None.

    - For asynchronous jobs, :func:`~graphlab.deploy.Job.get_results` is a
      blocking call which waits for the job execution to complete before
      returning the results.

    """
    session = _gl.deploy._default_session

    _raise_error_if_not_function(function)
    _get_metric_tracker().track('jobs.job')

    # Fall back to "<function name>-<timestamp>" when no name was supplied.
    created_at = _datetime.now().strftime('%b-%d-%Y-%H-%M-%S')
    function_name = _sub('[<>]', '', function.__name__)
    if not name:
        name = '%s-%s' % (function_name, created_at)

    # Validate args
    validated = _validate_job_create_args(function, name, environment)
    function, name, environment = validated

    # Keep appending a random suffix while the name is already in use.
    while session.exists(name, _job.Job._typename):
        random_part = str(_uuid.uuid4())[:5]
        previous_name = name
        name = "%s-%s" % (previous_name, random_part)
        __LOGGER__.info(
            "A job with name '%s' already exists. "
            "Renaming the job to '%s'." % (previous_name, name)
        )

    # Setup the task & job: one task forms the only stage and the final one.
    single_task = _task.Task(function, function_name)
    single_task.set_inputs(kwargs)
    prepared_job = _job.Job(name, stages=[[single_task]],
                            environment=environment, final_stage=single_task)

    # Setup the env.
    ready_message = "Validation complete. Job: '%s' ready for execution." % name
    __LOGGER__.info(ready_message)
    execution_env = _env._get_execution_env(environment)
    job = execution_env.run_job(prepared_job)

    # Local environments report completion; all others report scheduling.
    ran_locally = isinstance(environment, _environment.Local)
    if ran_locally:
        __LOGGER__.info("Job: '%s' finished." % name)
    else:
        __LOGGER__.info("Job: '%s' scheduled." % name)

    # Save the job and return to user
    session.register(job)
    session.save(job)
    return job