def create(function, name=None, environment=None, **kwargs):
    """
    Run an arbitrary function as a Job in a (possibly remote) environment.

    The job is described by a single function; all functions called from
    within that function are captured automatically. By default, this method
    kicks off asynchronous work and returns a Job object that can be used to
    monitor/manage that work.

    Parameters
    ----------
    function : function
        Function to be executed in this Job. The arguments passed to it are
        the ones given in `kwargs`.

    name : str, optional
        Name for this execution (names the returned Job). If set to None, the
        job is named after the function followed by a time-stamp. Valid
        characters in job name include: digits, characters, '-' and '_'.
        If a job with the resulting name already exists in the default
        session, a short random suffix is appended and the rename is logged.

    environment : :class:`~graphlab.deploy.hadoop_cluster.HadoopCluster` |
                  :class:`~graphlab.deploy.ec2_cluster.Ec2Cluster` |
                  :class:`~graphlab.deploy.LocalAsync`, optional
        Optional environment for execution. If set to None, then a
        `LocalAsync` by the name `async` is created and used. This will
        execute the code in the background on your local machine.

    kwargs:
        Function kwargs that are passed to the function for execution.

    Returns
    -------
    job : :py:class:`~graphlab.deploy.Job`
        Used for monitoring and managing the execution of the Job.

    See Also
    --------
    graphlab.deploy.map_job.create, graphlab.deploy.Job

    Examples
    --------
    Let us start out with a simple example to execute a function that can
    add two numbers.

    .. sourcecode:: python

        # Define a function
        def add(x, y):
            return x + y

        # Create a job.
        job = graphlab.deploy.job.create(add, x=1, y=1)

        # Get results from the execution when ready. This call waits for the
        # job to complete before retrieving the results.
        >>> print job.get_results()
        2

    Exceptions within the function calls can be captured as follows:

    .. sourcecode:: python

        def add(x, y):
            if x and y:
                return x + y
            else:
                raise ValueError('x or y cannot be None')

        # Job execution capture the exception raised by the function.
        job = graphlab.deploy.job.create(add, x=1, y=None)

        # Get results from the execution when ready. This call waits for the
        # job to complete before retrieving the results.
        >>> print job.get_results()
        None

        # Get the exceptions raised from this execution by calling
        # job.get_metrics()
        >>> print job.get_metrics()
        +-----------+--------+------------+----------+-----------------------+
        | task_name | status | start_time | run_time |   exception_message   |
        +-----------+--------+------------+----------+-----------------------+
        |    add    | Failed | 1427928898 |   None   | x or y cannot be None |
        +-----------+--------+------------+----------+-----------------------+
        +-------------------------------+
        |      exception_traceback      |
        +-------------------------------+
        | Traceback (most recent cal... |
        +-------------------------------+
        [1 rows x 6 columns]

    If a function requires a package to be installed, the function can be
    annotated with a decorator.

    .. sourcecode:: python

        @graphlab.deploy.required_packages(['names == 0.3.0'])
        def my_function(number = 10):
            import names
            people = [names.get_full_name() for i in range(number)]
            sf = graphlab.SFrame({'names':people})
            return sf

        job = graphlab.deploy.job.create(my_function)

        >>> print job.get_results()
        Columns:
                names    str
        Data:
        +-------------------+
        |       names       |
        +-------------------+
        |   Annette Logan   |
        |   Nancy Anthony   |
        |  Tiffany Zupancic |
        |    Andre Coppin   |
        |     Robert Coe    |
        |    Donald Dean    |
        |    Lynne Bunton   |
        |   John Sartwell   |
        |   Peter Nicholas  |
        | Chester Rodriguez |
        +-------------------+
        [10 rows x 1 columns]

    Complex functions that require SFrames, GraphLab models etc. can be
    deployed with ease. All additional state required by the function are
    automatically captured.

    .. sourcecode:: python

        GLOBAL_CONSTANT = 10

        def foo(x):
            return x + 1

        def bar(x):
            return x + 2

        def my_function(x, y):
            foo_x = foo(x)
            bar_y = bar(y)
            return foo_x + bar_y + GLOBAL_CONSTANT

        # Automatically captures all state needed by the deployed function.
        job = graphlab.deploy.job.create(my_function, x = 1, y = 1)

        >>> print job.get_results()
        15

    You can execute the same job remotely by passing a different environment.

    .. sourcecode:: python

        # Define a function
        def add(x, y):
            return x + y

        # Define an EC2 environment
        ec2 = graphlab.deploy.Ec2Config()

        # Create an EC2 cluster object
        c = graphlab.deploy.ec2_cluster.create('my_cluster',
                                               's3://bucket/path', ec2)

        # Create a job.
        job = graphlab.deploy.job.create(add, environment=c, x=1, y=1)

        >>> print job.get_results()
        2

    Notes
    -----
    - When an exception is raised within the deployed function,
      :func:`~graphlab.deploy.Job.get_results` returns None.

    - For asynchronous jobs, :func:`~graphlab.deploy.Job.get_results` is a
      blocking call which will wait for the job execution to complete
      before returning the results.
    """
    _session = _gl.deploy._default_session

    _raise_error_if_not_function(function)
    _get_metric_tracker().track('jobs.job')

    # Derive the default job name: "<function name>-<timestamp>". Angle
    # brackets are stripped from the function name before it is used.
    timestamp = _datetime.now().strftime('%b-%d-%Y-%H-%M-%S')
    function_name = _sub('[<>]', '', function.__name__)
    if not name:
        name = '%s-%s' % (function_name, timestamp)

    # Validation hands back the (possibly normalised) arguments.
    function, name, environment = _validate_job_create_args(
        function, name, environment)

    # Keep appending a 5-character uuid fragment until the name is unused
    # in the session, logging every rename.
    while _session.exists(name, _job.Job._typename):
        taken_name = name
        suffix = str(_uuid.uuid4())[:5]
        name = "%s-%s" % (taken_name, suffix)
        rename_notice = ("A job with name '%s' already exists. "
                         "Renaming the job to '%s'." % (taken_name, name))
        __LOGGER__.info(rename_notice)

    # A single task wrapping the function forms the only stage of the job,
    # and is also its final stage.
    single_task = _task.Task(function, function_name)
    single_task.set_inputs(kwargs)
    job = _job.Job(name,
                   stages=[[single_task]],
                   environment=environment,
                   final_stage=single_task)

    # Hand the job to the execution environment matching `environment`.
    ready_notice = "Validation complete. Job: '%s' ready for execution." % name
    __LOGGER__.info(ready_notice)
    runner = _env._get_execution_env(environment)
    job = runner.run_job(job)

    # The log wording depends on the environment type: "finished" for a
    # Local environment, "scheduled" for anything else.
    if isinstance(environment, _environment.Local):
        status_notice = "Job: '%s' finished." % name
    else:
        status_notice = "Job: '%s' scheduled." % name
    __LOGGER__.info(status_notice)

    # Record the job in the session and return it to the caller.
    _session.register(job)
    _session.save(job)
    return job
def create(function, parameter_set, name=None, environment=None,
           combiner_function=None):
    """
    Run a function once per entry of `parameter_set`, as a distributed Job.

    Similar to the map() function in python, this method `maps` a single
    function to each provided set of parameters. The results are a list
    corresponding to each parameter in `parameter_set`.

    Parameters
    ----------
    function : function
        Function to be executed, with arguments to pass to this function
        specified by parameter_set.

    parameter_set : iterable of dict
        Each element of the list corresponds to an evaluation of the function
        with the dictionary argument.

    name : str, optional
        Name for the returned Job. If set to None, then the name of the Job
        is set to the name of the function with a timestamp.

    environment : :class:`~graphlab.deploy.hadoop_cluster.HadoopCluster` |
                  :class:`~graphlab.deploy.ec2_cluster.Ec2Cluster` |
                  :class:`~graphlab.deploy.LocalAsync`, optional
        Optional environment for execution. If set to None, then a
        `LocalAsync` by the name `async` is created and used. This will
        execute the code in the background on your local machine.

    combiner_function : function (kwargs -> object), optional
        An optional function that will be run once at end of the map_job.
        The combiner function will have access to all previous map job
        results. If a combiner is provided, only the output of the combiner
        will be reported. The input type of the combiner is `kwargs` where
        the values (in order) correspond to the values of the results from
        the map.

    Returns
    -------
    job : :py:class:`~graphlab.deploy.Job`
        The job for the map_job, which was run using the `environment`
        parameter. This object can be used to track the progress of
        map_job work.

    See Also
    --------
    graphlab.deploy.job.create, graphlab.deploy.Job

    Notes
    -----
    - The map job achieves the same behavior as `results = map(func, args)`

    Examples
    --------
    Let us start out with a simple example to execute a function that can
    add two numbers over 2 sets of arguments.

    .. sourcecode:: python

        # Define the function.
        def add(x, y):
            return x + y

        # Create a map-job
        params = [{'x': 1, 'y': 2}, {'x': 10, 'y': -1}]
        job = graphlab.deploy.map_job.create(add, params)

        # Get results from the execution when ready. This call waits for the
        # job to complete before retrieving the results.
        >>> print job.get_results()
        [3, 9]

    Exceptions within the function calls can be captured as follows:

    .. sourcecode:: python

        def add(x, y):
            if x and y:
                return x + y
            else:
                raise ValueError('x or y cannot be None')

        params = [{'x': 1, 'y': 2}, {'x': 10, 'y': None}]
        job = graphlab.deploy.map_job.create(add, params)

        # Get results from the execution when ready.
        >>> print job.get_results()
        [3, None]

        # Get the exceptions raised from this execution by calling
        # job.get_metrics()
        >>> print job.get_metrics()
        +-----------+-----------+------------+-------------------+-----------------------+
        | task_name |   status  | start_time |      run_time     |   exception_message   |
        +-----------+-----------+------------+-------------------+-----------------------+
        |  add-0-0  | Completed | 1427931034 | 3.81469726562e-05 |                       |
        |  add-1-0  |   Failed  | 1427931034 |        None       | x or y cannot be None |
        +-----------+-----------+------------+-------------------+-----------------------+
        +-------------------------------+
        |      exception_traceback      |
        +-------------------------------+
        |                               |
        | Traceback (most recent cal... |
        +-------------------------------+
        [2 rows x 6 columns]

    Use the combiner function to perform aggregations on the results.

    .. sourcecode:: python

        # Combiner to combine all results from the map.
        def max_combiner(**kwargs):
            return max(kwargs.values())

        # The function being mapped to the arguments.
        def add(x, y):
            return x + y

        # Create a map-job.
        params = [{'x': 1, 'y': 2}, {'x': 10, 'y': -1}]
        job = graphlab.deploy.map_job.create(add, params,
                                    combiner_function = max_combiner)

        # Get results. (Applies the combiner on the results of the map.)
        >>> print job.get_results()
        9
    """
    # Report usage: number of parameter sets and whether a combiner is used.
    tracker = _get_metric_tracker()
    usage_properties = {
        "num_tasks": len(parameter_set),
        "has_combiner": combiner_function is not None,
    }
    tracker.track("jobs.map_job", properties=usage_properties)

    _session = _gl.deploy._default_session

    # Job construction is delegated to the shared helper.
    job = _create_map_job(function, parameter_set, name, environment,
                          combiner_function, _job_type="PIPELINE")

    # Hand the job to the execution environment matching `environment`.
    ready_notice = ("Validation complete. Job: '%s' ready for execution"
                    % job.name)
    __LOGGER__.info(ready_notice)
    runner = _env._get_execution_env(environment)
    job = runner.run_job(job)

    # The log wording depends on the environment type: "finished" for a
    # Local environment, "scheduled" for anything else.
    if isinstance(environment, _environment.Local):
        status_notice = "Job: '%s' finished." % job.name
    else:
        status_notice = "Job: '%s' scheduled." % job.name
    __LOGGER__.info(status_notice)

    # Record the job in the session and return it to the caller.
    _session.register(job)
    _session.save(job)
    return job