Example #1
def create_map_job(config, internal_storage, executor_id, map_job_id, map_function, iterdata, runtime_meta,
                   runtime_memory=None, extra_params=None, extra_env=None, obj_chunk_size=None,
                   obj_chunk_number=None, remote_invocation=False, remote_invocation_groups=None,
                   invoke_pool_threads=128, include_modules=[], exclude_modules=[], is_remote_cluster=False,
                   execution_timeout=EXECUTION_TIMEOUT):
    """
    Wrapper to create a map job.  It integrates COS logic to process objects.
    """
    map_func = map_function
    map_iterdata = utils.verify_args(map_function, iterdata, extra_params)
    new_invoke_pool_threads = invoke_pool_threads
    new_runtime_memory = runtime_memory

    # Object processing functionality
    parts_per_object = None
    if utils.is_object_processing_function(map_function):
        # If it is an object processing function, create partitions according to chunk_size or chunk_number
        logger.debug('ExecutorID {} | JobID {} - Calling map on partitions from object storage flow'.format(executor_id, map_job_id))
        map_iterdata, parts_per_object = create_partitions(config, map_iterdata, obj_chunk_size, obj_chunk_number)
    # ########

    # Remote invocation functionality
    original_total_tasks = len(map_iterdata)
    if original_total_tasks == 1 or is_remote_cluster:
        remote_invocation = False
    if remote_invocation:
        def remote_invoker(input_data):
            pw = pywren.ibm_cf_executor()
            return pw.map(map_function, input_data,
                          runtime_memory=runtime_memory,
                          invoke_pool_threads=invoke_pool_threads,
                          extra_env=extra_env)

        map_func = remote_invoker
        if remote_invocation_groups:
            map_iterdata = [[iterdata[x:x+remote_invocation_groups]]
                            for x in range(0, original_total_tasks, remote_invocation_groups)]
        else:
            map_iterdata = [iterdata]
        map_iterdata = utils.verify_args(remote_invoker, map_iterdata, extra_params)
        new_invoke_pool_threads = 1
        new_runtime_memory = runtime_memory
    # ########

    job_description = _create_job(config, internal_storage, executor_id,
                                  map_job_id, map_func, map_iterdata,
                                  runtime_meta=runtime_meta,
                                  runtime_memory=new_runtime_memory,
                                  extra_env=extra_env,
                                  invoke_pool_threads=new_invoke_pool_threads,
                                  include_modules=include_modules,
                                  exclude_modules=exclude_modules,
                                  remote_invocation=remote_invocation,
                                  original_total_tasks=original_total_tasks,
                                  execution_timeout=execution_timeout)

    job_description['parts_per_object'] = parts_per_object

    return job_description
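A note on the remote-invocation branch above: it batches the input so that each remote executor receives one group of tasks, and each group is wrapped in an extra list so that remote_invoker() receives it as a single input_data argument. A minimal standalone sketch of that grouping expression; the sample values are illustrative, not taken from the snippet:

# Illustrative values; only the grouping expression is from the snippet.
iterdata = list(range(10))
remote_invocation_groups = 4
batches = [[iterdata[x:x + remote_invocation_groups]]
           for x in range(0, len(iterdata), remote_invocation_groups)]
# batches == [[[0, 1, 2, 3]], [[4, 5, 6, 7]], [[8, 9]]]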
Example #2
def create_map_job(config,
                   internal_storage,
                   executor_id,
                   job_id,
                   map_function,
                   iterdata,
                   runtime_meta,
                   runtime_memory=None,
                   extra_params=None,
                   extra_env=None,
                   obj_chunk_size=None,
                   obj_chunk_number=None,
                   invoke_pool_threads=128,
                   include_modules=[],
                   exclude_modules=[],
                   execution_timeout=None):
    """
    Wrapper to create a map job.  It integrates COS logic to process objects.
    """
    job_created_timestamp = time.time()
    map_func = map_function
    map_iterdata = utils.verify_args(map_function, iterdata, extra_params)
    new_invoke_pool_threads = invoke_pool_threads
    new_runtime_memory = runtime_memory

    if config['pywren'].get('rabbitmq_monitor', False):
        rabbit_amqp_url = config['rabbitmq'].get('amqp_url')
        utils.create_rabbitmq_resources(rabbit_amqp_url, executor_id, job_id)

    # Object processing functionality
    parts_per_object = None
    if is_object_processing_function(map_function):
        # If it is an object processing function, create partitions according to chunk_size or chunk_number
        logger.debug(
            'ExecutorID {} | JobID {} - Calling map on partitions from object storage flow'
            .format(executor_id, job_id))
        map_iterdata, parts_per_object = create_partitions(
            config, map_iterdata, obj_chunk_size, obj_chunk_number)
    # ########

    job_description = _create_job(config,
                                  internal_storage,
                                  executor_id,
                                  job_id,
                                  map_func,
                                  map_iterdata,
                                  runtime_meta=runtime_meta,
                                  runtime_memory=new_runtime_memory,
                                  extra_env=extra_env,
                                  invoke_pool_threads=new_invoke_pool_threads,
                                  include_modules=include_modules,
                                  exclude_modules=exclude_modules,
                                  execution_timeout=execution_timeout,
                                  job_created_timestamp=job_created_timestamp)

    if parts_per_object:
        job_description['parts_per_object'] = parts_per_object

    return job_description
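This variant additionally creates RabbitMQ resources when monitoring is enabled. A minimal sketch of the config shape implied by the two lookups in the snippet, config['pywren'].get('rabbitmq_monitor', False) and config['rabbitmq'].get('amqp_url'); the URL is a placeholder, and no keys beyond the two the snippet reads are implied:

# Placeholder AMQP URL; only the two keys the snippet reads are shown.
config = {
    'pywren': {'rabbitmq_monitor': True},
    'rabbitmq': {'amqp_url': 'amqp://user:password@rabbitmq-host:5672/'},
}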
Example #3
def create_map_job(config,
                   internal_storage,
                   executor_id,
                   map_job_id,
                   map_function,
                   iterdata,
                   runtime_meta,
                   runtime_memory=None,
                   extra_params=None,
                   extra_env=None,
                   obj_chunk_size=None,
                   obj_chunk_number=None,
                   invoke_pool_threads=128,
                   include_modules=[],
                   exclude_modules=[],
                   execution_timeout=EXECUTION_TIMEOUT):
    """
    Wrapper to create a map job.  It integrates COS logic to process objects.
    """
    map_func = map_function
    map_iterdata = utils.verify_args(map_function, iterdata, extra_params)
    new_invoke_pool_threads = invoke_pool_threads
    new_runtime_memory = runtime_memory

    # Object processing functionality
    parts_per_object = None
    if utils.is_object_processing_function(map_function):
        # If it is an object processing function, create partitions according to chunk_size or chunk_number
        logger.debug(
            'ExecutorID {} | JobID {} - Calling map on partitions from object storage flow'
            .format(executor_id, map_job_id))
        map_iterdata, parts_per_object = create_partitions(
            config, map_iterdata, obj_chunk_size, obj_chunk_number)
    # ########

    job_description = _create_job(config,
                                  internal_storage,
                                  executor_id,
                                  map_job_id,
                                  map_func,
                                  map_iterdata,
                                  runtime_meta=runtime_meta,
                                  runtime_memory=new_runtime_memory,
                                  extra_env=extra_env,
                                  invoke_pool_threads=new_invoke_pool_threads,
                                  include_modules=include_modules,
                                  exclude_modules=exclude_modules,
                                  execution_timeout=execution_timeout)

    job_description['parts_per_object'] = parts_per_object

    return job_description
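Each variant so far hands object inputs to create_partitions, which splits them into partitions by chunk size or chunk count and also returns parts_per_object. The snippets do not show its internals; the sketch below is a hypothetical illustration of size-based byte-range splitting, with made-up helper and field names, not the library's implementation:

# Hypothetical sketch: split an object of obj_size bytes into byte ranges
# of at most obj_chunk_size bytes. Helper and field names are illustrative.
def byte_range_partitions(obj_size, obj_chunk_size):
    ranges = []
    start = 0
    while start < obj_size:
        end = min(start + obj_chunk_size, obj_size)
        ranges.append({'data_byte_range': (start, end - 1)})
        start = end
    return ranges

# e.g. a 10 MiB object split into 4 MiB chunks -> 3 partitions
parts = byte_range_partitions(10 * 1024 * 1024, 4 * 1024 * 1024)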
Example #4
    def run(self):
        """
        Runs the function
        """
        logger.info("Started")
        result = None
        exception = False
        try:
            self.internal_storage = InternalStorage(self.storage_config)
            self.internal_storage.tmp_obj_prefix = self.output_key.rsplit(
                '/', 1)[0]
            loaded_func_all = self._get_function_and_modules()
            self._save_modules(loaded_func_all['module_data'])
            function = self._unpickle_function(loaded_func_all['func'])
            data = self._load_data()

            if is_object_processing_function(function):
                self._create_data_stream(data)

            self._fill_optional_args(function, data)

            if self.show_memory:
                logger.debug(
                    "Memory usage before call the function: {}".format(
                        get_current_memory_usage()))

            logger.info("Going to execute '{}()'".format(str(
                function.__name__)))
            print('---------------------- FUNCTION LOG ----------------------',
                  flush=True)
            func_exec_time_t1 = time.time()
            result = function(**data)
            func_exec_time_t2 = time.time()
            print('----------------------------------------------------------',
                  flush=True)
            logger.info("Success function execution")

            if self.show_memory:
                logger.debug("Memory usage after call the function: {}".format(
                    get_current_memory_usage()))

            self.stats.write('function_exec_time',
                             round(func_exec_time_t2 - func_exec_time_t1, 8))

            # Check for new futures
            if result is not None:
                self.stats.write("result", True)
                if isinstance(result, ResponseFuture) or \
                   (type(result) == list and len(result) > 0 and isinstance(result[0], ResponseFuture)):
                    self.stats.write('new_futures', True)

                logger.debug("Pickling result")
                output_dict = {'result': result}
                pickled_output = pickle.dumps(output_dict)

                if self.show_memory:
                    logger.debug(
                        "Memory usage after output serialization: {}".format(
                            get_current_memory_usage()))
            else:
                logger.debug("No result to store")
                self.stats.write("result", False)

        except Exception:
            exception = True
            self.stats.write("exception", True)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            print('----------------------- EXCEPTION !-----------------------',
                  flush=True)
            traceback.print_exc(file=sys.stdout)
            print('----------------------------------------------------------',
                  flush=True)

            if self.show_memory:
                logger.debug("Memory usage after call the function: {}".format(
                    get_current_memory_usage()))

            try:
                logger.debug("Pickling exception")
                pickled_exc = pickle.dumps(
                    (exc_type, exc_value, exc_traceback))
                pickle.loads(pickled_exc)  # this is just to make sure they can be unpickled
                self.stats.write("exc_info", str(pickled_exc))

            except Exception as pickle_exception:
                # Shockingly often, modules like subprocess don't properly
                # call the base Exception.__init__, which results in them
                # being unpickleable. As a result, we actually wrap this in a try/catch block
                # and more-carefully handle the exceptions if any part of this save / test-reload
                # fails
                self.stats.write("exc_pickle_fail", True)
                pickled_exc = pickle.dumps({
                    'exc_type': str(exc_type),
                    'exc_value': str(exc_value),
                    'exc_traceback': exc_traceback,
                    'pickle_exception': pickle_exception
                })
                pickle.loads(pickled_exc)  # this is just to make sure they can be unpickled
                self.stats.write("exc_info", str(pickled_exc))
        finally:
            store_result = strtobool(os.environ.get('STORE_RESULT', 'True'))
            if result is not None and store_result and not exception:
                output_upload_timestamp_t1 = time.time()
                logger.info("Storing function result - output.pickle - Size: {}"
                            .format(sizeof_fmt(len(pickled_output))))
                self.internal_storage.put_data(self.output_key, pickled_output)
                output_upload_timestamp_t2 = time.time()
                self.stats.write(
                    "output_upload_time",
                    round(
                        output_upload_timestamp_t2 -
                        output_upload_timestamp_t1, 8))
            self.result_queue.put("Finished")
            logger.info("Finished")
Example #5
def create_map_job(config,
                   internal_storage,
                   executor_id,
                   job_id,
                   map_function,
                   iterdata,
                   obj_chunk_size=None,
                   extra_env=None,
                   extra_meta=None,
                   runtime_memory=None,
                   remote_invocation=False,
                   remote_invocation_groups=None,
                   invoke_pool_threads=128,
                   exclude_modules=None,
                   is_cf_cluster=False,
                   execution_timeout=EXECUTION_TIMEOUT,
                   overwrite_invoke_args=None):
    """
    Wrapper to create a map job.  It integrates COS logic to process objects.
    """
    map_job_id = f'M{job_id}'
    data = utils.iterdata_as_list(iterdata)
    map_func = map_function
    map_iterdata = data
    new_invoke_pool_threads = invoke_pool_threads
    new_runtime_memory = runtime_memory

    # Object processing functionality
    parts_per_object = None
    if utils.is_object_processing_function(map_function):
        '''
        If it is an object processing function, create partitions according to chunk_size
        '''
        logger.debug(
            'ExecutorID {} | JobID {} - Calling map on partitions from object storage flow'
            .format(executor_id, job_id))
        arg_data = utils.verify_args(map_function,
                                     data,
                                     object_processing=True)
        map_iterdata, parts_per_object = create_partitions(
            config, arg_data, obj_chunk_size)
        map_func = partition_processor(map_function)
    # ########

    # Remote invocation functionality
    original_total_tasks = len(map_iterdata)
    if original_total_tasks == 1 or is_cf_cluster:
        remote_invocation = False
    if remote_invocation:
        rabbitmq_monitor = "CB_RABBITMQ_MONITOR" in os.environ

        def remote_invoker(input_data):
            pw = pywren.ibm_cf_executor(rabbitmq_monitor=rabbitmq_monitor)
            return pw.map(map_function,
                          input_data,
                          runtime_memory=runtime_memory,
                          invoke_pool_threads=invoke_pool_threads,
                          extra_env=extra_env,
                          extra_meta=extra_meta)

        map_func = remote_invoker
        if remote_invocation_groups:
            map_iterdata = [[iterdata[x:x + remote_invocation_groups]]
                            for x in range(0, original_total_tasks, remote_invocation_groups)]
        else:
            map_iterdata = [iterdata]
        new_invoke_pool_threads = 1
        new_runtime_memory = runtime_memory
    # ########

    job_description = _create_job(config,
                                  internal_storage,
                                  executor_id,
                                  map_job_id,
                                  map_func,
                                  map_iterdata,
                                  extra_env=extra_env,
                                  extra_meta=extra_meta,
                                  runtime_memory=new_runtime_memory,
                                  invoke_pool_threads=new_invoke_pool_threads,
                                  overwrite_invoke_args=overwrite_invoke_args,
                                  exclude_modules=exclude_modules,
                                  original_func_name=map_function.__name__,
                                  remote_invocation=remote_invocation,
                                  original_total_tasks=original_total_tasks,
                                  execution_timeout=execution_timeout)

    return job_description, parts_per_object
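Two details specific to this variant, shown in isolation: map jobs are identified by prefixing the plain job id with 'M', and RabbitMQ monitoring is toggled by the mere presence of the CB_RABBITMQ_MONITOR environment variable, whatever its value. The values below are illustrative:

import os

job_id = '000'                      # illustrative job id
map_job_id = f'M{job_id}'           # -> 'M000', as in the snippet
# Presence check: even CB_RABBITMQ_MONITOR='' would enable monitoring.
rabbitmq_monitor = "CB_RABBITMQ_MONITOR" in os.environ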