Example #1
def _create_job(config,
                internal_storage,
                executor_id,
                job_id,
                func,
                iterdata,
                runtime_meta,
                runtime_memory,
                extra_env,
                include_modules,
                exclude_modules,
                execution_timeout,
                host_job_meta,
                invoke_pool_threads=128):
    """
    :param func: the function to map over the data
    :param iterdata: An iterable of input data
    :param extra_env: Additional environment variables for CF environment. Default None.
    :param extra_meta: Additional metadata to pass to CF. Default None.
    :param remote_invocation: Enable remote invocation. Default False.
    :param invoke_pool_threads: Number of threads to use to invoke.
    :param data_all_as_one: upload the data as a single object. Default True
    :param overwrite_invoke_args: Overwrite other args. Mainly used for testing.
    :param exclude_modules: Explicitly keep these modules from pickled dependencies.
    :return: A list with size `len(iterdata)` of futures for each job
    :rtype:  list of futures.
    """
    ext_env = {} if extra_env is None else extra_env.copy()
    if ext_env:
        ext_env = utils.convert_bools_to_string(ext_env)
        logger.debug("Extra environment vars {}".format(ext_env))

    job = SimpleNamespace()
    job.executor_id = executor_id
    job.job_id = job_id
    job.extra_env = ext_env
    job.execution_timeout = execution_timeout or config['lithops'][
        'execution_timeout']
    job.function_name = func.__name__
    job.total_calls = len(iterdata)

    mode = config['lithops']['mode']

    if mode == SERVERLESS:
        job.invoke_pool_threads = invoke_pool_threads or config['serverless'][
            'invoke_pool_threads']
        job.runtime_memory = runtime_memory or config['serverless'][
            'runtime_memory']
        job.runtime_timeout = config['serverless']['runtime_timeout']
        if job.execution_timeout >= job.runtime_timeout:
            job.execution_timeout = job.runtime_timeout - 5

    elif mode == STANDALONE:
        job.runtime_memory = None
        runtime_timeout = config['standalone']['hard_dismantle_timeout']
        if job.execution_timeout >= runtime_timeout:
            job.execution_timeout = runtime_timeout - 10

    elif mode == LOCALHOST:
        job.runtime_memory = None
        job.runtime_timeout = execution_timeout

    exclude_modules_cfg = config['lithops'].get('exclude_modules', [])
    include_modules_cfg = config['lithops'].get('include_modules', [])

    # Merge the include/exclude module lists from the config and the call
    # arguments; a final value of None for inc_modules means that no
    # explicit include filter is applied.
    exc_modules = set()
    inc_modules = set()
    if exclude_modules_cfg:
        exc_modules.update(exclude_modules_cfg)
    if exclude_modules:
        exc_modules.update(exclude_modules)
    if include_modules_cfg is not None:
        inc_modules.update(include_modules_cfg)
    if include_modules_cfg is None and not include_modules:
        inc_modules = None
    if include_modules is not None and include_modules:
        inc_modules.update(include_modules)
    if include_modules is None:
        inc_modules = None

    logger.debug(
        'ExecutorID {} | JobID {} - Serializing function and data'.format(
            executor_id, job_id))
    job_serialize_start = time.time()
    serializer = SerializeIndependent(runtime_meta['preinstalls'])
    func_and_data_ser, mod_paths = serializer([func] + iterdata, inc_modules,
                                              exc_modules)
    data_strs = func_and_data_ser[1:]
    data_size_bytes = sum(len(x) for x in data_strs)
    module_data = create_module_data(mod_paths)
    func_str = func_and_data_ser[0]
    func_module_str = pickle.dumps(
        {
            'func': func_str,
            'module_data': module_data
        }, -1)
    func_module_size_bytes = len(func_module_str)
    total_size = utils.sizeof_fmt(data_size_bytes + func_module_size_bytes)
    host_job_meta['host_job_serialize_time'] = round(
        time.time() - job_serialize_start, 6)

    host_job_meta['data_size_bytes'] = data_size_bytes
    host_job_meta['func_module_size_bytes'] = func_module_size_bytes

    if 'data_limit' in config['lithops']:
        data_limit = config['lithops']['data_limit']
    else:
        data_limit = MAX_AGG_DATA_SIZE

    if data_limit and data_size_bytes > data_limit * 1024**2:
        log_msg = (
            'ExecutorID {} | JobID {} - Total data exceeded maximum size '
            'of {}'.format(executor_id, job_id,
                           utils.sizeof_fmt(data_limit * 1024**2)))
        raise Exception(log_msg)

    logger.info('ExecutorID {} | JobID {} - Uploading function and data '
                '- Total: {}'.format(executor_id, job_id, total_size))

    # Upload data
    data_key = create_agg_data_key(JOBS_PREFIX, executor_id, job_id)
    job.data_key = data_key
    data_bytes, data_ranges = utils.agg_data(data_strs)
    job.data_ranges = data_ranges
    data_upload_start = time.time()
    internal_storage.put_data(data_key, data_bytes)
    data_upload_end = time.time()

    host_job_meta['host_data_upload_time'] = round(
        data_upload_end - data_upload_start, 6)
    func_upload_start = time.time()

    # Upload function and modules
    if config[mode].get('customized_runtime'):
        # Prepare function and modules locally to store in the runtime image later
        function_file = func.__code__.co_filename
        with open(function_file, 'rb') as f:
            function_hash = hashlib.md5(f.read()).hexdigest()[:16]
        mod_hash = hashlib.md5(repr(
            sorted(mod_paths)).encode('utf-8')).hexdigest()[:16]

        uuid = f'{function_hash}{mod_hash}'
        func_key = create_func_key(JOBS_PREFIX, uuid, "")

        _store_func_and_modules(func_key, func_str, module_data)

        job.ext_runtime_uuid = uuid
    else:
        func_key = create_func_key(JOBS_PREFIX, executor_id, job_id)
        internal_storage.put_func(func_key, func_module_str)

    job.func_key = func_key
    func_upload_end = time.time()

    host_job_meta['host_func_upload_time'] = round(
        func_upload_end - func_upload_start, 6)

    host_job_meta['host_job_created_time'] = round(
        time.time() - host_job_meta['host_job_create_tstamp'], 6)

    job.metadata = host_job_meta

    return job
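
# A minimal usage sketch for the function above (not part of the original
# source). It assumes a FunctionExecutor-like caller that already holds
# 'config', 'internal_storage', 'executor_id' and 'runtime_meta'; the names
# 'executor', 'create_map_job_sketch' and the job id 'M000' are hypothetical.
import time

def create_map_job_sketch(executor, map_function, iterdata):
    host_job_meta = {'host_job_create_tstamp': time.time()}
    return _create_job(config=executor.config,
                       internal_storage=executor.internal_storage,
                       executor_id=executor.executor_id,
                       job_id='M000',
                       func=map_function,
                       iterdata=list(iterdata),
                       runtime_meta=executor.runtime_meta,
                       runtime_memory=None,
                       extra_env=None,
                       include_modules=[],
                       exclude_modules=[],
                       execution_timeout=None,
                       host_job_meta=host_job_meta)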
Example #2
def _create_job(config,
                internal_storage,
                executor_id,
                job_id,
                func,
                data,
                runtime_meta,
                runtime_memory=None,
                extra_env=None,
                invoke_pool_threads=128,
                include_modules=[],
                exclude_modules=[],
                execution_timeout=None,
                host_job_meta=None):
    """
    :param func: the function to map over the data
    :param iterdata: An iterable of input data
    :param extra_env: Additional environment variables for CF environment. Default None.
    :param extra_meta: Additional metadata to pass to CF. Default None.
    :param remote_invocation: Enable remote invocation. Default False.
    :param invoke_pool_threads: Number of threads to use to invoke.
    :param data_all_as_one: upload the data as a single object. Default True
    :param overwrite_invoke_args: Overwrite other args. Mainly used for testing.
    :param exclude_modules: Explicitly keep these modules from pickled dependencies.
    :return: A list with size `len(iterdata)` of futures for each job
    :rtype:  list of futures.
    """
    log_level = logger.getEffectiveLevel() != logging.WARNING

    runtime_name = config['lithops']['runtime']
    if runtime_memory is None:
        runtime_memory = config['lithops']['runtime_memory']

    ext_env = {} if extra_env is None else extra_env.copy()
    if ext_env:
        ext_env = utils.convert_bools_to_string(ext_env)
        logger.debug("Extra environment vars {}".format(ext_env))

    if not data:
        return []

    if execution_timeout is None:
        execution_timeout = config['lithops']['runtime_timeout'] - 5

    job_description = {}
    job_description['runtime_name'] = runtime_name
    job_description['runtime_memory'] = runtime_memory
    job_description['execution_timeout'] = execution_timeout
    job_description['function_name'] = func.__name__
    job_description['extra_env'] = ext_env
    job_description['total_calls'] = len(data)
    job_description['invoke_pool_threads'] = invoke_pool_threads
    job_description['executor_id'] = executor_id
    job_description['job_id'] = job_id

    exclude_modules_cfg = config['lithops'].get('exclude_modules', [])
    include_modules_cfg = config['lithops'].get('include_modules', [])

    exc_modules = set()
    inc_modules = set()
    if exclude_modules_cfg:
        exc_modules.update(exclude_modules_cfg)
    if exclude_modules:
        exc_modules.update(exclude_modules)
    if include_modules_cfg is not None:
        inc_modules.update(include_modules_cfg)
    if include_modules_cfg is None and not include_modules:
        inc_modules = None
    if include_modules is not None and include_modules:
        inc_modules.update(include_modules)
    if include_modules is None:
        inc_modules = None

    logger.debug(
        'ExecutorID {} | JobID {} - Serializing function and data'.format(
            executor_id, job_id))
    job_serialize_start = time.time()
    serializer = SerializeIndependent(runtime_meta['preinstalls'])
    func_and_data_ser, mod_paths = serializer([func] + data, inc_modules,
                                              exc_modules)
    data_strs = func_and_data_ser[1:]
    data_size_bytes = sum(len(x) for x in data_strs)
    module_data = create_module_data(mod_paths)
    func_str = func_and_data_ser[0]
    func_module_str = pickle.dumps(
        {
            'func': func_str,
            'module_data': module_data
        }, -1)
    func_module_size_bytes = len(func_module_str)
    total_size = utils.sizeof_fmt(data_size_bytes + func_module_size_bytes)
    host_job_meta['host_job_serialize_time'] = round(
        time.time() - job_serialize_start, 6)

    host_job_meta['data_size_bytes'] = data_size_bytes
    host_job_meta['func_module_size_bytes'] = func_module_size_bytes

    if 'data_limit' in config['lithops']:
        data_limit = config['lithops']['data_limit']
    else:
        data_limit = MAX_AGG_DATA_SIZE

    if data_limit and data_size_bytes > data_limit * 1024**2:
        log_msg = (
            'ExecutorID {} | JobID {} - Total data exceeded maximum size '
            'of {}'.format(executor_id, job_id,
                           utils.sizeof_fmt(data_limit * 1024**2)))
        raise Exception(log_msg)

    log_msg = ('ExecutorID {} | JobID {} - Uploading function and data '
               '- Total: {}'.format(executor_id, job_id, total_size))
    logger.info(log_msg)
    if not log_level:
        print(log_msg)

    # Upload data
    data_key = create_agg_data_key(JOBS_PREFIX, executor_id, job_id)
    job_description['data_key'] = data_key
    data_bytes, data_ranges = utils.agg_data(data_strs)
    job_description['data_ranges'] = data_ranges
    data_upload_start = time.time()
    internal_storage.put_data(data_key, data_bytes)
    data_upload_end = time.time()

    host_job_meta['host_data_upload_time'] = round(
        data_upload_end - data_upload_start, 6)

    # Upload function and modules
    func_upload_start = time.time()
    func_key = create_func_key(JOBS_PREFIX, executor_id, job_id)
    job_description['func_key'] = func_key
    internal_storage.put_func(func_key, func_module_str)
    func_upload_end = time.time()

    host_job_meta['host_func_upload_time'] = round(
        func_upload_end - func_upload_start, 6)

    host_job_meta['host_job_created_time'] = round(
        time.time() - host_job_meta['host_job_create_tstamp'], 6)

    job_description['metadata'] = host_job_meta

    return job_description
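
# A standalone sketch of the include/exclude module resolution performed
# above (and in the other variants). 'resolve_module_filters' is a
# hypothetical helper, not part of the original code; a result of None for
# the include set means that no explicit include filter is applied.
def resolve_module_filters(include_modules, exclude_modules,
                           include_modules_cfg, exclude_modules_cfg):
    exc_modules = set(exclude_modules_cfg or []) | set(exclude_modules or [])
    if include_modules is None or (include_modules_cfg is None and not include_modules):
        inc_modules = None
    else:
        inc_modules = set(include_modules_cfg or []) | set(include_modules or [])
    return inc_modules, exc_modules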
Example #3
def _create_job(config,
                internal_storage,
                executor_id,
                job_id,
                func,
                iterdata,
                runtime_meta,
                runtime_memory,
                extra_env,
                include_modules,
                exclude_modules,
                execution_timeout,
                host_job_meta,
                chunksize=None,
                worker_processes=None,
                invoke_pool_threads=16):
    """
    Creates a new Job
    """
    ext_env = {} if extra_env is None else extra_env.copy()
    if ext_env:
        ext_env = utils.convert_bools_to_string(ext_env)
        logger.debug("Extra environment vars {}".format(ext_env))

    job = SimpleNamespace()
    job.chunksize = chunksize or config['lithops']['chunksize']
    job.worker_processes = worker_processes or config['lithops'][
        'worker_processes']
    job.execution_timeout = execution_timeout or config['lithops'][
        'execution_timeout']
    job.executor_id = executor_id
    job.job_id = job_id
    job.job_key = create_job_key(job.executor_id, job.job_id)
    job.extra_env = ext_env
    job.function_name = func.__name__
    job.total_calls = len(iterdata)

    mode = config['lithops']['mode']

    if mode == SERVERLESS:
        job.invoke_pool_threads = invoke_pool_threads or config['serverless'][
            'invoke_pool_threads']
        job.runtime_memory = runtime_memory or config['serverless'][
            'runtime_memory']
        job.runtime_timeout = config['serverless']['runtime_timeout']
        if job.execution_timeout >= job.runtime_timeout:
            job.execution_timeout = job.runtime_timeout - 5

    elif mode == STANDALONE:
        job.runtime_memory = None
        runtime_timeout = config['standalone']['hard_dismantle_timeout']
        if job.execution_timeout >= runtime_timeout:
            job.execution_timeout = runtime_timeout - 10

    elif mode == LOCALHOST:
        job.runtime_memory = None
        job.runtime_timeout = execution_timeout

    exclude_modules_cfg = config['lithops'].get('exclude_modules', [])
    include_modules_cfg = config['lithops'].get('include_modules', [])

    exc_modules = set()
    inc_modules = set()
    if exclude_modules_cfg:
        exc_modules.update(exclude_modules_cfg)
    if exclude_modules:
        exc_modules.update(exclude_modules)
    if include_modules_cfg is not None:
        inc_modules.update(include_modules_cfg)
    if include_modules_cfg is None and not include_modules:
        inc_modules = None
    if include_modules is not None and include_modules:
        inc_modules.update(include_modules)
    if include_modules is None:
        inc_modules = None

    logger.debug(
        'ExecutorID {} | JobID {} - Serializing function and data'.format(
            executor_id, job_id))
    job_serialize_start = time.time()
    serializer = SerializeIndependent(runtime_meta['preinstalls'])
    func_and_data_ser, mod_paths = serializer([func] + iterdata, inc_modules,
                                              exc_modules)
    data_strs = func_and_data_ser[1:]
    data_size_bytes = sum(len(x) for x in data_strs)
    module_data = create_module_data(mod_paths)
    func_str = func_and_data_ser[0]
    func_module_str = pickle.dumps(
        {
            'func': func_str,
            'module_data': module_data
        }, -1)
    func_module_size_bytes = len(func_module_str)
    total_size = utils.sizeof_fmt(data_size_bytes + func_module_size_bytes)
    host_job_meta['host_job_serialize_time'] = round(
        time.time() - job_serialize_start, 6)

    host_job_meta['data_size_bytes'] = data_size_bytes
    host_job_meta['func_module_size_bytes'] = func_module_size_bytes

    if 'data_limit' in config['lithops']:
        data_limit = config['lithops']['data_limit']
    else:
        data_limit = MAX_AGG_DATA_SIZE

    if data_limit and data_size_bytes > data_limit * 1024**2:
        log_msg = (
            'ExecutorID {} | JobID {} - Total data exceeded maximum size '
            'of {}'.format(executor_id, job_id,
                           utils.sizeof_fmt(data_limit * 1024**2)))
        raise Exception(log_msg)

    logger.info('ExecutorID {} | JobID {} - Uploading function and data '
                '- Total: {}'.format(executor_id, job_id, total_size))

    # Upload data
    data_key = create_agg_data_key(JOBS_PREFIX, executor_id, job_id)
    job.data_key = data_key
    data_bytes, data_byte_ranges = utils.agg_data(data_strs)
    job.data_byte_ranges = data_byte_ranges
    data_upload_start = time.time()
    internal_storage.put_data(data_key, data_bytes)
    data_upload_end = time.time()

    host_job_meta['host_data_upload_time'] = round(
        data_upload_end - data_upload_start, 6)
    func_upload_start = time.time()

    # Upload function and modules
    if config[mode].get('customized_runtime'):
        # Prepare function and modules locally to store in the runtime image later
        function_file = func.__code__.co_filename
        with open(function_file, 'rb') as f:
            function_hash = hashlib.md5(f.read()).hexdigest()[:16]
        mod_hash = hashlib.md5(repr(
            sorted(mod_paths)).encode('utf-8')).hexdigest()[:16]

        uuid = '{}{}'.format(function_hash, mod_hash)
        func_key = create_func_key(JOBS_PREFIX, uuid, "")

        _store_func_and_modules(func_key, func_str, module_data)

        job.ext_runtime_uuid = uuid
    else:
        func_key = create_func_key(JOBS_PREFIX, executor_id, job_id)
        internal_storage.put_func(func_key, func_module_str)

    job.func_key = func_key
    func_upload_end = time.time()

    host_job_meta['host_func_upload_time'] = round(
        func_upload_end - func_upload_start, 6)

    host_job_meta['host_job_created_time'] = round(
        time.time() - host_job_meta['host_job_create_tstamp'], 6)

    job.metadata = host_job_meta

    return job
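
# A minimal sketch of the data-aggregation idea behind utils.agg_data as used
# above: the per-call serialized payloads are concatenated into a single
# object and a byte range is recorded for each call, so every worker can
# fetch only its slice. This is an assumption about the helper's behavior
# (inclusive (start, end) ranges); the real implementation may differ.
def agg_data_sketch(data_strs):
    ranges = []
    pos = 0
    for payload in data_strs:
        ranges.append((pos, pos + len(payload) - 1))
        pos += len(payload)
    return b''.join(data_strs), ranges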
Example #4
def _create_job(config, internal_storage, executor_id, job_id, func,
                iterdata,  runtime_meta, runtime_memory, extra_env,
                include_modules, exclude_modules, execution_timeout,
                host_job_meta, chunksize=None):
    """
    Creates a new Job
    """
    global FUNCTION_CACHE

    ext_env = {} if extra_env is None else extra_env.copy()
    if ext_env:
        ext_env = utils.convert_bools_to_string(ext_env)
        logger.debug("Extra environment vars {}".format(ext_env))

    mode = config['lithops']['mode']
    backend = config['lithops']['backend']

    job = SimpleNamespace()
    job.chunksize = chunksize or config['lithops']['chunksize']
    job.worker_processes = config[backend]['worker_processes']
    job.execution_timeout = execution_timeout or config['lithops']['execution_timeout']
    job.executor_id = executor_id
    job.job_id = job_id
    job.job_key = create_job_key(job.executor_id, job.job_id)
    job.extra_env = ext_env
    job.function_name = func.__name__ if inspect.isfunction(func) or inspect.ismethod(func) else type(func).__name__
    job.total_calls = len(iterdata)

    if mode == SERVERLESS:
        job.runtime_memory = runtime_memory or config[backend]['runtime_memory']
        job.runtime_timeout = config[backend]['runtime_timeout']
        if job.execution_timeout >= job.runtime_timeout:
            job.execution_timeout = job.runtime_timeout - 5

    elif mode == STANDALONE:
        job.runtime_memory = None
        runtime_timeout = config[STANDALONE]['hard_dismantle_timeout']
        if job.execution_timeout >= runtime_timeout:
            job.execution_timeout = runtime_timeout - 10

    elif mode == LOCALHOST:
        job.runtime_memory = None
        job.runtime_timeout = None

    exclude_modules_cfg = config['lithops'].get('exclude_modules', [])
    include_modules_cfg = config['lithops'].get('include_modules', [])

    exc_modules = set()
    inc_modules = set()
    if exclude_modules_cfg:
        exc_modules.update(exclude_modules_cfg)
    if exclude_modules:
        exc_modules.update(exclude_modules)
    if include_modules_cfg is not None:
        inc_modules.update(include_modules_cfg)
    if include_modules_cfg is None and not include_modules:
        inc_modules = None
    if include_modules is not None and include_modules:
        inc_modules.update(include_modules)
    if include_modules is None:
        inc_modules = None

    logger.debug('ExecutorID {} | JobID {} - Serializing function and data'.format(executor_id, job_id))
    job_serialize_start = time.time()
    serializer = SerializeIndependent(runtime_meta['preinstalls'])
    func_and_data_ser, mod_paths = serializer([func] + iterdata, inc_modules, exc_modules)
    data_strs = func_and_data_ser[1:]
    data_size_bytes = sum(len(x) for x in data_strs)
    module_data = create_module_data(mod_paths)
    func_str = func_and_data_ser[0]
    func_module_str = pickle.dumps({'func': func_str, 'module_data': module_data}, -1)
    func_module_size_bytes = len(func_module_str)

    host_job_meta['host_job_serialize_time'] = round(time.time()-job_serialize_start, 6)
    host_job_meta['data_size_bytes'] = data_size_bytes
    host_job_meta['func_module_size_bytes'] = func_module_size_bytes

    # Check data limit
    if 'data_limit' in config['lithops']:
        data_limit = config['lithops']['data_limit']
    else:
        data_limit = MAX_AGG_DATA_SIZE
    if data_limit and data_size_bytes > data_limit*1024**2:
        log_msg = ('ExecutorID {} | JobID {} - Total data exceeded maximum size '
                   'of {}'.format(executor_id, job_id, utils.sizeof_fmt(data_limit*1024**2)))
        raise Exception(log_msg)

    # Upload function and data
    upload_function = not config['lithops'].get('customized_runtime', False)
    upload_data = not (len(str(data_strs[0])) * job.chunksize < 8*1024 and backend in FAAS_BACKENDS)

    # Upload function and modules
    if upload_function:
        function_hash = hashlib.md5(func_module_str).hexdigest()
        job.func_key = create_func_key(executor_id, function_hash)
        if job.func_key not in FUNCTION_CACHE:
            logger.debug('ExecutorID {} | JobID {} - Uploading function and modules '
                         'to the storage backend'.format(executor_id, job_id))
            func_upload_start = time.time()
            internal_storage.put_func(job.func_key, func_module_str)
            func_upload_end = time.time()
            host_job_meta['host_func_upload_time'] = round(func_upload_end - func_upload_start, 6)
            FUNCTION_CACHE.add(job.func_key)
        else:
            logger.debug('ExecutorID {} | JobID {} - Function and modules '
                         'found in local cache'.format(executor_id, job_id))
            host_job_meta['host_func_upload_time'] = 0

    else:
        # Prepare function and modules locally to store in the runtime image later
        function_file = func.__code__.co_filename
        with open(function_file, 'rb') as f:
            function_hash = hashlib.md5(f.read()).hexdigest()[:16]
        mod_hash = hashlib.md5(repr(sorted(mod_paths)).encode('utf-8')).hexdigest()[:16]
        job.func_key = func_key_suffix  # func_key_suffix is expected to be a module-level constant (not shown here)
        job.ext_runtime_uuid = '{}{}'.format(function_hash, mod_hash)
        job.local_tmp_dir = os.path.join(CUSTOM_RUNTIME_DIR, job.ext_runtime_uuid)
        _store_func_and_modules(job.local_tmp_dir, job.func_key, func_str, module_data)
        host_job_meta['host_func_upload_time'] = 0

    # upload data
    if upload_data:
        # Upload iterdata to COS only if a single element is greater than 8KB
        logger.debug('ExecutorID {} | JobID {} - Uploading data to the storage backend'
                     .format(executor_id, job_id))
        # pass_iteradata through an object storage file
        data_key = create_data_key(executor_id, job_id)
        job.data_key = data_key
        data_bytes, data_byte_ranges = utils.agg_data(data_strs)
        job.data_byte_ranges = data_byte_ranges
        data_upload_start = time.time()
        internal_storage.put_data(data_key, data_bytes)
        data_upload_end = time.time()
        host_job_meta['host_data_upload_time'] = round(data_upload_end-data_upload_start, 6)

    else:
        # pass iteradata as part of the invocation payload
        logger.debug('ExecutorID {} | JobID {} - Data per activation is < '
                     '{}. Passing data through invocation payload'
                     .format(executor_id, job_id, utils.sizeof_fmt(8*1024)))
        job.data_key = None
        job.data_byte_ranges = None
        job.data_byte_strs = data_strs
        host_job_meta['host_data_upload_time'] = 0

    host_job_meta['host_job_created_time'] = round(time.time() - host_job_meta['host_job_create_tstamp'], 6)

    job.metadata = host_job_meta

    return job
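
# A hedged sketch of the per-process function cache used in Example #4: the
# pickled function+modules blob is keyed by its md5 hash, so repeated calls
# with the same function skip the upload. 'upload_function_once', 'storage'
# and FUNCTION_CACHE_SKETCH are hypothetical names for illustration only.
import hashlib

FUNCTION_CACHE_SKETCH = set()

def upload_function_once(storage, executor_id, func_module_str):
    func_key = '/'.join([executor_id, hashlib.md5(func_module_str).hexdigest()])
    if func_key not in FUNCTION_CACHE_SKETCH:
        storage.put_func(func_key, func_module_str)  # first upload for this hash
        FUNCTION_CACHE_SKETCH.add(func_key)
    return func_key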