def update_runtime(name, config=None):
    config = default_config(config)
    storage_config = extract_storage_config(config)
    internal_storage = InternalStorage(storage_config)
    compute_config = extract_compute_config(config)
    compute_handler = Compute(compute_config)

    timeout = config['pywren']['runtime_timeout']
    logger.info('Updating runtime: {}'.format(name))

    if name != 'all':
        runtime_meta = compute_handler.generate_runtime_meta(name)
    else:
        runtime_meta = None

    runtimes = compute_handler.list_runtimes(name)

    for runtime in runtimes:
        compute_handler.create_runtime(runtime[0], runtime[1], timeout)
        if runtime_meta:
            try:
                runtime_key = compute_handler.get_runtime_key(runtime[0], runtime[1])
                internal_storage.put_runtime_meta(runtime_key, runtime_meta)
            except Exception:
                raise Exception("Unable to upload 'preinstalled modules' file into {}"
                                .format(internal_storage.backend))
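# Usage sketch (not part of the module above). A minimal driver for the helper,
# assuming it is exported from pywren_ibm_cloud.runtime as in the upstream
# package layout; the runtime name is a placeholder.
from pywren_ibm_cloud.runtime import update_runtime

update_runtime('ibmfunctions/pywren:3.7')   # refresh a single runtime and its metadata
update_runtime('all')                       # redeploy every runtime; metadata extraction is skipped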
def __init__(self, config, invoke_type, log_level):
    self.config = config
    self.invoke_type = invoke_type
    self.log_level = log_level

    storage_config = extract_storage_config(self.config)
    self.internal_storage = InternalStorage(storage_config)
    compute_config = extract_compute_config(self.config)

    self.remote_invoker = self.config['pywren'].get('remote_invoker', False)
    self.rabbitmq_monitor = self.config['pywren'].get('rabbitmq_monitor', False)
    if self.rabbitmq_monitor:
        self.rabbit_amqp_url = self.config['rabbitmq'].get('amqp_url')

    self.workers = self.config['pywren'].get('workers')
    logger.debug('Total workers: {}'.format(self.workers))

    self.compute_handlers = []
    cb = compute_config['backend']
    regions = compute_config[cb].get('region')
    if regions and isinstance(regions, list):
        for region in regions:
            new_compute_config = compute_config.copy()
            new_compute_config[cb]['region'] = region
            self.compute_handlers.append(Compute(new_compute_config))
    else:
        self.compute_handlers.append(Compute(compute_config))

    self.token_bucket_q = Queue()
    self.pending_calls_q = Queue()
def __init__(self, pywren_config): self.config = pywren_config self.rabbitmq_monitor = self.config['pywren'].get('rabbitmq_monitor', False) self.store_status = strtobool(os.environ.get('STORE_STATUS', 'True')) storage_config = extract_storage_config(self.config) self.internal_storage = InternalStorage(storage_config) self.response = {'exception': False}
def __init__(self, config=None, runtime=None, runtime_memory=None, compute_backend=None, compute_backend_region=None, log_level=None, rabbitmq_monitor=False): """ Initialize and return a ServerlessExecutor class. :param config: Settings passed in here will override those in config file. Default None. :param runtime: Runtime name to use. Default None. :param runtime_memory: memory to use in the runtime :param log_level: log level to use during the execution :param rabbitmq_monitor: use rabbitmq as monitoring system :return `ServerlessExecutor` object. """ self.start_time = time.time() self._state = ExecutorState.new self.config = default_config(config) self.is_cf_cluster = is_cf_cluster() self.data_cleaner = self.config['pywren']['data_cleaner'] # Overwrite runtime variables if runtime: self.config['pywren']['runtime'] = runtime if runtime_memory: self.config['pywren']['runtime_memory'] = int(runtime_memory) if compute_backend: self.config['pywren']['compute_backend'] = compute_backend if compute_backend_region: self.config['pywren']['compute_backend_region'] = compute_backend_region # Log level Configuration self.log_level = log_level if not self.log_level: if(logger.getEffectiveLevel() != logging.WARNING): self.log_level = logging.getLevelName(logger.getEffectiveLevel()) if self.log_level: os.environ["CB_LOG_LEVEL"] = self.log_level if not self.is_cf_cluster: default_logging_config(self.log_level) if 'CB_EXECUTOR_ID' in os.environ: self.executor_id = os.environ['CB_EXECUTOR_ID'] else: self.executor_id = create_executor_id() logger.debug('ServerlessExecutor created with ID: {}'.format(self.executor_id)) # RabbitMQ monitor configuration self.rabbitmq_monitor = rabbitmq_monitor if self.rabbitmq_monitor: if self.config['rabbitmq']['amqp_url']: os.environ["CB_RABBITMQ_MONITOR"] = 'True' else: self.rabbitmq_monitor = False else: self.config['rabbitmq']['amqp_url'] = None storage_config = extract_storage_config(self.config) self.internal_storage = InternalStorage(storage_config) self.invoker = Invoker(self.config, self.executor_id) self.jobs = {}
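# Usage sketch (not part of the class above). Constructing the executor with
# per-run overrides; the import path is an assumption, since only the
# constructor body is shown here.
from pywren_ibm_cloud.executor import ServerlessExecutor

executor = ServerlessExecutor(runtime='python3.7',      # overrides config['pywren']['runtime']
                              runtime_memory=512,       # coerced to int
                              log_level='INFO',         # exported as CB_LOG_LEVEL
                              rabbitmq_monitor=False)   # True requires config['rabbitmq']['amqp_url']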
def delete_runtime(name, config=None): config = default_config(config) storage_config = extract_storage_config(config) internal_storage = InternalStorage(storage_config) compute_config = extract_compute_config(config) compute_handler = Compute(compute_config) runtimes = compute_handler.list_runtimes(name) for runtime in runtimes: compute_handler.delete_runtime(runtime[0], runtime[1]) runtime_key = compute_handler.get_runtime_key(runtime[0], runtime[1]) internal_storage.delete_runtime_meta(runtime_key)
def clean_runtimes(config=None): logger.info('Cleaning all runtimes and cache information') config = default_config(config) storage_config = extract_storage_config(config) internal_storage = InternalStorage(storage_config) compute_config = extract_compute_config(config) compute_handler = Compute(compute_config) # Clean local runtime_meta cache if os.path.exists(CACHE_DIR): shutil.rmtree(CACHE_DIR) # Clean localhost dirs localhost_jobs_path = os.path.join(TEMP, STORAGE_PREFIX_DEFAULT) if os.path.exists(localhost_jobs_path): shutil.rmtree(localhost_jobs_path) localhost_runtimes_path = os.path.join(TEMP, RUNTIMES_PREFIX_DEFAULT) if os.path.exists(localhost_runtimes_path): shutil.rmtree(localhost_runtimes_path) # Clean runtime metadata in the object storage sh = internal_storage.storage_handler runtimes = sh.list_keys(storage_config['bucket'], RUNTIMES_PREFIX_DEFAULT) if runtimes: sh.delete_objects(storage_config['bucket'], runtimes) compute_handler.delete_all_runtimes()
def clean_all(config=None): logger.info('Cleaning all PyWren information') config = default_config(config) storage_config = extract_storage_config(config) internal_storage = InternalStorage(storage_config) compute_config = extract_compute_config(config) compute_handler = Compute(compute_config) # Clean object storage temp dirs sh = internal_storage.storage_handler runtimes = sh.list_keys(storage_config['bucket'], RUNTIMES_PREFIX) if runtimes: sh.delete_objects(storage_config['bucket'], runtimes) compute_handler.delete_all_runtimes() clean_bucket(storage_config['bucket'], JOBS_PREFIX, internal_storage, sleep=1) # Clean local runtime_meta cache if os.path.exists(CACHE_DIR): shutil.rmtree(CACHE_DIR) # Clean localhost temp dirs localhost_jobs_path = os.path.join(TEMP, JOBS_PREFIX) if os.path.exists(localhost_jobs_path): shutil.rmtree(localhost_jobs_path) localhost_runtimes_path = os.path.join(TEMP, RUNTIMES_PREFIX) if os.path.exists(localhost_runtimes_path): shutil.rmtree(localhost_runtimes_path)
def create_runtime(name, memory=None, config=None):
    config = default_config(config)
    storage_config = extract_storage_config(config)
    internal_storage = InternalStorage(storage_config)
    compute_config = extract_compute_config(config)
    compute_handler = Compute(compute_config)

    memory = config['pywren']['runtime_memory'] if not memory else memory
    timeout = config['pywren']['runtime_timeout']
    logger.info('Creating runtime: {}, memory: {}'.format(name, memory))

    runtime_key = compute_handler.get_runtime_key(name, memory)
    runtime_meta = compute_handler.create_runtime(name, memory, timeout=timeout)

    try:
        internal_storage.put_runtime_meta(runtime_key, runtime_meta)
    except Exception:
        raise Exception("Unable to upload 'preinstalled-modules' file into {}"
                        .format(internal_storage.backend))
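# Usage sketch (not part of the module above). Deploying a runtime with an
# explicit memory size; when memory is omitted the helper falls back to
# config['pywren']['runtime_memory']. The import path is assumed to be
# pywren_ibm_cloud.runtime, and the runtime name is a placeholder.
from pywren_ibm_cloud.runtime import create_runtime

create_runtime('ibmfunctions/pywren:3.7', memory=1024)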
def clean_bucket(bucket, prefix, storage_config): """ Wrapper of clean_os_bucket(). Use this method only when storage_config is in JSON format. In any other case, call directly clean_os_bucket() method. """ from pywren_ibm_cloud.storage import InternalStorage internal_storage = InternalStorage(json.loads(storage_config)) # sys.stdout = open(os.devnull, 'w') clean_os_bucket(bucket, prefix, internal_storage)
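# Usage sketch (not part of the module above). clean_bucket() json.loads() its
# storage_config argument, so callers must pass the storage configuration
# serialized as a JSON string rather than as a dict. Bucket name, prefix and
# config keys below are placeholders.
import json

storage_config = {'backend': 'ibm_cos', 'bucket': 'pywren-data'}
clean_bucket('pywren-data', 'pywren.jobs', json.dumps(storage_config))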
def __init__(self, config, num_invokers, log_level): self.config = config self.num_invokers = num_invokers self.log_level = log_level storage_config = extract_storage_config(self.config) self.internal_storage = InternalStorage(storage_config) compute_config = extract_compute_config(self.config) self.remote_invoker = self.config['pywren'].get( 'remote_invoker', False) self.rabbitmq_monitor = self.config['pywren'].get( 'rabbitmq_monitor', False) if self.rabbitmq_monitor: self.rabbit_amqp_url = self.config['rabbitmq'].get('amqp_url') self.num_workers = self.config['pywren'].get('workers') logger.debug('Total workers: {}'.format(self.num_workers)) self.compute_handlers = [] cb = compute_config['backend'] regions = compute_config[cb].get('region') if regions and type(regions) == list: for region in regions: new_compute_config = compute_config.copy() new_compute_config[cb]['region'] = region compute_handler = Compute(new_compute_config) self.compute_handlers.append(compute_handler) else: if cb == 'localhost': global CBH if cb in CBH and CBH[ cb].compute_handler.num_workers != self.num_workers: del CBH[cb] if cb in CBH: logger.info( '{} compute handler already started'.format(cb)) compute_handler = CBH[cb] self.compute_handlers.append(compute_handler) else: logger.info('Starting {} compute handler'.format(cb)) compute_handler = Compute(compute_config) CBH[cb] = compute_handler self.compute_handlers.append(compute_handler) else: compute_handler = Compute(compute_config) self.compute_handlers.append(compute_handler) self.token_bucket_q = Queue() self.pending_calls_q = Queue() self.job_monitor = JobMonitor(self.config, self.internal_storage, self.token_bucket_q)
def clean_runtimes(config=None): logger.info('Cleaning all runtimes and cache information') config = default_config(config) storage_config = extract_storage_config(config) internal_storage = InternalStorage(storage_config) compute_config = extract_compute_config(config) compute_handler = Compute(compute_config) # Clean local runtime_meta cache if os.path.exists(CACHE_DIR): shutil.rmtree(CACHE_DIR) sh = internal_storage.storage_handler runtimes = sh.list_keys(storage_config['bucket'], 'runtime') if runtimes: sh.delete_objects(storage_config['bucket'], runtimes) compute_handler.delete_all_runtimes()
def clean_runtimes(config=None): logger.info('Cleaning all runtimes') config = default_config(config) storage_config = extract_storage_config(config) internal_storage = InternalStorage(storage_config) compute_config = extract_compute_config(config) compute_handler = Compute(compute_config) # Clean local runtime_meta cache cache_dir = os.path.join(os.path.expanduser('~'), '.cloudbutton') if os.path.exists(cache_dir): shutil.rmtree(cache_dir) sh = internal_storage.storage_handler runtimes = sh.list_keys(storage_config['bucket'], 'runtime') if runtimes: sh.delete_objects(storage_config['bucket'], runtimes) compute_handler.delete_all_runtimes()
def run_tests(test_to_run, config=None):
    global CONFIG, STORAGE_CONFIG, STORAGE

    CONFIG = json.load(config) if config else default_config()
    STORAGE_CONFIG = extract_storage_config(CONFIG)
    STORAGE = InternalStorage(STORAGE_CONFIG).storage

    suite = unittest.TestSuite()
    if test_to_run == 'all':
        suite.addTest(unittest.makeSuite(TestPywren))
    else:
        try:
            suite.addTest(TestPywren(test_to_run))
        except ValueError:
            print("unknown test, use: --help")
            sys.exit()

    runner = unittest.TextTestRunner()
    runner.run(suite)
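# Usage sketch (not part of the module above). Programmatic equivalent of
# `python -m pywren_ibm_cloud.tests -t all`; 'test_map' is a hypothetical
# test-method name used only for illustration.
run_tests('all')                                           # run the whole TestPywren suite
run_tests('test_map', config=open('pywren_config.json'))   # single test with a custom config file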
def clean_all(config=None): logger.info('Cleaning all PyWren information') config = default_config(config) storage_config = extract_storage_config(config) internal_storage = InternalStorage(storage_config) compute_config = extract_compute_config(config) compute_handler = Compute(compute_config) # Clean localhost executor temp dirs shutil.rmtree(STORAGE_FOLDER, ignore_errors=True) shutil.rmtree(DOCKER_FOLDER, ignore_errors=True) # Clean object storage temp dirs compute_handler.delete_all_runtimes() storage = internal_storage.storage clean_bucket(storage, storage_config['bucket'], RUNTIMES_PREFIX, sleep=1) clean_bucket(storage, storage_config['bucket'], JOBS_PREFIX, sleep=1) # Clean local pywren cache shutil.rmtree(CACHE_DIR, ignore_errors=True)
def result(self, throw_except=True, internal_storage=None): """ Return the value returned by the call. If the call raised an exception, this method will raise the same exception If the future is cancelled before completing then CancelledError will be raised. :param throw_except: Reraise exception if call raised. Default true. :param internal_storage: Storage handler to poll cloud storage. Default None. :return: Result of the call. :raises CancelledError: If the job is cancelled before completed. :raises TimeoutError: If job is not complete after `timeout` seconds. """ if self._state == CallState.new: raise ValueError("task not yet invoked") if self._state == CallState.success: return self._return_val if self._state == CallState.futures: return self._new_futures if internal_storage is None: internal_storage = InternalStorage( storage_config=self.storage_config) self.status(throw_except, internal_storage) if not self.produce_output: return if self._state == CallState.success: return self._return_val if self._state == CallState.futures: return self._new_futures if self._state == CallState.error: if throw_except: raise FunctionException(self.executor_id, self.job_id, self.activation_id, self._exception) else: return None call_output_time = time.time() call_output = internal_storage.get_call_output(self.executor_id, self.job_id, self.call_id) self.output_query_count += 1 while call_output is None and self.output_query_count < self.GET_RESULT_MAX_RETRIES: time.sleep(self.GET_RESULT_SLEEP_SECS) call_output = internal_storage.get_call_output( self.executor_id, self.job_id, self.call_id) self.output_query_count += 1 if call_output is None: if throw_except: raise Exception( 'Unable to get the output of the function {} - ' 'Activation ID: {}'.format(self.call_id, self.activation_id)) else: self._set_state(CallState.error) return None call_output = pickle.loads(call_output) call_output_time_done = time.time() self._call_output = call_output self.invoke_status[ 'download_output_time'] = call_output_time_done - call_output_time self.invoke_status['output_query_count'] = self.output_query_count self.invoke_status['download_output_timestamp'] = call_output_time_done log_msg = ( 'ExecutorID {} | JobID {} - Got output from Function {} - Activation ' 'ID: {}'.format(self.executor_id, self.job_id, self.call_id, self.activation_id)) logger.debug(log_msg) function_result = call_output['result'] if isinstance(function_result, ResponseFuture): self._new_futures = [function_result] self._set_state(CallState.futures) self.invoke_status['status_done_timestamp'] = self.invoke_status[ 'download_output_timestamp'] del self.invoke_status['download_output_timestamp'] return self._new_futures elif type(function_result ) == list and len(function_result) > 0 and isinstance( function_result[0], ResponseFuture): self._new_futures = function_result self._set_state(CallState.futures) self.invoke_status['status_done_timestamp'] = self.invoke_status[ 'download_output_timestamp'] del self.invoke_status['download_output_timestamp'] return self._new_futures else: self._return_val = function_result self._set_state(CallState.success) return self._return_val
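# Usage sketch (not part of the class above). Typical consumption of a future
# whose result() method is shown above; `future` stands for any ResponseFuture
# returned by an executor call.
value = future.result()                          # re-raises the remote exception, if any
safe_value = future.result(throw_except=False)   # returns None instead of raising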
def function_handler(event): start_time = time.time() log_level = event['log_level'] cloud_logging_config(log_level) logger.debug("Action handler started") response_status = {'exception': False} response_status['host_submit_time'] = event['host_submit_time'] response_status['start_time'] = start_time context_dict = { 'python_version': os.environ.get("PYTHON_VERSION"), } config = event['config'] storage_config = extract_storage_config(config) call_id = event['call_id'] job_id = event['job_id'] executor_id = event['executor_id'] logger.info("Execution ID: {}/{}/{}".format(executor_id, job_id, call_id)) execution_timeout = event['execution_timeout'] logger.debug( "Set function execution timeout to {}s".format(execution_timeout)) status_key = event['status_key'] func_key = event['func_key'] data_key = event['data_key'] data_byte_range = event['data_byte_range'] output_key = event['output_key'] extra_env = event.get('extra_env', {}) response_status['call_id'] = call_id response_status['job_id'] = job_id response_status['executor_id'] = executor_id # response_status['func_key'] = func_key # response_status['data_key'] = data_key # response_status['output_key'] = output_key # response_status['status_key'] = status_key try: if version.__version__ != event['pywren_version']: raise Exception("WRONGVERSION", "PyWren version mismatch", version.__version__, event['pywren_version']) # response_status['free_disk_bytes'] = free_disk_space("/tmp") custom_env = { 'PYWREN_CONFIG': json.dumps(config), 'PYWREN_REMOTE': 'TRUE', 'PYTHONPATH': "{}:{}".format(os.getcwd(), PYWREN_LIBS_PATH), 'PYTHONUNBUFFERED': 'True' } os.environ.update(custom_env) os.environ.update(extra_env) jobrunner_config = { 'pywren_config': config, 'call_id': call_id, 'job_id': job_id, 'executor_id': executor_id, 'func_key': func_key, 'data_key': data_key, 'log_level': log_level, 'data_byte_range': data_byte_range, 'python_module_path': PYTHON_MODULE_PATH, 'output_key': output_key, 'stats_filename': JOBRUNNER_STATS_FILENAME } if os.path.exists(JOBRUNNER_STATS_FILENAME): os.remove(JOBRUNNER_STATS_FILENAME) setup_time = time.time() response_status['setup_time'] = round(setup_time - start_time, 8) result_queue = multiprocessing.Queue() tr = JobRunner(jobrunner_config, result_queue) tr.daemon = True logger.debug('Starting JobRunner process') tr.start() tr.join(execution_timeout) logger.debug('Finished JobRunner process') response_status['exec_time'] = round(time.time() - setup_time, 8) if tr.is_alive(): # If process is still alive after jr.join(job_max_runtime), kill it tr.terminate() msg = ('Jobrunner process exceeded maximum time of {} ' 'seconds and was killed'.format(execution_timeout)) raise Exception('OUTATIME', msg) if result_queue.empty(): # Only 1 message is returned by jobrunner when it finishes. # If no message, this means that the jobrunner process was killed. # 99% of times the jobrunner is killed due an OOM, so we assume here an OOM. 
msg = 'Jobrunner process exceeded maximum memory and was killed' raise Exception('OUTOFMEMORY', msg) # print(subprocess.check_output("find {}".format(PYTHON_MODULE_PATH), shell=True)) # print(subprocess.check_output("find {}".format(os.getcwd()), shell=True)) if os.path.exists(JOBRUNNER_STATS_FILENAME): with open(JOBRUNNER_STATS_FILENAME, 'r') as fid: for l in fid.readlines(): key, value = l.strip().split(" ", 1) try: response_status[key] = float(value) except Exception: response_status[key] = value if key in [ 'exception', 'exc_pickle_fail', 'result', 'new_futures' ]: response_status[key] = eval(value) # response_status['server_info'] = get_server_info() response_status.update(context_dict) response_status['end_time'] = time.time() except Exception: # internal runtime exceptions print('----------------------- EXCEPTION !-----------------------', flush=True) traceback.print_exc(file=sys.stdout) print('----------------------------------------------------------', flush=True) response_status['end_time'] = time.time() response_status['exception'] = True pickled_exc = pickle.dumps(sys.exc_info()) pickle.loads( pickled_exc) # this is just to make sure they can be unpickled response_status['exc_info'] = str(pickled_exc) finally: store_status = strtobool(os.environ.get('STORE_STATUS', 'True')) dmpd_response_status = json.dumps(response_status) drs = sizeof_fmt(len(dmpd_response_status)) rabbitmq_monitor = config['pywren'].get('rabbitmq_monitor', False) if rabbitmq_monitor and store_status: rabbit_amqp_url = config['rabbitmq'].get('amqp_url') status_sent = False output_query_count = 0 params = pika.URLParameters(rabbit_amqp_url) queue = '{}-{}'.format(executor_id, job_id) while not status_sent and output_query_count < 5: output_query_count = output_query_count + 1 try: connection = pika.BlockingConnection(params) channel = connection.channel() channel.queue_declare(queue=queue, auto_delete=True) channel.basic_publish(exchange='', routing_key=queue, body=dmpd_response_status) connection.close() logger.info( "Execution status sent to rabbitmq - Size: {}".format( drs)) status_sent = True except Exception as e: logger.error("Unable to send status to rabbitmq") logger.error(str(e)) logger.info('Retrying to send status to rabbitmq...') time.sleep(0.2) if store_status: internal_storage = InternalStorage(storage_config) logger.info( "Storing execution stats - status.json - Size: {}".format(drs)) internal_storage.put_data(status_key, dmpd_response_status)
import urllib.request
from pywren_ibm_cloud.storage import InternalStorage
from pywren_ibm_cloud.config import default_config, extract_storage_config
from multiprocessing.pool import ThreadPool
import logging

# logging.basicConfig(level=logging.DEBUG)

parser = argparse.ArgumentParser(description="test all PyWren's functionality",
                                 usage='python -m pywren_ibm_cloud.tests [-c CONFIG] [-t TESTNAME]')
parser.add_argument('-c', '--config', type=argparse.FileType('r'), metavar='', default=None,
                    help="use json config file")
parser.add_argument('-t', '--test', metavar='', default='all',
                    help='run a specific test, type "-t help" for tests list')
args = parser.parse_args()

CONFIG = default_config()
STORAGE_CONFIG = extract_storage_config(CONFIG)
STORAGE = InternalStorage(STORAGE_CONFIG).storage_handler

PREFIX = '__pywren.test'

TEST_FILES_URLS = ["http://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/vocab.enron.txt",
                   "http://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/vocab.kos.txt",
                   "http://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/vocab.nips.txt",
                   "http://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/vocab.nytimes.txt",
                   "http://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/vocab.pubmed.txt"]


def initTests():
    print('Uploading test files...')

    def up(param):
        i, url = param
        content = urllib.request.urlopen(url).read()
        STORAGE.put_object(bucket_name=STORAGE_CONFIG['bucket'],
class JobRunner(Process): def __init__(self, tr_config, result_queue): super().__init__() start_time = time.time() self.config = tr_config log_level = self.config['log_level'] self.result_queue = result_queue cloud_logging_config(log_level) self.stats = stats(self.config['stats_filename']) self.stats.write('jobrunner_start', start_time) cb_config = json.loads(os.environ.get('CB_CONFIG')) self.storage_config = extract_storage_config(cb_config) if 'SHOW_MEMORY_USAGE' in os.environ: self.show_memory = eval(os.environ['SHOW_MEMORY_USAGE']) else: self.show_memory = False self.func_key = self.config['func_key'] self.data_key = self.config['data_key'] self.data_byte_range = self.config['data_byte_range'] self.output_key = self.config['output_key'] def _get_function_and_modules(self): """ Gets and unpickles function and modules from storage """ logger.debug("Getting function and modules") func_download_time_t1 = time.time() func_obj = self.internal_storage.get_func(self.func_key) loaded_func_all = pickle.loads(func_obj) func_download_time_t2 = time.time() self.stats.write('func_download_time', round(func_download_time_t2-func_download_time_t1, 8)) logger.debug("Finished getting Function and modules") return loaded_func_all def _save_modules(self, module_data): """ Save modules, before we unpickle actual function """ logger.debug("Writing Function dependencies to local disk") PYTHON_MODULE_PATH = self.config['python_module_path'] shutil.rmtree(PYTHON_MODULE_PATH, True) # delete old modules os.mkdir(PYTHON_MODULE_PATH) sys.path.append(PYTHON_MODULE_PATH) for m_filename, m_data in module_data.items(): m_path = os.path.dirname(m_filename) if len(m_path) > 0 and m_path[0] == "/": m_path = m_path[1:] to_make = os.path.join(PYTHON_MODULE_PATH, m_path) try: os.makedirs(to_make) except OSError as e: if e.errno == 17: pass else: raise e full_filename = os.path.join(to_make, os.path.basename(m_filename)) with open(full_filename, 'wb') as fid: fid.write(b64str_to_bytes(m_data)) #logger.info("Finished writing {} module files".format(len(loaded_func_all['module_data']))) #logger.debug(subprocess.check_output("find {}".format(PYTHON_MODULE_PATH), shell=True)) #logger.debug(subprocess.check_output("find {}".format(os.getcwd()), shell=True)) logger.debug("Finished writing Function dependencies") def _unpickle_function(self, pickled_func): """ Unpickle function; it will expect modules to be there """ logger.debug("Unpickle Function") loaded_func = pickle.loads(pickled_func) logger.debug("Finished Function unpickle") return loaded_func def _load_data(self): extra_get_args = {} if self.data_byte_range is not None: range_str = 'bytes={}-{}'.format(*self.data_byte_range) extra_get_args['Range'] = range_str logger.debug("Getting function data") data_download_time_t1 = time.time() data_obj = self.internal_storage.get_data(self.data_key, extra_get_args=extra_get_args) logger.debug("Finished getting Function data") logger.debug("Unpickle Function data") loaded_data = pickle.loads(data_obj) logger.debug("Finished unpickle Function data") data_download_time_t2 = time.time() self.stats.write('data_download_time', round(data_download_time_t2-data_download_time_t1, 8)) return loaded_data def _create_storage_clients(self, function, data): # Verify storage parameters - Create clients func_sig = inspect.signature(function) if 'ibm_cos' in func_sig.parameters: ibm_boto3_client = ibm_cos_backend(self.storage_config['ibm_cos']).get_client() data['ibm_cos'] = ibm_boto3_client if 'internal_storage' in func_sig.parameters: 
data['internal_storage'] = self.internal_storage return data def run(self): """ Runs the function """ logger.info("Started") # initial output file in case job fails result = None exception = False try: self.internal_storage = InternalStorage(self.storage_config) self.internal_storage.tmp_obj_prefix = self.output_key.rsplit('/', 1)[0] loaded_func_all = self._get_function_and_modules() self._save_modules(loaded_func_all['module_data']) function = self._unpickle_function(loaded_func_all['func']) data = self._load_data() data = self._create_storage_clients(function, data) if self.show_memory: logger.debug("Memory usage before call the function: {}".format(get_current_memory_usage())) logger.info("Function: Going to execute '{}()'".format(str(function.__name__))) print('---------------------- FUNCTION LOG ----------------------', flush=True) func_exec_time_t1 = time.time() result = function(**data) func_exec_time_t2 = time.time() print('----------------------------------------------------------', flush=True) logger.info("Function: Success execution") if self.show_memory: logger.debug("Memory usage after call the function: {}".format(get_current_memory_usage())) self.stats.write('function_exec_time', round(func_exec_time_t2-func_exec_time_t1, 8)) # Check for new futures if result is not None: self.stats.write("result", True) if isinstance(result, ResponseFuture): callgroup_id = result.callgroup_id self.stats.write('new_futures', '{}/{}'.format(callgroup_id, 1)) elif type(result) == list and len(result) > 0 and isinstance(result[0], ResponseFuture): callgroup_id = result[0].callgroup_id self.stats.write('new_futures', '{}/{}'.format(callgroup_id, len(result))) else: self.stats.write('new_futures', '{}/{}'.format(None, 0)) logger.debug("Pickling result") output_dict = {'result': result} pickled_output = pickle.dumps(output_dict) if self.show_memory: logger.debug("Memory usage after output serialization: {}".format(get_current_memory_usage())) else: logger.debug("No result to store") self.stats.write("result", False) except Exception as e: exception = True self.stats.write("exception", True) print('----------------------- EXCEPTION !-----------------------') logger.error("There was an exception: {}".format(str(e))) print('----------------------------------------------------------', flush=True) if self.show_memory: logger.debug("Memory usage after call the function: {}".format(get_current_memory_usage())) try: logger.debug("Pickling exception") pickled_exc = pickle.dumps(sys.exc_info()) pickle.loads(pickled_exc) # this is just to make sure they can be unpickled self.stats.write("exc_info", str(pickled_exc)) except Exception as pickle_exception: # Shockingly often, modules like subprocess don't properly # call the base Exception.__init__, which results in them # being unpickleable. 
As a result, we actually wrap this in a try/catch block # and more-carefully handle the exceptions if any part of this save / test-reload # fails logger.debug("Failed pickling exception: {}".format(str(pickle_exception))) self.stats.write("exc_pickle_fail", True) exc_type, exc_value, exc_traceback = sys.exc_info() pickled_exc = pickle.dumps({'exc_type': str(exc_type), 'exc_value': str(exc_value), 'exc_traceback': exc_traceback, 'pickle_exception': pickle_exception}) pickle.loads(pickled_exc) # this is just to make sure they can be unpickled self.stats.write("exc_info", str(pickled_exc)) finally: store_result = strtobool(os.environ.get('STORE_RESULT', 'True')) if result is not None and store_result and not exception: output_upload_timestamp_t1 = time.time() logger.info("Storing function result - output.pickle - Size: {}".format(sizeof_fmt(len(pickled_output)))) self.internal_storage.put_data(self.output_key, pickled_output) output_upload_timestamp_t2 = time.time() self.stats.write("output_upload_time", round(output_upload_timestamp_t2 - output_upload_timestamp_t1, 8)) self.result_queue.put("Finished") logger.info("Finished")
def status(self, throw_except=True, internal_storage=None): """ Return the status returned by the call. If the call raised an exception, this method will raise the same exception If the future is cancelled before completing then CancelledError will be raised. :param check_only: Return None immediately if job is not complete. Default False. :param throw_except: Reraise exception if call raised. Default true. :param storage_handler: Storage handler to poll cloud storage. Default None. :return: Result of the call. :raises CancelledError: If the job is cancelled before completed. :raises TimeoutError: If job is not complete after `timeout` seconds. """ if self._state == ResponseFuture.State.New: raise ValueError("task not yet invoked") if self._state in [ResponseFuture.State.Ready, ResponseFuture.State.Success]: return self._call_status if internal_storage is None: internal_storage = InternalStorage(self._storage_config) if self._call_status is None: check_storage_path(internal_storage.get_storage_config(), self._storage_path) self._call_status = internal_storage.get_call_status(self.executor_id, self.job_id, self.call_id) self._status_query_count += 1 while self._call_status is None: time.sleep(self.GET_RESULT_SLEEP_SECS) self._call_status = internal_storage.get_call_status(self.executor_id, self.job_id, self.call_id) self._status_query_count += 1 self.activation_id = self._call_status.get('activation_id', None) if self._call_status['type'] == '__init__': self._set_state(ResponseFuture.State.Running) return self._call_status if self._call_status['exception']: self._set_state(ResponseFuture.State.Error) self._exception = pickle.loads(eval(self._call_status['exc_info'])) msg1 = ('ExecutorID {} | JobID {} - There was an exception - Activation ' 'ID: {}'.format(self.executor_id, self.job_id, self.activation_id)) if not self._call_status.get('exc_pickle_fail', False): fn_exctype = self._exception[0] fn_exc = self._exception[1] if fn_exc.args and fn_exc.args[0] == "HANDLER": self._handler_exception = True try: del fn_exc.errno except Exception: pass fn_exc.args = (fn_exc.args[1],) else: fn_exctype = Exception fn_exc = Exception(self._exception['exc_value']) self._exception = (fn_exctype, fn_exc, self._exception['exc_traceback']) def exception_hook(exctype, exc, trcbck): if exctype == fn_exctype and str(exc) == str(fn_exc): msg2 = '--> Exception: {} - {}'.format(fn_exctype.__name__, fn_exc) print(msg1) if not self.log_level else logger.info(msg1) if self._handler_exception: print(msg2+'\n') if not self.log_level else logger.info(msg2) else: traceback.print_exception(*self._exception) else: sys.excepthook = sys.__excepthook__ traceback.print_exception(exctype, exc, trcbck) if throw_except: sys.excepthook = exception_hook reraise(*self._exception) else: logger.info(msg1) logger.debug('Exception: {} - {}'.format(self._exception[0].__name__, self._exception[1])) return None self._call_metadata['host_submit_time'] = self._call_status.pop('host_submit_time') self._call_metadata['status_done_timestamp'] = time.time() self._call_metadata['status_query_count'] = self._status_query_count total_time = format(round(self._call_status['end_time'] - self._call_status['start_time'], 2), '.2f') log_msg = ('ExecutorID {} | JobID {} - Got status from call {} - Activation ' 'ID: {} - Time: {} seconds'.format(self.executor_id, self.job_id, self.call_id, self.activation_id, str(total_time))) logger.info(log_msg) self._set_state(ResponseFuture.State.Ready) if not self._call_status['result']: self._produce_output = False 
if not self._produce_output: self._set_state(ResponseFuture.State.Success) if 'new_futures' in self._call_status: self.result(throw_except=throw_except, internal_storage=internal_storage) return self._call_status
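# Usage sketch (not part of the class above). status() can be polled before
# result(); it returns the raw status record uploaded by the function (a dict),
# or None when the call failed and throw_except=False. `future` is illustrative.
call_status = future.status(throw_except=False)
if call_status is not None and not call_status.get('exception', False):
    print('Activation:', call_status.get('activation_id'))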
def function_handler(event): start_tstamp = time.time() log_level = event['log_level'] cloud_logging_config(log_level) logger.debug("Action handler started") extra_env = event.get('extra_env', {}) os.environ.update(extra_env) os.environ.update({'PYWREN_FUNCTION': 'True', 'PYTHONUNBUFFERED': 'True'}) config = event['config'] call_id = event['call_id'] job_id = event['job_id'] executor_id = event['executor_id'] exec_id = "{}/{}/{}".format(executor_id, job_id, call_id) logger.info("Execution-ID: {}".format(exec_id)) runtime_name = event['runtime_name'] runtime_memory = event['runtime_memory'] execution_timeout = event['execution_timeout'] logger.debug("Runtime name: {}".format(runtime_name)) logger.debug("Runtime memory: {}MB".format(runtime_memory)) logger.debug("Function timeout: {}s".format(execution_timeout)) func_key = event['func_key'] data_key = event['data_key'] data_byte_range = event['data_byte_range'] storage_config = extract_storage_config(config) internal_storage = InternalStorage(storage_config) call_status = CallStatus(config, internal_storage) call_status.response['host_submit_tstamp'] = event['host_submit_tstamp'] call_status.response['start_tstamp'] = start_tstamp context_dict = { 'python_version': os.environ.get("PYTHON_VERSION"), 'call_id': call_id, 'job_id': job_id, 'executor_id': executor_id, 'activation_id': os.environ.get('__PW_ACTIVATION_ID') } call_status.response.update(context_dict) show_memory_peak = strtobool(os.environ.get('SHOW_MEMORY_PEAK', 'False')) try: if version.__version__ != event['pywren_version']: msg = ( "PyWren version mismatch. Host version: {} - Runtime version: {}" .format(event['pywren_version'], version.__version__)) raise RuntimeError('HANDLER', msg) # send init status event call_status.send('__init__') # call_status.response['free_disk_bytes'] = free_disk_space("/tmp") custom_env = { 'PYWREN_CONFIG': json.dumps(config), 'PYWREN_EXECUTION_ID': exec_id, 'PYTHONPATH': "{}:{}".format(os.getcwd(), PYWREN_LIBS_PATH) } os.environ.update(custom_env) jobrunner_stats_dir = os.path.join(STORAGE_FOLDER, storage_config['bucket'], JOBS_PREFIX, executor_id, job_id, call_id) os.makedirs(jobrunner_stats_dir, exist_ok=True) jobrunner_stats_filename = os.path.join(jobrunner_stats_dir, 'jobrunner.stats.txt') jobrunner_config = { 'pywren_config': config, 'call_id': call_id, 'job_id': job_id, 'executor_id': executor_id, 'func_key': func_key, 'data_key': data_key, 'log_level': log_level, 'data_byte_range': data_byte_range, 'output_key': create_output_key(JOBS_PREFIX, executor_id, job_id, call_id), 'stats_filename': jobrunner_stats_filename } if show_memory_peak: mm_handler_conn, mm_conn = Pipe() memory_monitor = Thread(target=memory_monitor_worker, args=(mm_conn, )) memory_monitor.start() handler_conn, jobrunner_conn = Pipe() jobrunner = JobRunner(jobrunner_config, jobrunner_conn, internal_storage) logger.debug('Starting JobRunner process') local_execution = strtobool( os.environ.get('__PW_LOCAL_EXECUTION', 'False')) jrp = Thread(target=jobrunner.run) if local_execution else Process( target=jobrunner.run) jrp.start() jrp.join(execution_timeout) logger.debug('JobRunner process finished') if jrp.is_alive(): # If process is still alive after jr.join(job_max_runtime), kill it try: jrp.terminate() except Exception: # thread does not have terminate method pass msg = ('Function exceeded maximum time of {} seconds and was ' 'killed'.format(execution_timeout)) raise TimeoutError('HANDLER', msg) if show_memory_peak: mm_handler_conn.send('STOP') memory_monitor.join() 
peak_memory_usage = int(mm_handler_conn.recv()) logger.info("Peak memory usage: {}".format( sizeof_fmt(peak_memory_usage))) call_status.response['peak_memory_usage'] = peak_memory_usage if not handler_conn.poll(): logger.error( 'No completion message received from JobRunner process') logger.debug('Assuming memory overflow...') # Only 1 message is returned by jobrunner when it finishes. # If no message, this means that the jobrunner process was killed. # 99% of times the jobrunner is killed due an OOM, so we assume here an OOM. msg = 'Function exceeded maximum memory and was killed' raise MemoryError('HANDLER', msg) if os.path.exists(jobrunner_stats_filename): with open(jobrunner_stats_filename, 'r') as fid: for l in fid.readlines(): key, value = l.strip().split(" ", 1) try: call_status.response[key] = float(value) except Exception: call_status.response[key] = value if key in [ 'exception', 'exc_pickle_fail', 'result', 'new_futures' ]: call_status.response[key] = eval(value) except Exception: # internal runtime exceptions print('----------------------- EXCEPTION !-----------------------', flush=True) traceback.print_exc(file=sys.stdout) print('----------------------------------------------------------', flush=True) call_status.response['exception'] = True pickled_exc = pickle.dumps(sys.exc_info()) pickle.loads( pickled_exc) # this is just to make sure they can be unpickled call_status.response['exc_info'] = str(pickled_exc) finally: call_status.response['end_tstamp'] = time.time() call_status.send('__end__') for key in extra_env: os.environ.pop(key) logger.info("Finished")
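# Shape of the `event` dict consumed by function_handler() above; the keys are
# the ones the handler reads, the values are illustrative placeholders only.
event = {
    'config': {},                      # full pywren config dict
    'log_level': 'INFO',
    'pywren_version': '1.7.0',         # must match version.__version__ inside the runtime
    'executor_id': 'a1b2c3-0',
    'job_id': 'A000',
    'call_id': '00000',
    'runtime_name': 'python3.7',
    'runtime_memory': 256,
    'execution_timeout': 600,
    'func_key': '...',                 # object-storage keys created by the invoker
    'data_key': '...',
    'data_byte_range': None,           # or (start, end) for a data partition
    'host_submit_tstamp': 0.0,
    'extra_env': {},
}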
class GCPFunctionsBackend: def __init__(self, gcp_functions_config): self.log_active = logger.getEffectiveLevel() != logging.WARNING self.name = 'gcp_functions' self.gcp_functions_config = gcp_functions_config self.package = 'pywren_v'+__version__ self.region = gcp_functions_config['region'] self.service_account = gcp_functions_config['service_account'] self.project = gcp_functions_config['project_name'] self.credentials_path = gcp_functions_config['credentials_path'] self.num_retries = gcp_functions_config['retries'] self.retry_sleeps = gcp_functions_config['retry_sleeps'] # Instantiate storage client (to upload function bin) self.internal_storage = InternalStorage(gcp_functions_config['storage']) # Setup pubsub client try: # Get credenitals from JSON file service_account_info = json.load(open(self.credentials_path)) credentials = jwt.Credentials.from_service_account_info(service_account_info, audience=AUDIENCE) credentials_pub = credentials.with_claims(audience=AUDIENCE) except Exception: # Get credentials from gcp function environment credentials_pub = None self.publisher_client = pubsub_v1.PublisherClient(credentials=credentials_pub) log_msg = 'PyWren v{} init for GCP Functions - Project: {} - Region: {}'.format( __version__, self.project, self.region) logger.info(log_msg) if not self.log_active: print(log_msg) def _format_action_name(self, runtime_name, runtime_memory): runtime_name = (self.package+'_'+runtime_name).replace('.', '-') return '{}_{}MB'.format(runtime_name, runtime_memory) def _format_topic_name(self, runtime_name, runtime_memory): return self._format_action_name(runtime_name, runtime_memory)+'_topic' def _unformat_action_name(self, action_name): split = action_name.split('_') runtime_name = split[1].replace('-', '.') runtime_memory = int(split[2].replace('MB', '')) return runtime_name, runtime_memory def _full_function_location(self, function_name): return 'projects/{}/locations/{}/functions/{}'.format(self.project, self.region, function_name) def _full_topic_location(self, topic_name): return 'projects/{}/topics/{}'.format(self.project, topic_name) def _full_default_location(self): return 'projects/{}/locations/{}'.format(self.project, self.region) def _encode_payload(self, payload): return base64.b64encode(bytes(json.dumps(payload), 'utf-8')).decode('utf-8') def _get_auth_session(self): credentials = service_account.Credentials.from_service_account_file(self.credentials_path, scopes=SCOPES) http = httplib2.Http() return AuthorizedHttp(credentials, http=http) def _get_funct_conn(self): http = self._get_auth_session() return build('cloudfunctions', FUNCTIONS_API_VERSION, http=http, cache_discovery=False) def _get_default_runtime_image_name(self): return 'python'+version_str(sys.version_info) def _create_function(self, runtime_name, memory, code, timeout=60, trigger='HTTP'): logger.debug("Creating function {} - Memory: {} Timeout: {} Trigger: {}".format( runtime_name, memory, timeout, trigger)) default_location = self._full_default_location() function_location = self._full_function_location( self._format_action_name(runtime_name, memory)) bin_name = self._format_action_name(runtime_name, memory)+'_bin.zip' self.internal_storage.put_data(bin_name, code) cloud_function = { 'name': function_location, 'description': self.package, 'entryPoint': 'main', 'runtime': runtime_name.lower().replace('.', ''), 'timeout': str(timeout)+'s', 'availableMemoryMb': memory, 'serviceAccountEmail': self.service_account, 'maxInstances': 0, 'sourceArchiveUrl': 
'gs://{}/{}'.format(self.internal_storage.bucket, bin_name) } if trigger == 'HTTP': cloud_function['httpsTrigger'] = {} elif trigger == 'Pub/Sub': topic_location = self._full_topic_location( self._format_topic_name(runtime_name, memory)) cloud_function['eventTrigger'] = { 'eventType': 'providers/cloud.pubsub/eventTypes/topic.publish', 'resource': topic_location, 'failurePolicy': {} } response = self._get_funct_conn().projects().locations().functions().create( # pylint: disable=no-member location=default_location, body=cloud_function ).execute(num_retries=self.num_retries) # Wait until function is completely deployed while True: response = self._get_funct_conn().projects().locations().functions().get( # pylint: disable=no-member name=function_location ).execute(num_retries=self.num_retries) if response['status'] == 'ACTIVE': break else: time.sleep(random.choice(self.retry_sleeps)) def build_runtime(self): pass def update_runtime(self, runtime_name, code, memory=3008, timeout=900): pass def create_runtime(self, runtime_name, memory, timeout=60): logger.debug("Creating runtime {} - \ Memory: {} Timeout: {}".format(runtime_name, memory, timeout)) # Get runtime preinstalls runtime_meta = self._generate_runtime_meta(runtime_name) # Create topic topic_name = self._format_topic_name(runtime_name, memory) topic_location = self._full_topic_location(topic_name) try: # Try getting topic config # pylint: disable=no-member self.publisher_client.get_topic(topic_location) # If no exception is raised, then the topic exists logger.info( "Topic {} already exists - Restarting queue...".format(topic_location)) self.publisher_client.delete_topic(topic_location) except google.api_core.exceptions.GoogleAPICallError: pass logger.debug("Creating topic {}...".format(topic_location)) self.publisher_client.create_topic(topic_location) # Create function create_function_handler_zip(ZIP_LOCATION, 'main.py', __file__) with open(ZIP_LOCATION, "rb") as action_zip: action_bin = action_zip.read() self._create_function(runtime_name, memory, action_bin, timeout=timeout, trigger='Pub/Sub') return runtime_meta def delete_runtime(self, runtime_name, runtime_memory): function_location = self._full_function_location( self._format_action_name(runtime_name, runtime_memory)) self._get_funct_conn().projects().locations().functions().delete( # pylint: disable=no-member name=function_location, ).execute(num_retries=self.num_retries) # Wait until function is completely deleted while True: try: response = self._get_funct_conn().projects().locations().functions().get( # pylint: disable=no-member name=function_location ).execute(num_retries=self.num_retries) except HttpError: break if response['status'] == 'DELETE_IN_PROGRESS': time.sleep(random.choice(self.retry_sleeps)) def delete_all_runtimes(self): runtimes = self.list_runtimes() for runtime in runtimes: if 'cloudbutton_v' in runtime: runtime_name, runtime_memory = self._unformat_action_name( runtime) self.delete_runtime(runtime_name, runtime_memory) def list_runtimes(self, docker_image_name='all'): default_location = self._full_default_location() response = self._get_funct_conn().projects().locations().functions().list( # pylint: disable=no-member location=default_location, body={} ).execute(num_retries=self.num_retries) result = response['Functions'] if 'Functions' in response else [] return result def invoke(self, runtime_name, runtime_memory, payload={}): exec_id = payload['executor_id'] call_id = payload['call_id'] topic_location = self._full_topic_location( 
            self._format_topic_name(runtime_name, runtime_memory))

        start = time.time()
        try:
            # Publish message
            fut = self.publisher_client.publish(
                topic_location, bytes(json.dumps(payload).encode('utf-8')))
            invokation_id = fut.result()
        except Exception as e:
            logger.debug('ExecutorID {} - Function {} invocation failed: {}'.format(
                exec_id, call_id, str(e)))
            return None

        roundtrip = time.time() - start
        resp_time = format(round(roundtrip, 3), '.3f')
        logger.debug('ExecutorID {} - Function {} invocation done! ({}s) - Activation ID: {}'.format(
            exec_id, call_id, resp_time, invokation_id))

        return invokation_id

    def invoke_with_result(self, runtime_name, runtime_memory, payload={}):
        action_name = self._format_action_name(runtime_name, runtime_memory)
        function_location = self._full_function_location(action_name)

        response = self._get_funct_conn().projects().locations().functions().call(  # pylint: disable=no-member
            name=function_location,
            body={'data': json.dumps({'data': self._encode_payload(payload)})}
        ).execute(num_retries=self.num_retries)

        return json.loads(response['result'])

    def get_runtime_key(self, runtime_name, runtime_memory):
        action_name = self._format_action_name(runtime_name, runtime_memory)
        runtime_key = os.path.join(self.name, self.region, action_name)
        return runtime_key

    def _generate_runtime_meta(self, runtime_name):
        action_code = """
        import sys
        import pkgutil
        import json

        def main(request):
            runtime_meta = dict()
            mods = list(pkgutil.iter_modules())
            runtime_meta['preinstalls'] = [entry for entry in sorted([[mod, is_pkg] for _, mod, is_pkg in mods])]
            python_version = sys.version_info
            runtime_meta['python_ver'] = str(python_version[0])+"."+str(python_version[1])
            return json.dumps(runtime_meta)
        """

        action_location = os.path.join(tempfile.gettempdir(), 'extract_preinstalls_gcp.py')
        with open(action_location, 'w') as f:
            f.write(textwrap.dedent(action_code))

        modules_zip_action = os.path.join(tempfile.gettempdir(), 'extract_preinstalls_gcp.zip')
        with zipfile.ZipFile(modules_zip_action, 'w') as extract_modules_zip:
            extract_modules_zip.write(action_location, 'main.py')

        with open(modules_zip_action, 'rb') as modules_zip:
            action_code = modules_zip.read()

        self._create_function(runtime_name, 128, action_code, trigger='HTTP')

        logger.debug("Extracting Python modules list from: {}".format(runtime_name))
        try:
            runtime_meta = self.invoke_with_result(runtime_name, 128)
        except Exception:
            raise Exception("Unable to invoke 'modules' action")
        try:
            self.delete_runtime(runtime_name, 128)
        except Exception:
            raise Exception("Unable to delete 'modules' action")

        if not runtime_meta or 'preinstalls' not in runtime_meta:
            raise Exception(runtime_meta)

        return runtime_meta
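# The configuration dict expected by GCPFunctionsBackend.__init__() above;
# the keys are the ones the constructor reads, the values are placeholders.
gcp_functions_config = {
    'project_name': 'my-gcp-project',
    'region': 'us-east1',
    'service_account': 'pywren@my-gcp-project.iam.gserviceaccount.com',
    'credentials_path': '/path/to/credentials.json',
    'retries': 5,
    'retry_sleeps': [1, 2, 5, 10],
    'storage': {},   # storage config passed straight to InternalStorage()
}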
class JobRunner(Process): def __init__(self, jr_config, result_queue): super().__init__() start_time = time.time() self.jr_config = jr_config self.result_queue = result_queue log_level = self.jr_config['log_level'] cloud_logging_config(log_level) self.pywren_config = self.jr_config['pywren_config'] self.storage_config = extract_storage_config(self.pywren_config) self.call_id = self.jr_config['call_id'] self.job_id = self.jr_config['job_id'] self.executor_id = self.jr_config['executor_id'] self.func_key = self.jr_config['func_key'] self.data_key = self.jr_config['data_key'] self.data_byte_range = self.jr_config['data_byte_range'] self.output_key = self.jr_config['output_key'] self.stats = stats(self.jr_config['stats_filename']) self.stats.write('jobrunner_start', start_time) self.show_memory = strtobool( os.environ.get('SHOW_MEMORY_USAGE', 'False')) def _get_function_and_modules(self): """ Gets and unpickles function and modules from storage """ logger.debug("Getting function and modules") func_download_time_t1 = time.time() func_obj = self.internal_storage.get_func(self.func_key) loaded_func_all = pickle.loads(func_obj) func_download_time_t2 = time.time() self.stats.write( 'func_download_time', round(func_download_time_t2 - func_download_time_t1, 8)) logger.debug("Finished getting Function and modules") return loaded_func_all def _save_modules(self, module_data): """ Save modules, before we unpickle actual function """ logger.debug("Writing Function dependencies to local disk") PYTHON_MODULE_PATH = self.jr_config['python_module_path'] shutil.rmtree(PYTHON_MODULE_PATH, True) # delete old modules os.mkdir(PYTHON_MODULE_PATH) sys.path.append(PYTHON_MODULE_PATH) for m_filename, m_data in module_data.items(): m_path = os.path.dirname(m_filename) if len(m_path) > 0 and m_path[0] == "/": m_path = m_path[1:] to_make = os.path.join(PYTHON_MODULE_PATH, m_path) try: os.makedirs(to_make) except OSError as e: if e.errno == 17: pass else: raise e full_filename = os.path.join(to_make, os.path.basename(m_filename)) with open(full_filename, 'wb') as fid: fid.write(b64str_to_bytes(m_data)) #logger.info("Finished writing {} module files".format(len(loaded_func_all['module_data']))) #logger.debug(subprocess.check_output("find {}".format(PYTHON_MODULE_PATH), shell=True)) #logger.debug(subprocess.check_output("find {}".format(os.getcwd()), shell=True)) logger.debug("Finished writing Function dependencies") def _unpickle_function(self, pickled_func): """ Unpickle function; it will expect modules to be there """ logger.debug("Unpickle Function") loaded_func = pickle.loads(pickled_func) logger.debug("Finished Function unpickle") return loaded_func def _load_data(self): extra_get_args = {} if self.data_byte_range is not None: range_str = 'bytes={}-{}'.format(*self.data_byte_range) extra_get_args['Range'] = range_str logger.debug("Getting function data") data_download_time_t1 = time.time() data_obj = self.internal_storage.get_data( self.data_key, extra_get_args=extra_get_args) logger.debug("Finished getting Function data") logger.debug("Unpickle Function data") loaded_data = pickle.loads(data_obj) logger.debug("Finished unpickle Function data") data_download_time_t2 = time.time() self.stats.write( 'data_download_time', round(data_download_time_t2 - data_download_time_t1, 8)) return loaded_data def _fill_optional_args(self, function, data): """ Fills in those reserved, optional parameters that might be write to the function signature """ func_sig = inspect.signature(function) if 'ibm_cos' in func_sig.parameters: if 
'ibm_cos' in self.pywren_config: try: ibm_boto3_client = Storage(self.storage_config, 'ibm_cos').get_client() data['ibm_cos'] = ibm_boto3_client except Exception as e: logger.error('Cannot create the ibm_cos connection: {}', str(e)) data['ibm_cos'] = None else: logger.error( 'Cannot create the ibm_cos connection: Configuration not provided' ) data['ibm_cos'] = None if 'internal_storage' in func_sig.parameters: data['internal_storage'] = self.internal_storage if 'rabbitmq' in func_sig.parameters: if 'rabbitmq' in self.pywren_config: try: rabbit_amqp_url = self.pywren_config['rabbitmq'].get( 'amqp_url') params = pika.URLParameters(rabbit_amqp_url) connection = pika.BlockingConnection(params) data['rabbitmq'] = connection except Exception as e: logger.error('Cannot create the rabbitmq connection: {}', str(e)) data['rabbitmq'] = None else: logger.error( 'Cannot create the rabbitmq connection: Configuration not provided' ) data['rabbitmq'] = None if 'id' in func_sig.parameters: data['id'] = int(self.call_id) def _create_data_stream(self, data): """ Creates the data stream in case of object processing """ extra_get_args = {} if 'url' in data: url = data['url'] logger.info('Getting dataset from {}'.format(url.path)) if url.data_byte_range is not None: range_str = 'bytes={}-{}'.format(*url.data_byte_range) extra_get_args['Range'] = range_str logger.info('Chunk: {} - Range: {}'.format( url.part, extra_get_args['Range'])) resp = requests.get(url.path, headers=extra_get_args, stream=True) url.data_stream = resp.raw if 'obj' in data: obj = data['obj'] obj.storage_backend storage_handler = Storage( self.pywren_config, obj.storage_backend).get_storage_handler() logger.info('Getting dataset from {}://{}/{}'.format( obj.storage_backend, obj.bucket, obj.key)) if obj.data_byte_range is not None: extra_get_args['Range'] = 'bytes={}-{}'.format( *obj.data_byte_range) logger.info('Chunk: {} - Range: {}'.format( obj.part, extra_get_args['Range'])) sb = storage_handler.get_object(obj.bucket, obj.key, stream=True, extra_get_args=extra_get_args) obj.data_stream = WrappedStreamingBodyPartition( sb, obj.chunk_size, obj.data_byte_range) else: obj.data_stream = storage_handler.get_object(obj.bucket, obj.key, stream=True) def run(self): """ Runs the function """ logger.info("Started") result = None exception = False try: self.internal_storage = InternalStorage(self.storage_config) self.internal_storage.tmp_obj_prefix = self.output_key.rsplit( '/', 1)[0] loaded_func_all = self._get_function_and_modules() self._save_modules(loaded_func_all['module_data']) function = self._unpickle_function(loaded_func_all['func']) data = self._load_data() if is_object_processing_function(function): self._create_data_stream(data) self._fill_optional_args(function, data) if self.show_memory: logger.debug( "Memory usage before call the function: {}".format( get_current_memory_usage())) logger.info("Going to execute '{}()'".format(str( function.__name__))) print('---------------------- FUNCTION LOG ----------------------', flush=True) func_exec_time_t1 = time.time() result = function(**data) func_exec_time_t2 = time.time() print('----------------------------------------------------------', flush=True) logger.info("Success function execution") if self.show_memory: logger.debug("Memory usage after call the function: {}".format( get_current_memory_usage())) self.stats.write('function_exec_time', round(func_exec_time_t2 - func_exec_time_t1, 8)) # Check for new futures if result is not None: self.stats.write("result", True) if isinstance(result, 
ResponseFuture) or \ (type(result) == list and len(result) > 0 and isinstance(result[0], ResponseFuture)): self.stats.write('new_futures', True) logger.debug("Pickling result") output_dict = {'result': result} pickled_output = pickle.dumps(output_dict) if self.show_memory: logger.debug( "Memory usage after output serialization: {}".format( get_current_memory_usage())) else: logger.debug("No result to store") self.stats.write("result", False) except Exception: exception = True self.stats.write("exception", True) exc_type, exc_value, exc_traceback = sys.exc_info() print('----------------------- EXCEPTION !-----------------------', flush=True) traceback.print_exc(file=sys.stdout) print('----------------------------------------------------------', flush=True) if self.show_memory: logger.debug("Memory usage after call the function: {}".format( get_current_memory_usage())) try: logger.debug("Pickling exception") pickled_exc = pickle.dumps( (exc_type, exc_value, exc_traceback)) pickle.loads( pickled_exc ) # this is just to make sure they can be unpickled self.stats.write("exc_info", str(pickled_exc)) except Exception as pickle_exception: # Shockingly often, modules like subprocess don't properly # call the base Exception.__init__, which results in them # being unpickleable. As a result, we actually wrap this in a try/catch block # and more-carefully handle the exceptions if any part of this save / test-reload # fails self.stats.write("exc_pickle_fail", True) pickled_exc = pickle.dumps({ 'exc_type': str(exc_type), 'exc_value': str(exc_value), 'exc_traceback': exc_traceback, 'pickle_exception': pickle_exception }) pickle.loads( pickled_exc ) # this is just to make sure they can be unpickled self.stats.write("exc_info", str(pickled_exc)) finally: store_result = strtobool(os.environ.get('STORE_RESULT', 'True')) if result is not None and store_result and not exception: output_upload_timestamp_t1 = time.time() logger.info( "Storing function result - output.pickle - Size: {}". format(sizeof_fmt(len(pickled_output)))) self.internal_storage.put_data(self.output_key, pickled_output) output_upload_timestamp_t2 = time.time() self.stats.write( "output_upload_time", round( output_upload_timestamp_t2 - output_upload_timestamp_t1, 8)) self.result_queue.put("Finished") logger.info("Finished")
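# Hedged sketch of the pickle-then-unpickle verification used in JobRunner.run()
# when an exception escapes the user function. Traceback objects are often not
# picklable, which is exactly why the fallback path stores string
# representations (the real jobrunner keeps more fields than this sketch does).
import pickle
import sys


def pickle_current_exception():
    exc_type, exc_value, exc_traceback = sys.exc_info()
    try:
        pickled_exc = pickle.dumps((exc_type, exc_value, exc_traceback))
        pickle.loads(pickled_exc)  # round-trip check: fail early if it cannot be restored
    except Exception as pickle_exception:
        pickled_exc = pickle.dumps({'exc_type': str(exc_type),
                                    'exc_value': str(exc_value),
                                    'pickle_exception': str(pickle_exception)})
        pickle.loads(pickled_exc)  # the all-strings fallback is always restorable
    return pickled_exc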
class FunctionExecutor: def __init__(self, config=None, runtime=None, runtime_memory=None, compute_backend=None, compute_backend_region=None, log_level=None, rabbitmq_monitor=False): """ Initialize and return a ServerlessExecutor class. :param config: Settings passed in here will override those in config file. Default None. :param runtime: Runtime name to use. Default None. :param runtime_memory: memory to use in the runtime :param log_level: log level to use during the execution :param rabbitmq_monitor: use rabbitmq as monitoring system :return `ServerlessExecutor` object. """ self.start_time = time.time() self._state = ExecutorState.new self.config = default_config(config) self.is_cf_cluster = is_cf_cluster() self.data_cleaner = self.config['pywren']['data_cleaner'] # Overwrite runtime variables if runtime: self.config['pywren']['runtime'] = runtime if runtime_memory: self.config['pywren']['runtime_memory'] = int(runtime_memory) if compute_backend: self.config['pywren']['compute_backend'] = compute_backend if compute_backend_region: self.config['pywren']['compute_backend_region'] = compute_backend_region # Log level Configuration self.log_level = log_level if not self.log_level: if(logger.getEffectiveLevel() != logging.WARNING): self.log_level = logging.getLevelName(logger.getEffectiveLevel()) if self.log_level: os.environ["CB_LOG_LEVEL"] = self.log_level if not self.is_cf_cluster: default_logging_config(self.log_level) if 'CB_EXECUTOR_ID' in os.environ: self.executor_id = os.environ['CB_EXECUTOR_ID'] else: self.executor_id = create_executor_id() logger.debug('ServerlessExecutor created with ID: {}'.format(self.executor_id)) # RabbitMQ monitor configuration self.rabbitmq_monitor = rabbitmq_monitor if self.rabbitmq_monitor: if self.config['rabbitmq']['amqp_url']: os.environ["CB_RABBITMQ_MONITOR"] = 'True' else: self.rabbitmq_monitor = False else: self.config['rabbitmq']['amqp_url'] = None storage_config = extract_storage_config(self.config) self.internal_storage = InternalStorage(storage_config) self.invoker = Invoker(self.config, self.executor_id) self.jobs = {} def call_async(self, func, data, extra_env=None, extra_meta=None, runtime_memory=None, timeout=EXECUTION_TIMEOUT): """ For running one function execution asynchronously :param func: the function to map over the data :param data: input data :param extra_env: Additional environment variables for action environment. Default None. :param extra_meta: Additional metadata to pass to action. Default None. """ if self._state == ExecutorState.finished: raise Exception('You cannot run call_async() in the current state,' ' create a new FunctionExecutor() instance.') job_id = str(len(self.jobs)).zfill(3) job = create_call_async_job(self.config, self.internal_storage, self.executor_id, job_id, func, data, extra_env, extra_meta, runtime_memory, timeout) future = self.invoker.run(job) self.jobs[job['job_id']] = {'futures': future, 'total': job['total_calls'], 'state': JobState.running} self._state = ExecutorState.running return future[0] def map(self, map_function, map_iterdata, extra_env=None, extra_meta=None, runtime_memory=None, chunk_size=None, remote_invocation=False, timeout=EXECUTION_TIMEOUT, remote_invocation_groups=None, invoke_pool_threads=500, overwrite_invoke_args=None, exclude_modules=None): """ :param func: the function to map over the data :param iterdata: An iterable of input data :param extra_env: Additional environment variables for action environment. Default None. :param extra_meta: Additional metadata to pass to action. 
Default None. :param chunk_size: the size of the data chunks. 'None' for processing the whole file in one map :param remote_invocation: Enable or disable remote_invocayion mechanism. Default 'False' :param timeout: Time that the functions have to complete their execution before raising a timeout. :param data_type: the type of the data. Now allowed: None (files with newline) and csv. :param invoke_pool_threads: Number of threads to use to invoke. :param data_all_as_one: upload the data as a single object. Default True :param overwrite_invoke_args: Overwrite other args. Mainly used for testing. :param exclude_modules: Explicitly keep these modules from pickled dependencies. :return: A list with size `len(iterdata)` of futures for each job :rtype: list of futures. """ if self._state == ExecutorState.finished: raise Exception('You cannot run map() in the current state.' ' Create a new FunctionExecutor() instance.') job_id = str(len(self.jobs)).zfill(3) job, unused_ppo = create_map_job(self.config, self.internal_storage, self.executor_id, job_id, map_function=map_function, iterdata=map_iterdata, extra_env=extra_env, extra_meta=extra_meta, obj_chunk_size=chunk_size, runtime_memory=runtime_memory, remote_invocation=remote_invocation, remote_invocation_groups=remote_invocation_groups, invoke_pool_threads=invoke_pool_threads, exclude_modules=exclude_modules, is_cf_cluster=self.is_cf_cluster, overwrite_invoke_args=overwrite_invoke_args, execution_timeout=timeout) map_futures = self.invoker.run(job) self.jobs[job['job_id']] = {'futures': map_futures, 'total': job['total_calls'], 'state': JobState.running} self._state = ExecutorState.running if len(map_futures) == 1: return map_futures[0] return map_futures def map_reduce(self, map_function, map_iterdata, reduce_function, extra_env=None, map_runtime_memory=None, reduce_runtime_memory=None, extra_meta=None, chunk_size=None, remote_invocation=False, remote_invocation_groups=None, timeout=EXECUTION_TIMEOUT, reducer_one_per_object=False, reducer_wait_local=False, invoke_pool_threads=500, overwrite_invoke_args=None, exclude_modules=None): """ Map the map_function over the data and apply the reduce_function across all futures. This method is executed all within CF. :param map_function: the function to map over the data :param map_iterdata: the function to reduce over the futures :param reduce_function: the function to reduce over the futures :param extra_env: Additional environment variables for action environment. Default None. :param extra_meta: Additional metadata to pass to action. Default None. :param chunk_size: the size of the data chunks. 'None' for processing the whole file in one map :param remote_invocation: Enable or disable remote_invocayion mechanism. Default 'False' :param timeout: Time that the functions have to complete their execution before raising a timeout. :param data_type: the type of the data. Now allowed: None (files with newline) and csv. :param reducer_one_per_object: Set one reducer per object after running the partitioner :param reducer_wait_local: Wait for results locally :param invoke_pool_threads: Number of threads to use to invoke. :param data_all_as_one: upload the data as a single object. Default True :param overwrite_invoke_args: Overwrite other args. Mainly used for testing. :param exclude_modules: Explicitly keep these modules from pickled dependencies. 
:return: A list with size `len(map_iterdata)` of futures for each job """ if self._state == ExecutorState.finished: raise Exception('You cannot run map_reduce() in the current state.' ' Create a new FunctionExecutor() instance.') job_id = str(len(self.jobs)).zfill(3) job, parts_per_object = create_map_job(self.config, self.internal_storage, self.executor_id, job_id, map_function=map_function, iterdata=map_iterdata, extra_env=extra_env, extra_meta=extra_meta, obj_chunk_size=chunk_size, runtime_memory=map_runtime_memory, remote_invocation=remote_invocation, remote_invocation_groups=remote_invocation_groups, invoke_pool_threads=invoke_pool_threads, exclude_modules=exclude_modules, is_cf_cluster=self.is_cf_cluster, overwrite_invoke_args=overwrite_invoke_args, execution_timeout=timeout) map_futures = self.invoker.run(job) self.jobs[job['job_id']] = {'futures': map_futures, 'total': job['total_calls'], 'state': JobState.running} self._state = ExecutorState.running if reducer_wait_local: self.monitor(futures=map_futures) job = create_reduce_job(self.config, self.internal_storage, self.executor_id, job_id, reduce_function, reduce_runtime_memory, map_futures, parts_per_object, reducer_one_per_object, extra_env, extra_meta) reduce_futures = self.invoker.run(job) self.jobs[job['job_id']] = {'futures': reduce_futures, 'total': job['total_calls'], 'state': JobState.running} for f in map_futures: f.produce_output = False return map_futures + reduce_futures def monitor(self, futures=None, throw_except=True, return_when=ALL_COMPLETED, download_results=False, timeout=EXECUTION_TIMEOUT, THREADPOOL_SIZE=128, WAIT_DUR_SEC=1): """ Wait for the Future instances `fs` to complete. Returns a 2-tuple of lists. The first list contains the futures that completed (finished or cancelled) before the wait completed. The second contains uncompleted futures. :param futures: Futures list. Default None :param throw_except: Re-raise exception if call raised. Default True. :param return_when: One of `ALL_COMPLETED`, `ANY_COMPLETED`, `ALWAYS` :param download_results: Download results. Default false (Only download statuses) :param timeout: Timeout of waiting for results. :param THREADPOOL_SIZE: Number of threads to use. Default 64 :param WAIT_DUR_SEC: Time interval between each check. :return: `(fs_done, fs_notdone)` where `fs_done` is a list of futures that have completed and `fs_notdone` is a list of futures that have not completed. 
:rtype: 2-tuple of list """ if not futures: futures = [] for job in self.jobs: if self.jobs[job]['state'] == JobState.running: futures.extend(self.jobs[job]['futures']) self.jobs[job]['state'] = JobState.ready if type(futures) != list: ftrs = [futures] else: ftrs = futures if not ftrs: raise Exception('You must run call_async(), map() or map_reduce()' ' before calling get_result() method') rabbit_amqp_url = None if self.rabbitmq_monitor: rabbit_amqp_url = self.config['rabbitmq'].get('amqp_url') if rabbit_amqp_url and not download_results: logger.info('Going to use RabbitMQ to monitor function activations') logging.getLogger('pika').setLevel(logging.WARNING) if download_results: msg = 'ExecutorID {} - Getting results...'.format(self.executor_id) else: msg = 'ExecutorID {} - Waiting for functions to complete...'.format(self.executor_id) logger.info(msg) if not self.log_level and self._state == ExecutorState.running: print(msg) if is_unix_system(): signal.signal(signal.SIGALRM, timeout_handler) signal.alarm(timeout) pbar = None if not self.is_cf_cluster and self._state == ExecutorState.running \ and not self.log_level: from tqdm.auto import tqdm if is_notebook(): pbar = tqdm(bar_format='{n}/|/ {n_fmt}/{total_fmt}', total=len(ftrs)) # ncols=800 else: print() pbar = tqdm(bar_format=' {l_bar}{bar}| {n_fmt}/{total_fmt} ', total=len(ftrs), disable=False) try: wait(ftrs, self.executor_id, self.internal_storage, download_results=download_results, throw_except=throw_except, return_when=return_when, rabbit_amqp_url=rabbit_amqp_url, pbar=pbar, THREADPOOL_SIZE=THREADPOOL_SIZE, WAIT_DUR_SEC=WAIT_DUR_SEC) except FunctionException as e: if is_unix_system(): signal.alarm(0) if pbar: pbar.close() logger.info(e.msg) if not is_notebook(): print() if not self.log_level: print(e.msg) if e.exc_msg: print('--> Exception: ' + e.exc_msg) else: print() traceback.print_exception(*e.exception) sys.exit() except TimeoutError: if download_results: not_dones_activation_ids = [f.activation_id for f in ftrs if not f.done and not (f.ready and not f.produce_output)] else: not_dones_activation_ids = [f.activation_id for f in ftrs if not f.ready] msg = ('ExecutorID {} - Raised timeout of {} seconds waiting for results ' '\nActivations not done: {}'.format(self.executor_id, timeout, not_dones_activation_ids)) self._state = ExecutorState.error except KeyboardInterrupt: if download_results: not_dones_activation_ids = [f.activation_id for f in ftrs if not f.done and not (f.ready and not f.produce_output)] else: not_dones_activation_ids = [f.activation_id for f in ftrs if not f.ready] msg = 'ExecutorID {} - Cancelled \nActivations not done: {}'.format(self.executor_id, not_dones_activation_ids) self._state = ExecutorState.error finally: if is_unix_system(): signal.alarm(0) if pbar: pbar.close() if not is_notebook(): print() if self._state == ExecutorState.error: logger.info(msg) if not self.log_level: print(msg) if download_results and self.data_cleaner and not self.is_cf_cluster: self.clean() if download_results: fs_dones = [f for f in ftrs if f.done] fs_notdones = [f for f in ftrs if not f.done] else: fs_dones = [f for f in ftrs if f.ready] fs_notdones = [f for f in ftrs if not f.ready] self._state = ExecutorState.ready return fs_dones, fs_notdones def get_result(self, futures=None, throw_except=True, timeout=EXECUTION_TIMEOUT, THREADPOOL_SIZE=64, WAIT_DUR_SEC=1): """ For getting results :param futures: Futures list. Default None :param throw_except: Reraise exception if call raised. Default True. 
:param verbose: Shows some information prints. Default False :param timeout: Timeout for waiting for results. :param THREADPOOL_SIZE: Number of threads to use. Default 64 :param WAIT_DUR_SEC: Time interval between each check. :return: The result of the future/s """ if not futures: futures = [] for job in self.jobs: if self.jobs[job]['state'] != JobState.done: futures.extend(self.jobs[job]['futures']) self.jobs[job]['state'] = JobState.done fs_dones, unused_fs_notdones = self.monitor(futures=futures, throw_except=throw_except, timeout=timeout, download_results=True, THREADPOOL_SIZE=THREADPOOL_SIZE, WAIT_DUR_SEC=WAIT_DUR_SEC) result = [f.result(internal_storage=self.internal_storage) for f in fs_dones if not f.futures and f.produce_output] self._state = ExecutorState.success msg = "ExecutorID {} Finished getting results".format(self.executor_id) logger.debug(msg) if result and len(result) == 1: return result[0] return result def create_timeline_plots(self, dst_dir, dst_file_name, futures=None): """ Creates timeline and histogram of the current execution in dst_dir. :param futures: list of futures. :param dst_dir: destination folder to save .png plots. :param dst_file_name: name of the file. """ if futures is None and (self._state == ExecutorState.new or self._state == ExecutorState.running): raise Exception('You must run call_async(), map() or map_reduce()' ' followed by monitor() or get_results()' ' before calling create_timeline_plots() method') if not futures: futures = [] for job in self.jobs: if self.jobs[job]['state'] == JobState.ready or \ self.jobs[job]['state'] == JobState.done: futures.extend(self.jobs[job]['futures']) self.jobs[job]['state'] = JobState.finished if type(futures) != list: ftrs = [futures] else: ftrs = futures ftrs_to_plot = [f for f in ftrs if f.ready or f.done] if not ftrs_to_plot: return logging.getLogger('matplotlib').setLevel(logging.WARNING) from pywren_ibm_cloud.plots import create_timeline, create_histogram msg = 'ExecutorID {} - Creating timeline plots'.format(self.executor_id) logger.info(msg) if not self.log_level: print(msg) run_statuses = [f.run_status for f in ftrs_to_plot] invoke_statuses = [f.invoke_status for f in ftrs_to_plot] create_timeline(dst_dir, dst_file_name, self.start_time, run_statuses, invoke_statuses, self.config['ibm_cos']) create_histogram(dst_dir, dst_file_name, self.start_time, run_statuses, self.config['ibm_cos']) def clean(self, local_execution=True, delete_all=False): """ Deletes all the files from COS. These files include the function, the data serialization and the function invocation results. """ storage_bucket = self.config['pywren']['storage_bucket'] storage_prerix = self.config['pywren']['storage_prefix'] if delete_all: storage_prerix = '/'.join([storage_prerix]) else: storage_prerix = '/'.join([storage_prerix, self.executor_id]) msg = "ExecutorID {} - Cleaning temporary data".format(self.executor_id) logger.info(msg) if not self.log_level: print(msg) if local_execution: # 1st case: Not background. The main code waits until the cleaner finishes its execution. # It is not ideal for performance tests, since it can take long time to complete. # clean_os_bucket(storage_bucket, storage_prerix, self.internal_storage) # 2nd case: Execute in Background as a subprocess. The main program does not wait for its completion. 
            storage_config = json.dumps(self.internal_storage.get_storage_config())
            storage_config = storage_config.replace('"', '\\"')
            cmdstr = ("{} -c 'from pywren_ibm_cloud.storage.utils import clean_bucket; \
                      clean_bucket(\"{}\", \"{}\", \"{}\")'".format(sys.executable,
                                                                    storage_bucket,
                                                                    storage_prerix,
                                                                    storage_config))
            os.popen(cmdstr)
        else:
            # env var values must be strings: they end up in os.environ.update()
            extra_env = {'STORE_STATUS': 'False', 'STORE_RESULT': 'False'}
            old_stdout = sys.stdout
            sys.stdout = open(os.devnull, 'w')
            # call_async() is a method of this executor (there is no self.executor attribute)
            self.call_async(clean_os_bucket, [storage_bucket, storage_prerix], extra_env=extra_env)
            sys.stdout = old_stdout

        self._state = ExecutorState.finished
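# Hedged alternative sketch of the background-cleanup launch above, using
# subprocess.Popen instead of os.popen so the child process is explicitly
# detached and no pipe stays open. clean_bucket and the storage-config layout
# are assumed to match the snippet above.
import json
import subprocess
import sys


def launch_background_clean(storage_bucket, storage_prefix, storage_config):
    cfg = json.dumps(storage_config).replace('"', '\\"')
    code = ("from pywren_ibm_cloud.storage.utils import clean_bucket; "
            "clean_bucket(\"{}\", \"{}\", \"{}\")".format(storage_bucket, storage_prefix, cfg))
    # start_new_session detaches the cleaner from the caller's process group
    return subprocess.Popen([sys.executable, '-c', code], start_new_session=True)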
def status(self, throw_except=True, internal_storage=None): """ Return the status returned by the call. If the call raised an exception, this method will raise the same exception If the future is cancelled before completing then CancelledError will be raised. :param check_only: Return None immediately if job is not complete. Default False. :param throw_except: Reraise exception if call raised. Default true. :param storage_handler: Storage handler to poll cloud storage. Default None. :return: Result of the call. :raises CancelledError: If the job is cancelled before completed. :raises TimeoutError: If job is not complete after `timeout` seconds. """ if self._state == CallState.new: raise ValueError("task not yet invoked") if self._state == CallState.ready or self._state == CallState.success: return self.run_status if internal_storage is None: internal_storage = InternalStorage(self.storage_config) if self._call_status is None: check_storage_path(internal_storage.get_storage_config(), self.storage_path) self._call_status = internal_storage.get_call_status( self.executor_id, self.job_id, self.call_id) self.status_query_count += 1 while self._call_status is None: time.sleep(self.GET_RESULT_SLEEP_SECS) self._call_status = internal_storage.get_call_status( self.executor_id, self.job_id, self.call_id) self.status_query_count += 1 self.invoke_status['status_done_timestamp'] = time.time() self.invoke_status['status_query_count'] = self.status_query_count self.run_status = self._call_status # this is the remote status information total_time = format( round( self._call_status['end_time'] - self._call_status['start_time'], 2), '.2f') if self._call_status['exception']: # the action handler/jobrunner/function had an exception self._set_state(CallState.error) self._exception = pickle.loads(eval(self._call_status['exc_info'])) msg = None if not self._call_status.get('exc_pickle_fail', False): exception_args = self._exception[1].args if exception_args[0] == "WRONGVERSION": msg = "PyWren version mismatch: remote library is version {}, local " \ "library is version {}".format(exception_args[2], exception_args[3]) if exception_args[0] == "OUTATIME": msg = "Process ran out of time and was killed" if exception_args[0] == "OUTOFMEMORY": msg = "Process exceeded maximum memory and was killed" else: fault = Exception(self._exception['exc_value']) self._exception = (Exception, fault, self._exception['exc_traceback']) if throw_except: raise FunctionException(self.executor_id, self.job_id, self.activation_id, self._exception, msg) return None log_msg = ( 'ExecutorID {} | JobID {} - Got status from Function {} - Activation ' 'ID: {} - Time: {} seconds'.format(self.executor_id, self.job_id, self.call_id, self.activation_id, str(total_time))) logger.debug(log_msg) self._set_state(CallState.ready) if not self._call_status['result'] or not self.produce_output: # Function did not produce output, so let's put it as success self._set_state(CallState.success) if 'new_futures' in self._call_status: self.result(throw_except=throw_except, internal_storage=internal_storage) return self.run_status
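# Hedged sketch: a minimal polling loop equivalent to the storage branch of
# status() above. `internal_storage` is assumed to expose get_call_status()
# exactly as used in that method; the sleep interval is a placeholder.
import time


def poll_call_status(internal_storage, executor_id, job_id, call_id, sleep_secs=1):
    call_status = internal_storage.get_call_status(executor_id, job_id, call_id)
    query_count = 1
    while call_status is None:  # keep polling until the worker uploads status.json
        time.sleep(sleep_secs)
        call_status = internal_storage.get_call_status(executor_id, job_id, call_id)
        query_count += 1
    return call_status, query_count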
class CallStatus: def __init__(self, pywren_config): self.config = pywren_config self.rabbitmq_monitor = self.config['pywren'].get('rabbitmq_monitor', False) self.store_status = strtobool(os.environ.get('STORE_STATUS', 'True')) storage_config = extract_storage_config(self.config) self.internal_storage = InternalStorage(storage_config) self.response = {'exception': False} def send(self, event_type): self.response['type'] = event_type if self.store_status: if self.rabbitmq_monitor: self._send_status_rabbitmq() if not self.rabbitmq_monitor or event_type == '__end__': self._send_status_os() def _send_status_os(self): """ Send the status event to the Object Storage """ executor_id = self.response['executor_id'] job_id = self.response['job_id'] call_id = self.response['call_id'] if self.response['type'] == '__init__': init_key = create_init_key(JOBS_PREFIX, executor_id, job_id, call_id) self.internal_storage.put_data(init_key, '') elif self.response['type'] == '__end__': status_key = create_status_key(JOBS_PREFIX, executor_id, job_id, call_id) dmpd_response_status = json.dumps(self.response) drs = sizeof_fmt(len(dmpd_response_status)) logger.info("Storing execution stats - status.json - Size: {}".format(drs)) self.internal_storage.put_data(status_key, dmpd_response_status) def _send_status_rabbitmq(self): """ Send the status event to RabbitMQ """ dmpd_response_status = json.dumps(self.response) drs = sizeof_fmt(len(dmpd_response_status)) executor_id = self.response['executor_id'] job_id = self.response['job_id'] rabbit_amqp_url = self.config['rabbitmq'].get('amqp_url') status_sent = False output_query_count = 0 params = pika.URLParameters(rabbit_amqp_url) exchange = 'pywren-{}-{}'.format(executor_id, job_id) while not status_sent and output_query_count < 5: output_query_count = output_query_count + 1 try: connection = pika.BlockingConnection(params) channel = connection.channel() channel.exchange_declare(exchange=exchange, exchange_type='fanout', auto_delete=True) channel.basic_publish(exchange=exchange, routing_key='', body=dmpd_response_status) connection.close() logger.info("Execution status sent to rabbitmq - Size: {}".format(drs)) status_sent = True except Exception as e: logger.error("Unable to send status to rabbitmq") logger.error(str(e)) logger.info('Retrying to send status to rabbitmq...') time.sleep(0.2)
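# Hedged sketch of a client-side consumer for the per-job fanout exchange that
# _send_status_rabbitmq() publishes to. The exchange naming follows the class
# above; the AMQP URL and callback are placeholders (pika >= 1.0 API).
import pika


def consume_job_status(amqp_url, executor_id, job_id, on_status):
    exchange = 'pywren-{}-{}'.format(executor_id, job_id)
    connection = pika.BlockingConnection(pika.URLParameters(amqp_url))
    channel = connection.channel()
    channel.exchange_declare(exchange=exchange, exchange_type='fanout', auto_delete=True)
    result = channel.queue_declare(queue='', exclusive=True)  # temporary, broker-named queue
    channel.queue_bind(exchange=exchange, queue=result.method.queue)
    channel.basic_consume(queue=result.method.queue,
                          on_message_callback=lambda ch, method, props, body: on_status(body),
                          auto_ack=True)
    channel.start_consuming()  # blocks until channel.stop_consuming() is called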
class FunctionExecutor: class State: New = 'New' Running = 'Running' Ready = 'Ready' Done = 'Done' Error = 'Error' Finished = 'Finished' def __init__(self, config=None, runtime=None, runtime_memory=None, compute_backend=None, compute_backend_region=None, storage_backend=None, storage_backend_region=None, rabbitmq_monitor=None, log_level=None): """ Initialize a FunctionExecutor class. :param config: Settings passed in here will override those in config file. Default None. :param runtime: Runtime name to use. Default None. :param runtime_memory: memory to use in the runtime. Default None. :param compute_backend: Name of the compute backend to use. Default None. :param compute_backend_region: Name of the compute backend region to use. Default None. :param storage_backend: Name of the storage backend to use. Default None. :param storage_backend_region: Name of the storage backend region to use. Default None. :param log_level: log level to use during the execution. Default None. :param rabbitmq_monitor: use rabbitmq as the monitoring system. Default None. :return `FunctionExecutor` object. """ self.start_time = time.time() self._state = FunctionExecutor.State.New self.is_remote_cluster = is_remote_cluster() # Log level Configuration self.log_level = log_level if not self.log_level: if (logger.getEffectiveLevel() != logging.WARNING): self.log_level = logging.getLevelName( logger.getEffectiveLevel()) if self.log_level: os.environ["PYWREN_LOGLEVEL"] = self.log_level if not self.is_remote_cluster: default_logging_config(self.log_level) # Overwrite pywren config parameters config_ow = {'pywren': {}} if runtime is not None: config_ow['pywren']['runtime'] = runtime if runtime_memory is not None: config_ow['pywren']['runtime_memory'] = int(runtime_memory) if compute_backend is not None: config_ow['pywren']['compute_backend'] = compute_backend if compute_backend_region is not None: config_ow['pywren'][ 'compute_backend_region'] = compute_backend_region if storage_backend is not None: config_ow['pywren']['storage_backend'] = storage_backend if storage_backend_region is not None: config_ow['pywren'][ 'storage_backend_region'] = storage_backend_region if rabbitmq_monitor is not None: config_ow['pywren']['rabbitmq_monitor'] = rabbitmq_monitor self.config = default_config(config, config_ow) self.executor_id = create_executor_id() logger.debug('FunctionExecutor created with ID: {}'.format( self.executor_id)) # RabbitMQ monitor configuration self.rabbitmq_monitor = self.config['pywren'].get( 'rabbitmq_monitor', False) if self.rabbitmq_monitor: if 'rabbitmq' in self.config and 'amqp_url' in self.config[ 'rabbitmq']: self.rabbit_amqp_url = self.config['rabbitmq'].get('amqp_url') else: raise Exception( "You cannot use rabbitmq_mnonitor since 'amqp_url'" " is not present in configuration") self.data_cleaner = self.config['pywren']['data_cleaner'] storage_config = extract_storage_config(self.config) self.internal_storage = InternalStorage(storage_config) self.invoker = FunctionInvoker(self.config, self.executor_id, self.internal_storage) self.jobs = {} @property def futures(self): futures = [] for job in self.jobs: futures.extend(self.jobs[job]['futures']) return futures def call_async(self, func, data, extra_env=None, runtime_memory=None, timeout=EXECUTION_TIMEOUT, include_modules=[], exclude_modules=[]): """ For running one function execution asynchronously :param func: the function to map over the data :param data: input data :param extra_data: Additional data to pass to action. Default None. 
:param extra_env: Additional environment variables for action environment. Default None. :param runtime_memory: Memory to use to run the function. Default None (loaded from config). :param timeout: Time that the functions have to complete their execution before raising a timeout. :param include_modules: Explicitly pickle these dependencies. :param exclude_modules: Explicitly keep these modules from pickled dependencies. :return: future object. """ if self._state == FunctionExecutor.State.Finished: raise Exception('You cannot run call_async() in the current state,' ' create a new FunctionExecutor() instance.') job_id = str(len(self.jobs)).zfill(3) async_job_id = 'A{}'.format(job_id) runtime_meta = self.invoker.select_runtime(async_job_id, runtime_memory) job = create_map_job(self.config, self.internal_storage, self.executor_id, async_job_id, map_function=func, iterdata=[data], runtime_meta=runtime_meta, runtime_memory=runtime_memory, extra_env=extra_env, include_modules=include_modules, exclude_modules=exclude_modules, execution_timeout=timeout) future = self.invoker.run(job) self.jobs[async_job_id] = { 'futures': future, 'state': JobState.Running } self._state = FunctionExecutor.State.Running return future[0] def map(self, map_function, map_iterdata, extra_params=None, extra_env=None, runtime_memory=None, chunk_size=None, chunk_n=None, remote_invocation=False, remote_invocation_groups=None, timeout=EXECUTION_TIMEOUT, invoke_pool_threads=450, include_modules=[], exclude_modules=[]): """ :param map_function: the function to map over the data :param map_iterdata: An iterable of input data :param extra_params: Additional parameters to pass to the function activation. Default None. :param extra_env: Additional environment variables for action environment. Default None. :param runtime_memory: Memory to use to run the function. Default None (loaded from config). :param chunk_size: the size of the data chunks to split each object. 'None' for processing the whole file in one function activation. :param chunk_n: Number of chunks to split each object. 'None' for processing the whole file in one function activation. :param remote_invocation: Enable or disable remote_invocation mechanism. Default 'False' :param timeout: Time that the functions have to complete their execution before raising a timeout. :param invoke_pool_threads: Number of threads to use to invoke. :param include_modules: Explicitly pickle these dependencies. :param exclude_modules: Explicitly keep these modules from pickled dependencies. :return: A list with size `len(iterdata)` of futures. """ if self._state == FunctionExecutor.State.Finished: raise Exception('You cannot run map() in the current state.' 
' Create a new FunctionExecutor() instance.') total_current_jobs = len(self.jobs) job_id = str(total_current_jobs).zfill(3) map_job_id = 'M{}'.format(job_id) runtime_meta = self.invoker.select_runtime(map_job_id, runtime_memory) job = create_map_job(self.config, self.internal_storage, self.executor_id, map_job_id, map_function=map_function, iterdata=map_iterdata, runtime_meta=runtime_meta, runtime_memory=runtime_memory, extra_params=extra_params, extra_env=extra_env, obj_chunk_size=chunk_size, obj_chunk_number=chunk_n, remote_invocation=remote_invocation, remote_invocation_groups=remote_invocation_groups, invoke_pool_threads=invoke_pool_threads, include_modules=include_modules, exclude_modules=exclude_modules, is_remote_cluster=self.is_remote_cluster, execution_timeout=timeout) map_futures = self.invoker.run(job) self.jobs[map_job_id] = { 'futures': map_futures, 'state': JobState.Running } self._state = FunctionExecutor.State.Running if len(map_futures) == 1: return map_futures[0] return map_futures def map_reduce(self, map_function, map_iterdata, reduce_function, extra_params=None, extra_env=None, map_runtime_memory=None, reduce_runtime_memory=None, chunk_size=None, chunk_n=None, remote_invocation=False, remote_invocation_groups=None, timeout=EXECUTION_TIMEOUT, reducer_one_per_object=False, reducer_wait_local=False, invoke_pool_threads=450, include_modules=[], exclude_modules=[]): """ Map the map_function over the data and apply the reduce_function across all futures. This method is executed all within CF. :param map_function: the function to map over the data :param map_iterdata: the function to reduce over the futures :param reduce_function: the function to reduce over the futures :param extra_env: Additional environment variables for action environment. Default None. :param extra_params: Additional parameters to pass to function activation. Default None. :param map_runtime_memory: Memory to use to run the map function. Default None (loaded from config). :param reduce_runtime_memory: Memory to use to run the reduce function. Default None (loaded from config). :param chunk_size: the size of the data chunks to split each object. 'None' for processing the whole file in one function activation. :param chunk_n: Number of chunks to split each object. 'None' for processing the whole file in one function activation. :param remote_invocation: Enable or disable remote_invocation mechanism. Default 'False' :param timeout: Time that the functions have to complete their execution before raising a timeout. :param reducer_one_per_object: Set one reducer per object after running the partitioner :param reducer_wait_local: Wait for results locally :param invoke_pool_threads: Number of threads to use to invoke. :param include_modules: Explicitly pickle these dependencies. :param exclude_modules: Explicitly keep these modules from pickled dependencies. :return: A list with size `len(map_iterdata)` of futures. """ if self._state == FunctionExecutor.State.Finished: raise Exception('You cannot run map_reduce() in the current state.' 
' Create a new FunctionExecutor() instance.') total_current_jobs = len(self.jobs) job_id = str(total_current_jobs).zfill(3) map_job_id = 'M{}'.format(job_id) runtime_meta = self.invoker.select_runtime(map_job_id, map_runtime_memory) map_job = create_map_job( self.config, self.internal_storage, self.executor_id, map_job_id, map_function=map_function, iterdata=map_iterdata, runtime_meta=runtime_meta, runtime_memory=map_runtime_memory, extra_params=extra_params, extra_env=extra_env, obj_chunk_size=chunk_size, obj_chunk_number=chunk_n, remote_invocation=remote_invocation, remote_invocation_groups=remote_invocation_groups, invoke_pool_threads=invoke_pool_threads, include_modules=include_modules, exclude_modules=exclude_modules, is_remote_cluster=self.is_remote_cluster, execution_timeout=timeout) map_futures = self.invoker.run(map_job) self.jobs[map_job_id] = { 'futures': map_futures, 'state': JobState.Running } self._state = FunctionExecutor.State.Running if reducer_wait_local: self.wait(fs=map_futures) reduce_job_id = 'R{}'.format(job_id) runtime_meta = self.invoker.select_runtime(reduce_job_id, reduce_runtime_memory) reduce_job = create_reduce_job( self.config, self.internal_storage, self.executor_id, reduce_job_id, reduce_function, map_job, map_futures, runtime_meta=runtime_meta, reducer_one_per_object=reducer_one_per_object, runtime_memory=reduce_runtime_memory, extra_env=extra_env, include_modules=include_modules, exclude_modules=exclude_modules) reduce_futures = self.invoker.run(reduce_job) self.jobs[reduce_job_id] = { 'futures': reduce_futures, 'state': JobState.Running } for f in map_futures: f.produce_output = False return map_futures + reduce_futures def wait(self, fs=None, throw_except=True, return_when=ALL_COMPLETED, download_results=False, timeout=EXECUTION_TIMEOUT, THREADPOOL_SIZE=128, WAIT_DUR_SEC=1): """ Wait for the Future instances (possibly created by different Executor instances) given by fs to complete. Returns a named 2-tuple of sets. The first set, named done, contains the futures that completed (finished or cancelled futures) before the wait completed. The second set, named not_done, contains the futures that did not complete (pending or running futures). timeout can be used to control the maximum number of seconds to wait before returning. :param fs: Futures list. Default None :param throw_except: Re-raise exception if call raised. Default True. :param return_when: One of `ALL_COMPLETED`, `ANY_COMPLETED`, `ALWAYS` :param download_results: Download results. Default false (Only get statuses) :param timeout: Timeout of waiting for results. :param THREADPOOL_SIZE: Number of threads to use. Default 64 :param WAIT_DUR_SEC: Time interval between each check. :return: `(fs_done, fs_notdone)` where `fs_done` is a list of futures that have completed and `fs_notdone` is a list of futures that have not completed. 
:rtype: 2-tuple of list """ if not fs: fs = [] for job in self.jobs: if not download_results and self.jobs[job][ 'state'] == JobState.Running: fs.extend(self.jobs[job]['futures']) self.jobs[job]['state'] = JobState.Ready elif download_results and self.jobs[job][ 'state'] != JobState.Done: fs.extend(self.jobs[job]['futures']) self.jobs[job]['state'] = JobState.Done if type(fs) != list: futures = [fs] else: futures = fs if not futures: raise Exception( 'You must run the call_async(), map() or map_reduce(), or provide' ' a list of futures before calling the monitor()/get_result() method' ) if download_results: msg = 'ExecutorID {} - Getting results...'.format(self.executor_id) else: msg = 'ExecutorID {} - Waiting for functions to complete...'.format( self.executor_id) logger.info(msg) if not self.log_level and self._state == FunctionExecutor.State.Running: print(msg) if is_unix_system(): signal.signal(signal.SIGALRM, timeout_handler) signal.alarm(timeout) pbar = None if not self.is_remote_cluster and self._state == FunctionExecutor.State.Running \ and not self.log_level: from tqdm.auto import tqdm if is_notebook(): pbar = tqdm(bar_format='{n}/|/ {n_fmt}/{total_fmt}', total=len(futures)) # ncols=800 else: print() pbar = tqdm(bar_format=' {l_bar}{bar}| {n_fmt}/{total_fmt} ', total=len(futures), disable=False) try: if self.rabbitmq_monitor: logger.info('Using RabbitMQ to monitor function activations') wait_rabbitmq(futures, self.internal_storage, rabbit_amqp_url=self.rabbit_amqp_url, download_results=download_results, throw_except=throw_except, pbar=pbar, return_when=return_when, THREADPOOL_SIZE=THREADPOOL_SIZE) else: wait_storage(futures, self.internal_storage, download_results=download_results, throw_except=throw_except, return_when=return_when, pbar=pbar, THREADPOOL_SIZE=THREADPOOL_SIZE, WAIT_DUR_SEC=WAIT_DUR_SEC) except FunctionException as e: if is_unix_system(): signal.alarm(0) if pbar: pbar.close() logger.info(e.msg) if not self.log_level: if not is_notebook(): print() print(e.msg) if e.exc_msg: logger.info('Exception: ' + e.exc_msg) if not self.log_level: print('--> Exception: ' + e.exc_msg) else: print() traceback.print_exception(*e.exception) sys.exit() except TimeoutError: if download_results: not_dones_call_ids = [(f.job_id, f.call_id) for f in futures if not f.done] else: not_dones_call_ids = [(f.job_id, f.call_id) for f in futures if not f.ready and not f.done] msg = ( 'ExecutorID {} - Raised timeout of {} seconds waiting for results - Total Activations not done: {}' .format(self.executor_id, timeout, len(not_dones_call_ids))) self._state = FunctionExecutor.State.Error except KeyboardInterrupt: if download_results: not_dones_call_ids = [(f.job_id, f.call_id) for f in futures if not f.done] else: not_dones_call_ids = [(f.job_id, f.call_id) for f in futures if not f.ready and not f.done] msg = ('ExecutorID {} - Cancelled - Total Activations not done: {}' .format(self.executor_id, len(not_dones_call_ids))) self._state = FunctionExecutor.State.Error except Exception as e: if not self.is_remote_cluster: self.clean() raise e finally: if is_unix_system(): signal.alarm(0) if pbar: pbar.close() if not is_notebook(): print() if self._state == FunctionExecutor.State.Error: logger.debug(msg) if not self.log_level: print(msg) if download_results and self.data_cleaner and not self.is_remote_cluster: self.clean() if download_results: fs_done = [f for f in futures if f.done] fs_notdone = [f for f in futures if not f.done] self._state = FunctionExecutor.State.Done else: fs_done = [f for f in 
futures if f.ready or f.done] fs_notdone = [f for f in futures if not f.ready and not f.done] self._state = FunctionExecutor.State.Ready return fs_done, fs_notdone def get_result(self, fs=None, throw_except=True, timeout=EXECUTION_TIMEOUT, THREADPOOL_SIZE=128, WAIT_DUR_SEC=1): """ For getting the results from all function activations :param fs: Futures list. Default None :param throw_except: Reraise exception if call raised. Default True. :param verbose: Shows some information prints. Default False :param timeout: Timeout for waiting for results. :param THREADPOOL_SIZE: Number of threads to use. Default 128 :param WAIT_DUR_SEC: Time interval between each check. :return: The result of the future/s """ fs_done, unused_fs_notdone = self.wait(fs=fs, throw_except=throw_except, timeout=timeout, download_results=True, THREADPOOL_SIZE=THREADPOOL_SIZE, WAIT_DUR_SEC=WAIT_DUR_SEC) result = [ f.result(throw_except=throw_except, internal_storage=self.internal_storage) for f in fs_done if not f.futures and f.produce_output ] msg = "ExecutorID {} Finished getting results".format(self.executor_id) logger.debug(msg) if result and len(result) == 1: return result[0] return result def create_execution_plots(self, dst_dir, dst_file_name, futures=None): """ Creates timeline and histogram of the current execution in dst_dir. :param futures: list of futures. :param dst_dir: destination folder to save .png plots. :param dst_file_name: name of the file. """ if not futures: futures = [] for job in self.jobs: if self.jobs[job]['state'] == JobState.Ready or \ self.jobs[job]['state'] == JobState.Done: futures.extend(self.jobs[job]['futures']) self.jobs[job]['state'] = JobState.Finished if type(futures) != list: ftrs = [futures] else: ftrs = futures ftrs_to_plot = [f for f in ftrs if f.ready or f.done] if not ftrs_to_plot: msg = ('You must run call_async(), map() or map_reduce()' ' followed by monitor() or get_results()' ' before calling create_timeline_plots() method') logger.debug(msg) return logging.getLogger('matplotlib').setLevel(logging.WARNING) from pywren_ibm_cloud.plots import create_timeline, create_histogram msg = 'ExecutorID {} - Creating execution plots'.format( self.executor_id) logger.info(msg) if not self.log_level: print(msg) call_status = [f._call_status for f in ftrs_to_plot] call_metadata = [f._call_metadata for f in ftrs_to_plot] create_timeline(dst_dir, dst_file_name, self.start_time, call_status, call_metadata, self.config['ibm_cos']) create_histogram(dst_dir, dst_file_name, self.start_time, call_status, self.config['ibm_cos']) def clean(self, local_execution=True, delete_all=False): """ Deletes all the files from COS. These files include the function, the data serialization and the function invocation results. """ storage_bucket = self.config['pywren']['storage_bucket'] storage_prerix = self.config['pywren']['storage_prefix'] if delete_all: storage_prerix = '/'.join([storage_prerix]) else: storage_prerix = '/'.join([storage_prerix, self.executor_id]) msg = "ExecutorID {} - Cleaning temporary data".format( self.executor_id) logger.info(msg) if not self.log_level: print(msg) if local_execution: # 1st case: Not background. The main code waits until the cleaner finishes its execution. # It is not ideal for performance tests, since it can take long time to complete. # clean_os_bucket(storage_bucket, storage_prerix, self.internal_storage) # 2nd case: Execute in Background as a subprocess. The main program does not wait for its completion. 
            storage_config = json.dumps(self.internal_storage.get_storage_config())
            storage_config = storage_config.replace('"', '\\"')
            cmdstr = ("{} -c 'from pywren_ibm_cloud.storage.utils import clean_bucket; \
                      clean_bucket(\"{}\", \"{}\", \"{}\")'".format(sys.executable,
                                                                    storage_bucket,
                                                                    storage_prerix,
                                                                    storage_config))
            os.popen(cmdstr)
        else:
            # env var values must be strings: they end up in os.environ.update()
            extra_env = {'STORE_STATUS': 'False', 'STORE_RESULT': 'False'}
            old_stdout = sys.stdout
            sys.stdout = open(os.devnull, 'w')
            self.call_async(clean_os_bucket, [storage_bucket, storage_prerix], extra_env=extra_env)
            sys.stdout = old_stdout

        self._state = FunctionExecutor.State.Finished
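# Hedged usage sketch for the executor above. The factory name
# pywren.function_executor() follows recent pywren-ibm-cloud releases and is an
# assumption here; adjust it to the entry point your installed version exposes.
import pywren_ibm_cloud as pywren


def double(x):
    return x * 2


def usage_example():
    pw = pywren.function_executor(runtime_memory=256, log_level='INFO')
    pw.map(double, [1, 2, 3, 4])
    print(pw.get_result())  # expected: [2, 4, 6, 8]
    pw.clean()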
def function_handler(event): start_time = time.time() logger.debug("Action handler started") response_status = {'exception': False} response_status['host_submit_time'] = event['host_submit_time'] response_status['start_time'] = start_time context_dict = { 'ibm_cf_request_id': os.environ.get("__OW_ACTIVATION_ID"), 'ibm_cf_python_version': os.environ.get("PYTHON_VERSION"), } config = event['config'] storage_config = extract_storage_config(config) log_level = event['log_level'] cloud_logging_config(log_level) call_id = event['call_id'] job_id = event['job_id'] executor_id = event['executor_id'] logger.info("Execution ID: {}/{}/{}".format(executor_id, job_id, call_id)) task_execution_timeout = event.get("task_execution_timeout", 590) # default for CF status_key = event['status_key'] func_key = event['func_key'] data_key = event['data_key'] data_byte_range = event['data_byte_range'] output_key = event['output_key'] extra_env = event.get('extra_env', {}) response_status['call_id'] = call_id response_status['job_id'] = job_id response_status['executor_id'] = executor_id # response_status['func_key'] = func_key # response_status['data_key'] = data_key # response_status['output_key'] = output_key # response_status['status_key'] = status_key try: if version.__version__ != event['pywren_version']: raise Exception("WRONGVERSION", "PyWren version mismatch", version.__version__, event['pywren_version']) # response_status['free_disk_bytes'] = free_disk_space("/tmp") custom_env = { 'CB_CONFIG': json.dumps(config), 'CB_CALL_ID': call_id, 'CB_JOB_ID': job_id, 'CB_EXECUTOR_ID': executor_id, 'PYTHONPATH': "{}:{}".format(os.getcwd(), PYWREN_LIBS_PATH), 'PYTHONUNBUFFERED': 'True' } os.environ.update(custom_env) os.environ.update(extra_env) # pass a full json blob jobrunner_config = { 'func_key': func_key, 'data_key': data_key, 'log_level': log_level, 'data_byte_range': data_byte_range, 'python_module_path': PYTHON_MODULE_PATH, 'output_key': output_key, 'stats_filename': JOBRUNNER_STATS_FILENAME } if os.path.exists(JOBRUNNER_STATS_FILENAME): os.remove(JOBRUNNER_STATS_FILENAME) setup_time = time.time() response_status['setup_time'] = round(setup_time - start_time, 8) result_queue = multiprocessing.Queue() tr = JobRunner(jobrunner_config, result_queue) tr.daemon = True logger.info("Starting JobRunner process") tr.start() tr.join(task_execution_timeout) response_status['exec_time'] = round(time.time() - setup_time, 8) if tr.is_alive(): # If process is still alive after jr.join(job_max_runtime), kill it logger.error( "Process exceeded maximum runtime of {} seconds".format( task_execution_timeout)) # Send the signal to all the process groups tr.terminate() raise Exception("OUTATIME", "Process executed for too long and was killed") try: # Only 1 message is returned by jobrunner result_queue.get(block=False) except Exception: # If no message, this means that the process was killed due an exception pickling an exception raise Exception( "EXCPICKLEERROR", "PyWren was unable to pickle the exception, check function logs" ) # print(subprocess.check_output("find {}".format(PYTHON_MODULE_PATH), shell=True)) # print(subprocess.check_output("find {}".format(os.getcwd()), shell=True)) if os.path.exists(JOBRUNNER_STATS_FILENAME): with open(JOBRUNNER_STATS_FILENAME, 'r') as fid: for l in fid.readlines(): key, value = l.strip().split(" ", 1) try: response_status[key] = float(value) except Exception: response_status[key] = value if key == 'exception' or key == 'exc_pickle_fail' \ or key == 'result': response_status[key] = 
eval(value) # response_status['server_info'] = get_server_info() response_status.update(context_dict) response_status['end_time'] = time.time() except Exception as e: # internal runtime exceptions logger.error("There was an exception: {}".format(str(e))) response_status['end_time'] = time.time() response_status['exception'] = True pickled_exc = pickle.dumps(sys.exc_info()) pickle.loads( pickled_exc) # this is just to make sure they can be unpickled response_status['exc_info'] = str(pickled_exc) finally: store_status = strtobool(os.environ.get('STORE_STATUS', 'True')) rabbit_amqp_url = config['rabbitmq'].get('amqp_url') dmpd_response_status = json.dumps(response_status) drs = sizeof_fmt(len(dmpd_response_status)) if rabbit_amqp_url and store_status: status_sent = False output_query_count = 0 while not status_sent and output_query_count < 5: output_query_count = output_query_count + 1 try: params = pika.URLParameters(rabbit_amqp_url) connection = pika.BlockingConnection(params) channel = connection.channel() channel.queue_declare(queue=executor_id, auto_delete=True) channel.basic_publish(exchange='', routing_key=executor_id, body=dmpd_response_status) connection.close() logger.info( "Execution stats sent to rabbitmq - Size: {}".format( drs)) status_sent = True except Exception as e: logger.error("Unable to send status to rabbitmq") logger.error(str(e)) logger.info('Retrying to send stats to rabbitmq...') time.sleep(0.2) if store_status: internal_storage = InternalStorage(storage_config) logger.info( "Storing execution stats - status.json - Size: {}".format(drs)) internal_storage.put_data(status_key, dmpd_response_status)
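# Hedged standalone sketch of the watchdog pattern function_handler uses around
# JobRunner: run the work in a child process, join with a timeout and terminate
# the child if it overruns. The worker body and timeout are placeholders.
import multiprocessing
import time


def _worker(result_queue):
    time.sleep(1)  # stand-in for the user function
    result_queue.put('Finished')


def run_with_timeout(timeout_secs=5):
    result_queue = multiprocessing.Queue()
    proc = multiprocessing.Process(target=_worker, args=(result_queue,), daemon=True)
    proc.start()
    proc.join(timeout_secs)
    if proc.is_alive():  # still running after the deadline: kill it
        proc.terminate()
        raise Exception("OUTATIME", "Process executed for too long and was killed")
    return result_queue.get(block=False)  # the single message produced by the child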