def __init__(self, invoker, config, job_max_runtime):
    """
    Set up storage access, runtime metadata, the serializer and the
    optional scheduler limit for this instance.

    :param invoker: Stored unchanged on `self.invoker`.
    :param config: Configuration mapping. Must contain a 'runtime' entry;
        may contain a 'scheduler' mapping with a 'map_item_limit' entry.
    :param job_max_runtime: Stored unchanged on `self.job_max_runtime`.
    """
    self.invoker = invoker
    self.job_max_runtime = job_max_runtime

    self.config = config
    self.storage_config = wrenconfig.extract_storage_config(self.config)
    self.storage = storage.Storage(self.storage_config)
    self.runtime_meta_info = runtime.get_runtime_info(config['runtime'])

    if 'preinstalls' in self.runtime_meta_info:
        # Extend the runtime-supplied preinstall list with a few extra
        # modules. This is done only when the metadata actually carries a
        # 'preinstalls' list: appending before the membership check raised
        # KeyError for metadata without one and made the fallback branch
        # below unreachable.
        preinstalls = self.runtime_meta_info['preinstalls']
        for module_name in ('pandas', 'thrift', 'Thrift'):
            preinstalls.append([module_name, True])

        logger.info("using serializer with meta-supplied preinstalls")
        self.serializer = serialize.SerializeIndependent(preinstalls)
    else:
        self.serializer = serialize.SerializeIndependent()

    # Optional limit read from the 'scheduler' section; None when absent.
    self.map_item_limit = None
    if 'scheduler' in self.config:
        if 'map_item_limit' in config['scheduler']:
            self.map_item_limit = config['scheduler']['map_item_limit']
def cancel(self, storage_handler=None):
    """
    Write a "CANCEL" marker for this call through the storage handler.

    :param storage_handler: Handler used to write the marker. When None, a
        handler is built from the default configuration.
    """
    # TODO Figure out a better way for this function to have
    # access to a custom storage handler
    handler = storage_handler
    if handler is None:
        default_config = wrenconfig.default()
        handler = storage.Storage(
            wrenconfig.extract_storage_config(default_config))

    handler.put_cancelled(self.callset_id, self.call_id, "CANCEL")
def _wait(fs, THREADPOOL_SIZE):
    """
    internal function that performs the majority of the WAIT task work.

    Lists the finished call ids of the (single) callset the unfinished
    futures belong to, then calls `result(throw_except=False)` on every
    future that the listing reports as done, using a thread pool.

    :param fs: Futures to inspect.
    :param THREADPOOL_SIZE: Number of worker threads used to fetch results.
    :return: `(fs_dones, fs_notdones)`. When every future is already in a
        final state the input `fs` is returned unchanged with an empty list.
    :raises NotImplementedError: If the unfinished futures span more than
        one callset.
    """
    # get all the futures that are not yet done
    not_done_futures = [
        f for f in fs
        if f._state not in (JobState.success, JobState.error)
    ]
    if not not_done_futures:
        return fs, []

    # check if the not-done ones have the same callset_id
    present_callsets = {f.callset_id for f in not_done_futures}
    if len(present_callsets) > 1:
        raise NotImplementedError(
            "waiting on futures from more than one callset is not supported")

    # get the list of all objects in this callset
    callset_id = present_callsets.pop()  # FIXME assume only one

    storage_config = wrenconfig.extract_storage_config(wrenconfig.default())
    storage_handler = storage.Storage(storage_config)
    callids_done = set(storage_handler.get_callset_status(callset_id))

    fs_dones = []
    fs_notdones = []
    f_to_wait_on = []
    for f in fs:
        if f._state in (JobState.success, JobState.error):
            # done, don't need to do anything
            fs_dones.append(f)
        elif f.call_id in callids_done:
            f_to_wait_on.append(f)
            fs_dones.append(f)
        else:
            fs_notdones.append(f)

    def fetch_result(f):
        f.result(throw_except=False, storage_handler=storage_handler)

    # Only spin up worker threads when there is something to fetch, and
    # always shut them down, even if a fetch raises.
    if f_to_wait_on:
        pool = ThreadPool(THREADPOOL_SIZE)
        try:
            pool.map(fetch_result, f_to_wait_on)
        finally:
            pool.close()
            pool.join()

    return fs_dones, fs_notdones
def __init__(self, invoker, config, job_max_runtime):
    """
    Store the constructor arguments and build the storage client, runtime
    metadata and serializer.

    :param invoker: Stored unchanged on `self.invoker`.
    :param config: Configuration mapping; its 'runtime' entry is used to
        look up the runtime metadata.
    :param job_max_runtime: Stored unchanged on `self.job_max_runtime`.
    """
    self.invoker = invoker
    self.job_max_runtime = job_max_runtime
    self.config = config

    self.storage_config = wrenconfig.extract_storage_config(self.config)
    self.storage = storage.Storage(self.storage_config)

    self.runtime_meta_info = runtime.get_runtime_info(config['runtime'])
    meta = self.runtime_meta_info

    # Without a 'preinstalls' entry fall back to the default serializer.
    if 'preinstalls' not in meta:
        self.serializer = serialize.SerializeIndependent()
    else:
        logger.info("using serializer with meta-supplied preinstalls")
        self.serializer = serialize.SerializeIndependent(
            meta['preinstalls'])
def get_runtime_info(runtime_config, storage_handler=None):
    """
    Fetch the metadata for a runtime from storage and validate it.

    :param runtime_config: Runtime description passed to the storage
        handler's `get_runtime_info`.
    :param storage_handler: Handler to query. When None, one is built from
        the default configuration.
    :return: The runtime metadata returned by the storage handler.
    :raises Exception: If `runtime_valid` rejects the metadata.
    """
    handler = storage_handler
    if handler is None:
        handler = storage.Storage(
            wrenconfig.extract_storage_config(wrenconfig.default()))

    runtime_meta = handler.get_runtime_info(runtime_config)
    if runtime_valid(runtime_meta):
        return runtime_meta

    message = ("The indicated runtime: {} "
               "is not approprite for this python version.")
    raise Exception(message.format(runtime_config))
def handle_generic_failure(future, failed_warc_paths, exc):
    """
    Record a failed future and print the diagnostics that are available.

    The future is added to `failed_warc_paths`, the error is printed, and
    the call status is looked up in the default storage backend. If that
    status carries an 'exception_traceback' entry it is printed; otherwise
    the traceback of the exception currently being handled (if any) is
    printed instead.

    :param future: The failed future; must expose `callset_id` and
        `call_id`.
    :param failed_warc_paths: Collection (supporting `in` and `add`) that
        accumulates failed futures.
    :param exc: The error to report; interpolated into the printed message.
    """
    # NOTE(review): assumes only this bookkeeping is conditional and that
    # the report below is printed on every call -- confirm against callers.
    if future not in failed_warc_paths:
        failed_warc_paths.add(future)

    print('A future failed with error: %s' % exc)
    print('')

    storage_config = wrenconfig.extract_storage_config(
        wrenconfig.default())
    storage_handler = storage.Storage(storage_config)
    call_status = storage_handler.get_call_status(future.callset_id,
                                                  future.call_id)

    # The status can be None when no status was written for the call; in
    # that case fall back to the local traceback instead of crashing the
    # failure handler with an AttributeError.
    exception_traceback = None
    if call_status is not None:
        exception_traceback = call_status.get('exception_traceback', None)

    if exception_traceback is not None:
        print(exception_traceback)
    else:
        exc_traceback = sys.exc_info()[2]
        traceback.print_tb(exc_traceback)
def result(self, timeout=None, check_only=False, throw_except=True,
           storage_handler=None):
    """
    Return the value produced by the call, waiting for it to complete.

    check_only = True implies we only check if the job is completed.

    # FIXME check_only is the worst API and should be refactored
    # out to be part of done()

    If the call completed successfully its return value is returned. If the
    call (or the remote handler) raised, the exception is re-raised here
    unless `throw_except` is False, in which case None is returned.

    :param timeout: Not implemented. Any value other than None raises
        NotImplementedError. Default None (wait indefinitely).
    :param check_only: Do not wait and do not fetch the output; return True
        if the job is complete and False otherwise. Default False.
    :param throw_except: Reraise exception if call raised. Default true.
    :param storage_handler: Storage handler to poll cloud storage. When
        None, one is built from the default configuration. Default None.
    :return: Result of the call; a bool when `check_only` is True; None when
        the call failed and `throw_except` is False.
    :raises ValueError: If the job has not been invoked yet.
    :raises NotImplementedError: If `timeout` is not None.
    :raises Exception: If the remote handler reported a failure (version
        mismatch, out of time, cancelled, non-zero return code, or other)
        and `throw_except` is True.
    """
    if self._state == JobState.new:
        raise ValueError("job not yet invoked")

    if check_only and self._state in (JobState.success, JobState.error):
        return True

    if self._state == JobState.success:
        return self._return_val

    if self._state == JobState.error:
        if throw_except:
            raise self._exception
        return None

    if storage_handler is None:
        storage_config = wrenconfig.extract_storage_config(
            wrenconfig.default())
        storage_handler = storage.Storage(storage_config)

    storage_utils.check_storage_path(storage_handler.get_storage_config(),
                                     self.storage_path)

    call_status = storage_handler.get_call_status(self.callset_id,
                                                  self.call_id)
    self.status_query_count += 1

    ## FIXME implement timeout
    if timeout is not None:
        raise NotImplementedError()

    if check_only:
        return call_status is not None

    # Poll until the remote side has written a status for this call.
    while call_status is None:
        time.sleep(self.GET_RESULT_SLEEP_SECS)
        call_status = storage_handler.get_call_status(
            self.callset_id, self.call_id)
        self.status_query_count += 1

    self._invoke_metadata['status_done_timestamp'] = time.time()
    self._invoke_metadata['status_query_count'] = self.status_query_count

    self.run_status = call_status  # this is the remote status information
    self.invoke_status = self._invoke_metadata  # local status information

    if call_status['exception'] is not None:
        # the wrenhandler had an exception
        exception_str = call_status['exception']
        exception_args = call_status['exception_args']
        failure_kind = exception_args[0]

        if throw_except:
            if failure_kind == "WRONGVERSION":
                raise Exception(
                    "Pywren version mismatch: remote "
                    "expected version {}, local library is version {}".format(
                        exception_args[2], exception_args[3]))
            if failure_kind == "OUTATIME":
                raise Exception("process ran out of time")
            if failure_kind == "CANCELLED":
                raise Exception("job was cancelled")
            if failure_kind == "RETCODE":
                raise Exception(
                    "python process failed, returned a non-zero return code"
                    "(check stdout for information)")
            if 'exception_traceback' in call_status:
                logger.error(call_status['exception_traceback'])
            raise Exception(exception_str, *exception_args)

        # Every handler failure, including "CANCELLED", stops here when the
        # caller asked not to raise. Previously a cancelled call fell
        # through and tried to download and unpickle its output.
        return None

    call_output_time = time.time()
    # NOTE(review): pickle.loads runs on data fetched from storage; this is
    # only safe as long as that storage is trusted.
    call_invoker_result = pickle.loads(
        storage_handler.get_call_output(self.callset_id, self.call_id))
    call_output_time_done = time.time()

    self._invoke_metadata['download_output_time'] = \
        call_output_time_done - call_output_time
    self._invoke_metadata['download_output_timestamp'] = \
        call_output_time_done

    call_success = call_invoker_result['success']
    logger.info("ResponseFuture.result() {} {} call_success {}".format(
        self.callset_id, self.call_id, call_success))

    self._call_invoker_result = call_invoker_result

    if call_success:
        self._return_val = call_invoker_result['result']
        self._set_state(JobState.success)
        return self._return_val

    # The call itself raised: remember the failure so later calls can
    # answer from the cached state.
    self._set_state(JobState.error)
    self._exception = call_invoker_result['result']
    self._traceback = (call_invoker_result['exc_type'],
                       call_invoker_result['exc_value'],
                       call_invoker_result['exc_traceback'])

    if not throw_except:
        return None  # nothing, don't raise, no value

    if call_invoker_result.get('pickle_fail', False):
        logging.warning(
            "there was an error pickling. The original exception: "
            "{}\nThe pickling exception: {}".format(
                call_invoker_result['exc_value'],
                str(call_invoker_result['pickle_exception'])))

        reraise(Exception, call_invoker_result['exc_value'],
                call_invoker_result['exc_traceback'])
    else:
        # reraise the exception
        reraise(*self._traceback)
def result(self, timeout=None, check_only=False, throw_except=True,
           storage_handler=None):
    """
    Return the value produced by the call, waiting for it to complete.

    If the call completed successfully its return value is returned. If the
    call (or the remote handler) raised, the exception is re-raised here
    unless `throw_except` is False, in which case None is returned.

    :param timeout: Not implemented. Any value other than None raises
        NotImplementedError. Default None (wait indefinitely).
    :param check_only: When True, return None immediately if the call has
        no status yet instead of waiting for it. Default False.
    :param throw_except: Reraise exception if call raised. Default true.
    :param storage_handler: Storage handler to poll. When None, one is
        built from the default configuration. Default None.
    :return: Result of the call, or None as described above.
    :raises ValueError: If the job has not been invoked yet.
    :raises NotImplementedError: If `timeout` is not None.
    :raises Exception: If the remote handler reported a failure and
        `throw_except` is True.
    """
    if self._state == JobState.new:
        raise ValueError("job not yet invoked")

    if self._state == JobState.success:
        return self._return_val

    if self._state == JobState.error:
        if throw_except:
            raise self._exception
        return None

    if storage_handler is None:
        storage_config = wrenconfig.extract_storage_config(
            wrenconfig.default())
        storage_handler = storage.Storage(storage_config)

    storage_utils.check_storage_path(storage_handler.get_storage_config(),
                                     self.storage_path)

    call_status = storage_handler.get_call_status(self.callset_id,
                                                  self.call_id)
    self.status_query_count += 1

    ## FIXME implement timeout
    if timeout is not None:
        raise NotImplementedError()

    if check_only is True and call_status is None:
        return None

    # Poll until the remote side has written a status for this call.
    while call_status is None:
        time.sleep(self.GET_RESULT_SLEEP_SECS)
        call_status = storage_handler.get_call_status(
            self.callset_id, self.call_id)
        self.status_query_count += 1

    self._invoke_metadata['status_done_timestamp'] = time.time()
    self._invoke_metadata['status_query_count'] = self.status_query_count

    self.run_status = call_status  # this is the remote status information
    self.invoke_status = self._invoke_metadata  # local status information

    # Former Python 2 `print` statements; routed through the logger so the
    # module also parses on Python 3 and stdout stays clean.
    logger.debug("ResponseFuture.result() callset_id %s", self.callset_id)

    if call_status['exception'] is not None:
        # the wrenhandler had an exception
        exception_str = call_status['exception']
        logger.debug("call_status: %s", call_status)
        exception_args = call_status['exception_args']
        failure_kind = exception_args[0]

        if throw_except:
            if failure_kind == "WRONGVERSION":
                raise Exception(
                    "Pywren version mismatch: remote "
                    "expected version {}, local library is version {}".format(
                        exception_args[2], exception_args[3]))
            if failure_kind == "OUTATIME":
                raise Exception("process ran out of time")
            if 'exception_traceback' in call_status:
                logger.error(call_status['exception_traceback'])
            raise Exception(exception_str, *exception_args)
        return None

    call_output_time = time.time()
    # NOTE(review): pickle.loads runs on data fetched from storage; this is
    # only safe as long as that storage is trusted.
    call_invoker_result = pickle.loads(
        storage_handler.get_call_output(self.callset_id, self.call_id))
    call_output_time_done = time.time()

    self._invoke_metadata['download_output_time'] = \
        call_output_time_done - call_output_time
    self._invoke_metadata['download_output_timestamp'] = \
        call_output_time_done

    call_success = call_invoker_result['success']
    logger.info("ResponseFuture.result() {} {} call_success {}".format(
        self.callset_id, self.call_id, call_success))

    self._call_invoker_result = call_invoker_result

    if call_success:
        self._return_val = call_invoker_result['result']
        self._state = JobState.success
        return self._return_val

    # Record the failure whether or not the caller wants it raised.
    # Previously the error state was only stored when throw_except was
    # true, so a failed future fetched with throw_except=False never
    # reached a final state and was downloaded again on every call.
    logger.debug("ResponseFuture.result() call raised an exception")
    self._exception = call_invoker_result['result']
    self._traceback = (call_invoker_result['exc_type'],
                       call_invoker_result['exc_value'],
                       call_invoker_result['exc_traceback'])
    self._state = JobState.error

    if not throw_except:
        return None  # nothing, don't raise, no value

    if call_invoker_result.get('pickle_fail', False):
        logging.warning(
            "there was an error pickling. The original exception: "
            "{}\nThe pickling exception: {}".format(
                call_invoker_result['exc_value'],
                str(call_invoker_result['pickle_exception'])))

        reraise(Exception, call_invoker_result['exc_value'],
                call_invoker_result['exc_traceback'])
    else:
        # reraise the exception
        reraise(*self._traceback)
def _wait(fs, return_early_n, max_direct_query_n,
          random_query=False, THREADPOOL_SIZE=16):
    """
    internal function that performs the majority of the WAIT task work.

    For the list of futures fs, internally we:
    1. use list() to quickly get a list of which ones are done (but
       list can be behind due to eventual consistency issues)
    2. then individually call get_status on at most `max_direct_query_n`
       of the remaining ones, returning early once at least
       `return_early_n` have been found done.

    This can mitigate the stragglers.

    :param fs: Futures to inspect.
    :param return_early_n: Stop issuing direct status queries once this many
        of the unfinished futures are known to be done.
    :param max_direct_query_n: Upper bound on the number of futures that are
        queried individually.
    :param random_query: Query the remaining futures in a random order
        instead of the order they were given in.
    :param THREADPOOL_SIZE: Number of worker threads; also the size of each
        batch of direct status queries.
    :return: `(fs_dones, fs_notdones)`. When every future is already in a
        final state the input `fs` is returned unchanged with an empty list.
    :raises NotImplementedError: If the unfinished futures span more than
        one callset.
    """
    # get all the futures that are not yet done
    not_done_futures = [
        f for f in fs
        if f._state not in (JobState.success, JobState.error)
    ]
    if not not_done_futures:
        return fs, []

    storage_config = wrenconfig.extract_storage_config(wrenconfig.default())
    storage_handler = storage.Storage(storage_config)

    ### Callset optimization via object store convenience functions:
    # check if the not-done ones have the same callset_id
    present_callsets = {f.callset_id for f in not_done_futures}
    if len(present_callsets) > 1:
        raise NotImplementedError(
            "waiting on futures from more than one callset is not supported")

    # get the list of all objects in this callset
    callset_id = present_callsets.pop()  # FIXME assume only one

    # note this returns everything done, so we have to figure out
    # the intersection of those that are done
    callids_done_in_callset = set(
        storage_handler.get_callset_status(callset_id))

    not_done_call_ids = {f.call_id for f in not_done_futures}
    done_call_ids = not_done_call_ids & callids_done_in_callset
    not_done_call_ids = not_done_call_ids - done_call_ids

    still_not_done_futures = [
        f for f in not_done_futures if f.call_id in not_done_call_ids
    ]

    def fetch_future_status(f):
        return storage_handler.get_call_status(f.callset_id, f.call_id)

    def get_result(f):
        f.result(throw_except=False, storage_handler=storage_handler)

    pool = ThreadPool(THREADPOOL_SIZE)
    try:
        # now try up to max_direct_query_n direct status queries, quitting
        # once we have return_early_n done.
        query_count = 0
        max_queries = min(max_direct_query_n, len(still_not_done_futures))

        if random_query:
            random.shuffle(still_not_done_futures)

        while query_count < max_queries:
            if len(done_call_ids) >= return_early_n:
                break

            # Cap the batch at max_queries so no more than
            # max_direct_query_n futures are ever queried directly.
            batch_end = min(query_count + THREADPOOL_SIZE, max_queries)
            fs_to_query = still_not_done_futures[query_count:batch_end]

            fs_statuses = pool.map(fetch_future_status, fs_to_query)

            done_call_ids.update(
                f.call_id
                for f, status in zip(fs_to_query, fs_statuses)
                if status is not None
            )

            query_count += len(fs_to_query)

        # now we walk through all the original queries and get
        # the ones that are actually done.
        fs_dones = []
        fs_notdones = []
        f_to_wait_on = []
        for f in fs:
            if f._state in (JobState.success, JobState.error):
                # done, don't need to do anything
                fs_dones.append(f)
            elif f.call_id in done_call_ids:
                f_to_wait_on.append(f)
                fs_dones.append(f)
            else:
                fs_notdones.append(f)

        pool.map(get_result, f_to_wait_on)
    finally:
        # Always shut the worker threads down, even if a query or a result
        # fetch raised.
        pool.close()
        pool.join()

    return fs_dones, fs_notdones