class FuncXExecutor(Executor):
    """Executor that submits processing work to a funcX endpoint."""

    def __init__(self, endpoint_id,
                 process_function="301f653b-40b6-449e-ad2e-e57d3aaa33cd"):
        """Create the funcX client and remember where/what to run.

        :param endpoint_id: funcX endpoint every task is submitted to.
        :param process_function: UUID of an already registered function.
            A falsy value makes ``run_async_analysis`` register
            ``run_coffea_processor`` on first use.
        """
        self.fxc = FuncXClient(asynchronous=True)
        self.endpoint_id = endpoint_id
        self.process_function = process_function

    def run_async_analysis(self, file_url, tree_name, accumulator, process_func):
        """Submit one analysis task and return whatever ``safe_run`` returns.

        ``process_func`` is pickled before it is handed to funcX.
        """
        # Lazily register the remote function the first time it is needed.
        if not self.process_function:
            self.process_function = self.fxc.register_function(
                run_coffea_processor)

        serialized_func = pickle.dumps(process_func)

        # The submission result is passed down to the next item in the stream.
        return self.safe_run(
            file_url,
            tree_name,
            accumulator,
            serialized_func,
            function_id=self.process_function,
        )

    @retry(wait=wait_fixed(5),
           retry=retry_if_exception_type(MaxRequestsExceeded))
    def safe_run(self, file_url, tree_name, accumulator, proc, function_id):
        """Call ``FuncXClient.run``, retrying every 5 s on ``MaxRequestsExceeded``."""
        positional = (file_url, tree_name, accumulator, proc, True)
        return self.fxc.run(
            *positional,
            endpoint_id=self.endpoint_id,
            function_id=function_id,
        )
def register_function(self, container_type='docker', location=None, ep_id=None, group=None):
    """Register the extractor's container and function with funcX.

    :param container_type: container technology passed to ``register_container``.
    :param location: container location; ``self.store_url`` is used when None.
    :param ep_id: passed positionally to ``FuncXClient.register_function``.
    :param group: group passed to ``FuncXClient.register_function``.
    :returns: the registered function id (also stored on ``self.func_id``).
    :raises AssertionError: if ``self.extr_func`` has not been set.
    """
    from funcx import FuncXClient

    assert self.extr_func is not None, "Extractor function must first be registered!"

    container_location = self.store_url if location is None else location

    client = FuncXClient()
    container_uuid = client.register_container(
        location=container_location,
        container_type=container_type,
        name='kube-tabular',
        description='I don\'t think so!',
    )

    self.func_id = client.register_function(
        self.extr_func,
        ep_id,
        group=group,
        container_uuid=container_uuid,
        description="A sum function",
    )

    banner = (f"The function has been updated! "
              f"Please copy/paste the following code into {self.func_id} function class:\n")
    print(banner)
    print(self.func_id)

    return self.func_id
def _register_function():
    """Register the inference function with FuncX and store its ID in the config"""
    fx_client = FuncXClient()

    # The group UUID comes from the JSON configuration file
    group_uuid = json.loads(_config_path.read_text())['group_uuid']

    _set_config(
        function_id=fx_client.register_function(_funcx_func, group=group_uuid)
    )
def __init__(self):
    """Load the family list, queue it, and pre-build the funcX batches.

    Reads the module-level settings ``file_cutoff``, ``max_outstanding_tasks``
    and ``skip_n``. The first ``skip_n`` families of the loaded list are
    skipped; the rest are queued before ``preproc_fam_batches`` runs.
    """
    self.current_tasks_on_ep = 0
    self.max_tasks_on_ep = file_cutoff  # IF SET TO FILE_CUTOFF, THEN THIS IS THE MAX.
    self.fxc = FuncXClient()

    self.funcx_batches = Queue()
    self.polling_queue = Queue()

    self.num_poll_reqs = 0
    self.num_send_reqs = 0

    self.total_families_sent = 0

    self.successes = 0
    self.failures = 0

    self.max_outstanding_tasks = max_outstanding_tasks

    self.family_queue = Queue()

    self.fam_batches = []

    # big_json = "/home/ubuntu/old_xtracthub-service/experiments/tyler_everything.json"
    # big_json = "/Users/tylerskluzacek/Desktop/tyler_everything.json"
    import os
    print(os.getcwd())
    # big_json = "../experiments/tyler_30k.json"
    big_json = "experiments/tyler_200k.json"
    # big_json = "/Users/tylerskluzacek/PyCharmProjects/xtracthub-service/experiments/tyler_20k.json"

    t0 = time.time()
    with open(big_json, 'r') as f:
        self.fam_list = json.load(f)
        print(f"Number of famlilies in fam_list: {len(self.fam_list)}")
    t1 = time.time()

    print(f"Time to load families: {t1-t0}")
    time.sleep(5)  # Time to read!!!

    # Transfer the stored list to a queue to promote good concurrency while making batches.
    # The position must advance with every item: the previous counter was never
    # incremented, so any skip_n > 0 skipped *all* families.
    for position, item in enumerate(self.fam_list):
        if position < skip_n:
            continue
        self.family_queue.put(item)

    self.start_time = time.time()

    self.preproc_fam_batches()
    print(f"Number of funcX batches: {self.funcx_batches.qsize()}")
def funcx_client(self):
    """
    :returns an authorized funcx client

    The client is created on first use and cached on the instance, so
    repeated calls return the same object.
    """
    # Read the cache through normal attribute access so the lookup and the
    # assignment below are name-mangled identically inside a class. A
    # getattr() with the literal string '__funcx_client' is NOT mangled and
    # therefore never found the cached client.
    try:
        cached_client = self.__funcx_client
    except AttributeError:
        cached_client = None

    if cached_client is None:
        cached_client = FuncXClient()
        self.__funcx_client = cached_client
    return cached_client
def get_fx_client(headers):
    """Build a FuncXClient from the access tokens carried in ``headers``.

    Reads the ``Authorization`` (with any ``'Bearer '`` text removed),
    ``Search`` and ``Openid`` entries and wraps each in an
    ``AccessTokenAuthorizer``.
    """
    bearer_token = headers['Authorization'].replace('Bearer ', '')

    authorizers = {
        'fx_authorizer': AccessTokenAuthorizer(bearer_token),
        'search_authorizer': AccessTokenAuthorizer(headers['Search']),
        'openid_authorizer': AccessTokenAuthorizer(headers['Openid']),
    }
    return FuncXClient(**authorizers)
def __init__(self):
    """Set up counters and queues, load image paths, and pre-build batches.

    The first column of every row in ``train2014_images.csv`` is queued on
    ``self.image_path_list`` before ``preproc_fam_batches`` is called.
    """
    # Throttling state for the endpoint.
    self.current_tasks_on_ep = 0
    self.max_tasks_on_ep = 90000

    self.fxc = FuncXClient()

    # Work queues: batches waiting to be sent, task ids waiting to be polled.
    self.funcx_batches = Queue()
    self.polling_queue = Queue()

    # Request and outcome statistics.
    self.num_poll_reqs = self.num_send_reqs = 0
    self.total_families_sent = 0
    self.successes = self.failures = 0

    self.fam_batches = []

    # NOTE: image paths are loaded from CSV (this replaced an earlier JSON loader).
    self.image_path_list = Queue()
    with open('train2014_images.csv') as csv_file:
        for record in csv.reader(csv_file):
            self.image_path_list.put(record[0])

    self.start_time = time.time()

    self.preproc_fam_batches()
def main(args: Optional[List[str]] = None):
    """Launch service that automatically processes images and displays results as a web service

    Three sub-commands are supported: ``config`` (store function/endpoint IDs),
    ``register`` ((re)-register the funcX function) and ``start`` (watch a
    directory, submit images, and write masks plus defect details to disk).
    """

    # ---- Command-line interface -------------------------------------------
    cli = ArgumentParser()
    commands = cli.add_subparsers(
        dest='command',
        help='Which mode to launch the server in',
        required=True,
    )

    # "config": store the configuration settings
    config_cmd = commands.add_parser(
        'config', help='Define the configuration for the server')
    config_cmd.add_argument('--function-id',
                            help='UUID of the function to be run')
    config_cmd.add_argument(
        '--funcx-endpoint',
        help='FuncX endpoint on which to run image processing')

    # "start": launch the processing service
    start_cmd = commands.add_parser('start',
                                    help='Launch the processing service')
    start_cmd.add_argument('--model',
                           choices=['tf', 'pytorch'],
                           default='pytorch',
                           help='Which segmentation model to use')
    start_cmd.add_argument('--regex',
                           default=r'.*.tiff?$',
                           help='Regex to match files')
    start_cmd.add_argument('--redo-existing',
                           action='store_true',
                           help='Submit any existing files in the directory')
    start_cmd.add_argument('--local',
                           action='store_true',
                           help='Perform image analysis locally,'
                           ' instead of via FuncX')
    start_cmd.add_argument('watch_dir',
                           help='Which directory to watch for new files')

    # "register": (re)-register the function
    commands.add_parser('register', help='(Re)-register the funcX function')

    opts = cli.parse_args(args)

    # ---- Logging ------------------------------------------------------------
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.INFO,
    )

    # ---- Commands that finish immediately -----------------------------------
    if opts.command == 'config':
        return _set_config(function_id=opts.function_id,
                           endpoint_id=opts.funcx_endpoint)
    if opts.command == 'register':
        return _register_function()
    assert opts.command == 'start', f'Internal Error: The command "{opts.command}" is not yet supported. Contact Logan'

    # ---- Segmentation model -------------------------------------------------
    if opts.model == 'tf':
        segmenter = TFSegmenter()
    elif opts.model == 'pytorch':
        segmenter = PyTorchSegmenter()
    else:
        raise ValueError(f'Model type "{opts.model}" is not supported yet')

    # ---- Event handler: local processing or submission through funcX --------
    if opts.local:
        handler = LocalProcessingHandler(segmenter=segmenter,
                                         file_regex=opts.regex)
    else:
        fx_client = FuncXClient()
        fx_client.max_request_size = 50 * 1024**2
        with open(_config_path, 'r') as config_file:
            settings = json.load(config_file)
        handler = FuncXSubmitEventHandler(segmenter,
                                          fx_client,
                                          settings['function_id'],
                                          settings['endpoint_id'],
                                          file_regex=opts.regex)

    # ---- Output directory for the masks --------------------------------------
    watch_dir = Path(opts.watch_dir)
    mask_dir = watch_dir.joinpath('masks')
    mask_dir.mkdir(exist_ok=True)

    # ---- Flask app, served from a daemon thread ------------------------------
    app.config['exec_queue'] = handler.queue
    app.config['watch_dir'] = Path(opts.watch_dir)
    Thread(target=app.run, daemon=True, name='rtdefects.flask').start()

    # ---- Directory watcher ----------------------------------------------------
    observer = Observer()
    observer.schedule(handler, path=opts.watch_dir, recursive=False)
    observer.start()

    # ---- Optionally resubmit the files that already exist ----------------------
    details_path = mask_dir.joinpath('defect-details.json')
    if opts.redo_existing:
        details_path.unlink(missing_ok=True)  # Delete any existing data
        for entry in watch_dir.iterdir():
            if entry.is_file():
                handler.submit_file(entry)

    # ---- Consume results until interrupted -------------------------------------
    try:
        for count, result in enumerate(handler.iterate_results()):
            img_path, mask, defect_info, rtt = result

            # Report the completed result
            logger.info(
                f'Result received for {count + 1}/{handler.index}. RTT: {rtt:.2f}s.'
                f' Backlog: {handler.queue.qsize()}')

            # Save the mask next to the watched images
            mask_path = mask_dir.joinpath(img_path.name)
            with mask_path.open('wb') as mask_file:
                mask_file.write(mask)
            logger.info(f'Wrote output file to: {mask_path}')

            # Append the defect record as one JSON line
            defect_info.update({
                'created_time': datetime.fromtimestamp(
                    img_path.stat().st_mtime).isoformat(),
                'completed_time': datetime.now().isoformat(),
                'mask-path': str(mask_path),
                'image-path': str(img_path),
                'rtt': rtt,
            })
            with details_path.open('a') as details_file:
                print(json.dumps(defect_info), file=details_file)
    except KeyboardInterrupt:
        logger.info('Detected an interrupt. Stopping system')
    except BaseException:
        # Stop the watcher before propagating anything unexpected
        observer.stop()
        logger.warning('Unexpected failure!')
        raise

    # ---- Shut down the file reader -----------------------------------------------
    observer.stop()
    observer.join()
"--ws_uri", default="ws://localhost:6000", help="WebSocket URI to get task results", ) parser.add_argument( "-e", "--endpoint_id", required=True, help="Target endpoint to send functions to", ) parser.add_argument("-b", "--batch", action="store_true", help="Enable batch or not") args = parser.parse_args() fx = FuncXExecutor( FuncXClient(funcx_service_address=args.service_url, results_ws_uri=args.ws_uri), batch_enabled=args.batch, ) start = time.time() print("Running simple test") test_simple(fx, args.endpoint_id) print(f"Complete in {time.time() - start}") start = time.time() run_loop(fx, args.endpoint_id) print(f"Complete in {time.time() - start}")
class test_orch():
    """Orchestrates an image-extraction experiment on a funcX endpoint.

    Image paths are read from ``train2014_images.csv``, wrapped into
    ``Family``/``FamilyBatch`` objects, grouped into funcX batches, and then
    sent (``send_batches_thr_loop``) and polled (``polling_loop``). The loop
    methods are presumably run in separate threads by the caller.

    Relies on module-level settings defined elsewhere in the file:
    ``system``, ``map_size``, ``batch_size``, ``container_type``,
    ``location``, ``ep_id`` and ``task_stop``.
    """

    # Seconds to wait after a failed batch submission before moving on.
    send_failure_backoff = 0.5

    def __init__(self):
        """Initialise counters and queues, load image paths, build batches."""
        self.current_tasks_on_ep = 0
        self.max_tasks_on_ep = 90000
        self.fxc = FuncXClient()

        self.funcx_batches = Queue()
        self.polling_queue = Queue()

        self.num_poll_reqs = 0
        self.num_send_reqs = 0

        self.total_families_sent = 0

        self.successes = 0
        self.failures = 0

        self.fam_batches = []

        # NOTE: Changed away from the JSON family list in order to load from CSV.
        self.image_path_list = Queue()
        with open('train2014_images.csv') as f:
            reader = csv.reader(f)
            for row in reader:
                self.image_path_list.put(row[0])

        self.start_time = time.time()

        self.preproc_fam_batches()

    def path_converter(self, family_id, old_path):
        """Map ``old_path`` to its location on the configured ``system``.

        Only the file name of ``old_path`` is kept. Returns None when
        ``system`` is neither ``"midway2"`` nor ``"theta"``.
        """
        file_name = old_path.split('/')[-1]
        new_path = None
        if system == "midway2":
            new_path = f"/project2/chard/skluzacek/{family_id}/{file_name}"
        elif system == "theta":
            new_path = f"/projects/CSC249ADCD01/skluzacek/data_to_process/{family_id}/{file_name}"
        return new_path

    def preproc_fam_batches(self):
        """Build family batches, register the extractor, and fill ``funcx_batches``."""
        print("PREPROCESSING!")
        while not self.image_path_list.empty():

            fam_batch = FamilyBatch()
            # Fill one FamilyBatch with up to map_size single-image families.
            while len(fam_batch.families) < map_size:

                if self.image_path_list.empty():
                    break

                path = self.image_path_list.get()
                print(path)
                family = dict()

                family['family_id'] = None

                # TODO: CHANGE THIS FOR THETA.
                if system == 'midway2':
                    family['files'] = [{
                        'path': f'/project2/chard/skluzacek/train2014/{path}'
                    }]
                elif system == 'theta':
                    family['files'] = [{
                        'path': f'/projects/CSC249ADCD01/skluzacek/train2014/{path}'
                    }]
                family['metadata'] = dict()
                family['headers'] = None
                family['download_type'] = None
                family['groups'] = []

                empty_fam = Family()
                empty_fam.from_dict(family)
                print("ADDING FAMILY TO FAM BATCH")
                fam_batch.add_family(empty_fam)

            self.fam_batches.append(fam_batch)

        img_extractor = ImageExtractor()

        print(f"REGISTERING FUNCTION")
        self.fn_uuid = img_extractor.register_function(
            container_type=container_type,
            location=location,
            ep_id=ep_id,
            group="a31d8dce-5d0a-11ea-afea-0a53601d30b5")

        # Group the FamilyBatch objects into the batches sent to funcX.
        current_batch = []
        for fam_batch in self.fam_batches:
            if len(current_batch) < batch_size:
                current_batch.append(fam_batch)
            else:
                print(f"Length of current batch: {len(current_batch)}")
                self.funcx_batches.put(current_batch)
                current_batch = [fam_batch]

        # Grab the stragglers.
        if len(current_batch) > 0:
            self.funcx_batches.put(current_batch)
        print("Let me see")

    def send_batches_thr_loop(self):
        """Submit every queued funcX batch, throttled by ``max_tasks_on_ep``.

        A batch whose submission fails is dropped (best effort), but the
        families it contained are removed from ``current_tasks_on_ep`` again
        so that the throttle does not count work that was never sent.
        """
        while not self.funcx_batches.empty():

            if self.current_tasks_on_ep > self.max_tasks_on_ep:
                print(f"There are {self.current_tasks_on_ep}. Sleeping...")
                time.sleep(5)
                continue

            batch = self.funcx_batches.get()
            fx_batch = self.fxc.create_batch()

            families_in_batch = 0
            for item in batch:
                fam_batch_size = len(item.families)

                fx_batch.add(
                    {
                        'family_batch': item,
                        'creds': None,
                        'download_file': None
                    },
                    endpoint_id=ep_id,
                    function_id=self.fn_uuid)
                self.current_tasks_on_ep += fam_batch_size
                families_in_batch += fam_batch_size

            try:
                res = self.fxc.batch_run(fx_batch)
            except Exception as err:
                # Only ordinary errors are handled here; KeyboardInterrupt and
                # SystemExit propagate. Undo the optimistic accounting above.
                self.current_tasks_on_ep -= families_in_batch
                print(f"Failed to submit funcX batch ({err!r}). Dropping it.")
                time.sleep(self.send_failure_backoff)
                continue
            self.num_send_reqs += 1

            for tid in res:
                self.polling_queue.put(tid)

            if self.current_tasks_on_ep + self.successes > task_stop:
                # This is our unclean (approximate) way of breaking at the 'task send' stage.
                break

    def polling_loop(self):
        """Poll funcX forever for the status of submitted tasks.

        Finished tasks update ``successes`` and ``current_tasks_on_ep``;
        pending tasks are put back on ``polling_queue``; a remote exception
        is re-raised here.
        """
        while True:
            current_tid_batch = []

            for i in range(500):  # TODO: 1000 might be too big?
                if self.polling_queue.empty():
                    print("Polling queue empty. Creating batch!")
                    time.sleep(5)
                    break
                else:
                    tid = self.polling_queue.get()
                    current_tid_batch.append(tid)

            if len(current_tid_batch) == 0:
                print("Batch is empty. Sleeping... ")
                time.sleep(5)

            res = self.fxc.get_batch_status(current_tid_batch)
            self.num_poll_reqs += 1

            for item in res:
                if 'result' in res[item]:
                    print(res[item])
                    ret_fam_batch = res[item]['result']['family_batch']
                    fam_len = len(ret_fam_batch.families)
                    self.successes += fam_len
                    self.current_tasks_on_ep -= fam_len
                    # NOTE -- we're doing nothing with the returned metadata here.
                elif 'exception' in res[item]:
                    res[item]['exception'].reraise()
                elif 'status' in res[item]:
                    # Still pending: poll this task id again later.
                    self.polling_queue.put(item)
                else:
                    print("*********ERROR *************")
                    self.failures += 1
                    print(res)

    def stats_loop(self):
        """Print the running counters every five seconds, forever."""
        while True:
            print("*********************************")
            print(f"Num successes: {self.successes}")
            print(f"Num failures: {self.failures}")
            print(f"Only {self.current_tasks_on_ep} tasks at endpoint. ")

            print(f"Number of send requests: {self.num_send_reqs}")
            print(f"Number of poll requests: {self.num_poll_reqs}")
            print("*********************************")
            print(f"Elapsed time: {time.time() - self.start_time}")
            time.sleep(5)
class test_orch():
    """Orchestrates a metadata-extraction experiment on a funcX endpoint.

    Families are loaded from a JSON file, converted to endpoint-local paths,
    packed into ``FamilyBatch`` objects, grouped into funcX batches, and then
    sent (``send_batches_thr_loop``) and polled (``polling_loop``). The loop
    methods are presumably run in separate threads by the caller.

    Relies on module-level settings defined elsewhere in the file:
    ``file_cutoff``, ``max_outstanding_tasks``, ``skip_n``, ``system``,
    ``map_size``, ``batch_size``, ``soft_batch_bytes_max``,
    ``container_type``, ``location`` and ``ep_id``.
    """

    def __init__(self):
        """Load the family list, queue it, and pre-build the funcX batches."""
        self.current_tasks_on_ep = 0
        self.max_tasks_on_ep = file_cutoff  # IF SET TO FILE_CUTOFF, THEN THIS IS THE MAX.
        self.fxc = FuncXClient()

        self.funcx_batches = Queue()
        self.polling_queue = Queue()

        self.num_poll_reqs = 0
        self.num_send_reqs = 0

        self.total_families_sent = 0

        self.successes = 0
        self.failures = 0

        self.max_outstanding_tasks = max_outstanding_tasks

        self.family_queue = Queue()

        self.fam_batches = []

        # big_json = "/home/ubuntu/old_xtracthub-service/experiments/tyler_everything.json"
        # big_json = "/Users/tylerskluzacek/Desktop/tyler_everything.json"
        import os
        print(os.getcwd())
        # big_json = "../experiments/tyler_30k.json"
        big_json = "experiments/tyler_200k.json"
        # big_json = "/Users/tylerskluzacek/PyCharmProjects/xtracthub-service/experiments/tyler_20k.json"

        t0 = time.time()
        with open(big_json, 'r') as f:
            self.fam_list = json.load(f)
            print(f"Number of famlilies in fam_list: {len(self.fam_list)}")
        t1 = time.time()

        print(f"Time to load families: {t1-t0}")
        time.sleep(5)  # Time to read!!!

        # Transfer the stored list to a queue to promote good concurrency while making batches.
        self._enqueue_families()

        self.start_time = time.time()

        self.preproc_fam_batches()
        print(f"Number of funcX batches: {self.funcx_batches.qsize()}")

    def _enqueue_families(self):
        """Queue every family of ``fam_list`` except the first ``skip_n``.

        The position has to advance with each item: the earlier counter was
        never incremented, so any ``skip_n > 0`` skipped *all* families.
        """
        for position, item in enumerate(self.fam_list):
            if position < skip_n:
                continue
            self.family_queue.put(item)

    def path_converter(self, family_id, old_path):
        """Map ``old_path`` to its location on the configured ``system``.

        Returns None when ``system`` is not one of ``"midway2"``,
        ``"theta"`` or ``"js"``.
        """
        file_name = old_path.split('/')[-1]
        new_path = None
        if system == "midway2":
            new_path = f"/project2/chard/skluzacek/data_to_process/{family_id}/{file_name}"
        elif system == "theta":
            new_path = f"/projects/CSC249ADCD01/skluzacek{old_path}"
        # TODO: change this for things
        elif system == "js":
            new_path = f"/home/tskluzac/{family_id}/{file_name}"
        return new_path

    def preproc_fam_batches(self):
        """Build family batches, register the extractor, and fill ``funcx_batches``.

        At most ``file_cutoff`` families are consumed from ``family_queue``.
        Exits the process if the number of funcX batches does not match the
        expected count.
        """
        fam_count = 0

        # Just create an empty one out here so Python doesn't yell at me.
        fam_batch = FamilyBatch()

        num_overloads = 0
        # while we have files and haven't exceeded the weak scaling threshold (file_cutoff)
        while not self.family_queue.empty() and fam_count < file_cutoff:

            fam_batch = FamilyBatch()
            total_fam_batch_size = 0

            # Keep adding families until the batch holds map_size of them
            # (or we run out of families / hit file_cutoff).
            while (len(fam_batch.families) < map_size
                   and not self.family_queue.empty()
                   and fam_count < file_cutoff):

                fam_count += 1
                fam = self.family_queue.get()

                total_family_size = 0
                # First convert to the correct paths
                for file_obj in fam['files']:
                    file_obj['path'] = self.path_converter(
                        fam['family_id'], file_obj['path'])
                    total_family_size += file_obj['metadata']['physical']['size']

                for group in fam['groups']:
                    for file_obj in group['files']:
                        file_obj['path'] = self.path_converter(
                            fam['family_id'], file_obj['path'])

                empty_fam = Family()
                empty_fam.from_dict(fam)

                # We will ONLY handle the SIZE issue in here.
                # NOTE(review): total_fam_batch_size is only updated when an
                # overload happens, never accumulated per family -- confirm
                # whether that is intended.
                if soft_batch_bytes_max > 0:
                    # So if this last file would put us over the top,
                    if total_fam_batch_size + total_family_size > soft_batch_bytes_max:
                        num_overloads += 1
                        print(f"Num overloads {num_overloads}")
                        # then we append the old batch (if not empty),
                        if len(fam_batch.families) > 0:
                            self.fam_batches.append(fam_batch)

                        # empty the old one
                        fam_batch = FamilyBatch()
                        total_fam_batch_size = total_family_size

                        assert (len(fam_batch.families) == 0)

                # and then continue (here we either add to our prior fam_batch OR the new one).
                fam_batch.add_family(empty_fam)

            assert len(fam_batch.families) <= map_size

            self.fam_batches.append(fam_batch)

        # img_extractor = NothingExtractor()
        img_extractor = MatioExtractor()

        # TODO: ADDING TEST. Making sure we have all of our files here.
        ta = time.time()
        num_families = sum(len(item.families) for item in self.fam_batches)
        print(num_families)
        tb = time.time()
        print(f"Time to move families: {tb-ta}")
        time.sleep(5)

        # TODO: Bring back the sanity check on the number of family batches
        # (but use it for grouping by number of files).

        print(f"Container type: {container_type}")
        print(f"Location: {location}")
        self.fn_uuid = img_extractor.register_function(
            container_type=container_type,
            location=location,
            ep_id=ep_id,
            group="a31d8dce-5d0a-11ea-afea-0a53601d30b5")

        # funcX batching. Here we take the 'user' FamilyBatch objects and put them into a batch we send to funcX.
        num_fx_batches = 0
        current_batch = []

        print(f"Number of family batches: {len(self.fam_batches)}")
        for fam_batch in self.fam_batches:
            if len(current_batch) < batch_size:
                current_batch.append(fam_batch)
            else:
                self.funcx_batches.put(current_batch)
                current_batch = [fam_batch]
                num_fx_batches += 1

        # Grab the stragglers.
        if len(current_batch) > 0:
            print("Marking batch!")
            self.funcx_batches.put(current_batch)
            num_fx_batches += 1

        # Sanity check: the number of funcX batches must equal
        # ceil(number of family batches / batch_size).
        try:
            theor_full_batches = math.ceil(len(self.fam_batches) / batch_size)
            assert theor_full_batches == num_fx_batches
        except AssertionError as e:
            print(f"Caught {e} after creating funcX batches...")
            print(f"Number of batches: {self.funcx_batches.qsize()}")
            print(f"Family Count: {num_fx_batches}")

            print("Cannot continue. Exiting...")
            exit()

    # TODO: let the failures fail.
    def send_batches_thr_loop(self):
        """Submit every queued funcX batch, throttled by ``max_outstanding_tasks``."""
        # While there are still batches to send.
        # Note that this should not be 'limiting' as we do that in preprocessing.
        while not self.funcx_batches.empty():

            # current_tasks_on_ep = tasks_sent - tasks_received
            if self.current_tasks_on_ep > self.max_outstanding_tasks:
                print(f"There are {self.current_tasks_on_ep}. Sleeping...")
                time.sleep(5)
                continue

            # Grab one
            batch = self.funcx_batches.get()
            fx_batch = self.fxc.create_batch()

            # Now we formally pull down each funcX batch and add each of its elements to an fx_batch.
            # TODO: could do this before putting in list.
            for item in batch:
                fam_batch_size = len(item.families)

                fx_batch.add({'family_batch': item},
                             endpoint_id=ep_id,
                             function_id=self.fn_uuid)
                self.current_tasks_on_ep += fam_batch_size

            # TODO: bring back exception handling around the submission once
            # we figure out what errors it's causing.
            import random
            x = random.randint(1, 5)
            time.sleep(x / 2)  # random 0.5-2.5 s pause before each submission
            res = self.fxc.batch_run(fx_batch)
            self.num_send_reqs += 1

            for tid in res:
                self.polling_queue.put(tid)

    def polling_loop(self):
        """Poll funcX for the status of submitted tasks.

        Pending tasks are put back on ``polling_queue``; a remote exception
        is re-raised here.
        """
        while True:
            current_tid_batch = []

            for i in range(500):  # TODO: 1000 might be too big?
                if self.polling_queue.empty():
                    print("Polling queue empty. Creating batch!")
                    time.sleep(3)
                    break
                else:
                    tid = self.polling_queue.get()
                    current_tid_batch.append(tid)

            if len(current_tid_batch) == 0:
                print("Batch is empty. Sleeping... ")
                time.sleep(5)

            time.sleep(0.5)
            start_req = time.time()
            res = self.fxc.get_batch_status(current_tid_batch)
            end_req = time.time()
            self.num_poll_reqs += 1
            print(f"Time to process batch: {end_req-start_req}")

            for item in res:
                if 'result' in res[item]:
                    print(f"Received result: {res[item]['result']}")
                    # NOTE(review): exit() raises SystemExit right after the
                    # first result is printed, so the accounting below never
                    # runs. Looks like leftover debugging -- confirm before
                    # removing.
                    exit()

                    ret_fam_batch = res[item]['result']
                    num_finished = ret_fam_batch['finished']
                    print(num_finished)

                    with open('timer2.txt', 'a') as g:
                        csv_writer = csv.writer(g)
                        csv_writer.writerow([time.time(), num_finished])

                    self.successes += num_finished
                    self.current_tasks_on_ep -= num_finished

                    # NOTE -- we're doing nothing with the returned metadata here.
                elif 'exception' in res[item]:
                    res[item]['exception'].reraise()
                else:
                    # Anything else is treated as still pending: poll again later.
                    self.polling_queue.put(item)

    def stats_loop(self):
        """Print the running counters every five seconds, forever."""
        while True:
            print("*********************************")
            print(f"Num successes: {self.successes}")
            print(f"Num failures: {self.failures}")
            print(f"Only {self.current_tasks_on_ep} tasks at endpoint. ")

            print(f"Number of send requests: {self.num_send_reqs}")
            print(f"Number of poll requests: {self.num_poll_reqs}")
            print("*********************************")
            print(f"Elapsed time: {time.time() - self.start_time}")
            time.sleep(5)
def __init__(self, endpoints, strategy='round-robin', runtime_predictor='rolling-average', last_n=3, train_every=1, log_level='INFO', import_model_file=None, transfer_model_file=None, sync_level='exists', max_backups=0, backup_delay_threshold=2.0, *args, **kwargs): self._fxc = FuncXClient(*args, **kwargs) # Initialize a transfer client self._transfer_manger = TransferManager(endpoints=endpoints, sync_level=sync_level, log_level=log_level) # Info about FuncX endpoints we can execute on self._endpoints = endpoints self._dead_endpoints = set() self.last_result_time = defaultdict(float) self.temperature = defaultdict(lambda: 'WARM') self._imports = defaultdict(list) self._imports_required = defaultdict(list) # Track which endpoints a function can't run on self._blocked = defaultdict(set) # Track pending tasks # We will provide the client our own task ids, since we may submit the # same task multiple times to the FuncX service, and sometimes we may # wait to submit a task to FuncX (e.g., wait for a data transfer). self._task_id_translation = {} self._pending = {} self._pending_by_endpoint = defaultdict(set) self._task_info = {} # List of endpoints a (virtual) task was scheduled to self._endpoints_sent_to = defaultdict(list) self.max_backups = max_backups self.backup_delay_threshold = backup_delay_threshold self._latest_status = {} self._last_task_ETA = defaultdict(float) # Maximum ETA, if any, of a task which we allow to be scheduled on an # endpoint. This is to prevent backfill tasks to be longer than the # estimated time for when a pending data transfer will finish. self._transfer_ETAs = defaultdict(dict) # Estimated error in the pending-task time of an endpoint. # Updated every time a task result is received from an endpoint. 
self._queue_error = defaultdict(float) # Set logging levels logger.setLevel(log_level) self.execution_log = [] # Intialize serializer self.fx_serializer = FuncXSerializer() self.fx_serializer.use_custom('03\n', 'code') # Initialize runtime predictor self.runtime = init_runtime_predictor(runtime_predictor, endpoints=endpoints, last_n=last_n, train_every=train_every) logger.info(f"Runtime predictor using strategy {self.runtime}") # Initialize transfer-time predictor self.transfer_time = TransferPredictor(endpoints=endpoints, train_every=train_every, state_file=transfer_model_file) # Initialize import-time predictor self.import_predictor = ImportPredictor(endpoints=endpoints, state_file=import_model_file) # Initialize scheduling strategy self.strategy = init_strategy(strategy, endpoints=endpoints, runtime_predictor=self.runtime, queue_predictor=self.queue_delay, cold_start_predictor=self.cold_start, transfer_predictor=self.transfer_time) logger.info(f"Scheduler using strategy {self.strategy}") # Start thread to check on endpoints regularly self._endpoint_watchdog = Thread(target=self._check_endpoints) self._endpoint_watchdog.start() # Start thread to monitor tasks and send tasks to FuncX service self._scheduled_tasks = Queue() self._task_watchdog_sleep = 0.15 self._task_watchdog = Thread(target=self._monitor_tasks) self._task_watchdog.start()
class CentralScheduler(object):
    """Schedules client tasks onto FuncX endpoints.

    Clients receive "virtual" task ids generated here; each virtual task
    may be submitted to the FuncX service one or more times (backups), and
    every such submission has its own "real" task id.  Two background
    threads are started by the constructor: one polls endpoint statuses
    and triggers backups, the other submits scheduled tasks to FuncX once
    their data transfers (if any) have completed.
    """

    def __init__(self, endpoints, strategy='round-robin',
                 runtime_predictor='rolling-average', last_n=3,
                 train_every=1, log_level='INFO', import_model_file=None,
                 transfer_model_file=None, sync_level='exists',
                 max_backups=0, backup_delay_threshold=2.0,
                 *args, **kwargs):
        """Set up scheduler state, predictors, strategy and watchdogs.

        ``endpoints`` is used as a mapping keyed by endpoint id whose
        values may contain a ``'launch_time'`` entry (see ``cold_start``).
        ``strategy`` and ``runtime_predictor`` are names resolved by
        ``init_strategy`` / ``init_runtime_predictor``.  ``*args`` and
        ``**kwargs`` are forwarded to ``FuncXClient``.
        """
        self._fxc = FuncXClient(*args, **kwargs)

        # Initialize a transfer client
        self._transfer_manger = TransferManager(endpoints=endpoints,
                                                sync_level=sync_level,
                                                log_level=log_level)

        # Info about FuncX endpoints we can execute on
        self._endpoints = endpoints
        self._dead_endpoints = set()
        self.last_result_time = defaultdict(float)
        self.temperature = defaultdict(lambda: 'WARM')
        self._imports = defaultdict(list)
        self._imports_required = defaultdict(list)

        # Track which endpoints a function can't run on
        self._blocked = defaultdict(set)

        # Track pending tasks
        # We will provide the client our own task ids, since we may submit
        # the same task multiple times to the FuncX service, and sometimes
        # we may wait to submit a task to FuncX (e.g., wait for a data
        # transfer).
        self._task_id_translation = {}
        self._pending = {}
        self._pending_by_endpoint = defaultdict(set)
        self._task_info = {}

        # List of endpoints a (virtual) task was scheduled to
        self._endpoints_sent_to = defaultdict(list)
        self.max_backups = max_backups
        self.backup_delay_threshold = backup_delay_threshold
        self._latest_status = {}
        self._last_task_ETA = defaultdict(float)

        # Maximum ETA, if any, of a task which we allow to be scheduled on
        # an endpoint. This is to prevent backfill tasks to be longer than
        # the estimated time for when a pending data transfer will finish.
        self._transfer_ETAs = defaultdict(dict)

        # Estimated error in the pending-task time of an endpoint.
        # Updated every time a task result is received from an endpoint.
        self._queue_error = defaultdict(float)

        # Set logging levels
        logger.setLevel(log_level)
        self.execution_log = []

        # Initialize serializer
        self.fx_serializer = FuncXSerializer()
        self.fx_serializer.use_custom('03\n', 'code')

        # Initialize runtime predictor
        self.runtime = init_runtime_predictor(runtime_predictor,
                                              endpoints=endpoints,
                                              last_n=last_n,
                                              train_every=train_every)
        logger.info(f"Runtime predictor using strategy {self.runtime}")

        # Initialize transfer-time predictor
        self.transfer_time = TransferPredictor(
            endpoints=endpoints, train_every=train_every,
            state_file=transfer_model_file)

        # Initialize import-time predictor
        self.import_predictor = ImportPredictor(
            endpoints=endpoints, state_file=import_model_file)

        # Initialize scheduling strategy
        self.strategy = init_strategy(strategy, endpoints=endpoints,
                                      runtime_predictor=self.runtime,
                                      queue_predictor=self.queue_delay,
                                      cold_start_predictor=self.cold_start,
                                      transfer_predictor=self.transfer_time)
        logger.info(f"Scheduler using strategy {self.strategy}")

        # State shared with the background threads. It is created *before*
        # either thread starts so that neither can observe a half-built
        # scheduler.
        self._scheduled_tasks = Queue()
        self._task_watchdog_sleep = 0.15

        # Thread that checks on endpoints regularly
        self._endpoint_watchdog = Thread(target=self._check_endpoints)
        # Thread that monitors tasks and sends them to the FuncX service
        self._task_watchdog = Thread(target=self._monitor_tasks)

        self._endpoint_watchdog.start()
        self._task_watchdog.start()

    def block(self, func, endpoint):
        """Forbid ``func`` from being scheduled on ``endpoint``.

        Returns a dict with ``'status'`` set to ``'Success'`` or
        ``'Failed'`` (the latter with a ``'reason'``).  Blocking fails for
        an unknown endpoint or when it would leave no endpoint available
        for ``func``.
        """
        if endpoint not in self._endpoints:
            logger.error('Cannot block unknown endpoint {}'.format(endpoint))
            return {
                'status': 'Failed',
                'reason': 'Unknown endpoint {}'.format(endpoint)
            }
        elif len(self._blocked[func]) == len(self._endpoints) - 1:
            logger.error(
                'Cannot block last remaining endpoint {}'.format(endpoint))
            return {
                'status': 'Failed',
                'reason': 'Cannot block all endpoints for {}'.format(func)
            }
        else:
            logger.info('Blocking endpoint {} for function {}'.format(
                endpoint_name(endpoint), func))
            self._blocked[func].add(endpoint)
            return {'status': 'Success'}

    def register_imports(self, func, imports):
        """Record the packages ``func`` needs (used by ``cold_start``)."""
        logger.info('Registered function {} with imports {}'.format(
            func, imports))
        self._imports_required[func] = imports

    def batch_submit(self, tasks, headers):
        """Schedule every ``(func, payload)`` pair in ``tasks``.

        The payload's serialized kwargs must contain ``'_globus_files'``.
        Returns ``(task_ids, endpoints)``: two parallel lists holding the
        virtual task id and the chosen endpoint of each task.
        """
        # TODO: smarter scheduling for batch submissions
        task_ids = []
        endpoints = []
        for func, payload in tasks:
            _, ser_kwargs = self.fx_serializer.unpack_buffers(payload)
            kwargs = self.fx_serializer.deserialize(ser_kwargs)
            files = kwargs['_globus_files']

            task_id, endpoint = self._schedule_task(func=func,
                                                    payload=payload,
                                                    headers=headers,
                                                    files=files)
            task_ids.append(task_id)
            endpoints.append(endpoint)

        return task_ids, endpoints

    def _schedule_task(self, func, payload, headers, files, task_id=None):
        """Pick an endpoint for a task and queue it for submission.

        With ``task_id=None`` a new virtual task is created; otherwise the
        call schedules a backup of an existing virtual task, excluding the
        endpoints it was already sent to.  Returns ``(task_id, endpoint)``.
        """
        # If this is the first time scheduling this task_id
        # (i.e., non-backup task), record the necessary metadata
        if task_id is None:
            # Create (fake) task id to return to client
            task_id = str(uuid.uuid4())

            # Store task information
            self._task_id_translation[task_id] = set()

            # Information required to schedule the task, now and in the
            # future
            info = {
                'function_id': func,
                'payload': payload,
                'headers': headers,
                'files': files,
                'time_requested': time.time()
            }
            self._task_info[task_id] = info

        # TODO: do not choose a dead endpoint (reliably)
        # exclude = self._blocked[func] | self._dead_endpoints | set(self._endpoints_sent_to[task_id])  # noqa
        if len(self._dead_endpoints) > 0:
            logger.warning(
                '{} endpoints seem dead. Hope they still work!'.format(
                    len(self._dead_endpoints)))
        exclude = self._blocked[func] | set(self._endpoints_sent_to[task_id])
        choice = self.strategy.choose_endpoint(
            func, payload=payload, files=files, exclude=exclude,
            transfer_ETAs=self._transfer_ETAs)  # noqa
        endpoint = choice['endpoint']
        logger.info('Choosing endpoint {} for func {}, task id {}'.format(
            endpoint_name(endpoint), func, task_id))
        choice['ETA'] = self.strategy.predict_ETA(func, endpoint, payload,
                                                  files=files)

        # Start Globus transfer of required files, if any
        if len(files) > 0:
            transfer_num = self._transfer_manger.transfer(
                files, endpoint, task_id)
            if transfer_num is not None:
                transfer_ETA = time.time() + self.transfer_time(
                    files, endpoint)
                self._transfer_ETAs[endpoint][transfer_num] = transfer_ETA
        else:
            transfer_num = None
            # Record endpoint ETA for queue-delay prediction here,
            # since task will be immediately scheduled
            self._last_task_ETA[endpoint] = choice['ETA']

        # If a cold endpoint is being started, mark it as no longer cold,
        # so that subsequent launch-time predictions are correct (i.e., 0)
        if self.temperature[endpoint] == 'COLD':
            self.temperature[endpoint] = 'WARMING'
            logger.info(
                'A cold endpoint {} was chosen; marked as warming.'.format(
                    endpoint_name(endpoint)))

        # Schedule task for sending to FuncX
        self._endpoints_sent_to[task_id].append(endpoint)
        self._scheduled_tasks.put((task_id, endpoint, transfer_num))

        return task_id, endpoint

    def translate_task_id(self, task_id):
        """Return the set of real task ids submitted for ``task_id``.

        Raises ``KeyError`` for an unknown virtual task id.
        """
        return self._task_id_translation[task_id]

    def log_status(self, real_task_id, data):
        """Process a status message for a submitted (real) task.

        ``data`` is expected to carry one of ``'result'``, ``'exception'``
        or ``'status' == 'PENDING'``.  Results and exceptions mark the real
        task as completed; results additionally update the runtime
        predictor and the endpoint's known imports.
        """
        if real_task_id not in self._pending:
            logger.warning(
                'Ignoring unknown task id {}'.format(real_task_id))
            return

        task_id = self._pending[real_task_id]['task_id']
        func = self._pending[real_task_id]['function_id']
        endpoint = self._pending[real_task_id]['endpoint_id']

        # Don't overwrite latest status if it is a result/exception
        if task_id not in self._latest_status or \
                self._latest_status[task_id].get('status') == 'PENDING':
            self._latest_status[task_id] = data

        if 'result' in data:
            result = self.fx_serializer.deserialize(data['result'])
            runtime = result['runtime']
            name = endpoint_name(endpoint)
            logger.info('Got result from {} for task {} with time {}'.format(
                name, real_task_id, runtime))

            self.runtime.update(self._pending[real_task_id], runtime)
            self._pending[real_task_id]['runtime'] = runtime
            self._record_completed(real_task_id)
            self.last_result_time[endpoint] = time.time()
            self._imports[endpoint] = result['imports']

        elif 'exception' in data:
            exception = self.fx_serializer.deserialize(data['exception'])
            try:
                # Re-raise remotely captured exception to learn its type
                exception.reraise()
            except Exception as e:
                logger.error('Got exception on task {}: {}'.format(
                    real_task_id, e))
                exc_type, _, _ = sys.exc_info()
                if exc_type in BLOCK_ERRORS:
                    self.block(func, endpoint)

            self._record_completed(real_task_id)
            self.last_result_time[endpoint] = time.time()

        elif 'status' in data and data['status'] == 'PENDING':
            pass

        else:
            logger.error('Unexpected status message: {}'.format(data))

    def get_status(self, task_id):
        """Return the latest known status dict for a virtual task.

        Returns ``{'status': 'PENDING'}`` while the task has not been
        submitted or queried yet, and ``None`` (after logging a warning)
        for an unknown task id.
        """
        if task_id not in self._task_id_translation:
            logger.warning('Unknown client task id {}'.format(task_id))
            return None
        elif len(self._task_id_translation[task_id]) == 0:
            return {'status': 'PENDING'}  # Task has not been scheduled yet
        elif task_id not in self._latest_status:
            return {'status': 'PENDING'}  # Status has not been queried yet
        else:
            return self._latest_status[task_id]

    def queue_delay(self, endpoint):
        """Return the time at which ``endpoint``'s queue should be free.

        This is the ETA of the most recent task plus the estimated error
        in that prediction, but never earlier than the current time.
        """
        # Note that if there are no pending tasks on endpoint, no queue
        # delay. This is implicit since, in this case, both summands will
        # be 0 and the current time wins the max() below.
        delay = self._last_task_ETA[endpoint] + self._queue_error[endpoint]
        return max(delay, time.time())

    def _record_completed(self, real_task_id):
        """Move a finished real task from the pending set to the log."""
        info = self._pending[real_task_id]
        endpoint = info['endpoint_id']

        # If this is the last pending task on this endpoint, reset ETA
        # offset
        if len(self._pending_by_endpoint[endpoint]) == 1:
            self._last_task_ETA[endpoint] = 0.0
            self._queue_error[endpoint] = 0.0
        else:
            prediction_error = time.time() - self._pending[real_task_id]['ETA']
            self._queue_error[endpoint] = prediction_error
            # print(colored(f'Prediction error {prediction_error}', 'red'))

        info['ATA'] = time.time()
        del info['headers']
        self.execution_log.append(info)

        logger.info(
            'Task exec time: expected = {:.3f}, actual = {:.3f}'.format(
                info['ETA'] - info['time_sent'],
                time.time() - info['time_sent']))
        # logger.info(f'ETA_offset = {self._queue_error[endpoint]:.3f}')

        # Stop tracking this task
        del self._pending[real_task_id]
        self._pending_by_endpoint[endpoint].remove(real_task_id)
        if info['task_id'] in self._task_info:
            del self._task_info[info['task_id']]

    def cold_start(self, endpoint, func):
        """Predict launch time plus import time of ``func`` on ``endpoint``."""
        # If endpoint is warm, there is no launch time
        if self.temperature[endpoint] != 'COLD':
            launch_time = 0.0
        # Otherwise, return the launch time in the endpoint config
        elif 'launch_time' in self._endpoints[endpoint]:
            launch_time = self._endpoints[endpoint]['launch_time']
        else:
            logger.warning(
                'Endpoint {} should always be warm, but is cold'.format(
                    endpoint_name(endpoint)))
            launch_time = 0.0

        # Time to import dependencies
        import_time = 0.0
        for pkg in self._imports_required[func]:
            if pkg not in self._imports[endpoint]:
                logger.debug(
                    'Cold-start has import time for pkg {} on {}'.format(
                        pkg, endpoint_name(endpoint)))
                import_time += self.import_predictor(pkg, endpoint)

        return launch_time + import_time

    def _monitor_tasks(self):
        """Thread body: submit scheduled tasks to FuncX when ready.

        Loops forever.  Tasks whose submission fails stay in the local
        ``scheduled`` dict and are retried on the next iteration.
        """
        logger.info('Starting task-watchdog thread')

        scheduled = {}

        while True:
            time.sleep(self._task_watchdog_sleep)

            # Get newly scheduled tasks
            while True:
                try:
                    task_id, end, num = self._scheduled_tasks.get_nowait()
                    if task_id not in self._task_info:
                        logger.warning(
                            'Task id {} scheduled but no info found'.format(
                                task_id))
                        continue
                    info = self._task_info[task_id]
                    scheduled[task_id] = dict(info)  # Create new copy of info
                    scheduled[task_id]['task_id'] = task_id
                    scheduled[task_id]['endpoint_id'] = end
                    scheduled[task_id]['transfer_num'] = num
                except Empty:
                    break

            # Filter out all tasks whose data transfer has not been
            # completed
            ready_to_send = set()
            for task_id, info in scheduled.items():
                transfer_num = info['transfer_num']
                if transfer_num is None:
                    ready_to_send.add(task_id)
                    info['transfer_time'] = 0.0
                elif self._transfer_manger.is_complete(transfer_num):
                    ready_to_send.add(task_id)
                    # pop() rather than del: if a previous submission
                    # attempt failed, this task is revisited and its entry
                    # has already been removed.
                    self._transfer_ETAs[info['endpoint_id']].pop(
                        transfer_num, None)
                    info['transfer_time'] = \
                        self._transfer_manger.get_transfer_time(transfer_num)
                else:
                    # This task cannot be scheduled yet
                    continue

            if len(ready_to_send) == 0:
                logger.debug(
                    'No new tasks to send. Task watchdog sleeping...')
                continue

            # TODO: different clients send different headers. change
            # eventually
            headers = list(scheduled.values())[0]['headers']

            logger.info('Scheduling a batch of {} tasks'.format(
                len(ready_to_send)))

            # Submit all ready tasks to FuncX
            data = {'tasks': []}
            for task_id in ready_to_send:
                info = scheduled[task_id]
                submit_info = (info['function_id'], info['endpoint_id'],
                               info['payload'])
                data['tasks'].append(submit_info)

            res_str = requests.post(f'{FUNCX_API}/submit', headers=headers,
                                    data=json.dumps(data))
            try:
                res = res_str.json()
            except ValueError:
                logger.error(f'Could not parse JSON from {res_str.text}')
                continue
            if res['status'] != 'Success':
                logger.error(
                    'Could not send tasks to FuncX. Got response: {}'.format(
                        res))
                continue

            # Update task info with submission info.  ``ready_to_send`` is
            # not modified between the loop above and this one, so both
            # iterate it in the same order and ids pair up correctly.
            for task_id, real_task_id in zip(ready_to_send,
                                             res['task_uuids']):
                info = scheduled[task_id]
                # This ETA calculation does not take into account transfer
                # time since, at this point, the transfer has already
                # completed.
                info['ETA'] = self.strategy.predict_ETA(
                    info['function_id'], info['endpoint_id'],
                    info['payload'])
                # Record if this ETA prediction is "reliable". If it is not
                # (e.g., when we have not learned about this (func, ep)
                # pair), backup tasks will not be sent for this task if it
                # is delayed.
                info['is_ETA_reliable'] = self.runtime.has_learned(
                    info['function_id'], info['endpoint_id'])
                info['time_sent'] = time.time()

                endpoint = info['endpoint_id']
                self._task_id_translation[task_id].add(real_task_id)
                self._pending[real_task_id] = info
                self._pending_by_endpoint[endpoint].add(real_task_id)

                # Record endpoint ETA for queue-delay prediction
                self._last_task_ETA[endpoint] = info['ETA']

                logger.info(
                    'Sent task id {} to {} with real task id {}'.format(
                        task_id, endpoint_name(endpoint), real_task_id))

            # Stop tracking all newly sent tasks
            for task_id in ready_to_send:
                del scheduled[task_id]

    def _check_endpoints(self):
        """Thread body: refresh endpoint liveness/temperature every 5 s."""
        logger.info('Starting endpoint-watchdog thread')

        while True:
            for end in self._endpoints:
                statuses = self._fxc.get_endpoint_status(end)
                if len(statuses) == 0:
                    logger.warning(
                        'Endpoint {} does not have any statuses'.format(
                            endpoint_name(end)))
                else:
                    status = statuses[0]  # Most recent endpoint status

                    # Mark endpoint as dead/alive based on heartbeat's age
                    # Heartbeats are delayed when an endpoint is executing
                    # tasks, so take into account last execution too
                    age = time.time() - max(status['timestamp'],
                                            self.last_result_time[end])
                    is_dead = end in self._dead_endpoints
                    if not is_dead and age > HEARTBEAT_THRESHOLD:
                        self._dead_endpoints.add(end)
                        logger.warning(
                            'Endpoint {} seems to have died! '
                            'Last heartbeat was {:.2f} seconds ago.'.format(
                                endpoint_name(end), age))
                    elif is_dead and age <= HEARTBEAT_THRESHOLD:
                        self._dead_endpoints.remove(end)
                        logger.warning(
                            'Endpoint {} is back alive! '
                            'Last heartbeat was {:.2f} seconds ago.'.format(
                                endpoint_name(end), age))

                    # Mark endpoint as "cold" or "warm" depending on if it
                    # has active managers (nodes) allocated to it
                    if self.temperature[end] == 'WARM' \
                            and status['active_managers'] == 0:
                        self.temperature[end] = 'COLD'
                        logger.info('Endpoint {} is cold!'.format(
                            endpoint_name(end)))
                    elif self.temperature[end] != 'WARM' \
                            and status['active_managers'] > 0:
                        self.temperature[end] = 'WARM'
                        logger.info('Endpoint {} is warm again!'.format(
                            endpoint_name(end)))

            # Send backup tasks if needed
            self._send_backups_if_needed()

            # Sleep before checking statuses again
            time.sleep(5)

    def _send_backups_if_needed(self):
        """Re-schedule virtual tasks that look stuck.

        A virtual task gets a backup when it still has task info (i.e. no
        submission of it has completed) and either one of its real tasks
        is pending on a dead endpoint, or a real task with a reliable ETA
        has taken more than ``backup_delay_threshold`` times its expected
        duration.  At most ``max_backups`` backups are sent per task.
        """
        # Work on a snapshot: the task watchdog and status updates add and
        # remove pending entries concurrently with this scan.
        pending = list(self._pending.items())
        now = time.time()

        task_ids = set()
        for _real_task_id, info in pending:
            task_id = info['task_id']

            # Another submission of this virtual task already completed
            # (its info was dropped), so there is nothing left to back up.
            if task_id not in self._task_info:
                continue

            # Pending (real) task on a dead endpoint
            if info['endpoint_id'] in self._dead_endpoints:
                task_ids.add(task_id)
                continue

            # If the predicted ETA wasn't reliable, don't send backups
            if not info['is_ETA_reliable']:
                continue

            expected = info['ETA'] - info['time_sent']
            if expected <= 0:
                # Degenerate prediction; the delay ratio is undefined.
                continue
            elapsed = now - info['time_sent']
            if elapsed / expected > self.backup_delay_threshold:
                task_ids.add(task_id)

        for task_id in task_ids:
            if len(self._endpoints_sent_to[task_id]) > self.max_backups:
                logger.debug(
                    f'Skipping sending new backup task for {task_id}')
                continue

            info = self._task_info.get(task_id)
            if info is None:
                # Completed between the scan above and now.
                continue

            logger.info(f'Sending new backup task for {task_id}')
            self._schedule_task(info['function_id'], info['payload'],
                                info['headers'], info['files'], task_id)
def __init__(self, abyss_id: str, globus_source_eid: str,
             transfer_token: str, compressed_files: List[Dict],
             worker_params: List[Dict], psql_conn, s3_conn, grouper="",
             batcher="mmd", dispatcher="fifo", prediction_mode="ml"):
    """Build an Abyss orchestrator and its (not yet started) threads.

    Parameters
    ----------
    abyss_id : str
        Abyss ID for orchestration.
    globus_source_eid : str
        Globus endpoint of source data storage.
    transfer_token : str
        Globus token to authorize transfers between endpoints.
    compressed_files : list(dict)
        Compressed files to process; each dict is turned into a job via
        ``Job.from_dict`` (the original notes list "file_path" and
        "compressed_size" as the contained keys).
    worker_params : list(dict)
        Worker parameter dictionaries, each turned into a worker via
        ``Worker.from_dict``.
    psql_conn :
        PostgreSQL connection object to update status.
    s3_conn :
        Connection object stored as ``self.s3_conn``.
        NOTE(review): earlier documentation called this ``sqs_conn`` (an
        SQS connection for pushing results) - confirm which is intended.
    grouper : str
        Name of grouper to use when crawling.
    batcher : str
        Name of batcher to use (forwarded to ``Scheduler``).
    dispatcher : str
        Name of dispatcher to use (forwarded to ``Scheduler``).
    prediction_mode : str
        Mode used to predict decompressed file size: "ml" for the machine
        learning method or "header" to use metadata stored in the header
        of compressed files (where possible).
    """
    # Plain configuration
    self.abyss_id = abyss_id
    self.globus_source_eid = globus_source_eid
    self.transfer_token = transfer_token
    self.grouper = grouper
    self.prediction_mode = prediction_mode

    # Workers, keyed by their id
    self.worker_dict = {
        w.worker_id: w for w in map(Worker.from_dict, worker_params)
    }

    # One prefetcher per worker, targeting that worker's Globus endpoint
    self.prefetchers = {
        w.worker_id: GlobusPrefetcher(self.transfer_token,
                                      self.globus_source_eid,
                                      w.globus_eid,
                                      w.transfer_dir,
                                      4)
        for w in self.worker_dict.values()
    }

    # One loaded size predictor per file type
    self.predictors = dict()
    for file_type, predictor_cls in FILE_PREDICTOR_MAPPING.items():
        loaded = predictor_cls()
        loaded.load_models()
        self.predictors[file_type] = loaded

    # One queue per job status
    self.job_statuses = {status: Queue() for status in JobStatus}

    # Every incoming file starts life as an UNPREDICTED job
    unpredicted_queue = self.job_statuses[JobStatus.UNPREDICTED]
    for file_dict in compressed_files:
        job = Job.from_dict(file_dict)
        job.status = JobStatus.UNPREDICTED
        job.file_id = str(uuid.uuid4())
        job.decompressed_size = 0
        unpredicted_queue.put(job)
        logger.info(
            f"LATENCY PLACING {job.file_id} INTO UNPREDICTED AT {time.time()}"
        )

    self.scheduler = Scheduler(batcher, dispatcher,
                               list(self.worker_dict.values()), [])
    self.worker_queues = dict()

    self.psql_conn = psql_conn
    self.abyss_metadata = []
    self.s3_conn = s3_conn

    def daemon_thread(target):
        # All orchestrator threads are daemon threads; none is started here.
        return threading.Thread(target=target, daemon=True)

    self._unpredicted_preprocessing_thread = daemon_thread(
        self._unpredicted_preprocessing)
    self._predictor_thread = daemon_thread(self._predict_decompressed_size)
    self._scheduler_thread = daemon_thread(self._thread_schedule_jobs)
    self._prefetcher_thread = daemon_thread(self._thread_prefetch)
    self._prefetcher_poll_thread = daemon_thread(self._thread_poll_prefetch)
    self._funcx_process_headers_thread = daemon_thread(
        self._thread_funcx_process_headers)
    self._funcx_decompress_thread = daemon_thread(
        self._thread_funcx_decompress)
    self._funcx_crawl_thread = daemon_thread(self._thread_funcx_crawl)
    self._funcx_poll_thread = daemon_thread(self._thread_funcx_poll)
    self._consolidate_results_thread = daemon_thread(
        self._thread_consolidate_crawl_results)

    self._lock = threading.Lock()
    self.thread_statuses = dict.fromkeys(
        (
            "predictor_thread",
            "scheduler_thread",
            "prefetcher_thread",
            "prefetcher_poll_thread",
            "funcx_decompress_thread",
            "funcx_crawl_thread",
            "funcx_poll_thread",
            "consolidate_results_thread",
        ),
        True,
    )

    self.funcx_client = FuncXClient()
    self.kill_status = False
    self.crawl_results = Queue()
sys.exit(1) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "-e", "--endpoint_id", required=True, help="Target endpoint to send functions to", ) parser.add_argument( "-v", "--ep_version", required=True, help="EP VERSION", ) parser.add_argument( "-w", "--worker_version", required=True, help="Target endpoint to send functions to", ) args = parser.parse_args() fx = FuncXExecutor(FuncXClient()) test_worker_version(fx, args.endpoint_id, args.ep_version, args.worker_version) test_app_exception(fx, args.endpoint_id, args.ep_version, args.worker_version) test_kill_manager(fx, args.endpoint_id, args.ep_version, args.worker_version)
from extractors.utils.base_event import create_event

# Free-standing description of the script (a bare string expression, kept
# verbatim).
""" This script will run the Gladier team's XPCS script on each file from the 2021-1 file set on Petrel. It will do so on Theta. """

# extractors =
# TODO:
# 1. Point to the right xpcs_data file.
# 2. Point to the right bunch of metadata files.

# funcX client created with default arguments.
fxc = FuncXClient()

# Extractor under test: exactly one of the lines below is active at a time;
# the others are kept as ready-to-swap alternatives.
# xpcs_x = XPCSExtractor()
# xpcs_x = NetCDFExtractor()
# xpcs_x = JsonXMLExtractor()
# xpcs_x = HDFExtractor()
# xpcs_x = ImagesExtractor()
# xpcs_x = KeywordExtractor()
# xpcs_x = PythonExtractor()
# xpcs_x = TabularExtractor()
xpcs_x = CCodeExtractor()
# xpcs_x = TikaExtractor()

# Presumably the funcX endpoint UUID to run on - confirm.
ep_id = "2293034e-4c9f-459c-a6f0-0ed310a8e618"
# NOTE(review): the name is "matio" while the active extractor above is
# CCodeExtractor - confirm the two are meant to differ.
extractor_name = "matio"
repo_name = "mdf"
def __init__(self, endpoint_id,
             process_function="301f653b-40b6-449e-ad2e-e57d3aaa33cd"):
    """Remember the target endpoint and create an asynchronous client.

    :param endpoint_id: endpoint the executor submits work to.
    :param process_function: id of the registered function to run;
        defaults to a fixed, pre-registered function id.
    """
    self.endpoint_id = endpoint_id
    self.process_function = process_function
    # Asynchronous client used for all submissions from this executor.
    self.fxc = FuncXClient(asynchronous=True)
) parser.add_argument( "-e", "--endpoint_id", required=True, help="Target endpoint to send functions to", ) parser.add_argument( "-d", "--debug", action="store_true", help="Count of apps to launch" ) args = parser.parse_args() endpoint_id = args.endpoint_id # set_stream_logger() fx = FuncXExecutor(FuncXClient(funcx_service_address=args.service_url)) print("In main") endpoint_id = args.endpoint_id future = fx.submit(double, 5, endpoint_id=endpoint_id) print("Got future back : ", future) for _i in range(5): time.sleep(0.2) # Non-blocking check whether future is done print("Is the future done? :", future.done()) print("Blocking for result") x = future.result() # <--- This is a blocking call print("Result : ", x)
# 'xtract-keyword/xtract-keyword.img', # 'xtract-images/xtract-images.img', # 'xtract-jsonxml/xtract-jsonxml.img', 'xtract-hdf/xtract-hdf.img', # 'xtract-netcdf/xtract-netcdf.img' ] def hello_container(event): import os return f"Container version: {os.environ['container_version']}" for container in all_containers: print(f"Using funcX version: {funcx.__version__}") fxc = FuncXClient() base_path = '/home/tskluzac/ext_repos/' container_path = os.path.join(base_path, container) print(f"Container path: {container_path}") container_uuid = fxc.register_container(container_path, 'singularity') fn_uuid = fxc.register_function( hdf_extract, container_uuid=container_uuid, description="New sum function defined without string spec") print(f"FN UUID: {fn_uuid}") res = fxc.run(sample_hdf_1, endpoint_id=js_ep_id, function_id=fn_uuid) print(res) for i in range(100): # TODO: break when successful
# Give each partial the metadata of the function it wraps
# (update_wrapper returns the wrapper it was handed).
my_update_mpnn = update_wrapper(my_update_mpnn, update_mpnn)
my_retrain_mpnn = update_wrapper(
    partial(retrain_mpnn,
            num_epochs=args.num_epochs,
            learning_rate=args.learning_rate,
            bootstrap=True,
            timeout=2700),
    retrain_mpnn,
)
my_run_simulation = update_wrapper(
    partial(run_simulation,
            n_nodes=args.nodes_per_task,
            spec=args.qc_specification),
    run_simulation,
)

# Create the task servers: ML functions go to the ML endpoint, the
# simulation function to the QC endpoint.
fx_client = FuncXClient()
task_map = {
    my_evaluate_mpnn: args.ml_endpoint,
    my_update_mpnn: args.ml_endpoint,
    my_retrain_mpnn: args.ml_endpoint,
    my_run_simulation: args.qc_endpoint,
}
doer = FuncXTaskServer(task_map, fx_client, server_queues)

# Configure the "thinker" application
thinker = Thinker(
    client_queues,
    database,
    args.search_space,
    args.search_size,
    args.retrain_frequency,
    args.retrain_from_scratch,
    models,
    args.molecules_per_ml_task,
    args.num_qc_workers,
    args.qc_specification,
    out_dir,
    args.beta,
    args.pause_during_update,
    ps_names,
)
logging.info('Created the method server and task generator')
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from funcx import FuncXClient

from sx_multi import run_coffea_processor

# One-off registration script: registers the coffea container and the
# processor function with funcX, then prints the resulting function id so it
# can be copied into the executor configuration.
fxc = FuncXClient()

container_id = fxc.register_container(
    "bengal1/funcx_coffea:add_schema_to_notebooks",
    "docker",
    name="Coffea Processor",
)

# The description must be passed by keyword: the second positional parameter
# of register_function is the function *name*, not its description.
function_id = fxc.register_function(
    run_coffea_processor,
    description="Run your coffea process code in a setup environment",
    container_uuid=container_id,
)

print("Function_ID is ", function_id)
def fxc(fxc_args):
    """Return a ``FuncXClient`` built from the keyword mapping ``fxc_args``."""
    return FuncXClient(**fxc_args)
class AbyssOrchestrator:
    """Drives compressed files through the Abyss pipeline.

    Every ``JobStatus`` value owns a queue in ``self.job_statuses``. A set of
    daemon threads moves jobs from queue to queue (predict -> schedule ->
    prefetch -> decompress -> crawl -> consolidate) until every non-terminal
    queue is empty and every thread reports itself idle.
    """

    def __init__(self, abyss_id: str, globus_source_eid: str,
                 transfer_token: str, compressed_files: List[Dict],
                 worker_params: List[Dict], psql_conn, s3_conn, grouper="",
                 batcher="mmd", dispatcher="fifo", prediction_mode="ml"):
        """Abyss orchestrator class.

        Parameters
        ----------
        abyss_id : str
            Abyss ID for orchestration.
        globus_source_eid : str
            Globus endpoint of source data storage.
        transfer_token : str
            Globus token to authorize transfers between endpoints.
        compressed_files : list(dict)
            List of dictionaries for compressed files to process.
            Dictionaries contain "file_path" and "compressed_size".
        worker_params : list(dict)
            List of valid worker parameter dictionaries to create workers.
        psql_conn :
            PostgreSQL connection object to update status.
        s3_conn :
            S3 connection object used to upload the consolidated metadata.
        grouper : str
            Name of grouper to use when crawling.
        batcher : str
            Name of batcher to use.
        dispatcher : str
            Name of dispatchers to use.
        prediction_mode: str
            Mode of prediction to use to predict decompressed file size.
            "ml" to use machine learning method or "header" to use
            metadata stored in the header of compressed files (where
            possible).
        """
        self.abyss_id = abyss_id
        self.globus_source_eid = globus_source_eid
        self.transfer_token = transfer_token
        self.grouper = grouper
        self.prediction_mode = prediction_mode

        self.worker_dict = dict()
        for worker_param in worker_params:
            worker = Worker.from_dict(worker_param)
            self.worker_dict[worker.worker_id] = worker

        # One prefetcher per worker, keyed by worker id.
        self.prefetchers = dict()
        for worker in self.worker_dict.values():
            globus_dest_eid = worker.globus_eid
            transfer_dir = worker.transfer_dir
            prefetcher = GlobusPrefetcher(self.transfer_token,
                                          self.globus_source_eid,
                                          globus_dest_eid,
                                          transfer_dir,
                                          4)
            self.prefetchers[worker.worker_id] = prefetcher

        self.predictors = dict()
        for file_type, predictor in FILE_PREDICTOR_MAPPING.items():
            file_predictor = predictor()
            file_predictor.load_models()
            self.predictors[file_type] = file_predictor

        # One queue per job status; jobs move between them as they progress.
        self.job_statuses = {status: Queue() for status in JobStatus}

        unpredicted_set = self.job_statuses[JobStatus.UNPREDICTED]
        for compressed_file in compressed_files:
            job = Job.from_dict(compressed_file)
            job.status = JobStatus.UNPREDICTED
            job.file_id = str(uuid.uuid4())
            job.decompressed_size = 0
            unpredicted_set.put(job)
            logger.info(
                f"LATENCY PLACING {job.file_id} INTO UNPREDICTED AT {time.time()}"
            )

        self.scheduler = Scheduler(batcher, dispatcher,
                                   list(self.worker_dict.values()), [])
        self.worker_queues = dict()

        self.psql_conn = psql_conn
        self.abyss_metadata = []
        self.s3_conn = s3_conn

        self._unpredicted_preprocessing_thread = threading.Thread(
            target=self._unpredicted_preprocessing, daemon=True)
        self._predictor_thread = threading.Thread(
            target=self._predict_decompressed_size, daemon=True)
        self._scheduler_thread = threading.Thread(
            target=self._thread_schedule_jobs, daemon=True)
        self._prefetcher_thread = threading.Thread(
            target=self._thread_prefetch, daemon=True)
        self._prefetcher_poll_thread = threading.Thread(
            target=self._thread_poll_prefetch, daemon=True)
        self._funcx_process_headers_thread = threading.Thread(
            target=self._thread_funcx_process_headers, daemon=True)
        self._funcx_decompress_thread = threading.Thread(
            target=self._thread_funcx_decompress, daemon=True)
        self._funcx_crawl_thread = threading.Thread(
            target=self._thread_funcx_crawl, daemon=True)
        self._funcx_poll_thread = threading.Thread(
            target=self._thread_funcx_poll, daemon=True)
        self._consolidate_results_thread = threading.Thread(
            target=self._thread_consolidate_crawl_results, daemon=True)

        self._lock = threading.Lock()
        # Every key a worker thread writes is created up front. Otherwise a
        # thread inserting its key for the first time could resize the dict
        # while _update_kill_status is iterating over it.
        self.thread_statuses = {
            "unpredicted_preprocessing_thread": True,
            "predictor_thread": True,
            "scheduler_thread": True,
            "prefetcher_thread": True,
            "prefetcher_poll_thread": True,
            "funcx_processing_headers_thread": True,
            "funcx_decompress_thread": True,
            "funcx_crawl_thread": True,
            "funcx_poll_thread": True,
            "consolidate_results_thread": True
        }

        self.funcx_client = FuncXClient()

        self.kill_status = False
        self.crawl_results = Queue()

    @staticmethod
    def validate_dict_params(orchestrator_params: Dict) -> None:
        """Ensures dictionary of orchestrator parameters contains
        necessary parameters.

        Parameters
        ----------
        orchestrator_params : dict
            Dictionary containing parameters for AbyssOrchestrator
            object.

        Returns
        -------
        Returns None if parameters are valid.

        Raises
        ------
        ValueError
            If a required parameter is missing or has the wrong type.
        """
        for parameter_name, parameter_type in REQUIRED_ORCHESTRATOR_PARAMETERS:
            try:
                parameter = orchestrator_params[parameter_name]
            except KeyError:
                raise ValueError(
                    f"Required parameter {parameter_name} not found")
            # Explicit check instead of ``assert`` so that validation still
            # runs when Python is started with -O.
            if not isinstance(parameter, parameter_type):
                raise ValueError(
                    f"Parameter {parameter_name} is not of type {parameter_type}")

        worker_params = orchestrator_params["worker_params"]
        for worker_param in worker_params:
            Worker.validate_dict_params(worker_param)

    def start(self) -> None:
        """Runs the orchestration loop in a new (non-daemon) thread."""
        threading.Thread(target=self._orchestrate).start()

    def _update_kill_status(self) -> None:
        """Checks whether all jobs are either succeeded or failed.

        ``self.kill_status`` becomes True only when every queue other than
        SUCCEEDED/FAILED is empty and no thread reports itself as busy.

        Returns
        -------
        None
        """
        for status in JobStatus:
            if status in [JobStatus.SUCCEEDED, JobStatus.FAILED]:
                continue
            if not self.job_statuses[status].empty():
                self.kill_status = False
                return

        for status in self.thread_statuses.values():
            if status:
                self.kill_status = False
                return

        self.kill_status = True
        logger.info(f"KILL STATUS {self.kill_status}")

    def _update_psql_entry(self) -> None:
        """Updates a PostgreSQL entry with orchestration status. Assumes
        that a table entry has already been created.

        Returns
        -------
        None
        """
        table_entry = dict()

        for job_status, job_queue in self.job_statuses.items():
            table_entry[job_status.value.lower()] = job_queue.qsize()

        logger.info(table_entry)
        logger.info(self.thread_statuses)

        for worker in self.worker_dict.values():
            logger.info(
                f"{worker.worker_id} has {worker.curr_available_space}")

        update_table_entry(self.psql_conn, "abyss_status",
                           {"abyss_id": self.abyss_id},
                           **table_entry)

    def _orchestrate(self) -> None:
        """
        Step 1: Predict sizes of jobs using ML predictors
        Step 2: Batch jobs to worker using Batchers
        Step 3: Begin transferring files one at a time to each worker using
        one Prefetcher item per worker.
        Step 4: Constantly poll prefetcher for file completion.
        Step 5: When a file is done, send a funcx job request to crawl on
        worker
        Step 6: Poll funcx result
        Step 7: Consolidate results and upload the metadata to S3

        Returns
        -------
        None
        """
        logger.info("STARTING ORCHESTRATION")
        worker_threads = [
            self._unpredicted_preprocessing_thread,
            self._predictor_thread,
            self._scheduler_thread,
            self._prefetcher_thread,
            self._prefetcher_poll_thread,
            self._funcx_process_headers_thread,
            self._funcx_decompress_thread,
            self._funcx_crawl_thread,
            self._funcx_poll_thread,
            self._consolidate_results_thread,
        ]
        for worker_thread in worker_threads:
            worker_thread.start()

        t0 = time.time()
        while not self.kill_status:
            time.sleep(1)
            self._update_kill_status()
            self._update_psql_entry()
            logger.info(f"ELAPSED: {time.time() - t0}")

        for worker_thread in worker_threads:
            worker_thread.join()

        logger.info("PUSHING METADATA TO S3")

        metadata_file_path = os.path.join("/tmp", f"{self.abyss_id}.txt")

        with open(metadata_file_path, "w") as f:
            f.write("\n".join(
                json.dumps(metadata) for metadata in self.abyss_metadata))

        # Remove the temporary file even when the upload raises.
        try:
            s3_upload_file(self.s3_conn, "xtract-abyss", metadata_file_path,
                           f"{self.abyss_id}.txt")
        finally:
            os.remove(metadata_file_path)

    def _unpredicted_preprocessing(self) -> None:
        """Determines whether to use machine learning or file headers for
        decompressed size prediction and places jobs into respective
        queues.

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If ``self.prediction_mode`` is neither "ml" nor "header".
        """
        while not self.kill_status:
            unpredicted_queue = self.job_statuses[JobStatus.UNPREDICTED]
            unpredicted_predict_queue = self.job_statuses[
                JobStatus.UNPREDICTED_PREDICT]
            unpredicted_schedule_queue = self.job_statuses[
                JobStatus.UNPREDICTED_SCHEDULE]

            while not unpredicted_queue.empty():
                self.thread_statuses["unpredicted_preprocessing_thread"] = True
                job = unpredicted_queue.get()

                # If a file is recursively compressed we will use machine
                # learning to predict the file size. We only use file headers
                # if the compressed file is directly stored on our storage
                # source.
                if self.prediction_mode == "ml" or job.status != JobStatus.UNPREDICTED:
                    if job.status == JobStatus.UNPREDICTED:
                        job.status = JobStatus.UNPREDICTED_PREDICT
                    unpredicted_predict_queue.put(job)
                    logger.info(
                        f"PLACING {job.file_path} IN UNPREDICTED PREDICT")
                elif self.prediction_mode == "header":
                    if job.file_path.endswith((".zip", ".tar")):
                        job.status = JobStatus.UNPREDICTED_SCHEDULE
                        unpredicted_schedule_queue.put(job)
                        logger.info(
                            f"PLACING {job.file_path} IN UNPREDICTED SCHEDULE")
                    else:
                        unpredicted_predict_queue.put(job)
                        logger.info(
                            f"PLACING {job.file_path} IN UNPREDICTED PREDICT")
                else:
                    # Stop the other threads before this one dies.
                    self.kill_status = True
                    raise ValueError(
                        f"Unknown prediction mode \"{self.prediction_mode}\"")

            self.thread_statuses["unpredicted_preprocessing_thread"] = False

    def _predict_decompressed_size(self) -> None:
        """Runs decompression size predictions on every job waiting in the
        UNPREDICTED_PREDICT queue and then places it in the PREDICTED
        queue.

        Returns
        -------
        None
        """
        while not self.kill_status:
            unpredicted_queue = self.job_statuses[
                JobStatus.UNPREDICTED_PREDICT]
            predicted_queue = self.job_statuses[JobStatus.PREDICTED]

            while not unpredicted_queue.empty():
                self.thread_statuses["predictor_thread"] = True
                job = unpredicted_queue.get()

                for job_node in job.bfs_iterator(include_root=True):
                    if job_node.status in [JobStatus.UNPREDICTED,
                                           JobStatus.UNPREDICTED_PREDICT]:
                        file_path = job_node.file_path
                        file_extension = Predictor.get_extension(file_path)
                        predictor = self.predictors[file_extension]

                        # A node that already has a size gets repredicted
                        # from that size rather than predicted from scratch.
                        if job_node.decompressed_size:
                            decompressed_size = predictor.repredict(
                                job_node.decompressed_size)
                            logger.info(
                                f"REPREDICTED {job.file_path} WITH DECOMPRESSED SIZE {decompressed_size}"
                            )
                        else:
                            compressed_size = job_node.compressed_size
                            decompressed_size = predictor.predict(
                                file_path, compressed_size)
                            logger.info(
                                f"PREDICTED {job.file_path} WITH DECOMPRESSED SIZE {decompressed_size}"
                            )

                        with self._lock:
                            job_node.decompressed_size = decompressed_size
                            job_node.status = JobStatus.PREDICTED

                logger.info(
                    f"LATENCY PLACING {job.file_id} INTO PREDICTED AT {time.time()}"
                )
                predicted_queue.put(job)

            self.thread_statuses["predictor_thread"] = False

    def _thread_schedule_jobs(self) -> None:
        """Schedules jobs from the PREDICTED and UNPREDICTED_SCHEDULE queues
        into worker queues in self.worker_queues.

        Returns
        -------
        None
        """
        while not self.kill_status:
            predicted_queue = self.job_statuses[JobStatus.PREDICTED]
            unpredicted_schedule_queue = self.job_statuses[
                JobStatus.UNPREDICTED_SCHEDULE]
            unpredicted_scheduled_queue = self.job_statuses[
                JobStatus.UNPREDICTED_SCHEDULED]
            scheduled_queue = self.job_statuses[JobStatus.SCHEDULED]
            failed_queue = self.job_statuses[JobStatus.FAILED]

            with self._lock:
                predicted_list = []
                while not predicted_queue.empty():
                    self.thread_statuses["scheduler_thread"] = True
                    job = predicted_queue.get()
                    logger.info(f"{job.file_path} SCHEDULING")
                    job.calculate_total_size()
                    predicted_list.append(job)

                while not unpredicted_schedule_queue.empty():
                    self.thread_statuses["scheduler_thread"] = True
                    job = unpredicted_schedule_queue.get()
                    logger.info(f"{job.file_path} UNPREDICTED SCHEDULING")
                    job.calculate_total_size()
                    predicted_list.append(job)

                self.scheduler.schedule_jobs(predicted_list)
                self.worker_queues = self.scheduler.worker_queues
                failed_jobs = self.scheduler.failed_jobs

                for job in predicted_list:
                    # Reset for every job: a stale value from the previous
                    # job would send a job that failed to schedule into the
                    # scheduled queue.
                    queue = None
                    for job_node in job.bfs_iterator(include_root=True):
                        if job_node in failed_jobs:
                            job_node.status = JobStatus.FAILED
                            job_node.error = "Could not schedule"
                            logger.info(f"FAILED TO SCHEDULE {job.file_path}")
                        elif job_node.status == JobStatus.PREDICTED:
                            job_node.status = JobStatus.SCHEDULED
                            queue = JobStatus.SCHEDULED
                        elif job_node.status == JobStatus.UNPREDICTED_SCHEDULE:
                            job_node.status = JobStatus.UNPREDICTED_SCHEDULED
                            queue = JobStatus.UNPREDICTED_SCHEDULED

                    if queue:
                        if queue == JobStatus.SCHEDULED:
                            logger.info(
                                f"LATENCY PLACING {job.file_id} INTO SCHEDULED AT {time.time()}"
                            )
                            scheduled_queue.put(job)
                            logger.info(f"{job.file_path} SCHEDULED")
                        elif queue == JobStatus.UNPREDICTED_SCHEDULED:
                            unpredicted_scheduled_queue.put(job)
                            logger.info(
                                f"{job.file_path} UNPREDICTED SCHEDULED")
                    else:
                        logger.info(
                            f"LATENCY PLACING {job.file_id} INTO FAILED AT {time.time()}"
                        )
                        logger.info(f"{job.file_path} PLACED INTO FAILED")
                        failed_queue.put(job)

            self.thread_statuses["scheduler_thread"] = False

    def _thread_prefetch(self) -> None:
        """Places jobs into queue for prefetcher to transfer.

        Returns
        -------
        None
        """
        while not self.kill_status:
            scheduled_queue = self.job_statuses[JobStatus.SCHEDULED]
            unpredicted_scheduled_queue = self.job_statuses[
                JobStatus.UNPREDICTED_SCHEDULED]
            prefetching_queue = self.job_statuses[JobStatus.PREFETCHING]
            unpredicted_prefetching_queue = self.job_statuses[
                JobStatus.UNPREDICTED_PREFETCHING]

            with self._lock:
                for worker_id, worker_queue in self.worker_queues.items():
                    prefetcher = self.prefetchers[worker_id]
                    jobs_to_prefetch = []

                    while len(worker_queue):
                        self.thread_statuses["prefetcher_thread"] = True
                        job = worker_queue.popleft()
                        logger.info(f"{job.file_path} PREFETCHING")

                        worker_id = job.worker_id
                        jobs_to_prefetch.append(job)
                        job.transfer_path = f"{self.worker_dict[worker_id].transfer_dir}/{job.file_id}"

                        for job_node in job.bfs_iterator(include_root=True):
                            if job_node.status == JobStatus.SCHEDULED:
                                job_node.status = JobStatus.PREFETCHING
                            elif job_node.status == JobStatus.UNPREDICTED_SCHEDULED:
                                job_node.status = JobStatus.UNPREDICTED_PREFETCHING

                        # The get() below drops one entry from the matching
                        # "scheduled" queue so its size keeps tracking the
                        # number of jobs still waiting there.
                        if job.status == JobStatus.UNPREDICTED_PREFETCHING:
                            unpredicted_prefetching_queue.put(job)
                            unpredicted_scheduled_queue.get()
                            logger.info(
                                f"{job.file_path} PLACED INTO UNPREDICTED PREFETCHING"
                            )
                        else:
                            prefetching_queue.put(job)
                            scheduled_queue.get()
                            logger.info(
                                f"{job.file_path} PLACED INTO PREFETCHING")

                    prefetcher.transfer_job_batch(jobs_to_prefetch)

                    for job in jobs_to_prefetch:
                        logger.info(
                            f"LATENCY PLACING {job.file_id} INTO PREFETCHING AT {time.time()}"
                        )

            self.thread_statuses["prefetcher_thread"] = False
            time.sleep(4)

    def _thread_poll_prefetch(self) -> None:
        """Thread function to poll prefetcher and update
        self.job_statuses.

        Returns
        -------
        None
        """
        while not self.kill_status:
            prefetching_queue = self.job_statuses[JobStatus.PREFETCHING]
            unpredicted_prefetching_queue = self.job_statuses[
                JobStatus.UNPREDICTED_PREFETCHING]
            unpredicted_prefetched_queue = self.job_statuses[
                JobStatus.UNPREDICTED_PREFETCHED]
            prefetched_queue = self.job_statuses[JobStatus.PREFETCHED]
            failed_queue = self.job_statuses[JobStatus.FAILED]

            for _ in range(prefetching_queue.qsize() +
                           unpredicted_prefetching_queue.qsize()):
                self.thread_statuses["prefetcher_poll_thread"] = True

                if prefetching_queue.empty():
                    job = unpredicted_prefetching_queue.get()
                else:
                    job = prefetching_queue.get()

                logger.info(f"{job.file_path} POLL PREFETCH")
                file_path = job.file_path
                worker_id = job.worker_id
                prefetcher = self.prefetchers[worker_id]

                prefetcher_status = prefetcher.get_transfer_status(file_path)

                if prefetcher_status == PrefetcherStatuses.SUCCEEDED:
                    for job_node in job.bfs_iterator(include_root=True):
                        if job_node.status == JobStatus.PREFETCHING:
                            job_node.status = JobStatus.PREFETCHED
                        elif job_node.status == JobStatus.UNPREDICTED_PREFETCHING:
                            job_node.status = JobStatus.UNPREDICTED_PREFETCHED

                    if job.status == JobStatus.UNPREDICTED_PREFETCHED:
                        unpredicted_prefetched_queue.put(job)
                        logger.info(
                            f"{job.file_path} PLACED INTO UNPREDICTED PREFETCHED"
                        )
                    else:
                        prefetched_queue.put(job)
                        logger.info(
                            f"LATENCY PLACING {job.file_id} INTO PREFETCHED AT {time.time()}"
                        )
                        logger.info(f"{job.file_path} PLACED INTO PREFETCHED")
                elif prefetcher_status == PrefetcherStatuses.FAILED:
                    for job_node in job.bfs_iterator(include_root=True):
                        if job_node.status == JobStatus.PREFETCHING or job_node.status == JobStatus.UNPREDICTED_PREFETCHING:
                            job_node.status = JobStatus.FAILED
                    logger.info(f"{job.file_path} FAILED TO PREFETCH")
                    # Potentially add more logic here or in prefetcher to
                    # restart failed transfer
                    failed_queue.put(job)
                else:
                    # Transfer still in flight: requeue and poll again later.
                    if job.status == JobStatus.UNPREDICTED_PREFETCHING:
                        unpredicted_prefetching_queue.put(job)
                    else:
                        prefetching_queue.put(job)

            self.thread_statuses["prefetcher_poll_thread"] = False
            time.sleep(5)

    def _thread_funcx_process_headers(self) -> None:
        """Thread function to submit header processing tasks to funcX.

        Returns
        -------
        None
        """
        while not self.kill_status:
            unpredicted_prefetched_queue = self.job_statuses[
                JobStatus.UNPREDICTED_PREFETCHED]
            processing_headers_queue = self.job_statuses[
                JobStatus.PROCESSING_HEADERS]

            batch = self.funcx_client.create_batch()
            batched_jobs = []
            while not unpredicted_prefetched_queue.empty():
                self.thread_statuses["funcx_processing_headers_thread"] = True
                job = unpredicted_prefetched_queue.get()
                logger.info(f"{job.file_path} PROCESSING HEADERS")
                job_dict = Job.to_dict(job)
                worker_id = job.worker_id

                worker = self.worker_dict[worker_id]
                batch.add(job_dict,
                          endpoint_id=worker.funcx_eid,
                          function_id=PROCESS_HEADER_FUNCX_UUID)
                batched_jobs.append(job)

            # batch_res is only indexed below when batched_jobs is non-empty,
            # in which case the batch has tasks and was submitted.
            if len(batch.tasks) > 0:
                batch_res = self.funcx_client.batch_run(batch)
            else:
                batch_res = None

            for idx, job in enumerate(batched_jobs):
                job.funcx_process_headers_id = batch_res[idx]
                job.status = JobStatus.PROCESSING_HEADERS

                processing_headers_queue.put(job)
                logger.info(f"{job.file_path} PROCESSING HEADERS QUEUE")

            time.sleep(5)

            self.thread_statuses["funcx_processing_headers_thread"] = False

    # TODO: Consolidate this and _thread_funcx_crawl into one function
    def _thread_funcx_decompress(self) -> None:
        """Thread function to submit decompression tasks to funcX.

        Returns
        -------
        None
        """
        while not self.kill_status:
            prefetched_queue = self.job_statuses[JobStatus.PREFETCHED]
            decompressing_queue = self.job_statuses[JobStatus.DECOMPRESSING]

            batch = self.funcx_client.create_batch()
            batched_jobs = []
            while not prefetched_queue.empty():
                self.thread_statuses["funcx_decompress_thread"] = True
                job = prefetched_queue.get()
                job_dict = Job.to_dict(job)
                worker_id = job.worker_id

                worker = self.worker_dict[worker_id]
                batch.add(job_dict, worker.decompress_dir,
                          endpoint_id=worker.funcx_eid,
                          function_id=DECOMPRESSOR_FUNCX_UUID)
                batched_jobs.append(job)

            if len(batch.tasks) > 0:
                batch_res = self.funcx_client.batch_run(batch)
            else:
                batch_res = None

            for idx, job in enumerate(batched_jobs):
                logger.info(f"{job.file_path} DECOMPRESSING")
                for job_node in job.bfs_iterator(include_root=True):
                    job_node.funcx_decompress_id = batch_res[idx]
                    if job_node.status == JobStatus.PREFETCHED:
                        job_node.status = JobStatus.DECOMPRESSING

                decompressing_queue.put(job)
                logger.info(
                    f"LATENCY PLACING {job.file_id} INTO DECOMPRESSING AT {time.time()}"
                )

            time.sleep(5)

            self.thread_statuses["funcx_decompress_thread"] = False

    def _thread_funcx_crawl(self) -> None:
        """Thread function to submit crawl tasks to funcX.

        Returns
        -------
        None
        """
        while not self.kill_status:
            decompressed_queue = self.job_statuses[JobStatus.DECOMPRESSED]
            crawling_queue = self.job_statuses[JobStatus.CRAWLING]

            batch = self.funcx_client.create_batch()
            batched_jobs = []
            while not decompressed_queue.empty():
                self.thread_statuses["funcx_crawl_thread"] = True
                job = decompressed_queue.get()
                logger.info(f"{job.file_path} CRAWLING")
                job_dict = Job.to_dict(job)
                worker_id = job.worker_id

                worker = self.worker_dict[worker_id]
                batch.add(job_dict, "",
                          endpoint_id=worker.funcx_eid,
                          function_id=LOCAL_CRAWLER_FUNCX_UUID)
                batched_jobs.append(job)

            if len(batch.tasks) > 0:
                batch_res = self.funcx_client.batch_run(batch)
            else:
                batch_res = None

            for idx, job in enumerate(batched_jobs):
                logger.info(
                    f"LATENCY PLACING {job.file_id} INTO CRAWLING AT {time.time()}"
                )
                for job_node in job.bfs_iterator(include_root=True):
                    job_node.funcx_crawl_id = batch_res[idx]
                    if job_node.status == JobStatus.DECOMPRESSED:
                        job_node.status = JobStatus.CRAWLING

                crawling_queue.put(job)

            time.sleep(5)

            self.thread_statuses["funcx_crawl_thread"] = False

    def _thread_funcx_poll(self) -> None:
        """Thread function to poll funcX for results.

        Each pass polls, in order, header-processing, decompression and
        crawl tasks, sleeping five seconds after each group.

        Returns
        -------
        None
        """
        unpredicted_queue = self.job_statuses[JobStatus.UNPREDICTED]
        decompressing_queue = self.job_statuses[JobStatus.DECOMPRESSING]
        decompressed_queue = self.job_statuses[JobStatus.DECOMPRESSED]
        crawling_queue = self.job_statuses[JobStatus.CRAWLING]
        processing_headers_queue = self.job_statuses[
            JobStatus.PROCESSING_HEADERS]
        predicted_queue = self.job_statuses[JobStatus.PREDICTED]
        consolidating_queue = self.job_statuses[JobStatus.CONSOLIDATING]
        failed_queue = self.job_statuses[JobStatus.FAILED]

        while not self.kill_status:
            # --- header processing ---
            processing_headers_funcx_ids = []
            processing_header_jobs = []
            while not processing_headers_queue.empty():
                self.thread_statuses["funcx_poll_thread"] = True
                job = processing_headers_queue.get()
                logger.info(f"{job.file_path} POLLING HEADER PROCESSING")
                processing_headers_funcx_ids.append(
                    job.funcx_process_headers_id)
                processing_header_jobs.append(job)

            processing_headers_statuses = self.funcx_client.get_batch_status(
                task_id_list=processing_headers_funcx_ids)
            for job in processing_header_jobs:
                worker = self.worker_dict[job.worker_id]
                job_status = processing_headers_statuses[
                    job.funcx_process_headers_id]

                if job_status["pending"]:
                    processing_headers_queue.put(job)
                elif job_status["status"] == "success":
                    logger.info(f"{job.file_path} COMPLETED HEADER PROCESSING")
                    job = Job.from_dict(job_status["result"])
                    job.status = JobStatus.PREDICTED

                    worker.curr_available_space += job.compressed_size
                    predicted_queue.put(job)
                elif job_status["status"] == "failed":
                    # Header processing failed: fall back to the predictor
                    # queue for this job.
                    worker.curr_available_space += job.compressed_size
                    unpredicted_predict_queue = self.job_statuses[
                        JobStatus.UNPREDICTED_PREDICT]
                    job.status = JobStatus.UNPREDICTED_PREDICT
                    unpredicted_predict_queue.put(job)

            time.sleep(5)

            # --- decompression ---
            decompressing_funcx_ids = []
            decompressing_jobs = []
            while not decompressing_queue.empty():
                self.thread_statuses["funcx_poll_thread"] = True
                job = decompressing_queue.get()
                logger.info(f"{job.file_path} POLLING DECOMPRESS")
                decompressing_funcx_ids.append(job.funcx_decompress_id)
                decompressing_jobs.append(job)

            decompressing_statuses = self.funcx_client.get_batch_status(
                decompressing_funcx_ids)
            for job in decompressing_jobs:
                worker = self.worker_dict[job.worker_id]
                job_status = decompressing_statuses[job.funcx_decompress_id]
                logger.info(job_status)

                if job_status["pending"]:
                    decompressing_queue.put(job)
                elif job_status["status"] == "success":
                    job = Job.from_dict(job_status["result"])
                    logger.info(f"{job.file_path} COMPLETED DECOMPRESS")

                    if job.status == JobStatus.FAILED:
                        worker.curr_available_space += job.total_size
                        failed_queue.put(job)
                        logger.info(f"{job.file_path} PLACED INTO FAILED")
                        logger.info(
                            f"LATENCY PLACING {job.file_id} INTO FAILED AT {time.time()}"
                        )
                        continue

                    has_unpredicted = False
                    for job_node in job.bfs_iterator(include_root=True):
                        if job_node.status == JobStatus.DECOMPRESSING:
                            job_node.status = JobStatus.DECOMPRESSED
                        elif job_node.status == JobStatus.UNPREDICTED:
                            has_unpredicted = True

                    # NOTE(review): a job with unpredicted nodes is placed in
                    # the unpredicted queue and, right after, also in the
                    # decompressed queue -- confirm this is intended.
                    if has_unpredicted:
                        unpredicted_queue.put(job)
                        logger.info(
                            f"LATENCY PLACING {job.file_id} INTO UNPREDICTED AT {time.time()}"
                        )
                        logger.info(f"{job.file_path} PLACED INTO UNPREDICTED")

                    worker.curr_available_space += job.compressed_size
                    decompressed_queue.put(job)
                    logger.info(
                        f"LATENCY PLACING {job.file_id} INTO DECOMPRESSED AT {time.time()}"
                    )
                    logger.info(f"{job.file_path} PLACED INTO DECOMPRESSED")
                elif job_status["status"] == "failed":
                    worker.curr_available_space += job.compressed_size
                    logger.info(
                        f"ERROR for {job.file_path}: {job_status['exception']}"
                    )
                    logger.info(f"{job.file_path} PLACED INTO FAILED")
                    failed_queue.put(job)
                    logger.info(
                        f"LATENCY PLACING {job.file_id} INTO FAILED AT {time.time()}"
                    )

            time.sleep(5)

            # --- crawling ---
            crawling_funcx_ids = []
            crawling_jobs = []
            while not crawling_queue.empty():
                self.thread_statuses["funcx_poll_thread"] = True
                job = crawling_queue.get()
                logger.info(f"{job.file_path} POLLING CRAWL")
                crawling_funcx_ids.append(job.funcx_crawl_id)
                crawling_jobs.append(job)

            crawling_statuses = self.funcx_client.get_batch_status(
                crawling_funcx_ids)
            for job in crawling_jobs:
                worker = self.worker_dict[job.worker_id]
                job_status = crawling_statuses[job.funcx_crawl_id]

                if job_status["pending"]:
                    crawling_queue.put(job)
                elif job_status["status"] == "success":
                    result = job_status["result"]
                    job = Job.from_dict(result)
                    logger.info(f"{job.file_path} COMPLETED CRAWL")

                    for job_node in job.bfs_iterator(include_root=True):
                        if job_node.status == JobStatus.CRAWLING:
                            job_node.status = JobStatus.CONSOLIDATING

                    worker.curr_available_space += (job.total_size -
                                                    job.compressed_size)
                    consolidating_queue.put(job)
                    logger.info(
                        f"LATENCY PLACING {job.file_id} INTO CONSOLIDATING AT {time.time()}"
                    )
                    logger.info(f"{job.file_path} PLACED INTO CONSOLIDATING")
                elif job_status["status"] == "failed":
                    worker.curr_available_space += (job.total_size -
                                                    job.compressed_size)
                    failed_queue.put(job)
                    logger.info(f"{job.file_path} PLACED INTO FAILED")
                    logger.info(
                        f"LATENCY PLACING {job.file_id} INTO FAILED AT {time.time()}"
                    )

            time.sleep(5)

            self.thread_statuses["funcx_poll_thread"] = False

    def _thread_consolidate_crawl_results(self) -> None:
        """Thread function to consolidate crawl results into
        self.abyss_metadata.

        Compressed files discovered by the crawl become child jobs and the
        parent job is resubmitted through the unpredicted queue. Jobs in the
        failed queue are consolidated as well.

        Returns
        -------
        None
        """
        while not self.kill_status:
            unpredicted_queue = self.job_statuses[JobStatus.UNPREDICTED]
            consolidating_queue = self.job_statuses[JobStatus.CONSOLIDATING]
            succeeded_queue = self.job_statuses[JobStatus.SUCCEEDED]
            failed_queue = self.job_statuses[JobStatus.FAILED]

            while not consolidating_queue.empty():
                self.thread_statuses["consolidate_results_thread"] = True
                job = consolidating_queue.get()
                logger.info(f"{job.file_path} CONSOLIDATING")

                resubmit_task = False
                for job_node in job.bfs_iterator(include_root=True):
                    root_path = job_node.metadata["root_path"]
                    for file_path, file_metadata in job_node.metadata[
                            "metadata"].items():
                        file_size = file_metadata["physical"]["size"]
                        is_compressed = file_metadata["physical"][
                            "is_compressed"]

                        child_file_path = os.path.join(root_path, file_path)

                        if is_compressed:
                            if "decompressed_size" in file_metadata[
                                    "physical"]:
                                decompressed_size = file_metadata["physical"][
                                    "decompressed_size"]
                            else:
                                decompressed_size = None

                            # NOTE(review): an already-known child stops the
                            # scan of this node's remaining files -- confirm
                            # ``break`` (rather than skipping one file) is
                            # intended.
                            if child_file_path in job_node.child_jobs:
                                break
                            else:
                                child_job = Job(file_path=child_file_path,
                                                file_id=str(uuid.uuid4()),
                                                compressed_size=file_size)

                                if decompressed_size:
                                    child_job.decompressed_size = decompressed_size
                                    child_job.status = JobStatus.PREDICTED
                                else:
                                    child_job.status = JobStatus.UNPREDICTED

                                job_node.child_jobs[
                                    child_file_path] = child_job
                                resubmit_task = True

                if resubmit_task:
                    logger.info(f"{job.file_path} RESUBMITTING")
                    unpredicted_queue.put(job)
                    logger.info(
                        f"LATENCY PLACING {job.file_id} INTO UNPREDICTED AT {time.time()}"
                    )
                    continue

                consolidated_metadata = job.consolidate_metadata()
                self.abyss_metadata.append(consolidated_metadata)

                for job_node in job.bfs_iterator(include_root=True):
                    if job_node.status == JobStatus.CONSOLIDATING:
                        job_node.status = JobStatus.SUCCEEDED

                succeeded_queue.put(job)
                logger.info(f"{job.file_path} PLACED INTO SUCCEEDED")
                logger.info(
                    f"LATENCY PLACING {job.file_id} INTO SUCCEEDED AT {time.time()}"
                )

            # Failed jobs still contribute whatever metadata they have; they
            # are moved to the succeeded queue without changing node statuses.
            while not failed_queue.empty():
                job = failed_queue.get()
                logger.info(f"{job.file_path} CONSOLIDATING FROM FAILED")
                consolidated_metadata = job.consolidate_metadata()
                self.abyss_metadata.append(consolidated_metadata)
                succeeded_queue.put(job)
                logger.info(
                    f"LATENCY PLACING {job.file_id} INTO SUCCEEDED AT {time.time()}"
                )

            self.thread_statuses["consolidate_results_thread"] = False
def async_fxc(fxc_args):
    """Build a FuncXClient from ``fxc_args`` with ``asynchronous=True``."""
    return FuncXClient(**fxc_args, asynchronous=True)
def fxc(funcx_test_config):
    """Build a FuncXClient from the config's "client_args" with throttling off."""
    client = FuncXClient(**funcx_test_config["client_args"])
    client.throttling_enabled = False
    return client
endpoint_id=ep_id, function_id=fn_uuid) delta = time.time() - start print("Time to launch {} tasks: {:8.3f} s".format(task_count, delta)) print("Got {} tasks_ids ".format(len(task_ids))) for i in range(3): x = fxc.get_batch_status(task_ids) complete_count = sum( [1 for t in task_ids if t in x and x[t].get('pending', False)]) print("Batch status : {}/{} complete".format(complete_count, len(task_ids))) if complete_count == len(task_ids): break time.sleep(2) delta = time.time() - start print("Time to complete {} tasks: {:8.3f} s".format(task_count, delta)) print("Throughput : {:8.3f} Tasks/s".format(task_count / delta)) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("-e", "--endpoint", required=True) parser.add_argument("-c", "--count", default="10") args = parser.parse_args() print("FuncX version : ", funcx.__version__) fxc = FuncXClient(funcx_service_address='https://dev.funcx.org/api/v1') test(fxc, args.endpoint, task_count=int(args.count))
no_local_server=True, no_browser=True) auth_token = tokens["petrel_https_server"]['access_token'] transfer_token = tokens['transfer.api.globus.org']['access_token'] funcx_token = tokens['funcx_service']['access_token'] headers = {'Authorization': f"Bearer {funcx_token}", 'Transfer': transfer_token, 'FuncX': funcx_token, 'Petrel': auth_token} print(f"Headers: {headers}") def hello_world(event): return "Hello World!" fxc = FuncXClient() func_uuid = fxc.register_function(hello_world) print(func_uuid) event = None endpoint = '68bade94-bf58-4a7a-bfeb-9c6a61fa5443' items_to_batch = [{"func_id": func_uuid, "event": {}}, {"func_id": func_uuid, "event": {}}] x = remote_extract_batch(items_to_batch, endpoint, headers=headers) fx_ser = FuncXSerializer() import time while True:
def test(event):
    """Return the value of the ``container_version`` environment variable."""
    import os

    version = os.environ['container_version']
    return version


def main(fxc, ep_id):
    """Register a container and function, run one task and print its result.

    The result is polled up to 100 times, two seconds apart; any exception
    raised while fetching it is printed and the poll is retried.
    """
    image_path = '/home/tskluzac/ext_repos/xtract-keyword/xtract-keyword.img'
    container_uuid = fxc.register_container(image_path, 'singularity')
    print("Container UUID: {}".format(container_uuid))

    # No endpoint id is passed at registration time (see the original TODO:
    # "We do not need ep id here").
    fn_uuid = fxc.register_function(
        base_extractor,
        container_uuid=container_uuid,
        description="Tabular test function.",
    )
    print("FN_UUID : ", fn_uuid)

    res = fxc.run(tabular_event, endpoint_id=ep_id, function_id=fn_uuid)
    print(res)

    attempts_left = 100
    while attempts_left > 0:
        attempts_left -= 1
        try:
            x = fxc.get_result(res)
            print(x)
            break
        except Exception as e:
            print("Exception: {}".format(e))
        time.sleep(2)


if __name__ == "__main__":
    fxc = FuncXClient()
    main(fxc, "e1398319-0d0f-4188-909b-a978f6fc5621")
from funcx import FuncXClient
import time
from queue import Queue

from extractors.xtract_matio import matio_extract

fxc = FuncXClient()

# Earlier ids, kept for reference:
#   63525bf3-b894-4571-9976-fd675932db46
#   dbc7a749-f689-419c-b114-2f9eb8146496
id_list = ["c8f24648-6b96-4d58-ac14-93ccf81da12c"]


def sleep_func(file_ls):
    """Placeholder extractor: ignores ``file_ls`` and returns a greeting."""
    import time

    return "hello, world!"


# Register the container image, then the matio extractor bound to it.
# No endpoint id is passed (original TODO: "We do not need ep id here").
container_uuid = fxc.register_container(
    "/home/tskluzac/xtract-matio.img",
    "singularity",
)
print("Container UUID: {}".format(container_uuid))

func_id = fxc.register_function(
    matio_extract,
    container_uuid=container_uuid,
    description="New sum function defined without",
)