def handle_completion(self, filemanager, event_list, config):
    """
    Register the regridded output files with the filemanager database.

    Parameters
    ----------
    filemanager (FileManager): the manager to register the new files with
    event_list (EventList): an EventList to push user notifications into
    config (dict): the global config object
    """
    if self.status != JobStatus.COMPLETED:
        msg = '{prefix}: Job failed, not running completion handler'.format(
            prefix=self.msg_prefix())
        print_line(msg, event_list)
        logging.info(msg)
        return

    msg = '{prefix}: Job complete'.format(prefix=self.msg_prefix())
    print_line(msg, event_list)
    logging.info(msg)

    # collect every regridded file this job produced
    produced = get_data_output_files(
        self._output_path, self.case, self.start_year, self.end_year)
    new_files = [{
        'name': fname,
        'local_path': os.path.join(self._output_path, fname),
        'case': self.case,
        'year': self.start_year,
        'local_status': FileStatus.PRESENT.value
    } for fname in produced]
    filemanager.add_files(
        data_type='regrid',
        file_list=new_files)

    # make sure the regrid data type is registered in the global config
    if not config['data_types'].get('regrid'):
        config['data_types']['regrid'] = {'monthly': True}
def setup_globus(endpoints, event_list): """ Check globus login status and login as nessisary, then iterate over a list of endpoints and activate them all Parameters: endpoints: list of strings containing globus endpoint UUIDs event_list: the event list to push user notifications into return: True if successful, False otherwise """ # First go through the globus login process if not check_logged_in(): message = 'Globus login required. Please run {cmd}\n\n'.format( cmd='"globus login"') print_line(message, event_list) print '================================================' sys.exit(1) if isinstance(endpoints, str): endpoints = [endpoints] activated = False client = get_client() while not activated: activated = True message = '' for endpoint in endpoints: msg = 'activating endpoint {}'.format(endpoint) logging.info(msg) try: r = client.endpoint_autoactivate(endpoint, if_expires_in=3600) logging.info(r['code']) except Exception as e: print_debug(e) if e.code == 'ClientError.NotFound': return False else: continue if r["code"] == "AutoActivationFailed": activated = False logging.info('endpoint autoactivation failed') server_document = client.endpoint_server_list(endpoint) for server in server_document['DATA']: hostname = server["hostname"] break message += """ Data transfer server {server} requires manual activation. Please open the following URL in a browser to activate the endpoint: https://www.globus.org/app/endpoints/{endpoint}/activate """.format(endpoint=endpoint, server=server['hostname']) if not activated: print message raw_input('Press ENTER once endpoints have been activated\n') return True
def handle_completion(self, filemanager, event_list, config):
    """
    Report the job's final status and set up web hosting for the
    e3sm_diags output.

    Parameters
    ----------
    filemanager (FileManager): unused here, kept for interface consistency
    event_list (EventList): an EventList to push user notifications into
    config (dict): the global config object
    """
    if self.status != JobStatus.COMPLETED:
        msg = '{prefix}: Job failed'.format(prefix=self.msg_prefix())
        print_line(msg, event_list)
        logging.info(msg)
    else:
        msg = '{prefix}: Job complete'.format(prefix=self.msg_prefix())
        print_line(msg, event_list)
        logging.info(msg)

    # if hosting is turned off, simply return
    if not config['global']['host']:
        return

    # else setup the web hosting
    # (removed an unused local that shadowed the img_host_server lookup)
    # NOTE(review): hosting setup still runs when the job failed — confirm intended
    self.host_path = os.path.join(
        config['img_hosting']['host_directory'],
        self.case,
        'e3sm_diags',
        '{start:04d}_{end:04d}_vs_{comp}'.format(
            start=self.start_year,
            end=self.end_year,
            comp=self._short_comp_name))

    self.setup_hosting(config, self._output_path, self.host_path, event_list)

    self._host_url = 'https://{server}/{prefix}/{case}/e3sm_diags/{start:04d}_{end:04d}_vs_{comp}/viewer/index.html'.format(
        server=config['img_hosting']['img_host_server'],
        prefix=config['img_hosting']['url_prefix'],
        case=self.case,
        start=self.start_year,
        end=self.end_year,
        comp=self._short_comp_name)
def __init__(self, event_list, event, config, filemanager):
    """
    Set up job bookkeeping and compute the maximum number of concurrent jobs.

    Parameters
    ----------
    event_list (EventList): an EventList to push user notifications into
    event (threading.Event): an event used to signal thread termination
    config (dict): the global config object
    filemanager (FileManager): the manager for this run's data files
    """
    self.config = config
    self.account = config['global'].get('account', '')
    self.event_list = event_list
    self.filemanager = filemanager
    # strict equality preserved from the original: only a literal True
    # (or 1) enables these flags, not arbitrary truthy values
    self.dryrun = config['global']['dryrun'] == True
    self.debug = config['global']['debug'] == True
    self._resource_path = config['global']['resource_path']
    """
    A list of cases, dictionaries structured as:
        case (str): the full case name
        jobs (list): a list of job.Jobs
        short_name (str): the short name of the case
    """
    self.cases = list()

    self.running_jobs = list()
    self.kill_event = event
    self._job_total = 0
    self._job_complete = 0

    self.slurm = Slurm()
    max_jobs = config['global']['max_jobs']
    # default cap: three jobs per available compute node
    self.max_running_jobs = max_jobs if max_jobs else self.slurm.get_node_number() * 3
    # a node count of 0 means scontrol could not be reached; retry until it responds
    while self.max_running_jobs == 0:
        sleep(1)
        # fixed typo: "Unable to communication" -> "Unable to communicate"
        msg = 'Unable to communicate with scontrol, checking again'
        print_line(msg, event_list)
        logging.error(msg)
        self.max_running_jobs = self.slurm.get_node_number() * 3
def handle_completion(self, event_list, config, *args):
    """
    Setup for webhosting after a successful run

    Parameters
    ----------
    event_list (EventList): an event list to push user notifications into
    config (dict): the global config object
    """
    if self.status != JobStatus.COMPLETED:
        # NOTE(review): the case kwarg is unused by this format string
        msg = '{prefix}: Job failed'.format(prefix=self.msg_prefix(),
                                            case=self._short_name)
        print_line(msg, event_list)
        logging.info(msg)
    else:
        msg = '{prefix}: Job complete'.format(prefix=self.msg_prefix(),
                                              case=self._short_name)
        print_line(msg, event_list)
        logging.info(msg)
    # if hosting is turned off, simply return
    if not config['global']['host']:
        return

    # path the coupled_diagnostics images were written into
    # NOTE(review): img_source is computed but never used within this block —
    # the function may be truncated here; confirm against the full file
    img_source = os.path.join(
        self._output_path,
        'coupled_diagnostics',
        '{case}_vs_{comp}'.format(case=self.case, comp=self._short_comp_name),
        '{case}_years{start}-{end}_vs_{comp}'.format(
            case=self.case,
            start=self.start_year,
            end=self.end_year,
            comp=self._short_comp_name))
def verify_remote_files(self, client, case):
    """
    Check that the user supplied file paths are valid for remote files

    Parameters:
        client: either an ssh_client or a globus_client
        case: the case to check remote paths for

    Returns
    -------
    True when every missing file was found remotely, False otherwise
    """
    if not self._config['global']['verify']:
        return True
    msg = 'verifying remote file paths'
    print_line(msg, self._event_list)

    # collect the data types that still have files missing locally
    data_types_to_verify = []
    q = (DataFile
         .select()
         .where(
             (DataFile.case == case) &
             (DataFile.local_status != FileStatus.PRESENT.value)))
    for datafile in q.execute():
        if datafile.datatype not in data_types_to_verify:
            data_types_to_verify.append(datafile.datatype)

    found_all = True
    for datatype in data_types_to_verify:
        q = (DataFile
             .select()
             .where(
                 (DataFile.case == case) &
                 (DataFile.datatype == datatype)))
        files = q.execute()
        remote_path, _ = os.path.split(files[0].remote_path)
        msg = 'Checking {} files in {}'.format(datatype, remote_path)
        print_line(msg, self._event_list)
        if files[0].transfer_type == 'globus':
            from lib.globus_interface import get_ls as globus_ls
            remote_contents = globus_ls(
                client=client,
                path=remote_path,
                endpoint=self._config['simulations'][case]['remote_uuid'])
        elif files[0].transfer_type == 'sftp':
            from lib.ssh_interface import get_ls as ssh_ls
            remote_contents = ssh_ls(
                client=client,
                remote_path=remote_path)
        else:
            # fixed: an unrecognized transfer_type previously fell through and
            # raised a NameError on remote_contents below; treat it as a
            # verification failure instead
            msg = 'Unknown transfer_type {} for case {}'.format(
                files[0].transfer_type, case)
            print_message(msg, 'error')
            found_all = False
            continue
        remote_names = [x['name'] for x in remote_contents]
        for df in files:
            if df.name not in remote_names:
                msg = 'Unable to find file {name} at {remote_path}'.format(
                    name=df.name, remote_path=remote_path)
                print_message(msg, 'error')
                found_all = False
    if not found_all:
        return False
    else:
        msg = 'found all remote files for {}'.format(case)
        print_message(msg, 'ok')
        return True
def postvalidate(self, config, *args, **kwargs):
    """
    Check whether a previous run of this job already produced enough output.

    Counts the images under the output path (extracting the amwg tar archive
    if needed) and compares against the expected total for the comparison type.

    Parameters
    ----------
    config (dict): the global config object
    kwargs: must contain 'event_list' (used when extraction messages print)

    Returns
    -------
    True when enough output exists, False otherwise
    """
    if not self._output_path:
        self._output_path = os.path.join(
            config['global']['project_path'],
            'output', 'diags', self.short_name, 'amwg',
            '{start:04d}_{end:04d}_vs_{comp}'.format(
                start=self.start_year,
                end=self.end_year,
                comp=self._short_comp_name))
    if not self._host_path:
        self._host_path = os.path.join(
            config['img_hosting']['host_directory'],
            self.case, 'amwg',
            '{start:04d}_{end:04d}_vs_{comp}'.format(
                start=self.start_year,
                end=self.end_year,
                comp=self._short_comp_name))

    # check that there have been enough plots created to call this a successful run
    num_found = sum(
        len(files) for r, d, files in os.walk(self._output_path))
    # obs comparisons produce more images than model-vs-model runs
    num_expected = 1900 if self.comparison == 'obs' else 1500
    enough_files = bool(num_found > num_expected)
    if not enough_files:
        if not self._has_been_executed:
            msg = '{prefix}: Job hasnt been run yet, starting from scratch'.format(
                prefix=self.msg_prefix())
            logging.info(msg)
            return False
        else:
            # the images may still be packed in the tar archive amwg produces
            img_source = os.path.join(
                self._output_path,
                '{case}-vs-{comp}'.format(case=self.short_name,
                                          comp=self._short_comp_name))
            if os.path.exists(img_source + '.tar'):
                msg = '{prefix}: extracting images from tar archive'.format(
                    prefix=self.msg_prefix())
                print_line(msg, kwargs['event_list'])
                call([
                    'tar', '-xf', img_source + '.tar',
                    '--directory', self._output_path
                ])
                # recount after inflating the archive
                num_found = sum(
                    len(files) for r, d, files in os.walk(self._output_path))
                enough_files = bool(num_found > num_expected)
            if not enough_files:
                msg = '{prefix}: Not enough images generated, only {num_found} but expected > {num_expected}'.format(
                    prefix=self.msg_prefix(),
                    num_found=num_found,
                    num_expected=num_expected)
                logging.error(msg)
                return False
            else:
                msg = '{prefix}: Found expected output after extracting archive'.format(
                    prefix=self.msg_prefix())
                logging.info(msg)
    self._check_links(config)
    return True
def transfer_directory(src_uuid, dst_uuid, src_path, dst_path, event_list=None, killevent=None):
    """
    Transfer all the contents from source_endpoint:src_path to
    destination_endpoint:dst_path

    Parameters:
        src_uuid (str): the globus UUID for the source files
        dst_uuid (str): the globus UUID for the destination
        src_path (str): the path to the source directory to copy
        dst_path (str): the path on the destination directory
        event_list (EventList): an eventlist to push user notifications into
        killevent (threading.Event): an event to listen for if running inside
            a thread to terminate

    Returns
    -------
    True if the transfer completed, False otherwise
    """
    client = get_client()
    transfer = TransferData(
        client, src_uuid, dst_uuid, sync_level='checksum')
    transfer.add_item(
        source_path=src_path,
        destination_path=dst_path,
        recursive=True)
    try:
        msg = 'Starting globus directory transfer from {src} to {dst}'.format(
            src=src_path, dst=dst_path)
        print_line(msg, event_list)
        logging.info(msg)

        result = client.submit_transfer(transfer)
        task_id = result['task_id']
    except Exception:
        # fixed: the original format string referenced {dst_pathj}, which
        # raised a KeyError while building this very error message
        msg = 'Transfer setup for {src_uuid}:{src_path} to {dst_uuid}:{dst_path} failed'.format(
            src_uuid=src_uuid,
            src_path=src_path,
            dst_uuid=dst_uuid,
            dst_path=dst_path)
        logging.error(msg)
        return False

    # poll the task until it finishes or we are told to terminate
    while True:
        status = client.get_task(task_id).get('status')
        if status == 'SUCCEEDED':
            return True
        elif status == 'FAILED':
            return False
        else:
            msg = 'Unexpected globus code: {}'.format(status)
            print_line(msg, event_list)
        # fixed: the original referenced an undefined name `event`; use the
        # killevent parameter so an owning thread can cancel the task
        if killevent is not None and killevent.is_set():
            client.cancel_task(task_id)
            return False
        sleep(10)
def setup_hosting(self, config, img_source, host_path, event_list):
    """
    Performs file copys for images into the web hosting directory

    Parameters
    ----------
    config (dict): the global config object
    img_source (str): the path to where the images are coming from
    host_path (str): the path for where the images should be hosted
    event_list (EventList): an eventlist to push user notifications into
    """
    # when always_copy is on, clear any previously hosted output first
    if config['global']['always_copy'] and os.path.exists(host_path):
        msg = '{prefix}: Removing previous output from host location'.format(
            prefix=self.msg_prefix())
        print_line(msg, event_list)
        rmtree(host_path)

    if os.path.exists(host_path):
        msg = '{prefix}: Files already present at host location, skipping'.format(
            prefix=self.msg_prefix())
        print_line(msg, event_list)
    else:
        msg = '{prefix}: Moving files for web hosting'.format(
            prefix=self.msg_prefix())
        print_line(msg, event_list)
        copytree(src=img_source, dst=host_path)

    # fix permissions for apache
    msg = '{prefix}: Fixing permissions'.format(prefix=self.msg_prefix())
    print_line(msg, event_list)
    call(['chmod', '-R', 'go+rx', host_path])

    # open up the two parent directories as well
    parent = os.path.split(host_path)[0]
    for _ in range(2):
        call(['chmod', 'go+rx', parent])
        parent = os.path.split(parent)[0]
def setup_hosting(self, config, img_source, host_path, event_list):
    """
    Copy plot images into the web hosting directory and open their permissions.

    Parameters
    ----------
    config (dict): the global config object
    img_source (str): the path the images are coming from
    host_path (str): the directory the images should be hosted from
    event_list (EventList): an EventList to push user notifications into
    """
    if config['global']['always_copy']:
        # aprime output is not removed; its host directory is reused in place
        if os.path.exists(host_path) and self.job_type != 'aprime':
            msg = '{prefix}: Removing previous output from host location'.format(
                prefix=self.msg_prefix())
            print_line(msg, event_list)
            rmtree(host_path)
    if not os.path.exists(host_path):
        msg = '{prefix}: Moving files for web hosting'.format(
            prefix=self.msg_prefix())
        print_line(msg, event_list)
        copytree(src=img_source, dst=host_path)
        # fix permissions for apache
        # NOTE(review): this message only prints on the copy branch, but the
        # chmod below always runs — confirm intended
        msg = '{prefix}: Fixing permissions'.format(
            prefix=self.msg_prefix())
        print_line(msg, event_list)
    else:
        msg = '{prefix}: Files already present at host location, skipping'.format(
            prefix=self.msg_prefix())
        print_line(msg, event_list)
    # make the hosted tree world-readable, plus its two parent directories
    call(['chmod', '-R', 'a+rx', host_path])
    tail, _ = os.path.split(host_path)
    for _ in range(2):
        call(['chmod', 'a+rx', tail])
        tail, _ = os.path.split(tail)
def handle_completion(self, filemanager, event_list, config):
    """
    Report the job's final status and compute the image source directory
    used for web hosting.

    Parameters
    ----------
    filemanager (FileManager): unused here, kept for interface consistency
    event_list (EventList): an EventList to push user notifications into
    config (dict): the global config object
    """
    if self.status != JobStatus.COMPLETED:
        msg = '{prefix}: Job failed'.format(prefix=self.msg_prefix())
    else:
        msg = '{prefix}: Job complete'.format(prefix=self.msg_prefix())
    print_line(msg, event_list)
    logging.info(msg)

    # nothing more to do when image hosting is disabled
    if not config['global']['host']:
        return

    # directory the diagnostic images were written into
    source_dir = '{case}-vs-{comp}'.format(
        case=self.short_name, comp=self._short_comp_name)
    img_source = os.path.join(self._output_path, source_dir)
def handle_completion(self, filemanager, event_list, config):
    """
    Adds the output from cmor into the filemanager database as type 'cmorized'

    Parameters
    ----------
    filemanager (FileManager): the manager to add files to
    event_list (EventList): an EventList to add notification messages to
    config (dict): the global config object
    Returns
    -------
        True if files added correctly
        False if there was any error
    """
    try:
        new_files = list()
        for root, dirs, files in os.walk(self._output_path):
            if not files:
                continue
            for file in files:
                new_files.append({
                    'name': file,
                    # fixed: abspath(file) resolved against the CWD rather
                    # than the directory the file actually lives in
                    'local_path': os.path.abspath(os.path.join(root, file)),
                    'case': self.case,
                    'year': self.start_year,
                    # use the month to hold the end year field
                    'month': self.end_year,
                    'local_status': FileStatus.PRESENT.value
                })
        filemanager.add_files(
            data_type='cmorized',
            file_list=new_files)
        filemanager.write_database()
        msg = '{prefix}: Job completion handler done'.format(
            prefix=self.msg_prefix())
        print_line(msg, event_list)
        logging.info(msg)
        return True
    except Exception:
        # fixed: the handler re-raised here, making the documented False
        # return unreachable; log the failure and report it instead
        logging.exception('%s: cmor completion handler failed',
                          self.msg_prefix())
        return False
def handle_completion(self, event_list, config, *args):
    """
    Perform setup for webhosting

    Parameters
    ----------
    event_list (EventList): an event list to push user notifications into
    config (dict): the global config object
    """
    if self.status != JobStatus.COMPLETED:
        msg = '{prefix}: Job failed'.format(prefix=self.msg_prefix())
        print_line(msg, event_list)
        logging.info(msg)
    else:
        msg = '{prefix}: Job complete'.format(prefix=self.msg_prefix())
        print_line(msg, event_list)
        logging.info(msg)

    # if hosting is turned off, simply return
    if not config['global']['host']:
        return

    # else setup the web hosting
    # (removed an unused local that shadowed the img_host_server lookup)
    # NOTE(review): hosting setup still runs when the job failed — confirm intended
    self.host_path = os.path.join(
        config['img_hosting']['host_directory'],
        self.short_name,
        'e3sm_diags',
        '{start:04d}_{end:04d}_vs_{comp}'.format(
            start=self.start_year,
            end=self.end_year,
            comp=self._short_comp_name))

    self.setup_hosting(config, self._output_path, self.host_path, event_list)

    self._host_url = 'https://{server}/{prefix}/{case}/e3sm_diags/{start:04d}_{end:04d}_vs_{comp}/viewer/index.html'.format(
        server=config['img_hosting']['img_host_server'],
        prefix=config['img_hosting']['url_prefix'],
        case=self.short_name,
        start=self.start_year,
        end=self.end_year,
        comp=self._short_comp_name)
def update_local_status(self):
    """
    Update the database with the local status of the expected files

    Return True if there was new local data found, False otherwise
    """
    # serialize database access across transfer threads
    self._mutex.acquire()
    try:
        # only files not yet confirmed on disk need re-checking
        query = (DataFile.select().where(
            (DataFile.local_status == FileStatus.NOT_PRESENT.value) |
            (DataFile.local_status == FileStatus.IN_TRANSIT.value)))
        printed = False
        change = False
        for datafile in query.execute():
            marked = False
            if os.path.exists(datafile.local_path):
                # the file has arrived on disk; mark it present
                if datafile.local_status == FileStatus.NOT_PRESENT.value or datafile.local_status == FileStatus.IN_TRANSIT.value:
                    datafile.local_status = FileStatus.PRESENT.value
                    marked = True
                    change = True
            else:
                if datafile.transfer_type == 'local':
                    # NOTE(review): the filename kwarg is unused by this format
                    # string, so the message prints a literal '(unknown)' —
                    # presumably it was meant to be {filename}; confirm
                    msg = '{case} transfer_type is local, but (unknown) is not present'.format(
                        case=datafile.case, filename=datafile.name)
                    logging.error(msg)
                    # only push the first such message to the user
                    if not printed:
                        print_line(msg, self._event_list)
                        printed = True
                # the file vanished from disk; mark it missing again
                if datafile.local_status == FileStatus.PRESENT.value:
                    datafile.local_status = FileStatus.NOT_PRESENT.value
                    marked = True
            if marked:
                datafile.save()
    except OperationalError as operror:
        line = 'Error writing to database, database is locked by another process'
        print_line(line=line, event_list=self._event_list)
        logging.error(line)
    finally:
        if self._mutex.locked():
            self._mutex.release()
    # NOTE(review): change is unbound if the query raised before the loop ran
    return change
def update_local_status(self):
    """
    Update the database with the local status of the expected files

    Return True if there was new local data found, False otherwise
    """
    # fixed: initialize before the try so the return below cannot raise
    # NameError when the query itself fails
    change = False
    printed = False
    try:
        query = (DataFile
                 .select()
                 .where(
                     (DataFile.local_status == FileStatus.NOT_PRESENT.value) |
                     (DataFile.local_status == FileStatus.IN_TRANSIT.value)))
        for datafile in query.execute():
            marked = False
            if os.path.exists(datafile.local_path):
                # the file has arrived on disk; mark it present
                if datafile.local_status == FileStatus.NOT_PRESENT.value or datafile.local_status == FileStatus.IN_TRANSIT.value:
                    datafile.local_status = FileStatus.PRESENT.value
                    marked = True
                    change = True
            else:
                if datafile.transfer_type == 'local':
                    # fixed: the format string never used the filename kwarg,
                    # printing a literal '(unknown)' instead of the file name
                    msg = '{case} transfer_type is local, but {filename} is not present'.format(
                        case=datafile.case, filename=datafile.name)
                    logging.error(msg)
                    # only push the first such message to the user
                    if not printed:
                        print_line(msg, self._event_list)
                        printed = True
                # the file vanished from disk; mark it missing again
                if datafile.local_status == FileStatus.PRESENT.value:
                    datafile.local_status = FileStatus.NOT_PRESENT.value
                    marked = True
            if marked:
                datafile.save()
    except Exception as e:
        print_debug(e)
    return change
def handle_completion(self, event_list, config, *args):
    """
    Sets up variables needed to web hosting

    Parameters
    ----------
    event_list (EventList): an EventList to push user notifications into
    config (dict): the global config object
    """
    if self.status != JobStatus.COMPLETED:
        msg = '{prefix}: Job failed'.format(prefix=self.msg_prefix())
    else:
        msg = '{prefix}: Job complete'.format(prefix=self.msg_prefix())
    print_line(msg, event_list)
    logging.info(msg)

    # nothing more to do when image hosting is disabled
    if not config['global']['host']:
        return

    # directory the diagnostic images were written into
    source_dir = '{case}-vs-{comp}'.format(
        case=self.short_name, comp=self._short_comp_name)
    img_source = os.path.join(self._output_path, source_dir)
def handle_completion(self, filemanager, event_list, config):
    """
    Register the regridded and native-grid climatology output with the
    filemanager database.

    Parameters
    ----------
    filemanager (FileManager): the manager to add the new files to
    event_list (EventList): an EventList to push user notifications into
    config (dict): the global config object
    """
    if self.status != JobStatus.COMPLETED:
        msg = '{prefix}: Job failed, not running completion handler'.format(
            prefix=self.msg_prefix())
        print_line(msg, event_list)
        logging.info(msg)
        return
    else:
        msg = '{prefix}: Job complete'.format(
            prefix=self.msg_prefix())
        print_line(msg, event_list)
        logging.info(msg)

    # register the regridded climatologies
    regrid_path = os.path.join(
        config['global']['project_path'], 'output', 'pp',
        config['post-processing']['climo']['destination_grid_name'],
        self._short_name, 'climo',
        '{length}yr'.format(length=self.end_year - self.start_year + 1))
    new_files = list()
    for regrid_file in get_climo_output_files(regrid_path, self.start_year, self.end_year):
        new_files.append({
            'name': regrid_file,
            'local_path': os.path.join(regrid_path, regrid_file),
            'case': self.case,
            'year': self.start_year,
            'local_status': FileStatus.PRESENT.value
        })
    filemanager.add_files(
        data_type='climo_regrid',
        file_list=new_files)
    if not config['data_types'].get('climo_regrid'):
        config['data_types']['climo_regrid'] = {'monthly': True}

    # register the native-grid climatologies
    climo_path = os.path.join(
        config['global']['project_path'], 'output', 'pp',
        config['simulations'][self.case]['native_grid_name'],
        self._short_name, 'climo',
        '{length}yr'.format(length=self.end_year - self.start_year + 1))
    # fixed: start a fresh list — the original reused new_files and re-added
    # every regrid file under the climo_native data type
    new_files = list()
    for climo_file in get_climo_output_files(climo_path, self.start_year, self.end_year):
        new_files.append({
            'name': climo_file,
            # fixed: native climos live under climo_path, not regrid_path
            'local_path': os.path.join(climo_path, climo_file),
            'case': self.case,
            'year': self.start_year,
            'local_status': FileStatus.PRESENT.value
        })
    filemanager.add_files(
        data_type='climo_native',
        file_list=new_files)
    if not config['data_types'].get('climo_native'):
        config['data_types']['climo_native'] = {'monthly': True}

    msg = '{prefix}: Job completion handler done'.format(
        prefix=self.msg_prefix())
    print_line(msg, event_list)
    logging.info(msg)
def _ssh_transfer(self, target_files, client, event):
    """
    Pull each target file over an open ssh connection using sftp.

    Parameters
    ----------
    target_files (list): dicts carrying 'remote_path' and 'local_path'
    client: an open ssh client to tunnel through
    event (threading.Event): set when transfers should be aborted
    """
    sftp_client = client.open_sftp()
    for target in target_files:
        # bail out as soon as a shutdown has been requested
        if event.is_set():
            return
        filename = os.path.split(target['local_path'])[1]
        msg = 'sftp transfer from {} to {}'.format(
            target['remote_path'], target['local_path'])
        logging.info(msg)

        msg = 'starting sftp transfer for {}'.format(filename)
        print_line(msg, self._event_list)
        ssh_transfer(sftp_client, target)
        msg = 'sftp transfer complete for {}'.format(filename)
        print_line(msg, self._event_list)

        msg = self.report_files_local()
        print_line(msg, self._event_list)
def terminate_transfers(self):
    """
    Signal every transfer thread to stop, then wait for each one to exit.
    """
    self.kill_event.set()
    for worker in self.thread_list:
        msg = 'terminating {}, this may take a moment'.format(worker.name)
        print_line(msg, self._event_list)
        worker.join()
def populate_file_list(self):
    """
    Populate the database with the required DataFile entries
    """
    msg = 'Creating file table'
    print_line(
        line=msg,
        event_list=self._event_list)
    # NOTE(review): newfiles is assigned but never used; new_files (per data
    # type, below) is what actually gets inserted
    newfiles = list()
    start_year = int(self._config['simulations']['start_year'])
    end_year = int(self._config['simulations']['end_year'])
    # run all the inserts inside one transaction
    with DataFile._meta.database.atomic():
        # for each case
        for case in self._config['simulations']:
            # skip the non-case keys of the simulations section
            if case in ['start_year', 'end_year', 'comparisons']:
                continue
            # for each data type
            for _type in self._config['data_types']:
                data_types_for_case = self._config['simulations'][case]['data_types']
                # 'all' means the case wants every configured data type
                if 'all' not in data_types_for_case:
                    if _type not in data_types_for_case:
                        continue
                # setup the base local_path
                local_path = self.render_file_string(
                    data_type=_type,
                    data_type_option='local_path',
                    case=case)
                new_files = list()
                if self._config['data_types'][_type].get('monthly') and self._config['data_types'][_type]['monthly'] in ['True', 'true', '1', 1]:
                    # handle monthly data: one expected file per month per year
                    for year in range(start_year, end_year + 1):
                        for month in range(1, 13):
                            filename = self.render_file_string(
                                data_type=_type,
                                data_type_option='file_format',
                                case=case,
                                year=year,
                                month=month)
                            r_path = self.render_file_string(
                                data_type=_type,
                                data_type_option='remote_path',
                                case=case,
                                year=year,
                                month=month)
                            new_files.append({
                                'name': filename,
                                'remote_path': os.path.join(r_path, filename),
                                'local_path': os.path.join(local_path, filename),
                                'local_status': FileStatus.NOT_PRESENT.value,
                                'case': case,
                                'remote_status': FileStatus.NOT_PRESENT.value,
                                'year': year,
                                'month': month,
                                'datatype': _type,
                                'local_size': 0,
                                'transfer_type': self._config['simulations'][case]['transfer_type'],
                                'remote_uuid': self._config['simulations'][case].get('remote_uuid', ''),
                                'remote_hostname': self._config['simulations'][case].get('remote_hostname', '')
                            })
                else:
                    # handle one-off data: a single expected file for the case
                    filename = self.render_file_string(
                        data_type=_type,
                        data_type_option='file_format',
                        case=case)
                    r_path = self.render_file_string(
                        data_type=_type,
                        data_type_option='remote_path',
                        case=case)
                    new_files.append({
                        'name': filename,
                        'remote_path': os.path.join(r_path, filename),
                        'local_path': os.path.join(local_path, filename),
                        'local_status': FileStatus.NOT_PRESENT.value,
                        'case': case,
                        'remote_status': FileStatus.NOT_PRESENT.value,
                        'year': 0,
                        'month': 0,
                        'datatype': _type,
                        'local_size': 0,
                        'transfer_type': self._config['simulations'][case]['transfer_type'],
                        'remote_uuid': self._config['simulations'][case].get('remote_uuid', ''),
                        'remote_hostname': self._config['simulations'][case].get('remote_hostname', '')
                    })
                # make sure the local destination directory exists
                tail, _ = os.path.split(new_files[0]['local_path'])
                if not os.path.exists(tail):
                    os.makedirs(tail)
                # insert in batches to stay under SQL parameter limits
                step = 50
                for idx in range(0, len(new_files), step):
                    DataFile.insert_many(
                        new_files[idx: idx + step]).execute()
    msg = 'Database update complete'
    print_line(msg, self._event_list)
def main(test=False, **kwargs):
    """
    Processflow main

    Parameters:
        test (bool): turns on test mode. Simply stops the logger from reloading itself, which
            stops a crash when running from inside the test runner
        kwargs (dict): when running in test mode, arguments are passed directly through the kwargs
            which bypasses the argument parsing.
    """
    # The master configuration object
    config = {}

    # An event to kill the threads on terminal exception
    thread_kill_event = threading.Event()

    # A flag to tell if we have all the data locally
    # NOTE(review): all_data and all_data_remote are never read again in this
    # function; all_data_local below is what drives the loop
    all_data = False
    all_data_remote = False

    # get a globus client
    # NOTE(review): client is unused in this function — confirm it is not
    # needed for a connection side effect
    client = get_client()

    # Read in parameters from config
    if test:
        print '=========================================='
        print '---- Processflow running in test mode ----'
        print '=========================================='
        _args = kwargs['testargs']
        config, filemanager, runmanager = initialize(
            argv=_args,
            version=__version__,
            branch=__branch__,
            event_list=event_list,
            kill_event=thread_kill_event,
            testing=True)
    else:
        config, filemanager, runmanager = initialize(
            argv=sys.argv[1:],
            version=__version__,
            branch=__branch__,
            event_list=event_list,
            kill_event=thread_kill_event)
    # setup returned an error code
    if isinstance(config, int):
        print "Error in setup, exiting"
        return -1
    logging.info('Config setup complete')
    debug = True if config['global'].get('debug') else False

    msg = "Updating local file status"
    print_line(line=msg, event_list=event_list)
    filemanager.update_local_status()

    # kick off transfers for anything not already on disk
    all_data_local = filemanager.all_data_local()
    if not all_data_local:
        filemanager.transfer_needed(
            event_list=event_list,
            event=thread_kill_event)

    # Main loop
    printed = False
    loop_delay = 10
    state_path = os.path.join(
        config['global']['project_path'],
        'output',
        'state.txt')
    try:
        print "--------------------------"
        print " Entering Main Loop "
        print " Status file: {}".format(state_path)
        print "--------------------------"
        while True:
            # keep pulling data until everything is local
            if not all_data_local:
                if debug:
                    print_line(' -- Updating local status --', event_list)
                if filemanager.update_local_status():
                    msg = filemanager.report_files_local()
                    print_line(msg, event_list)
                    filemanager.write_database()
                all_data_local = filemanager.all_data_local()
            if not all_data_local:
                if debug:
                    print_line(' -- Additional data needed --', event_list)
                filemanager.transfer_needed(event_list, thread_kill_event)

            if debug:
                print_line(' -- checking data -- ', event_list)
            runmanager.check_data_ready()

            if debug:
                print_line(' -- starting ready jobs --', event_list)
            runmanager.start_ready_jobs()

            if debug:
                print_line(' -- monitoring running jobs --', event_list)
            runmanager.monitor_running_jobs()

            if debug:
                print_line(' -- writing out state -- ', event_list)
            runmanager.write_job_sets(state_path)

            status = runmanager.is_all_done()
            # return -1 if still running
            # return 0 if a jobset failed
            # return 1 if all complete
            if status >= 0:
                msg = "Finishing up run"
                print_line(msg, event_list)
                # wait for any in-flight transfers to land before finalizing
                printed = False
                while not filemanager.all_data_local():
                    if not printed:
                        printed = True
                        msg = 'Jobs are complete, but additional data is being transfered'
                        print_line(msg, event_list)
                    filemanager.update_local_status()
                    if not filemanager.all_data_local():
                        filemanager.transfer_needed(
                            event_list=event_list,
                            event=thread_kill_event)
                    sleep(10)
                filemanager.write_database()
                finalize(
                    config=config,
                    event_list=event_list,
                    status=status,
                    runmanager=runmanager)
                # SUCCESS EXIT
                return 0
            if debug:
                print_line(' -- sleeping', event_list)
            sleep(loop_delay)
    except KeyboardInterrupt as e:
        # user hit ctrl-c: persist state and shut down transfer threads
        print_message('\n----- KEYBOARD INTERRUPT -----')
        runmanager.write_job_sets(state_path)
        filemanager.terminate_transfers()
        print_message('----- cleanup complete -----', 'ok')
    except Exception as e:
        # last-resort handler: persist state before going down
        print_message('----- AN UNEXPECTED EXCEPTION OCCURED -----')
        print_debug(e)
        runmanager.write_job_sets(state_path)
        filemanager.terminate_transfers()
# NOTE(review): fragment of an amwg postvalidate routine; `count`, `setname`,
# `expected_files`, `passed`, `img_source_tar`, `config`, and `kwargs` are
# defined earlier in the enclosing function, outside this view
if count < expected_files[setname]:
    if not self._has_been_executed:
        # never executed before, so missing output just means "start fresh"
        return False
    msg = '{prefix}: set {set} only produced {numProduced} when {numExpected} were expected'.format(
        prefix=self.msg_prefix(),
        set=setname,
        numProduced=count,
        numExpected=expected_files[setname])
    logging.error(msg)
    passed = False

if not passed:
    if os.path.exists(img_source_tar):
        # images may still be packed inside the tar archive amwg produces
        msg = '{prefix}: extracting images from tar archive'.format(
            prefix=self.msg_prefix())
        print_line(msg, kwargs['event_list'])
        call([
            'tar', '-xf', img_source_tar,
            '--directory', self._output_path
        ])
        passed = True
        # verify each requested diagnostic set produced its output directory
        for item in config['diags']['amwg']['sets']:
            # sets 5 and 6 share a single output directory
            setname = 'set5_6' if item == '6' or item == '5' else 'set' + item
            directory = os.path.join(
                self._output_path,
                '{}-vs-{}'.format(self.short_name, self._short_comp_name),
                setname)
            if not os.path.exists(directory):
                msg = '{prefix}: could not find output directory after inflating tar archive: {dir}'.format(
                    prefix=self.msg_prefix(),
                    dir=directory)
                logging.error(msg)
def monitor_running_jobs(self):
    """
    Poll slurm for the state of each running job, promoting finished jobs
    through postvalidation and their completion handlers.
    """
    slurm = Slurm()
    for_removal = list()
    for item in self.running_jobs:
        job = self.get_job_by_id(item['job_id'])
        # a slurm_id of 0 marks a job that never entered the queue;
        # treat it as finished immediately
        if item['slurm_id'] == 0:
            self._job_complete += 1
            for_removal.append(item)
            job.handle_completion(
                self.filemanager,
                self.event_list,
                self.config)
            self.report_completed_job()
            continue
        try:
            job_info = slurm.showjob(item['slurm_id'])
            # no usable state yet; check again on the next pass
            if not job_info or job_info.get('JobState') is None:
                continue
        except Exception as e:
            # if the job is old enough it wont be in the slurm list anymore
            # which will throw an exception
            self._job_complete += 1
            for_removal.append(item)
            # fall back to inspecting the job's output on disk
            valid = job.postvalidate(self.config, event_list=self.event_list)
            if valid:
                job.status = JobStatus.COMPLETED
                job.handle_completion(
                    self.filemanager,
                    self.event_list,
                    self.config)
                self.report_completed_job()
            else:
                line = "slurm lookup error for {job}: {id}".format(
                    job=job.job_type,
                    id=item['job_id'])
                print_line(
                    line=line,
                    event_list=self.event_list)
            continue
        # translate the slurm state string into a JobStatus
        status = StatusMap[job_info.get('JobState')]
        if status != job.status:
            msg = '{prefix}: Job changed from {s1} to {s2}'.format(
                prefix=job.msg_prefix(),
                s1=ReverseMap[job.status],
                s2=ReverseMap[status])
            print_line(msg, self.event_list)
            job.status = status
            if status in [JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED]:
                self._job_complete += 1
                # double-check the output even when slurm reports success
                valid = job.postvalidate(self.config, event_list=self.event_list)
                if not valid:
                    job.status = JobStatus.FAILED
                job.handle_completion(
                    self.filemanager,
                    self.event_list,
                    self.config)
                for_removal.append(item)
                self.report_completed_job()
                if status in [JobStatus.FAILED, JobStatus.CANCELLED]:
                    # cascade the failure to everything depending on this job
                    for depjob in self.get_jobs_that_depend(job.id):
                        depjob.status = JobStatus.FAILED
    if not for_removal:
        return
    else:
        self.running_jobs = [x for x in self.running_jobs if x not in for_removal]
    return
# setup the web hosting hostname = config['img_hosting']['img_host_server'] self._host_path = os.path.join( config['img_hosting']['host_directory'], self.case, 'amwg', '{start:04d}_{end:04d}_vs_{comp}'.format( start=self.start_year, end=self.end_year, comp=self._short_comp_name)) if not os.path.exists(img_source): if os.path.exists(img_source + '.tar'): self.extract_img_tar(img_source) else: msg = '{prefix}: Unable to find output directory or tar archive'.format( prefix=self.msg_prefix()) print_line(msg, event_list) self.status = JobStatus.FAILED logging.info(msg) return self.setup_hosting(config, img_source, self._host_path, event_list) self._host_url = 'https://{server}/{prefix}/{case}/amwg/{start:04d}_{end:04d}_vs_{comp}/index.html'.format( server=config['img_hosting']['img_host_server'], prefix=config['img_hosting']['url_prefix'], case=self.case, start=self.start_year, end=self.end_year, comp=self._short_comp_name) # ----------------------------------------------- def _check_links(self, config):
def start_ready_jobs(self):
    """
    Loop over the list of jobs for each case, first setting up the data
    for, and then submitting each job to the queue

    A job is submitted only when all of its dependencies have COMPLETED
    and its input data is present. Jobs finished by a previous run of the
    processflow are detected via postvalidate and marked COMPLETED
    without resubmission.
    """
    for case in self.cases:
        for job in case['jobs']:
            if job.status != JobStatus.VALID:
                continue
            # throttle: never keep more than max_running_jobs in the queue
            if len(self.running_jobs) >= self.max_running_jobs:
                msg = 'running {} of {} jobs, waiting for queue to shrink'.format(
                    len(self.running_jobs), self.max_running_jobs)
                if self.debug:
                    print_line(msg, self.event_list)
                return
            deps_ready = True
            for depjobid in job.depends_on:
                depjob = self.get_job_by_id(depjobid)
                if depjob.status != JobStatus.COMPLETED:
                    deps_ready = False
                    break
            if deps_ready and job.data_ready:
                # if the job was finished by a previous run of the processflow
                valid = job.postvalidate(self.config, event_list=self.event_list)
                if valid:
                    job.status = JobStatus.COMPLETED
                    self._job_complete += 1
                    job.handle_completion(
                        self.filemanager,
                        self.event_list,
                        self.config)
                    self.report_completed_job()
                    msg = '{}: Job previously computed, skipping'.format(
                        job.msg_prefix())
                    print_line(msg, self.event_list)
                    continue

                # the job is ready for submission
                # NOTE: the original branched on job.run_type and
                # isinstance(job, Diag) but every branch produced the
                # identical message, so the dead conditional was removed
                msg = '{}: Job ready, submitting to queue'.format(
                    job.msg_prefix())
                print_line(msg, self.event_list)

                # set to pending before data setup so we dont double submit
                job.status = JobStatus.PENDING

                # setup the data needed for the job
                job.setup_data(
                    config=self.config,
                    filemanager=self.filemanager,
                    case=job.case)
                # if this job needs data from another case, set that up too
                if isinstance(job, Diag):
                    if job.comparison != 'obs':
                        job.setup_data(
                            config=self.config,
                            filemanager=self.filemanager,
                            case=job.comparison)
                slurmid = job.execute(
                    config=self.config,
                    dryrun=self.dryrun)
                if slurmid is False:
                    msg = '{}: Prevalidation FAILED'.format(job.msg_prefix())
                    print_line(msg, self.event_list)
                    job.status = JobStatus.FAILED
                else:
                    self.running_jobs.append({
                        'slurm_id': slurmid,
                        'job_id': job.id
                    })
def handle_completion(self, filemanager, event_list, config):
    """
    Adds the output files to the filemanager database as 'climo_regrid'
    and 'climo_native' data types

    Parameters
    ----------
        filemanager (FileManager): The filemanager to add the climo files to
        event_list (EventList): an event list to push notifications into
        config (dict): the global configuration object
    """
    if self.status != JobStatus.COMPLETED:
        msg = '{prefix}: Job failed, not running completion handler'.format(
            prefix=self.msg_prefix())
        print_line(msg, event_list)
        logging.info(msg)
        return
    else:
        msg = '{prefix}: Job complete'.format(
            prefix=self.msg_prefix())
        print_line(msg, event_list)
        logging.info(msg)

    # register the regridded climatologies
    regrid_path = os.path.join(
        config['global']['project_path'], 'output', 'pp',
        config['post-processing']['climo']['destination_grid_name'],
        self._short_name, 'climo',
        '{length}yr'.format(length=self.end_year - self.start_year + 1))
    new_files = list()
    for regrid_file in get_climo_output_files(regrid_path, self.start_year, self.end_year):
        new_files.append({
            'name': regrid_file,
            'local_path': os.path.join(regrid_path, regrid_file),
            'case': self.case,
            'year': self.start_year,
            'month': self.end_year,  # use the month to hold the end year field
            'local_status': FileStatus.PRESENT.value
        })
    filemanager.add_files(
        data_type='climo_regrid',
        file_list=new_files)
    if not config['data_types'].get('climo_regrid'):
        config['data_types']['climo_regrid'] = {'monthly': True}

    # register the native-grid climatologies
    climo_path = os.path.join(
        config['global']['project_path'], 'output', 'pp',
        config['simulations'][self.case]['native_grid_name'],
        self._short_name, 'climo',
        '{length}yr'.format(length=self.end_year - self.start_year + 1))
    # FIX: start from a fresh list; previously the regrid entries were
    # still in new_files and were re-added under 'climo_native'
    new_files = list()
    for climo_file in get_climo_output_files(climo_path, self.start_year, self.end_year):
        new_files.append({
            'name': climo_file,
            # FIX: native files live under climo_path, not regrid_path
            'local_path': os.path.join(climo_path, climo_file),
            'case': self.case,
            'year': self.start_year,
            'month': self.end_year,  # use the month to hold the end year field
            'local_status': FileStatus.PRESENT.value
        })
    filemanager.add_files(
        data_type='climo_native',
        file_list=new_files)
    if not config['data_types'].get('climo_native'):
        config['data_types']['climo_native'] = {'monthly': True}
    filemanager.write_database()
    msg = '{prefix}: Job completion handler done'.format(
        prefix=self.msg_prefix())
    print_line(msg, event_list)
    logging.info(msg)
def main(test=False, **kwargs):
    """
    Processflow main

    Initializes the config, filemanager and runmanager, then runs the
    main polling loop until every jobset completes or fails. Returns 0
    on success, -1 on setup error; returns None after cleanup when
    interrupted or on an unexpected exception.
    """
    # The master configuration object
    config = {}

    # An event to kill the threads on terminal exception
    thread_kill_event = threading.Event()
    mutex = threading.Lock()

    # A flag to tell if we have all the data locally
    all_data = False
    all_data_remote = False

    # get a globus client
    client = get_client()

    # Read in parameters from config
    if test:
        print '=========================================='
        print '---- Processflow running in test mode ----'
        print '=========================================='
        # in test mode the argv comes from the caller, not the command line
        _args = kwargs['testargs']
        config, filemanager, runmanager = initialize(
            argv=_args,
            version=__version__,
            branch=__branch__,
            event_list=event_list,
            kill_event=thread_kill_event,
            mutex=mutex,
            testing=True)
    else:
        config, filemanager, runmanager = initialize(
            argv=sys.argv[1:],
            version=__version__,
            branch=__branch__,
            event_list=event_list,
            kill_event=thread_kill_event,
            mutex=mutex)
    # setup returned an error code
    if isinstance(config, int):
        print "Error in setup, exiting"
        return -1
    logging.info('Config setup complete')
    debug = True if config['global'].get('debug') else False

    msg = "Updating local file status"
    print_line(line=msg, event_list=event_list)
    filemanager.update_local_status()
    all_data_local = filemanager.all_data_local()
    if not all_data_local:
        # kick off transfers for anything missing before entering the loop
        filemanager.transfer_needed(event_list=event_list, event=thread_kill_event)

    # msg = "Writing human readable state to file"
    # print_line(msg, event_list)

    # check if the case_scripts directory is present
    # if its not, transfer it over
    if config['global'].get('get_scripts'):
        pass
        # msg = 'transfering case_scripts from remote machine'
        # print_line(
        #     line=msg,
        #     event_list=event_list)
        # case_scripts_dir = os.path.join(
        #     config['global']['input_path'],
        #     'case_scripts')
        # if not os.path.exists(case_scripts_dir):
        #     logging.info(msg)
        #     src_path = os.path.join(
        #         config['global']['source_path'], 'case_scripts')
        #     while True:
        #         try:
        #             args = {
        #                 'source_endpoint': config['transfer']['source_endpoint'],
        #                 'destination_endpoint': config['transfer']['destination_endpoint'],
        #                 'src_path': src_path,
        #                 'dst_path': case_scripts_dir,
        #                 'event_list': event_list,
        #                 'event': thread_kill_event
        #             }
        #             thread = threading.Thread(
        #                 target=transfer_directory,
        #                 name='transfer_directory',
        #                 kwargs=args)
        #         except:
        #             sleep(1)
        #         else:
        #             thread_list.append(thread)
        #             thread.start()
        #             break

    # Main loop
    printed = False
    loop_delay = 10
    state_path = os.path.join(config['global']['project_path'], 'output', 'state.txt')
    try:
        print "--------------------------"
        print " Entering Main Loop "
        print " Status file: {}".format(state_path)
        print "--------------------------"
        while True:
            # keep requesting transfers until all input data is local
            if not all_data_local:
                if debug:
                    print_line(' -- Updating local status --', event_list)
                if filemanager.update_local_status():
                    msg = filemanager.report_files_local()
                    print_line(msg, event_list)
                    filemanager.write_database()
                all_data_local = filemanager.all_data_local()
            if not all_data_local:
                if debug:
                    print_line(' -- Additional data needed --', event_list)
                filemanager.transfer_needed(event_list, thread_kill_event)

            if debug:
                print_line(' -- checking data -- ', event_list)
            runmanager.check_data_ready()

            if debug:
                print_line(' -- starting ready jobs --', event_list)
            runmanager.start_ready_jobs()

            if debug:
                print_line(' -- monitoring running jobs --', event_list)
            runmanager.monitor_running_jobs()

            if debug:
                print_line(' -- writing out state -- ', event_list)
            runmanager.write_job_sets(state_path)

            status = runmanager.is_all_done()
            # return -1 if still running
            # return 0 if a jobset failed
            # return 1 if all complete
            if status >= 0:
                msg = "Finishing up run"
                print_line(msg, event_list)
                printed = False
                # wait for any in-flight transfers to land before finalizing
                while not filemanager.all_data_local():
                    if not printed:
                        printed = True
                        msg = 'Jobs are complete, but additional data is being transfered'
                        print_line(msg, event_list)
                    filemanager.update_local_status()
                    if not filemanager.all_data_local():
                        filemanager.transfer_needed(event_list=event_list, event=thread_kill_event)
                    sleep(10)
                filemanager.write_database()
                finalize(
                    config=config,
                    event_list=event_list,
                    status=status,
                    kill_event=thread_kill_event,
                    runmanager=runmanager)
                # SUCCESS EXIT
                return 0
            if debug:
                print_line(' -- sleeping', event_list)
            sleep(loop_delay)
    except KeyboardInterrupt as e:
        # user abort: persist state and stop transfer threads cleanly
        print_message('\n----- KEYBOARD INTERRUPT -----')
        runmanager.write_job_sets(state_path)
        filemanager.terminate_transfers()
        print_message('----- cleanup complete -----', 'ok')
    except Exception as e:
        # top-level boundary: log the error, persist state, stop transfers
        print_message('----- AN UNEXPECTED EXCEPTION OCCURED -----')
        print_debug(e)
        runmanager.write_job_sets(state_path)
        filemanager.terminate_transfers()
def transfer_needed(self, event_list, event):
    """
    Start a transfer job for any files that arent local, but do exist remotely

    Globus user must already be logged in

    Parameters
    ----------
        event_list (EventList): list to push user notifications into
        event (threading.Event): set to terminate running transfer threads

    Returns
    -------
        False on error, None otherwise
    """
    # required files dont exist locally, do exist remotely
    # or if they do exist locally have a different local and remote size
    try:
        q = (DataFile
             .select(DataFile.case)
             .where(
                 DataFile.local_status == FileStatus.NOT_PRESENT.value))
        caselist = [x.case for x in q.execute()]
        if not caselist:
            return
        # de-duplicate the case names while preserving order
        cases = list()
        for case in caselist:
            if case not in cases:
                cases.append(case)

        for case in cases:
            q = (DataFile
                 .select()
                 .where(
                     (DataFile.case == case) &
                     (DataFile.local_status == FileStatus.NOT_PRESENT.value)))
            # FIX: filter 'local' files with a comprehension; the old code
            # called required_files.remove() while iterating the same list,
            # which skips the element following each removal
            required_files = [x for x in q.execute()
                              if x.transfer_type != 'local']
            if not required_files:
                msg = 'ERROR: all missing files are marked as local'
                print_line(msg, self._event_list)
                return
            # mark files as in-transit so we dont double-copy
            q = (DataFile
                 .update({DataFile.local_status: FileStatus.IN_TRANSIT})
                 .where(DataFile.name << [x.name for x in required_files]))
            q.execute()

            # FIX: build the target list fresh for each case; previously it
            # accumulated across iterations, so each later case's transfer
            # thread re-copied every earlier case's files
            target_files = list()
            for datafile in required_files:
                target_files.append({
                    'local_path': datafile.local_path,
                    'remote_path': datafile.remote_path,
                })

            if required_files[0].transfer_type == 'globus':
                msg = 'Starting globus file transfer of {} files'.format(
                    len(required_files))
                print_line(msg, self._event_list)
                msg = 'See https://www.globus.org/app/activity for transfer details'
                print_line(msg, self._event_list)

                client = get_client()
                if not self.verify_remote_files(client=client, case=case):
                    return False
                remote_uuid = required_files[0].remote_uuid
                local_uuid = self._config['global']['local_globus_uuid']
                thread_name = '{}_globus_transfer'.format(required_files[0].case)
                _args = (client, remote_uuid, local_uuid,
                         target_files, self.kill_event)
                thread = Thread(
                    target=globus_transfer,
                    name=thread_name,
                    args=_args)
                self.thread_list.append(thread)
                thread.start()
            elif required_files[0].transfer_type == 'sftp':
                msg = 'Starting sftp file transfer of {} files'.format(
                    len(required_files))
                print_line(msg, self._event_list)

                client = get_ssh_client(required_files[0].remote_hostname)
                if not self.verify_remote_files(client=client, case=case):
                    return False
                thread_name = '{}_sftp_transfer'.format(required_files[0].case)
                _args = (target_files, client, self.kill_event)
                thread = Thread(
                    target=self._ssh_transfer,
                    name=thread_name,
                    args=_args)
                self.thread_list.append(thread)
                thread.start()
    except Exception as e:
        print_debug(e)
        return False
def handle_completion(self, filemanager, event_list, config):
    """
    Post run handler, adds produced timeseries variable files into
    the filemanagers database

    Parameters
    ----------
        filemanager (FileManager): The filemanager to add the files to
        event_list (EventList): an event list to push user notifications into
        config (dict): the global config object
    """
    if self.status != JobStatus.COMPLETED:
        msg = '{prefix}: Job failed, not running completion handler'.format(
            prefix=self.msg_prefix())
        print_line(msg, event_list)
        logging.info(msg)
        return
    else:
        msg = '{prefix}: Job complete'.format(prefix=self.msg_prefix())
        print_line(msg, event_list)
        logging.info(msg)

    var_list = config['post-processing']['timeseries'][self._run_type]

    # add native timeseries files to the filemanager db
    ts_path = os.path.join(
        config['global']['project_path'], 'output', 'pp',
        config['simulations'][self.case]['native_grid_name'],
        self._short_name, 'ts',
        '{length}yr'.format(length=self.end_year - self.start_year + 1))
    new_files = list()
    for ts_file in get_ts_output_files(ts_path, var_list, self.start_year, self.end_year):
        new_files.append({
            'name': ts_file,
            'local_path': os.path.join(ts_path, ts_file),
            'case': self.case,
            'year': self.start_year,
            'month': self.end_year,  # month field holds the end year
            'local_status': FileStatus.PRESENT.value
        })
    filemanager.add_files(
        data_type='ts_native',
        file_list=new_files)
    if not config['data_types'].get('ts_native'):
        config['data_types']['ts_native'] = {'monthly': False}

    if self._regrid:
        # add regridded timeseries files to the filemanager db
        regrid_path = os.path.join(
            config['global']['project_path'], 'output', 'pp',
            config['post-processing']['timeseries']['destination_grid_name'],
            self._short_name, 'ts',
            '{length}yr'.format(length=self.end_year - self.start_year + 1))
        new_files = list()
        # FIX: list the files that actually exist in the regrid output
        # directory; previously the names were read from ts_path while
        # local_path was built from regrid_path
        ts_files = get_ts_output_files(regrid_path, var_list, self.start_year, self.end_year)
        for regrid_file in ts_files:
            new_files.append({
                'name': regrid_file,
                'local_path': os.path.join(regrid_path, regrid_file),
                'case': self.case,
                'year': self.start_year,
                'month': self.end_year,  # month field holds the end year
                'local_status': FileStatus.PRESENT.value
            })
        filemanager.add_files(
            data_type='ts_regrid',
            file_list=new_files)
        if not config['data_types'].get('ts_regrid'):
            config['data_types']['ts_regrid'] = {'monthly': False}
    msg = '{prefix}: Job completion handler done'.format(
        prefix=self.msg_prefix())
    print_line(msg, event_list)
    logging.info(msg)
def report_completed_job(self):
    """
    Push a progress message ("N/M jobs complete or P%") into the event list.

    Guards against a ZeroDivisionError when no jobs have been registered
    (self._job_total == 0), in which case progress is reported as 0%.
    """
    if self._job_total:
        percent = (self._job_complete * 100.0) / self._job_total
    else:
        # nothing registered yet; avoid dividing by zero
        percent = 0.0
    msg = '{complete}/{total} jobs complete or {percent:.2f}%'.format(
        complete=self._job_complete,
        total=self._job_total,
        percent=percent)
    print_line(msg, self.event_list)