def option_reset(self, request):
    """
    Overwrite request values with the ones from its subcampaign.

    Only allowed while the request is still in status "new". Returns the
    updated request.
    """
    prepid = request.get_prepid()
    requests_table = Database('requests')
    with self.locker.get_nonblocking_lock(prepid):
        # Re-fetch the request under the lock before modifying it
        request = Request(json_input=requests_table.get(prepid))
        if request.get('status') != 'new':
            raise Exception('It is not allowed to option reset '
                            'requests that are not in status "new"')

        subcampaign_name = request.get('subcampaign')
        subcampaign_json = Database('subcampaigns').get(subcampaign_name)
        if not subcampaign_json:
            raise Exception(f'Subcampaign "{subcampaign_name}" does not exist')

        subcampaign = Subcampaign(json_input=subcampaign_json)
        # Copy the resettable attributes from the subcampaign
        for attribute in ('memory', 'sequences', 'energy', 'cmssw_release'):
            request.set(attribute, subcampaign.get(attribute))

        requests_table.save(request.get_json())

    return request
def update_workflows(self, relval):
    """
    Refresh the RelVal's computing workflows with information from Stats2.
    """
    prepid = relval.get_prepid()
    relval_db = Database('relvals')
    with self.locker.get_lock(prepid):
        # Re-fetch the RelVal under the lock
        relval = self.get(prepid)
        known_names = {wf['name'] for wf in relval.get('workflows')}
        stats_workflows = get_workflows_from_stats_for_prepid(prepid)
        # Names the RelVal already knows but the prepid lookup did not return
        known_names -= {wf['RequestName'] for wf in stats_workflows}
        self.logger.info('%s workflows that are not in stats: %s',
                         len(known_names),
                         known_names)
        # Fetch the stragglers by name and merge everything into one dict
        stats_workflows += get_workflows_from_stats(list(known_names))
        all_workflows = {}
        for workflow in stats_workflows:
            workflow_name = workflow.get('RequestName') if workflow else None
            if not workflow_name:
                raise Exception('Could not find workflow in Stats2')

            all_workflows[workflow_name] = workflow
            self.logger.info('Found workflow %s', workflow_name)

        output_datasets = self.get_output_datasets(relval, all_workflows)
        picked_workflows = self.pick_workflows(all_workflows, output_datasets)
        relval.set('output_datasets', output_datasets)
        relval.set('workflows', picked_workflows)
        relval_db.save(relval.get_json())

    return relval
def submit_relval(self, relval, controller):
    """
    Submit a RelVal to computing. Used by submission workers - this is
    where the actual submission happens.

    Flow: acquire the prepid lock, re-fetch the RelVal, prepare a remote
    workspace over SSH, generate and upload configs, build the job dict and
    submit it to ReqMgr2, then mark the RelVal as submitted and approve the
    workflow. Any exception routes through __handle_error (which resets and
    notifies) and aborts the rest.

    :param relval: RelVal to submit (re-fetched by prepid under the lock)
    :param controller: controller used to fetch the RelVal, build the job
        dict and (outside development) update workflows afterwards
    """
    prepid = relval.get_prepid()
    credentials_file = Config.get('credentials_file')
    workspace_dir = Config.get('remote_path').rstrip('/')
    # NOTE(review): duplicate assignment - prepid was already set above
    prepid = relval.get_prepid()
    self.logger.debug('Will try to acquire lock for %s', prepid)
    with Locker().get_lock(prepid):
        self.logger.info('Locked %s for submission', prepid)
        relval_db = Database('relvals')
        # Re-fetch the RelVal now that the lock is held
        relval = controller.get(prepid)
        try:
            self.check_for_submission(relval)
            with SSHExecutor('lxplus.cern.ch', credentials_file) as ssh:
                # Start executing commands
                self.prepare_workspace(relval, controller, ssh, workspace_dir)
                # Create configs
                self.generate_configs(relval, ssh, workspace_dir)
                # Upload configs
                config_hashes = self.upload_configs(relval, ssh, workspace_dir)
                # Remove remote relval directory
                ssh.execute_command([f'rm -rf {workspace_dir}/{prepid}'])

            self.logger.debug(config_hashes)
            # Iterate through uploaded configs and save their hashes in
            # RelVal steps
            self.update_steps_with_config_hashes(relval, config_hashes)
            # Submit job dict to ReqMgr2
            job_dict = controller.get_job_dict(relval)
            cmsweb_url = Config.get('cmsweb_url')
            grid_cert = Config.get('grid_user_cert')
            grid_key = Config.get('grid_user_key')
            connection = ConnectionWrapper(host=cmsweb_url,
                                           cert_file=grid_cert,
                                           key_file=grid_key)
            workflow_name = self.submit_job_dict(job_dict, connection)
            # Update RelVal after successful submission
            relval.set('workflows', [{'name': workflow_name}])
            relval.set('status', 'submitted')
            relval.add_history('submission', 'succeeded', 'automatic')
            relval_db.save(relval.get_json())
            # Short pause before approving the freshly submitted workflow
            # NOTE(review): connection is not closed if approve_workflow
            # raises - consider try/finally
            time.sleep(3)
            self.approve_workflow(workflow_name, connection)
            connection.close()
            if not Config.get('development'):
                refresh_workflows_in_stats([workflow_name])
        except Exception as ex:
            # Submission failed - reset the RelVal, notify, and bail out
            self.__handle_error(relval, str(ex))
            return

        self.__handle_success(relval)

    if not Config.get('development'):
        # Pull the freshly created workflow's info from Stats2
        controller.update_workflows(relval)

    self.logger.info('Successfully finished %s submission', prepid)
def after_update(self, old_obj, new_obj, changed_values):
    """
    Hook that runs after a RelVal was updated.

    If "workflow_name" is among the changed values, a copy of the RelVal is
    created (which yields a new prepid - presumably derived from the
    workflow name, confirm against create()), the old history is carried
    over with a "rename" entry, every ticket referencing the old prepid is
    updated to the new one, and the old RelVal is deleted.

    :param old_obj: RelVal as it was before the update
    :param new_obj: updated RelVal; its prepid is replaced with the newly
        created one
    :param changed_values: which values changed in the update
    """
    self.logger.info('Changed values: %s', changed_values)
    if 'workflow_name' in changed_values:
        # Create the renamed clone and carry over the old history
        new_relval = self.create(new_obj.get_json())
        old_prepid = old_obj.get_prepid()
        new_prepid = new_relval.get_prepid()
        new_relval.set('history', old_obj.get('history'))
        new_relval.add_history('rename', [old_prepid, new_prepid], None)
        relvals_db = Database('relvals')
        relvals_db.save(new_relval.get_json())
        self.logger.info('Created %s as rename of %s', new_prepid, old_prepid)
        new_obj.set('prepid', new_prepid)
        # Update the ticket...
        tickets_db = Database('tickets')
        tickets = tickets_db.query(f'created_relvals={old_obj.get_prepid()}')
        self.logger.debug(json.dumps(tickets, indent=2))
        for ticket_json in tickets:
            ticket_prepid = ticket_json['prepid']
            with self.locker.get_lock(ticket_prepid):
                # Re-fetch the ticket under the lock before modifying it
                ticket_json = tickets_db.get(ticket_prepid)
                ticket = Ticket(json_input=ticket_json)
                created_relvals = ticket.get('created_relvals')
                # Swap the old prepid for the new one in the ticket's list
                if old_prepid in created_relvals:
                    created_relvals.remove(old_prepid)
                    created_relvals.append(new_prepid)
                ticket.set('created_relvals', created_relvals)
                ticket.add_history('rename', [old_prepid, new_prepid], None)
                tickets_db.save(ticket.get_json())
        # The old prepid is no longer valid - remove the old RelVal
        self.delete(old_obj.get_json())
def update_status(self, request, status, timestamp=None):
    """
    Move the request to the given status, record the change in its history
    and persist it to the database.
    """
    database = Database(self.database_name)
    request.set('status', status)
    request.add_history('status', status, None, timestamp)
    database.save(request.get_json())
def create_relvals_for_ticket(self, ticket):
    """
    Create RelVals from given ticket. Return list of relval prepids.

    Under the ticket lock: generate workflows over SSH, build a RelVal for
    each workflow, optionally recycle input, create the RelVals through the
    RelVal controller and mark the ticket as "done". If anything raises,
    every RelVal created so far is deleted again (in reverse order) and the
    exception is re-raised; SSH connections are closed in all cases.
    """
    ticket_db = Database('tickets')
    ticket_prepid = ticket.get_prepid()
    ssh_executor = SSHExecutor('lxplus.cern.ch', Config.get('credentials_file'))
    relval_controller = RelValController()
    created_relvals = []
    with self.locker.get_lock(ticket_prepid):
        # Re-fetch the ticket under the lock
        ticket = self.get(ticket_prepid)
        rewrite_gt_string = ticket.get('rewrite_gt_string')
        recycle_input_of = ticket.get('recycle_input_of')
        try:
            workflows = self.generate_workflows(ticket, ssh_executor)
            # Iterate through workflows and create RelVal objects
            relvals = []
            for workflow_id, workflow_dict in workflows.items():
                relvals.append(self.create_relval_from_workflow(ticket,
                                                                workflow_id,
                                                                workflow_dict))

            # Handle recycling if needed
            if recycle_input_of:
                if rewrite_gt_string:
                    self.recycle_input_with_gt_rewrite(relvals,
                                                       rewrite_gt_string,
                                                       recycle_input_of)
                else:
                    self.recycle_input(relvals,
                                       relval_controller,
                                       recycle_input_of)

            # Persist the RelVals; keep track of them for possible rollback
            for relval in relvals:
                relval = relval_controller.create(relval.get_json())
                created_relvals.append(relval)
                self.logger.info('Created %s', relval.get_prepid())

            created_relval_prepids = [r.get('prepid') for r in created_relvals]
            ticket.set('created_relvals', created_relval_prepids)
            ticket.set('status', 'done')
            ticket.add_history('created_relvals', created_relval_prepids, None)
            ticket_db.save(ticket.get_json())
        except Exception as ex:
            self.logger.error('Error creating RelVal from ticket: %s', ex)
            # Delete created relvals if there was an Exception
            for created_relval in reversed(created_relvals):
                relval_controller.delete({'prepid': created_relval.get('prepid')})
            # And reraise the exception
            raise ex
        finally:
            # Close all SSH connections
            ssh_executor.close_connections()

    return [r.get('prepid') for r in created_relvals]
def update_status(self, relval, status, timestamp=None):
    """
    Move the RelVal to the given status, record the change in its history
    and persist it to the database.
    """
    database = Database(self.database_name)
    relval.set('status', status)
    relval.add_history('status', status, None, timestamp)
    database.save(relval.get_json())
    self.logger.info('Set "%s" status to "%s"', relval.get_prepid(), status)
def __check_for_submission(self, request):
    """
    Run the final sanity checks on a request right before submission.

    The request must be in status "submitting" and must have an input
    dataset; if the dataset is missing, the request is put back into
    "approved" before raising.
    """
    prepid = request.get_prepid()
    self.logger.debug('Final check before submission for %s', prepid)
    if request.get('status') != 'submitting':
        raise Exception(f'Cannot submit a request with status {request.get("status")}')

    if not request.get('input')['dataset']:
        # Send the request back to "approved" so it can be fixed
        request.set('status', 'approved')
        Database('requests').save(request.get_json())
        raise Exception('Cannot submit a request without input dataset')
def after_delete(self, obj):
    """
    Remove the deleted RelVal's prepid from every ticket that lists it
    among its created relvals.
    """
    prepid = obj.get_prepid()
    tickets_db = Database('tickets')
    tickets = tickets_db.query(f'created_relvals={prepid}')
    self.logger.debug(json.dumps(tickets, indent=2))
    for ticket_json in tickets:
        ticket_prepid = ticket_json['prepid']
        with self.locker.get_lock(ticket_prepid):
            # Re-fetch the ticket under the lock before modifying it
            ticket = Ticket(json_input=tickets_db.get(ticket_prepid))
            created_relvals = ticket.get('created_relvals')
            if prepid in created_relvals:
                created_relvals.remove(prepid)

            ticket.set('created_relvals', created_relvals)
            ticket.add_history('remove_relval', prepid, None)
            tickets_db.save(ticket.get_json())
def __handle_error(self, request, error_message):
    """
    React to a failed submission: reset the request back to "new", record
    the failure in its history and email the interested people.
    """
    request_db = Database('requests')
    request.set('status', 'new')
    request.add_history('submission', 'failed', 'automatic')
    request_db.save(request.get_json())
    prepid = request.get_prepid()
    self.logger.warning('Submission of %s failed', prepid)
    service_url = Config.get('service_url')
    subject = f'Request {prepid} submission failed'
    body = (f'Hello,\n\nUnfortunately submission of {prepid} failed.\n'
            f'You can find this request at '
            f'{service_url}/requests?prepid={prepid}\n'
            f'Error message:\n\n{error_message}')
    emailer = Emailer()
    emailer.send(subject, body, emailer.get_recipients(request))
def change_request_priority(self, request, priority):
    """
    Change the priority of a request and of all its active workflows.

    Only requests in status "submitted" can have their priority changed.
    The new priority is pushed to every active workflow in ReqMgr2, Stats2
    is asked to refresh those workflows, and the request is saved.

    :param request: request whose priority should be changed (re-fetched
        by prepid under the lock)
    :param priority: new priority value
    :return: the updated request
    :raises Exception: if the request is not in status "submitted"
    """
    prepid = request.get_prepid()
    request_db = Database('requests')
    cmsweb_url = Config.get('cmsweb_url')
    grid_cert = Config.get('grid_user_cert')
    grid_key = Config.get('grid_user_key')
    self.logger.info('Will try to change %s priority to %s', prepid, priority)
    with self.locker.get_nonblocking_lock(prepid):
        # Re-fetch the request under the lock before modifying it
        request_json = request_db.get(prepid)
        request = Request(json_input=request_json)
        if request.get('status') != 'submitted':
            raise Exception('It is not allowed to change priority of '
                            'requests that are not in status "submitted"')

        request.set('priority', priority)
        updated_workflows = []
        active_workflows = self.__pick_active_workflows(request)
        connection = ConnectionWrapper(host=cmsweb_url,
                                       keep_open=True,
                                       cert_file=grid_cert,
                                       key_file=grid_key)
        # Ensure the kept-open connection is closed even if a ReqMgr2
        # API call raises mid-loop (previously it would leak)
        try:
            for workflow in active_workflows:
                workflow_name = workflow['name']
                self.logger.info('Changing "%s" priority to %s',
                                 workflow_name,
                                 priority)
                response = connection.api('PUT',
                                          f'/reqmgr2/data/request/{workflow_name}',
                                          {'RequestPriority': priority})
                updated_workflows.append(workflow_name)
                self.logger.debug(response)
        finally:
            connection.close()

        # Update priority in Stats2
        self.force_stats_to_refresh(updated_workflows)
        # Finally save the request
        request_db.save(request.get_json())

    return request
def __handle_error(self, relval, error_message):
    """
    React to a failed RelVal submission: reset the RelVal back to "new",
    wipe per-step submission artifacts, record the failure in history and
    email the interested people.
    """
    self.logger.error(error_message)
    relval_db = Database('relvals')
    relval.set('status', 'new')
    relval.set('campaign_timestamp', 0)
    relval.add_history('submission', 'failed', 'automatic')
    # Drop submission leftovers from every step
    for step in relval.get('steps'):
        step.set('config_id', '')
        step.set('resolved_globaltag', '')

    relval_db.save(relval.get_json())
    prepid = relval.get_prepid()
    service_url = Config.get('service_url')
    subject = f'RelVal {prepid} submission failed'
    body = (f'Hello,\n\nUnfortunately submission of {prepid} failed.\n'
            f'You can find this relval at '
            f'{service_url}/relvals?prepid={prepid}\n'
            f'Error message:\n\n{error_message}')
    emailer = Emailer()
    emailer.send(subject, body, emailer.get_recipients(relval))
def update_workflows(self, request):
    """
    Update computing workflows from Stats2.

    Under the request lock: query the Stats2 CouchDB for workflows of this
    prepid, merge them with the workflows the request already knows, fetch
    each workflow document (skipping resubmissions), recompute output
    datasets, completed events, priority and total events, save the
    request, and refresh the input dataset of any subsequent requests that
    chain off this one.
    """
    prepid = request.get_prepid()
    request_db = Database('requests')
    with self.locker.get_lock(prepid):
        # Re-fetch the request under the lock
        request_json = request_db.get(prepid)
        request = Request(json_input=request_json)
        # Direct connection to the Stats2 CouchDB
        stats_conn = ConnectionWrapper(host='vocms074.cern.ch',
                                       port=5984,
                                       https=False,
                                       keep_open=True)
        # NOTE(review): CouchDB usually expects lowercase "true" for
        # include_docs - confirm the Python-style "True" is accepted here
        stats_workflows = stats_conn.api(
            'GET',
            f'/requests/_design/_designDoc/_view/prepids?key="{prepid}"&include_docs=True')
        stats_workflows = json.loads(stats_workflows)
        stats_workflows = [x['doc'] for x in stats_workflows['rows']]
        # Union of already-known workflow names and names found in Stats2
        existing_workflows = [x['name'] for x in request.get('workflows')]
        stats_workflows = [x['RequestName'] for x in stats_workflows]
        all_workflow_names = list(set(existing_workflows) | set(stats_workflows))
        self.logger.info('All workflows of %s are %s',
                         prepid,
                         ', '.join(all_workflow_names))
        all_workflows = {}
        for workflow_name in all_workflow_names:
            workflow = stats_conn.api('GET', f'/requests/{workflow_name}')
            if not workflow:
                raise Exception(f'Could not find {workflow_name} in Stats2')

            workflow = json.loads(workflow)
            if not workflow.get('RequestName'):
                raise Exception(f'Could not find {workflow_name} in Stats2')

            # Resubmission (ACDC) workflows are intentionally skipped
            if workflow.get('RequestType', '').lower() == 'resubmission':
                continue

            all_workflows[workflow_name] = workflow
            self.logger.info('Fetched workflow %s', workflow_name)

        # NOTE(review): not closed if an api() call above raises -
        # consider try/finally
        stats_conn.close()
        output_datasets = self.__get_output_datasets(request, all_workflows)
        new_workflows = self.__pick_workflows(all_workflows, output_datasets)
        all_workflow_names = [x['name'] for x in new_workflows]
        # Take completed events of the last output dataset from the newest
        # workflow that has it
        for new_workflow in reversed(new_workflows):
            completed_events = -1
            for output_dataset in new_workflow.get('output_datasets', []):
                if output_datasets and output_dataset['name'] == output_datasets[-1]:
                    completed_events = output_dataset['events']
                    break

            if completed_events != -1:
                request.set('completed_events', completed_events)
                break

        # Mirror priority and total events from the newest workflow
        if all_workflow_names:
            newest_workflow = all_workflows[all_workflow_names[-1]]
            if 'RequestPriority' in newest_workflow:
                request.set('priority', newest_workflow['RequestPriority'])

            if 'TotalEvents' in newest_workflow:
                request.set('total_events', max(0, newest_workflow['TotalEvents']))

        request.set('output_datasets', output_datasets)
        request.set('workflows', new_workflows)
        request_db.save(request.get_json())
        # Propagate the (possibly new) output datasets to requests that
        # use this request as their input
        if output_datasets:
            subsequent_requests = request_db.query(f'input.request={prepid}')
            self.logger.info('Found %s subsequent requests for %s: %s',
                             len(subsequent_requests),
                             prepid,
                             [r['prepid'] for r in subsequent_requests])
            for subsequent_request_json in subsequent_requests:
                subsequent_request_prepid = subsequent_request_json.get('prepid')
                self.update_input_dataset(self.get(subsequent_request_prepid))

    return request
"""
Script to add run list to relval steps
"""
import sys
import os.path
import os
sys.path.append(os.path.abspath(os.path.pardir))
from core_lib.database.database import Database

Database.set_credentials_file(os.getenv('DB_AUTH'))
Database.set_database_name('relval')

relvals_database = Database('relvals')
total_relvals = relvals_database.get_count()
print('Total relvals: %s' % (total_relvals))
for number, relval in enumerate(relvals_database.query(limit=total_relvals), start=1):
    print('Processing entry %s/%s %s' % (number,
                                         total_relvals,
                                         relval.get('prepid', '<no-id>')))
    # Make sure every step's input has a "run" list, defaulting to empty
    for step in relval['steps']:
        step['input']['run'] = step['input'].get('run', [])

    relvals_database.save(relval)

print('Done')
# NOTE(review): this chunk starts mid-script - request_db, subcampaign_db,
# old_ticket_db, new_ticket_db and total_subcampaigns are defined earlier,
# and the final loop's body continues past this chunk.
total_requests = request_db.get_count()
total_old_tickets = old_ticket_db.get_count()
total_new_tickets = new_ticket_db.get_count()
print('Requests: %s' % (total_requests))
print('Subcampaigns: %s' % (total_subcampaigns))
print('(Old) subcampaign tickets: %s' % (total_old_tickets))
print('(New) tickets: %s' % (total_new_tickets))
# Drop the CouchDB revision and the obsolete "step" attribute from
# subcampaigns
for index, subcampaign in enumerate(
        subcampaign_db.query(limit=total_subcampaigns)):
    print('Processing subcampaign %s/%s %s' % (index + 1,
                                               total_subcampaigns,
                                               subcampaign['prepid']))
    subcampaign.pop('_rev', None)
    subcampaign.pop('step', None)
    subcampaign_db.save(subcampaign)

# Drop "_rev"/"step" from requests and migrate the old flat
# "input_dataset" attribute into the newer {"dataset", "request"} structure
for index, request in enumerate(request_db.query(limit=total_requests)):
    print('Processing request %s/%s %s' % (index + 1,
                                           total_requests,
                                           request['prepid']))
    request.pop('_rev', None)
    request.pop('step', None)
    if 'input_dataset' in request:
        request['input'] = {
            'dataset': request.pop('input_dataset'),
            'request': ''
        }

    request_db.save(request)

# Migrate old subcampaign tickets (loop body continues in the next chunk)
for index, ticket in enumerate(old_ticket_db.query(limit=total_old_tickets)):
# Strip the obsolete "scram_arch" attribute from every subcampaign and
# request in the ReReco database
Database.set_credentials_file(os.getenv('DB_AUTH'))
Database.set_database_name('rereco')

request_db = Database('requests')
subcampaign_db = Database('subcampaigns')
total_subcampaigns = subcampaign_db.get_count()
total_requests = request_db.get_count()
print('Requests: %s' % (total_requests))
print('Subcampaigns: %s' % (total_subcampaigns))
for number, subcampaign in enumerate(subcampaign_db.query(limit=total_subcampaigns), start=1):
    print('Processing subcampaign %s/%s %s' % (number,
                                               total_subcampaigns,
                                               subcampaign['prepid']))
    subcampaign.pop('scram_arch', None)
    subcampaign_db.save(subcampaign)

for number, request in enumerate(request_db.query(limit=total_requests), start=1):
    print('Processing request %s/%s %s' % (number,
                                           total_requests,
                                           request['prepid']))
    request.pop('scram_arch', None)
    request_db.save(request)

# Report the counts again after the cleanup
total_subcampaigns = subcampaign_db.get_count()
total_requests = request_db.get_count()
print('Requests: %s' % (total_requests))
print('Subcampaigns: %s' % (total_subcampaigns))
# Add default GPU settings to every ticket and to every relval step
# (tickets_database, relvals_database and total_tickets are defined
# earlier in this script)
def new_gpu_dict():
    """Return a fresh GPU settings dictionary with default values."""
    return {'requires': 'forbidden',
            'gpu_memory': '',
            'cuda_capabilities': [],
            'cuda_runtime': '',
            'gpu_name': '',
            'cuda_driver_version': '',
            'cuda_runtime_version': ''}


total_relvals = relvals_database.get_count()
print('Total tickets: %s' % (total_tickets))
print('Total relvals: %s' % (total_relvals))
for number, ticket in enumerate(tickets_database.query(limit=total_tickets), start=1):
    print('Processing entry %s/%s %s' % (number,
                                         total_tickets,
                                         ticket.get('prepid', '<no-id>')))
    ticket['gpu'] = new_gpu_dict()
    ticket['gpu_steps'] = []
    tickets_database.save(ticket)

for number, relval in enumerate(relvals_database.query(limit=total_relvals), start=1):
    print('Processing entry %s/%s %s' % (number,
                                         total_relvals,
                                         relval.get('prepid', '<no-id>')))
    for step in relval['steps']:
        step['gpu'] = new_gpu_dict()

    relvals_database.save(relval)
# Add an empty "job_dict_overwrite" dictionary to every relval entry
import sys
import os.path
import os
sys.path.append(os.path.abspath(os.path.pardir))
from core_lib.database.database import Database

Database.set_credentials_file(os.getenv('DB_AUTH'))
Database.set_database_name('relval')

database = Database('relvals')
total_entries = database.get_count()
print('Total entries: %s' % (total_entries))
for number, entry in enumerate(database.query(limit=total_entries), start=1):
    print('Processing entry %s/%s %s' % (number,
                                         total_entries,
                                         entry.get('prepid', '<no-id>')))
    entry['job_dict_overwrite'] = {}
    database.save(entry)

print('Done')
def create_requests_for_ticket(self, ticket):
    """
    Create requests from given ticket. Return list of request prepids.

    Under the ticket lock: validate that the ticket is still "new" and
    that no input dataset is blacklisted, then for every input dataset
    create one request per ticket step, chaining each step's input to the
    previous step's request. Runs and lumisections are attached
    best-effort (failures are only logged). On any other exception, every
    request created so far is deleted again (in reverse order) and the
    exception is re-raised.
    """
    database = Database(self.database_name)
    ticket_prepid = ticket.get_prepid()
    created_requests = []
    dataset_blacklist = set(Settings().get('dataset_blacklist'))
    request_controller = RequestController()
    with self.locker.get_lock(ticket_prepid):
        # Re-fetch the ticket under the lock
        ticket = Ticket(json_input=database.get(ticket_prepid))
        created_requests = ticket.get('created_requests')
        status = ticket.get('status')
        if status != 'new':
            raise Exception(f'Ticket is not new, it already has '
                            f'{len(created_requests)} requests created')

        # In case black list was updated after ticket was created
        for input_dataset in ticket.get('input_datasets'):
            dataset = input_dataset.split('/')[1]
            if dataset in dataset_blacklist:
                raise Exception(f'Input dataset {input_dataset} is not '
                                f'allowed because {dataset} is in blacklist')

        try:
            for input_dataset in ticket.get('input_datasets'):
                last_request_prepid = None
                for step_index, step in enumerate(ticket.get('steps')):
                    subcampaign_name = step['subcampaign']
                    processing_string = step['processing_string']
                    time_per_event = step['time_per_event']
                    size_per_event = step['size_per_event']
                    priority = step['priority']
                    new_request_json = {
                        'subcampaign': subcampaign_name,
                        'priority': priority,
                        'processing_string': processing_string,
                        'time_per_event': time_per_event,
                        'size_per_event': size_per_event,
                        'input': {
                            'dataset': '',
                            'request': ''
                        }
                    }
                    # First step reads the input dataset directly; later
                    # steps chain off the previously created request
                    if step_index == 0:
                        new_request_json['input']['dataset'] = input_dataset
                    else:
                        new_request_json['input']['request'] = last_request_prepid

                    # Best-effort: attach runs and lumisections, but do
                    # not fail request creation if the lookup errors out
                    try:
                        runs = request_controller.get_runs(subcampaign_name,
                                                           input_dataset)
                        new_request_json['runs'] = runs
                        lumis = request_controller.get_lumisections(subcampaign_name,
                                                                    runs)
                        new_request_json['lumisections'] = lumis
                    except Exception as ex:
                        self.logger.error(
                            'Error getting runs or lumis for %s %s %s: \n%s',
                            subcampaign_name,
                            input_dataset,
                            processing_string,
                            ex)

                    request = request_controller.create(new_request_json)
                    created_requests.append(request)
                    last_request_prepid = request.get('prepid')
                    self.logger.info('Created %s', last_request_prepid)

            created_request_prepids = [
                r.get('prepid') for r in created_requests
            ]
            ticket.set('created_requests', created_request_prepids)
            ticket.set('status', 'done')
            ticket.add_history('create_requests', created_request_prepids, None)
            database.save(ticket.get_json())
        except Exception as ex:
            # Delete created requests if there was an Exception
            for created_request in reversed(created_requests):
                request_controller.delete(
                    {'prepid': created_request.get('prepid')})
            # And reraise the exception
            raise ex

    return [r.get('prepid') for r in created_requests]