def message_handle(self, msg):
    record_counter('daemons.cache.consumer.message_handle.message')
    try:
        if isinstance(msg, dict) and 'operation' in msg.keys():
            if msg['operation'] == 'add_replicas':
                validate_schema(name='cache_add_replicas', obj=msg)
                if 'rse' in msg.keys() and 'files' in msg.keys():
                    logging.debug('[%s] %s %s %s' % (self.__broker, msg['operation'], msg['rse'], msg['files']))
                    try:
                        if not self.__rse_volatile.get_volatile(msg['rse']):
                            logging.error("%s volatile is not True, Rucio Cache should not update it." % (msg['rse']))
                        else:
                            cache_add_replicas(rse=msg['rse'], files=msg['files'], account=self.__account, lifetime=msg['lifetime'])
                    except Exception as error:
                        logging.error('[%s] %s %s %s %s with error details: %s' % (self.__broker, msg['operation'], msg['rse'], msg['files'], str(error), str(format_exc())))
            if msg['operation'] == 'delete_replicas':
                validate_schema(name='cache_delete_replicas', obj=msg)
                if 'rse' in msg.keys() and 'files' in msg.keys():
                    logging.debug('[%s] %s %s %s' % (self.__broker, msg['operation'], msg['rse'], msg['files']))
                    try:
                        if not self.__rse_volatile.get_volatile(msg['rse']):
                            logging.error("%s volatile is not True, Rucio Cache should not update it." % (msg['rse']))
                        else:
                            cache_delete_replicas(rse=msg['rse'], files=msg['files'], account=self.__account)
                    except Exception as error:
                        logging.error('[%s] %s %s %s %s with error details: %s' % (self.__broker, msg['operation'], msg['rse'], msg['files'], str(error), str(format_exc())))
    except Exception:
        logging.error(str(format_exc()))
def query_request_details(request_id, transfertool='fts3', session=None):
    """
    Query the detailed status of a request. Can also be done after the
    external transfer has finished.

    :param request_id: Request-ID as a 32 character hex string.
    :param transfertool: Transfertool name as a string.
    :param session: Database session to use.
    :returns: Detailed request status information as a dictionary.
    """
    record_counter('core.request.query_request_details')

    req = get_request(request_id, session=session)

    if not req:
        return

    if transfertool == 'fts3':
        ts = time.time()
        tmp = fts3.query_details(req['external_id'], req['external_host'])
        record_timer('core.request.query_details_fts3', (time.time() - ts) * 1000)
        return tmp

    raise NotImplementedError
def get_request_by_did(scope, name, rse, rse_id=None, request_type=None, session=None):
    """
    Retrieve a request by its DID for a destination RSE.

    :param scope: The scope of the data identifier.
    :param name: The name of the data identifier.
    :param rse: The destination RSE of the request.
    :param rse_id: The destination RSE ID of the request. Overrides the rse parameter!
    :param request_type: The type of request as rucio.db.constants.RequestType.
    :param session: Database session to use.
    :returns: Request as a dictionary.
    """
    record_counter('core.request.get_request_by_did')
    try:
        tmp = session.query(models.Request).filter_by(scope=scope, name=name)

        if rse_id:
            tmp = tmp.filter_by(dest_rse_id=rse_id)
        else:
            tmp = tmp.filter_by(dest_rse_id=get_rse_id(rse))

        if request_type:
            tmp = tmp.filter_by(request_type=request_type)

        tmp = tmp.first()
        if not tmp:
            return
        tmp = dict(tmp)
        tmp.pop('_sa_instance_state')
        return tmp
    except IntegrityError as error:
        raise RucioException(error.args)
def query(tid, session):
    """
    Query the transfer job information of a single job. Has a chance to
    progress the job from SUBMITTED/ACTIVE to a terminal state.

    :param tid: The transfer job id.
    :returns: The transfer job information.
    """
    record_counter('daemons.mock.fts3.query')

    ts = time.time()
    # Draw a new state with fixed weights:
    # 15% FINISHED, 3% FAILED, 2% FINISHEDDIRTY, 80% ACTIVE.
    new_state = random.sample(sum([[FTSState.FINISHED] * 15,
                                   [FTSState.FAILED] * 3,
                                   [FTSState.FINISHEDDIRTY] * 2,
                                   [FTSState.ACTIVE] * 80], []), 1)[0]
    record_timer('daemons.mock.fts3.query.000-random_sample', (time.time() - ts) * 1000)

    ts = time.time()
    query = session.query(test_models.MockFTSTransfer).filter(and_(test_models.MockFTSTransfer.transfer_id == tid,
                                                                   or_(test_models.MockFTSTransfer.state == FTSState.SUBMITTED,
                                                                       test_models.MockFTSTransfer.state == FTSState.ACTIVE)))
    if query.update({'state': new_state,
                     'last_modified': datetime.datetime.utcnow()}) == 0:
        return None

    r = {'job_state': str(new_state)}
    if new_state in (FTSState.FAILED, FTSState.FINISHEDDIRTY):
        r['reason'] = 'Mock FTS decided to kill your transfer.'
        r['files'] = [{'source_surl': 'mock_src',
                       'dest_surl': 'mock_dest',
                       'reason': 'mock failure'}]

    return r
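# A standalone sketch (not part of the mock daemon) of the weighted draw above,
# using random.choices (Python 3.6+) instead of sampling from an expanded list;
# the state names and weights mirror the mock, everything else is illustrative.
import random

def draw_mock_state():
    # 15% FINISHED, 3% FAILED, 2% FINISHEDDIRTY, 80% ACTIVE
    states = ['FINISHED', 'FAILED', 'FINISHEDDIRTY', 'ACTIVE']
    return random.choices(states, weights=[15, 3, 2, 80], k=1)[0]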
def undertaker(worker_number=1, total_workers=1, chunk_size=5, once=False):
    """
    Main loop to select and delete dids.
    """
    logging.info('Undertaker(%s): starting' % worker_number)
    logging.info('Undertaker(%s): started' % worker_number)

    while not graceful_stop.is_set():
        try:
            dids = list_expired_dids(worker_number=worker_number, total_workers=total_workers, limit=10000)
            if not dids and not once:
                logging.info('Undertaker(%s): Nothing to do. sleep 60.' % worker_number)
                time.sleep(60)
                continue

            for chunk in chunks(dids, chunk_size):
                try:
                    logging.info('Undertaker(%s): Received %s dids to delete' % (worker_number, len(chunk)))
                    delete_dids(dids=chunk, account='root')
                    logging.info('Undertaker(%s): Deleted %s dids' % (worker_number, len(chunk)))
                    record_counter(counters='undertaker.delete_dids', delta=len(chunk))
                except DatabaseException as error:
                    logging.error('Undertaker(%s): Got database error %s.' % (worker_number, str(error)))
        except:
            logging.error(traceback.format_exc())
            time.sleep(1)

        if once:
            break

    logging.info('Undertaker(%s): graceful stop requested' % worker_number)
    logging.info('Undertaker(%s): graceful stop done' % worker_number)
def delete_dids(dids, account, session=None):
    """
    Delete data identifiers.

    :param dids: The list of dids to delete.
    :param account: The account.
    :param session: The database session in use.
    """
    rule_id_clause = []
    content_clause = []
    parent_content_clause = []
    did_clause = []
    for did in dids:
        logging.info('Removing did %s:%s' % (did['scope'], did['name']))
        did_clause.append(and_(models.DataIdentifier.scope == did['scope'],
                               models.DataIdentifier.name == did['name']))
        parent_content_clause.append(and_(models.DataIdentifierAssociation.child_scope == did['scope'],
                                          models.DataIdentifierAssociation.child_name == did['name']))
        rule_id_clause.append(and_(models.ReplicationRule.scope == did['scope'],
                                   models.ReplicationRule.name == did['name']))
        content_clause.append(and_(models.DataIdentifierAssociation.scope == did['scope'],
                                   models.DataIdentifierAssociation.name == did['name']))

        # Send message for AMI
        add_message('ERASE', {'account': account,
                              'scope': did['scope'],
                              'name': did['name']},
                    session=session)

    # Delete rules on the dids
    if rule_id_clause:
        with record_timer_block('undertaker.rules'):
            for (rule_id, scope, name, rse_expression) in session.query(models.ReplicationRule.id,
                                                                        models.ReplicationRule.scope,
                                                                        models.ReplicationRule.name,
                                                                        models.ReplicationRule.rse_expression).filter(or_(*rule_id_clause)):
                logging.debug('Removing rule %s for did %s:%s on RSE-Expression %s' % (str(rule_id), scope, name, rse_expression))
                rucio.core.rule.delete_rule(rule_id=rule_id, nowait=True, session=session)

    # Detach from parent dids
    existing_parent_dids = False
    if parent_content_clause:
        with record_timer_block('undertaker.parent_content'):
            for parent_did in session.query(models.DataIdentifierAssociation).filter(or_(*parent_content_clause)):
                existing_parent_dids = True
                detach_dids(scope=parent_did.scope,
                            name=parent_did.name,
                            dids=[{'scope': parent_did.child_scope,
                                   'name': parent_did.child_name}],
                            session=session)

    # Remove content
    if content_clause:
        with record_timer_block('undertaker.content'):
            rowcount = session.query(models.DataIdentifierAssociation).filter(or_(*content_clause)).\
                delete(synchronize_session=False)
        record_counter(counters='undertaker.content.rowcount', delta=rowcount)

    # Remove data identifiers
    if existing_parent_dids:
        # Exit the method early to give the Judge time to remove the locks
        # (otherwise, due to foreign keys, did removal does not work).
        logging.debug('Leaving delete_dids early for Judge-Evaluator checks')
        return

    with record_timer_block('undertaker.dids'):
        rowcount = session.query(models.DataIdentifier).filter(or_(*did_clause)).\
            filter(or_(models.DataIdentifier.did_type == DIDType.CONTAINER,
                       models.DataIdentifier.did_type == DIDType.DATASET)).\
            delete(synchronize_session=False)
def necromancer(worker_number=1, total_workers=1, chunk_size=5, once=False):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given
    hash, identifies lost DIDs and, for the non-lost ones, sets the locks and
    rules for re-evaluation.

    :param worker_number: The number of the worker (thread).
    :param total_workers: The total number of workers (threads).
    :param chunk_size: The size of the chunks to process.
    :param once: To run only once.
    """
    sleep_time = 60
    while not graceful_stop.is_set():
        stime = time.time()
        try:
            replicas = list_bad_replicas(limit=chunk_size, worker_number=worker_number, total_workers=total_workers)
            for replica in replicas:
                scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                logging.info('Thread [%i/%i] : Working on %s:%s on %s' % (worker_number, total_workers, scope, name, rse))
                rep = [r for r in list_replicas([{'scope': scope, 'name': name}, ])]
                if (not rep[0]['rses']) or (rep[0]['rses'].keys() == [rse]):
                    logging.info('Thread [%i/%i] : File %s:%s has no other replicas, it will be marked as lost' % (worker_number, total_workers, scope, name))
                    try:
                        update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id)
                        monitor.record_counter(counters='necromancer.badfiles.lostfile', delta=1)
                    except DatabaseException as error:
                        logging.info('Thread [%i/%i] : %s' % (worker_number, total_workers, str(error)))
                else:
                    logging.info('Thread [%i/%i] : File %s:%s can be recovered. Available sources : %s' % (worker_number, total_workers, scope, name, str(rep[0]['rses'])))
                    try:
                        update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id)
                        monitor.record_counter(counters='necromancer.badfiles.recovering', delta=1)
                    except DatabaseException as error:
                        logging.info('Thread [%i/%i] : %s' % (worker_number, total_workers, str(error)))
            logging.info('Thread [%i/%i] : It took %s seconds to process %s replicas' % (worker_number, total_workers, str(time.time() - stime), str(len(replicas))))
        except Exception:
            logging.error(traceback.format_exc())

        if once:
            break
        time.sleep(sleep_time)
def requeue_and_archive(request_id, session=None):
    """
    Requeue and archive a failed request.
    TODO: Multiple requeue.

    :param request_id: Original request ID as a string.
    :param session: Database session to use.
    """
    record_counter('core.request.requeue_request')

    new_req = get_request(request_id, session=session)

    if new_req:
        archive_request(request_id, session=session)
        new_req['request_id'] = generate_uuid()
        new_req['previous_attempt_id'] = request_id
        if new_req['retry_count'] is None:
            new_req['retry_count'] = 1
        else:
            new_req['retry_count'] += 1

        # hardcoded for now - only requeue a couple of times
        if new_req['retry_count'] < 4:
            queue_requests([new_req], session=session)
            return new_req
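# Standalone sketch of the retry bookkeeping used by requeue_and_archive above
# (the cap of 4 is taken from the code; the helper name is illustrative):
# bump retry_count and requeue only while it stays under the cap.
def should_requeue(retry_count, max_retries=4):
    retry_count = 1 if retry_count is None else retry_count + 1
    return retry_count, retry_count < max_retries

assert should_requeue(None) == (1, True)
assert should_requeue(3) == (4, False)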
def submit_transfers(transfers, transfertool='fts3', job_metadata={}, session=None):
    """
    Submit transfer request to a transfertool.

    :param transfers: Dictionary containing request metadata.
    :param transfertool: Transfertool as a string.
    :param job_metadata: Metadata key/value pairs for all files as a dictionary.
    :param session: Database session to use.
    :returns: Transfertool external ID.
    """
    record_counter('core.request.submit_transfer')

    transfer_id = None

    if transfertool == 'fts3':
        ts = time.time()
        transfer_ids = fts3.submit_transfers(transfers, job_metadata)
        record_timer('core.request.submit_transfers_fts3', (time.time() - ts) * 1000)

        for transfer_id in transfer_ids:
            session.query(models.Request)\
                   .filter_by(id=transfer_id)\
                   .update({'state': RequestState.SUBMITTED,
                            'external_id': transfer_ids[transfer_id]['external_id'],
                            'external_host': transfer_ids[transfer_id]['external_host'],
                            'dest_url': transfer_ids[transfer_id]['dest_urls'][0]},
                           synchronize_session=False)

        return transfer_ids
def query(transfer_id, transfer_host):
    """
    Query the status of a transfer in FTS3 via JSON.

    :param transfer_id: FTS transfer identifier as a string.
    :param transfer_host: FTS server as a string.
    :returns: Transfer status information as a dictionary.
    """
    job = None
    if transfer_host.startswith("https://"):
        job = requests.get("%s/jobs/%s" % (transfer_host, transfer_id),
                           verify=False,
                           cert=(__USERCERT, __USERCERT),
                           headers={"Content-Type": "application/json"})
    else:
        job = requests.get("%s/jobs/%s" % (transfer_host, transfer_id),
                           headers={"Content-Type": "application/json"})

    if job and job.status_code == 200:
        record_counter("transfertool.fts3.%s.query.success" % __extract_host(transfer_host))
        return job.json()

    record_counter("transfertool.fts3.%s.query.failure" % __extract_host(transfer_host))
    raise Exception("Could not retrieve transfer information: %s" % job.content)
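# Illustrative standalone call with the same shape as query() above. The host,
# job id and certificate paths are placeholders, not real endpoints; FTS3
# exposes a job as JSON under GET <host>/jobs/<job_id>.
import requests

def fts3_job_status(host, job_id, cert=None):
    response = requests.get('%s/jobs/%s' % (host, job_id),
                            cert=cert,
                            verify=False,
                            headers={'Content-Type': 'application/json'})
    response.raise_for_status()
    return response.json()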
def cancel_request_did(scope, name, dest_rse_id, request_type=RequestType.TRANSFER, session=None):
    """
    Cancel a request based on a DID and request type.

    :param scope: Data identifier scope as a string.
    :param name: Data identifier name as a string.
    :param dest_rse_id: RSE id as a string.
    :param request_type: Type of the request.
    :param session: Database session to use.
    """
    record_counter('core.request.cancel_request_did')

    reqs = None
    try:
        reqs = session.query(models.Request.id,
                             models.Request.external_id,
                             models.Request.external_host).filter_by(scope=scope,
                                                                     name=name,
                                                                     dest_rse_id=dest_rse_id,
                                                                     request_type=request_type).all()
        if not reqs:
            logging.warn('Tried to cancel non-existent request for DID %s:%s at RSE ID %s' % (scope, name, dest_rse_id))
    except IntegrityError as error:
        raise RucioException(error.args)
def version(transfer_host):
    """
    Returns FTS3 server information.

    :param transfer_host: FTS server as a string.
    :returns: FTS3 server information as a dictionary.
    """
    r = None
    if transfer_host.startswith("https://"):
        r = requests.get("%s/" % transfer_host,
                         verify=False,
                         cert=(__USERCERT, __USERCERT),
                         headers={"Content-Type": "application/json"})
    else:
        r = requests.get("%s/" % transfer_host,
                         headers={"Content-Type": "application/json"})

    if r and r.status_code == 200:
        record_counter("transfertool.fts3.%s.version.success" % __extract_host(transfer_host))
        return r.json()

    record_counter("transfertool.fts3.%s.version.failure" % __extract_host(transfer_host))
    raise Exception("Could not retrieve version: %s" % r.content)
def archive_request(request_id, session=None):
    """
    Move a request to the history table.

    :param request_id: Request-ID as a 32 character hex string.
    :param session: Database session to use.
    """
    record_counter('core.request.archive')
    req = get_request(request_id=request_id, session=session)

    if req:
        hist_request = models.Request.__history_mapper__.class_(id=req['id'],
                                                                request_type=req['request_type'],
                                                                scope=req['scope'],
                                                                name=req['name'],
                                                                dest_rse_id=req['dest_rse_id'],
                                                                attributes=req['attributes'],
                                                                state=req['state'],
                                                                external_id=req['external_id'],
                                                                retry_count=req['retry_count'],
                                                                err_msg=req['err_msg'],
                                                                previous_attempt_id=req['previous_attempt_id'],
                                                                external_host=req['external_host'],
                                                                rule_id=req['rule_id'],
                                                                activity=req['activity'],
                                                                bytes=req['bytes'],
                                                                md5=req['md5'],
                                                                adler32=req['adler32'],
                                                                dest_url=req['dest_url'])
        hist_request.save(session=session)
        try:
            session.query(models.Request).filter_by(id=request_id).delete()
        except IntegrityError as error:
            raise RucioException(error.args)
def cancel(transfer_id, transfer_host):
    """
    Cancel a transfer that has been submitted to FTS via JSON.

    :param transfer_id: FTS transfer identifier as a string.
    :param transfer_host: FTS server as a string.
    """
    job = None
    if transfer_host.startswith("https://"):
        job = requests.delete("%s/jobs/%s" % (transfer_host, transfer_id),
                              verify=False,
                              cert=(__USERCERT, __USERCERT),
                              headers={"Content-Type": "application/json"})
    else:
        job = requests.delete("%s/jobs/%s" % (transfer_host, transfer_id),
                              headers={"Content-Type": "application/json"})

    if job and job.status_code == 200:
        record_counter("transfertool.fts3.%s.cancel.success" % __extract_host(transfer_host))
        return job.json()

    record_counter("transfertool.fts3.%s.cancel.failure" % __extract_host(transfer_host))
    raise Exception("Could not cancel transfer: %s" % job.content)
def query_details(transfer_id, transfer_host):
    """
    Query the detailed status of a transfer in FTS3 via JSON.

    :param transfer_id: FTS transfer identifier as a string.
    :param transfer_host: FTS server as a string.
    :returns: Detailed transfer status information as a dictionary.
    """
    files = None
    if transfer_host.startswith("https://"):
        files = requests.get("%s/jobs/%s/files" % (transfer_host, transfer_id),
                             verify=False,
                             cert=(__USERCERT, __USERCERT),
                             headers={"Content-Type": "application/json"})
    else:
        files = requests.get("%s/jobs/%s/files" % (transfer_host, transfer_id),
                             headers={"Content-Type": "application/json"})

    if files and files.status_code == 200:
        record_counter("transfertool.fts3.%s.query_details.success" % __extract_host(transfer_host))
        return files.json()

    record_counter("transfertool.fts3.%s.query_details.failure" % __extract_host(transfer_host))
    return
def on_message(self, headers, message):
    record_counter('daemons.cache.consumer2.message')
    msg = json.loads(message)
    id = msg['id']
    if id % self.__num_thread == self.__id:
        self.message_handle(msg['payload'])
def consumer(id, total_threads=1):
    """
    Main loop to consume messages from the FTS3 producer.
    """
    logging.info('consumer starting')

    brokers_alias = []
    brokers_resolved = []
    try:
        brokers_alias = [b.strip() for b in config_get('messaging-fts3', 'brokers').split(',')]
    except:
        raise Exception('Could not load brokers from configuration')

    logging.info('resolving broker dns alias: %s' % brokers_alias)

    brokers_resolved = []
    for broker in brokers_alias:
        brokers_resolved.append([str(tmp_broker) for tmp_broker in dns.resolver.query(broker, 'A')])
    brokers_resolved = [item for sublist in brokers_resolved for item in sublist]

    logging.debug('brokers resolved to %s', brokers_resolved)

    conns = []
    for broker in brokers_resolved:
        conns.append(stomp.Connection(host_and_ports=[(broker, config_get_int('messaging-fts3', 'port'))],
                                      use_ssl=True,
                                      ssl_key_file=config_get('messaging-fts3', 'ssl_key_file'),
                                      ssl_cert_file=config_get('messaging-fts3', 'ssl_cert_file'),
                                      ssl_version=ssl.PROTOCOL_TLSv1))

    logging.info('consumer started')

    while not graceful_stop.is_set():
        for conn in conns:
            if not conn.is_connected():
                logging.info('connecting to %s' % conn.transport._Transport__host_and_ports[0][0])
                record_counter('daemons.messaging.fts3.reconnect.%s' % conn.transport._Transport__host_and_ports[0][0].split('.')[0])
                conn.set_listener('rucio-messaging-fts3', Consumer(broker=conn.transport._Transport__host_and_ports[0],
                                                                   id=id,
                                                                   total_threads=total_threads))
                conn.start()
                conn.connect()
                conn.subscribe(destination=config_get('messaging-fts3', 'destination'),
                               id='rucio-messaging-fts3',
                               ack='auto')
        time.sleep(1)

    logging.info('graceful stop requested')

    for conn in conns:
        try:
            conn.disconnect()
        except:
            pass

    logging.info('graceful stop done')
def submit_deletion(url, session=None):
    """
    Submit a deletion request to a deletiontool.

    :param url: URL acceptable to deletiontool as a string.
    :param session: Database session to use.
    :returns: Deletiontool external ID.
    """
    record_counter('core.request.submit_deletion')
def list_all(session):
    """
    List all transfer jobs.

    :returns: List of dictionaries with job information.
    """
    record_counter('daemons.mock.fts3.list_all')

    query = session.query(test_models.MockFTSTransfer).order_by(test_models.MockFTSTransfer.lastmodified.desc())
    for row in query.yield_per(5):
        yield row
def bulk_query_requests(request_host, request_ids, transfertool='fts3', session=None):
    """
    Query the status of a request.

    :param request_host: Name of the external host.
    :param request_ids: List of (Request-ID as a 32 character hex string, External-ID as a 32 character hex string).
    :param transfertool: Transfertool name as a string.
    :param session: Database session to use.
    :returns: Request status information as a dictionary.
    """
    record_counter('core.request.query_request')

    transfer_ids = []
    for request_id, external_id in request_ids:
        if external_id not in transfer_ids:
            transfer_ids.append(external_id)

    if transfertool == 'fts3':
        try:
            ts = time.time()
            fts_resps = fts3.bulk_query(transfer_ids, request_host)
            record_timer('core.request.query_bulk_request_fts3', (time.time() - ts) * 1000 / len(transfer_ids))
        except Exception:
            raise

        responses = {}
        for request_id, external_id in request_ids:
            fts_resp = fts_resps[external_id]
            if not fts_resp:
                req_status = {}
                req_status['new_state'] = RequestState.LOST
                req_status['request_id'] = request_id
            elif isinstance(fts_resp, Exception):
                req_status = fts_resp
            else:
                req_status = fts_resp
                # needed for unfinished jobs
                req_status['request_id'] = request_id

                if req_status['job_state'] in (str(FTSState.FAILED),
                                               str(FTSState.FINISHEDDIRTY),
                                               str(FTSState.CANCELED)):
                    req_status['new_state'] = RequestState.FAILED
                elif req_status['job_state'] == str(FTSState.FINISHED):
                    req_status['new_state'] = RequestState.DONE

            responses[request_id] = req_status
        return responses

    raise NotImplementedError
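# Standalone sketch of the deduplication step in bulk_query_requests above:
# several Rucio requests can share one FTS job, so each external id is queried
# only once (the ids below are made up).
request_ids_example = [('req-1', 'job-A'), ('req-2', 'job-A'), ('req-3', 'job-B')]
transfer_ids_example = []
for _request_id, external_id in request_ids_example:
    if external_id not in transfer_ids_example:
        transfer_ids_example.append(external_id)
assert transfer_ids_example == ['job-A', 'job-B']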
def make_replicas_available(self):
    """
    Marks available replicas for the dataset at rse if they are in PhEDEx.
    """
    with monitor.record_timer_block('cms_sync.time_recover_replica'):
        logging.info('Recovering unavailable replicas for %s:%s at %s', self.scope, self.block_name, self.rse)

        replicas = list_replicas(dids=[{'scope': self.scope, 'name': self.block_name}],
                                 rse_expression='rse=%s' % self.rse,
                                 all_states=True)
        try:
            unavailable_replicas = {repl['name'] for repl in replicas
                                    if repl['states'][self.rse] != 'AVAILABLE'}
        except TypeError:
            unavailable_replicas = set()

        phedex_replicas = set(self.replicas.keys())
        missing = list(phedex_replicas & unavailable_replicas)

        logging.info('Recovery for %s:%s at %s: PhEDEx has %s, Rucio unavailable %s. Missing: %s',
                     self.scope, self.block_name, self.rse,
                     len(phedex_replicas), len(unavailable_replicas), len(missing))

        # Fix up things which are unavailable
        rse_details = get_rse(self.rse)
        rse_id = rse_details['id']
        scope = InternalScope(self.scope)
        state = 'A'

        for name in missing:
            logging.info('Setting available %s:%s at %s', self.scope, name, self.rse)
            core_update_state(rse_id=rse_id, scope=scope, name=name, state=state)

        monitor.record_counter('cms_sync.files_made_available', delta=len(missing))
        return
def bulk_submit_xfer(submitjob, recursive=False):
    cfg = load_config()
    client_id = cfg['globus']['apps'][GLOBUS_AUTH_APP]['client_id']
    auth_client = NativeAppAuthClient(client_id)
    refresh_token = cfg['globus']['apps'][GLOBUS_AUTH_APP]['refresh_token']
    source_endpoint_id = submitjob[0].get('metadata').get('source_globus_endpoint_id')
    destination_endpoint_id = submitjob[0].get('metadata').get('dest_globus_endpoint_id')
    authorizer = RefreshTokenAuthorizer(refresh_token=refresh_token, auth_client=auth_client)
    tc = TransferClient(authorizer=authorizer)

    # make job_label for the task a timestamp
    now = datetime.datetime.now()
    job_label = now.strftime('%Y%m%d%H%M%S')

    # Retrieve the globus_task_deadline value to enforce a time window for
    # completing the transfers; the default is 2880 minutes (48 hours).
    globus_task_deadline = config_get_int('conveyor', 'globus_task_deadline', False, 2880)
    deadline = now + datetime.timedelta(minutes=globus_task_deadline)

    # From Globus: sync_level="checksum" means that before files are transferred,
    # Globus computes checksums on the source and destination files, and transfers
    # only the files whose checksums differ. verify_checksum=True means that after
    # a file is transferred, Globus computes checksums on the source and destination
    # files to verify that the file was transferred correctly; if the checksums do
    # not match, the transfer of that file is redone.
    tdata = TransferData(tc,
                         source_endpoint_id,
                         destination_endpoint_id,
                         label=job_label,
                         sync_level="checksum",
                         deadline=str(deadline))

    for file in submitjob:
        source_path = file.get('sources')[0]
        dest_path = file.get('destinations')[0]
        filesize = file['metadata']['filesize']
        # TODO: support passing a recursive parameter to Globus
        # md5 = file['metadata']['md5']
        # tdata.add_item(source_path, dest_path, recursive=False, external_checksum=md5)
        tdata.add_item(source_path, dest_path, recursive=False)
        record_counter('daemons.conveyor.transfer_submitter.globus.transfers.submit.filesize', filesize)

    # logging.info('submitting transfer...')
    transfer_result = tc.submit_transfer(tdata)
    # logging.info("task_id =", transfer_result["task_id"])

    return transfer_result["task_id"]
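# Illustrative shape of the submitjob argument consumed by bulk_submit_xfer
# above (all values are placeholders): one dict per file, with the Globus
# endpoint ids carried in the metadata of the first entry.
submitjob_example = [{
    'sources': ['/source/path/file1'],
    'destinations': ['/dest/path/file1'],
    'metadata': {'filesize': 1024,
                 'source_globus_endpoint_id': '<source-endpoint-uuid>',
                 'dest_globus_endpoint_id': '<dest-endpoint-uuid>'},
}]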
def query_latest(self, state, last_nhours=1):
    """
    Query the latest transfer statuses in FTS3 via JSON.

    :param state: Transfer state as a string or a dictionary.
    :returns: Transfer status information as a dictionary.
    """
    jobs = None
    try:
        whoami = requests.get('%s/whoami' % (self.external_host),
                              verify=self.verify,
                              cert=self.cert,
                              headers={'Content-Type': 'application/json'})
        if whoami and whoami.status_code == 200:
            delegation_id = whoami.json()['delegation_id']
        else:
            raise Exception('Could not retrieve delegation id: %s' % whoami.content)
        state_string = ','.join(state)
        jobs = requests.get('%s/jobs?dlg_id=%s&state_in=%s&time_window=%s' % (self.external_host,
                                                                              delegation_id,
                                                                              state_string,
                                                                              last_nhours),
                            verify=self.verify,
                            cert=self.cert,
                            headers={'Content-Type': 'application/json'})
    except ReadTimeout as error:
        raise TransferToolTimeout(error)
    except JSONDecodeError as error:
        raise TransferToolWrongAnswer(error)
    except Exception:
        logging.warn('Could not query latest terminal states from %s', self.external_host)

    if jobs and (jobs.status_code == 200 or jobs.status_code == 207):
        record_counter('transfertool.fts3.%s.query_latest.success' % self.__extract_host(self.external_host))
        try:
            jobs_json = jobs.json()
            return jobs_json
        except ReadTimeout as error:
            raise TransferToolTimeout(error)
        except JSONDecodeError as error:
            raise TransferToolWrongAnswer(error)
        except Exception as error:
            logging.error("Failed to parse the jobs status %s" % (str(error)))

    record_counter('transfertool.fts3.%s.query.failure' % self.__extract_host(self.external_host))
def update_replicas(self):
    """
    Adds or removes replicas for the dataset at rse.
    """
    with monitor.record_timer_block('cms_sync.time_update_replica'):
        logging.info('Updating replicas for %s:%s at %s', self.scope, self.block_name, self.rse)

        replicas = list_replicas(dids=[{'scope': self.scope, 'name': self.block_name}],
                                 rse_expression='rse=%s' % self.rse)
        try:
            rucio_replicas = {repl['name'] for repl in replicas}
        except TypeError:
            rucio_replicas = set()

        phedex_replicas = set(self.replicas.keys())
        missing = list(phedex_replicas - rucio_replicas)
        to_remove = list(rucio_replicas - phedex_replicas)

        if missing and (len(phedex_replicas) != len(missing)):
            logging.warn('Recovery: Inconsistency found for %s at %s: %s in PhEDEx and %s missing',
                         self.rse, self.block_name, len(phedex_replicas), len(missing))

        if missing:
            logging.info('Some or all replicas for %s at %s missing', self.rse, self.block_name)
            lfns_added = self.add_missing_replicas(missing)
            monitor.record_counter('cms_sync.files_added', delta=lfns_added)

        if to_remove:
            logging.info('Removing replicas for %s at %s', self.rse, self.block_name)
            lfns_removed = self.remove_extra_replicas(to_remove)
            monitor.record_counter('cms_sync.files_removed', delta=lfns_removed)

        if not missing and not to_remove:
            logging.warn('Something very off for %s at %s', self.rse, self.block_name)
            logging.warn('Phedex: %s', phedex_replicas)
            logging.warn('Rucio: %s', rucio_replicas)
            logging.warn('Missing: %s', missing)
            logging.warn('To remove: %s', to_remove)
        return
def necromancer(thread=0, bulk=5, once=False):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given
    hash, identifies lost DIDs and, for the non-lost ones, sets the locks and
    rules for re-evaluation.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param once: Run only once.
    """
    sleep_time = 60
    update_history_threshold = 3600
    update_history_time = time.time()

    executable = ' '.join(argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not graceful_stop.is_set():
        hb = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'] + 1, hb['nr_threads'])

        stime = time.time()
        try:
            replicas = list_bad_replicas(limit=bulk, thread=hb['assign_thread'], total_threads=hb['nr_threads'])
            for replica in replicas:
                scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                logging.info(prepend_str + 'Working on %s:%s on %s' % (scope, name, rse))
                rep = [r for r in list_replicas([{'scope': scope, 'name': name}, ])]
                if (not rep[0]['rses']) or (rep[0]['rses'].keys() == [rse]):
                    logging.info(prepend_str + 'File %s:%s has no other replicas, it will be marked as lost' % (scope, name))
                    try:
                        update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.lostfile', delta=1)
                    except DatabaseException as error:
                        logging.info(prepend_str + '%s' % (str(error)))
                else:
                    logging.info(prepend_str + 'File %s:%s can be recovered. Available sources : %s' % (scope, name, str(rep[0]['rses'])))
                    try:
                        update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.recovering', delta=1)
                    except DatabaseException as error:
                        logging.info(prepend_str + '%s' % (str(error)))
            logging.info(prepend_str + 'It took %s seconds to process %s replicas' % (str(time.time() - stime), str(len(replicas))))
        except Exception:
            logging.error(traceback.format_exc())

        if once:
            break
        time.sleep(sleep_time)
def touch_request(request_id, session=None):
    """
    Update the timestamp of a request. Fails silently if the request_id does not exist.

    :param request_id: Request-ID as a 32 character hex string.
    :param session: Database session to use.
    """
    record_counter('core.request.touch_request')

    try:
        rowcount = session.query(models.Request).filter_by(id=request_id).update({'updated_at': datetime.datetime.utcnow()},
                                                                                 synchronize_session=False)
    except IntegrityError as error:
        raise RucioException(error.args)
def __release_all_activities(stats, direction, rse_name, rse_id):
    """
    Release requests if activities should be ignored.

    :param stats: Request statistics.
    :param direction: String whether request statistics are based on source or destination RSEs.
    :param rse_name: RSE name.
    :param rse_id: RSE id.
    """
    threshold = stats['threshold']
    transfer = stats['transfer']
    waiting = stats['waiting']
    strategy = stats['strategy']

    if threshold is not None and transfer + waiting > threshold:
        record_gauge('daemons.conveyor.throttler.set_rse_transfer_limits.%s.max_transfers' % (rse_name), threshold)
        record_gauge('daemons.conveyor.throttler.set_rse_transfer_limits.%s.transfers' % (rse_name), transfer)
        record_gauge('daemons.conveyor.throttler.set_rse_transfer_limits.%s.waitings' % (rse_name), waiting)
        if transfer < 0.8 * threshold:
            to_be_released = threshold - transfer
            if strategy == 'grouped_fifo':
                deadline = stats.get('deadline')
                volume = stats.get('volume')
                release_waiting_requests_grouped_fifo(rse_id, count=to_be_released, direction=direction, volume=volume, deadline=deadline)
            elif strategy == 'fifo':
                release_waiting_requests_fifo(rse_id, count=to_be_released, direction=direction)
        else:
            logging.debug("Throttler has done nothing on rse %s (transfer > 0.8 * threshold)" % rse_name)
    elif waiting > 0 or not threshold:
        logging.debug("Throttler removes limits (threshold: %s) and releases all waiting requests on rse %s" % (threshold, rse_name))
        delete_rse_transfer_limits(rse_id, activity='all_activities')
        release_all_waiting_requests(rse_id, direction=direction)
        record_counter('daemons.conveyor.throttler.delete_rse_transfer_limits.%s' % (rse_name))
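# Standalone sketch of the release decision in __release_all_activities above
# (numbers made up): waiting requests are released only while active transfers
# sit below 80% of the threshold, and then only up to the threshold.
threshold_example, transfer_example = 100, 70
if transfer_example < 0.8 * threshold_example:
    to_be_released_example = threshold_example - transfer_example
assert to_be_released_example == 30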
def cancel(tid, session):
    """
    Kills a transfer by setting its state to CANCELLED.

    :param tid: The transfer job id.
    """
    record_counter('daemons.mock.fts3.cancel')

    ts = time.time()
    # Match on the transfer_id column; filter() does not accept keyword arguments.
    query = session.query(test_models.MockFTSTransfer).filter(test_models.MockFTSTransfer.transfer_id == tid)
    query.update({'state': FTSState.CANCELED,
                  'last_modified': datetime.datetime.utcnow()})
    record_timer('daemons.mock.fts3.cancel.update_state', (time.time() - ts) * 1000)
def trace(payload):
    """
    Write a trace to the log file and send it to ActiveMQ.

    :param payload: Python dictionary with trace report.
    """
    record_counter('trace.trace')
    report = json.dumps(payload, default=date_handler)
    LOGGER.debug(report)

    t_conns = CONNS[:]

    try:
        for i in range(len(t_conns)):
            try:
                conn = random.sample(t_conns, 1)[0]
                if not conn.is_connected():
                    logging.info('reconnect to ' + conn.transport._Transport__host_and_ports[0][0])
                    conn.start()
                    conn.connect(USERNAME, PASSWORD)
            except stomp.exception.NotConnectedException as error:
                logging.warn('Could not connect to broker %s, try another one' % conn.transport._Transport__host_and_ports[0][0])
                t_conns.remove(conn)
                continue
            except stomp.exception.ConnectFailedException as error:
                logging.warn('Could not connect to broker %s, try another one' % conn.transport._Transport__host_and_ports[0][0])
                t_conns.remove(conn)
                continue

        if conn.is_connected():
            conn.send(body=report,
                      destination=TOPIC,
                      headers={'persistent': 'true',
                               'appversion': 'rucio'})
        else:
            logging.error("Unable to connect to broker. Could not send trace: %s" % report)
    except Exception as error:
        logging.error(error)
def get_jobs_response(transfer_host, fts_session, jobs_response):
    """
    Parse FTS bulk query response and query details for finished jobs.

    :param transfer_host: FTS server as a string.
    :param fts_session: Query request as a session.
    :param jobs_response: FTS bulk query response as a dict.
    :returns: Transfer status information as a dictionary.
    """
    responses = {}
    for job_response in jobs_response:
        transfer_id = job_response['job_id']
        if job_response['http_status'] == "404 Not Found":
            responses[transfer_id] = None
        elif job_response['http_status'] == "200 Ok":
            if not job_response['job_state'] in (str(FTSState.FAILED),
                                                 str(FTSState.FINISHEDDIRTY),
                                                 str(FTSState.CANCELED),
                                                 str(FTSState.FINISHED)):
                responses[transfer_id] = {}
                responses[transfer_id]['job_state'] = job_response['job_state']
                responses[transfer_id]['new_state'] = None
                responses[transfer_id]['transfer_id'] = transfer_id
            else:
                if transfer_host.startswith("https"):
                    files = fts_session.get('%s/jobs/%s/files' % (transfer_host, transfer_id),
                                            verify=False,
                                            cert=(__USERCERT, __USERCERT),
                                            headers={'Content-Type': 'application/json'})
                else:
                    files = fts_session.get('%s/jobs/%s/files' % (transfer_host, transfer_id),
                                            headers={'Content-Type': 'application/json'})
                if files and (files.status_code == 200 or files.status_code == 207):
                    record_counter('transfertool.fts3.%s.jobs_response.success' % __extract_host(transfer_host))
                    responses[transfer_id] = format_response(transfer_host, job_response, files.json())
                else:
                    record_counter('transfertool.fts3.%s.jobs_response.failure' % __extract_host(transfer_host))
                    responses[transfer_id] = Exception('Could not retrieve files information: %s' % files)
    return responses
def set_external_host(request_id, external_host, session=None):
    """
    Update the state of a request. Fails silently if the request_id does not exist.

    :param request_id: Request-ID as a 32 character hex string.
    :param external_host: Selected external host as string in format protocol://fqdn:port
    :param session: Database session to use.
    """
    record_counter('core.request.set_external_host')

    try:
        session.query(models.Request).filter_by(id=request_id).update({'external_host': external_host},
                                                                      synchronize_session=False)
    except IntegrityError as error:
        raise RucioException(error.args)
def set_request_state(request_id, new_state, session=None):
    """
    Update the state of a request. Fails silently if the request_id does not exist.

    :param request_id: Request-ID as a 32 character hex string.
    :param new_state: New state as string.
    :param session: Database session to use.
    """
    record_counter('core.request.set_request_state')

    try:
        session.query(models.Request).filter_by(id=request_id).update({'state': new_state},
                                                                      synchronize_session=False)
    except IntegrityError as error:
        raise RucioException(error.args)
def on_message(self, headers, message):
    record_counter('daemons.cache.consumer2.message')
    # id = msg['id']
    # if id % self.__num_thread == self.__id:
    #     self.message_handle(msg['payload'])
    try:
        msg = json.loads(message)
        if isinstance(msg, dict) and 'operation' in msg.keys():
            if msg['operation'] == 'add_replicas':
                logging.info('add_replicas to RSE %s: %s ' % (msg['rse'], str(msg['files'])))
                add_volatile_replicas(rse=msg['rse'], replicas=msg['files'])
            elif msg['operation'] == 'delete_replicas':
                logging.info('delete_replicas to RSE %s: %s ' % (msg['rse'], str(msg['files'])))
                delete_volatile_replicas(rse=msg['rse'], replicas=msg['files'])
    except:
        logging.error(str(format_exc()))
def update_rule(self):
    """
    Adds or removes the rule for the block.
    """
    rules = list_replication_rules(filters={'scope': self.scope, 'name': self.block_name})
    # rules = self.rcli.list_did_rules(scope=self.scope, name=self.block_name)
    rse_expression = 'rse=' + self.rse

    remove_rules = [rule for rule in rules
                    if rule['account'] == self.account and rule['rse_expression'] == rse_expression]

    if not remove_rules and self.is_at_pnn:
        self.rule_exists = False
        if self.dry_run:
            logging.info("Dry run: Adding rule for dataset %s at rse %s.", self.block_name, self.rse)
        else:
            self.add_replication_rule_with_defaults(dids=[{'scope': self.scope, 'name': self.block_name}],
                                                    copies=1,
                                                    rse_expression=rse_expression,
                                                    account=self.account)
            monitor.record_counter('cms_sync.rules_added')
        self.rule_exists = True
    elif remove_rules and not self.is_at_pnn:
        self.rule_exists = True
        if self.dry_run:
            logging.info("Removing rules for dataset %s at rse %s.", self.block_name, self.rse)
        else:
            for rule in remove_rules:
                # delete_replication_rule(rule['id'], purge_replicas=False, issuer=self.account)
                delete_rule(rule_id=rule['id'], purge_replicas=True, soft=False)
                monitor.record_counter('cms_sync.rules_removed')
        self.rule_exists = False
def register_container(self):
    self.container_exists = False

    if self.dry_run:
        logging.info('Dry Run: Create container %s in scope %s.', self.container, self.scope)
        self.container_exists = True
        return self.container_exists

    existed, created, attached, already_attached = self.register_and_attach_did(
        scope=self.scope, name=self.container, did_type='CONTAINER')
    self.container_exists = existed | created
    if existed:
        monitor.record_counter('cms_sync.container_exists')
    if created:
        monitor.record_counter('cms_sync.container_created')
    return self.container_exists
def get_jobs_response(transfer_host, fts_session, jobs_response):
    """
    Parse FTS bulk query response and query details for finished jobs.

    :param transfer_host: FTS server as a string.
    :param fts_session: Query request as a session.
    :param jobs_response: FTS bulk query response as a dict.
    :returns: Transfer status information as a dictionary.
    """
    responses = {}
    for job_response in jobs_response:
        transfer_id = job_response["job_id"]
        if job_response["http_status"] == "404 Not Found":
            responses[transfer_id] = None
        elif job_response["http_status"] == "200 Ok":
            if not job_response["job_state"] in (str(FTSState.FAILED),
                                                 str(FTSState.FINISHEDDIRTY),
                                                 str(FTSState.CANCELED),
                                                 str(FTSState.FINISHED)):
                responses[transfer_id] = {}
                responses[transfer_id]["job_state"] = job_response["job_state"]
                responses[transfer_id]["new_state"] = None
                responses[transfer_id]["transfer_id"] = transfer_id
            else:
                if transfer_host.startswith("https"):
                    files = fts_session.get("%s/jobs/%s/files" % (transfer_host, transfer_id),
                                            verify=False,
                                            cert=(__USERCERT, __USERCERT),
                                            headers={"Content-Type": "application/json"})
                else:
                    files = fts_session.get("%s/jobs/%s/files" % (transfer_host, transfer_id),
                                            headers={"Content-Type": "application/json"})
                if files and files.status_code == 200:
                    record_counter("transfertool.fts3.%s.jobs_response.success" % __extract_host(transfer_host))
                    responses[transfer_id] = format_response(transfer_host, job_response, files.json())
                else:
                    record_counter("transfertool.fts3.%s.jobs_response.failure" % __extract_host(transfer_host))
                    responses[transfer_id] = Exception("Could not retrieve files information: %s" % files)
    return responses
def undertaker(worker_number=1, total_workers=1, chunk_size=5, once=False):
    """
    Main loop to select and delete dids.
    """
    logging.info('Undertaker(%s): starting', worker_number)
    logging.info('Undertaker(%s): started', worker_number)

    hostname = socket.gethostname()
    pid = os.getpid()
    thread = threading.current_thread()
    sanity_check(executable='rucio-undertaker', hostname=hostname)

    while not GRACEFUL_STOP.is_set():
        try:
            heartbeat = live(executable='rucio-undertaker', hostname=hostname, pid=pid, thread=thread, older_than=6000)
            logging.info('Undertaker({0[worker_number]}/{0[total_workers]}): Live gives {0[heartbeat]}'.format(locals()))

            dids = list_expired_dids(worker_number=heartbeat['assign_thread'] + 1,
                                     total_workers=heartbeat['nr_threads'],
                                     limit=10000)
            if not dids and not once:
                logging.info('Undertaker(%s): Nothing to do. sleep 60.', worker_number)
                time.sleep(60)
                continue

            for chunk in chunks(dids, chunk_size):
                try:
                    logging.info('Undertaker(%s): Received %s dids to delete', worker_number, len(chunk))
                    delete_dids(dids=chunk, account='root')
                    logging.info('Undertaker(%s): Deleted %s dids', worker_number, len(chunk))
                    record_counter(counters='undertaker.delete_dids', delta=len(chunk))
                except RuleNotFound as error:
                    logging.error(error)
        except DatabaseException as error:
            logging.error('Undertaker(%s): Got database error %s.', worker_number, str(error))

        if once:
            break
def query_request(request_id, transfertool='fts3', session=None):
    """
    Query the status of a request.

    :param request_id: Request-ID as a 32 character hex string.
    :param transfertool: Transfertool name as a string.
    :param session: Database session to use.
    :returns: Request status information as a dictionary.
    """
    record_counter('core.request.query_request')

    req = get_request(request_id, session=session)

    req_status = {'request_id': request_id,
                  'new_state': None}

    if not req:
        req_status['new_state'] = RequestState.LOST
        return req_status

    if transfertool == 'fts3':
        try:
            ts = time.time()
            response = fts3.query(req['external_id'], req['external_host'])
            record_timer('core.request.query_request_fts3', (time.time() - ts) * 1000)
            req_status['details'] = response
        except Exception:
            raise

        if not response:
            req_status['new_state'] = RequestState.LOST
        else:
            if 'job_state' not in response:
                req_status['new_state'] = RequestState.LOST
            elif response['job_state'] in (str(FTSState.FAILED),
                                           str(FTSState.FINISHEDDIRTY),
                                           str(FTSState.CANCELED)):
                req_status['new_state'] = RequestState.FAILED
            elif response['job_state'] == str(FTSState.FINISHED):
                req_status['new_state'] = RequestState.DONE
    else:
        raise NotImplementedError

    return req_status
def LIST_DIDS_WILDCARD(self, scope, wildcard):
    jdoe_account = 'jdoe'
    client = DIDClient(account=jdoe_account)
    print('run with: ' + str(wildcard))

    start = time()
    with monitor.record_timer_block('jdoe.list_dids_wildcard'):
        dids = [did for did in client.list_dids(scope=scope, filters=wildcard, type='dataset')]
    duration = time() - start

    cnt = len(dids)
    print('got %d dids' % cnt)

    monitor.record_counter('jdoe.list_dids_wildcard.num_results', cnt)
    if cnt != 0:
        monitor.record_counter('jdoe.list_dids_wildcard.time_per_did', duration / cnt)

    return {'no_datasets': cnt}
def new_bulk_query(transfer_ids, transfer_host):
    """
    Query the status of a bulk of transfers in FTS3 via JSON.

    :param transfer_ids: FTS transfer identifiers as a list.
    :param transfer_host: FTS server as a string.
    :returns: Transfer status information as a dictionary.
    """
    responses = {}

    fts_session = requests.Session()
    if transfer_host.startswith('https://'):
        jobs = fts_session.get('%s/jobs/%s' % (transfer_host, ','.join(transfer_ids)),
                               verify=False,
                               cert=(__USERCERT, __USERCERT),
                               headers={'Content-Type': 'application/json'})
    else:
        jobs = fts_session.get('%s/jobs/%s' % (transfer_host, ','.join(transfer_ids)),
                               headers={'Content-Type': 'application/json'})

    if jobs and (jobs.status_code == 200 or jobs.status_code == 207):
        record_counter('transfertool.fts3.%s.new_bulk.success' % __extract_host(transfer_host))
        jobs_response = jobs.json()
        responses = get_jobs_response(transfer_host, fts_session, jobs_response)
        for transfer_id in transfer_ids:
            if transfer_id not in responses.keys():
                responses[transfer_id] = None
    else:
        record_counter('transfertool.fts3.%s.new_bulk.failure' % __extract_host(transfer_host))
        for transfer_id in transfer_ids:
            responses[transfer_id] = Exception('Could not retrieve transfer information: %s' % jobs)

    return responses
def set_transfer_update_time(external_host, transfer_id, update_time=None, session=None):
    """
    Update the state of a request. Fails silently if the transfer_id does not exist.

    :param external_host: Selected external host as string in format protocol://fqdn:port
    :param transfer_id: External transfer job id as a string.
    :param update_time: Time stamp.
    :param session: Database session to use.
    """
    record_counter('core.request.set_transfer_update_time')

    # Evaluate the default per call; a datetime.datetime.utcnow() default in
    # the signature would be frozen at import time.
    if update_time is None:
        update_time = datetime.datetime.utcnow()

    try:
        rowcount = session.query(models.Request).filter_by(external_id=transfer_id, state=RequestState.SUBMITTED)\
                          .update({'updated_at': update_time}, synchronize_session=False)
    except IntegrityError as error:
        raise RucioException(error.args)

    if not rowcount:
        raise UnsupportedOperation("Transfer %s doesn't exist or its status is not submitted." % (transfer_id))
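# Standalone demonstration of why the update_time default above must be None:
# a datetime.datetime.utcnow() default in the signature is evaluated once, at
# import time, so every later call would silently reuse that stale timestamp.
import datetime
import time

def bad(ts=datetime.datetime.utcnow()):
    return ts

def good(ts=None):
    return ts or datetime.datetime.utcnow()

first = bad()
time.sleep(0.01)
assert bad() == first    # frozen at definition time
assert good() >= first   # re-evaluated on each call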
def __set_transfer_state(external_host, transfer_id, new_state, session=None):
    """
    Update the state of a transfer. Fails silently if the transfer_id does not exist.

    :param external_host: Selected external host as string in format protocol://fqdn:port
    :param transfer_id: External transfer job id as a string.
    :param new_state: New state as string.
    :param session: Database session to use.
    """
    record_counter('core.request.set_transfer_state')

    try:
        rowcount = session.query(models.Request).filter_by(external_id=transfer_id)\
                          .update({'state': new_state,
                                   'updated_at': datetime.datetime.utcnow()},
                                  synchronize_session=False)
    except IntegrityError as error:
        raise RucioException(error.args)

    if not rowcount:
        raise UnsupportedOperation("Transfer %s on %s state %s cannot be updated." % (transfer_id, external_host, new_state))
def on_message(self, frame):
    record_counter('daemons.conveyor.receiver.message_all')

    msg = json.loads(frame.body)

    if not self.__all_vos:
        if 'vo' not in msg or msg['vo'] != get_policy():
            return

    if 'job_metadata' in msg.keys() \
       and isinstance(msg['job_metadata'], dict) \
       and 'issuer' in msg['job_metadata'].keys() \
       and str(msg['job_metadata']['issuer']) == str('rucio'):

        if 'job_state' in msg.keys() and str(msg['job_state']) != str('ACTIVE'):
            record_counter('daemons.conveyor.receiver.message_rucio')
            self._perform_request_update(msg)
def query(self, transfer_ids, details=False, timeout=None):
    """
    Query the status of a transfer in FTS3 via JSON.

    :param transfer_ids: FTS transfer identifiers as list of strings.
    :param details: Switch if detailed information should be listed.
    :param timeout: Timeout in seconds.
    :returns: Transfer status information as a list of dictionaries.
    """
    if len(transfer_ids) > 1:
        raise NotImplementedError('FTS3 transfertool query not bulk ready')

    transfer_id = transfer_ids[0]
    if details:
        return self.__query_details(transfer_id=transfer_id)

    job = requests.get('%s/jobs/%s' % (self.external_host, transfer_id),
                       verify=self.verify,
                       cert=self.cert,
                       headers=self.headers,
                       timeout=timeout)  # TODO: set to 5 in conveyor
    if job and job.status_code == 200:
        record_counter('transfertool.fts3.%s.query.success' % self.__extract_host(self.external_host))
        labels = {'state': 'success',
                  'host': self.__extract_host(self.external_host)}
        QUERY_COUNTER.labels(**labels).inc()
        return [job.json()]

    record_counter('transfertool.fts3.%s.query.failure' % self.__extract_host(self.external_host))
    labels = {'state': 'failure',
              'host': self.__extract_host(self.external_host)}
    QUERY_COUNTER.labels(**labels).inc()
    raise Exception('Could not retrieve transfer information: %s' % job.content)
def bulk_check_xfers(task_ids):
    tc = get_transfer_client()

    logging.debug('task_ids: %s' % task_ids)

    responses = {}
    for task_id in task_ids:
        transfer = tc.get_task(str(task_id))
        logging.debug('transfer: %s' % transfer)
        status = str(transfer["status"])
        if status == 'SUCCEEDED':
            record_counter('daemons.conveyor.transfer_submitter.globus.transfers.bytes_transferred', transfer['bytes_transferred'])
            record_counter('daemons.conveyor.transfer_submitter.globus.transfers.effective_bytes_per_second', transfer['effective_bytes_per_second'])
        responses[str(task_id)] = status

    logging.debug('responses: %s' % responses)

    return responses
def touch_transfer(external_host, transfer_id, session=None):
    """
    Update the timestamp of the requests in a transfer. Fails silently if the transfer_id does not exist.

    :param external_host: Name of the external host.
    :param transfer_id: External transfer job id as a string.
    :param session: Database session to use.
    """
    record_counter('core.request.touch_transfer')

    try:
        # don't touch it if it has already been touched in the last 30 seconds
        session.query(models.Request).with_hint(models.Request, "INDEX(REQUESTS REQUESTS_EXTERNALID_UQ)", 'oracle')\
                                     .filter_by(external_id=transfer_id)\
                                     .filter(models.Request.state == RequestState.SUBMITTED)\
                                     .filter(models.Request.updated_at < datetime.datetime.utcnow() - datetime.timedelta(seconds=30))\
                                     .update({'updated_at': datetime.datetime.utcnow()}, synchronize_session=False)
    except IntegrityError as error:
        raise RucioException(error.args)
def submit(tinfo, session):
    """
    Create a new transfer job in state QUEUED.

    :param tinfo: The transfer job information as a string.
    :returns: The transfer job id.
    """
    record_counter('daemons.mock.fts3.submit')

    ts = time.time()
    tid = generate_uuid()
    record_timer('daemons.mock.fts3.submit.000-generate_uuid', (time.time() - ts) * 1000)

    ts = time.time()
    new_transfer = test_models.MockFTSTransfer(transfer_id=tid, transfer_metadata=str(tinfo))
    new_transfer.save(session=session)
    record_timer('daemons.mock.fts3.submit.001-new_transfer', (time.time() - ts) * 1000)

    return {'job_id': tid}
def trace(payload):
    """
    Write a trace to the log file and send it to ActiveMQ.

    :param payload: Python dictionary with trace report.
    """
    record_counter('trace.trace')
    report = json.dumps(payload, default=date_handler)
    logger.info(report)

    try:
        conn = random.sample(conns, 1)[0]
        if not conn.is_connected():
            logging.info('reconnect to ' + conn.transport._Transport__host_and_ports[0][0])
            conn.start()
            conn.connect(username, password)
        conn.send(body=report,
                  destination=topic,
                  headers={'persistent': 'true',
                           'appversion': 'rucio'})
    except Exception as error:
        errlog.error(error)
def whoami(self):
    """
    Returns credential information from the FTS3 server.

    :returns: Credentials as stored by the FTS3 server as a dictionary.
    """
    get_result = requests.get('%s/whoami' % self.external_host,
                              verify=self.verify,
                              cert=self.cert,
                              headers={'Content-Type': 'application/json'})

    if get_result and get_result.status_code == 200:
        record_counter('transfertool.fts3.%s.whoami.success' % self.__extract_host(self.external_host))
        return get_result.json()

    record_counter('transfertool.fts3.%s.whoami.failure' % self.__extract_host(self.external_host))
    raise Exception('Could not retrieve credentials: %s' % get_result.content)
def version(self):
    """
    Returns FTS3 server information.

    :returns: FTS3 server information as a dictionary.
    """
    get_result = requests.get('%s/' % self.external_host,
                              verify=self.verify,
                              cert=self.cert,
                              headers={'Content-Type': 'application/json'})

    if get_result and get_result.status_code == 200:
        record_counter('transfertool.fts3.%s.version.success' % self.__extract_host(self.external_host))
        return get_result.json()

    record_counter('transfertool.fts3.%s.version.failure' % self.__extract_host(self.external_host))
    raise Exception('Could not retrieve version: %s' % get_result.content)
def trace(payload):
    """
    Write a trace to the log file and send it to ActiveMQ.

    :param payload: Python dictionary with trace report.
    """
    record_counter('trace.nongrid_trace')
    report = json.dumps(payload, default=date_handler)
    LOGGER.debug(report)

    try:
        conn = random.sample(CONNS, 1)[0]
        if not conn.is_connected():
            logging.info('reconnect to ' + conn.transport._Transport__host_and_ports[0][0])
            conn.start()
            conn.connect(USERNAME, PASSWORD)
        conn.send(body=report,
                  destination=TOPIC,
                  headers={'persistent': 'true',
                           'appversion': 'rucio'})
    except Exception as exception:
        ERRLOG.error(exception)
def __query_details(self, transfer_id):
    """
    Query the detailed status of a transfer in FTS3 via JSON.

    :param transfer_id: FTS transfer identifier as a string.
    :returns: Detailed transfer status information as a dictionary.
    """
    files = requests.get('%s/jobs/%s/files' % (self.external_host, transfer_id),
                         verify=self.verify,
                         cert=self.cert,
                         headers={'Content-Type': 'application/json'},
                         timeout=5)
    if files and (files.status_code == 200 or files.status_code == 207):
        record_counter('transfertool.fts3.%s.query_details.success' % self.__extract_host(self.external_host))
        return files.json()

    record_counter('transfertool.fts3.%s.query_details.failure' % self.__extract_host(self.external_host))
    return
def run_once(bulk, group_bulk, rse_ids, scheme, failover_scheme, transfertool_kwargs, heartbeat_handler, activity):
    worker_number, total_workers, logger = heartbeat_handler.live()

    start_time = time.time()
    transfers = next_transfers_to_submit(
        total_workers=total_workers,
        worker_number=worker_number,
        failover_schemes=failover_scheme,
        limit=bulk,
        activity=activity,
        rses=rse_ids,
        schemes=scheme,
        transfertools_by_name={'fts3': FTS3Transfertool},
        older_than=None,
        request_type=RequestType.STAGEIN,
        logger=logger,
    )
    total_transfers = len(list(hop for paths in transfers.values() for path in paths for hop in path))

    record_timer('daemons.conveyor.stager.get_stagein_transfers.per_transfer', (time.time() - start_time) * 1000 / (total_transfers if transfers else 1))
    record_counter('daemons.conveyor.stager.get_stagein_transfers', total_transfers)
    record_timer('daemons.conveyor.stager.get_stagein_transfers.transfers', total_transfers)
    logger(logging.INFO, 'Got %s stagein transfers for %s' % (total_transfers, activity))

    for builder, transfer_paths in transfers.items():
        transfertool_obj = builder.make_transfertool(logger=logger, **transfertool_kwargs.get(builder.transfertool_class, {}))
        logger(logging.INFO, 'Starting to group transfers for %s (%s)' % (activity, transfertool_obj))
        start_time = time.time()
        grouped_jobs = transfertool_obj.group_into_submit_jobs(transfer_paths)
        record_timer('daemons.conveyor.stager.bulk_group_transfer', (time.time() - start_time) * 1000 / (len(transfer_paths) or 1))

        logger(logging.INFO, 'Starting to submit transfers for %s (%s)' % (activity, transfertool_obj))
        for job in grouped_jobs:
            worker_number, total_workers, logger = heartbeat_handler.live()
            submit_transfer(transfertool_obj=transfertool_obj,
                            transfers=job['transfers'],
                            job_params=job['job_params'],
                            submitter='transfer_submitter',
                            logger=logger)

    queue_empty = False
    if total_transfers < group_bulk:
        queue_empty = True
        logger(logging.INFO, 'Only %s transfers for %s which is less than group bulk %s' % (total_transfers, activity, group_bulk))

    return queue_empty
def register_container(self):
    self.container_exists = False

    if self.is_at_pnn and self.dry_run:
        logging.info('Dry Run: Create container %s in scope %s.', self.container, self.scope)
        self.container_exists = True
        return self.container_exists

    try:
        get_did(scope=self.scope, name=self.container)
        monitor.record_counter('cms_sync.container_exists')
        self.container_exists = True
        logging.info('Found container %s', self.container)
    except DataIdentifierNotFound:
        if self.is_at_pnn:
            try:
                logging.info('Create container %s in scope %s.', self.container, self.scope)
                add_did(scope=self.scope, name=self.container, type='CONTAINER',
                        issuer=self.account, lifetime=self.lifetime)
                monitor.record_counter('cms_sync.container_created')
                self.container_exists = True
                logging.info('Created container %s in scope %s.', self.container, self.scope)
            except DataIdentifierAlreadyExists:
                logging.warning('Container was created in the meanwhile')
                monitor.record_counter('cms_sync.container_collision')
                self.container_exists = True
        else:
            logging.warning('Container was not at PNN')

    return self.container_exists
def register_block(self):
    """
    Register the dataset (if there is a replica at the pnn) and attach to the container.

    :dry: Dry run. Default false.
    """
    if self.is_at_pnn and self.dry_run:
        logging.info('Dry Run: Create dataset %s in scope %s.', self.block_name, self.scope)
        self.block_exists = True
        return self.block_exists

    # FIXME: The logic here could use some improvement as we try to create a block even if it exists already
    existed, created, attached, already_attached = self.register_and_attach_did(
        scope=self.scope, name=self.block_name, did_type='DATASET', parent_did=self.container)

    self.block_exists = existed | created
    if existed:
        monitor.record_counter('cms_sync.dataset_exists')
    if created:
        monitor.record_counter('cms_sync.dataset_created')
    if not existed and not created:
        monitor.record_counter('cms_sync.dataset_create_failed')
    return self.block_exists
def update_priority(self, transfer_id, priority, timeout=None):
    """
    Update the priority of a transfer that has been submitted to FTS via JSON.

    :param transfer_id: FTS transfer identifier as a string.
    :param priority: FTS job priority as an integer from 1 to 5.
    :param timeout: Timeout in seconds.
    :returns: The FTS server response on success.
    """
    params_dict = {"params": {"priority": priority}}
    params_str = json.dumps(params_dict, cls=APIEncoder)

    job = requests.post('%s/jobs/%s' % (self.external_host, transfer_id),
                        verify=self.verify,
                        data=params_str,
                        cert=self.cert,
                        headers=self.headers,
                        timeout=timeout)  # TODO: set to 3 in conveyor
    if job and job.status_code == 200:
        record_counter('transfertool.fts3.%s.update_priority.success' % self.__extract_host(self.external_host))
        labels = {'state': 'success',
                  'host': self.__extract_host(self.external_host)}
        UPDATE_PRIORITY_COUNTER.labels(**labels).inc()
        return job.json()

    record_counter('transfertool.fts3.%s.update_priority.failure' % self.__extract_host(self.external_host))
    labels = {'state': 'failure',
              'host': self.__extract_host(self.external_host)}
    UPDATE_PRIORITY_COUNTER.labels(**labels).inc()
    raise Exception('Could not update priority of transfer: %s' % job.content)
def cancel(self, transfer_ids, timeout=None):
    """
    Cancel transfers that have been submitted to FTS3.

    :param transfer_ids: FTS transfer identifiers as list of strings.
    :param timeout: Timeout in seconds.
    :returns: The FTS server response on success.
    """
    if len(transfer_ids) > 1:
        raise NotImplementedError('Bulk cancelling not implemented')
    transfer_id = transfer_ids[0]

    job = requests.delete('%s/jobs/%s' % (self.external_host, transfer_id),
                          verify=self.verify,
                          cert=self.cert,
                          headers=self.headers,
                          timeout=timeout)
    if job and job.status_code == 200:
        record_counter('transfertool.fts3.%s.cancel.success' % self.__extract_host(self.external_host))
        labels = {'state': 'success',
                  'host': self.__extract_host(self.external_host)}
        CANCEL_COUNTER.labels(**labels).inc()
        return job.json()

    record_counter('transfertool.fts3.%s.cancel.failure' % self.__extract_host(self.external_host))
    labels = {'state': 'failure',
              'host': self.__extract_host(self.external_host)}
    CANCEL_COUNTER.labels(**labels).inc()
    raise Exception('Could not cancel transfer: %s' % job.content)
def register_block(self):
    """
    Register the dataset (if there is a replica at the pnn) and attach to the container.

    :dry: Dry run. Default false.
    """
    # FIXME: The logic here could use some improvement as we try to create a block even if it exists already
    try:
        get_did(scope=self.scope, name=self.block_name)
        self.block_exists = True
        monitor.record_counter('cms_sync.dataset_exists')
    except DataIdentifierNotFound:
        self.block_exists = False
        if self.is_at_pnn and self.dry_run:
            logging.info('Dry Run: Create dataset %s in scope %s.', self.block_name, self.scope)
            self.block_exists = True
        elif self.is_at_pnn:
            logging.info('Create block %s in scope %s.', self.block_name, self.scope)
            try:
                if not self.block_exists:
                    add_did(scope=self.scope, name=self.block_name, type='DATASET',
                            issuer=self.account, lifetime=self.lifetime)
                    monitor.record_counter('cms_sync.dataset_created')
            except DataIdentifierAlreadyExists:
                logging.warning('Attempt to add %s:%s failed, already exists.', self.scope, self.block_name)
                monitor.record_counter('cms_sync.dataset_collision')

            try:
                attach_dids(scope=self.scope,
                            name=self.container,
                            attachment={'dids': [{'scope': self.scope,
                                                  'name': self.block_name}]},
                            issuer=self.account)
            except DuplicateContent:
                logging.warning('Attempt to add %s:%s to %s failed, already exists.',
                                self.scope, self.block_name, self.container)
            except DataIdentifierNotFound:
                logging.error('Attempt to add %s:%s to %s failed. Container does not exist.',
                              self.scope, self.block_name, self.container)
                return False
            self.block_exists = True
        else:
            logging.warning('Block %s was not at PNN', self.block_name)

    return self.block_exists
def submit_bulk_transfers(external_host, files, transfertool='fts3', job_params={}, timeout=None, user_transfer_job=False):
    """
    Submit transfer request to a transfertool.

    :param external_host: External host name as string.
    :param files: List of dictionaries, one per request file.
    :param transfertool: Transfertool as a string.
    :param job_params: Metadata key/value pairs for all files as a dictionary.
    :returns: Transfertool external ID.
    """
    record_counter('core.request.submit_transfer')

    transfer_id = None

    if transfertool == 'fts3':
        start_time = time.time()
        job_files = []
        for file in files:
            job_file = {}
            for key in file:
                if key == 'sources':
                    # convert sources from (src_rse, url, src_rse_id, rank) to url
                    job_file[key] = []
                    for source in file[key]:
                        job_file[key].append(source[1])
                else:
                    job_file[key] = file[key]
            job_files.append(job_file)

        if not user_transfer_job:
            transfer_id = FTS3Transfertool(external_host=external_host).submit(files=job_files, job_params=job_params, timeout=timeout)
        elif USER_TRANSFERS == "cms":
            transfer_id = FTS3MyProxyTransfertool(external_host=external_host).submit(files=job_files, job_params=job_params, timeout=timeout)
        else:
            # if there is no valid USER TRANSFER case, fall back to the standard submission
            transfer_id = FTS3Transfertool(external_host=external_host).submit(files=job_files, job_params=job_params, timeout=timeout)

        record_timer('core.request.submit_transfers_fts3', (time.time() - start_time) * 1000 / len(files))

    return transfer_id
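# Standalone sketch of the source-tuple flattening in submit_bulk_transfers
# above (values made up): each source arrives as (src_rse, url, src_rse_id,
# rank) and only the URL is forwarded to the transfertool.
file_example = {'sources': [('RSE_A', 'gsiftp://a/f1', 'id-a', 1),
                            ('RSE_B', 'gsiftp://b/f1', 'id-b', 2)],
                'destinations': ['gsiftp://c/f1']}
job_file_example = {key: ([src[1] for src in value] if key == 'sources' else value)
                    for key, value in file_example.items()}
assert job_file_example['sources'] == ['gsiftp://a/f1', 'gsiftp://b/f1']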
def on_message(self, headers, message):
    record_counter('daemons.tracer.kronos.reports')

    appversion = 'dq2'
    id = headers['message-id']
    if 'appversion' in headers:
        appversion = headers['appversion']

    try:
        if appversion == 'dq2':
            self.__conn.ack(id, self.__subscription_id)
            return
        else:
            report = jloads(message)
    except:
        # message is corrupt, not much to do here
        # send count to graphite, send ack to broker and return
        record_counter('daemons.tracer.kronos.json_error')
        logging.error('(kronos_file) json error')
        self.__conn.ack(id, self.__subscription_id)
        return

    self.__ids.append(id)
    self.__reports.append(report)

    try:
        logging.debug('(kronos_file) message received: %s %s %s' % (str(report['eventType']), report['filename'], report['remoteSite']))
    except:
        pass

    if len(self.__ids) >= self.__chunksize:
        self.__update_atime()
        for id in self.__ids:
            self.__conn.ack(id, self.__subscription_id)
        self.__reports = []
        self.__ids = []