def get_engine(echo=True):
    """ Creates an engine to a specific database.

    :returns: engine
    """
    global _ENGINE
    if not _ENGINE:
        sql_connection = config_get('database', 'default')
        config_params = [('pool_size', int), ('max_overflow', int), ('pool_timeout', int),
                         ('pool_recycle', int), ('echo', int), ('echo_pool', str),
                         ('pool_reset_on_return', str), ('use_threadlocal', int)]
        params = {}
        for param, param_type in config_params:
            try:
                params[param] = param_type(config_get('database', param))
            except NoOptionError:
                pass
        _ENGINE = create_engine(sql_connection, **params)
        if 'mysql' in sql_connection:
            event.listen(_ENGINE, 'checkout', mysql_ping_listener)
        elif 'sqlite' in sql_connection:
            event.listen(_ENGINE, 'connect', _fk_pragma_on_connect)
        elif 'oracle' in sql_connection:
            event.listen(_ENGINE, 'connect', my_on_connect)
        # Override engine.connect method with db error wrapper
        # to have auto_reconnect (will come in next sqlalchemy releases)
        _ENGINE.connect = wrap_db_error(_ENGINE.connect)
        # _ENGINE.connect()
    assert _ENGINE
    return _ENGINE
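# Illustrative sketch (not from the source): the [database] options that get_engine()
# above looks up via config_get. Only 'default' is required; any tuning option missing
# from the section raises NoOptionError and is simply skipped. The values are made up.
#
#   [database]
#   default = mysql://rucio:secret@dbhost/rucio    # hypothetical connection string
#   pool_size = 20
#   max_overflow = 10
#   pool_timeout = 30
#   pool_recycle = 3600
#   echo = 0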
def __init__(self, rucio_host=None, auth_host=None, account=None, ca_cert=None,
             auth_type=None, creds=None, timeout=None, user_agent='rucio-clients'):
    """
    Constructor of the BaseClient.

    :param rucio_host: the address of the rucio server, if None it is read from the config file.
    :param rucio_port: the port of the rucio server, if None it is read from the config file.
    :param auth_host: the address of the rucio authentication server, if None it is read from the config file.
    :param auth_port: the port of the rucio authentication server, if None it is read from the config file.
    :param account: the account to authenticate to rucio.
    :param use_ssl: enable or disable ssl for communication. Default is enabled.
    :param ca_cert: the path to the rucio server certificate.
    :param auth_type: the type of authentication (e.g.: 'userpass', 'kerberos' ...)
    :param creds: a dictionary with credentials needed for authentication.
    :param user_agent: indicates the client.
    """
    self.host = rucio_host
    self.list_hosts = []
    self.auth_host = auth_host
    self.session = session()
    self.user_agent = user_agent

    try:
        if self.host is None:
            self.host = config_get('client', 'rucio_host')
        if self.auth_host is None:
            self.auth_host = config_get('client', 'auth_host')
    except (NoOptionError, NoSectionError), e:
        raise MissingClientParameter('Section client and Option \'%s\' cannot be found in config file' % e.args[0])
def consumer(id, total_threads=1):
    """
    Main loop to consume messages from the FTS3 producer.
    """
    logging.info('consumer starting')

    brokers_alias = []
    brokers_resolved = []
    try:
        brokers_alias = [b.strip() for b in config_get('messaging-fts3', 'brokers').split(',')]
    except:
        raise Exception('Could not load brokers from configuration')

    logging.info('resolving broker dns alias: %s' % brokers_alias)

    brokers_resolved = []
    for broker in brokers_alias:
        brokers_resolved.append([str(tmp_broker) for tmp_broker in dns.resolver.query(broker, 'A')])
    brokers_resolved = [item for sublist in brokers_resolved for item in sublist]

    logging.debug('brokers resolved to %s', brokers_resolved)

    conns = []
    for broker in brokers_resolved:
        conns.append(stomp.Connection(host_and_ports=[(broker, config_get_int('messaging-fts3', 'port'))],
                                      use_ssl=True,
                                      ssl_key_file=config_get('messaging-fts3', 'ssl_key_file'),
                                      ssl_cert_file=config_get('messaging-fts3', 'ssl_cert_file'),
                                      ssl_version=ssl.PROTOCOL_TLSv1))

    logging.info('consumer started')

    while not graceful_stop.is_set():
        for conn in conns:
            if not conn.is_connected():
                logging.info('connecting to %s' % conn.transport._Transport__host_and_ports[0][0])
                record_counter('daemons.messaging.fts3.reconnect.%s' % conn.transport._Transport__host_and_ports[0][0].split('.')[0])
                conn.set_listener('rucio-messaging-fts3', Consumer(broker=conn.transport._Transport__host_and_ports[0],
                                                                   id=id, total_threads=total_threads))
                conn.start()
                conn.connect()
                conn.subscribe(destination=config_get('messaging-fts3', 'destination'),
                               id='rucio-messaging-fts3', ack='auto')
        time.sleep(1)

    logging.info('graceful stop requested')

    for conn in conns:
        try:
            conn.disconnect()
        except:
            pass

    logging.info('graceful stop done')
def setup(self):
    self.cacert = config_get('test', 'cacert')
    self.usercert = config_get('test', 'usercert')

    try:
        remove('/tmp/' + getuser() + '/.rucio_root/auth_token_root')
    except OSError, e:
        if e.args[0] != 2:
            raise e
def setup(self):
    self.cacert = config_get('test', 'cacert')
    self.host = config_get('client', 'rucio_host')
    self.auth_host = config_get('client', 'auth_host')
    self.marker = '$> '

    # get auth token
    self.base_client = BaseClient()
    self.token = self.base_client.headers['X-Rucio-Auth-Token']
    self.replica_client = ReplicaClient()
def run(once=False, process=0, total_processes=1, total_threads=1):
    """
    Starts up the consumer threads
    """
    logging.info('resolving brokers')

    brokers_alias = []
    brokers_resolved = []
    try:
        brokers_alias = [b.strip() for b in config_get('tracer-kronos', 'brokers').split(',')]
    except:
        raise Exception('Could not load brokers from configuration')

    logging.info('resolving broker dns alias: %s' % brokers_alias)

    brokers_resolved = []
    for broker in brokers_alias:
        brokers_resolved.append([str(tmp_broker) for tmp_broker in resolver.query(broker, 'A')])
    brokers_resolved = [item for sublist in brokers_resolved for item in sublist]

    logging.debug('brokers resolved to %s', brokers_resolved)

    dataset_queue = Queue()

    logging.info('starting tracer consumer threads')
    threads = []
    for i in xrange(0, total_threads):
        threads.append(Thread(target=kronos_file, kwargs={'process': process,
                                                          'total_processes': total_processes,
                                                          'thread': i,
                                                          'total_threads': total_threads,
                                                          'brokers_resolved': brokers_resolved,
                                                          'dataset_queue': dataset_queue}))
        threads.append(Thread(target=kronos_dataset, kwargs={'process': process,
                                                             'total_processes': total_processes,
                                                             'thread': i,
                                                             'total_threads': total_threads,
                                                             'dataset_queue': dataset_queue}))

    [t.start() for t in threads]

    logging.info('waiting for interrupts')

    while len(threads) > 0:
        [t.join(timeout=3) for t in threads if t and t.isAlive()]
def get_special_accounts():
    accounts = []
    try:
        accounts = config_get('accounts', 'special_accounts')
        accounts = [a.strip() for a in accounts.split(',')]
    except:
        pass
    return accounts
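# Hedged usage sketch: get_special_accounts() above expects a comma-separated list under
# an [accounts] section of rucio.cfg; the account names below are hypothetical.
#
#   [accounts]
#   special_accounts = panda, tier0, ddmadmin
#
# With that section in place the call returns ['panda', 'tier0', 'ddmadmin']; if the
# section or option is missing, the bare except swallows the error and [] is returned.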
def setup(self):
    self.did_client = DIDClient()
    self.replica_client = ReplicaClient()
    self.base_client = BaseClient(account='root',
                                  ca_cert=config_get('client', 'ca_cert'),
                                  auth_type='x509')
    self.token = self.base_client.headers['X-Rucio-Auth-Token']

    self.fname = generate_uuid()

    rses = ['LXPLUS', 'MOCK4']
    dsn = generate_uuid()
    self.files = [{'scope': 'mock', 'name': self.fname, 'bytes': 1L, 'adler32': '0cc737eb'}]
def run(once=False, process=0, total_processes=1, total_threads=1, bulk=1000):
    """
    Starts up the hermes threads.
    """
    if once:
        logging.info('executing one hermes iteration only')
        deliver_messages(once=once, bulk=bulk)
    else:
        logging.info('resolving brokers')

        brokers_alias = []
        brokers_resolved = []
        try:
            brokers_alias = [b.strip() for b in config_get('messaging-hermes', 'brokers').split(',')]
        except:
            raise Exception('Could not load brokers from configuration')

        logging.info('resolving broker dns alias: %s' % brokers_alias)

        brokers_resolved = []
        for broker in brokers_alias:
            brokers_resolved.append([str(tmp_broker) for tmp_broker in dns.resolver.query(broker, 'A')])
        brokers_resolved = [item for sublist in brokers_resolved for item in sublist]

        logging.debug('brokers resolved to %s', brokers_resolved)

        logging.info('starting hermes threads')
        threads = [threading.Thread(target=deliver_messages, kwargs={'brokers_resolved': brokers_resolved,
                                                                     'process': process,
                                                                     'total_processes': total_processes,
                                                                     'thread': i,
                                                                     'total_threads': total_threads,
                                                                     'bulk': bulk}) for i in xrange(0, total_threads)]

        [t.start() for t in threads]

        logging.info('waiting for interrupts')

        # Interruptible joins require a timeout.
        while len(threads) > 0:
            [t.join(timeout=3.14) for t in threads if t and t.isAlive()]
def run(total_workers=1, once=False, inputfile=None):
    """
    Starts up the automatix threads.
    """
    try:
        sites = [s.strip() for s in config_get('automatix', 'sites').split(',')]
    except:
        raise Exception('Could not load sites from configuration')
    if not inputfile:
        inputfile = '/opt/rucio/etc/automatix.json'
    try:
        sleep_time = config_get_int('automatix', 'sleep_time')
    except:
        sleep_time = 3600
    try:
        # The account option is a string (defaults to 'root'), so it is read with
        # config_get rather than config_get_int.
        account = config_get('automatix', 'account')
    except:
        account = 'root'
    try:
        dataset_lifetime = config_get_int('automatix', 'dataset_lifetime')
    except:
        dataset_lifetime = None
    threads = list()
    for worker_number in xrange(0, total_workers):
        kwargs = {'worker_number': worker_number + 1,
                  'total_workers': total_workers,
                  'once': once,
                  'sites': sites,
                  'sleep_time': sleep_time,
                  'account': account,
                  'inputfile': inputfile,
                  'dataset_lifetime': dataset_lifetime}
        threads.append(threading.Thread(target=automatix, kwargs=kwargs))
    [t.start() for t in threads]
    while threads[0].is_alive():
        logging.debug('Still %i active threads' % len(threads))
        [t.join(timeout=3.14) for t in threads]
def get_dump_engine(echo=False):
    """ Creates a dump engine to a specific database.

    :returns: engine
    """

    statements = list()

    def dump(sql, *multiparams, **params):
        statement = str(sql.compile(dialect=engine.dialect))
        if statement in statements:
            return
        statements.append(statement)
        if statement.endswith(')\n\n'):
            if engine.dialect.name == 'oracle':
                print statement.replace(')\n\n', ') PCTFREE 0;\n')
            else:
                print statement.replace(')\n\n', ');\n')
        elif statement.endswith(')'):
            print statement.replace(')', ');\n')
        else:
            print statement

    sql_connection = config_get('database', 'default')

    engine = create_engine(sql_connection, echo=echo, strategy='mock', executor=dump)
    return engine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# You may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Authors:
# - Luis Rodrigues, <*****@*****.**>, 2013

from pystatsd import Client

from rucio.common.config import config_get

import time

server = config_get('monitor', 'carbon_server')
port = config_get('monitor', 'carbon_port')
scope = config_get('monitor', 'user_scope')

pystatsd_client = Client(host=server, port=port, prefix=scope)


def record_counter(counters, delta=1):
    """
    Log one or more counters by arbitrary amounts

    :param counters: The counter or a list of counters to be updated.
    :param delta: The increment for the counter, by default increment by 1.
    """
    pystatsd_client.update_stats(counters, delta)
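# Minimal usage sketch of record_counter() defined above (assumes the [monitor] section
# is configured and a statsd/carbon endpoint is reachable); counter names are illustrative.
if __name__ == '__main__':
    record_counter('daemons.example.files.processed')             # increment by 1
    record_counter('daemons.example.files.processed', delta=10)   # increment by 10
    record_counter(['daemons.example.ok', 'daemons.example.all'])  # several counters at once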
if 'RUCIO_AUTH_TYPE' in environ:
    if environ['RUCIO_AUTH_TYPE'] not in ('userpass', 'x509', 'x509_proxy', 'gss'):
        raise MissingClientParameter('Possible RUCIO_AUTH_TYPE values: userpass, x509, x509_proxy, gss vs. ' + environ['RUCIO_AUTH_TYPE'])
    self.auth_type = environ['RUCIO_AUTH_TYPE']
else:
    try:
        self.auth_type = config_get('client', 'auth_type')
    except (NoOptionError, NoSectionError), e:
        raise MissingClientParameter('Option \'%s\' cannot be found in config file' % e.args[0])

if creds is None:
    LOG.debug('no creds passed. Trying to get it from the config file.')
    self.creds = {}
    try:
        if self.auth_type == 'userpass':
            self.creds['username'] = config_get('client', 'username')
            self.creds['password'] = config_get('client', 'password')
        elif self.auth_type == 'x509':
            self.creds['client_cert'] = path.abspath(path.expanduser(path.expandvars(config_get('client', 'client_cert'))))
            self.creds['client_key'] = path.abspath(path.expanduser(path.expandvars(config_get('client', 'client_key'))))
        elif self.auth_type == 'x509_proxy':
            self.creds['client_proxy'] = path.abspath(path.expanduser(path.expandvars(config_get('client', 'client_x509_proxy'))))
    except (NoOptionError, NoSectionError), e:
        if e.args[0] != 'client_key':
            raise MissingClientParameter('Option \'%s\' cannot be found in config file' % e.args[0])

rucio_scheme = urlparse(self.host).scheme
auth_scheme = urlparse(self.auth_host).scheme

if (rucio_scheme != 'http' and rucio_scheme != 'https'):
    raise ClientProtocolNotSupported('\'%s\' not supported' % rucio_scheme)
'''

import logging
import sys
import threading
import time
import traceback

from rucio.common.config import config_get
from rucio.common.exception import DatabaseException
from rucio.common.utils import chunks
from rucio.core.monitor import record_counter
from rucio.core.did import list_expired_dids, delete_dids

logging.getLogger("requests").setLevel(getattr(logging, config_get('common', 'loglevel').upper()))

logging.basicConfig(stream=sys.stdout,
                    level=getattr(logging, config_get('common', 'loglevel').upper()),
                    format='%(asctime)s\t%(process)d\t%(levelname)s\t%(message)s')

graceful_stop = threading.Event()


def undertaker(worker_number=1, total_workers=1, chunk_size=5, once=False):
    """
    Main loop to select and delete dids.
    """
    logging.info('Undertaker(%s): starting' % worker_number)
    logging.info('Undertaker(%s): started' % worker_number)
    while not graceful_stop.is_set():
def POST(self): """ List all replicas for data identifiers. HTTP Success: 200 OK HTTP Error: 401 Unauthorized 406 Not Acceptable 500 InternalError :returns: A dictionary containing all replicas information, either as JSON stream or metalink4. """ metalink = False if ctx.env.get('HTTP_ACCEPT') is not None: tmp = ctx.env.get('HTTP_ACCEPT').split(',') if 'application/metalink4+xml' in tmp: metalink = True client_ip = ctx.env.get('HTTP_X_FORWARDED_FOR') if client_ip is None: client_ip = ctx.ip dids, schemes, select, unavailable, limit = [], None, None, False, None ignore_availability, rse_expression, all_states, domain = False, None, False, None signature_lifetime, resolve_archives, resolve_parents = None, True, False client_location = {} json_data = data() try: params = parse_response(json_data) if 'dids' in params: dids = params['dids'] if 'schemes' in params: schemes = params['schemes'] if 'unavailable' in params: unavailable = params['unavailable'] ignore_availability = True if 'all_states' in params: all_states = params['all_states'] if 'rse_expression' in params: rse_expression = params['rse_expression'] if 'client_location' in params: client_location = params['client_location'] client_location['ip'] = params['client_location'].get( 'ip', client_ip) if 'sort' in params: select = params['sort'] if 'domain' in params: domain = params['domain'] if 'resolve_archives' in params: resolve_archives = params['resolve_archives'] if 'resolve_parents' in params: resolve_parents = params['resolve_parents'] if 'signature_lifetime' in params: signature_lifetime = params['signature_lifetime'] else: # hardcoded default of 10 minutes if config is not parseable signature_lifetime = config_get('credentials', 'signature_lifetime', raise_exception=False, default=600) except ValueError: raise generate_http_error(400, 'ValueError', 'Cannot decode json parameter list') if ctx.query: params = parse_qs(ctx.query[1:]) if 'select' in params: select = params['select'][0] if 'limit' in params: limit = params['limit'][0] if 'sort' in params: select = params['sort'] # Resolve all reasonable protocols when doing metalink for maximum access possibilities if metalink and schemes is None: schemes = SUPPORTED_PROTOCOLS try: # we need to call list_replicas before starting to reply # otherwise the exceptions won't be propagated correctly __first = True # then, stream the replica information for rfile in list_replicas(dids=dids, schemes=schemes, unavailable=unavailable, request_id=ctx.env.get('request_id'), ignore_availability=ignore_availability, all_states=all_states, rse_expression=rse_expression, client_location=client_location, domain=domain, signature_lifetime=signature_lifetime, resolve_archives=resolve_archives, resolve_parents=resolve_parents, issuer=ctx.env.get('issuer')): # in first round, set the appropriate content type, and stream the header if __first: if not metalink: header('Content-Type', 'application/x-json-stream') else: header('Content-Type', 'application/metalink4+xml') yield '<?xml version="1.0" encoding="UTF-8"?>\n<metalink xmlns="urn:ietf:params:xml:ns:metalink">\n' __first = False if not metalink: yield dumps(rfile, cls=APIEncoder) + '\n' else: replicas = [] dictreplica = {} for replica in rfile['pfns'].keys(): replicas.append(replica) dictreplica[replica] = ( rfile['pfns'][replica]['domain'], rfile['pfns'][replica]['priority'], rfile['pfns'][replica]['rse'], rfile['pfns'][replica]['client_extract']) yield ' <file name="' + rfile['name'] + '">\n' if 'parents' in rfile and rfile['parents']: yield ' 
<parents>\n' for parent in rfile['parents']: yield ' <did>' + parent + '</did>\n' yield ' </parents>\n' yield ' <identity>' + rfile['scope'] + ':' + rfile[ 'name'] + '</identity>\n' if rfile['adler32'] is not None: yield ' <hash type="adler32">' + rfile[ 'adler32'] + '</hash>\n' if rfile['md5'] is not None: yield ' <hash type="md5">' + rfile['md5'] + '</hash>\n' yield ' <size>' + str(rfile['bytes']) + '</size>\n' yield ' <glfn name="/%s/rucio/%s:%s"></glfn>\n' % ( config_get( 'policy', 'schema', raise_exception=False, default='generic'), rfile['scope'], rfile['name']) # TODO: deprecate this if select == 'geoip': replicas = sort_geoip(dictreplica, client_location['ip']) elif select == 'closeness': replicas = sort_closeness(dictreplica, client_location) elif select == 'dynamic': replicas = sort_dynamic(dictreplica, client_location) elif select == 'ranking': replicas = sort_ranking(dictreplica, client_location) elif select == 'random': replicas = sort_random(dictreplica) else: replicas = sorted(dictreplica, key=dictreplica.get) idx = 0 for replica in replicas: yield ' <url location="' + str(dictreplica[replica][2]) \ + '" domain="' + str(dictreplica[replica][0]) \ + '" priority="' + str(dictreplica[replica][1]) \ + '" client_extract="' + str(dictreplica[replica][3]).lower() \ + '">' + escape(replica) + '</url>\n' idx += 1 if limit and limit == idx: break yield ' </file>\n' # ensure complete metalink if __first and metalink: yield '<?xml version="1.0" encoding="UTF-8"?>\n<metalink xmlns="urn:ietf:params:xml:ns:metalink">\n' if metalink: yield '</metalink>\n' except DataIdentifierNotFound as error: raise generate_http_error(404, 'DataIdentifierNotFound', error.args[0]) except RucioException as error: raise generate_http_error(500, error.__class__.__name__, error.args[0]) except Exception as error: print(format_exc()) raise InternalError(error)
def submitter(once=False, rses=None, partition_wait_time=10, bulk=100, group_bulk=1, group_policy='rule', source_strategy=None, activities=None, sleep_time=600, max_sources=4, retry_other_fts=False, filter_transfertool=FILTER_TRANSFERTOOL, transfertool=TRANSFER_TOOL, transfertype=TRANSFER_TYPE): """ Main loop to submit a new transfer primitive to a transfertool. """ try: scheme = config_get('conveyor', 'scheme') except NoOptionError: scheme = None try: failover_scheme = config_get('conveyor', 'failover_scheme') except NoOptionError: failover_scheme = None try: timeout = config_get('conveyor', 'submit_timeout') timeout = float(timeout) except NoOptionError: timeout = None try: bring_online = config_get('conveyor', 'bring_online') except NoOptionError: bring_online = 43200 try: max_time_in_queue = {} timelife_conf = config_get('conveyor', 'max_time_in_queue') timelife_confs = timelife_conf.split(",") for conf in timelife_confs: act, timelife = conf.split(":") max_time_in_queue[act.strip()] = int(timelife.strip()) except NoOptionError: max_time_in_queue = {} if 'default' not in max_time_in_queue: max_time_in_queue['default'] = 168 logging.debug("Maximum time in queue for different activities: %s", max_time_in_queue) activity_next_exe_time = defaultdict(time.time) executable = "conveyor-submitter" if activities: activities.sort() executable += '--activities ' + str(activities) if filter_transfertool: executable += ' --filter-transfertool ' + filter_transfertool hostname = socket.getfqdn() pid = os.getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prefix = 'conveyor-submitter[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') logger(logging.INFO, 'Submitter starting with timeout %s', timeout) if partition_wait_time: time.sleep(partition_wait_time) # To prevent running on the same partition if all the poller restart at the same time heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prefix = 'conveyor-submitter[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') logger(logging.INFO, 'Transfer submitter started') while not graceful_stop.is_set(): if activities is None: activities = [None] if rses: rse_ids = [rse['id'] for rse in rses] else: rse_ids = None for activity in activities: try: if activity_next_exe_time[activity] > time.time(): graceful_stop.wait(1) continue heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600) prefix = 'conveyor-submitter[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') user_transfer = False if activity in USER_ACTIVITY and USER_TRANSFERS in ['cms']: logger(logging.INFO, 'CMS user transfer activity') user_transfer = True logger(logging.INFO, 'Starting to get transfer transfers for %s', activity) start_time = time.time() transfers = __get_transfers(total_workers=heart_beat['nr_threads'], worker_number=heart_beat['assign_thread'], failover_schemes=failover_scheme, limit=bulk, activity=activity, rses=rse_ids, schemes=scheme, max_sources=max_sources, bring_online=bring_online, retry_other_fts=retry_other_fts, transfertool=filter_transfertool, logger=logger) record_timer('daemons.conveyor.transfer_submitter.get_transfers.per_transfer', (time.time() - start_time) * 1000 / (len(transfers) if 
transfers else 1)) record_counter('daemons.conveyor.transfer_submitter.get_transfers', len(transfers)) GET_TRANSFERS_COUNTER.inc(len(transfers)) record_timer('daemons.conveyor.transfer_submitter.get_transfers.transfers', len(transfers)) logger(logging.INFO, 'Got %s transfers for %s in %s seconds', len(transfers), activity, time.time() - start_time) # group transfers logger(logging.INFO, 'Starting to group transfers for %s', activity) start_time = time.time() grouped_jobs = bulk_group_transfer(transfers, group_policy, group_bulk, source_strategy, max_time_in_queue, group_by_scope=user_transfer) record_timer('daemons.conveyor.transfer_submitter.bulk_group_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1)) logger(logging.INFO, 'Starting to submit transfers for %s', activity) if transfertool in ['fts3', 'mock']: for external_host in grouped_jobs: if not user_transfer: for job in grouped_jobs[external_host]: # submit transfers submit_transfer(external_host=external_host, job=job, submitter='transfer_submitter', timeout=timeout, logger=logger, transfertool=transfertool) else: for _, jobs in iteritems(grouped_jobs[external_host]): # submit transfers for job in jobs: submit_transfer(external_host=external_host, job=job, submitter='transfer_submitter', timeout=timeout, user_transfer_job=user_transfer, logger=logger, transfertool=transfertool) elif transfertool == 'globus': if transfertype == 'bulk': # build bulk job file list per external host to send to submit_transfer for external_host in grouped_jobs: # pad the job with job_params; irrelevant for globus but needed for further rucio parsing submitjob = {'files': [], 'job_params': grouped_jobs[''][0].get('job_params')} for job in grouped_jobs[external_host]: submitjob.get('files').append(job.get('files')[0]) logger(logging.DEBUG, 'submitjob: %s' % submitjob) submit_transfer(external_host=external_host, job=submitjob, submitter='transfer_submitter', timeout=timeout, logger=logger, transfertool=transfertool) else: # build single job files and individually send to submit_transfer job_params = grouped_jobs[''][0].get('job_params') if grouped_jobs else None for external_host in grouped_jobs: for job in grouped_jobs[external_host]: for file in job['files']: singlejob = {'files': [file], 'job_params': job_params} logger(logging.DEBUG, 'singlejob: %s' % singlejob) submit_transfer(external_host=external_host, job=singlejob, submitter='transfer_submitter', timeout=timeout, logger=logger, transfertool=transfertool) else: logger(logging.ERROR, 'Unknown transfer tool') if len(transfers) < group_bulk: logger(logging.INFO, 'Only %s transfers for %s which is less than group bulk %s, sleep %s seconds', len(transfers), activity, group_bulk, sleep_time) if activity_next_exe_time[activity] < time.time(): activity_next_exe_time[activity] = time.time() + sleep_time except Exception: logger(logging.CRITICAL, 'Exception', exc_info=True) if once: break logger(logging.INFO, 'Graceful stop requested') heartbeat.die(executable, hostname, pid, hb_thread) logger(logging.INFO, 'Graceful stop done') return
def consumer(id, num_thread=1): """ Main loop to consume messages from the Rucio Cache producer. """ logging.info('Rucio Cache consumer starting') brokers_alias = [] brokers_resolved = [] try: brokers_alias = [ b.strip() for b in config_get('messaging-cache', 'brokers').split(',') ] except: raise Exception( 'Could not load rucio cache brokers from configuration') logging.info('resolving rucio cache broker dns alias: %s' % brokers_alias) brokers_resolved = [] for broker in brokers_alias: addrinfos = socket.getaddrinfo(broker, 0, socket.AF_INET, 0, socket.IPPROTO_TCP) brokers_resolved.extend(ai[4][0] for ai in addrinfos) logging.debug('Rucio cache brokers resolved to %s', brokers_resolved) conns = {} for broker in brokers_resolved: conn = stomp.Connection( host_and_ports=[(broker, config_get_int('messaging-cache', 'port'))], use_ssl=True, ssl_key_file=config_get('messaging-cache', 'ssl_key_file'), ssl_cert_file=config_get('messaging-cache', 'ssl_cert_file'), vhost=config_get('messaging-cache', 'broker_virtual_host', raise_exception=False)) conns[conn] = Consumer(conn.transport._Transport__host_and_ports[0], account=config_get('messaging-cache', 'account'), id=id, num_thread=num_thread) logging.info('consumer started') while not GRACEFUL_STOP.is_set(): for conn in conns: if not conn.is_connected(): logging.info('connecting to %s' % conn.transport._Transport__host_and_ports[0][0]) record_counter('daemons.messaging.cache.reconnect.%s' % conn.transport._Transport__host_and_ports[0] [0].split('.')[0]) conn.set_listener('rucio-cache-messaging', conns[conn]) conn.connect() conn.subscribe(destination=config_get('messaging-cache', 'destination'), id='rucio-cache-messaging', ack='auto') time.sleep(1) logging.info('graceful stop requested') for conn in conns: try: conn.disconnect() except: pass logging.info('graceful stop done')
def setup(self):
    self.host = config_get('client', 'rucio_host')
    self.auth_host = config_get('client', 'auth_host')
    # print 'get from config:', self.host, self.auth_host
    self.marker = '$> '
def kronos_file(thread=0, dataset_queue=None, sleep_time=60): """ Main loop to consume tracer reports. """ logging.info('kronos_file[%i/?] starting', thread) executable = 'kronos-file' hostname = socket.gethostname() pid = getpid() hb_thread = current_thread() chunksize = config_get_int('tracer-kronos', 'chunksize') prefetch_size = config_get_int('tracer-kronos', 'prefetch_size') subscription_id = config_get('tracer-kronos', 'subscription_id') try: bad_files_patterns = [] pattern = config_get(section='kronos', option='bad_files_patterns', session=None) pattern = str(pattern) patterns = pattern.split(",") for pat in patterns: bad_files_patterns.append(re.compile(pat.strip())) except (NoOptionError, NoSectionError, RuntimeError): bad_files_patterns = [] except Exception as error: logging.log(logging.ERROR, 'kronos_file[%i/?] Failed to get bad_file_patterns %s', thread, str(error)) bad_files_patterns = [] use_ssl = True try: use_ssl = config_get_bool('tracer-kronos', 'use_ssl') except Exception: pass if not use_ssl: username = config_get('tracer-kronos', 'username') password = config_get('tracer-kronos', 'password') excluded_usrdns = set( config_get('tracer-kronos', 'excluded_usrdns').split(',')) vhost = config_get('tracer-kronos', 'broker_virtual_host', raise_exception=False) brokers_alias = [ b.strip() for b in config_get('tracer-kronos', 'brokers').split(',') ] port = config_get_int('tracer-kronos', 'port') reconnect_attempts = config_get_int('tracer-kronos', 'reconnect_attempts') ssl_key_file = config_get('tracer-kronos', 'ssl_key_file', raise_exception=False) ssl_cert_file = config_get('tracer-kronos', 'ssl_cert_file', raise_exception=False) sanity_check(executable=executable, hostname=hostname) while not graceful_stop.is_set(): start_time = time() heart_beat = live(executable, hostname, pid, hb_thread) prepend_str = 'kronos-file[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prepend_str + '%s') conns = get_stomp_brokers(brokers=brokers_alias, port=port, use_ssl=use_ssl, vhost=vhost, reconnect_attempts=reconnect_attempts, ssl_key_file=ssl_key_file, ssl_cert_file=ssl_cert_file, timeout=sleep_time, logger=logger) for conn in conns: if not conn.is_connected(): logger( logging.INFO, 'connecting to %s' % str(conn.transport._Transport__host_and_ports[0])) record_counter( 'daemons.tracer.kronos.reconnect.{host}', labels={ 'host': conn.transport._Transport__host_and_ports[0][0] }) conn.set_listener( 'rucio-tracer-kronos', AMQConsumer( broker=conn.transport._Transport__host_and_ports[0], conn=conn, queue=config_get('tracer-kronos', 'queue'), chunksize=chunksize, subscription_id=subscription_id, excluded_usrdns=excluded_usrdns, dataset_queue=dataset_queue, bad_files_patterns=bad_files_patterns, logger=logger)) if not use_ssl: conn.connect(username, password) else: conn.connect() conn.subscribe( destination=config_get('tracer-kronos', 'queue'), ack='client-individual', id=subscription_id, headers={'activemq.prefetchSize': prefetch_size}) tottime = time() - start_time if tottime < sleep_time: logger(logging.INFO, 'Will sleep for %s seconds' % (sleep_time - tottime)) sleep(sleep_time - tottime) logger(logging.INFO, 'graceful stop requested') for conn in conns: try: conn.disconnect() except Exception: pass die(executable=executable, hostname=hostname, pid=pid, thread=thread) logger(logging.INFO, 'graceful stop done')
def get_signed_url(rse_id, service, operation, url, lifetime=600): """ Get a signed URL for a particular service and operation. The signed URL will be valid for 1 hour but can be overriden. :param rse_id: The ID of the RSE that the URL points to. :param service: The service to authorise, either 'gcs', 's3' or 'swift'. :param operation: The operation to sign, either 'read', 'write', or 'delete'. :param url: The URL to sign. :param lifetime: Lifetime of the signed URL in seconds. :returns: Signed URL as a variable-length string. """ global CREDS_GCS if service not in ['gcs', 's3', 'swift']: raise UnsupportedOperation('Service must be "gcs", "s3" or "swift"') if operation not in ['read', 'write', 'delete']: raise UnsupportedOperation( 'Operation must be "read", "write", or "delete"') if url is None or url == '': raise UnsupportedOperation('URL must not be empty') if lifetime: if not isinstance(lifetime, integer_types): try: lifetime = int(lifetime) except: raise UnsupportedOperation( 'Lifetime must be convertible to numeric.') signed_url = None if service == 'gcs': if not CREDS_GCS: CREDS_GCS = ServiceAccountCredentials.from_json_keyfile_name( config_get( 'credentials', 'gcs', raise_exception=False, default='/opt/rucio/etc/google-cloud-storage-test.json')) components = urlparse(url) host = components.netloc # select the correct operation operations = {'read': 'GET', 'write': 'PUT', 'delete': 'DELETE'} operation = operations[operation] # special case to test signature, force epoch time if lifetime is None: lifetime = 0 else: # GCS is timezone-sensitive, don't use UTC # has to be converted to Unixtime lifetime = datetime.datetime.now() + datetime.timedelta( seconds=lifetime) lifetime = int(time.mktime(lifetime.timetuple())) # sign the path only path = components.path # assemble message to sign to_sign = "%s\n\n\n%s\n%s" % (operation, lifetime, path) # create URL-capable signature # first character is always a '=', remove it signature = urlencode( {'': base64.b64encode(CREDS_GCS.sign_blob(to_sign)[1])})[1:] # assemble final signed URL signed_url = 'https://%s%s?GoogleAccessId=%s&Expires=%s&Signature=%s' % ( host, path, CREDS_GCS.service_account_email, lifetime, signature) elif service == 's3': # split URL to get hostname, bucket and key components = urlparse(url) host = components.netloc pathcomponents = components.path.split('/') if len(pathcomponents) < 3: raise UnsupportedOperation('Not a valid S3 URL') bucket = pathcomponents[1] key = '/'.join(pathcomponents[2:]) # remove port number from host if present colon = host.find(':') port = '443' if colon >= 0: port = host[colon + 1:] host = host[:colon] # look up in RSE account configuration by RSE ID cred_name = rse_id cred = REGION.get('s3-%s' % cred_name) if cred is NO_VALUE: rse_cred = get_rse_credentials() cred = rse_cred.get(cred_name) REGION.set('s3-%s' % cred_name, cred) access_key = cred['access_key'] secret_key = cred['secret_key'] signature_version = cred['signature_version'] region_name = cred['region'] if operation == 'read': s3op = 'get_object' elif operation == 'write': s3op = 'put_object' else: s3op = 'delete_object' with record_timer_block('credential.signs3'): s3 = boto3.client('s3', endpoint_url='https://' + host + ':' + port, aws_access_key_id=access_key, aws_secret_access_key=secret_key, config=Config( signature_version=signature_version, region_name=region_name)) signed_url = s3.generate_presigned_url(s3op, Params={ 'Bucket': bucket, 'Key': key }, ExpiresIn=lifetime) elif service == 'swift': # split URL to get hostname and 
path components = urlparse(url) host = components.netloc # remove port number from host if present colon = host.find(':') if colon >= 0: host = host[:colon] # use RSE ID to look up key cred_name = rse_id # look up tempurl signing key cred = REGION.get('swift-%s' % cred_name) if cred is NO_VALUE: rse_cred = get_rse_credentials() cred = rse_cred.get(cred_name) REGION.set('swift-%s' % cred_name, cred) tempurl_key = cred['tempurl_key'] if operation == 'read': swiftop = 'GET' elif operation == 'write': swiftop = 'PUT' else: swiftop = 'DELETE' expires = int(time.time() + lifetime) # create signed URL with record_timer_block('credential.signswift'): hmac_body = u'%s\n%s\n%s' % (swiftop, expires, components.path) # Python 3 hmac only accepts bytes or bytearray sig = hmac.new(bytearray(tempurl_key, 'utf-8'), bytearray(hmac_body, 'utf-8'), sha1).hexdigest() signed_url = 'https://' + host + components.path + '?temp_url_sig=' + sig + '&temp_url_expires=' + str( expires) return signed_url
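# Illustrative call of get_signed_url() as defined above; the rse_id and URL are
# hypothetical and only show the expected shapes of the arguments.
signed = get_signed_url(rse_id='0123456789abcdef0123456789abcdef',
                        service='swift',
                        operation='read',
                        url='https://swift.example.org:443/v1/AUTH_rucio/mybucket/mock/file-1',
                        lifetime=3600)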
def create_root_account(create_counters=True): """ Inserts the default root account to an existing database. Make sure to change the default password later. :param create_counters: If True, create counters for the new account at existing RSEs. """ multi_vo = bool(config_get('common', 'multi_vo', False, False)) up_id = 'ddmlab' up_pwd = 'secret' up_email = '*****@*****.**' x509_id = '/C=CH/ST=Geneva/O=CERN/OU=PH-ADP-CO/CN=DDMLAB Client Certificate/[email protected]' x509_email = '*****@*****.**' gss_id = '*****@*****.**' gss_email = '*****@*****.**' ssh_id = 'ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEAq5LySllrQFpPL614sulXQ7wnIr1aGhGtl8b+HCB/'\ '0FhMSMTHwSjX78UbfqEorZV16rXrWPgUpvcbp2hqctw6eCbxwqcgu3uGWaeS5A0iWRw7oXUh6ydn'\ 'Vy89zGzX1FJFFDZ+AgiZ3ytp55tg1bjqqhK1OSC0pJxdNe878TRVVo5MLI0S/rZY2UovCSGFaQG2'\ 'iLj14wz/YqI7NFMUuJFR4e6xmNsOP7fCZ4bGMsmnhR0GmY0dWYTupNiP5WdYXAfKExlnvFLTlDI5'\ 'Mgh4Z11NraQ8pv4YE1woolYpqOc/IMMBBXFniTT4tC7cgikxWb9ZmFe+r4t6yCDpX4IL8L5GOQ== ddmlab' ssh_email = '*****@*****.**' try: up_id = config_get('bootstrap', 'userpass_identity') up_pwd = config_get('bootstrap', 'userpass_pwd') up_email = config_get('bootstrap', 'userpass_email') x509_id = config_get('bootstrap', 'x509_identity') x509_email = config_get('bootstrap', 'x509_email') gss_id = config_get('bootstrap', 'gss_identity') gss_email = config_get('bootstrap', 'gss_email') ssh_id = config_get('bootstrap', 'ssh_identity') ssh_email = config_get('bootstrap', 'ssh_email') except: pass # print 'Config values are missing (check rucio.cfg{.template}). Using hardcoded defaults.' s = get_session() if multi_vo: access = 'super_root' else: access = 'root' account = models.Account(account=InternalAccount(access, 'def'), account_type=AccountType.SERVICE, status=AccountStatus.ACTIVE) salt = urandom(255) salted_password = salt + up_pwd.encode() hashed_password = sha256(salted_password).hexdigest() identity1 = models.Identity(identity=up_id, identity_type=IdentityType.USERPASS, password=hashed_password, salt=salt, email=up_email) iaa1 = models.IdentityAccountAssociation( identity=identity1.identity, identity_type=identity1.identity_type, account=account.account, is_default=True) # X509 authentication identity2 = models.Identity(identity=x509_id, identity_type=IdentityType.X509, email=x509_email) iaa2 = models.IdentityAccountAssociation( identity=identity2.identity, identity_type=identity2.identity_type, account=account.account, is_default=True) # GSS authentication identity3 = models.Identity(identity=gss_id, identity_type=IdentityType.GSS, email=gss_email) iaa3 = models.IdentityAccountAssociation( identity=identity3.identity, identity_type=identity3.identity_type, account=account.account, is_default=True) # SSH authentication identity4 = models.Identity(identity=ssh_id, identity_type=IdentityType.SSH, email=ssh_email) iaa4 = models.IdentityAccountAssociation( identity=identity4.identity, identity_type=identity4.identity_type, account=account.account, is_default=True) # Account counters if create_counters: create_counters_for_new_account(account=account.account, session=s) # Apply for identity in [identity1, identity2, identity3, identity4]: try: s.add(identity) s.commit() except IntegrityError: # Identities may already be in the DB when running multi-VO conversion s.rollback() s.add(account) s.commit() s.add_all([iaa1, iaa2, iaa3, iaa4]) s.commit()
def add_subscription(name, account, filter_, replication_rules, comments, lifetime, retroactive, dry_run, priority=3, session=None): """ Adds a new subscription which will be verified against every new added file and dataset :param account: Account identifier :type account: String :param name: Name of the subscription :type name: String :param filter_: Dictionary of attributes by which the input data should be filtered **Example**: ``{'dsn': 'data11_hi*.express_express.*,data11_hi*physics_MinBiasOverlay*', 'account': 'tzero'}`` :type filter_: Dict :param replication_rules: Replication rules to be set : Dictionary with keys copies, rse_expression, weight, rse_expression :type replication_rules: Dict :param comments: Comments for the subscription :type comments: String :param lifetime: Subscription's lifetime (days) :type lifetime: Integer or None :param retroactive: Flag to know if the subscription should be applied on previous data :type retroactive: Boolean :param dry_run: Just print the subscriptions actions without actually executing them (Useful if retroactive flag is set) :type dry_run: Boolean :param priority: The priority of the subscription :type priority: Integer :param session: The database session in use. :returns: The subscriptionid """ try: keep_history = config_get('subscriptions', 'keep_history') except (NoOptionError, NoSectionError, RuntimeError): keep_history = False SubscriptionHistory = models.SubscriptionHistory retroactive = bool( retroactive) # Force boolean type, necessary for strict SQL state = SubscriptionState.ACTIVE lifetime = None if retroactive: state = SubscriptionState.NEW if lifetime: lifetime = datetime.datetime.utcnow() + datetime.timedelta( days=lifetime) new_subscription = models.Subscription(name=name, filter=filter_, account=account, replication_rules=replication_rules, state=state, lifetime=lifetime, retroactive=retroactive, policyid=priority, comments=comments) if keep_history: subscription_history = SubscriptionHistory( id=new_subscription.id, name=new_subscription.name, filter=new_subscription.filter, account=new_subscription.account, replication_rules=new_subscription.replication_rules, state=new_subscription.state, lifetime=new_subscription.lifetime, retroactive=new_subscription.retroactive, policyid=new_subscription.policyid, comments=new_subscription.comments) try: new_subscription.save(session=session) if keep_history: subscription_history.save(session=session) except IntegrityError as error: if re.match('.*IntegrityError.*ORA-00001: unique constraint.*SUBSCRIPTIONS_PK.*violated.*', error.args[0])\ or re.match(".*IntegrityError.*UNIQUE constraint failed: subscriptions.name, subscriptions.account.*", error.args[0])\ or re.match('.*IntegrityError.*columns? name.*account.*not unique.*', error.args[0]) \ or re.match('.*IntegrityError.*ORA-00001: unique constraint.*SUBSCRIPTIONS_NAME_ACCOUNT_UQ.*violated.*', error.args[0])\ or re.match('.*IntegrityError.*1062.*Duplicate entry.*', error.args[0]) \ or re.match('.*IntegrityError.*duplicate key value violates unique constraint.*', error.args[0]) \ or re.match('.*UniqueViolation.*duplicate key value violates unique constraint.*', error.args[0]): raise SubscriptionDuplicate( 'Subscription \'%s\' owned by \'%s\' already exists!' % (name, account)) raise RucioException(error.args) return new_subscription.id
def update_subscription(name, account, metadata=None, session=None): """ Updates a subscription :param name: Name of the subscription :type name: String :param account: Account identifier :type account: String :param metadata: Dictionary of metadata to update. Supported keys : filter, replication_rules, comments, lifetime, retroactive, dry_run, priority, last_processed :type metadata: Dict :param session: The database session in use. :raises: SubscriptionNotFound if subscription is not found """ try: keep_history = config_get('subscriptions', 'keep_history') except (NoOptionError, NoSectionError, RuntimeError): keep_history = False values = {'state': SubscriptionState.UPDATED} if 'filter' in metadata and metadata['filter']: values['filter'] = dumps(metadata['filter']) if 'replication_rules' in metadata and metadata['replication_rules']: values['replication_rules'] = dumps(metadata['replication_rules']) if 'lifetime' in metadata and metadata['lifetime']: values['lifetime'] = datetime.datetime.utcnow() + datetime.timedelta( days=float(metadata['lifetime'])) if 'retroactive' in metadata and metadata['retroactive']: values['retroactive'] = metadata['retroactive'] if 'dry_run' in metadata and metadata['dry_run']: values['dry_run'] = metadata['dry_run'] if 'comments' in metadata and metadata['comments']: values['comments'] = metadata['comments'] if 'priority' in metadata and metadata['priority']: values['policyid'] = metadata['priority'] if 'last_processed' in metadata and metadata['last_processed']: values['last_processed'] = metadata['last_processed'] if 'state' in metadata and metadata['state'] == SubscriptionState.INACTIVE: values['state'] = SubscriptionState.INACTIVE values['expired_at'] = datetime.datetime.utcnow() SubscriptionHistory = models.SubscriptionHistory try: subscription = session.query(models.Subscription).filter_by( account=account, name=name).one() subscription.update(values) if keep_history: subscription_history = SubscriptionHistory( id=subscription.id, name=subscription.name, filter=subscription.filter, account=subscription.account, replication_rules=subscription.replication_rules, state=subscription.state, lifetime=subscription.lifetime, retroactive=subscription.retroactive, policyid=subscription.policyid, comments=subscription.comments, last_processed=subscription.last_processed, expired_at=subscription.expired_at, updated_at=subscription.updated_at, created_at=subscription.created_at) subscription_history.save(session=session) except NoResultFound: raise SubscriptionNotFound( "Subscription for account '%(account)s' named '%(name)s' not found" % locals())
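# Hedged usage sketch for update_subscription() above: bump the priority of an existing
# subscription; the change is mirrored to the history table if keep_history is enabled.
# The subscription name and account are hypothetical, and `session` is an open DB session.
update_subscription(name='data-placement',
                    account='root',
                    metadata={'priority': 1, 'comments': 'raised priority for reprocessing'},
                    session=session)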
def tune(self): """ tune the configuration settings """ result = self.request_timeout_data() if result is not None: try: cycle_file = config_get('conveyor', 'fts_throttler_cycle') except Exception: logging.warn( 'could not get the cycle file, cannot perform tuning for this cycle without cycle file, returning' ) return try: tuning_ratio = config_get('conveyor', 'fts_throttler_tuning_ratio') except Exception: logging.warn( 'could not get the tuning ratio from config, returning') return rses = result['aggregations']['rse']['buckets'] cycle_info_dict = {'storages': []} for rse in rses: # if a rse has a failure ratio above the tuning ratio (percentage) we tune it. if rse['failure_ratio'].get('value') > int(tuning_ratio): # rse_info holds the storage name(0) and FTS-host server(1) rse_info = rse['key'].split() # Tapes might have other reasons for timeouts which should be treated differently, therefor they are ignored and not tuned for now. if rse['storage_type']['hits']['hits'][0]['_source'][ 'payload']['dst-type'] == 'TAPE': logging.info( '%s is a tape storage type, it will not be tuned', rse_info[0]) continue # instantiate transfertool for access to get_se_config and set_se_config. t = FTS3Transfertool(rse_info[1]) # extract FTS storage from dst-url tmp = rse['destination']['hits']['hits'][0]['_source'][ 'payload']['dst-url'].split(':', 2) url = tmp[0] + ':' + tmp[1] n = rse['failure_ratio'].get('value') logging.info(' RSE ' + rse_info[0] + ' on FTS host ' + rse_info[1] + ' has failure ratio ' + str(rse['failure_ratio'].get('value')) + ' on storage ' + url) try: se = t.get_se_config(url) logging.info('storage settings: %s', se) except KeyError: logging.warn( 'configuration for storage element was not found, config will be set from default values' ) # all FTS Host servers have a default reference storage named '*' that holds the default values for all storages that arent listed yet. default_storage = t.get_se_config('*') t.set_se_config( url, inbound_max_active=int( (100 / (100 + n)) * default_storage['se_info'] ['inbound_max_active']), outbound_max_active=int( (100 / (100 + n)) * default_storage['se_info'] ['outbound_max_active'])) logging.info( url + 'inbound_max_active changed from ' + str(default_storage['se_info'] ['inbound_max_active']) + ' to ' + str( int((100 / (100 + n)) * default_storage['se_info'] ['inbound_max_active'])) + ', outbound_max_active changed from ' + str(default_storage['se_info'] ['outbound_max_active']) + ' to ' + str( int((100 / (100 + n)) * default_storage['se_info'] ['outbound_max_active']))) # cycle_info_dict is used to write changes down to the cycle file. cycle_info_dict['storages'].append({ 'storage': url, 'inbound_max_active': default_storage['se_info']['inbound_max_active'], 'outbound_max_active': default_storage['se_info']['outbound_max_active'], 'failure_ratio': n, 'tuned_inbound_max_active': int((100 / (100 + n)) * default_storage['se_info'] ['inbound_max_active']), 'tuned_outbound_max_active': int((100 / (100 + n)) * default_storage['se_info'] ['outbound_max_active']), 'fts-host': rse_info[1], 'time': str(datetime.datetime.now()) }) continue except Exception as error: logging.warn( 'an error occured when trying to get the storage configuration' ) logging.warn(str(error)) continue # Even though we could read the config, we still need to know if the important attributes are empty. 
if se['se_info']['inbound_max_active'] is None: try: default_storage = t.get_se_config('*') except Exception: raise Exception( 'Could not retrieve the default storage information' ) ima = default_storage['se_info']['inbound_max_active'] else: ima = se['se_info']['inbound_max_active'] if se['se_info']['outbound_max_active'] is None: try: default_storage = t.get_se_config('*') except Exception: raise Exception( 'Could not retrieve the default storage information' ) oma = default_storage['se_info']['outbound_max_active'] else: oma = se['se_info']['outbound_max_active'] # append existing information to dict and write to file. cycle_info_dict['storages'].append({ 'storage': url, 'inbound_max_active': ima, 'outbound_max_active': oma, 'failure_ratio': n, 'tuned_inbound_max_active': int((100 / (100 + n)) * ima), 'tuned_outbound_max_active': int((100 / (100 + n)) * oma), 'fts-host': rse_info[1], 'time': str(datetime.datetime.now()) }) # tune down the configuration of a storage relative to the failure ratio(n) and existing configuration. t.set_se_config(url, inbound_max_active=int( (100 / (100 + n)) * ima), outbound_max_active=int( (100 / (100 + n)) * oma)) logging.info(url + 'inbound_max_active changed from ' + str(ima) + ' to ' + str(int((100 / (100 + n)) * ima)) + ', outbound_max_active changed from ' + str(oma) + ' to ' + str(int((100 / (100 + n)) * oma))) if cycle_info_dict['storages'] == []: logging.info( 'no storages are failing significantly due to timeout errors, therefor no tuning happened.' ) with open(cycle_file, 'w') as outfile: json.dump(cycle_info_dict, outfile) else: logging.warn( 'Could not detect any storages with sufficient failure ratio for tuning, trying again next cycle' ) return
from sys import exc_info, stdout, argv
from traceback import format_exception

from rucio.db.sqla.constants import LifetimeExceptionsState
from rucio.common.config import config_get
from rucio.common.exception import RuleNotFound
import rucio.common.policy
from rucio.core import heartbeat
import rucio.core.lifetime_exception
from rucio.core.lock import get_dataset_locks
from rucio.core.rse_expression_parser import parse_expression
from rucio.core.rule import get_rules_beyond_eol, update_rule

logging.basicConfig(stream=stdout,
                    level=getattr(logging, config_get('common', 'loglevel').upper()),
                    format='%(asctime)s\t%(process)d\t%(levelname)s\t%(message)s')

GRACEFUL_STOP = threading.Event()


def atropos(thread, bulk, date_check, dry_run=True, grace_period=86400, once=True):
    """
    Creates an Atropos worker that gets the list of rules whose eol_at has expired and deletes them.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param grace_period: The grace_period for the rules.
    :param once: Run only once.
    """
def consumer(id, num_thread=1): """ Main loop to consume messages from the Rucio Cache producer. """ logging.info('Rucio Cache consumer starting') brokers_alias = [] brokers_resolved = [] try: brokers_alias = [b.strip() for b in config_get('messaging-cache', 'brokers').split(',')] except: raise Exception('Could not load rucio cache brokers from configuration') logging.info('resolving rucio cache broker dns alias: %s' % brokers_alias) brokers_resolved = [] for broker in brokers_alias: brokers_resolved.append([str(tmp_broker) for tmp_broker in dns.resolver.query(broker, 'A')]) brokers_resolved = [item for sublist in brokers_resolved for item in sublist] logging.debug('Rucio cache brokers resolved to %s', brokers_resolved) conns = {} for broker in brokers_resolved: conn = stomp.Connection(host_and_ports=[(broker, config_get_int('messaging-cache', 'port'))], use_ssl=True, ssl_key_file=config_get('messaging-cache', 'ssl_key_file'), ssl_cert_file=config_get('messaging-cache', 'ssl_cert_file'), ssl_version=ssl.PROTOCOL_TLSv1) conns[conn] = Consumer(conn.transport._Transport__host_and_ports[0], account=config_get('messaging-cache', 'account'), id=id, num_thread=num_thread) logging.info('consumer started') while not graceful_stop.is_set(): for conn in conns: if not conn.is_connected(): logging.info('connecting to %s' % conn.transport._Transport__host_and_ports[0][0]) record_counter('daemons.messaging.cache.reconnect.%s' % conn.transport._Transport__host_and_ports[0][0].split('.')[0]) conn.set_listener('rucio-cache-messaging', conns[conn]) conn.start() conn.connect() conn.subscribe(destination=config_get('messaging-cache', 'destination'), id='rucio-cache-messaging', ack='auto', headers={'selector': 'vo = \'%s\'' % config_get('messaging-cache', 'voname')}) time.sleep(1) logging.info('graceful stop requested') for conn in conns: try: conn.disconnect() except: pass logging.info('graceful stop done')
def request_transfer(loop=1, src=None, dst=None, upload=False, same_src=False, same_dst=False): """ Main loop to request a new transfer. """ logging.info('request: starting') session = get_session() src_rse = generate_rse( src, ''.join(random.sample(string.ascii_letters.upper(), 8))) dst_rse = generate_rse( dst, ''.join(random.sample(string.ascii_letters.upper(), 8))) logging.info('request: started') i = 0 while not graceful_stop.is_set(): if i >= loop: return try: if not same_src: src_rse = generate_rse( src, ''.join(random.sample(string.ascii_letters.upper(), 8))) if not same_dst: dst_rse = generate_rse( dst, ''.join(random.sample(string.ascii_letters.upper(), 8))) tmp_name = generate_uuid() # add a new dataset did.add_did(scope='mock', name='dataset-%s' % tmp_name, type=DIDType.DATASET, account='root', session=session) # construct PFN pfn = rsemanager.lfns2pfns(src_rse, lfns=[{ 'scope': 'mock', 'name': 'file-%s' % tmp_name }])['mock:file-%s' % tmp_name] if upload: # create the directories if needed p = rsemanager.create_protocol(src_rse, operation='write', scheme='srm') p.connect() try: p.mkdir(pfn) except: pass # upload the test file try: fp = os.path.dirname(config_get('injector', 'file')) fn = os.path.basename(config_get('injector', 'file')) p.put(fn, pfn, source_dir=fp) except: logging.critical( 'Could not upload, removing temporary DID: %s' % str(sys.exc_info())) did.delete_dids([{ 'scope': 'mock', 'name': 'dataset-%s' % tmp_name }], account='root', session=session) break # add the replica replica.add_replica(rse=src_rse['rse'], scope='mock', name='file-%s' % tmp_name, bytes=config_get_int('injector', 'bytes'), adler32=config_get('injector', 'adler32'), md5=config_get('injector', 'md5'), account='root', session=session) logging.info('added replica on %s for DID mock:%s' % (src_rse['rse'], tmp_name)) # to the dataset did.attach_dids(scope='mock', name='dataset-%s' % tmp_name, dids=[{ 'scope': 'mock', 'name': 'file-%s' % tmp_name, 'bytes': config_get('injector', 'bytes') }], account='root', session=session) # add rule for the dataset rule.add_rule(dids=[{ 'scope': 'mock', 'name': 'dataset-%s' % tmp_name }], account='root', copies=1, rse_expression=dst_rse['rse'], grouping='ALL', weight=None, lifetime=None, locked=False, subscription_id=None, activity='mock-injector', session=session) logging.info('added rule for %s for DID mock:%s' % (dst_rse['rse'], tmp_name)) session.commit() except: session.rollback() logging.critical(traceback.format_exc()) i += 1 logging.info('request: graceful stop requested') logging.info('request: graceful stop done')
import logging
import socket
import sys
import threading
import os

from rucio.core.heartbeat import live, die, sanity_check
from rucio.common.config import config_get

GRACEFUL_STOP = threading.Event()

logging.basicConfig(stream=sys.stdout,
                    level=getattr(logging, config_get('common', 'loglevel').upper()),
                    format='%(asctime)s\t%(process)d\t%(levelname)s\t%(message)s')


def rule_rebalancer(once=False):
    """
    Main loop to rebalance rules automatically.
    """

    raise NotImplementedError()

    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()

    # Make an initial heartbeat so that all have the correct worker number on the next try
def __init__(self, rucio_host=None, auth_host=None, account=None, ca_cert=None, auth_type=None, creds=None, timeout=None, user_agent='rucio-clients'): """ Constructor of the BaseClient. :param rucio_host: the address of the rucio server, if None it is read from the config file. :param rucio_port: the port of the rucio server, if None it is read from the config file. :param auth_host: the address of the rucio authentication server, if None it is read from the config file. :param auth_port: the port of the rucio authentication server, if None it is read from the config file. :param account: the account to authenticate to rucio. :param use_ssl: enable or disable ssl for commucation. Default is enabled. :param ca_cert: the path to the rucio server certificate. :param auth_type: the type of authentication (e.g.: 'userpass', 'kerberos' ...) :param creds: a dictionary with credentials needed for authentication. :param user_agent: indicates the client """ self.host = rucio_host self.list_hosts = [] self.auth_host = auth_host self.session = session() self.user_agent = "%s/%s" % (user_agent, version.version_string() ) # e.g. "rucio-clients/0.2.13" sys.argv[0] = sys.argv[0].split('/')[-1] self.script_id = '::'.join(sys.argv[0:2]) if self.script_id == '': # Python interpreter used self.script_id = 'python' try: if self.host is None: self.host = config_get('client', 'rucio_host') if self.auth_host is None: self.auth_host = config_get('client', 'auth_host') except (NoOptionError, NoSectionError) as error: raise MissingClientParameter( 'Section client and Option \'%s\' cannot be found in config file' % error.args[0]) self.account = account self.ca_cert = ca_cert self.auth_type = auth_type self.creds = creds self.auth_token = None self.headers = {} self.timeout = timeout self.request_retries = self.REQUEST_RETRIES if auth_type is None: LOG.debug( 'no auth_type passed. Trying to get it from the environment variable RUCIO_AUTH_TYPE and config file.' ) if 'RUCIO_AUTH_TYPE' in environ: if environ['RUCIO_AUTH_TYPE'] not in ('userpass', 'x509', 'x509_proxy', 'gss', 'ssh'): raise MissingClientParameter( 'Possible RUCIO_AUTH_TYPE values: userpass, x509, x509_proxy, gss, ssh, vs. ' + environ['RUCIO_AUTH_TYPE']) self.auth_type = environ['RUCIO_AUTH_TYPE'] else: try: self.auth_type = config_get('client', 'auth_type') except (NoOptionError, NoSectionError) as error: raise MissingClientParameter( 'Option \'%s\' cannot be found in config file' % error.args[0]) if creds is None: LOG.debug( 'no creds passed. Trying to get it from the config file.') self.creds = {} try: if self.auth_type == 'userpass': self.creds['username'] = config_get('client', 'username') self.creds['password'] = config_get('client', 'password') elif self.auth_type == 'x509': self.creds['client_cert'] = path.abspath( path.expanduser( path.expandvars(config_get('client', 'client_cert')))) self.creds['client_key'] = path.abspath( path.expanduser( path.expandvars(config_get('client', 'client_key')))) elif self.auth_type == 'x509_proxy': try: self.creds['client_proxy'] = path.abspath( path.expanduser( path.expandvars( config_get('client', 'client_x509_proxy')))) except NoOptionError as error: # Recreate the classic GSI logic for locating the proxy: # - $X509_USER_PROXY, if it is set. # - /tmp/x509up_u`id -u` otherwise. 
# If neither exists (at this point, we don't care if it exists but is invalid), then rethrow if 'X509_USER_PROXY' in environ: self.creds['client_proxy'] = environ[ 'X509_USER_PROXY'] else: fname = '/tmp/x509up_u%d' % geteuid() if path.exists(fname): self.creds['client_proxy'] = fname else: raise MissingClientParameter( 'Cannot find a valid X509 proxy; not in %s, $X509_USER_PROXY not set, and ' '\'x509_proxy\' not set in the configuration file.' % fname) elif self.auth_type == 'ssh': self.creds['ssh_private_key'] = path.abspath( path.expanduser( path.expandvars( config_get('client', 'ssh_private_key')))) except (NoOptionError, NoSectionError) as error: if error.args[0] != 'client_key': raise MissingClientParameter( 'Option \'%s\' cannot be found in config file' % error.args[0]) rucio_scheme = urlparse(self.host).scheme auth_scheme = urlparse(self.auth_host).scheme if rucio_scheme != 'http' and rucio_scheme != 'https': raise ClientProtocolNotSupported('\'%s\' not supported' % rucio_scheme) if auth_scheme != 'http' and auth_scheme != 'https': raise ClientProtocolNotSupported('\'%s\' not supported' % auth_scheme) if (rucio_scheme == 'https' or auth_scheme == 'https') and ca_cert is None: LOG.debug( 'no ca_cert passed. Trying to get it from the config file.') try: self.ca_cert = path.expandvars(config_get('client', 'ca_cert')) except (NoOptionError, NoSectionError) as error: raise MissingClientParameter( 'Option \'%s\' cannot be found in config file' % error.args[0]) self.list_hosts = [self.host] if account is None: LOG.debug( 'no account passed. Trying to get it from the config file.') try: self.account = config_get('client', 'account') except (NoOptionError, NoSectionError): try: self.account = environ['RUCIO_ACCOUNT'] except KeyError: raise MissingClientParameter( 'Option \'account\' cannot be found in config file and RUCIO_ACCOUNT is not set.' ) token_path = self.TOKEN_PATH_PREFIX + self.account self.token_file = token_path + '/' + self.TOKEN_PREFIX + self.account self.__authenticate() try: self.request_retries = int(config_get('client', 'request_retries')) except NoOptionError: LOG.debug( 'request_retries not specified in config file. Taking default.' ) except ValueError: LOG.debug('request_retries must be an integer. Taking default.')
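# The proxy lookup above follows the classic GSI convention. Below is a minimal,
# self-contained sketch of that fallback order; the function name and the error
# message are illustrative and not part of the client itself.
import os


def find_x509_proxy():
    """Locate an X509 proxy the way GSI tools do: $X509_USER_PROXY first,
    then the conventional /tmp/x509up_u<euid> location."""
    if 'X509_USER_PROXY' in os.environ:
        return os.environ['X509_USER_PROXY']
    fname = '/tmp/x509up_u%d' % os.geteuid()
    if os.path.exists(fname):
        return fname
    raise RuntimeError('No X509 proxy found: $X509_USER_PROXY not set and %s does not exist' % fname)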
def list_rebalance_rule_candidates(rse_id, mode=None, session=None): """ List the rebalance rule candidates based on the agreed-on specification :param rse_id: RSE of the source. :param mode: Rebalancing mode. :param session: DB Session. """ vo = get_rse_vo(rse_id=rse_id) # dumps can be applied only for decommission since the dumps don't contain info from DIDs if mode == "decommission": return _list_rebalance_rule_candidates_dump(rse_id, mode) # If not decommissioning, use SQLAlchemy # Rule constraints. By default only move rules in state OK that have no children and have only one copy # Additional constraints can be imposed by setting specific configuration rule_clause = [ models.ReplicationRule.state == RuleState.OK, models.ReplicationRule.child_rule_id.is_(None), models.ReplicationRule.copies == 1, ] # Only move rules w/o expiration date, or rules with expiration_date > <min_expires_date_in_days> days expiration_clause = models.ReplicationRule.expires_at.is_(None) min_expires_date_in_days = config_get_int( section="bb8", option="min_expires_date_in_days", raise_exception=False, default=-1, expiration_time=3600, ) if min_expires_date_in_days > 0: min_expires_date_in_days = datetime.utcnow() + timedelta( days=min_expires_date_in_days) expiration_clause = or_( models.ReplicationRule.expires_at > min_expires_date_in_days, models.ReplicationRule.expires_at.is_(None), ) rule_clause.append(expiration_clause) # Only move rules which were created more than <min_created_days> days ago min_created_days = config_get_int( section="bb8", option="min_created_days", raise_exception=False, default=-1, expiration_time=3600, ) if min_created_days > 0: min_created_days = datetime.now() - timedelta(days=min_created_days) rule_clause.append( models.ReplicationRule.created_at < min_created_days) # Only move rules which are owned by <allowed_accounts> (comma-separated accounts, e.g. panda,root,ddmadmin,jdoe) allowed_accounts = config_get( section="bb8", option="allowed_accounts", raise_exception=False, default=None, expiration_time=3600, ) if allowed_accounts: allowed_accounts = [ InternalAccount(acc.strip(" "), vo=vo) for acc in allowed_accounts.split(",") ] rule_clause.append( models.ReplicationRule.account.in_(allowed_accounts)) # Only move rules with scope <allowed_scopes> (comma-separated scopes, e.g. mc16_13TeV,data18_13TeV) allowed_scopes = config_get( section="bb8", option="allowed_scopes", raise_exception=False, default=None, expiration_time=3600, ) if allowed_scopes: allowed_scopes = [ InternalScope(scope.strip(" "), vo=vo) for scope in allowed_scopes.split(",") ] rule_clause.append(models.ReplicationRule.scope.in_(allowed_scopes)) # Only move rules that have a certain grouping <allowed_grouping> (accepted values: all, dataset, none) rule_grouping_mapping = { "all": RuleGrouping.ALL, "dataset": RuleGrouping.DATASET, "none": RuleGrouping.NONE, } allowed_grouping = config_get( section="bb8", option="allowed_grouping", raise_exception=False, default=None, expiration_time=3600, ) if allowed_grouping: rule_clause.append(models.ReplicationRule.grouping == rule_grouping_mapping.get(allowed_grouping)) # DIDs constraints. 
By default only moves rules of DID where we can compute the size # Additional constraints can be imposed by setting specific configuration did_clause = [models.DataIdentifier.bytes.isnot(None)] type_to_did_type_mapping = { "all": [DIDType.CONTAINER, DIDType.DATASET, DIDType.FILE], "collection": [DIDType.CONTAINER, DIDType.DATASET], "container": [DIDType.CONTAINER], "dataset": [DIDType.DATASET], "file": [DIDType.FILE], } # Only allows to migrate rules of a certain did_type <allowed_did_type> (accepted values : all, collection, container, dataset, file) allowed_did_type = config_get( section="bb8", option="allowed_did_type", raise_exception=False, default=None, expiration_time=3600, ) if allowed_did_type: allowed_did_type = [ models.DataIdentifier.did_type == did_type for did_type in type_to_did_type_mapping.get(allowed_did_type) ] did_clause.append(or_(*allowed_did_type)) # Only allows to migrate rules of closed DID is <only_move_closed_did> is set only_move_closed_did = config_get_bool( section="bb8", option="only_move_closed_did", raise_exception=False, default=None, expiration_time=3600, ) if only_move_closed_did: did_clause.append(models.DataIdentifier.is_open == False) # NOQA # Now build the query external_dsl = aliased(models.DatasetLock) count_locks = (select([func.count()]).where( and_( external_dsl.scope == models.DatasetLock.scope, external_dsl.name == models.DatasetLock.name, external_dsl.rse_id == models.DatasetLock.rse_id, )).as_scalar()) query = (session.query( models.DatasetLock.scope, models.DatasetLock.name, models.ReplicationRule.id, models.ReplicationRule.rse_expression, models.ReplicationRule.subscription_id, models.DataIdentifier.bytes, models.DataIdentifier.length, case( [( or_( models.DatasetLock.length < 1, models.DatasetLock.length.is_(None), ), 0, )], else_=cast(models.DatasetLock.bytes / models.DatasetLock.length, BigInteger), ), ).join( models.ReplicationRule, models.ReplicationRule.id == models.DatasetLock.rule_id, ).join( models.DataIdentifier, and_( models.DatasetLock.scope == models.DataIdentifier.scope, models.DatasetLock.name == models.DataIdentifier.name, ), ).filter(models.DatasetLock.rse_id == rse_id).filter( and_(*rule_clause) ).filter(and_(*did_clause)).filter( case( [( or_( models.DatasetLock.length < 1, models.DatasetLock.length.is_(None), ), 0, )], else_=cast(models.DatasetLock.bytes / models.DatasetLock.length, BigInteger), ) > 1000000000).filter(count_locks == 1)) summary = query.order_by( case( [( or_( models.DatasetLock.length < 1, models.DatasetLock.length.is_(None), ), 0, )], else_=cast(models.DatasetLock.bytes / models.DatasetLock.length, BigInteger), ), models.DatasetLock.accessed_at, ).all() return summary
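# The candidate selection above is driven entirely by optional settings in the
# [bb8] section of the configuration. The dictionary below is only a hedged
# summary of the knobs read by list_rebalance_rule_candidates; the values are
# illustrative, not recommended defaults.
BB8_OPTIONS_EXAMPLE = {
    'min_expires_date_in_days': 30,   # only rules that never expire or expire in more than N days
    'min_created_days': 14,           # only rules created more than N days ago
    'allowed_accounts': 'panda,root,ddmadmin,jdoe',   # comma-separated account list
    'allowed_scopes': 'mc16_13TeV,data18_13TeV',      # comma-separated scope list
    'allowed_grouping': 'dataset',    # one of: all, dataset, none
    'allowed_did_type': 'dataset',    # one of: all, collection, container, dataset, file
    'only_move_closed_did': True,     # skip rules on open DIDs
}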
from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import sessionmaker, scoped_session from rucio.common.config import config_get from rucio.common.exception import RucioException, DatabaseException try: main_script = os.path.basename(sys.argv[0]) CURRENT_COMPONENT = main_script.split('-')[1] except: CURRENT_COMPONENT = None DATABASE_SECTION = 'database' try: if CURRENT_COMPONENT: sql_connection = config_get('%s-database' % CURRENT_COMPONENT, 'default').strip() if sql_connection and len(sql_connection): DATABASE_SECTION = '%s-database' % CURRENT_COMPONENT except: pass BASE = declarative_base() try: DEFAULT_SCHEMA_NAME = config_get(DATABASE_SECTION, 'schema') BASE.metadata.schema = DEFAULT_SCHEMA_NAME except NoOptionError: DEFAULT_SCHEMA_NAME = None _MAKER, _ENGINE, _LOCK = None, None, Lock()
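# The section lookup above derives a component-specific database section from the
# name of the running executable. A standalone sketch of that derivation; the
# executable names are hypothetical and only illustrate the naming scheme.
import os


def resolve_database_section(argv0, has_component_default=True):
    """A daemon started as 'rucio-conveyor-submitter' first looks for a
    '[conveyor-database]' section and falls back to the plain '[database]' one."""
    try:
        component = os.path.basename(argv0).split('-')[1]
    except IndexError:
        component = None
    if component and has_component_default:
        return '%s-database' % component
    return 'database'


# resolve_database_section('rucio-conveyor-submitter')            -> 'conveyor-database'
# resolve_database_section('python', has_component_default=False) -> 'database'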
from rucio.core.heartbeat import live, die, sanity_check, list_payload_counts from rucio.core.message import add_message from rucio.core.replica import list_and_mark_unlocked_replicas, delete_replicas from rucio.core.rse import list_rses, get_rse_limits, get_rse_usage, list_rse_attributes, get_rse_protocols from rucio.core.rse_expression_parser import parse_expression from rucio.core.rule import get_evaluation_backlog from rucio.core.vo import list_vos from rucio.rse import rsemanager as rsemgr GRACEFUL_STOP = threading.Event() REGION = make_region().configure('dogpile.cache.memcached', expiration_time=600, arguments={ 'url': config_get('cache', 'url', False, '127.0.0.1:11211'), 'distributed_lock': True }) DELETION_COUNTER = Counter('rucio_daemons_reaper_deletion_done', 'Number of deleted replicas') EXCLUDED_RSE_GAUGE = Gauge('rucio_daemons_reaper_excluded_rses', 'Temporarily excluded RSEs', labelnames=('rse', )) def get_rses_to_process(rses, include_rses, exclude_rses, vos): """ Return the list of RSEs to process based on rses, include_rses and exclude_rses
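# A short, hedged example of how such a dogpile.cache region is typically used to
# memoize an expensive lookup. The decorated function below is hypothetical and
# not part of the reaper; it only illustrates the caching pattern.
from dogpile.cache import make_region

EXAMPLE_REGION = make_region().configure('dogpile.cache.memcached',
                                         expiration_time=600,
                                         arguments={'url': '127.0.0.1:11211',
                                                    'distributed_lock': True})


@EXAMPLE_REGION.cache_on_arguments()
def cached_rse_attributes(rse_name):
    # Stand-in for an expensive call; the result is cached for 600 seconds.
    return {'rse': rse_name, 'type': 'DISK'}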
def setup(self): self.cacert = config_get('test', 'cacert') self.marker = '$> '
def create_root_account(): """ Inserts the default root account to an existing database. Make sure to change the default password later. """ up_id = 'ddmlab' up_pwd = '2ccee6f6dd1bc2269cddd7cd5e47578e98e430539807c36df23fab7dd13e7583' up_email = '*****@*****.**' x509_id = '/C=CH/ST=Geneva/O=CERN/OU=PH-ADP-CO/CN=DDMLAB Client Certificate/[email protected]' x509_email = '*****@*****.**' gss_id = '*****@*****.**' gss_email = '*****@*****.**' ssh_id = 'ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEAq5LySllrQFpPL614sulXQ7wnIr1aGhGtl8b+HCB/'\ '0FhMSMTHwSjX78UbfqEorZV16rXrWPgUpvcbp2hqctw6eCbxwqcgu3uGWaeS5A0iWRw7oXUh6ydn'\ 'Vy89zGzX1FJFFDZ+AgiZ3ytp55tg1bjqqhK1OSC0pJxdNe878TRVVo5MLI0S/rZY2UovCSGFaQG2'\ 'iLj14wz/YqI7NFMUuJFR4e6xmNsOP7fCZ4bGMsmnhR0GmY0dWYTupNiP5WdYXAfKExlnvFLTlDI5'\ 'Mgh4Z11NraQ8pv4YE1woolYpqOc/IMMBBXFniTT4tC7cgikxWb9ZmFe+r4t6yCDpX4IL8L5GOQ== ddmlab' ssh_email = '*****@*****.**' try: up_id = config_get('bootstrap', 'userpass_identity') up_pwd = config_get('bootstrap', 'userpass_pwd') up_email = config_get('bootstrap', 'userpass_email') x509_id = config_get('bootstrap', 'x509_identity') x509_email = config_get('bootstrap', 'x509_email') gss_id = config_get('bootstrap', 'gss_identity') gss_email = config_get('bootstrap', 'gss_email') ssh_id = config_get('bootstrap', 'ssh_identity') ssh_email = config_get('bootstrap', 'ssh_email') except: pass # print 'Config values are missing (check rucio.cfg{.template}). Using hardcoded defaults.' s = session.get_session() account = models.Account(account='root', account_type=AccountType.SERVICE, status=AccountStatus.ACTIVE) identity1 = models.Identity(identity=up_id, identity_type=IdentityType.USERPASS, password=up_pwd, salt='0', email=up_email) iaa1 = models.IdentityAccountAssociation( identity=identity1.identity, identity_type=identity1.identity_type, account=account.account, is_default=True) # X509 authentication identity2 = models.Identity(identity=x509_id, identity_type=IdentityType.X509, email=x509_email) iaa2 = models.IdentityAccountAssociation( identity=identity2.identity, identity_type=identity2.identity_type, account=account.account, is_default=True) # GSS authentication identity3 = models.Identity(identity=gss_id, identity_type=IdentityType.GSS, email=gss_email) iaa3 = models.IdentityAccountAssociation( identity=identity3.identity, identity_type=identity3.identity_type, account=account.account, is_default=True) # SSH authentication identity4 = models.Identity(identity=ssh_id, identity_type=IdentityType.SSH, email=ssh_email) iaa4 = models.IdentityAccountAssociation( identity=identity4.identity, identity_type=identity4.identity_type, account=account.account, is_default=True) # Account counters create_counters_for_new_account(account='root', session=s) # Apply s.add_all([account, identity1, identity2, identity3, identity4]) s.commit() s.add_all([iaa1, iaa2, iaa3, iaa4]) s.commit()
from six import iteritems from six.moves.configparser import NoOptionError import rucio.db.sqla.util from rucio.common import exception from rucio.common.config import config_get, config_get_bool from rucio.common.logging import formatted_logger, setup_logging from rucio.common.schema import get_schema_value from rucio.core import heartbeat, request as request_core, transfer as transfer_core from rucio.core.monitor import record_counter, record_timer from rucio.daemons.conveyor.common import submit_transfer, bulk_group_transfer, get_conveyor_rses, USER_ACTIVITY from rucio.db.sqla.constants import RequestState graceful_stop = threading.Event() USER_TRANSFERS = config_get('conveyor', 'user_transfers', False, None) TRANSFER_TOOL = config_get('conveyor', 'transfertool', False, None) # NOTE: This should eventually be completely removed, as it can be fetched from the request FILTER_TRANSFERTOOL = config_get('conveyor', 'filter_transfertool', False, None) # NOTE: TRANSFERTOOL to filter requests on TRANSFER_TYPE = config_get('conveyor', 'transfertype', False, 'single') GET_TRANSFERS_COUNTER = Counter('rucio_daemons_conveyor_submitter_get_transfers', 'Number of transfers retrieved') def submitter(once=False, rses=None, partition_wait_time=10, bulk=100, group_bulk=1, group_policy='rule', source_strategy=None, activities=None, sleep_time=600, max_sources=4, retry_other_fts=False, filter_transfertool=FILTER_TRANSFERTOOL, transfertool=TRANSFER_TOOL, transfertype=TRANSFER_TYPE): """ Main loop to submit a new transfer primitive to a transfertool. """
def kronos_file(once=False, thread=0, brokers_resolved=None, dataset_queue=None, sleep_time=60): """ Main loop to consume tracer reports. """ logging.info('tracer consumer starting') hostname = socket.gethostname() pid = getpid() thread = current_thread() chunksize = config_get_int('tracer-kronos', 'chunksize') prefetch_size = config_get_int('tracer-kronos', 'prefetch_size') subscription_id = config_get('tracer-kronos', 'subscription_id') try: bad_files_patterns = [] pattern = get(section='kronos', option='bad_files_patterns', session=None) pattern = str(pattern) patterns = pattern.split(",") for pat in patterns: bad_files_patterns.append(re.compile(pat.strip())) except ConfigNotFound: bad_files_patterns = [] except Exception as error: logging.error('(kronos_file) Failed to get bad_file_patterns' + str(error)) bad_files_patterns = [] use_ssl = True try: use_ssl = config_get_bool('tracer-kronos', 'use_ssl') except Exception: pass if not use_ssl: username = config_get('tracer-kronos', 'username') password = config_get('tracer-kronos', 'password') excluded_usrdns = set( config_get('tracer-kronos', 'excluded_usrdns').split(',')) vhost = config_get('tracer-kronos', 'broker_virtual_host', raise_exception=False) conns = [] for broker in brokers_resolved: if not use_ssl: conns.append( Connection(host_and_ports=[ (broker, config_get_int('tracer-kronos', 'port')) ], use_ssl=False, vhost=vhost, reconnect_attempts_max=config_get_int( 'tracer-kronos', 'reconnect_attempts'))) else: conns.append( Connection(host_and_ports=[ (broker, config_get_int('tracer-kronos', 'port')) ], use_ssl=True, ssl_key_file=config_get('tracer-kronos', 'ssl_key_file'), ssl_cert_file=config_get('tracer-kronos', 'ssl_cert_file'), vhost=vhost, reconnect_attempts_max=config_get_int( 'tracer-kronos', 'reconnect_attempts'))) logging.info('(kronos_file) tracer consumer started') sanity_check(executable='kronos-file', hostname=hostname) while not graceful_stop.is_set(): start_time = time() live(executable='kronos-file', hostname=hostname, pid=pid, thread=thread) for conn in conns: if not conn.is_connected(): logging.info('(kronos_file) connecting to %s' % conn.transport._Transport__host_and_ports[0][0]) record_counter('daemons.tracer.kronos.reconnect.%s' % conn.transport._Transport__host_and_ports[0] [0].split('.')[0]) conn.set_listener( 'rucio-tracer-kronos', AMQConsumer( broker=conn.transport._Transport__host_and_ports[0], conn=conn, queue=config_get('tracer-kronos', 'queue'), chunksize=chunksize, subscription_id=subscription_id, excluded_usrdns=excluded_usrdns, dataset_queue=dataset_queue, bad_files_patterns=bad_files_patterns)) conn.start() if not use_ssl: conn.connect(username, password) else: conn.connect() conn.subscribe( destination=config_get('tracer-kronos', 'queue'), ack='client-individual', id=subscription_id, headers={'activemq.prefetchSize': prefetch_size}) tottime = time() - start_time if tottime < sleep_time: logging.info('(kronos_file) Will sleep for %s seconds' % (sleep_time - tottime)) sleep(sleep_time - tottime) logging.info('(kronos_file) graceful stop requested') for conn in conns: try: conn.disconnect() except Exception: pass die(executable='kronos-file', hostname=hostname, pid=pid, thread=thread) logging.info('(kronos_file) graceful stop done')
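# The bad_files_patterns option read above is a comma-separated list of regular
# expressions. A small sketch of how such a value might be parsed and applied;
# the option value and the helper below are illustrative only.
import re

raw_value = r'.*\.log$, .*user\.test.*'  # hypothetical config value
bad_files_patterns = [re.compile(pattern.strip()) for pattern in raw_value.split(',')]


def matches_bad_pattern(pfn):
    # A trace would be treated as a bad file when any configured pattern matches.
    return any(pattern.match(pfn) for pattern in bad_files_patterns)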
def set_se_config(self, storage_element, inbound_max_active=None, outbound_max_active=None, inbound_max_throughput=None, outbound_max_throughput=None, staging=None): """ Set the configuration for a storage element. Used for alleviating transfer failures due to timeout. :returns: JSON post response in case of success, otherwise raise Exception. :param storage_element: The storage element to be configured :param inbound_max_active: the integer to set the inbound_max_active for the SE. :param outbound_max_active: the integer to set the outbound_max_active for the SE. :param inbound_max_throughput: the float to set the inbound_max_throughput for the SE. :param outbound_max_throughput: the float to set the outbound_max_throughput for the SE. :param staging: the integer to set the staging for the operation of a SE. """ params_dict = {storage_element: {'operations': {}, 'se_info': {}}} if staging is not None: try: policy = config_get('policy', 'permission') except Exception: logging.warning('Could not get policy from config') params_dict[storage_element]['operations'] = { policy: { 'staging': staging } } # A lot of try-excepts to avoid dictionary overwrites, # see https://stackoverflow.com/questions/27118687/updating-nested-dictionaries-when-data-has-existing-key/27118776 if inbound_max_active is not None: try: params_dict[storage_element]['se_info'][ 'inbound_max_active'] = inbound_max_active except KeyError: params_dict[storage_element]['se_info'] = { 'inbound_max_active': inbound_max_active } if outbound_max_active is not None: try: params_dict[storage_element]['se_info'][ 'outbound_max_active'] = outbound_max_active except KeyError: params_dict[storage_element]['se_info'] = { 'outbound_max_active': outbound_max_active } if inbound_max_throughput is not None: try: params_dict[storage_element]['se_info'][ 'inbound_max_throughput'] = inbound_max_throughput except KeyError: params_dict[storage_element]['se_info'] = { 'inbound_max_throughput': inbound_max_throughput } if outbound_max_throughput is not None: try: params_dict[storage_element]['se_info'][ 'outbound_max_throughput'] = outbound_max_throughput except KeyError: params_dict[storage_element]['se_info'] = { 'outbound_max_throughput': outbound_max_throughput } params_str = json.dumps(params_dict, cls=APIEncoder) result = None try: result = requests.post('%s/config/se' % (self.external_host), verify=self.verify, cert=self.cert, data=params_str, headers=self.headers, timeout=None) except Exception: logging.warning('Could not set the config of %s on %s - %s', storage_element, self.external_host, str(traceback.format_exc())) if result and result.status_code == 200: configSe = result.json() return configSe raise Exception( 'Could not set the configuration of %s, status code returned: %s' % (storage_element, result.status_code if result else None))
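# A hedged usage example of set_se_config; the transfertool instance 'fts' and the
# storage element URL are hypothetical and only show which knobs can be combined
# in a single call.
#
# fts.set_se_config('srm://se.example.org',
#                   inbound_max_active=200,
#                   outbound_max_active=200,
#                   inbound_max_throughput=2000.0,
#                   staging=100)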
def kronos_file(once=False, process=0, total_processes=1, thread=0, total_threads=1, brokers_resolved=None, dataset_queue=None): """ Main loop to consume tracer reports. """ logging.info('tracer consumer starting') chunksize = config_get_int('tracer-kronos', 'chunksize') prefetch_size = config_get_int('tracer-kronos', 'prefetch_size') subscription_id = config_get('tracer-kronos', 'subscription_id') use_ssl = True try: use_ssl = config_get_bool('tracer-kronos', 'use_ssl') except: pass if not use_ssl: username = config_get('tracer-kronos', 'username') password = config_get('tracer-kronos', 'password') excluded_usrdns = set(config_get('tracer-kronos', 'excluded_usrdns').split(',')) conns = [] for broker in brokers_resolved: if not use_ssl: conns.append(Connection(host_and_ports=[(broker, config_get_int('tracer-kronos', 'port'))], use_ssl=False, reconnect_attempts_max=config_get_int('tracer-kronos', 'reconnect_attempts'))) else: conns.append(Connection(host_and_ports=[(broker, config_get_int('tracer-kronos', 'port'))], use_ssl=True, ssl_key_file=config_get('tracer-kronos', 'ssl_key_file'), ssl_cert_file=config_get('tracer-kronos', 'ssl_cert_file'), ssl_version=PROTOCOL_TLSv1, reconnect_attempts_max=config_get_int('tracer-kronos', 'reconnect_attempts'))) logging.info('(kronos_file) tracer consumer started') while not graceful_stop.is_set(): for conn in conns: if not conn.is_connected(): logging.info('(kronos_file) connecting to %s' % conn.transport._Transport__host_and_ports[0][0]) record_counter('daemons.tracer.kronos.reconnect.%s' % conn.transport._Transport__host_and_ports[0][0].split('.')[0]) conn.set_listener('rucio-tracer-kronos', AMQConsumer(broker=conn.transport._Transport__host_and_ports[0], conn=conn, chunksize=chunksize, subscription_id=subscription_id, excluded_usrdns=excluded_usrdns, dataset_queue=dataset_queue)) conn.start() if not use_ssl: conn.connect(username, password) else: conn.connect() conn.subscribe(destination=config_get('tracer-kronos', 'queue'), ack='client-individual', id=subscription_id, headers={'activemq.prefetchSize': prefetch_size}) sleep(1) logging.info('(kronos_file) graceful stop requested') for conn in conns: try: conn.disconnect() except: pass logging.info('(kronos_file) graceful stop done')
import logging import socket import sys import threading import os from rucio.core.heartbeat import live, die, sanity_check from rucio.common.config import config_get GRACEFUL_STOP = threading.Event() logging.basicConfig( stream=sys.stdout, level=getattr( logging, config_get('common', 'loglevel', raise_exception=False, default='DEBUG').upper()), format='%(asctime)s\t%(process)d\t%(levelname)s\t%(message)s') def rule_rebalancer(once=False): """ Main loop to rebalance rules automatically """ raise NotImplementedError() hostname = socket.gethostname() pid = os.getpid() current_thread = threading.current_thread()
def request_transfer(once=False, src=None, dst=None): """ Main loop to request a new transfer. """ logging.info('request: starting') site_a = 'RSE%s' % generate_uuid().upper() site_b = 'RSE%s' % generate_uuid().upper() scheme = 'https' impl = 'rucio.rse.protocols.webdav.Default' if not src.startswith('https://'): scheme = 'srm' impl = 'rucio.rse.protocols.srm.Default' srctoken = src.split(':')[0] dsttoken = dst.split(':')[0] tmp_proto = { 'impl': impl, 'scheme': scheme, 'domains': { 'lan': {'read': 1, 'write': 1, 'delete': 1}, 'wan': {'read': 1, 'write': 1, 'delete': 1}}} rse.add_rse(site_a) tmp_proto['hostname'] = src.split(':')[1][2:] tmp_proto['port'] = src.split(':')[2].split('/')[0] tmp_proto['prefix'] = '/'.join([''] + src.split(':')[2].split('/')[1:]) if scheme == 'srm': tmp_proto['extended_attributes'] = {'space_token': srctoken, 'web_service_path': ''} rse.add_protocol(site_a, tmp_proto) tmp_proto = { 'impl': impl, 'scheme': scheme, 'domains': { 'lan': {'read': 1, 'write': 1, 'delete': 1}, 'wan': {'read': 1, 'write': 1, 'delete': 1}}} rse.add_rse(site_b) tmp_proto['hostname'] = dst.split(':')[1][2:] tmp_proto['port'] = dst.split(':')[2].split('/')[0] tmp_proto['prefix'] = '/'.join([''] + dst.split(':')[2].split('/')[1:]) if scheme == 'srm': tmp_proto['extended_attributes'] = {'space_token': dsttoken, 'web_service_path': ''} rse.add_protocol(site_b, tmp_proto) si = rsemanager.get_rse_info(site_a) session = get_session() logging.info('request: started') while not graceful_stop.is_set(): try: ts = time.time() tmp_name = generate_uuid() # add a new dataset did.add_did(scope='mock', name='dataset-%s' % tmp_name, type=DIDType.DATASET, account='root', session=session) # construct PFN pfn = rsemanager.lfns2pfns(si, lfns=[{'scope': 'mock', 'name': 'file-%s' % tmp_name}])['mock:file-%s' % tmp_name] # create the directories if needed p = rsemanager.create_protocol(si, operation='write', scheme=scheme) p.connect() try: p.mkdir(pfn) except: pass # upload the test file try: fp = os.path.dirname(config_get('injector', 'file')) fn = os.path.basename(config_get('injector', 'file')) p.put(fn, pfn, source_dir=fp) except: logging.critical('Could not upload, removing temporary DID: %s' % str(sys.exc_info())) did.delete_dids([{'scope': 'mock', 'name': 'dataset-%s' % tmp_name}], account='root', session=session) break # add the replica replica.add_replica(rse=site_a, scope='mock', name='file-%s' % tmp_name, bytes=config_get_int('injector', 'bytes'), adler32=config_get('injector', 'adler32'), md5=config_get('injector', 'md5'), account='root', session=session) # to the dataset did.attach_dids(scope='mock', name='dataset-%s' % tmp_name, dids=[{'scope': 'mock', 'name': 'file-%s' % tmp_name, 'bytes': config_get('injector', 'bytes')}], account='root', session=session) # add rule for the dataset ts = time.time() rule.add_rule(dids=[{'scope': 'mock', 'name': 'dataset-%s' % tmp_name}], account='root', copies=1, rse_expression=site_b, grouping='ALL', weight=None, lifetime=None, locked=False, subscription_id=None, activity='mock-injector', session=session) logging.info('added rule for %s for DID mock:%s' % (site_b, tmp_name)) record_timer('daemons.mock.conveyorinjector.add_rule', (time.time()-ts)*1000) record_counter('daemons.mock.conveyorinjector.request_transfer') session.commit() except: session.rollback() logging.critical(traceback.format_exc()) if once: return logging.info('request: graceful stop requested') logging.info('request: graceful stop done')
def submitter(once=False, rses=[], process=0, total_processes=1, thread=0, total_threads=1, mock=False, bulk=100, activities=None): """ Main loop to submit a new transfer primitive to a transfertool. """ logging.info('submitter starting - process (%i/%i) thread (%i/%i)' % (process, total_processes, thread, total_threads)) try: scheme = config_get('conveyor', 'scheme') except NoOptionError: scheme = 'srm' logging.info('submitter started - process (%i/%i) thread (%i/%i)' % (process, total_processes, thread, total_threads)) while not graceful_stop.is_set(): try: if activities is None: activities = [None] for activity in activities: if rses is None: rses = [None] for rse in rses: if rse: # run in rse list mode rse_info = rsemgr.get_rse_info(rse['rse']) logging.info("Working on RSE: %s" % rse['rse']) ts = time.time() reqs = get_requests(rse_id=rse['id'], process=process, total_processes=total_processes, thread=thread, total_threads=total_threads, mock=mock, bulk=bulk, activity=activity) record_timer('daemons.conveyor.submitter.get_requests', (time.time() - ts) * 1000) else: # no rse list, run FIFO mode rse_info = None ts = time.time() reqs = get_requests(process=process, total_processes=total_processes, thread=thread, total_threads=total_threads, mock=mock, bulk=bulk, activity=activity) record_timer('daemons.conveyor.submitter.get_requests', (time.time() - ts) * 1000) if reqs: logging.debug('%i:%i - submitting %i requests' % (process, thread, len(reqs))) if not reqs or reqs == []: time.sleep(1) continue for req in reqs: try: if not rse: # no rse list, in FIFO mode dest_rse = rse_core.get_rse(rse=None, rse_id=req['dest_rse_id']) rse_info = rsemgr.get_rse_info(dest_rse['rse']) ts = time.time() transfer = get_transfer(rse_info, req, scheme, mock) record_timer('daemons.conveyor.submitter.get_transfer', (time.time() - ts) * 1000) logging.debug('Transfer for request %s: %s' % (req['request_id'], transfer)) if transfer is None: logging.warn("Request %s DID %s:%s RSE %s failed to get transfer" % (req['request_id'], req['scope'], req['name'], rse_info['rse'])) # TODO: Merge these two calls request.set_request_state(req['request_id'], RequestState.LOST) # if the DID does not exist anymore request.archive_request(req['request_id']) continue ts = time.time() tmp_metadata = transfer['file_metadata'] eids = request.submit_transfers(transfers=[transfer, ], transfertool='fts3', job_metadata=tmp_metadata) record_timer('daemons.conveyor.submitter.submit_transfer', (time.time() - ts) * 1000) ts = time.time() if req['previous_attempt_id']: logging.info('COPYING RETRY %s REQUEST %s PREVIOUS %s DID %s:%s FROM %s TO %s USING %s with eid: %s' % (req['retry_count'], req['request_id'], req['previous_attempt_id'], req['scope'], req['name'], transfer['src_urls'], transfer['dest_urls'], eids[req['request_id']]['external_host'], eids[req['request_id']]['external_id'])) else: logging.info('COPYING REQUEST %s DID %s:%s FROM %s TO %s USING %s with eid: %s' % (req['request_id'], req['scope'], req['name'], transfer['src_urls'], transfer['dest_urls'], eids[req['request_id']]['external_host'], eids[req['request_id']]['external_id'])) record_counter('daemons.conveyor.submitter.submit_request') except UnsupportedOperation, e: # The replica doesn't exist, need to cancel the request logging.warning(e) logging.info('Cancelling transfer request %s' % req['request_id']) try: # TODO: for now, there is only ever one destination request.cancel_request_did(req['scope'], req['name'], transfer['dest_urls'][0]) except Exception, e: 
logging.warning('Cannot cancel request: %s' % str(e))
import logging import sys import urlparse import requests from rucio.common.config import config_get from rucio.core.monitor import record_counter from rucio.db.constants import FTSState logging.getLogger("requests").setLevel(logging.CRITICAL) logging.basicConfig( stream=sys.stdout, level=getattr(logging, config_get("common", "loglevel").upper()), format="%(asctime)s\t%(process)d\t%(levelname)s\t%(message)s", ) __CACERT = config_get("conveyor", "cacert") __USERCERT = config_get("conveyor", "usercert") def __extract_host(transfer_host): # graphite does not like the dots in the FQDN return urlparse.urlparse(transfer_host).hostname.replace(".", "_") def submit_transfers(transfers, job_metadata): """ Submit a transfer to FTS3 via JSON.
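# For illustration, the hostname mangling above turns an FTS endpoint URL into a
# graphite-safe metric segment (graphite uses '.' as its hierarchy separator).
# The endpoint shown is hypothetical.
#
# __extract_host('https://fts.example.org:8446') -> 'fts_example_org'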
from rucio.core.message import add_message from rucio.core.replica import list_and_mark_unlocked_replicas, delete_replicas from rucio.core.rse import list_rses, get_rse_limits, get_rse_usage, list_rse_attributes, get_rse_protocols from rucio.core.rse_expression_parser import parse_expression from rucio.core.rule import get_evaluation_backlog from rucio.core.vo import list_vos from rucio.rse import rsemanager as rsemgr logging.getLogger("reaper").setLevel(logging.CRITICAL) logging.basicConfig( stream=sys.stdout, level=getattr( logging, config_get('common', 'loglevel', raise_exception=False, default='DEBUG').upper()), format='%(asctime)s\t%(process)d\t%(levelname)s\t%(message)s') GRACEFUL_STOP = threading.Event() REGION = make_region().configure('dogpile.cache.memcached', expiration_time=600, arguments={ 'url': config_get('cache', 'url', False, '127.0.0.1:11211'), 'distributed_lock': True })
InsufficientTargetRSEs, InsufficientAccountLimit, ReplicationRuleCreationTemporaryFailed, InvalidRuleWeight, StagingAreaRuleRequiresLifetime, ) from rucio.common.config import config_get from rucio.common.utils import chunks from rucio.core import monitor from rucio.core.rule import add_rule logging.getLogger("transmogrifier").setLevel(logging.CRITICAL) logging.basicConfig( stream=stdout, level=getattr(logging, config_get("common", "loglevel").upper()), format="%(asctime)s\t%(process)d\t%(levelname)s\t%(message)s", ) graceful_stop = threading.Event() def _retrial(func, *args, **kwargs): delay = 0 while True: try: return apply(func, args, kwargs) except DataIdentifierNotFound, e: logging.warning(e) return 1 except DatabaseException, e:
def setup(self): self.cacert = config_get('test', 'cacert') self.usercert = config_get('test', 'usercert') self.host = config_get('client', 'rucio_host') self.auth_host = config_get('client', 'auth_host') self.marker = '$> '
# - Edgar Fajardo <*****@*****.**>, 2018 # - Martin Barisits <*****@*****.**>, 2019 # - James Perry <*****@*****.**>, 2019 try: from ConfigParser import NoOptionError, NoSectionError except ImportError: from configparser import NoOptionError, NoSectionError from rucio.common import config, exception import importlib if config.config_has_section('policy'): try: POLICY = config.config_get('policy', 'package') + ".schema" except (NoOptionError, NoSectionError) as error: # fall back to old system for now try: POLICY = config.config_get('policy', 'schema') except (NoOptionError, NoSectionError) as error: POLICY = 'generic' POLICY = 'rucio.common.schema.' + POLICY.lower() else: POLICY = 'rucio.common.schema.generic' try: module = importlib.import_module(POLICY) except (ImportError) as error: raise exception.PolicyPackageNotFound('Module ' + POLICY + ' not found')
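# A self-contained sketch of the module-name resolution performed above; the
# package and schema names are hypothetical, only 'generic' comes from the code.
def resolve_policy_module(package=None, schema=None):
    """With [policy] package = my_experiment_policy the schema module becomes
    'my_experiment_policy.schema'; with only the older schema option it becomes
    'rucio.common.schema.<schema>', and with neither it falls back to
    'rucio.common.schema.generic'."""
    if package:
        return package + '.schema'
    return 'rucio.common.schema.' + (schema or 'generic').lower()


# resolve_policy_module(package='my_experiment_policy') -> 'my_experiment_policy.schema'
# resolve_policy_module(schema='Generic')               -> 'rucio.common.schema.generic'
# resolve_policy_module()                               -> 'rucio.common.schema.generic'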
from rucio.core import heartbeat, request as request_core, transfer as transfer_core from rucio.core.monitor import record_counter, record_timer from rucio.daemons.conveyor.common import submit_transfer, bulk_group_transfer, get_conveyor_rses, USER_ACTIVITY from rucio.db.sqla.constants import RequestState try: from ConfigParser import NoOptionError # py2 except Exception: from configparser import NoOptionError # py3 logging.basicConfig( stream=sys.stdout, level=getattr( logging, config_get('common', 'loglevel', raise_exception=False, default='DEBUG').upper()), format='%(asctime)s\t%(process)d\t%(levelname)s\t%(message)s') graceful_stop = threading.Event() USER_TRANSFERS = config_get('conveyor', 'user_transfers', False, None) TRANSFER_TOOL = config_get('conveyor', 'transfertool', False, None) TRANSFER_TYPE = config_get('conveyor', 'transfertype', False, 'single') GET_TRANSFERS_COUNTER = Counter( 'rucio_daemons_conveyor_submitter_get_transfers', 'Number of transfers retrieved') def submitter(once=False,
import dns.resolver import stomp from rucio.common.config import config_get, config_get_int from rucio.core.monitor import record_counter ERRLOG = logging.getLogger('errlog') ERRLOG.setLevel(logging.ERROR) LOGGER = logging.getLogger('trace') LOGGER.setLevel(logging.DEBUG) try: HANDLER = logging.handlers.RotatingFileHandler( filename='%s/trace' % config_get('nongrid-trace', 'tracedir'), maxBytes=1000000000, backupCount=10) LOGFORMATTER = logging.Formatter('%(message)s') HANDLER.setFormatter(LOGFORMATTER) HANDLER.suffix = "%Y-%m-%d" LOGGER.addHandler(HANDLER) except: if 'sphinx' not in sys.modules: raise BROKERS_ALIAS, BROKERS_RESOLVED = [], [] try: BROKERS_ALIAS = [ b.strip() for b in config_get('nongrid-trace', 'brokers').split(',') ]
import time from sqlalchemy.exc import IntegrityError from sqlalchemy.sql.expression import asc, bindparam, text from rucio.common.config import config_get from rucio.common.exception import RucioException, UnsupportedOperation from rucio.common.utils import generate_uuid from rucio.core.monitor import record_counter, record_timer from rucio.core.rse import get_rse_id, get_rse_name from rucio.db import models from rucio.db.constants import RequestState, RequestType, FTSState from rucio.db.session import read_session, transactional_session from rucio.transfertool import fts3 __HOSTS = [b.strip() for b in config_get('conveyor', 'ftshosts').split(',')] @transactional_session def requeue_and_archive(request_id, session=None): """ Requeue and archive a failed request. TODO: Multiple requeue. :param request_id: Original request ID as a string. :param session: Database session to use. """ record_counter('core.request.requeue_request') new_req = get_request(request_id, session=session)
from rucio import version from rucio.api import authentication as auth, identity from rucio.api.account import get_account_info, list_account_attributes from rucio.common.config import config_get from rucio.db.sqla.constants import AccountType try: from onelogin.saml2.auth import OneLogin_Saml2_Auth SAML_SUPPORT = True except: SAML_SUPPORT = False RENDERER = template.render(join(dirname(__file__), '../templates')) # check if there is preferred server side config for webui authentication AUTH_TYPE = config_get('webui', 'auth_type', False, None) if AUTH_TYPE == 'oidc': try: AUTH_ISSUER_WEBUI = config_get('webui', 'auth_issuer') except: RENDERER.problem( "Please specify auth_issuer in the [webui] section of the Rucio configuration." ) # if no specific config on the server side - we collect information # about all authentication options, in particular OIDC AUTH_ISSUERS = [] if not AUTH_TYPE: IDPSECRETS = config_get('oidc', 'idpsecrets', False, None) try: with open(IDPSECRETS) as client_secret_file:
from copy import deepcopy from datetime import datetime, timedelta from re import match from random import randint from sqlalchemy.exc import DatabaseError from rucio.common.config import config_get from rucio.common.exception import DatabaseException from rucio.core.rule import repair_rule, get_stuck_rules from rucio.core.monitor import record_gauge, record_counter graceful_stop = threading.Event() logging.basicConfig(filename='%s/%s.log' % (config_get('common', 'logdir'), __name__), level=getattr(logging, config_get('common', 'loglevel').upper()), format='%(asctime)s\t%(process)d\t%(levelname)s\t%(message)s') def rule_repairer(once=False, process=0, total_processes=1, thread=0, threads_per_process=1): """ Main loop to check for STUCK replication rules """ logging.info('rule_repairer: starting') logging.info('rule_repairer: started') paused_rules = {} # {rule_id: datetime}
from rucio.common import exception, config from rucio.common.constraints import STRING_TYPES from rucio.common.utils import GLOBALLY_SUPPORTED_CHECKSUMS, PREFERRED_CHECKSUM from rucio.rse.protocols import protocol try: import gfal2 # pylint: disable=import-error except: if 'RUCIO_CLIENT_MODE' not in os.environ: if not config.config_has_section('database'): raise exception.MissingDependency('Missing dependency : gfal2') else: if os.environ['RUCIO_CLIENT_MODE']: raise exception.MissingDependency('Missing dependency : gfal2') TIMEOUT = config.config_get('deletion', 'timeout', False, None) class Default(protocol.RSEProtocol): """ Implementing access to RSEs using the srm protocol.""" def lfns2pfns(self, lfns): """ Returns a fully qualified PFN for the file referred by path. :param path: The path to the file. :returns: Fully qualified PFN. """ lfns = [lfns] if type(lfns) == dict else lfns pfns = {}
def deliver_messages(once=False, brokers_resolved=None, process=0, total_processes=1, thread=0, total_threads=1, bulk=1000): """ Main loop to deliver messages to a broker. """ logging.info('hermes starting - process (%i/%i) thread (%i/%i) bulk (%i)' % (process, total_processes, thread, total_threads, bulk)) conns = [] for broker in brokers_resolved: conns.append(stomp.Connection(host_and_ports=[(broker, config_get_int('messaging-hermes', 'port'))], use_ssl=True, ssl_key_file=config_get('messaging-hermes', 'ssl_key_file'), ssl_cert_file=config_get('messaging-hermes', 'ssl_cert_file'), ssl_version=ssl.PROTOCOL_TLSv1)) logging.info('hermes started - process (%i/%i) thread (%i/%i) bulk (%i)' % (process, total_processes, thread, total_threads, bulk)) while not graceful_stop.is_set(): try: for conn in conns: if not conn.is_connected(): logging.info('connecting to %s' % conn.transport._Transport__host_and_ports[0][0]) record_counter('daemons.hermes.reconnect.%s' % conn.transport._Transport__host_and_ports[0][0].split('.')[0]) conn.start() conn.connect() tmp = retrieve_messages(bulk=bulk, process=process, total_processes=total_processes, thread=thread, total_threads=total_threads) if tmp == []: time.sleep(1) else: to_delete = [] for t in tmp: try: random.sample(conns, 1)[0].send(body=json.dumps({'event_type': str(t['event_type']).lower(), 'payload': t['payload'], 'created_at': str(t['created_at'])}), destination=config_get('messaging-hermes', 'destination')) except ValueError: logging.warn('Cannot serialize payload to JSON: %s' % str(t['payload'])) continue except Exception, e: logging.warn('Could not deliver message: %s' % str(e)) continue to_delete.append(t['id']) if str(t['event_type']).lower().startswith("transfer"): logging.debug('%i:%i - event_type: %s, scope: %s, name: %s, rse: %s, request-id: %s, transfer-id: %s, created_at: %s' % (process, thread, str(t['event_type']).lower(), t['payload']['scope'], t['payload']['name'], t['payload']['dst-rse'], t['payload']['request-id'], t['payload']['transfer-id'], str(t['created_at']))) elif str(t['event_type']).lower().startswith("dataset"): logging.debug('%i:%i - event_type: %s, scope: %s, name: %s, rse: %s, rule-id: %s, created_at: %s)' % (process, thread, str(t['event_type']).lower(), t['payload']['scope'], t['payload']['name'], t['payload']['rse'], t['payload']['rule_id'], str(t['created_at']))) elif str(t['event_type']).lower().startswith("deletion"): if 'url' not in t['payload']: t['payload']['url'] = 'unknown' logging.debug('%i:%i - event_type: %s, scope: %s, name: %s, rse: %s, url: %s, created_at: %s)' % (process, thread, str(t['event_type']).lower(), t['payload']['scope'], t['payload']['name'], t['payload']['rse'], t['payload']['url'], str(t['created_at']))) else: logging.debug('%i:%i -other message: %s' % (process, thread, t)) delete_messages(to_delete) except: logging.critical(traceback.format_exc()) logging.debug('%i:%i - graceful stop requests' % (process, thread)) for conn in conns: try: conn.disconnect() except: pass logging.debug('%i:%i - graceful stop done' % (process, thread))
def deliver_messages(once=False, brokers_resolved=None, thread=0, bulk=1000, delay=10, broker_timeout=3, broker_retry=3): ''' Main loop to deliver messages to a broker. ''' logging.info('[broker] starting - threads (%i) bulk (%i)', thread, bulk) if not brokers_resolved: logging.fatal('No brokers resolved.') return logging.info('[broker] checking authentication method') use_ssl = True try: use_ssl = config_get_bool('messaging-hermes', 'use_ssl') except: logging.info( '[broker] could not find use_ssl in configuration -- please update your rucio.cfg' ) port = config_get_int('messaging-hermes', 'port') vhost = config_get('messaging-hermes', 'broker_virtual_host', raise_exception=False) if not use_ssl: username = config_get('messaging-hermes', 'username') password = config_get('messaging-hermes', 'password') port = config_get_int('messaging-hermes', 'nonssl_port') conns = [] for broker in brokers_resolved: if not use_ssl: logging.info( '[broker] setting up username/password authentication: %s' % broker) con = stomp.Connection12(host_and_ports=[(broker, port)], vhost=vhost, keepalive=True, timeout=broker_timeout) else: logging.info( '[broker] setting up ssl cert/key authentication: %s' % broker) con = stomp.Connection12( host_and_ports=[(broker, port)], use_ssl=True, ssl_key_file=config_get('messaging-hermes', 'ssl_key_file'), ssl_cert_file=config_get('messaging-hermes', 'ssl_cert_file'), vhost=vhost, keepalive=True, timeout=broker_timeout) con.set_listener( 'rucio-hermes', HermesListener(con.transport._Transport__host_and_ports[0])) conns.append(con) destination = config_get('messaging-hermes', 'destination') executable = 'hermes [broker]' hostname = socket.getfqdn() pid = os.getpid() heartbeat_thread = threading.current_thread() # Make an initial heartbeat so that all daemons have the correct worker number on the next try sanity_check(executable=executable, hostname=hostname, pid=pid, thread=heartbeat_thread) GRACEFUL_STOP.wait(1) while not GRACEFUL_STOP.is_set(): try: t_start = time.time() heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=heartbeat_thread) logging.debug('[broker] %i:%i - using: %s', heartbeat['assign_thread'], heartbeat['nr_threads'], [ conn.transport._Transport__host_and_ports[0][0] for conn in conns ]) messages = retrieve_messages(bulk=bulk, thread=heartbeat['assign_thread'], total_threads=heartbeat['nr_threads']) if messages: logging.debug('[broker] %i:%i - retrieved %i messages', heartbeat['assign_thread'], heartbeat['nr_threads'], len(messages)) to_delete = [] for message in messages: try: conn = random.sample(conns, 1)[0] if not conn.is_connected(): host_and_ports = conn.transport._Transport__host_and_ports[ 0][0] record_counter('daemons.hermes.reconnect.%s' % host_and_ports.split('.')[0]) conn.start() if not use_ssl: logging.info( '[broker] %i:%i - connecting with USERPASS to %s', heartbeat['assign_thread'], heartbeat['nr_threads'], host_and_ports) conn.connect(username, password, wait=True) else: logging.info( '[broker] %i:%i - connecting with SSL to %s', heartbeat['assign_thread'], heartbeat['nr_threads'], host_and_ports) conn.connect(wait=True) conn.send(body=json.dumps({ 'event_type': str(message['event_type']).lower(), 'payload': message['payload'], 'created_at': str(message['created_at']) }), destination=destination, headers={ 'persistent': 'true', 'event_type': str(message['event_type']).lower() }) to_delete.append({ 'id': message['id'], 'created_at': message['created_at'], 'updated_at': message['created_at'], 'payload': 
json.dumps(message['payload']), 'event_type': message['event_type'] }) except ValueError: logging.warn('Cannot serialize payload to JSON: %s', str(message['payload'])) to_delete.append({ 'id': message['id'], 'created_at': message['created_at'], 'updated_at': message['created_at'], 'payload': str(message['payload']), 'event_type': message['event_type'] }) continue except stomp.exception.NotConnectedException as error: logging.warn( 'Could not deliver message due to NotConnectedException: %s', str(error)) continue except stomp.exception.ConnectFailedException as error: logging.warn( 'Could not deliver message due to ConnectFailedException: %s', str(error)) continue except Exception as error: logging.warn('Could not deliver message: %s', str(error)) logging.critical(traceback.format_exc()) continue if str(message['event_type']).lower().startswith( 'transfer') or str(message['event_type']).lower( ).startswith('stagein'): logging.debug( '[broker] %i:%i - event_type: %s, scope: %s, name: %s, rse: %s, request-id: %s, transfer-id: %s, created_at: %s', heartbeat['assign_thread'], heartbeat['nr_threads'], str(message['event_type']).lower(), message['payload'].get('scope', None), message['payload'].get('name', None), message['payload'].get('dst-rse', None), message['payload'].get('request-id', None), message['payload'].get('transfer-id', None), str(message['created_at'])) elif str(message['event_type']).lower().startswith( 'dataset'): logging.debug( '[broker] %i:%i - event_type: %s, scope: %s, name: %s, rse: %s, rule-id: %s, created_at: %s)', heartbeat['assign_thread'], heartbeat['nr_threads'], str(message['event_type']).lower(), message['payload']['scope'], message['payload']['name'], message['payload']['rse'], message['payload']['rule_id'], str(message['created_at'])) elif str(message['event_type']).lower().startswith( 'deletion'): if 'url' not in message['payload']: message['payload']['url'] = 'unknown' logging.debug( '[broker] %i:%i - event_type: %s, scope: %s, name: %s, rse: %s, url: %s, created_at: %s)', heartbeat['assign_thread'], heartbeat['nr_threads'], str(message['event_type']).lower(), message['payload']['scope'], message['payload']['name'], message['payload']['rse'], message['payload']['url'], str(message['created_at'])) else: logging.debug('[broker] %i:%i - other message: %s', heartbeat['assign_thread'], heartbeat['nr_threads'], message) delete_messages(to_delete) logging.info('[broker] %i:%i - submitted %i messages', heartbeat['assign_thread'], heartbeat['nr_threads'], len(to_delete)) if once: break except NoResultFound: # silence this error: https://its.cern.ch/jira/browse/RUCIO-1699 pass except: logging.critical(traceback.format_exc()) t_delay = delay - (time.time() - t_start) t_delay = t_delay if t_delay > 0 else 0 if t_delay: logging.debug('[broker] %i:%i - sleeping %s seconds', heartbeat['assign_thread'], heartbeat['nr_threads'], t_delay) time.sleep(t_delay) for conn in conns: try: conn.disconnect() except Exception: pass logging.debug('[broker] %i:%i - graceful stop requested', heartbeat['assign_thread'], heartbeat['nr_threads']) die(executable, hostname, pid, heartbeat_thread) logging.debug('[broker] %i:%i - graceful stop done', heartbeat['assign_thread'], heartbeat['nr_threads'])
def update_bad_request(req, dest_rse, new_state, detail, session=None): if new_state == RequestState.FAILED: request.set_request_state(req['request_id'], new_state, session=session) activity = 'default' if req['attributes']: if type(req['attributes']) is dict: req_attributes = json.loads(json.dumps(req['attributes'])) else: req_attributes = json.loads(str(req['attributes'])) activity = req_attributes['activity'] if req_attributes['activity'] else 'default' tss = time.time() add_message('transfer-failed', {'activity': activity, 'request-id': req['request_id'], 'checksum-adler': None, 'checksum-md5': None, 'dst-rse': dest_rse, 'dst-url': None, 'name': req['name'], 'guid': None, 'file-size': None, 'previous-request-id': req['request_id'], 'protocol': None, 'reason': detail, 'transfer-link': None, 'scope': req['scope'], 'src-rse': None, 'src-url': None, 'tool-id': 'rucio-conveyor', 'transfer-endpoint': config_get('conveyor', 'ftshosts'), 'transfer-id': None}, session=session) request.archive_request(req['request_id'], session=session) logging.error('BAD DID %s:%s REQUEST %s details: %s' % (req['scope'], req['name'], req['request_id'], detail)) try: replica.update_replicas_states([{'rse': dest_rse, 'scope': req['scope'], 'name': req['name'], 'state': ReplicaState.UNAVAILABLE}], session=session) except: logging.critical("Could not update replica state for failed transfer %s:%s at %s (%s)" % (req['scope'], req['name'], dest_rse, traceback.format_exc())) raise tss = time.time() try: lock.failed_transfer(req['scope'], req['name'], req['dest_rse_id'], session=session) except: logging.warn('Could not update lock for failed transfer %s:%s at %s (%s)' % (req['scope'], req['name'], dest_rse, traceback.format_exc())) raise record_timer('daemons.conveyor.common.update_request_state.lock-failed_transfer', (time.time()-tss)*1000)
def run(once=False, send_email=True, threads=1, bulk=1000, delay=10, broker_timeout=3, broker_retry=3): ''' Starts up the hermes threads. ''' logging.info('resolving brokers') brokers_alias = [] brokers_resolved = [] try: brokers_alias = [ b.strip() for b in config_get('messaging-hermes', 'brokers').split(',') ] except: raise Exception('Could not load brokers from configuration') logging.info('resolving broker dns alias: %s', brokers_alias) brokers_resolved = [] for broker in brokers_alias: try: addrinfos = socket.getaddrinfo(broker, 0, socket.AF_INET, 0, socket.IPPROTO_TCP) brokers_resolved.extend(ai[4][0] for ai in addrinfos) except socket.gaierror as ex: logging.error('Cannot resolve domain name %s (%s)', broker, str(ex)) logging.debug('brokers resolved to %s', brokers_resolved) if once: logging.info('executing one hermes iteration only') deliver_messages(once=once, brokers_resolved=brokers_resolved, bulk=bulk, delay=delay, broker_timeout=broker_timeout, broker_retry=broker_retry) deliver_emails(once=once, send_email=send_email, bulk=bulk, delay=delay) else: logging.info('starting hermes threads') thread_list = [ threading.Thread(target=deliver_messages, kwargs={ 'brokers_resolved': brokers_resolved, 'thread': i, 'bulk': bulk, 'delay': delay, 'broker_timeout': broker_timeout, 'broker_retry': broker_retry }) for i in range(0, threads) ] for thrd in range(0, 1): thread_list.append( threading.Thread(target=deliver_emails, kwargs={ 'thread': thrd, 'bulk': bulk, 'delay': delay })) for thrd in thread_list: thrd.start() logging.info('waiting for interrupts') # Interruptible joins require a timeout. while thread_list: thread_list = [ t.join(timeout=3.14) for t in thread_list if t and t.isAlive() ]
from os import remove, rmdir, stat from sys import stdout from time import sleep, time from rucio.client import Client from rucio.common.config import config_get, config_get_int from rucio.common.utils import adler32 from rucio.core import monitor from rucio.rse import rsemanager as rsemgr from rucio.common.utils import execute, generate_uuid from rucio.common.exception import FileReplicaAlreadyExists logging.getLogger("automatix").setLevel(logging.CRITICAL) logging.basicConfig(stream=stdout, level=getattr(logging, config_get('common', 'loglevel').upper()), format='%(asctime)s\t%(process)d\t%(levelname)s\t%(message)s') SUCCESS = 0 FAILURE = 1 graceful_stop = threading.Event() def upload(files, scope, metadata, rse, account, source_dir, worker_number, total_workers, dataset_lifetime, did=None): logging.debug('In upload') dsn = None if did: dsn = {'scope': did.split(':')[0], 'name': did.split(':')[1]}
def deliver_emails(once=False, send_email=True, thread=0, bulk=1000, delay=10): ''' Main loop to deliver emails via SMTP. ''' logging.info('[email] starting - threads (%i) bulk (%i)', thread, bulk) executable = 'hermes [email]' hostname = socket.getfqdn() pid = os.getpid() heartbeat_thread = threading.current_thread() sanity_check(executable=executable, hostname=hostname) # Make an initial heartbeat so that all daemons have the correct worker number on the next try live(executable=executable, hostname=hostname, pid=pid, thread=heartbeat_thread) GRACEFUL_STOP.wait(1) email_from = config_get('messaging-hermes', 'email_from') while not GRACEFUL_STOP.is_set(): heartbeat = live(executable, hostname, pid, heartbeat_thread) logging.debug('[email] %i:%i - bulk %i', heartbeat['assign_thread'], heartbeat['nr_threads'], bulk) t_start = time.time() messages = retrieve_messages(bulk=bulk, thread=heartbeat['assign_thread'], total_threads=heartbeat['nr_threads'], event_type='email') if messages != []: to_delete = [] for message in messages: logging.debug('[email] %i:%i - submitting: %s', heartbeat['assign_thread'], heartbeat['nr_threads'], str(message)) msg = MIMEText(message['payload']['body'].encode('utf-8')) msg['From'] = email_from msg['To'] = ', '.join(message['payload']['to']) msg['Subject'] = message['payload']['subject'].encode('utf-8') if send_email: smtp = smtplib.SMTP() smtp.connect() smtp.sendmail(msg['From'], message['payload']['to'], msg.as_string()) smtp.quit() to_delete.append({ 'id': message['id'], 'created_at': message['created_at'], 'updated_at': message['created_at'], 'payload': str(message['payload']), 'event_type': 'email' }) logging.debug('[email] %i:%i - submitting done: %s', heartbeat['assign_thread'], heartbeat['nr_threads'], str(message['id'])) delete_messages(to_delete) logging.info('[email] %i:%i - submitted %i messages', heartbeat['assign_thread'], heartbeat['nr_threads'], len(to_delete)) if once: break t_delay = delay - (time.time() - t_start) t_delay = t_delay if t_delay > 0 else 0 if t_delay: logging.debug('[email] %i:%i - sleeping %s seconds', heartbeat['assign_thread'], heartbeat['nr_threads'], t_delay) time.sleep(t_delay) logging.debug('[email] %i:%i - graceful stop requested', heartbeat['assign_thread'], heartbeat['nr_threads']) die(executable, hostname, pid, heartbeat_thread) logging.debug('[email] %i:%i - graceful stop done', heartbeat['assign_thread'], heartbeat['nr_threads'])
import sys import threading import time import traceback from rucio.common.config import config_get from rucio.common.utils import chunks from rucio.core import request from rucio.core.monitor import record_timer, record_counter from rucio.daemons.conveyor import common from rucio.db.constants import RequestState, RequestType logging.getLogger("requests").setLevel(logging.CRITICAL) logging.basicConfig(stream=sys.stdout, level=getattr(logging, config_get('common', 'loglevel').upper()), format='%(asctime)s\t%(process)d\t%(levelname)s\t%(message)s') graceful_stop = threading.Event() # http://bugs.python.org/issue7980 datetime.datetime.strptime('', '') def poller(once=False, process=0, total_processes=1, thread=0, total_threads=1, bulk=1000, older_than=60): """ Main loop to check the status of a transfer primitive with a transfertool. """ logging.info('poller starting - process (%i/%i) thread (%i/%i) bulk (%i)' % (process, total_processes, thread, total_threads,
from threading import Lock from time import sleep from os.path import basename from sqlalchemy import create_engine, event from sqlalchemy.exc import DatabaseError, DisconnectionError, OperationalError, DBAPIError, TimeoutError from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import sessionmaker, scoped_session from rucio.common.config import config_get from rucio.common.exception import RucioException, DatabaseException BASE = declarative_base() try: default_schema_name = config_get('database', 'schema') BASE.metadata.schema = default_schema_name except NoOptionError: default_schema_name = None _MAKER, _ENGINE, _LOCK = None, None, Lock() def _fk_pragma_on_connect(dbapi_con, con_record): # Hack for previous versions of sqlite3 try: dbapi_con.execute('pragma foreign_keys=ON') except AttributeError: pass
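# The pragma listener above is what makes SQLite actually enforce foreign keys,
# which it does not do by default. A minimal standalone demonstration with the
# plain sqlite3 module, independent of the SQLAlchemy session machinery:
import sqlite3

con = sqlite3.connect(':memory:')
con.execute('pragma foreign_keys=ON')  # same statement the listener issues per connection
con.execute('create table parent (id integer primary key)')
con.execute('create table child (id integer primary key, parent_id integer references parent(id))')
try:
    # Referencing a non-existent parent row is now rejected.
    con.execute('insert into child (id, parent_id) values (1, 42)')
except sqlite3.IntegrityError as error:
    print('rejected as expected: %s' % error)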