def select_target_rse(parent_rule, current_rse_id, rse_expression, subscription_id, rse_attributes, other_rses=None, exclude_expression=None, force_expression=None, session=None):
    """
    Select a new target RSE for a rebalanced rule.

    :param parent_rule:         Rule that is rebalanced.
    :param current_rse_id:      RSE of the source.
    :param rse_expression:      RSE Expression of the source rule.
    :param subscription_id:     Subscription ID of the source rule.
    :param rse_attributes:      The attributes of the source rse.
    :param other_rses:          Other RSEs with existing dataset replicas.
    :param exclude_expression:  Exclude this rse_expression from being target_rses.
    :param force_expression:    Force a specific rse_expression as target.
    :param session:             The DB Session.
    :returns:                   New RSE expression.
    :raises InsufficientTargetRSEs: If no valid target can be derived.
    """
    # Avoid the shared mutable-default-argument pitfall ([] evaluated once at def time).
    if other_rses is None:
        other_rses = []
    if rse_attributes['type'] != 'DATADISK' and force_expression is None:
        print('WARNING: dest RSE(s) has to be provided with --force-expression for rebalancing of non-datadisk RSES.')
        raise InsufficientTargetRSEs
    current_rse = get_rse_name(rse_id=current_rse_id)
    current_rse_expr = current_rse
    # if parent rule has a vo, enforce it
    vo = parent_rule['scope'].vo
    if exclude_expression:
        target_rse = '(%s)\\%s' % (exclude_expression, current_rse_expr)
    else:
        target_rse = current_rse_expr
    rses = parse_expression(expression=rse_expression, filter={'vo': vo}, session=session)
    # TODO: dest rse selection should be configurable, there might be cases when tier is not defined, or concept of DATADISKS is not present.
    # if subscription_id:
    #     pass
    #     # get_subscription_by_id(subscription_id, session)
    if force_expression is not None:
        if parent_rule['grouping'] != RuleGrouping.NONE:
            rses = parse_expression(expression='(%s)\\%s' % (force_expression, target_rse), filter={'vo': vo, 'availability_write': True}, session=session)
        else:
            # in order to avoid replication of the part of distributed dataset not present at rebalanced rse -> rses in force_expression
            # this will be extended with development of delayed rule
            rses = parse_expression(expression='((%s)|(%s))\\%s' % (force_expression, rse_expression, target_rse), filter={'vo': vo, 'availability_write': True}, session=session)
    elif len(rses) > 1:
        # Just define the RSE Expression without the current_rse
        return '(%s)\\%s' % (rse_expression, target_rse)
    else:
        if rse_attributes['tier'] is True or int(rse_attributes['tier']) == 1:
            # Tier 1 should go to another Tier 1
            expression = '(tier=1&type=DATADISK)\\{}'.format(target_rse)
        elif int(rse_attributes['tier']) == 2:
            # Tier 2 should go to another Tier 2
            expression = '(tier=2&type=DATADISK)\\{}'.format(target_rse)
        elif int(rse_attributes['tier']) == 3:
            # Tier 3 will go to Tier 2, since we don't have enough t3s
            expression = '((tier=2&type=DATADISK)\\datapolicynucleus=1)\\{}'.format(target_rse)
        else:
            # Previously an unexpected tier value fell through with 'expression'
            # unbound and crashed with UnboundLocalError; fail explicitly instead.
            raise InsufficientTargetRSEs
        rses = parse_expression(expression=expression, filter={'vo': vo, 'availability_write': True}, session=session)
    rseselector = RSESelector(account=InternalAccount('ddmadmin', vo=vo), rses=rses, weight='freespace', copies=1, ignore_account_limit=True, session=session)
    return get_rse_name([rse_id for rse_id, _, _ in rseselector.select_rse(size=0, preferred_rse_ids=[], blocklist=other_rses)][0], session=session)
def test_3(self):
    """ RSESELECTOR: enough RSEs and global quota, also after change -> 2 RSEs """
    set_global_account_limit(account=self.account, rse_expression=self.rse_1_name, bytes=20)
    set_global_account_limit(account=self.account, rse_expression=self.rse_2_name, bytes=20)
    set_local_account_limit(account=self.account, rse_id=self.mock1_id, bytes=20)
    set_local_account_limit(account=self.account, rse_id=self.mock2_id, bytes=20)
    copies = 2
    rses = [self.rse_1, self.rse_2]
    rse_selector = RSESelector(self.account, rses, None, copies)
    # Plain asserts instead of nose's assert_equal, for consistency with the other tests.
    assert len(rse_selector.rses) == 2
    # Consume part of the quota on both RSEs; both must still be selectable afterwards.
    rse_selector.select_rse(10, [self.mock1_id], copies=1)
    rse_selector.select_rse(10, [self.mock2_id], copies=1)
    rses = rse_selector.select_rse(5, [], copies=2)
    assert len(rses) == 2
def test_2(self):
    """ RSESELECTOR: local quota not enough -> error """
    copies = 2
    rses = [self.rse_1, self.rse_2]
    set_local_account_limit(account=self.account, rse_id=self.mock1_id, bytes=10)
    # Fill the local quota entirely so the selector cannot place a copy here.
    increase(self.mock1_id, self.account, 10, 10)
    update_account_counter(account=self.account, rse_id=self.mock1_id)
    # pytest.raises instead of nose's assert_raises, for consistency with the other tests.
    with pytest.raises(InsufficientAccountLimit):
        RSESelector(self.account, rses, None, copies)
def test_4(self):
    """ RSESELECTOR: enough RSEs, local and global quota -> 2 RSEs """
    set_global_account_limit(account=self.account, rse_expression=self.rse_1_name, bytes=20)
    set_global_account_limit(account=self.account, rse_expression=self.rse_2_name, bytes=20)
    set_local_account_limit(account=self.account, rse_id=self.mock1_id, bytes=20)
    set_local_account_limit(account=self.account, rse_id=self.mock2_id, bytes=20)
    copies = 2
    rses = [self.rse_1, self.rse_2]
    rse_selector = RSESelector(self.account, rses, None, copies)
    # Plain assert instead of nose's assert_equal, for consistency with the other tests.
    assert len(rse_selector.rses) == 2
def test_5(self):
    """ RSESELECTOR: enough RSEs and local quota, but global quota missing -> 1 RSE """
    copies = 1
    rses = [self.rse_1, self.rse_2]
    # Exhaust the global quota on the first RSE; only the second remains usable.
    set_global_account_limit(account=self.account, rse_expression=self.rse_1_name, bytes=10)
    increase(self.mock1_id, self.account, 10, 10)
    update_account_counter(account=self.account, rse_id=self.mock1_id)
    set_local_account_limit(account=self.account, rse_id=self.mock2_id, bytes=20)
    set_local_account_limit(account=self.account, rse_id=self.mock1_id, bytes=20)
    rse_selector = RSESelector(self.account, rses, None, copies)
    # Plain assert instead of nose's assert_equal, for consistency with the other tests.
    assert len(rse_selector.rses) == 1
def test_2(self):
    """ RSESELECTOR: enough RSEs and global quota, but not enough global quota after change -> 1 RSE """
    nb_copies = 2
    candidate_rses = [self.rse_1, self.rse_2]
    # Global quota: 10 bytes per RSE; local quota: 20 bytes per RSE.
    set_global_account_limit(account=self.account, rse_expression=self.rse_1_name, bytes=10)
    set_global_account_limit(account=self.account, rse_expression=self.rse_2_name, bytes=10)
    set_local_account_limit(account=self.account, rse_id=self.mock1_id, bytes=20)
    set_local_account_limit(account=self.account, rse_id=self.mock2_id, bytes=20)
    selector = RSESelector(self.account, candidate_rses, None, nb_copies)
    assert len(selector.rses) == 2
    # Use up the global quota on the first RSE ...
    selector.select_rse(10, [self.mock1_id], copies=1)
    # ... so only the second RSE can satisfy the next selection.
    picked = selector.select_rse(5, [], copies=1)
    assert len(picked) == 1
    assert picked[0][0] == self.mock2_id
def test_3(self):
    """ RSESELECTOR: global quota not enough -> error """
    nb_copies = 2
    candidate_rses = [self.rse_1, self.rse_2]
    set_local_account_limit(account=self.account, rse_id=self.mock1_id, bytes=20)
    set_global_account_limit(account=self.account, rse_expression=self.rse_1_name, bytes=10)
    # Consume the entire global quota on the first RSE.
    increase(self.mock1_id, self.account, 10, 10)
    update_account_counter(account=self.account, rse_id=self.mock1_id)
    # With the global quota exhausted, building the selector must fail.
    with pytest.raises(InsufficientAccountLimit):
        RSESelector(self.account, candidate_rses, None, nb_copies)
def select_target_rse(current_rse, rse_expression, subscription_id, rse_attributes, other_rses=None, exclude_expression=None, force_expression=None, session=None):
    """
    Select a new target RSE for a rebalanced rule.

    :param current_rse:         RSE of the source.
    :param rse_expression:      RSE Expression of the source rule.
    :param subscription_id:     Subscription ID of the source rule.
    :param rse_attributes:      The attributes of the source rse.
    :param other_rses:          Other RSEs with existing dataset replicas.
    :param exclude_expression:  Exclude this rse_expression from being target_rses.
    :param force_expression:    Force a specific rse_expression as target.
    :param session:             The DB Session.
    :returns:                   New RSE expression.
    """
    # Avoid the shared mutable-default-argument pitfall ([] evaluated once at def time).
    if other_rses is None:
        other_rses = []
    if exclude_expression:
        target_rse = '(%s)\\%s' % (exclude_expression, current_rse)
    else:
        target_rse = current_rse
    rses = parse_expression(expression=rse_expression, session=session)
    # if subscription_id:
    #     pass
    #     # get_subscription_by_id(subscription_id, session)
    if force_expression is not None:
        rses = parse_expression(expression='(%s)\\%s' % (force_expression, target_rse), filter={'availability_write': True}, session=session)
    elif len(rses) > 1:
        # Just define the RSE Expression without the current_rse
        return '(%s)\\%s' % (rse_expression, target_rse)
    elif rse_attributes['tier'] is True or rse_attributes['tier'] == 1 or rse_attributes['tier'] == '1':
        # Tier 1 should go to another Tier 1.
        # Also match the integer value 1: the tier-2 branch below accepts both
        # int and str, but this branch previously only matched True or '1'.
        rses = parse_expression(expression='(tier=1&type=DATADISK)\\%s' % target_rse, filter={'availability_write': True}, session=session)
    elif rse_attributes['tier'] == 2 or rse_attributes['tier'] == '2':
        # Tier 2 should go to another Tier 2
        rses = parse_expression(expression='(tier=2&type=DATADISK)\\%s' % target_rse, filter={'availability_write': True}, session=session)
    rseselector = RSESelector(account='ddmadmin', rses=rses, weight='freespace', copies=1, ignore_account_limit=True, session=session)
    return get_rse_name([rse_id for rse_id, _, _ in rseselector.select_rse(size=0, preferred_rse_ids=[], blacklist=other_rses)][0], session=session)
def transmogrifier(bulk=5, once=False, sleep_time=60):
    """
    Creates a Transmogrifier Worker that gets a list of new DIDs,
    identifies the subscriptions matching the DIDs and submits a
    replication rule for each DID matching a subscription.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """
    executable = 'transmogrifier'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    while not graceful_stop.is_set():
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        dids, subscriptions = [], []
        tottime = 0
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        try:
            # Get the new DIDs based on the is_new flag
            for did in list_new_dids(thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'], chunk_size=bulk, did_type=None):
                dids.append({'scope': did['scope'], 'did_type': str(did['did_type']), 'name': did['name']})
            sub_dict = {3: []}
            # Get the list of subscriptions. The default priority of the subscription is 3. 0 is the highest priority, 5 the lowest
            # The priority is defined as 'policyid'
            for sub in list_subscriptions(None, None):
                # Expired subscriptions are flipped to INACTIVE; only ACTIVE/UPDATED ones are kept.
                if sub['state'] != SubscriptionState.INACTIVE and sub['lifetime'] and (datetime.now() > sub['lifetime']):
                    update_subscription(name=sub['name'], account=sub['account'], metadata={'state': SubscriptionState.INACTIVE})
                elif sub['state'] in [SubscriptionState.ACTIVE, SubscriptionState.UPDATED]:
                    priority = 3
                    if 'policyid' in sub:
                        if int(sub['policyid']) not in sub_dict:
                            sub_dict[int(sub['policyid'])] = []
                        priority = int(sub['policyid'])
                    sub_dict[priority].append(sub)
            priorities = list(sub_dict.keys())
            priorities.sort()
            # Order the subscriptions according to their priority
            for priority in priorities:
                subscriptions.extend(sub_dict[priority])
        except SubscriptionNotFound as error:
            logging.warning(prepend_str + 'No subscriptions defined: %s' % (str(error)))
            time.sleep(10)
            continue
        except Exception as error:
            # Best effort: log and continue the cycle with whatever was collected so far.
            logging.error(prepend_str + 'Failed to get list of new DIDs or subscriptions: %s' % (str(error)))
        try:
            # NOTE(review): 'results' is filled below but never read in this function — looks vestigial.
            results = {}
            start_time = time.time()
            blacklisted_rse_id = [rse['id'] for rse in list_rses({'availability_write': False})]
            logging.debug(prepend_str + 'In transmogrifier worker')
            identifiers = []
            # Loop over all the new dids
            for did in dids:
                did_success = True
                # Rules are only evaluated for datasets/containers; other DID types just get flagged as processed.
                if did['did_type'] == str(DIDType.DATASET) or did['did_type'] == str(DIDType.CONTAINER):
                    did_tag = '%s:%s' % (did['scope'].internal, did['name'])
                    results[did_tag] = []
                    try:
                        metadata = get_metadata(did['scope'], did['name'])
                        # Loop over all the subscriptions
                        for subscription in subscriptions:
                            # Check if the DID match the subscription
                            if is_matching_subscription(subscription, did, metadata) is True:
                                filter_string = loads(subscription['filter'])
                                split_rule = filter_string.get('split_rule', False)
                                stime = time.time()
                                results[did_tag].append(subscription['id'])
                                logging.info(prepend_str + '%s:%s matches subscription %s' % (did['scope'], did['name'], subscription['name']))
                                rules = loads(subscription['replication_rules'])
                                created_rules = {}
                                cnt = 0
                                for rule_dict in rules:
                                    cnt += 1
                                    created_rules[cnt] = []
                                    # Get all the rule and subscription parameters
                                    grouping = rule_dict.get('grouping', 'DATASET')
                                    lifetime = rule_dict.get('lifetime', None)
                                    ignore_availability = rule_dict.get('ignore_availability', None)
                                    weight = rule_dict.get('weight', None)
                                    source_replica_expression = rule_dict.get('source_replica_expression', None)
                                    # 'locked' / 'purge_replicas' arrive as strings from the JSON blob.
                                    locked = rule_dict.get('locked', None)
                                    if locked == 'True':
                                        locked = True
                                    else:
                                        locked = False
                                    purge_replicas = rule_dict.get('purge_replicas', False)
                                    if purge_replicas == 'True':
                                        purge_replicas = True
                                    else:
                                        purge_replicas = False
                                    rse_expression = str(rule_dict['rse_expression'])
                                    comment = str(subscription['comments'])
                                    subscription_id = str(subscription['id'])
                                    account = subscription['account']
                                    copies = int(rule_dict['copies'])
                                    activity = rule_dict.get('activity', 'User Subscriptions')
                                    try:
                                        validate_schema(name='activity', obj=activity)
                                    except InputValidationError as error:
                                        # Fall back to the default activity if the configured one is invalid.
                                        logging.error(prepend_str + 'Error validating the activity %s' % (str(error)))
                                        activity = 'User Subscriptions'
                                    if lifetime:
                                        lifetime = int(lifetime)
                                    str_activity = "".join(activity.split())
                                    success = False
                                    nattempt = 5
                                    attemptnr = 0
                                    skip_rule_creation = False
                                    selected_rses = []
                                    chained_idx = rule_dict.get('chained_idx', None)
                                    if chained_idx:
                                        # Chained subscription: target RSEs are derived from the rules
                                        # already created for the referenced rule index.
                                        params = {}
                                        if rule_dict.get('associated_site_idx', None):
                                            params['associated_site_idx'] = rule_dict.get('associated_site_idx', None)
                                        logging.debug('%s Chained subscription identified. Will use %s', prepend_str, str(created_rules[chained_idx]))
                                        algorithm = rule_dict.get('algorithm', None)
                                        selected_rses = select_algorithm(algorithm, created_rules[chained_idx], params)
                                    else:
                                        # In the case of chained subscription, don't use rseselector but use the rses returned by the algorithm
                                        if split_rule:
                                            vo = account.vo
                                            rses = parse_expression(rse_expression, filter={'vo': vo})
                                            list_of_rses = [rse['id'] for rse in rses]
                                            # Check that some rule doesn't already exist for this DID and subscription
                                            preferred_rse_ids = []
                                            for rule in list_rules(filters={'subscription_id': subscription_id, 'scope': did['scope'], 'name': did['name']}):
                                                already_existing_rses = [(rse['rse'], rse['id']) for rse in parse_expression(rule['rse_expression'], filter={'vo': vo})]
                                                for rse, rse_id in already_existing_rses:
                                                    if (rse_id in list_of_rses) and (rse_id not in preferred_rse_ids):
                                                        preferred_rse_ids.append(rse_id)
                                            if len(preferred_rse_ids) >= copies:
                                                skip_rule_creation = True
                                            rse_id_dict = {}
                                            for rse in rses:
                                                rse_id_dict[rse['id']] = rse['rse']
                                            try:
                                                rseselector = RSESelector(account=account, rses=rses, weight=weight, copies=copies - len(preferred_rse_ids))
                                                selected_rses = [rse_id_dict[rse_id] for rse_id, _, _ in rseselector.select_rse(0, preferred_rse_ids=preferred_rse_ids, copies=copies, blacklist=blacklisted_rse_id)]
                                            except (InsufficientTargetRSEs, InsufficientAccountLimit, InvalidRuleWeight, RSEOverQuota) as error:
                                                logging.warning(prepend_str + 'Problem getting RSEs for subscription "%s" for account %s : %s. Try including blacklisted sites' % (subscription['name'], account, str(error)))
                                                # Now including the blacklisted sites
                                                try:
                                                    rseselector = RSESelector(account=account, rses=rses, weight=weight, copies=copies - len(preferred_rse_ids))
                                                    selected_rses = [rse_id_dict[rse_id] for rse_id, _, _ in rseselector.select_rse(0, preferred_rse_ids=preferred_rse_ids, copies=copies, blacklist=[])]
                                                    ignore_availability = True
                                                except (InsufficientTargetRSEs, InsufficientAccountLimit, InvalidRuleWeight, RSEOverQuota) as error:
                                                    logging.error(prepend_str + 'Problem getting RSEs for subscription "%s" for account %s : %s. Skipping rule creation.' % (subscription['name'], account, str(error)))
                                                    monitor.record_counter(counters='transmogrifier.addnewrule.errortype.%s' % (str(error.__class__.__name__)), delta=1)
                                                    # The DID won't be reevaluated at the next cycle
                                                    # NOTE(review): 'and True' is a no-op; presumably meant 'and False' to mark failure — confirm.
                                                    did_success = did_success and True
                                                    continue
                                    for attempt in range(0, nattempt):
                                        attemptnr = attempt
                                        nb_rule = 0
                                        # Try to create the rule
                                        try:
                                            if split_rule:
                                                if not skip_rule_creation:
                                                    # One single-copy rule per selected RSE.
                                                    for rse in selected_rses:
                                                        if isinstance(selected_rses, dict):
                                                            # select_algorithm may return a dict carrying per-RSE overrides.
                                                            source_replica_expression = selected_rses[rse].get('source_replica_expression', None)
                                                            weight = selected_rses[rse].get('weight', None)
                                                        logging.info(prepend_str + 'Will insert one rule for %s:%s on %s' % (did['scope'], did['name'], rse))
                                                        rule_ids = add_rule(dids=[{'scope': did['scope'], 'name': did['name']}], account=account, copies=1, rse_expression=rse, grouping=grouping, weight=weight, lifetime=lifetime, locked=locked, subscription_id=subscription_id, source_replica_expression=source_replica_expression, activity=activity, purge_replicas=purge_replicas, ignore_availability=ignore_availability, comment=comment)
                                                        created_rules[cnt].append(rule_ids[0])
                                                        nb_rule += 1
                                                        if nb_rule == copies:
                                                            success = True
                                                            break
                                            else:
                                                # Single rule covering all copies.
                                                rule_ids = add_rule(dids=[{'scope': did['scope'], 'name': did['name']}], account=account, copies=copies, rse_expression=rse_expression, grouping=grouping, weight=weight, lifetime=lifetime, locked=locked, subscription_id=subscription['id'], source_replica_expression=source_replica_expression, activity=activity, purge_replicas=purge_replicas, ignore_availability=ignore_availability, comment=comment)
                                                created_rules[cnt].append(rule_ids[0])
                                                nb_rule += 1
                                            monitor.record_counter(counters='transmogrifier.addnewrule.done', delta=nb_rule)
                                            monitor.record_counter(counters='transmogrifier.addnewrule.activity.%s' % str_activity, delta=nb_rule)
                                            success = True
                                            break
                                        except (InvalidReplicationRule, InvalidRuleWeight, InvalidRSEExpression, StagingAreaRuleRequiresLifetime, DuplicateRule) as error:
                                            # Errors that won't be retried
                                            success = True
                                            logging.error(prepend_str + '%s' % (str(error)))
                                            monitor.record_counter(counters='transmogrifier.addnewrule.errortype.%s' % (str(error.__class__.__name__)), delta=1)
                                            break
                                        except (ReplicationRuleCreationTemporaryFailed, InsufficientTargetRSEs, InsufficientAccountLimit, DatabaseException, RSEBlacklisted, RSEWriteBlocked) as error:
                                            # Errors to be retried
                                            logging.error(prepend_str + '%s Will perform an other attempt %i/%i' % (str(error), attempt + 1, nattempt))
                                            monitor.record_counter(counters='transmogrifier.addnewrule.errortype.%s' % (str(error.__class__.__name__)), delta=1)
                                        except Exception:
                                            # Unexpected errors
                                            monitor.record_counter(counters='transmogrifier.addnewrule.errortype.unknown', delta=1)
                                            exc_type, exc_value, exc_traceback = exc_info()
                                            logging.critical(prepend_str + ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())
                                    did_success = (did_success and success)
                                    if (attemptnr + 1) == nattempt and not success:
                                        logging.error(prepend_str + 'Rule for %s:%s on %s cannot be inserted' % (did['scope'], did['name'], rse_expression))
                                    else:
                                        logging.info(prepend_str + '%s rule(s) inserted in %f seconds' % (str(nb_rule), time.time() - stime))
                    except DataIdentifierNotFound as error:
                        # NOTE(review): str + exception raises TypeError; likely meant str(error) — confirm.
                        logging.warning(prepend_str + error)
                if did_success:
                    if did['did_type'] == str(DIDType.FILE):
                        monitor.record_counter(counters='transmogrifier.did.file.processed', delta=1)
                    elif did['did_type'] == str(DIDType.DATASET):
                        monitor.record_counter(counters='transmogrifier.did.dataset.processed', delta=1)
                    elif did['did_type'] == str(DIDType.CONTAINER):
                        monitor.record_counter(counters='transmogrifier.did.container.processed', delta=1)
                    monitor.record_counter(counters='transmogrifier.did.processed', delta=1)
                    identifiers.append({'scope': did['scope'], 'name': did['name'], 'did_type': DIDType.from_sym(did['did_type'])})
            time1 = time.time()
            # Mark the DIDs as processed
            for identifier in chunks(identifiers, 100):
                _retrial(set_new_dids, identifier, None)
            logging.info(prepend_str + 'Time to set the new flag : %f' % (time.time() - time1))
            tottime = time.time() - start_time
            for sub in subscriptions:
                update_subscription(name=sub['name'], account=sub['account'], metadata={'last_processed': datetime.now()})
            logging.info(prepend_str + 'It took %f seconds to process %i DIDs' % (tottime, len(dids)))
            logging.debug(prepend_str + 'DIDs processed : %s' % (str(dids)))
            monitor.record_counter(counters='transmogrifier.job.done', delta=1)
            monitor.record_timer(stat='transmogrifier.job.duration', time=1000 * tottime)
        except Exception:
            exc_type, exc_value, exc_traceback = exc_info()
            logging.critical(prepend_str + ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())
            monitor.record_counter(counters='transmogrifier.job.error', delta=1)
            monitor.record_counter(counters='transmogrifier.addnewrule.error', delta=1)
        if once is True:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' % (sleep_time - tottime))
            time.sleep(sleep_time - tottime)
    heartbeat.die(executable, hostname, pid, hb_thread)
    # NOTE(review): prepend_str is only bound inside the while loop; if the loop body
    # never runs, these two lines raise NameError — confirm.
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
def test_1(self):
    """ RSESELECTOR: more copies than RSEs -> error """
    # Requesting two copies with only a single candidate RSE must fail.
    candidate_rses = [self.rse_1]
    nb_copies = 2
    with pytest.raises(InsufficientTargetRSEs):
        RSESelector(self.account, candidate_rses, None, nb_copies)
def select_target_rse(parent_rule, current_rse_id, rse_expression, subscription_id, rse_attributes, other_rses=None, exclude_expression=None, force_expression=None, session=None):
    """
    Select a new target RSE for a rebalanced rule.

    :param parent_rule:         Rule that is rebalanced.
    :param current_rse_id:      RSE of the source.
    :param rse_expression:      RSE Expression of the source rule.
    :param subscription_id:     Subscription ID of the source rule.
    :param rse_attributes:      The attributes of the source rse.
    :param other_rses:          Other RSEs with existing dataset replicas.
    :param exclude_expression:  Exclude this rse_expression from being target_rses.
    :param force_expression:    Force a specific rse_expression as target.
    :param session:             The DB Session.
    :returns:                   New RSE expression.
    :raises InsufficientTargetRSEs: If no RSEs remain after excluding the sources.
    """
    # Avoid the shared mutable-default-argument pitfall ([] evaluated once at def time).
    if other_rses is None:
        other_rses = []
    current_rse = get_rse_name(rse_id=current_rse_id)
    current_rse_expr = current_rse
    # if parent rule has a vo, enforce it
    vo = parent_rule['scope'].vo
    if exclude_expression:
        target_rse = '((%s)|(%s))' % (exclude_expression, current_rse_expr)
    else:
        target_rse = current_rse_expr
    list_target_rses = [rse['rse'] for rse in parse_expression(expression=target_rse, filter_={'vo': vo}, session=session)]
    list_target_rses.sort()
    rses = parse_expression(expression=rse_expression, filter_={'vo': vo}, session=session)
    # TODO: Implement subscription rebalancing
    if force_expression is not None:
        if parent_rule['grouping'] != RuleGrouping.NONE:
            rses = parse_expression(expression='(%s)\\%s' % (force_expression, target_rse), filter_={'vo': vo, 'availability_write': True}, session=session)
        else:
            # in order to avoid replication of the part of distributed dataset not present at rebalanced rse -> rses in force_expression
            # this will be extended with development of delayed rule
            rses = parse_expression(expression='((%s)|(%s))\\%s' % (force_expression, rse_expression, target_rse), filter_={'vo': vo, 'availability_write': True}, session=session)
    else:
        # force_expression is not set, RSEs will be selected as rse_expression\rse
        list_rses = [rse['rse'] for rse in rses]
        list_rses.sort()
        if list_rses == list_target_rses:
            # Excluding the target would leave nothing to pick from.
            raise InsufficientTargetRSEs('Not enough RSEs to rebalance rule %s' % parent_rule['id'])
        else:
            rses = parse_expression(expression='(%s)\\%s' % (rse_expression, target_rse), filter_={'vo': vo, 'availability_write': True}, session=session)
    rseselector = RSESelector(account=InternalAccount('root', vo=vo), rses=rses, weight='freespace', copies=1, ignore_account_limit=True, session=session)
    return get_rse_name([rse_id for rse_id, _, _ in rseselector.select_rse(size=0, preferred_rse_ids=[], blocklist=other_rses)][0], session=session)
for rse, rse_id in already_existing_rses: if (rse in list_of_rses) and ( rse_id not in preferred_rse_ids): preferred_rse_ids.append( rse_id) if len(preferred_rse_ids) >= copies: skip_rule_creation = True rse_id_dict = {} for rse in rses: rse_id_dict[rse['id']] = rse['rse'] try: rseselector = RSESelector( account=account, rses=rses, weight=weight, copies=copies - len(preferred_rse_ids)) selected_rses = [ rse_id_dict[rse_id] for rse_id, _ in rseselector.select_rse( 0, preferred_rse_ids= preferred_rse_ids, copies=copies, blacklist=blacklisted_rse_id ) ] except (InsufficientTargetRSEs, InsufficientAccountLimit, InvalidRuleWeight) as error: