def update_rule(self):
    """ Adds or removes the rule for the block. """
    # All rules currently registered for this block on the server
    rules = list_replication_rules(filters={'scope': self.scope, 'name': self.block_name})
    # rules = self.rcli.list_did_rules(scope=self.scope, name=self.block_name)
    rse_expression = 'rse=' + self.rse
    # Rules owned by this sync account that pin the block to exactly this RSE
    remove_rules = [rule for rule in rules if rule['account'] == self.account and rule['rse_expression'] == rse_expression]
    if not remove_rules and self.is_at_pnn:
        # No matching rule but the block is at the site (is_at_pnn presumably
        # means "present at the PhEDEx node" — TODO confirm) -> create the rule.
        self.rule_exists = False
        if self.dry_run:
            logging.info("Dry run: Adding rule for dataset %s at rse %s.", self.block_name, self.rse)
        else:
            self.add_replication_rule_with_defaults(dids=[{'scope': self.scope, 'name': self.block_name}], copies=1, rse_expression=rse_expression, account=self.account)
            monitor.record_counter('cms_sync.rules_added')
            self.rule_exists = True
    elif remove_rules and not self.is_at_pnn:
        # Rules exist but the block is no longer at the site -> delete them all.
        self.rule_exists = True
        if self.dry_run:
            logging.info("Removing rules for dataset %s at rse %s.", self.block_name, self.rse)
        else:
            for rule in remove_rules:
                # delete_replication_rule(rule['id'], purge_replicas=False, issuer=self.account)
                delete_rule(rule_id=rule['id'], purge_replicas=True, soft=False)
            monitor.record_counter('cms_sync.rules_removed')
            self.rule_exists = False
def test_bb8_rebalance_rule(self):
    """ BB8: Test the rebalance rule method"""
    scope = InternalScope('mock', **self.vo)
    files = create_files(3, scope, self.rse1_id)
    dataset = 'dataset_' + str(uuid())
    add_did(scope, dataset, DIDType.from_sym('DATASET'), self.jdoe)
    attach_dids(scope, dataset, files, self.jdoe)
    rule_id = add_rule(dids=[{'scope': scope, 'name': dataset}], account=self.jdoe, copies=1,
                       rse_expression=self.rse1, grouping='NONE', weight='fakeweight',
                       lifetime=None, locked=False, subscription_id=None)[0]
    rule = {}
    try:
        rule = get_rule(rule_id)
    except RuleNotFound:
        # FIX: was a bare `except:` which silently swallowed *any* exception
        # (including assertion errors); catch only the case the fallback verifies.
        assert_raises(RuleNotFound, get_rule, rule_id)
    child_rule = rebalance_rule(rule, 'Rebalance', self.rse3, priority=3)
    rule_cleaner(once=True)
    # The parent rule is expired and linked to the new child rule
    assert(get_rule(rule_id)['expires_at'] <= datetime.utcnow())
    assert(get_rule(rule_id)['child_rule_id'] == child_rule)
    rule_cleaner(once=True)
    assert(get_rule(rule_id)['expires_at'] <= datetime.utcnow())
    successful_transfer(scope=scope, name=files[0]['name'], rse_id=self.rse3_id, nowait=False)
    successful_transfer(scope=scope, name=files[1]['name'], rse_id=self.rse3_id, nowait=False)
    # While the rebalance is still incomplete, deleting the parent must be refused
    with assert_raises(UnsupportedOperation):
        delete_rule(rule_id)
    successful_transfer(scope=scope, name=files[2]['name'], rse_id=self.rse3_id, nowait=False)
    rule_cleaner(once=True)
    assert(get_rule(child_rule)['state'] == RuleState.OK)
def cleanup(self, session=None):
    # Remove every DB row referencing the DIDs created by this factory,
    # in dependency order: transfers -> rules/locks -> replicas.
    if not self.created_dids:
        return

    # Cleanup Transfers
    session.query(models.Source).filter(or_(and_(models.Source.scope == did['scope'],
                                                 models.Source.name == did['name'])
                                            for did in self.created_dids)).delete(synchronize_session=False)
    session.query(models.Request).filter(or_(and_(models.Request.scope == did['scope'],
                                                  models.Request.name == did['name'])
                                             for did in self.created_dids)).delete(synchronize_session=False)

    # Cleanup Locks Rules
    query = session.query(models.ReplicationRule.id).filter(or_(and_(models.ReplicationRule.scope == did['scope'],
                                                                     models.ReplicationRule.name == did['name'])
                                                                for did in self.created_dids))
    for rule_id, in query:
        # delete_rule also removes the replica locks held by the rule
        rule_core.delete_rule(rule_id, session=session)

    # Cleanup Replicas and Parent Datasets
    dids_by_rse = {}
    replicas = list(replica_core.list_replicas(self.created_dids, all_states=True, session=session))
    for replica in replicas:
        # Group the dids per RSE so each RSE is cleaned with a single call
        for rse_id in replica['rses']:
            dids_by_rse.setdefault(rse_id, []).append({'scope': replica['scope'], 'name': replica['name']})
    for rse_id, dids in dids_by_rse.items():
        replica_core.delete_replicas(rse_id=rse_id, files=dids, session=session)
def delete_replication_rule(rule_id, purge_replicas, issuer, vo='def', session=None):
    """
    Deletes a replication rule and all associated locks.

    :param rule_id:        The id of the rule to be deleted
    :param purge_replicas: Purge the replicas immediately
    :param issuer:         The issuing account of this operation
    :param vo:             The VO to act on.
    :param session:        The database session in use.
    :raises:               RuleNotFound, AccessDenied
    """
    kwargs = {'rule_id': rule_id, 'purge_replicas': purge_replicas}

    # In a multi-VO deployment the issuer additionally needs the cross-VO permission
    multi_vo = is_multi_vo(session=session)
    if multi_vo:
        cross_vo_ok = has_permission(issuer=issuer, vo=vo, action='access_rule_vo', kwargs=kwargs, session=session)
        if not cross_vo_ok:
            raise AccessDenied('Account %s can not access rules at other VOs.' % (issuer))

    # Then the regular rule-deletion permission
    if not has_permission(issuer=issuer, vo=vo, action='del_rule', kwargs=kwargs):
        raise AccessDenied('Account %s can not remove this replication rule.' % (issuer))

    rule.delete_rule(rule_id=rule_id, purge_replicas=purge_replicas, soft=True, session=session)
def test_bb8_rebalance_rule(vo, root_account, jdoe_account, rse_factory, mock_scope, did_factory):
    """BB8: Test the rebalance rule method"""
    rse1, rse1_id = rse_factory.make_posix_rse()
    rse2, rse2_id = rse_factory.make_posix_rse()

    # Add Tags
    T1 = tag_generator()
    T2 = tag_generator()
    add_rse_attribute(rse1_id, T1, True)
    add_rse_attribute(rse2_id, T2, True)

    # Add fake weights
    add_rse_attribute(rse1_id, "fakeweight", 10)
    add_rse_attribute(rse2_id, "fakeweight", 0)

    # Add quota
    set_local_account_limit(jdoe_account, rse1_id, -1)
    set_local_account_limit(jdoe_account, rse2_id, -1)
    set_local_account_limit(root_account, rse1_id, -1)
    set_local_account_limit(root_account, rse2_id, -1)

    files = create_files(3, mock_scope, rse1_id)
    dataset = did_factory.make_dataset()
    attach_dids(mock_scope, dataset['name'], files, jdoe_account)
    set_status(mock_scope, dataset['name'], open=False)

    # Invalid the cache because the result of parse_expression is cached
    REGION.invalidate()

    rule_id = add_rule(dids=[{'scope': mock_scope, 'name': dataset['name']}], account=jdoe_account,
                       copies=1, rse_expression=rse1, grouping='NONE', weight='fakeweight',
                       lifetime=None, locked=False, subscription_id=None)[0]
    rule = {}
    try:
        rule = get_rule(rule_id)
    except RuleNotFound:
        # FIX: was a bare `except:` which silently swallowed *any* exception
        # (including assertion errors); catch only the case the fallback verifies.
        pytest.raises(RuleNotFound, get_rule, rule_id)
    child_rule = rebalance_rule(rule, 'Rebalance', rse2, priority=3)
    rule_cleaner(once=True)
    # The parent rule is expired and linked to the new child rule
    assert(get_rule(rule_id)['expires_at'] <= datetime.utcnow())
    assert(get_rule(rule_id)['child_rule_id'] == child_rule)
    rule_cleaner(once=True)
    assert(get_rule(rule_id)['expires_at'] <= datetime.utcnow())
    successful_transfer(scope=mock_scope, name=files[0]['name'], rse_id=rse2_id, nowait=False)
    successful_transfer(scope=mock_scope, name=files[1]['name'], rse_id=rse2_id, nowait=False)
    # While the rebalance is still incomplete, deleting the parent must be refused
    with pytest.raises(UnsupportedOperation):
        delete_rule(rule_id)
    successful_transfer(scope=mock_scope, name=files[2]['name'], rse_id=rse2_id, nowait=False)
    rule_cleaner(once=True)
    assert(get_rule(child_rule)['state'] == RuleState.OK)
# NOTE(review): fragment of a larger test whose body is not fully visible here;
# it appears to expire the dataset (lifetime in the past) and then run the
# undertaker once to reap it — confirm against the full file.
set_metadata(mock_scope, dataset['name'], 'lifetime', -86400)
undertaker.run(once=True)
def __cleanup_locks_and_rules(self, session=None):
    """Delete every replication rule defined on the DIDs created by this factory."""
    per_did_filters = [and_(models.ReplicationRule.scope == did['scope'],
                            models.ReplicationRule.name == did['name'])
                       for did in self.created_dids]
    rule_ids = session.query(models.ReplicationRule.id).filter(or_(*per_did_filters))
    for (rid,) in rule_ids:
        # Locked rules are removed as well (ignore_rule_lock) — this is test teardown
        rule_core.delete_rule(rid, session=session, ignore_rule_lock=True)
def delete_replication_rule(rule_id, purge_replicas, issuer):
    """
    Deletes a replication rule and all associated locks.

    :param rule_id:        The id of the rule to be deleted
    :param purge_replicas: Purge the replicas immediately
    :param issuer:         The issuing account of this operation
    :raises:               RuleNotFound, AccessDenied
    """
    kwargs = {'rule_id': rule_id, 'purge_replicas': purge_replicas}
    allowed = has_permission(issuer=issuer, action='del_rule', kwargs=kwargs)
    if not allowed:
        raise AccessDenied('Account %s can not remove this replication rule.' % (issuer))
    # Soft deletion: the rule is expired rather than removed outright
    rule.delete_rule(rule_id=rule_id, purge_replicas=purge_replicas, soft=True)
def delete_replication_rule(rule_id, issuer):
    """
    Deletes a replication rule and all associated locks.

    :param rule_id: The id of the rule to be deleted
    :param issuer:  The issuing account of this operation
    :raises:        RuleNotFound, AccessDenied
    """
    kwargs = {'rule_id': rule_id}
    allowed = has_permission(issuer=issuer, action='del_rule', kwargs=kwargs)
    if not allowed:
        raise AccessDenied('Account %s can not remove this replication rule.' % (issuer))
    rule.delete_rule(rule_id)
def test_locked_rule(self):
    """ REPLICATION RULE (CLIENT): Delete a locked replication rule"""
    scope = 'mock'
    attached = create_files(3, scope, self.rse1)
    dataset = 'dataset_' + str(uuid())
    add_did(scope, dataset, DIDType.from_sym('DATASET'), 'jdoe')
    attach_dids(scope, dataset, attached, 'jdoe')
    rule_id_1 = add_rule(dids=[{'scope': scope, 'name': dataset}], account='jdoe', copies=1,
                         rse_expression=self.rse1, grouping='NONE', weight='fakeweight',
                         lifetime=None, locked=True, subscription_id=None)[0]

    # Deleting a locked rule must be refused
    assert_raises(AccessDenied, delete_rule, rule_id_1)
    # After unlocking, deletion succeeds
    self.rule_client.update_replication_rule(rule_id=rule_id_1, options={'locked': False})
    delete_rule(rule_id=rule_id_1)
def test_delete_rule(self):
    """ REPLICATION RULE (CORE): Test to delete a previously created rule"""
    scope = 'mock'
    attached = create_files(3, scope, self.rse1)
    dataset = 'dataset_' + str(uuid())
    add_did(scope, dataset, DIDType.from_sym('DATASET'), 'jdoe')
    attach_dids(scope, dataset, attached, 'jdoe')
    rule_id = add_rule(dids=[{'scope': scope, 'name': dataset}], account='jdoe', copies=2,
                       rse_expression=self.T1, grouping='DATASET', weight='fakeweight',
                       lifetime=None, locked=False, subscription_id=None)[0]

    delete_rule(rule_id)

    # Every replica lock must be gone once the rule is deleted
    for entry in attached:
        assert(len(get_replica_locks(scope=entry['scope'], name=entry['name'])) == 0)
    # Deleting an unknown rule id must raise
    assert_raises(RuleNotFound, delete_rule, uuid())
def test_add_rule_with_purge(self):
    """ REPLICATION RULE (CORE): Add a replication rule with purge setting"""
    scope = 'mock'
    attached = create_files(3, scope, self.rse1)
    dataset = 'dataset_' + str(uuid())
    add_did(scope, dataset, DIDType.from_sym('DATASET'), 'jdoe')
    attach_dids(scope, dataset, attached, 'jdoe')
    rule_id = add_rule(dids=[{'scope': scope, 'name': dataset}], account='jdoe', copies=1,
                       rse_expression=self.rse4, grouping='NONE', weight=None, lifetime=None,
                       locked=False, subscription_id=None, purge_replicas=True)[0]

    delete_rule(rule_id)

    # Check if the Locks are created properly: purge marks replicas OBSOLETE
    for entry in attached:
        replica = get_replica(rse=self.rse4, scope=entry['scope'], name=entry['name'])
        assert(replica['tombstone'] == OBSOLETE)
def test_delete_rule_and_cancel_transfers(self):
    """ REPLICATION RULE (CORE): Test to delete a previously created rule and do not cancel overlapping transfers"""
    scope = 'mock'
    attached = create_files(3, scope, self.rse1)
    dataset = 'dataset_' + str(uuid())
    add_did(scope, dataset, DIDType.from_sym('DATASET'), 'jdoe')
    attach_dids(scope, dataset, attached, 'jdoe')

    dids = [{'scope': scope, 'name': dataset}]
    rule_id_1 = add_rule(dids=dids, account='jdoe', copies=1, rse_expression=self.rse1,
                         grouping='NONE', weight='fakeweight', lifetime=None, locked=False,
                         subscription_id=None)[0]
    add_rule(dids=dids, account='jdoe', copies=2, rse_expression=self.T1, grouping='NONE',
             weight='fakeweight', lifetime=None, locked=False, subscription_id=None)[0]
    add_rule(dids=dids, account='jdoe', copies=3, rse_expression=self.T1, grouping='NONE',
             weight='fakeweight', lifetime=None, locked=False, subscription_id=None)[0]

    delete_rule(rule_id_1)

    # The two surviving rules (2 + 3 copies) keep five locks per file
    for entry in attached:
        assert(len(get_replica_locks(scope=entry['scope'], name=entry['name'])) == 5)
    # TODO Need to check transfer queue here, this is actually not the check of this test case
    assert_raises(RuleNotFound, delete_rule, uuid())
def test_account_counter_rule_delete(self):
    """ REPLICATION RULE (CORE): Test if the account counter is updated correctly when a rule is removed"""
    scope = 'mock'
    attached = create_files(3, scope, self.rse1, bytes=100)
    dataset = 'dataset_' + str(uuid())
    add_did(scope, dataset, DIDType.from_sym('DATASET'), 'jdoe')
    attach_dids(scope, dataset, attached, 'jdoe')
    rule_id = add_rule(dids=[{'scope': scope, 'name': dataset}], account='jdoe', copies=1,
                       rse_expression=self.rse1, grouping='ALL', weight=None, lifetime=None,
                       locked=False, subscription_id=None)[0]
    account_update(once=True)
    account_counter_before = get_account_counter(self.rse1_id, 'jdoe')

    delete_rule(rule_id)
    account_update(once=True)

    # Check if the counter has been updated correctly: 3 files of 100 bytes each removed
    account_counter_after = get_account_counter(self.rse1_id, 'jdoe')
    assert(account_counter_before['bytes'] - 3 * 100 == account_counter_after['bytes'])
    assert(account_counter_before['files'] - 3 == account_counter_after['files'])
def delete_sync_rule(rule_id, session=None):
    """
    Delete a sync rule: mark all file replicas of the rule's dataset as
    unavailable on the rule's RSE, then hard-delete the rule (purging replicas,
    ignoring any soft-deletion grace period).

    :param rule_id: Id of the rule to remove; must target a dataset on one RSE.
    :param session: The database session in use.
    :raises RuntimeError: if the rule is not on a dataset or the rule's
                          rse_expression does not resolve to a single RSE.
    """
    rule = get_rule(rule_id, session=session)
    if rule["did_type"] != DIDType.DATASET:
        raise RuntimeError("Rule applies to did with wrong type")
    block = rule["name"]
    try:
        rse_id = get_rse_id(rse=rule["rse_expression"], session=session)
    except RSENotFound:
        # Expression matched zero (or several) RSEs, so no single id exists
        raise RuntimeError("Rule does not apply to a specific RSE")
    scope = rule["scope"]

    # Mark every file replica of the block as unavailable ('U') on that RSE.
    # (Removed the unused local `account`; replaced append-loop with a comprehension.)
    files = [
        {"scope": scope, "name": file["name"], "rse_id": rse_id, "state": "U"}
        for file in list_files(scope, block, long=False, session=session)
    ]
    update_replicas_states(replicas=files, add_tombstone=False, session=session)
    delete_rule(rule_id=rule_id, purge_replicas=True, soft=False, session=session)
def cleanup(self, session=None):
    # Tear down everything referencing the RSEs created by this factory,
    # in dependency order: transfers -> rules/locks -> replicas -> RSE metadata.
    if not self.created_rses:
        return

    # Cleanup Transfers
    session.query(models.Source).filter(or_(models.Source.dest_rse_id.in_(self.created_rses),
                                            models.Source.rse_id.in_(self.created_rses))).delete(synchronize_session=False)
    session.query(models.Request).filter(or_(models.Request.dest_rse_id.in_(self.created_rses),
                                             models.Request.source_rse_id.in_(self.created_rses))).delete(synchronize_session=False)

    # Cleanup Locks and Rules
    query = session.query(models.ReplicationRule.id). \
        join(models.ReplicaLock, models.ReplicationRule.id == models.ReplicaLock.rule_id). \
        filter(models.ReplicaLock.rse_id.in_(self.created_rses)).distinct()
    for rule_id, in query:
        # delete_rule also removes the replica locks held by the rule
        rule_core.delete_rule(rule_id, session=session)

    # Cleanup Replicas and Parent Datasets
    query = session.query(models.RSEFileAssociation.scope, models.RSEFileAssociation.name, models.RSEFileAssociation.rse_id). \
        filter(models.RSEFileAssociation.rse_id.in_(self.created_rses))
    dids_by_rse = {}
    # Group dids per RSE so each RSE is cleaned with a single delete_replicas call
    for scope, name, rse_id in query:
        dids_by_rse.setdefault(rse_id, []).append({'scope': scope, 'name': name})
    for rse_id, dids in dids_by_rse.items():
        replica_core.delete_replicas(rse_id=rse_id, files=dids, session=session)

    # Cleanup RSEs
    for model in (models.RSEAttrAssociation, models.RSEProtocols, models.UpdatedRSECounter,
                  models.RSEUsage, models.RSELimit, models.RSETransferLimit, models.RSEQoSAssociation):
        session.query(model).filter(model.rse_id.in_(self.created_rses)).delete(synchronize_session=False)
    session.query(models.Distance).filter(or_(models.Distance.src_rse_id.in_(self.created_rses),
                                              models.Distance.dest_rse_id.in_(self.created_rses))).delete(synchronize_session=False)
    for rse_id in self.created_rses:
        # Only archive RSE instead of deleting. Account handling code doesn't expect RSEs to ever be deleted.
        # So running test in parallel results in some tests failing on foreign key errors.
        rse_core.del_rse(rse_id, session=session)
def rule_cleaner(once=False):
    """
    Main loop to check for expired replication rules.

    :param once: perform only a single pass when no work would otherwise block.
    """
    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()
    paused_rules = {}  # {rule_id: datetime}
    # Make an initial heartbeat so that all judge-cleaners have the correct worker number on the next try
    live(executable='rucio-judge-cleaner', hostname=hostname, pid=pid, thread=current_thread)
    graceful_stop.wait(1)
    while not graceful_stop.is_set():
        try:
            # heartbeat
            heartbeat = live(executable='rucio-judge-cleaner', hostname=hostname, pid=pid, thread=current_thread)
            start = time.time()

            # Refresh paused rules: drop entries whose pause window has elapsed
            iter_paused_rules = deepcopy(paused_rules)
            for key in iter_paused_rules:
                if datetime.utcnow() > paused_rules[key]:
                    del paused_rules[key]

            rules = get_expired_rules(total_workers=heartbeat['nr_threads'] - 1,
                                      worker_number=heartbeat['assign_thread'],
                                      limit=200,
                                      blacklisted_rules=[key for key in paused_rules])
            logging.debug('rule_cleaner[%s/%s] index query time %f fetch size is %d' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, time.time() - start, len(rules)))

            if not rules and not once:
                logging.debug('rule_cleaner[%s/%s] did not get any work (paused_rules=%s)' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, str(len(paused_rules))))
                graceful_stop.wait(60)
            else:
                for rule in rules:
                    rule_id = rule[0]
                    rule_expression = rule[1]
                    logging.info('rule_cleaner[%s/%s]: Deleting rule %s with expression %s' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, rule_id, rule_expression))
                    if graceful_stop.is_set():
                        break
                    try:
                        start = time.time()
                        delete_rule(rule_id=rule_id, nowait=True)
                        logging.debug('rule_cleaner[%s/%s]: deletion of %s took %f' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, rule_id, time.time() - start))
                    # FIX: Python-2-only `except X, e:` syntax is a SyntaxError on
                    # Python 3 — replaced with `except X as e:`.
                    except (DatabaseException, DatabaseError, UnsupportedOperation) as e:
                        if match('.*ORA-00054.*', str(e.args[0])):
                            # Row locked by a concurrent judge: pause this rule for 10-40 minutes
                            paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(600, 2400))
                            record_counter('rule.judge.exceptions.LocksDetected')
                            logging.warning('rule_cleaner[%s/%s]: Locks detected for %s' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, rule_id))
                        elif match('.*QueuePool.*', str(e.args[0])):
                            logging.warning(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        elif match('.*ORA-03135.*', str(e.args[0])):
                            logging.warning(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        else:
                            logging.error(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        # FIX: `except RuleNotFound, e:` is Python-2-only; the bound name was unused.
        # NOTE(review): the visible chunk ends here, so this handler is attached to
        # the outer try to keep the function well-formed; in the full upstream file
        # it may belong to the inner per-rule try — confirm against the complete source.
        except RuleNotFound:
            pass
def rule_cleaner(once=False):
    """
    Main loop to check for expired replication rules
    """
    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()
    paused_rules = {}  # {rule_id: datetime}
    # Make an initial heartbeat so that all judge-cleaners have the correct worker number on the next try
    executable = 'judge-cleaner'
    heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
    prefix = 'judge-cleaner[%i/%i] ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    graceful_stop.wait(1)
    while not graceful_stop.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
            prefix = 'judge-cleaner[%i/%i] ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
            logger = formatted_logger(logging.log, prefix + '%s')
            start = time.time()

            # Refresh paused rules: drop entries whose pause window has elapsed
            iter_paused_rules = deepcopy(paused_rules)
            for key in iter_paused_rules:
                if datetime.utcnow() > paused_rules[key]:
                    del paused_rules[key]

            rules = get_expired_rules(total_workers=heartbeat['nr_threads'],
                                      worker_number=heartbeat['assign_thread'],
                                      limit=200,
                                      blocked_rules=[key for key in paused_rules])
            logger(logging.DEBUG, 'index query time %f fetch size is %d' % (time.time() - start, len(rules)))

            if not rules and not once:
                logger(logging.DEBUG, 'did not get any work (paused_rules=%s)' % str(len(paused_rules)))
                graceful_stop.wait(60)
            else:
                for rule in rules:
                    rule_id = rule[0]
                    rule_expression = rule[1]
                    logger(logging.INFO, 'Deleting rule %s with expression %s' % (rule_id, rule_expression))
                    if graceful_stop.is_set():
                        break
                    try:
                        start = time.time()
                        delete_rule(rule_id=rule_id, nowait=True)
                        logger(logging.DEBUG, 'deletion of %s took %f' % (rule_id, time.time() - start))
                    except (DatabaseException, DatabaseError, UnsupportedOperation) as e:
                        if match('.*ORA-00054.*', str(e.args[0])):
                            # Row locked by a concurrent judge: pause this rule for 10-40 minutes
                            paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(600, 2400))
                            record_counter('rule.judge.exceptions.LocksDetected')
                            logger(logging.WARNING, 'Locks detected for %s' % rule_id)
                        elif match('.*QueuePool.*', str(e.args[0])):
                            logger(logging.WARNING, 'DatabaseException', exc_info=True)
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        elif match('.*ORA-03135.*', str(e.args[0])):
                            logger(logging.WARNING, 'DatabaseException', exc_info=True)
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        else:
                            logger(logging.ERROR, 'DatabaseException', exc_info=True)
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                    except RuleNotFound:
                        pass
        # Outer handlers: failures of the batch query / heartbeat, not of one rule
        except (DatabaseException, DatabaseError) as e:
            if match('.*QueuePool.*', str(e.args[0])):
                logger(logging.WARNING, 'DatabaseException', exc_info=True)
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            elif match('.*ORA-03135.*', str(e.args[0])):
                logger(logging.WARNING, 'DatabaseException', exc_info=True)
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            else:
                logger(logging.CRITICAL, 'DatabaseException', exc_info=True)
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        except Exception as e:
            logger(logging.CRITICAL, 'DatabaseException', exc_info=True)
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        if once:
            break
    # Deregister the heartbeat on shutdown
    die(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
def __cleanup_locks_and_rules(self, session=None):
    """Delete every replication rule holding locks on the RSEs created by this factory."""
    rule_ids = (session.query(models.ReplicationRule.id)
                .join(models.ReplicaLock, models.ReplicationRule.id == models.ReplicaLock.rule_id)
                .filter(models.ReplicaLock.rse_id.in_(self.created_rses))
                .distinct())
    for (rid,) in rule_ids:
        rule_core.delete_rule(rid, session=session)
def rule_cleaner(once=False, process=0, total_processes=1, thread=0, threads_per_process=1):
    """
    Main loop to check for expired replication rules.

    :param once:                run a single pass and return.
    :param process:             index of this process.
    :param total_processes:     total number of processes.
    :param thread:              index of this thread within the process.
    :param threads_per_process: number of threads per process.
    """
    logging.info('rule_cleaner: starting')
    logging.info('rule_cleaner: started')
    paused_rules = {}  # {rule_id: datetime}
    while not graceful_stop.is_set():
        try:
            start = time.time()
            rules = get_expired_rules(total_workers=total_processes*threads_per_process-1,
                                      worker_number=process*threads_per_process+thread,
                                      limit=1000)
            logging.debug('rule_cleaner index query time %f fetch size is %d' % (time.time() - start, len(rules)))

            # Refresh paused rules: drop entries whose pause window has elapsed
            iter_paused_rules = deepcopy(paused_rules)
            for key in iter_paused_rules:
                if datetime.utcnow() > paused_rules[key]:
                    del paused_rules[key]

            # Remove paused rules from result set
            rules = [rule for rule in rules if rule[0] not in paused_rules]

            if not rules and not once:
                logging.info('rule_cleaner[%s/%s] did not get any work' % (process*threads_per_process+thread, total_processes*threads_per_process-1))
                time.sleep(10)
            else:
                record_gauge('rule.judge.cleaner.threads.%d' % (process*threads_per_process+thread), 1)
                for rule in rules:
                    rule_id = rule[0]
                    rule_expression = rule[1]
                    logging.info('rule_cleaner[%s/%s]: Deleting rule %s with expression %s' % (process*threads_per_process+thread, total_processes*threads_per_process-1, rule_id, rule_expression))
                    if graceful_stop.is_set():
                        break
                    try:
                        start = time.time()
                        delete_rule(rule_id=rule_id, nowait=True)
                        logging.debug('rule_cleaner[%s/%s]: deletion of %s took %f' % (process*threads_per_process+thread, total_processes*threads_per_process-1, rule_id, time.time() - start))
                    # FIX: Python-2-only `except X, e:` syntax is a SyntaxError on
                    # Python 3 — replaced with `except X as e:`.
                    except (DatabaseException, DatabaseError, AccessDenied) as e:
                        if isinstance(e.args[0], tuple):
                            if match('.*ORA-00054.*', e.args[0][0]):
                                # Row locked by a concurrent judge: pause this rule for 1-10 minutes
                                paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                                record_counter('rule.judge.exceptions.LocksDetected')
                                logging.warning('rule_cleaner[%s/%s]: Locks detected for %s' % (process*threads_per_process+thread, total_processes*threads_per_process-1, rule_id))
                            else:
                                logging.error(traceback.format_exc())
                                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        else:
                            logging.error(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                record_gauge('rule.judge.cleaner.threads.%d' % (process*threads_per_process+thread), 0)
        # FIX: `except Exception, e:` (Python-2-only) replaced with `as e`.
        except Exception as e:
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            record_gauge('rule.judge.cleaner.threads.%d' % (process*threads_per_process+thread), 0)
            logging.critical(traceback.format_exc())
        if once:
            return
def __cleanup_locks_and_rules(self, rules_to_remove, session=None):
    """Delete each rule in *rules_to_remove* (an iterable of 1-tuples of rule ids)."""
    for (rid,) in rules_to_remove:
        rule_core.delete_rule(rid, session=session)
def run_once(paused_rules, heartbeat_handler, **_kwargs):
    # Single pass of the judge-cleaner: fetch one batch of expired rules and
    # delete them. `paused_rules` maps rule_id -> datetime until which the rule
    # is skipped (it is mutated in place so pauses persist across passes).
    worker_number, total_workers, logger = heartbeat_handler.live()
    try:
        start = time.time()

        # Refresh paused rules: drop entries whose pause window has elapsed
        iter_paused_rules = deepcopy(paused_rules)
        for key in iter_paused_rules:
            if datetime.utcnow() > paused_rules[key]:
                del paused_rules[key]

        rules = get_expired_rules(total_workers=total_workers,
                                  worker_number=worker_number,
                                  limit=200,
                                  blocked_rules=[key for key in paused_rules])
        logger(logging.DEBUG, 'index query time %f fetch size is %d' % (time.time() - start, len(rules)))

        if not rules:
            logger(logging.DEBUG, 'did not get any work (paused_rules=%s)' % str(len(paused_rules)))
            return

        for rule in rules:
            # Refresh the logger from the heartbeat for each rule processed
            _, _, logger = heartbeat_handler.live()
            rule_id = rule[0]
            rule_expression = rule[1]
            logger(logging.INFO, 'Deleting rule %s with expression %s' % (rule_id, rule_expression))
            if graceful_stop.is_set():
                break
            try:
                start = time.time()
                delete_rule(rule_id=rule_id, nowait=True)
                logger(logging.DEBUG, 'deletion of %s took %f' % (rule_id, time.time() - start))
            except (DatabaseException, DatabaseError, UnsupportedOperation) as e:
                if match('.*ORA-00054.*', str(e.args[0])):
                    # Row locked by a concurrent judge: pause this rule for 10-40 minutes
                    paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(600, 2400))
                    record_counter('rule.judge.exceptions.{exception}', labels={'exception': 'LocksDetected'})
                    logger(logging.WARNING, 'Locks detected for %s' % rule_id)
                elif match('.*QueuePool.*', str(e.args[0])):
                    logger(logging.WARNING, 'DatabaseException', exc_info=True)
                    record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
                elif match('.*ORA-03135.*', str(e.args[0])):
                    logger(logging.WARNING, 'DatabaseException', exc_info=True)
                    record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
                else:
                    logger(logging.ERROR, 'DatabaseException', exc_info=True)
                    record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
            except RuleNotFound:
                pass
    # Outer handlers: failures of the batch query itself, not of a single rule
    except (DatabaseException, DatabaseError) as e:
        if match('.*QueuePool.*', str(e.args[0])):
            logger(logging.WARNING, 'DatabaseException', exc_info=True)
            record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
        elif match('.*ORA-03135.*', str(e.args[0])):
            logger(logging.WARNING, 'DatabaseException', exc_info=True)
            record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
        else:
            logger(logging.CRITICAL, 'DatabaseException', exc_info=True)
            record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
    except Exception as e:
        logger(logging.CRITICAL, 'DatabaseException', exc_info=True)
        record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})