Beispiel #1
0
    def test_to_repair_a_rule_with_only_1_rse_whose_transfers_failed(self):
        """ JUDGE REPAIRER: Test to repair a rule with only 1 rse whose transfers failed (lock)"""

        rule_repairer(once=True)  # Clean out the repairer
        scope = InternalScope('mock', **self.vo)
        files = create_files(4, scope, self.rse4_id, bytes=100)
        dataset = 'dataset_' + str(uuid())
        add_did(scope, dataset, DIDType.DATASET, self.jdoe)
        attach_dids(scope, dataset, files, self.jdoe)

        rule_id = add_rule(dids=[{'scope': scope, 'name': dataset}], account=self.jdoe, copies=1, rse_expression=self.rse1, grouping='DATASET', weight=None, lifetime=None, locked=False, subscription_id=None)[0]

        successful_transfer(scope=scope, name=files[0]['name'], rse_id=get_replica_locks(scope=files[0]['scope'], name=files[2]['name'])[0].rse_id, nowait=False)
        successful_transfer(scope=scope, name=files[1]['name'], rse_id=get_replica_locks(scope=files[1]['scope'], name=files[2]['name'])[0].rse_id, nowait=False)
        failed_transfer(scope=scope, name=files[2]['name'], rse_id=get_replica_locks(scope=files[2]['scope'], name=files[2]['name'])[0].rse_id)
        failed_transfer(scope=scope, name=files[3]['name'], rse_id=get_replica_locks(scope=files[3]['scope'], name=files[3]['name'])[0].rse_id)
        cancel_request_did(scope=scope, name=files[2]['name'], dest_rse_id=get_replica_locks(scope=files[2]['scope'], name=files[2]['name'])[0].rse_id)
        cancel_request_did(scope=scope, name=files[3]['name'], dest_rse_id=get_replica_locks(scope=files[3]['scope'], name=files[2]['name'])[0].rse_id)

        assert(rule_id == get_rule(rule_id)['id'].replace('-', '').lower())
        assert(RuleState.STUCK == get_rule(rule_id)['state'])
        rule_repairer(once=True)

        # Stil assert STUCK because of delays:
        assert(RuleState.STUCK == get_rule(rule_id)['state'])
        assert(get_replica_locks(scope=files[2]['scope'], name=files[2]['name'])[0].rse_id == get_replica_locks(scope=files[3]['scope'], name=files[3]['name'])[0].rse_id)
    def test_to_repair_a_rule_with_DATASET_grouping_whose_transfer_failed(
            self):
        """ JUDGE REPAIRER: Test to repair a rule with 1 failed transfer (lock)"""

        rule_repairer(once=True)  # Clean out the repairer
        scope = InternalScope('mock')
        files = create_files(4, scope, self.rse4_id, bytes=100)
        dataset = 'dataset_' + str(uuid())
        add_did(scope, dataset, DIDType.from_sym('DATASET'), self.jdoe)
        attach_dids(scope, dataset, files, self.jdoe)

        rule_id = add_rule(dids=[{
            'scope': scope,
            'name': dataset
        }],
                           account=self.jdoe,
                           copies=1,
                           rse_expression=self.T1,
                           grouping='DATASET',
                           weight=None,
                           lifetime=None,
                           locked=False,
                           subscription_id=None,
                           activity='DebugJudge')[0]

        successful_transfer(
            scope=scope,
            name=files[0]['name'],
            rse_id=get_replica_locks(scope=files[0]['scope'],
                                     name=files[2]['name'])[0].rse_id,
            nowait=False)
        successful_transfer(
            scope=scope,
            name=files[1]['name'],
            rse_id=get_replica_locks(scope=files[1]['scope'],
                                     name=files[2]['name'])[0].rse_id,
            nowait=False)
        failed_transfer(
            scope=scope,
            name=files[2]['name'],
            rse_id=get_replica_locks(scope=files[2]['scope'],
                                     name=files[2]['name'])[0].rse_id)
        failed_transfer(
            scope=scope,
            name=files[3]['name'],
            rse_id=get_replica_locks(scope=files[3]['scope'],
                                     name=files[3]['name'])[0].rse_id)

        assert (rule_id == get_rule(rule_id)['id'].replace('-', '').lower())
        assert (RuleState.STUCK == get_rule(rule_id)['state'])
        rule_repairer(once=True)
        assert (RuleState.REPLICATING == get_rule(rule_id)['state'])
        assert (get_replica_locks(
            scope=files[2]['scope'],
            name=files[2]['name'])[0].rse_id == get_replica_locks(
                scope=files[3]['scope'], name=files[3]['name'])[0].rse_id)
        assert (get_replica_locks(
            scope=files[1]['scope'],
            name=files[1]['name'])[0].rse_id == get_replica_locks(
                scope=files[3]['scope'], name=files[3]['name'])[0].rse_id)
Beispiel #3
0
    def test_to_repair_a_rule_with_NONE_grouping_whose_transfer_failed(self):
        """ JUDGE REPAIRER: Test to repair a rule with 1 failed transfer (lock)"""

        rule_repairer(once=True)  # Clean out the repairer
        scope = InternalScope('mock', **self.vo)
        files = create_files(3, scope, self.rse4_id, bytes=100)
        dataset = 'dataset_' + str(uuid())
        add_did(scope, dataset, DIDType.DATASET, self.jdoe)
        attach_dids(scope, dataset, files, self.jdoe)

        rule_id = add_rule(dids=[{'scope': scope, 'name': dataset}], account=self.jdoe, copies=1, rse_expression=self.T1, grouping='NONE', weight=None, lifetime=None, locked=False, subscription_id=None)[0]

        failed_rse_id = get_replica_locks(scope=files[2]['scope'], name=files[2]['name'])[0].rse_id
        assert(get_replica(scope=files[2]['scope'], name=files[2]['name'], rse_id=failed_rse_id)['state'] == ReplicaState.COPYING)
        assert(get_replica(scope=files[2]['scope'], name=files[2]['name'], rse_id=failed_rse_id)['lock_cnt'] == 1)

        successful_transfer(scope=scope, name=files[0]['name'], rse_id=get_replica_locks(scope=files[0]['scope'], name=files[2]['name'])[0].rse_id, nowait=False)
        successful_transfer(scope=scope, name=files[1]['name'], rse_id=get_replica_locks(scope=files[1]['scope'], name=files[2]['name'])[0].rse_id, nowait=False)
        failed_transfer(scope=scope, name=files[2]['name'], rse_id=get_replica_locks(scope=files[2]['scope'], name=files[2]['name'])[0].rse_id)

        assert(rule_id == get_rule(rule_id)['id'].replace('-', '').lower())
        assert(RuleState.STUCK == get_rule(rule_id)['state'])
        rule_repairer(once=True)
        assert(RuleState.REPLICATING == get_rule(rule_id)['state'])
        assert(get_replica(scope=files[2]['scope'], name=files[2]['name'], rse_id=failed_rse_id)['state'] == ReplicaState.UNAVAILABLE)
        assert(get_replica(scope=files[2]['scope'], name=files[2]['name'], rse_id=failed_rse_id)['lock_cnt'] == 0)
    def test_to_repair_a_rule_with_only_1_rse_whose_transfers_failed(self):
        """ JUDGE REPAIRER: Test to repair a rule with only 1 rse whose transfers failed (lock)"""

        rule_repairer(once=True)  # Clean out the repairer
        scope = 'mock'
        files = create_files(4, scope, self.rse4, bytes=100)
        dataset = 'dataset_' + str(uuid())
        add_did(scope, dataset, DIDType.from_sym('DATASET'), 'jdoe')
        attach_dids(scope, dataset, files, 'jdoe')

        rule_id = add_rule(dids=[{'scope': scope, 'name': dataset}], account='jdoe', copies=1, rse_expression=self.rse1, grouping='DATASET', weight=None, lifetime=None, locked=False, subscription_id=None)[0]

        successful_transfer(scope=scope, name=files[0]['name'], rse_id=get_replica_locks(scope=files[0]['scope'], name=files[2]['name'])[0].rse_id, nowait=False)
        successful_transfer(scope=scope, name=files[1]['name'], rse_id=get_replica_locks(scope=files[1]['scope'], name=files[2]['name'])[0].rse_id, nowait=False)
        failed_transfer(scope=scope, name=files[2]['name'], rse_id=get_replica_locks(scope=files[2]['scope'], name=files[2]['name'])[0].rse_id)
        failed_transfer(scope=scope, name=files[3]['name'], rse_id=get_replica_locks(scope=files[3]['scope'], name=files[3]['name'])[0].rse_id)
        cancel_request_did(scope=scope, name=files[2]['name'], dest_rse_id=get_replica_locks(scope=files[2]['scope'], name=files[2]['name'])[0].rse_id)
        cancel_request_did(scope=scope, name=files[3]['name'], dest_rse_id=get_replica_locks(scope=files[3]['scope'], name=files[2]['name'])[0].rse_id)

        assert(rule_id == get_rule(rule_id)['id'].replace('-', '').lower())
        assert(RuleState.STUCK == get_rule(rule_id)['state'])
        rule_repairer(once=True)
        # Stil assert STUCK because of delays:
        assert(RuleState.STUCK == get_rule(rule_id)['state'])
        assert(get_replica_locks(scope=files[2]['scope'], name=files[2]['name'])[0].rse_id == get_replica_locks(scope=files[3]['scope'], name=files[3]['name'])[0].rse_id)
Beispiel #5
0
    def test_to_repair_a_rule_with_only_1_rse_whose_site_is_blocklisted(self):
        """ JUDGE REPAIRER: Test to repair a rule with only 1 rse whose site is blocklisted"""

        rse = rse_name_generator()
        rse_id = add_rse(rse, **self.vo)
        set_local_account_limit(self.jdoe, rse_id, -1)
        rule_repairer(once=True)  # Clean out the repairer

        region = make_region().configure('dogpile.cache.memcached',
                                         expiration_time=900,
                                         arguments={
                                             'url':
                                             config_get(
                                                 'cache', 'url', False,
                                                 '127.0.0.1:11211'),
                                             'distributed_lock':
                                             True
                                         })

        def change_availability(new_value):
            update_rse(rse_id, {'availability_write': new_value})
            # clear cache
            region.delete(sha256(rse.encode()).hexdigest())

        for grouping, ignore_availability in itertools.product(
            ["NONE", "DATASET", "ALL"], [True, False]):
            scope = InternalScope('mock', **self.vo)
            files = create_files(1, scope, self.rse4_id, bytes_=100)
            dataset = 'dataset_' + str(uuid())
            add_did(scope, dataset, DIDType.DATASET, self.jdoe)
            attach_dids(scope, dataset, files, self.jdoe)

            if ignore_availability:
                change_availability(False)
                rule_id = add_rule(dids=[{
                    'scope': scope,
                    'name': dataset
                }],
                                   account=self.jdoe,
                                   copies=1,
                                   rse_expression=rse,
                                   grouping=grouping,
                                   weight=None,
                                   lifetime=None,
                                   locked=False,
                                   subscription_id=None,
                                   ignore_availability=ignore_availability,
                                   activity='DebugJudge')[0]
                assert (RuleState.STUCK == get_rule(rule_id)['state'])

                rule_repairer(once=True)
                assert (RuleState.REPLICATING == get_rule(rule_id)['state'])

                change_availability(True)
            else:
                rule_id = add_rule(dids=[{
                    'scope': scope,
                    'name': dataset
                }],
                                   account=self.jdoe,
                                   copies=1,
                                   rse_expression=rse,
                                   grouping=grouping,
                                   weight=None,
                                   lifetime=None,
                                   locked=False,
                                   subscription_id=None,
                                   ignore_availability=ignore_availability,
                                   activity='DebugJudge')[0]
                failed_transfer(scope=scope,
                                name=files[0]['name'],
                                rse_id=get_replica_locks(
                                    scope=files[0]['scope'],
                                    name=files[0]['name'])[0].rse_id)
                change_availability(False)
                assert (RuleState.STUCK == get_rule(rule_id)['state'])

                rule_repairer(once=True)
                assert (RuleState.STUCK == get_rule(rule_id)['state'])

                change_availability(True)
                rule_repairer(once=True)
                assert (RuleState.REPLICATING == get_rule(rule_id)['state'])
Beispiel #6
0
def update_bad_request(req, dest_rse, new_state, detail, session=None):
    if new_state == RequestState.FAILED:
        request.set_request_state(req['request_id'], new_state, session=session)

        activity = 'default'
        if req['attributes']:
            if type(req['attributes']) is dict:
                req_attributes = json.loads(json.dumps(req['attributes']))
            else:
                req_attributes = json.loads(str(req['attributes']))
            activity = req_attributes['activity'] if req_attributes['activity'] else 'default'

        tss = time.time()
        add_message('transfer-failed', {'activity': activity,
                                        'request-id': req['request_id'],
                                        'checksum-adler': None,
                                        'checksum-md5': None,
                                        'dst-rse': dest_rse,
                                        'dst-url': None,
                                        'name': req['name'],
                                        'guid': None,
                                        'file-size': None,
                                        'previous-request-id': req['request_id'],
                                        'protocol': None,
                                        'reason': detail,
                                        'transfer-link': None,
                                        'scope': req['scope'],
                                        'src-rse': None,
                                        'src-url': None,
                                        'tool-id': 'rucio-conveyor',
                                        'transfer-endpoint': config_get('conveyor', 'ftshosts'),
                                        'transfer-id': None},
                    session=session)

        request.archive_request(req['request_id'], session=session)
        logging.error('BAD DID %s:%s REQUEST %s details: %s' % (req['scope'],
                                                                req['name'],
                                                                req['request_id'],
                                                                detail))
        try:
            replica.update_replicas_states([{'rse': dest_rse,
                                             'scope': req['scope'],
                                             'name': req['name'],
                                             'state': ReplicaState.UNAVAILABLE}],
                                           session=session)
        except:
            logging.critical("Could not update replica state for failed transfer %s:%s at %s (%s)" % (req['scope'],
                                                                                                      req['name'],
                                                                                                      dest_rse,
                                                                                                      traceback.format_exc()))
            raise

        tss = time.time()
        try:
            lock.failed_transfer(req['scope'],
                                 req['name'],
                                 req['dest_rse_id'],
                                 session=session)
        except:
            logging.warn('Could not update lock for failed transfer %s:%s at %s (%s)' % (req['scope'],
                                                                                         req['name'],
                                                                                         dest_rse,
                                                                                         traceback.format_exc()))
            raise
        record_timer('daemons.conveyor.common.update_request_state.lock-failed_transfer', (time.time()-tss)*1000)
Beispiel #7
0
                    replica.update_replicas_states([{'rse': rse_update_name,
                                                     'scope': response['scope'],
                                                     'name': response['name'],
                                                     'state': ReplicaState.UNAVAILABLE}],
                                                   session=session)
                except:
                    logging.critical("Could not update replica state for failed transfer %s:%s at %s (%s)" % (response['scope'],
                                                                                                              response['name'],
                                                                                                              rse_update_name,
                                                                                                              traceback.format_exc()))
                    raise

                tss = time.time()
                try:
                    lock.failed_transfer(response['scope'],
                                         response['name'],
                                         rse_core.get_rse_id(rse=rse_update_name, session=session),
                                         session=session)
                except:
                    logging.warn('Could not update lock for failed transfer %s:%s at %s (%s)' % (response['scope'],
                                                                                                 response['name'],
                                                                                                 rse_update_name,
                                                                                                 traceback.format_exc()))
                    raise

                record_timer('daemons.conveyor.common.update_request_state.lock-failed_transfer', (time.time()-tss)*1000)
            else:
                logging.warn('REQUEUED DID %s:%s REQUEST %s AS %s TRY %s' % (response['scope'],
                                                                             response['name'],
                                                                             response['request_id'],
                                                                             new_req['request_id'],
                                                                             new_req['retry_count']))