Example #1
File: did_upload.py  Project: dmwm/CMSRucio
def upload(rucio_client, files, temp_rse):
    from rucio.client.uploadclient import UploadClient
    uclient = UploadClient(logger=logger)

    items = [{
        "path": file["file_path"],
        "rse": temp_rse,
        "pfn": file["pfn"],
        "name": file["lfn"],
        "did_name": file["lfn"],
        "no_register": True,
    } for file in files]

    # ANSI escape codes for coloured log output (note: 35 is magenta)
    blue = "\x1b[35;20m"
    reset = "\x1b[0m"

    # upload the files to the temporary RSE
    uclient.upload(items)

    # collect metadata about the uploaded files (uses a private UploadClient helper)
    files = uclient._collect_and_validate_file_info(items)

    # register each uploaded replica in the Rucio catalogue
    for file in files:
        register_temp_replica(rucio_client, uclient, file)
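For contrast with the manual-registration pattern above ('no_register': True plus an explicit catalogue call), here is a minimal sketch of the simplest UploadClient call, letting the client register the replica itself; the path, RSE, and scope values below are purely illustrative:

from rucio.client.uploadclient import UploadClient

# one item dict per file; 'did_scope' and 'did_name' name the DID to create
item = {
    'path': '/tmp/myfile.dat',        # local file (illustrative)
    'rse': 'MOCK4',                   # destination RSE (illustrative)
    'did_scope': 'user.jdoe',         # DID scope (illustrative)
    'did_name': 'myfile.dat',
    'register_after_upload': True,    # register the replica after the transfer
}

# upload() takes a list of items and returns 0 when every file succeeded
UploadClient().upload([item])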
Example #2
File: rucio_sitemover.py  Project: PanDAWMS/pilot
    def _stageOutApi(self, src, fspec):

        from rucio.client.uploadclient import UploadClient

        # rucio logger init.
        rucio_logger = logging.getLogger('rucio_mover')
        upload_client = UploadClient(logger=rucio_logger)

        # tracing is configured from the mover settings (guard for older clients)
        if hasattr(upload_client, 'tracing'):
            upload_client.tracing = self.tracing

        # file specifications before the upload
        f = {}
        f['path'] = fspec.pfn if fspec.pfn else fspec.lfn
        f['rse'] = fspec.ddmendpoint
        f['did_scope'] = fspec.scope
        f['no_register'] = True

        #if fspec.filesize:
        #    f['transfer_timeout'] = self.getTimeOut(fspec.filesize) # too harsh, max 3 hours

        if fspec.storageId and int(fspec.storageId) > 0:
            if not self.isDeterministic(fspec.ddmendpoint):
                f['pfn'] = fspec.turl
        elif fspec.lfn and '.root' in fspec.lfn:
            f['guid'] = fspec.guid

        # process the upload
        tolog('_stageOutApi: %s' % str(f))
        upload_client.upload([f])

        return {'ddmendpoint': fspec.ddmendpoint,
                'surl': fspec.surl,
                'pfn': fspec.lfn}
Example #3
class TestAbacusAccount(unittest.TestCase):

    def setUp(self):
        self.rse = 'MOCK4'
        self.file_sizes = 2
        self.upload_client = UploadClient()
        self.account_client = AccountClient()
        self.session = get_session()

        if config_get_bool('common', 'multi_vo', raise_exception=False, default=False):
            self.vo = {'vo': config_get('client', 'vo', raise_exception=False, default='tst')}
        else:
            self.vo = {}

        self.account = InternalAccount('root', **self.vo)
        self.scope = InternalScope('mock', **self.vo)
        self.rse_id = get_rse_id(self.rse, session=self.session, **self.vo)

    def tearDown(self):
        undertaker.run(once=True)
        cleaner.run(once=True)
        if self.vo:
            reaper.run(once=True, include_rses='vo=%s&(%s)' % (self.vo['vo'], self.rse), greedy=True)
        else:
            reaper.run(once=True, include_rses=self.rse, greedy=True)

    def test_abacus_account(self):
        """ ABACUS (ACCOUNT): Test update of account usage """
        self.session.query(models.UpdatedAccountCounter).delete()  # pylint: disable=no-member
        self.session.query(models.AccountUsage).delete()  # pylint: disable=no-member
        self.session.commit()  # pylint: disable=no-member

        # Upload files -> account usage should increase
        self.files = [{'did_scope': self.scope.external, 'did_name': 'file_' + generate_uuid(), 'path': file_generator(size=self.file_sizes), 'rse': self.rse, 'lifetime': -1} for i in range(0, 2)]
        self.upload_client.upload(self.files)
        [os.remove(file['path']) for file in self.files]
        account.run(once=True)
        account_usage = get_local_account_usage(account=self.account, rse_id=self.rse_id)[0]
        assert account_usage['bytes'] == len(self.files) * self.file_sizes
        assert account_usage['files'] == len(self.files)

        # Update and check the account history with the core method
        update_account_counter_history(account=self.account, rse_id=self.rse_id)
        usage_history = get_usage_history(rse_id=self.rse_id, account=self.account)
        assert usage_history[-1]['bytes'] == len(self.files) * self.file_sizes
        assert usage_history[-1]['files'] == len(self.files)

        # Check the account history with the client
        usage_history = self.account_client.get_account_usage_history(rse=self.rse, account=self.account.external)
        assert usage_history[-1]['bytes'] == len(self.files) * self.file_sizes
        assert usage_history[-1]['files'] == len(self.files)

        # Delete rules -> account usage should decrease
        cleaner.run(once=True)
        account.run(once=True)
        # set account limit because return value of get_local_account_usage differs if a limit is set or not
        set_local_account_limit(account=self.account, rse_id=self.rse_id, bytes=10)
        account_usages = get_local_account_usage(account=self.account, rse_id=self.rse_id)[0]
        assert account_usages['bytes'] == 0
        assert account_usages['files'] == 0
Example #4
File: test_abacus_rse.py  Project: zlion/rucio
class TestAbacusRSE():
    def setUp(self):
        self.account = 'root'
        self.scope = 'mock'
        self.upload_client = UploadClient()
        self.file_sizes = 2
        self.rse = 'MOCK4'
        self.rse_id = get_rse_id(self.rse)
        self.session = get_session()

    def tearDown(self):
        undertaker.run(once=True)
        cleaner.run(once=True)
        reaper.run(once=True, rses=[self.rse], greedy=True)

    def test_abacus_rse(self):
        """ ABACUS (RSE): Test update of RSE usage. """
        # Get RSE usage of all sources
        self.session.query(models.UpdatedRSECounter).delete()  # pylint: disable=no-member
        self.session.query(models.RSEUsage).delete()  # pylint: disable=no-member
        self.session.commit()  # pylint: disable=no-member

        # Upload files -> RSE usage should increase
        self.files = [{
            'did_scope': self.scope,
            'did_name': 'file_' + generate_uuid(),
            'path': file_generator(size=self.file_sizes),
            'rse': self.rse,
            'lifetime': -1
        } for i in range(0, 2)]
        self.upload_client.upload(self.files)
        [os.remove(file['path']) for file in self.files]
        rse.run(once=True)
        rse_usage = get_rse_usage(rse_id=self.rse_id)[0]
        assert_equal(rse_usage['used'], len(self.files) * self.file_sizes)
        rse_usage_from_rucio = get_rse_usage(rse_id=self.rse_id,
                                             source='rucio')[0]
        assert_equal(rse_usage_from_rucio['used'],
                     len(self.files) * self.file_sizes)
        rse_usage_from_unavailable = get_rse_usage(rse_id=self.rse_id,
                                                   source='unavailable')
        assert_equal(len(rse_usage_from_unavailable), 0)

        # Delete files -> rse usage should decrease
        cleaner.run(once=True)
        reaper.run(once=True, rses=[self.rse], greedy=True)
        rse.run(once=True)
        rse_usage = get_rse_usage(rse_id=self.rse_id)[0]
        assert_equal(rse_usage['used'], 0)
        rse_usage_from_rucio = get_rse_usage(rse_id=self.rse_id,
                                             source='rucio')[0]
        assert_equal(rse_usage_from_rucio['used'], 0)
        rse_usage_from_unavailable = get_rse_usage(rse_id=self.rse_id,
                                                   source='unavailable')
        assert_equal(len(rse_usage_from_unavailable), 0)
Example #5
File: rucio.py  Project: esseivaju/pilot2
def _stage_out_api(fspec, summary_file_path, trace_report, trace_report_out, transfer_timeout):

    # init. upload client
    from rucio.client.uploadclient import UploadClient
    upload_client = UploadClient(logger=logger)

    # tracing is enabled only if tracing_rucio is set
    if hasattr(upload_client, 'tracing'):
        upload_client.tracing = tracing_rucio
    if tracing_rucio:
        upload_client.trace = trace_report

    # file specifications before the upload
    f = {}
    f['path'] = fspec.surl or getattr(fspec, 'pfn', None) or os.path.join(fspec.workdir, fspec.lfn)
    f['rse'] = fspec.ddmendpoint
    f['did_scope'] = fspec.scope
    f['no_register'] = True

    if transfer_timeout:
        f['transfer_timeout'] = transfer_timeout

    # if fspec.storageId and int(fspec.storageId) > 0:
    #     if fspec.turl and fspec.is_nondeterministic:
    #         f['pfn'] = fspec.turl
    # elif fspec.lfn and '.root' in fspec.lfn:
    #     f['guid'] = fspec.guid
    if fspec.lfn and '.root' in fspec.lfn:
        f['guid'] = fspec.guid

    # proceed with the upload
    logger.info('_stage_out_api: %s' % str(f))
    result = None

    # upload client raises an exception if any file failed
    try:
        result = upload_client.upload([f], summary_file_path=summary_file_path, traces_copy_out=trace_report_out)
    except UnboundLocalError:
        logger.warning('rucio still needs a bug fix of the summary in the uploadclient')

    logger.debug('Rucio upload client returned %s' % result)

    try:
        file_exists = verify_stage_out(fspec)
        logger.info('File exists at the storage: %s' % str(file_exists))
        if not file_exists:
            raise StageOutFailure('stageOut: Physical check after upload failed.')
    except Exception as e:
        msg = 'stageOut: File existence verification failed with: %s' % str(e)
        logger.info(msg)
        raise StageOutFailure(msg)

    return trace_report_out
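The traces_copy_out argument used above also lets callers inspect per-file upload states after a failure; a minimal sketch under the same assumptions as the pilot code (item values illustrative, field access limited to the 'stateReason' key the pilot examples themselves check):

from rucio.client.uploadclient import UploadClient

# illustrative item; keys as in the snippets above
item = {'path': '/tmp/myfile.dat', 'rse': 'MOCK4', 'did_scope': 'user.jdoe'}

traces = []
try:
    UploadClient().upload([item], traces_copy_out=traces)
except Exception:
    # each trace is a dict; 'stateReason' carries the failure cause
    for trace in traces:
        print(trace.get('stateReason'))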
Example #6
def stage_out():
    disk = os.environ['MINIPILOT_STAGEOUT_RSE']
    configfile = json.load(open(sys.argv[2]))
    u = UploadClient()
    outputs = configfile['outputs']
    for output in outputs:
        print('output', output)
        toupload = [{
            'path': output,
            'rse': disk,
            'did_name': 'user.{user}.{taskid}._{subjobid}.{output}'.format(
                user=configfile['user'],
                taskid=str(configfile['taskid']).zfill(8),
                subjobid=str(configfile['subjobid']).zfill(6),
                output=output),
            'did_scope': 'user.{user}'.format(user=configfile['user'])
        }]
        print(json.dumps(toupload))
        u.upload(toupload)
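For illustration, the DID naming scheme above yields names like the following (user and task values invented):

name = 'user.{user}.{taskid}._{subjobid}.{output}'.format(
    user='jdoe',
    taskid=str(42).zfill(8),
    subjobid=str(7).zfill(6),
    output='hits.root')
assert name == 'user.jdoe.00000042._000007.hits.root'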
Example #7
class TestDownloadClient(unittest.TestCase):

    def setUp(self):
        if config_get_bool('common', 'multi_vo', raise_exception=False, default=False):
            self.vo = {'vo': config_get('client', 'vo', raise_exception=False, default='tst')}
        else:
            self.vo = {}

        logger = logging.getLogger('dlul_client')
        logger.addHandler(logging.StreamHandler())
        logger.setLevel(logging.DEBUG)
        self.client = Client()
        self.upload_client = UploadClient(_client=self.client, logger=logger)
        self.download_client = DownloadClient(client=self.client, logger=logger)

        self.file_path = file_generator()
        self.scope = 'mock'
        self.name = os.path.basename(self.file_path)
        self.rse = 'MOCK4'
        self.guid = generate_uuid()

        item = {'path': self.file_path,
                'rse': self.rse,
                'did_scope': self.scope,
                'did_name': self.name,
                'guid': self.guid}
        assert self.upload_client.upload([item]) == 0

    def tearDown(self):
        shutil.rmtree('mock')

    def test_download_item(self):
        """ DOWNLOAD (CLIENT): Download DIDs """

        # Download specific DID
        result = self.download_client.download_dids([{'did': '%s:%s' % (self.scope, self.name)}])
        assert result

        # Download with wildcard
        result = self.download_client.download_dids([{'did': '%s:%s' % (self.scope, self.name[:-2] + '*')}])
        assert result

        # Download with filter
        result = self.download_client.download_dids([{'filters': {'guid': self.guid, 'scope': self.scope}}])
        assert result

        # Download with wildcard and name
        result = self.download_client.download_dids([{'did': '%s:%s' % (self.scope, '*'), 'filters': {'guid': self.guid}}])
        assert result
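The list returned by download_dids carries one dictionary per file; a minimal sketch of inspecting it (field names taken from the later download examples, DID and path illustrative):

from rucio.client.downloadclient import DownloadClient

results = DownloadClient().download_dids([{'did': 'mock:myfile.dat', 'base_dir': '/tmp'}])
for entry in results:
    # 'clientState' is e.g. 'DONE' or 'ALREADY_DONE'; 'dest_file_paths' lists the local copies
    print(entry['did'], entry['clientState'], entry['dest_file_paths'])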
Example #8
def _stage_out_api(fspec, summary_file_path, trace_report):

    # init. upload client
    from rucio.client.uploadclient import UploadClient
    upload_client = UploadClient()

    # tracing is enabled only if tracing_rucio is set
    if hasattr(upload_client, 'tracing'):
        upload_client.tracing = tracing_rucio
    if tracing_rucio:
        upload_client.trace = trace_report

    # file specifications before the upload
    f = {}
    f['path'] = fspec.surl or getattr(fspec, 'pfn', None) or os.path.join(
        fspec.workdir, fspec.lfn)
    f['rse'] = fspec.ddmendpoint
    f['did_scope'] = fspec.scope
    f['no_register'] = True

    if fspec.filesize:
        f['transfer_timeout'] = get_timeout(fspec.filesize)

    # if fspec.storageId and int(fspec.storageId) > 0:
    #     if fspec.turl and fspec.is_nondeterministic:
    #         f['pfn'] = fspec.turl
    # elif fspec.lfn and '.root' in fspec.lfn:
    #     f['guid'] = fspec.guid
    if fspec.lfn and '.root' in fspec.lfn:
        f['guid'] = fspec.guid

    # proceed with the upload
    logger.info('_stage_out_api: %s' % str(f))
    result = None
    try:
        result = upload_client.upload([f], summary_file_path)
    except UnboundLocalError:
        logger.warning(
            'rucio still needs a bug fix of the summary in the uploadclient')
        result = 0

    client_state = 'FAILED'
    if result == 0:
        client_state = 'DONE'

    return client_state
Example #9
class TestAbacusAccount():
    def setUp(self):
        self.account = InternalAccount('root')
        self.scope = InternalScope('mock')
        self.upload_client = UploadClient()
        self.account_client = AccountClient()
        self.file_sizes = 2
        self.rse = 'MOCK4'
        self.rse_id = get_rse_id(self.rse)
        self.session = get_session()

    def tearDown(self):
        undertaker.run(once=True)
        cleaner.run(once=True)
        reaper.run(once=True, rses=[self.rse], greedy=True)

    def test_abacus_account(self):
        """ ABACUS (ACCOUNT): Test update of account usage """
        self.session.query(models.UpdatedAccountCounter).delete()  # pylint: disable=no-member
        self.session.query(models.AccountUsage).delete()  # pylint: disable=no-member
        self.session.commit()  # pylint: disable=no-member

        # Upload files -> account usage should increase
        self.files = [{
            'did_scope': self.scope.external,
            'did_name': 'file_' + generate_uuid(),
            'path': file_generator(size=self.file_sizes),
            'rse': self.rse,
            'lifetime': -1
        } for i in range(0, 2)]
        self.upload_client.upload(self.files)
        [os.remove(file['path']) for file in self.files]
        account.run(once=True)
        account_usage = get_local_account_usage(account=self.account,
                                                rse_id=self.rse_id)[0]
        assert_equal(account_usage['bytes'], len(self.files) * self.file_sizes)
        assert_equal(account_usage['files'], len(self.files))

        # Update and check the account history with the core method
        update_account_counter_history(account=self.account,
                                       rse_id=self.rse_id)
        usage_history = get_usage_history(rse_id=self.rse_id,
                                          account=self.account)
        assert_equal(usage_history[-1]['bytes'],
                     len(self.files) * self.file_sizes)
        assert_equal(usage_history[-1]['files'], len(self.files))

        # Check the account history with the client
        usage_history = self.account_client.get_account_usage_history(
            rse=self.rse, account=self.account.external)
        assert_equal(usage_history[-1]['bytes'],
                     len(self.files) * self.file_sizes)
        assert_equal(usage_history[-1]['files'], len(self.files))

        # Delete rules -> account usage should decrease
        cleaner.run(once=True)
        account.run(once=True)
        # set account limit because return value of get_local_account_usage differs if a limit is set or not
        set_local_account_limit(account=self.account,
                                rse_id=self.rse_id,
                                bytes=10)
        account_usages = get_local_account_usage(account=self.account,
                                                 rse_id=self.rse_id)[0]
        assert_equal(account_usages['bytes'], 0)
        assert_equal(account_usages['files'], 0)
Example #10
def _stage_out_api(fspec, summary_file_path, trace_report, trace_report_out,
                   transfer_timeout):

    ec = 0

    # init. upload client
    from rucio.client.uploadclient import UploadClient
    upload_client = UploadClient(logger=logger)

    # tracing is enabled only if tracing_rucio is set
    if hasattr(upload_client, 'tracing'):
        upload_client.tracing = tracing_rucio
    if tracing_rucio:
        upload_client.trace = trace_report

    # file specifications before the upload
    f = {}
    f['path'] = fspec.surl or getattr(fspec, 'pfn', None) or os.path.join(
        fspec.workdir, fspec.lfn)
    f['rse'] = fspec.ddmendpoint
    f['did_scope'] = fspec.scope
    f['no_register'] = True

    if transfer_timeout:
        f['transfer_timeout'] = transfer_timeout
    f['connection_timeout'] = 60 * 60

    # if fspec.storageId and int(fspec.storageId) > 0:
    #     if fspec.turl and fspec.is_nondeterministic:
    #         f['pfn'] = fspec.turl
    # elif fspec.lfn and '.root' in fspec.lfn:
    #     f['guid'] = fspec.guid
    if fspec.lfn and '.root' in fspec.lfn:
        f['guid'] = fspec.guid

    logger.info('rucio API stage-out dictionary: %s' % f)

    # upload client raises an exception if any file failed
    try:
        logger.info('*** rucio API uploading file (taking over logging) ***')
        logger.debug('summary_file_path=%s' % summary_file_path)
        logger.debug('trace_report_out=%s' % trace_report_out)
        result = upload_client.upload([f],
                                      summary_file_path=summary_file_path,
                                      traces_copy_out=trace_report_out)
    except UnboundLocalError:
        # must precede the generic handler, otherwise it is unreachable
        logger.warning('*** rucio API upload client failed ***')
        logger.warning('rucio still needs a bug fix of the summary in the uploadclient')
    except Exception as e:
        logger.warning('*** rucio API upload client failed ***')
        logger.warning('caught exception: %s' % e)
        import traceback
        logger.error(traceback.format_exc())
        logger.debug('trace_report_out=%s' % trace_report_out)
        if not trace_report_out:
            raise e
        if not trace_report_out[0].get('stateReason'):
            raise e
        ec = -1
    else:
        logger.warning('*** rucio API upload client finished ***')
        logger.debug('client returned %s' % result)

    try:
        file_exists = verify_stage_out(fspec)
        logger.info('file exists at the storage: %s' % str(file_exists))
        if not file_exists:
            raise StageOutFailure('physical check after upload failed')
    except Exception as e:
        msg = 'file existence verification failed with: %s' % e
        logger.info(msg)
        raise StageOutFailure(msg)

    return ec, trace_report_out
Example #11
File: test_download.py  Project: sahandilshan/rucio
class TestDownloadClient(unittest.TestCase):
    def setUp(self):
        if config_get_bool('common', 'multi_vo', raise_exception=False, default=False):
            self.vo = {'vo': config_get('client', 'vo', raise_exception=False, default='tst')}
        else:
            self.vo = {}

        logger = logging.getLogger('dlul_client')
        logger.addHandler(logging.StreamHandler())
        logger.setLevel(logging.DEBUG)
        self.client = Client()
        self.did_client = DIDClient()
        self.upload_client = UploadClient(_client=self.client, logger=logger)
        self.download_client = DownloadClient(client=self.client,
                                              logger=logger)

    def _upload_test_file(self, rse, scope, name, path=None):
        item = {
            'path': path if path else file_generator(),
            'rse': rse,
            'did_scope': scope,
            'did_name': name,
            'guid': generate_uuid(),
        }
        assert self.upload_client.upload([item]) == 0
        return item

    @staticmethod
    def _check_download_result(actual_result, expected_result):
        assert len(expected_result) == len(actual_result)
        expected_result = sorted(expected_result, key=lambda x: x['did'])
        actual_result = sorted(actual_result, key=lambda x: x['did'])
        for i, expected in enumerate(expected_result):
            for param_name, expected_value in expected.items():
                assert actual_result[i][param_name] == expected_value

    def test_download_without_base_dir(self):
        rse = 'MOCK4'
        scope = 'mock'
        item = self._upload_test_file(rse, scope, 'testDownloadNoBasedir' + generate_uuid())
        did = '%s:%s' % (scope, item['did_name'])
        try:
            # download to the default location, i.e. to ./
            result = self.download_client.download_dids([{'did': did}])
            self._check_download_result(
                actual_result=result,
                expected_result=[{
                    'did': did,
                    'clientState': 'DONE',
                }],
            )

            # re-downloading the same file again should not overwrite it
            result = self.download_client.download_dids([{'did': did}])
            self._check_download_result(
                actual_result=result,
                expected_result=[{
                    'did': did,
                    'clientState': 'ALREADY_DONE',
                }],
            )
        finally:
            shutil.rmtree(scope)

    def test_download_multiple(self):
        rse = 'MOCK4'
        scope = 'mock'
        base_name = 'testDownloadItem' + generate_uuid()
        item000 = self._upload_test_file(rse, scope, base_name + '.000')
        item001 = self._upload_test_file(rse, scope, base_name + '.001')
        item100 = self._upload_test_file(rse, scope, base_name + '.100')

        with TemporaryDirectory() as tmp_dir:
            # Download specific DID
            result = self.download_client.download_dids([{
                'did': '%s:%s' % (scope, item000['did_name']),
                'base_dir': tmp_dir
            }])
            self._check_download_result(
                actual_result=result,
                expected_result=[{
                    'did': '%s:%s' % (scope, item000['did_name']),
                    'clientState': 'DONE',
                }],
            )

            # Download multiple files with wildcard. One file already exists on the file system. Will not be re-downloaded.
            result = self.download_client.download_dids([{
                'did': '%s:%s.0*' % (scope, base_name),
                'base_dir': tmp_dir
            }])
            self._check_download_result(
                actual_result=result,
                expected_result=[
                    {
                        'did': '%s:%s' % (scope, item000['did_name']),
                        'clientState': 'ALREADY_DONE',
                    },
                    {
                        'did': '%s:%s' % (scope, item001['did_name']),
                        'clientState': 'DONE',
                    },
                ],
            )

            # Download with filter
            result = self.download_client.download_dids([{
                'filters': {'guid': item000['guid'], 'scope': scope},
                'base_dir': tmp_dir
            }])
            self._check_download_result(
                actual_result=result,
                expected_result=[{
                    'did': '%s:%s' % (scope, item000['did_name']),
                }],
            )

            # Download with wildcard and name
            result = self.download_client.download_dids([{
                'did': '%s:*' % scope,
                'filters': {'guid': item100['guid']},
                'base_dir': tmp_dir
            }])
            self._check_download_result(
                actual_result=result,
                expected_result=[{
                    'did': '%s:%s' % (scope, item100['did_name']),
                    'clientState': 'DONE',
                }],
            )

            # Don't create subdirectories by scope
            result = self.download_client.download_dids([{
                'did': '%s:%s.*' % (scope, base_name),
                'base_dir': tmp_dir,
                'no_subdir': True
            }])
            self._check_download_result(
                actual_result=result,
                expected_result=[
                    {
                        'did': '%s:%s' % (scope, item000['did_name']),
                        'clientState': 'DONE',
                        'dest_file_paths': ['%s/%s' % (tmp_dir, item000['did_name'])],
                    },
                    {
                        'did': '%s:%s' % (scope, item001['did_name']),
                        'clientState': 'DONE',
                        'dest_file_paths': ['%s/%s' % (tmp_dir, item001['did_name'])],
                    },
                    {
                        'did': '%s:%s' % (scope, item100['did_name']),
                        'clientState': 'DONE',
                        'dest_file_paths': ['%s/%s' % (tmp_dir, item100['did_name'])],
                    },
                ],
            )

            # Re-download file existing on the file system with no-subdir set. It must be overwritten.
            result = self.download_client.download_dids([{
                'did': '%s:%s' % (scope, item100['did_name']),
                'base_dir': tmp_dir,
                'no_subdir': True
            }])
            self._check_download_result(
                actual_result=result,
                expected_result=[{
                    'did': '%s:%s' % (scope, item100['did_name']),
                    'clientState': 'ALREADY_DONE',
                    'dest_file_paths': ['%s/%s' % (tmp_dir, item100['did_name'])],
                }],
            )

    @pytest.mark.xfail(reason='XRD1 must be initialized https://github.com/rucio/rucio/pull/4165/')
    def test_download_from_archive_on_xrd(self):
        scope = 'test'
        rse = 'XRD1'
        base_name = 'testDownloadArchive' + generate_uuid()
        with TemporaryDirectory() as tmp_dir:
            # Create a zip archive with two files and upload it
            name000 = base_name + '.000'
            data000 = '000'
            adler000 = '01230091'
            name001 = base_name + '.001'
            data001 = '001'
            adler001 = '01240092'
            zip_name = base_name + '.zip'
            zip_path = '%s/%s' % (tmp_dir, zip_name)
            with ZipFile(zip_path, 'w') as myzip:
                myzip.writestr(name000, data=data000)
                myzip.writestr(name001, data=data001)
            self._upload_test_file(rse, scope, zip_name, path=zip_path)
            self.did_client.add_files_to_archive(
                scope,
                zip_name,
                [
                    {
                        'scope': scope,
                        'name': name000,
                        'bytes': len(data000),
                        'type': 'FILE',
                        'adler32': adler000,
                        'meta': {
                            'guid': str(generate_uuid())
                        }
                    },
                    {
                        'scope': scope,
                        'name': name001,
                        'bytes': len(data001),
                        'type': 'FILE',
                        'adler32': adler001,
                        'meta': {
                            'guid': str(generate_uuid())
                        }
                    },
                ],
            )

            # Download one file from the archive
            result = self.download_client.download_dids([{
                'did': '%s:%s' % (scope, name000),
                'base_dir': tmp_dir
            }])
            self._check_download_result(
                actual_result=result,
                expected_result=[
                    {
                        'did': '%s:%s' % (scope, name000),
                        'clientState': 'DONE',
                    },
                ],
            )
            with open('%s/%s/%s' % (tmp_dir, scope, name000), 'r') as file:
                assert file.read() == data000

            # Download both files from the archive
            result = self.download_client.download_dids([{
                'did': '%s:%s.00*' % (scope, base_name),
                'base_dir': tmp_dir
            }])
            self._check_download_result(
                actual_result=result,
                expected_result=[
                    {
                        'did': '%s:%s' % (scope, name000),
                        'clientState': 'ALREADY_DONE',
                    },
                    {
                        'did': '%s:%s' % (scope, name001),
                        'clientState': 'DONE',
                    },
                ],
            )
            with open('%s/%s/%s' % (tmp_dir, scope, name001), 'r') as file:
                assert file.read() == data001

            pfn = next(filter(lambda r: name001 in r['did'], result))['sources'][0]['pfn']
            # Download by pfn from the archive
            result = self.download_client.download_pfns([{
                'did': '%s:%s' % (scope, name001),
                'pfn': pfn,
                'rse': rse,
                'base_dir': tmp_dir,
                'no_subdir': True
            }])
            self._check_download_result(
                actual_result=result,
                expected_result=[
                    {
                        'did': '%s:%s' % (scope, name001),
                        'clientState': 'DONE',
                    },
                ],
            )

    def test_trace_copy_out_and_checksum_validation(self):
        rse = 'MOCK4'
        scope = 'mock'
        name = 'testDownloadTraces' + generate_uuid()
        self._upload_test_file(rse, scope, name)

        with TemporaryDirectory() as tmp_dir:
            # Try downloading non-existing did
            traces = []
            with pytest.raises(NoFilesDownloaded):
                self.download_client.download_dids([{
                    'did': 'some:randomNonExistingDid',
                    'base_dir': tmp_dir
                }], traces_copy_out=traces)
            assert len(traces) == 1 and traces[0]['clientState'] == 'FILE_NOT_FOUND'

            # Download specific DID
            traces = []
            self.download_client.download_dids([{
                'did': '%s:%s' % (scope, name),
                'base_dir': tmp_dir
            }], traces_copy_out=traces)
            assert len(traces) == 1 and traces[0]['clientState'] == 'DONE'

            # Download same DID again
            traces = []
            result = self.download_client.download_dids([{
                'did': '%s:%s' % (scope, name),
                'base_dir': tmp_dir
            }], traces_copy_out=traces)
            assert len(traces) == 1 and traces[0]['clientState'] == 'ALREADY_DONE'

            # Change the local file and download the same file again. Checksum validation should fail and it must be re-downloaded
            with open(result[0]['dest_file_paths'][0], 'a') as f:
                f.write("more data")
            traces = []
            result = self.download_client.download_dids([{
                'did': '%s:%s' % (scope, name),
                'base_dir': tmp_dir
            }], traces_copy_out=traces)
            assert len(traces) == 1 and traces[0]['clientState'] == 'DONE'

            pfn = result[0]['sources'][0]['pfn']

        # Switch to a new empty directory
        with TemporaryDirectory() as tmp_dir:
            # Wildcards in did name are not allowed on pfn downloads
            traces = []
            with pytest.raises(InputValidationError):
                self.download_client.download_pfns([{
                    'did': '%s:*' % scope,
                    'pfn': pfn,
                    'rse': rse,
                    'base_dir': tmp_dir
                }], traces_copy_out=traces)
            assert not traces

            # Same pfn, but without wildcard in the did should work
            traces = []
            self.download_client.download_pfns([{
                'did': '%s:%s' % (scope, name),
                'pfn': pfn,
                'rse': rse,
                'base_dir': tmp_dir
            }], traces_copy_out=traces)
            assert len(traces) == 1 and traces[0]['clientState'] == 'DONE'

            # Same pfn. Local file already present. Shouldn't be overwritten.
            traces = []
            self.download_client.download_pfns([{
                'did': '%s:%s' % (scope, name),
                'pfn': pfn,
                'rse': rse,
                'base_dir': tmp_dir
            }], traces_copy_out=traces)
            assert len(traces) == 1 and traces[0]['clientState'] == 'ALREADY_DONE'

            # Provide wrong checksum for validation, the file will be re-downloaded but checksum validation fails
            traces = []
            with pytest.raises(NoFilesDownloaded):
                self.download_client.download_pfns([{
                    'did': '%s:%s' % (scope, name),
                    'pfn': pfn,
                    'rse': rse,
                    'adler32': 'wrong',
                    'base_dir': tmp_dir
                }], traces_copy_out=traces)
            assert len(traces) == 1 and traces[0]['clientState'] == 'FAIL_VALIDATE'

        # Switch to a new empty directory
        with TemporaryDirectory() as tmp_dir:
            # Simulate checksum corruption by changing the source file. We rely on the particularity
            # that the MOCK4 rse uses the posix protocol: files are stored on the local file system
            protocol = rsemgr.create_protocol(rsemgr.get_rse_info(rse, vo=self.client.vo), operation='read')
            assert isinstance(protocol, PosixProtocol)
            mock_rse_local_path = protocol.pfn2path(pfn)
            with open(mock_rse_local_path, 'w') as f:
                f.write('some completely other data')

            # Download fails checksum validation
            traces = []
            with pytest.raises(NoFilesDownloaded):
                self.download_client.download_dids([{
                    'did': '%s:%s' % (scope, name),
                    'base_dir': tmp_dir
                }], traces_copy_out=traces)
            assert len(traces) == 1 and traces[0]['clientState'] == 'FAIL_VALIDATE'

            # Ignore_checksum set. Download works.
            traces = []
            self.download_client.download_dids([{
                'did': '%s:%s' % (scope, name),
                'base_dir': tmp_dir,
                'ignore_checksum': True
            }], traces_copy_out=traces)
            assert len(traces) == 1 and traces[0]['clientState'] == 'DONE'
Example #12
                newfile.write(b"\0")
                newfile.close()
                # confirm the file landed on disk with the expected size
                print('random file with size %d generated ok' % os.stat(file).st_size)
                n_files = np.append(n_files, file)
            except Exception:
                print('could not generate file %s' % file)

    return n_files


list_files = generate_random_file('deletion', 10)

if list_files:
    # create the clients once, outside the loop
    client = Client()
    rulesClient = RuleClient()
    uploadClient = UploadClient()

    for name_file in list_files:
        filePath = "./" + name_file
        file = {
            'path': filePath,
            'did_name': 'thisisatest/' + name_file,
            'rse': DEFAULT_ORIGIN_RSE,
            'did_scope': DEFAULT_SCOPE
        }
        print(file)
        # perform the upload
        uploadClient.upload([file])
Example #13
class TestAbacusCollectionReplica():

    def setUp(self):
        self.account = 'root'
        self.scope = 'mock'
        self.rule_client = RuleClient()
        self.did_client = DIDClient()
        self.replica_client = ReplicaClient()
        self.upload_client = UploadClient()
        self.file_sizes = 2
        self.dataset = 'dataset_%s' % generate_uuid()
        self.rse = 'MOCK5'
        self.rse_id = get_rse_id(rse=self.rse)

    def tearDown(self):
        undertaker.run(once=True)
        cleaner.run(once=True)
        reaper.run(once=True, rses=[self.rse], greedy=True)

    def test_abacus_collection_replica(self):
        """ ABACUS (COLLECTION REPLICA): Test update of collection replica. """
        self.files = [{'did_scope': self.scope, 'did_name': 'file_' + generate_uuid(), 'path': file_generator(size=self.file_sizes), 'rse': self.rse, 'lifetime': -1} for i in range(0, 2)]
        self.did_client.add_did(self.scope, self.dataset, DIDType.DATASET, lifetime=-1)
        self.upload_client.upload(self.files)
        self.did_client.attach_dids(scope=self.scope, name=self.dataset, dids=[{'name': file['did_name'], 'scope': file['did_scope']} for file in self.files])
        self.rule_client.add_replication_rule([{'scope': self.scope, 'name': self.dataset}], 1, self.rse, lifetime=-1)
        [os.remove(file['path']) for file in self.files]

        # Check dataset replica after rule creation - initial data
        dataset_replica = [replica for replica in self.replica_client.list_dataset_replicas(self.scope, self.dataset)][0]
        assert_equal(dataset_replica['bytes'], 0)
        assert_equal(dataset_replica['length'], 0)
        assert_equal(dataset_replica['available_bytes'], 0)
        assert_equal(dataset_replica['available_length'], 0)
        assert_equal(str(dataset_replica['state']), 'UNAVAILABLE')

        # Run Abacus
        collection_replica.run(once=True)

        # Check dataset replica after abacus - abacus should update the collection_replica table from updated_col_rep
        dataset_replica = [replica for replica in self.replica_client.list_dataset_replicas(self.scope, self.dataset)][0]
        assert_equal(dataset_replica['bytes'], len(self.files) * self.file_sizes)
        assert_equal(dataset_replica['length'], len(self.files))
        assert_equal(dataset_replica['available_bytes'], len(self.files) * self.file_sizes)
        assert_equal(dataset_replica['available_length'], len(self.files))
        assert_equal(str(dataset_replica['state']), 'AVAILABLE')

        # Delete one file -> collection replica should be unavailable
        cleaner.run(once=True)
        delete_replicas(rse_id=self.rse_id, files=[{'name': self.files[0]['did_name'], 'scope': InternalScope(self.files[0]['did_scope'])}])
        self.rule_client.add_replication_rule([{'scope': self.scope, 'name': self.dataset}], 1, self.rse, lifetime=-1)
        collection_replica.run(once=True)
        dataset_replica = [replica for replica in self.replica_client.list_dataset_replicas(self.scope, self.dataset)][0]
        assert_equal(dataset_replica['length'], len(self.files))
        assert_equal(dataset_replica['bytes'], len(self.files) * self.file_sizes)
        assert_equal(dataset_replica['available_length'], len(self.files) - 1)
        assert_equal(dataset_replica['available_bytes'], (len(self.files) - 1) * self.file_sizes)
        assert_equal(str(dataset_replica['state']), 'UNAVAILABLE')

        # Delete all files -> collection replica should be deleted
        cleaner.run(once=True)
        reaper.run(once=True, rses=[self.rse], greedy=True)
        self.rule_client.add_replication_rule([{'scope': self.scope, 'name': self.dataset}], 1, self.rse, lifetime=-1)
        collection_replica.run(once=True)
        dataset_replica = [replica for replica in self.replica_client.list_dataset_replicas(self.scope, self.dataset)]
        assert_equal(len(dataset_replica), 0)
Example #14
File: test_download.py  Project: ijjorama/rucio
class TestDownloadClient(object):
    def setup(self):
        logger = logging.getLogger('dlul_client')
        logger.addHandler(logging.StreamHandler())
        logger.setLevel(logging.DEBUG)
        self.client = Client()
        self.upload_client = UploadClient(_client=self.client, logger=logger)
        self.download_client = DownloadClient(client=self.client,
                                              logger=logger)

    def create_and_upload_tmp_file(self, rse, scope='mock'):
        file_path = file_generator()
        item = {
            'path': file_path,
            'rse': rse,
            'did_scope': scope,
            'did_name': os.path.basename(file_path),
            'guid': generate_uuid()
        }
        nose.tools.assert_equal(self.upload_client.upload([item]), 0)
        return item

    def test_download_item(self):
        """ DOWNLOAD (CLIENT): download DIDs. """
        item = self.create_and_upload_tmp_file('MOCK4')
        scope = item['did_scope']
        name = item['did_name']
        uuid = item['guid']

        # Download specific DID
        result = self.download_client.download_dids([{'did': '%s:%s' % (scope, name)}])
        nose.tools.assert_true(result)

        # Download with wildcard
        result = self.download_client.download_dids([{'did': '%s:%s' % (scope, name[:-2] + '*')}])
        nose.tools.assert_true(result)

        # Download with filter
        result = self.download_client.download_dids([{'filters': {'guid': uuid, 'scope': scope}}])
        nose.tools.assert_true(result)

        # Download with wildcard and name
        result = self.download_client.download_dids([{'did': '%s:%s' % (scope, '*'), 'filters': {'guid': uuid}}])
        nose.tools.assert_true(result)
Example #15
File: combine.py  Project: napoliion/outsource
def main():
    parser = argparse.ArgumentParser(description="Combine strax output")
    parser.add_argument('dataset', help='Run number', type=int)
    parser.add_argument('dtype', help='dtype to combine')
    parser.add_argument('--context', help='Strax context')
    parser.add_argument('--input', help='path where the temp directory is')
    parser.add_argument('--rse', help='RSE to upload to')
    parser.add_argument('--cmt', help='CMT global version')
    parser.add_argument('--update-db',
                        help='flag to update runsDB',
                        dest='update_db',
                        action='store_true')
    parser.add_argument('--upload-to-rucio',
                        help='flag to upload to rucio',
                        dest='upload_to_rucio',
                        action='store_true')

    args = parser.parse_args()

    runid = args.dataset
    runid_str = "%06d" % runid
    dtype = args.dtype
    path = args.input

    final_path = 'finished_data'

    # get context
    st = getattr(straxen.contexts, args.context)()
    st.storage = [
        strax.DataDirectory('./'),
        strax.DataDirectory(final_path)  # where we are copying data to
    ]
    apply_global_version(st, args.cmt)

    # check what data is in the output folder
    dtypes = [d.split('-')[1] for d in os.listdir(path)]

    if 'records' in dtypes:
        plugin_levels = ['records', 'peaklets']
    else:
        plugin_levels = ['peaklets']

    # merge
    for dtype in plugin_levels:
        print(f"Merging {dtype} level")
        merge(runid_str, dtype, st, path)

    print(f"Current contents of {final_path}:")
    print(os.listdir(final_path))

    # now upload the merged metadata
    # setup the rucio client(s)
    if not args.upload_to_rucio:
        print("Ignoring rucio upload. Exiting")
        return

    # need to patch the storage one last time
    st.storage = [strax.DataDirectory(final_path)]

    updonkey = UploadClient()
    donkey = Client()

    for this_dir in os.listdir(final_path):
        # prepare list of dicts to be uploaded
        _run, keystring, straxhash = this_dir.split('-')
        dataset_did = make_did(runid, keystring, straxhash)
        scope, dset_name = dataset_did.split(':')

        files = os.listdir(os.path.join(final_path, this_dir))
        to_upload = []
        existing_files = [
            f for f in donkey.list_dids(scope, {'type': 'file'}, type='file')
        ]
        existing_files = [f for f in existing_files if dset_name in f]

        try:
            existing_files_in_dataset = [
                f['name'] for f in donkey.list_files(scope, dset_name)
            ]
        except rucio.common.exception.DataIdentifierNotFound:
            existing_files_in_dataset = []

        # for some reason files get uploaded but not attached correctly
        need_attached = list(
            set(existing_files) - set(existing_files_in_dataset))

        if len(need_attached) > 0:
            dids_to_attach = [
                dict(scope=scope, name=name) for name in need_attached
            ]

            donkey.attach_dids(scope, dset_name, dids_to_attach)

        for f in files:
            if f in existing_files:
                print(f"Skipping {f} since it is already uploaded")
                continue

            this_path = os.path.join(final_path, this_dir, f)
            d = dict(path=this_path,
                     did_scope=scope,
                     did_name=f,
                     dataset_scope=scope,
                     dataset_name=dset_name,
                     rse=args.rse,
                     register_after_upload=True)
            to_upload.append(d)

        # skip directories with nothing to upload
        if len(to_upload) == 0:
            print(f"No files to upload for {this_dir}")
            continue

        # now do the upload!
        try:
            updonkey.upload(to_upload)
        except:
            print(f'Upload of {keystring} failed')
            raise
        print(f"Upload of {len(files)} files in {this_dir} finished successfully")
        for f in files:
            print(f"{scope}:{f}")

        # now check that the rucio data matches what we expect
        rucio_files = [f for f in donkey.list_files(scope, dset_name)]

        # how many chunks?
        md = st.get_meta(runid_str, keystring)

        expected_chunks = len([c for c in md['chunks'] if c['n'] > 0])

        # we should have n+1 files in rucio (counting metadata)
        if len(rucio_files) != expected_chunks + 1:
            # we're missing some data, uh oh
            successful_chunks = set(int(f['name'].split('-')[-1]) for f in rucio_files)
            expected_chunks = set(np.arange(expected_chunks))

            missing_chunks = expected_chunks - successful_chunks

            missing_chunk_str = '\n'.join(str(c) for c in missing_chunks)
            raise RuntimeError(
                f"File mismatch! There are {len(rucio_files)} files in rucio but the metadata "
                f"says there should be {len(expected_chunks)} chunks + 1 metadata file. "
                f"The missing chunks are:\n{missing_chunk_str}")

        chunk_mb = [chunk['nbytes'] / (1e6) for chunk in md['chunks']]
        data_size_mb = np.sum(chunk_mb)
        avg_data_size_mb = np.mean(chunk_mb)

        # let's do one last check of the rule
        rc = RucioSummoner()

        rses = [args.rse]
        if (keystring not in ['records', 'veto_regions', 'pulse_counts']
                and "UC_DALI_USERDISK" not in rses):
            rses.append('UC_DALI_USERDISK')

        for rse in rses:
            rule = rc.GetRule(dataset_did, rse)
            if rule['state'] == 'OK':
                status = 'transferred'
            elif rule['state'] == 'REPLICATING':
                status = 'transferring'
            else:
                status = 'error'

            if args.update_db:
                # update runDB
                new_data_dict = dict()
                new_data_dict['location'] = rse
                new_data_dict['did'] = dataset_did
                new_data_dict['status'] = status
                new_data_dict['host'] = "rucio-catalogue"
                new_data_dict['type'] = keystring
                new_data_dict['protocol'] = 'rucio'
                new_data_dict['creation_time'] = datetime.datetime.utcnow().isoformat()
                new_data_dict['creation_place'] = "OSG"
                #new_data_dict['file_count'] = file_count
                new_data_dict['meta'] = dict(  #lineage=plugin.lineage_hash,
                    avg_chunk_mb=avg_data_size_mb,
                    file_count=len(rucio_files),
                    size_mb=data_size_mb,
                    strax_version=strax.__version__,
                    straxen_version=straxen.__version__)

                db.update_data(runid, new_data_dict)

                print(f"Database updated for {keystring} at {rse}")
            else:
                print("Skipping database update.")

        # if everything is good, let's close the dataset
        # this will make it so no more data can be added to this dataset
        if status == 'transferred':
            try:
                donkey.close(scope, dset_name)
            except Exception:
                print(f"Closing {scope}:{dset_name} failed")
Example #16
class TestAbacusCollectionReplica(unittest.TestCase):
    def setUp(self):
        self.account = 'root'
        self.scope = 'mock'
        self.rse = 'MOCK5'
        self.file_sizes = 2
        self.dataset = 'dataset_%s' % generate_uuid()

        self.rule_client = RuleClient()
        self.did_client = DIDClient()
        self.replica_client = ReplicaClient()
        self.upload_client = UploadClient()

        if config_get_bool('common', 'multi_vo', raise_exception=False, default=False):
            self.vo = {'vo': config_get('client', 'vo', raise_exception=False, default='tst')}
        else:
            self.vo = {}

        self.rse_id = get_rse_id(rse=self.rse, **self.vo)

    def tearDown(self):
        undertaker.run(once=True)
        cleaner.run(once=True)
        if self.vo:
            reaper.run(once=True,
                       include_rses='vo=%s&(%s)' % (self.vo['vo'], self.rse),
                       greedy=True)
        else:
            reaper.run(once=True, include_rses=self.rse, greedy=True)

    def test_abacus_collection_replica(self):
        """ ABACUS (COLLECTION REPLICA): Test update of collection replica. """
        self.files = [{
            'did_scope': self.scope,
            'did_name': 'file_' + generate_uuid(),
            'path': file_generator(size=self.file_sizes),
            'rse': self.rse,
            'lifetime': -1
        } for i in range(0, 2)]
        self.did_client.add_did(self.scope, self.dataset, DIDType.DATASET, lifetime=-1)
        self.upload_client.upload(self.files)
        self.did_client.attach_dids(scope=self.scope, name=self.dataset, dids=[{'name': file['did_name'], 'scope': file['did_scope']} for file in self.files])
        self.rule_client.add_replication_rule([{'scope': self.scope, 'name': self.dataset}], 1, self.rse, lifetime=-1)
        [os.remove(file['path']) for file in self.files]

        # Check dataset replica after rule creation - initial data
        dataset_replica = [replica for replica in self.replica_client.list_dataset_replicas(self.scope, self.dataset)][0]
        assert dataset_replica['bytes'] == 0
        assert dataset_replica['length'] == 0
        assert dataset_replica['available_bytes'] == 0
        assert dataset_replica['available_length'] == 0
        assert str(dataset_replica['state']) == 'UNAVAILABLE'

        # Run Abacus
        collection_replica.run(once=True)

        # Check dataset replica after abacus - abacus should update the collection_replica table from updated_col_rep
        dataset_replica = [replica for replica in self.replica_client.list_dataset_replicas(self.scope, self.dataset)][0]
        assert dataset_replica['bytes'] == len(self.files) * self.file_sizes
        assert dataset_replica['length'] == len(self.files)
        assert dataset_replica['available_bytes'] == len(self.files) * self.file_sizes
        assert dataset_replica['available_length'] == len(self.files)
        assert str(dataset_replica['state']) == 'AVAILABLE'

        # Delete one file -> collection replica should be unavailable
        cleaner.run(once=True)
        delete_replicas(rse_id=self.rse_id, files=[{'name': self.files[0]['did_name'], 'scope': InternalScope(self.files[0]['did_scope'], **self.vo)}])
        self.rule_client.add_replication_rule([{'scope': self.scope, 'name': self.dataset}], 1, self.rse, lifetime=-1)
        collection_replica.run(once=True)
        dataset_replica = [replica for replica in self.replica_client.list_dataset_replicas(self.scope, self.dataset)][0]
        assert dataset_replica['length'] == len(self.files)
        assert dataset_replica['bytes'] == len(self.files) * self.file_sizes
        assert dataset_replica['available_length'] == len(self.files) - 1
        assert dataset_replica['available_bytes'] == (len(self.files) -
                                                      1) * self.file_sizes
        assert str(dataset_replica['state']) == 'UNAVAILABLE'

        # Delete all files -> collection replica should be deleted
        cleaner.run(once=True)
        if self.vo:
            reaper.run(once=True,
                       include_rses='vo=%s&(%s)' % (self.vo['vo'], self.rse),
                       greedy=True)
        else:
            reaper.run(once=True, include_rses=self.rse, greedy=True)
        self.rule_client.add_replication_rule([{
            'scope': self.scope,
            'name': self.dataset
        }],
                                              1,
                                              self.rse,
                                              lifetime=-1)
        collection_replica.run(once=True)
        dataset_replica = [
            replica for replica in self.replica_client.list_dataset_replicas(
                self.scope, self.dataset)
        ]
        assert len(dataset_replica) == 0
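For orientation, here is a minimal standalone sketch of the replica-state polling that the assertions above rely on; it assumes a configured Rucio client, and the 'mock' scope and 'dataset_x' name are placeholders, not values taken from the test:

from rucio.client.replicaclient import ReplicaClient

def dataset_replica_states(scope, name):
    # list_dataset_replicas yields one entry per RSE that holds the dataset
    replica_client = ReplicaClient()
    for replica in replica_client.list_dataset_replicas(scope, name):
        yield replica['rse'], str(replica['state']), replica['available_length']

# example usage with placeholder names
for rse, state, n_available in dataset_replica_states('mock', 'dataset_x'):
    print(rse, state, n_available)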
Code Example #18
0
class TestDownloadClient(object):
    def setup(self):
        logger = logging.getLogger('dlul_client')
        logger.addHandler(logging.StreamHandler())
        logger.setLevel(logging.DEBUG)
        self.client = Client()
        self.upload_client = UploadClient(_client=self.client, logger=logger)
        self.download_client = DownloadClient(client=self.client,
                                              logger=logger)

        self.file_path = file_generator()
        self.scope = 'mock'
        self.name = os.path.basename(self.file_path)
        self.rse = 'MOCK4'
        self.guid = generate_uuid()

        item = {
            'path': self.file_path,
            'rse': self.rse,
            'did_scope': self.scope,
            'did_name': self.name,
            'guid': self.guid
        }
        nose.tools.assert_equal(self.upload_client.upload([item]), 0)

    def teardown(self):
        shutil.rmtree('mock')

    def test_download_item(self):
        """ DOWNLOAD (CLIENT): Download DIDs """

        # Download specific DID
        result = self.download_client.download_dids([{
            'did':
            '%s:%s' % (self.scope, self.name)
        }])
        nose.tools.assert_true(result)

        # Download with wildcard
        result = self.download_client.download_dids([{
            'did':
            '%s:%s' % (self.scope, self.name[:-2] + '*')
        }])
        nose.tools.assert_true(result)

        # Download with filter
        result = self.download_client.download_dids([{
            'filters': {
                'guid': self.guid,
                'scope': self.scope
            }
        }])
        nose.tools.assert_true(result)

        # Download with wildcard and name
        result = self.download_client.download_dids([{
            'did':
            '%s:%s' % (self.scope, '*'),
            'filters': {
                'guid': self.guid
            }
        }])
        nose.tools.assert_true(result)
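As a complement to the test above, a minimal sketch of a plain download outside the test harness; it assumes a configured Rucio client, and the DID and target directory are placeholders:

from rucio.client.downloadclient import DownloadClient

download_client = DownloadClient()
items = [{
    'did': 'mock:some_file',  # placeholder DID
    'base_dir': '/tmp',       # directory the files are downloaded into
}]
results = download_client.download_dids(items)
# each result entry carries a client state such as 'DONE' or 'ALREADY_DONE'
print([entry['clientState'] for entry in results])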
Code Example #19
0
File: rucio_api.py Project: XENONnT/admix
class RucioAPI():
    """Class RucioAPI()

    This class collects all necessary Rucio calls in one place, which
    allows easy handling of Rucio calls based on the Rucio API.
    If the Rucio API ever changes, this wrapper is the place to fix it.
    """
    def __init__(self, enable_print=False):
        """Function: __init__()

        Constructor of the RucioAPI class. Comes with a setting to enable print statements to the terminal.

        :param enable_print: If True, enable printing to the terminal
        """
        self._print_to_screen = enable_print
        self._rucio_ping = None
        self._rucio_account = os.environ.get("RUCIO_ACCOUNT")
        self.ConfigHost()

    def __del__(self):
        """Function: __del__()

        Destructor - No further description
        """
        pass

    # Here comes the backend configuration part:
    def SetRucioAccount(self, rucio_account=None):
        """Function: SetRucioAccount
        :param rucio_account: The Rucio account you would like to work with
        """

        self._rucio_account = rucio_account

    def SetConfigPath(self, config_path=None):
        """Function: SetConfigPath
        This option is only important for legacy command line support and
        is ignored in RucioAPI setup.
        :param config_path: Path to CLI configuration file
        """
        pass

    def SetProxyTicket(self, proxy_path=None):
        """Function: SetProxyTicket
        This option is only important for legacy command line support and
        is ignored in RucioAPI setup.
        :param proxy_path: Path to the proxy ticket
        """
        pass

    def SetHost(self, hostname=None):
        """Function: SetHost
        This option is only important for legacy command line support and
        is ignored in RucioAPI setup.
        :param hostname: The Rucio server hostname
        """
        pass

    def ConfigHost(self):
        """Function: ConfigHost

        This member function sets up the Rucio API backend.
        To ensure full functionality, it needs:
        * Client()
        * UploadClient()
        * DownloadClient()

        :raise: Exception if the Rucio API is not ready (misconfigured)
        """
        try:
            self._rucio_client = Client()
            self._rucio_client_upload = UploadClient(logger=logger)
            self._rucio_client_download = DownloadClient()
            self._rucio_ping = self._rucio_client.ping

        except Exception:
            print("Cannot init the Rucio API")
            print("-> Check your Rucio installation")
            exit(1)

    # finished the backend configuration for the Rucio API

    def Whoami(self):
        """RucioAPI:Whoami
        Results a dictionary to identify the current
        Rucio user and credentials.
        """
        return self._rucio_client.whoami()

    def GetRucioPing(self):
        """Function: GetRucioPing
        :return: If ConfigHost executed without exception, GetRucioPing provides a Rucio ping
        """

        return self._rucio_client.ping

    #The scope section:
    def CreateScope(self, account, scope, verbose=False):
        """Function: CreateScope()

        Create a new Rucio scope that does not exist yet.
        Be aware that you need Rucio permissions to do it. Check your Rucio account and settings.

        :param account: The Rucio account you are working with (must be allowed to create scopes)
        :param scope: The scope name you would like to create
        :return result: 0 if successful, 1 on failure
        """

        result = 1
        try:
            self._rucio_client.add_scope(account, scope)
            result = 0
        except AccessDenied as e:
            print(e)
        except Duplicate as e:
            if verbose:
                print(e)
            else:
                pass
        return result
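
    # Usage sketch (hedged): with a valid Rucio configuration, a user scope
    # can be created as
    #   api = RucioAPI()
    #   api.CreateScope(account='root', scope='user.jdoe')
    # where 'root' and 'user.jdoe' are placeholder names.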

    # Several list commands:

    def GetRSE(self, rse):
        """Function: GetRSE(...)

        Return further information about the RSE setup of a specific RSE
        :param rse: A (string) valid Rucio Storage Element (RSE) name
        :return result: A dictionary which holds information about the selected RSE
        """

        result = {}
        try:
            result = self._rucio_client.get_rse(rse)
        except Exception:
            print("No RSE attributes received for {0}".format(rse))
        return result

    def ListRSEAttributes(self, rse):
        """Function: ListRSEAttributes(...)

        Return some attributes of a Rucio Storage Element
        Received keys are fts, fts_testing, RSE-NAME, istape

        :param rse: A valid (string) Rucio Storage Element (RSE) name
        :return result: A dictionary with RSE attributes
        """

        result = {}

        try:
            result = self._rucio_client.list_rse_attributes(rse)
        except Exception:
            print("No RSE attributes received for {0}".format(rse))
        return result

    def ListRSEs(self):
        """Function: ListRSEs

        Returns an overview of all registered Rucio Storage Elements in the current setup

        :return result: A list of dictionaries. Each dictionary holds RSE information. If not successful []
        """
        result = []
        try:
            result = list(self._rucio_client.list_rses())
        except Exception:
            print("No RSE received from Rucio.")

        return result

    def ListContent(self, scope, name):
        """Function: ListContent()

        :param scope: A string which refers to a Rucio scope
        :param name: A string which refers to a Rucio name
        :return result: A list of dictionaries with the attached files to the DID
        """
        result = []
        try:
            return list(self._rucio_client.list_content(scope, name))
        except TypeError as e:
            print(e)

        return result

    def ListScopes(self):
        """Function: ListScopes()

        List all created scopes in the Rucio catalogue

        :return result: A list of scopes, otherwise []
        """
        result = []
        try:
            result = self._rucio_client.list_scopes()
        except Exception:
            print("No scopes? - Check that!")
        return result

    def ListFileReplicas(self, scope, lfn):
        """Function: ListFileReplicas(...)

        List the replicas of a single file

        :param scope: A string which follows the rules of a Rucio scope
        :param lfn: The logical file name (LFN)
        :return result: A list of file replicas, otherwise []
        """

        result = []
        try:
            # the replica client expects a list of DID dictionaries
            result = list(self._rucio_client.list_replicas([{'scope': scope,
                                                             'name': lfn}]))
        except Exception as e:
            print(e)
        return result

    def ListFiles(self, scope, name, long=True):
        """Function: ListFiles(...)

        List all your files which are attached to a dataset or container

        :param scope: A string which follows the rules of a Rucio scope
        :param name: A string which follows the rules of a Rucio name
        :param long: Define another output (Check the Rucio tutorials for it)
        :return result: A list of files, otherwise []
        """
        result = []
        try:
            result = self._rucio_client.list_files(scope, name, long=long)
        except Exception:
            print("No files are listed for {0}:{1}".format(scope, name))
        return result

    def ListDids(self,
                 scope,
                 filters,
                 type='collection',
                 long=False,
                 recursive=False):
        """
        List all data identifiers in a scope which match a given pattern. Check Rucio github page for details

        :param scope: The valid string which follows the Rucio scope name.
        :param filters: A dictionary of key/value pairs like {'name': 'file_name','rse-expression': 'tier0'}.
        :param type: The type of the did: 'all'(container, dataset or file)|'collection'(dataset or container)|'dataset'|'container'|'file'
        :param long: Long format option to display more information for each DID.
        :param recursive: Recursively list DIDs content.
        """

        result = []
        try:
            return list(
                self._rucio_client.list_dids(scope, filters, type, long,
                                             recursive))
        except TypeError as e:
            print(e)
        return result

    def ListDidRules(self, scope, name):
        """Return a class generator from Rucio which contains the
        individual rules to iterate over (or to create a list from)

        :param scope: A string which refers to the Rucio scope
        :param name: A string which refers to the Rucio name (a container, dataset or file name)

        :return: A list of Rucio transfer rules with additional rule information. Each list element stands for a
                 Rucio Storage Element (RSE). The list is empty if the call fails or no rules exist.
        """

        result = []
        try:
            return list(self._rucio_client.list_did_rules(scope, name))
        except TypeError as e:
            print(e)

        return result

    #Attach and detach:
    def AttachDids(self, scope, name, attachment, rse=None):
        """Function: AttachDids(...)

        This function attaches datasets or containers to a top-level dataset or container.
        The parameters scope and name define the top-level structure (container or dataset), and the dictionary or
        the list of dictionaries contains the information about what is attached to the top-level structure.

        More information under https://github.com/rucio/rucio


        :param scope: A string which follows the rules of a Rucio scope
        :param name: A string which follows the rules of a Rucio name
        :param attachment: A dictionary or a list of dictionaries which consist of two keys: scope and name
                           example{'scope': 'example_scope1', 'name':'example_name1'}
        :param rse: The RSE name when registering replicas. (optional)
        :return result: 0 if successful, 1 for failure
        """
        result = 1

        # If a single dictionary is provided, wrap it in a list of dictionaries.
        if isinstance(attachment, dict):
            attachment = [attachment]

        try:
            self._rucio_client.attach_dids(scope, name, attachment, rse=rse)
            result = 0
        except DuplicateContent as e:
            print(e)

        return result
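
    # Usage sketch (hedged): attaching a single file to a dataset, with
    # placeholder scope and names:
    #   api.AttachDids('user.jdoe', 'dataset_x',
    #                  {'scope': 'user.jdoe', 'name': 'file_1'})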

    def DetachDids(self, scope, name, dids):
        """Function: DetachDids(...)

        Detach data identifiers from a dataset or container.

        :param scope: A string which follows the rules of a Rucio scope
        :param name: The name of the top-level dataset or container
        :param dids: A list of dictionaries with scope and name keys to detach
        """
        try:
            self._rucio_client.detach_dids(scope, name, dids)
        except Exception:
            return None

    # Container and dataset management:
    def CreateContainer(self,
                        scope,
                        name,
                        statuses=None,
                        meta=None,
                        rules=None,
                        lifetime=None):
        """Function CreateContainer(...)

        Follows the Rucio API to create a Rucio container based on scope and container name. It also accepts further
        Rucio options.
        More information under https://github.com/rucio/rucio

        :param scope: A string which follows the rules of a Rucio scope
        :param name: A string which follows the rules of a Rucio container name
        :param statuses: Status (optional)
        :param meta: Put in further metadata which is going to be connected to the container. (optional)
        :param rules: Define transfer rules which apply to the container immediately. (optional)
        :param lifetime: Set a Rucio lifetime for the container if you wish (optional)
        :return result: 0 if successful, 1 for failure
        """
        result = 1
        try:
            self._rucio_client.add_container(scope,
                                             name,
                                             statuses=statuses,
                                             meta=meta,
                                             rules=rules,
                                             lifetime=lifetime)
            result = 0
        except DataIdentifierAlreadyExists as e:
            print(e)
        return result

    def CreateDataset(self,
                      scope,
                      name,
                      statuses=None,
                      meta=None,
                      rules=None,
                      lifetime=None,
                      files=None,
                      rse=None,
                      verbose=False):
        """Function CreateDataset(...)

        Follows the Rucio API to create a Rucio dataset based on scope and dataset name. It also accepts further
        Rucio options.
        More information under https://github.com/rucio/rucio

        :param scope:    A string which follows the rules of a Rucio scope
        :param name:     A string which follows the rules of a Rucio dataset name
        :param statuses: Status (optional)
        :param meta:     Put in further metadata which is going to be connected to the dataset. (optional)
        :param rules:    Define transfer rules which apply to the dataset immediately. (optional)
        :param lifetime: Set a Rucio lifetime for the dataset if you wish (optional)
        :param files:    A list of files to attach on creation (optional)
        :param rse:      The RSE on which the attached files are registered (optional)
        :param verbose:  Flag to print DataIdentifierAlreadyExists exceptions
        :return result:  0 if successful, 1 for failure
        """
        result = 1
        try:
            self._rucio_client.add_dataset(scope, name, statuses=statuses, meta=meta, rules=rules,
                                           lifetime=lifetime, files=files, rse=rse)
            result = 0
        except DataIdentifierAlreadyExists as e:
            if verbose:
                print(e)
        return result

    #Rules:
    def AddRule(self,
                dids,
                copies,
                rse_expression,
                weight=None,
                lifetime=None,
                grouping='DATASET',
                account=None,
                locked=False,
                source_replica_expression=None,
                activity=None,
                notify='N',
                purge_replicas=False,
                ignore_availability=False,
                comment=None,
                ask_approval=False,
                asynchronous=False,
                priority=3,
                meta=None):
        """Function: AddRule(...)

        A function to add a Rucio transfer rule to the given Rucio data identifiers (DIDs)
        More information under https://github.com/rucio/rucio

        :param dids:                       The data identifier set.
        :param copies:                     The number of replicas.
        :param rse_expression:             Boolean string expression to give the list of RSEs.
        :param weight:                     If the weighting option of the replication rule is used, the choice of RSEs takes their weight into account.
        :param lifetime:                   The lifetime of the replication rules (in seconds).
        :param grouping:                   ALL -  All files will be replicated to the same RSE.
                                           DATASET - All files in the same dataset will be replicated to the same RSE.
                                           NONE - Files will be completely spread over all allowed RSEs without any grouping considerations at all.
        :param account:                    The account owning the rule.
        :param locked:                     If the rule is locked, it cannot be deleted.
        :param source_replica_expression:  RSE Expression for RSEs to be considered for source replicas.
        :param activity:                   Transfer Activity to be passed to FTS.
        :param notify:                     Notification setting for the rule (Y, N, C).
        :param purge_replicas:             When the rule gets deleted purge the associated replicas immediately.
        :param ignore_availability:        Option to ignore the availability of RSEs.
        :param ask_approval:               Ask for approval of this replication rule.
        :param asynchronous:               Create rule asynchronously by judge-injector.
        :param priority:                   Priority of the transfers.
        :param comment:                    Comment about the rule.
        :param meta:                       Metadata, as dictionary.

        :return result:  0 if successful, 1 for failure
        """
        result = 1

        try:
            self._rucio_client.add_replication_rule(
                dids,
                copies,
                rse_expression,
                weight=weight,
                lifetime=lifetime,
                grouping=grouping,
                account=account,
                locked=locked,
                source_replica_expression=source_replica_expression,
                activity=activity,
                notify=notify,
                purge_replicas=purge_replicas,
                ignore_availability=ignore_availability,
                comment=comment,
                ask_approval=ask_approval,
                asynchronous=asynchronous,
                priority=priority,
                meta=meta)
            result = 0
        except DuplicateRule as e:
            print(e)

        return result
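
    # Usage sketch (hedged): one replica of a dataset on a placeholder RSE
    # expression, kept for a week:
    #   api.AddRule([{'scope': 'user.jdoe', 'name': 'dataset_x'}],
    #               copies=1, rse_expression='SOME_RSE',
    #               lifetime=7 * 24 * 3600)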

    def UpdateRule(self, rule_id, options=None):
        """Function UpdateRule()

        Aims to update a particular rule according to its rule_id and further option such as lifetime
        :param rule_id: A Rucio rule id string
        :param options: A dictionary with certain options (e.g. lifetime, weight, ,...)
        :return result: 0 on success, 1 at failure
        """
        result = 1
        try:
            self._rucio_client.update_replication_rule(rule_id, options)
            result = 0
        except Exception:
            print("Raised exception in UpdateRule")

        return result

    def GetReplicationRule(self, rule_id, estimate_ttc=False):
        """Function: GetReplicationRule(...)

        Get information on the replication rule based on the rule ID

        :param rule_id: A valid Rucio rule ID
        :return result: Information on the replication rule, otherwise 1
        """
        result = 1
        try:
            result = self._rucio_client.get_replication_rule(
                rule_id, estimate_ttc=estimate_ttc)
        except Exception:
            print("No replication rule to get")
        return result

    def DeleteRule(self, rule_id):
        """Function: DeleteRule(...)

        Deletes a replication rule.
        :param rule_id: A rucio rule id string
        """
        self._rucio_client.delete_replication_rule(rule_id,
                                                   purge_replicas=True)

    #Metadata:
    def GetMetadata(self, scope, name):
        """Function: GetMetadata(...)

        Return the metadata of a DID, or None on failure.
        """
        try:
            return self._rucio_client.get_metadata(scope, name)
        except Exception:
            return None

    def SetMetadata(self, scope, name, key, value, recursive=False):
        """Function: SetMetadata(...)

        Set a single metadata key/value pair on a DID. Returns None on failure.
        """
        try:
            return self._rucio_client.set_metadata(scope,
                                                   name,
                                                   key,
                                                   value,
                                                   recursive=recursive)
        except Exception:
            return None

    #Data upload / download / register
    def Upload(self, upload_dict=None):
        """Function: Upload()

        The list of dictionaries needs to follow this convention:
        Rucio/Github: https://github.com/rucio/rucio/blob/master/lib/rucio/client/uploadclient.py#L71

        :param upload_dict: A list object with dictionaries
        :return result: 0 on success, 1 on failure

        """
        result = self._rucio_client_upload.upload(upload_dict)
        return result
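
    # Usage sketch (hedged): each item in upload_dict follows the UploadClient
    # convention; the path, RSE and DID names below are placeholders:
    #   api.Upload([{'path': '/data/file_1', 'rse': 'SOME_RSE',
    #                'did_scope': 'user.jdoe', 'did_name': 'file_1'}])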

    def DownloadDids(self, items, num_threads=2, trace_custom_fields=None):
        """Function: DownloadDids(...)

        Download from the Rucio catalogue by Rucio DIDs (or a list of them)

        :param items: A list or a dictionary describing what to download
        :param num_threads: Specify the number of download threads, default 2 (optional)
        :param trace_custom_fields: Customize downloads (see the Rucio tutorials) (optional)
        :return result: A list of dictionaries of Rucio download result messages. If it fails: 1
        """
        result = 1
        # avoid the mutable-default-argument pitfall
        if trace_custom_fields is None:
            trace_custom_fields = {}
        # if a single dictionary is handed over, wrap it in a list
        if isinstance(items, dict):
            items = [items]

        try:
            result = self._rucio_client_download.download_dids(
                items=items,
                num_threads=num_threads,
                trace_custom_fields=trace_custom_fields)
        except Exception:
            result = 1

        return result

    def Register(self, rse, files, ignore_availability=True,
                 dataset_scope=None, dataset_name=None):
        """Function: Register(...)

        Register replicas that already exist on a storage element in the
        Rucio catalogue and, optionally, attach them to a dataset.
        (Based on the "IceCube Script to register data" email from Benedikt.)

        :param rse: The RSE on which the replicas are located
        :param files: A list of dictionaries with the keys scope, name, adler32 and bytes
        :param ignore_availability: Option to ignore the availability of the RSE
        :param dataset_scope: Scope of a dataset to attach the files to (optional)
        :param dataset_name: Name of a dataset to attach the files to (optional)
        """
        try:
            self._rucio_client.add_replicas(rse, files, ignore_availability)
        except Exception:
            print("Problem: file name does not match the expected pattern")

        # attach the registered files to a dataset only if one is given
        if dataset_scope is None or dataset_name is None:
            return

        for filemd in files:
            try:
                self._rucio_client.attach_dids(scope=dataset_scope,
                                               name=dataset_name,
                                               dids=[{
                                                   'scope': filemd['scope'],
                                                   'name': filemd['name']
                                               }])
            except FileAlreadyExists:
                print("File already attached")
Code Example #20
0
File: runstrax.py Project: napoliion/outsource
def main():

    parser = argparse.ArgumentParser(description="Strax Processing With Outsource")
    parser.add_argument('dataset', help='Run number', type=int)
    parser.add_argument('--output', help='desired strax(en) output')
    parser.add_argument('--context', help='name of context')
    parser.add_argument('--chunks', nargs='*', help='chunk ids to download')
    parser.add_argument('--rse', type=str, default="UC_OSG_USERDISK")
    parser.add_argument('--cmt', type=str, default='ONLINE')
    parser.add_argument('--upload-to-rucio', action='store_true', dest='upload_to_rucio')
    parser.add_argument('--update-db', action='store_true', dest='update_db')
    parser.add_argument('--download-only', action='store_true', dest='download_only')
    parser.add_argument('--no-download', action='store_true', dest='no_download')

    args = parser.parse_args()

    # directory where we will be putting everything
    data_dir = './data'

    # make sure this is empty
    # if os.path.exists(data_dir):
    #     rmtree(data_dir)

    # get context
    st = getattr(straxen.contexts, args.context)()
    st.storage = [strax.DataDirectory(data_dir)]

    apply_global_version(st, args.cmt)

    runid = args.dataset
    runid_str = "%06d" % runid
    out_dtype = args.output

    # determine which input dtypes we need
    bottom = 'peaklets' if args.chunks is None else 'raw_records'
    to_download = find_data_to_download(runid, out_dtype, st, bottom=bottom)

    if not args.no_download:
        t0 = time.time()
        # download all the required datatypes to produce this output file
        if args.chunks:
            for in_dtype, dtype_hash in to_download:
                # download the input data for the requested chunks only
                if not os.path.exists(os.path.join(data_dir, f"{runid:06d}-{in_dtype}-{dtype_hash}")):
                    admix.download(runid, in_dtype, dtype_hash, chunks=args.chunks, location=data_dir)
        else:
            for in_dtype, dtype_hash in to_download:
                if not os.path.exists(os.path.join(data_dir, f"{runid:06d}-{in_dtype}-{dtype_hash}")):
                    admix.download(runid, in_dtype, dtype_hash, location=data_dir)
    
        download_time = time.time() - t0 # seconds
        print(f"=== Download time (minutes): {download_time/60:0.2f}")

    # initialize plugin needed for processing this output type
    plugin = st._get_plugins((out_dtype,), runid_str)[out_dtype]
    st._set_plugin_config(plugin, runid_str, tolerant=False)
    plugin.setup()

    # figure out what plugins we need to process/initialize
    to_process = [args.output]
    downloaded = [dtype for dtype, _ in to_download]
    missing = set(plugin.depends_on) - set(downloaded)
    if len(missing) > 0:
        missing_str = ', '.join(missing)
        print(f"Need to create intermediate data: {missing_str}")
        to_process = list(missing) + to_process

    # keep track of the data we just downloaded -- will be important for the upload step later
    downloaded_data = os.listdir(data_dir)
    print("--Downloaded data--")
    for dd in downloaded_data:
        print(dd)
    print("-------------------\n")

    if args.download_only:
        sys.exit(0)

    print(f"To process: {', '.join(to_process)}")

    _tmp_path = tempfile.mkdtemp()
    for dtype in to_process:
        close_savers = dtype != args.output
        process(runid,
                dtype,
                st,
                args.chunks,
                close_savers=close_savers,
                tmp_path=_tmp_path
                )

    print("Done processing. Now check if we should upload to rucio")

    # now we move the tmpfiles back to main directory, if needed
    # this is for cases where we went from raw_records-->records-->peaklets in one go
    if os.path.exists(_tmp_path):
        for dtype_path_thing in os.listdir(_tmp_path):
            tmp_path = os.path.join(_tmp_path, dtype_path_thing)
            merged_dir = os.path.join(data_dir, dtype_path_thing.split('_temp')[0])

            for file in os.listdir(tmp_path):
                copyfile(os.path.join(tmp_path, file), os.path.join(merged_dir, file))

            os.rename(merged_dir, os.path.join(data_dir, dtype_path_thing))


    # initiate the rucio client
    upload_client = UploadClient()
    rucio_client = Client()

    # if we processed the entire run, we upload everything including metadata
    # otherwise, we just upload the chunks
    upload_meta = args.chunks is None

    # now loop over datatypes we just made and upload the data
    processed_data = [d for d in os.listdir(data_dir) if d not in downloaded_data]
    print("---- Processed data ----")
    for d in processed_data:
        print(d)
    print("------------------------\n")

    if not args.upload_to_rucio:
        print("Ignoring rucio upload. Exiting. ")
        return

    for dirname in processed_data:
        # get rucio dataset
        this_run, this_dtype, this_hash = dirname.split('-')
        if this_dtype in rechunk_dtypes:
            print(f"Skipping upload of {this_dtype} since we need to rechunk it")
            continue

        # remove the _temp if we are processing chunks in parallel
        if args.chunks is not None:
            this_hash = this_hash.replace('_temp', '')
        dataset = make_did(int(this_run), this_dtype, this_hash)

        scope, dset_name = dataset.split(':')

        files = os.listdir(os.path.join(data_dir, dirname))

        if not upload_meta:
            files = [f for f in files if not f.endswith('.json')]

            # check that the output number of files is what we expect
            if len(files) != len(args.chunks):
                processed_chunks = set([int(f.split('-')[-1]) for f in files])
                expected_chunks = set([int(c) for c in args.chunks])
                missing_chunks = expected_chunks - processed_chunks
                missing_chunks = ' '.join([str(c) for c in sorted(missing_chunks)])
                raise RuntimeError("File mismatch! We are missing output data for the following chunks: "
                                   f"{missing_chunks}"
                                   )


        # if there are no files, we can't upload them
        if len(files) == 0:
            print(f"No files to upload in {dirname}. Skipping.")
            continue

        # get list of files that have already been uploaded
        # this is to allow us re-run workflow for some chunks
        try:
            existing_files = [f for f in rucio_client.list_dids(scope,
                                                                {'type': 'file'},
                                                                type='file')]
            existing_files = [f for f in existing_files if dset_name in f]

            existing_files_in_dataset = [f['name'] for f in rucio_client.list_files(scope, dset_name)]

            # for some reason files get uploaded but not attached correctly
            need_attached = list(set(existing_files) - set(existing_files_in_dataset))

            # only consider the chunks here
            need_attached = [f for f in need_attached if str(int(f.split('-')[-1])) in args.chunks]


            if len(need_attached) > 0:
                dids_to_attach = [dict(scope=scope, name=name) for name in need_attached]

                rucio_client.attach_dids(scope, dset_name, dids_to_attach)


        except rucio.common.exception.DataIdentifierNotFound:
            existing_files = []

        # prepare list of dicts to be uploaded
        to_upload = []


        for f in files:
            path = os.path.join(data_dir, dirname, f)
            if f in existing_files:
                print(f"Skipping {f} since it is already uploaded")
                continue

            print(f"Uploading {f}")
            d = dict(path=path,
                     did_scope=scope,
                     did_name=f,
                     dataset_scope=scope,
                     dataset_name=dset_name,
                     rse=args.rse,
                     register_after_upload=True
                     )
            to_upload.append(d)

        # now do the upload!
        if len(to_upload) == 0:
            print(f"No files to upload for {dirname}")
            continue
        try:
            upload_client.upload(to_upload)
        except Exception:
            print(f"Upload of {dset_name} failed for some reason")
            raise

        # TODO check rucio that the files are there?
        print(f"Upload of {len(files)} files in {dirname} finished successfully")

        # if we processed the whole thing, add a rule at DALI and update the runDB here
        if args.chunks is None:
            rucio_client.add_replication_rule([dict(scope=scope, name=dset_name)], 1, 'UC_DALI_USERDISK',
                                              source_replica_expression=args.rse,
                                              priority=5)
            # skip if update_db flag is false
            if args.update_db:
                md = st.get_meta(runid_str, this_dtype)
                chunk_mb = [chunk['nbytes'] / (1e6) for chunk in md['chunks']]
                data_size_mb = np.sum(chunk_mb)
                avg_data_size_mb = np.mean(chunk_mb)

                # update runDB
                new_data_dict = dict()
                new_data_dict['location'] = args.rse
                new_data_dict['did'] = dataset
                new_data_dict['status'] = 'transferred'
                new_data_dict['host'] = "rucio-catalogue"
                new_data_dict['type'] = this_dtype
                new_data_dict['protocol'] = 'rucio'
                new_data_dict['creation_time'] = datetime.datetime.utcnow().isoformat()
                new_data_dict['creation_place'] = "OSG"
                new_data_dict['meta'] = dict(lineage=md.get('lineage'),
                                             avg_chunk_mb=avg_data_size_mb,
                                             file_count=len(files),
                                             size_mb=data_size_mb,
                                             strax_version=strax.__version__,
                                             straxen_version=straxen.__version__
                                             )

                db.update_data(runid, new_data_dict)
                print(f"Database updated for {this_dtype} at {args.rse}")

                # now update dali db entry
                rule = rc.GetRule(dataset, 'UC_DALI_USERDISK')
                if rule['state'] == 'OK':
                    status = 'transferred'
                elif rule['state'] == 'REPLICATING':
                    status = 'transferring'
                else:
                    # covers 'STUCK' and any unexpected rule state so that
                    # 'status' is always defined below
                    status = 'stuck'
                new_data_dict['location'] = 'UC_DALI_USERDISK'
                new_data_dict['status'] = status
                db.update_data(runid, new_data_dict)

        # cleanup the files we uploaded
        # this is likely only done for records data because we will rechunk the others
        for f in files:
            print(f"Removing {f}")
            os.remove(os.path.join(data_dir, dirname, f))

    print("ALL DONE!")