def test_put_mgr_ok_multi(self):
    """(RSE/PROTOCOLS): Put multiple files to storage (Success)"""
    # Describe both local files the same way before handing them to the manager.
    lfns = []
    for name in ('1_rse_local_put.raw', '2_rse_local_put.raw'):
        local_path = '%s/%s' % (self.tmpdir, name)
        lfns.append({
            'name': name,
            'scope': 'user.%s' % self.user,
            'adler32': adler32(local_path),
            'filesize': os.stat(local_path)[os.path.stat.ST_SIZE],
        })

    result = mgr.upload(self.rse_settings, lfns, source_dir=self.tmpdir)
    status, details = result[0], result[1]

    # The global status and both per-file entries have to be truthy.
    first_did = 'user.%s:1_rse_local_put.raw' % self.user
    second_did = 'user.%s:2_rse_local_put.raw' % self.user
    if status and details[first_did] and details[second_did]:
        return
    raise Exception('Return not as expected: %s, %s' % (status, details))
def test_put_mgr_ok_multi(self):
    """(RSE/PROTOCOLS): Put multiple files to storage (Success)"""
    names = ('1_rse_local_put.raw', '2_rse_local_put.raw')

    if self.rse_settings['protocols'][0]['hostname'] == 'ssh1':
        # For the 'ssh1' host the files are described by md5 and the
        # protocol implementation is passed explicitly.
        lfns = [{'name': name,
                 'scope': 'user.%s' % self.user,
                 'md5': md5(str(self.tmpdir) + '/' + name),
                 'filesize': os.stat('%s/%s' % (self.tmpdir, name))[os.path.stat.ST_SIZE]}
                for name in names]
        result = mgr.upload(self.rse_settings, lfns, source_dir=self.tmpdir, vo=self.vo, impl=self.impl)
    else:
        # Every other host gets adler32 checksums and the default implementation.
        lfns = [{'name': name,
                 'scope': 'user.%s' % self.user,
                 'adler32': adler32('%s/%s' % (self.tmpdir, name)),
                 'filesize': os.stat('%s/%s' % (self.tmpdir, name))[os.path.stat.ST_SIZE]}
                for name in names]
        result = mgr.upload(self.rse_settings, lfns, source_dir=self.tmpdir, vo=self.vo)

    status, details = result[0], result[1]
    if not status or not details['user.%s:1_rse_local_put.raw' % self.user] or not details['user.%s:2_rse_local_put.raw' % self.user]:
        raise Exception('Return not as expected: %s, %s' % (status, details))
def test_put_mgr_ok_multi(self):
    """(RSE/PROTOCOLS): Put multiple files to storage (Success)"""
    def describe(name):
        # Name, scope, adler32 and size of one file located in the temp dir.
        local_path = '%s/%s' % (self.tmpdir, name)
        return {'name': name,
                'scope': 'user.%s' % self.user,
                'adler32': adler32(local_path),
                'filesize': os.stat(local_path)[os.path.stat.ST_SIZE]}

    lfns = [describe('1_rse_local_put.raw'), describe('2_rse_local_put.raw')]
    status, details = mgr.upload(self.rse_settings, lfns, self.tmpdir)

    # Fail unless the overall status and both per-file results are truthy.
    if not status or not details['user.%s:1_rse_local_put.raw' % self.user] or not details['user.%s:2_rse_local_put.raw' % self.user]:
        raise Exception('Return not as expected: %s, %s' % (status, details))
def test_utils_adler32(self):
    """(COMMON/UTILS): test calculating Adler32 of a file"""
    checksum = adler32(self.temp_file_1.name)

    # The checksum is a string starting with a hex digit and has a known value.
    assert isinstance(checksum, str)
    starts_with_hex = match('[a-fA-F0-9]', checksum)
    assert starts_with_hex is not None
    assert checksum == '198d03ff'

    # A missing file must be reported with the full error text.
    expected_error = 'FATAL - could not get Adler-32 checksum of file no_file: \\[Errno 2\\] No such file or directory: \'no_file\''
    with pytest.raises(Exception, match=expected_error):
        adler32('no_file')
def test_utils_adler32(self):
    """(COMMON/UTILS): test calculating Adler32 of a file"""
    ret = adler32(self.temp_file_1.name)
    assert_is_instance(ret, str)
    assert_is_not_none(match('[a-fA-F0-9]', ret))
    assert_equal(ret, '198d03ff')

    with assert_raises(Exception) as e:
        adler32('no_file')
    # Compare against str(exception): the ``message`` attribute only exists on
    # Python 2 exceptions, while str() yields the same text on Python 2 and 3.
    assert_equal('FATAL - could not get Adler32 checksum of file no_file - [Errno 2] No such file or directory: \'no_file\'', str(e.exception))
def _collect_file_info(self, filepath, item):
    """
    Build the description of a local file from its path and the caller's options.
    (This function is meant to be used as class internal only)

    The input options are deep-copied, so ``item`` itself is never modified.

    :param filepath: path where the file is stored
    :param item: input options for the given file

    :returns: a dictionary containing all collected info and the input options
    """
    info = copy.deepcopy(item)

    # Location of the file
    info['path'] = filepath
    info['dirname'] = os.path.dirname(filepath)
    info['basename'] = os.path.basename(filepath)

    # Size and checksums
    info['bytes'] = os.stat(filepath).st_size
    info['adler32'] = adler32(filepath)
    info['md5'] = md5(filepath)

    info['meta'] = {'guid': self._get_file_guid(info)}
    info['state'] = 'C'

    # A missing or empty DID scope/name falls back to the defaults.
    info['did_scope'] = info.get('did_scope') or self.default_file_scope
    info['did_name'] = info.get('did_name') or info['basename']
    return info
def test_put_mgr_ok_single(self):
    """(RSE/PROTOCOLS): Put a single file to storage (Success)"""
    over_ssh1 = self.rse_settings['protocols'][0]['hostname'] == 'ssh1'

    local_path = '%s/3_rse_local_put.raw' % self.tmpdir
    lfn = {'name': '3_rse_local_put.raw', 'scope': 'user.%s' % self.user}
    extra_kwargs = {}
    if over_ssh1:
        # The 'ssh1' host is given an md5 checksum and an explicit implementation.
        lfn['md5'] = md5(local_path)
        extra_kwargs['impl'] = self.impl
    else:
        lfn['adler32'] = adler32(local_path)
    lfn['filesize'] = os.stat(local_path)[os.path.stat.ST_SIZE]

    mgr.upload(self.rse_settings, lfn, source_dir=self.tmpdir, vo=self.vo, **extra_kwargs)
def _collect_file_info(self, filepath, settings):
    """
    Gather size, checksums and naming information for a local file.
    (This function is meant to be used as class internal only)

    Works on a deep copy of ``settings``; the caller's dictionary is untouched.

    :param filepath: path where the file is stored
    :param settings: input options for the given file

    :returns: a dictionary containing all collected info and the input options
    """
    info = copy.deepcopy(settings)
    info['path'] = filepath
    info['dirname'] = os.path.dirname(filepath)
    info['basename'] = os.path.basename(filepath)
    info['bytes'] = os.stat(filepath).st_size
    info['adler32'] = adler32(filepath)
    info['md5'] = md5(filepath)
    info['meta'] = {'guid': self._get_file_guid(info)}
    info['state'] = 'C'

    # Only fill these in when the caller did not provide the key at all.
    fallbacks = (
        ('did_scope', self.default_file_scope),
        ('did_name', info['basename']),
        ('lifetime', None),
    )
    for key, fallback in fallbacks:
        info.setdefault(key, fallback)

    return info
def test_put_mgr_SourceNotFound_multi(self):
    """(RSE/PROTOCOLS): Put multiple files to storage (SourceNotFound)"""
    # First entry points at a file that does not exist, second one at a real file.
    missing_lfn = {
        'name': 'not_existing_data.raw',
        'scope': 'user.%s' % self.user,
        'adler32': 'some_random_stuff',
        'filesize': 4711,
    }
    existing_path = '%s/4_rse_local_put.raw' % self.tmpdir
    existing_lfn = {
        'name': '4_rse_local_put.raw',
        'scope': 'user.%s' % self.user,
        'adler32': adler32(existing_path),
        'filesize': os.stat(existing_path)[os.path.stat.ST_SIZE],
    }

    result = mgr.upload(self.rse_settings, [missing_lfn, existing_lfn], source_dir=self.tmpdir)
    status, details = result[0], result[1]

    if not details['user.%s:4_rse_local_put.raw' % self.user]:
        raise Exception('Return not as expected: %s, %s' % (status, details))
    # The existing file went through: re-raise what was recorded for the missing one.
    raise details['user.%s:not_existing_data.raw' % self.user]
def stat(self, pfn):
    """
    Determines the file size in bytes and checksum (adler32) of the provided file.

    :param pfn: The PFN of the file; it is translated to a local path with ``self.pfn2path``.

    :returns: a dict containing the keys filesize and adler32.
    """
    path = self.pfn2path(pfn)
    # Use the documented ``st_size`` attribute of the stat result instead of
    # indexing it through ``os.path.stat``, which is not a documented part of
    # the ``os.path`` API.
    return {'filesize': os.stat(path).st_size, 'adler32': adler32(path)}
def test_put_mgr_SourceNotFound_multi(self):
    """(RSE/PROTOCOLS): Put multiple files to storage (SourceNotFound)"""
    existing_path = '%s/4_rse_local_put.raw' % self.tmpdir
    lfns = [
        # Source file that is not present in the temp dir
        {'name': 'not_existing_data.raw',
         'scope': 'user.%s' % self.user,
         'adler32': 'some_random_stuff',
         'filesize': 4711},
        # Source file that is present
        {'name': '4_rse_local_put.raw',
         'scope': 'user.%s' % self.user,
         'adler32': adler32(existing_path),
         'filesize': os.stat(existing_path)[os.path.stat.ST_SIZE]},
    ]

    status, details = mgr.upload(self.rse_settings, lfns, self.tmpdir)

    if not details['user.%s:4_rse_local_put.raw' % self.user]:
        raise Exception('Return not as expected: %s, %s' % (status, details))
    # Propagate whatever was stored for the missing file.
    raise details['user.%s:not_existing_data.raw' % self.user]
def test_upload_multi(rse, scope, upload_client, download_client, file_factory):
    """Upload two generated files, download them again and compare adler32 checksums."""
    local_files = [file_factory.file_generator(use_basedir=True) for _ in range(2)]
    download_dir = file_factory.base_dir
    names = [os.path.basename(path) for path in local_files]

    items = [
        {'path': path, 'rse': rse, 'did_scope': scope, 'did_name': name, 'guid': generate_uuid()}
        for path, name in zip(local_files, names)
    ]
    status = upload_client.upload(items)
    assert status == 0

    # download the files
    download_client.download_dids([
        {'did': f"{scope}:{name}", 'base_dir': download_dir} for name in names
    ])

    # match checksums
    for path, name in zip(local_files, names):
        downloaded_file = f"{download_dir}/{scope}/{name}"
        assert adler32(path) == adler32(downloaded_file)
def test_upload_single(rse, scope, upload_client, download_client, file_factory):
    """Upload one generated file, download it again and compare adler32 checksums."""
    local_file = file_factory.file_generator()
    download_dir = file_factory.base_dir
    fn = os.path.basename(local_file)

    # upload a file
    item = {
        'path': local_file,
        'rse': rse,
        'did_scope': scope,
        'did_name': fn,
        'guid': generate_uuid(),
    }
    assert upload_client.upload([item]) == 0

    # download the file
    request = {'did': f"{scope}:{fn}", 'base_dir': download_dir}
    download_client.download_dids([request])

    # match checksums
    original_checksum = adler32(local_file)
    downloaded_checksum = adler32(f"{download_dir}/{scope}/{fn}")
    assert original_checksum == downloaded_checksum
def test_put_mgr_ok_single(self):
    """(RSE/PROTOCOLS): Put a single file to storage (Success)"""
    name = '3_rse_local_put.raw'
    local_path = '%s/%s' % (self.tmpdir, name)

    # A single dict (not a list) is handed to the manager.
    lfn = {
        'name': name,
        'scope': 'user.%s' % self.user,
        'adler32': adler32(local_path),
        'filesize': os.stat(local_path)[os.path.stat.ST_SIZE],
    }
    mgr.upload(self.rse_settings, lfn, source_dir=self.tmpdir)
def collect_file_info(self, filepath, settings):
    """
    Collect path, size, checksum and naming information about a local file.

    :param filepath: path where the file is stored
    :param settings: input options for the given file; deep-copied, never modified

    :returns: a dictionary containing the input options plus all collected info
    """
    info = copy.deepcopy(settings)
    info.update({
        'path': filepath,
        'dirname': os.path.dirname(filepath),
        'basename': os.path.basename(filepath),
        'bytes': os.stat(filepath).st_size,
        'adler32': adler32(filepath),
        'md5': md5(filepath),
    })
    # The guid lookup receives the info gathered so far.
    info['meta'] = {'guid': self.get_file_guid(info)}
    info['state'] = 'C'

    # Defaults apply only to keys the caller did not supply.
    info.setdefault('did_scope', self.default_file_scope)
    info.setdefault('did_name', info['basename'])
    info.setdefault('lifetime', None)
    return info
def check_storage(filepath):
    """
    Check size and checksum of a file on storage.

    :param filepath: path of the file to inspect

    :returns: a ``(size, adler32, md5)`` tuple, or ``False`` when the size or
              the checksums could not be determined
    """
    logging.info("Checking %s" % filepath)
    try:
        size = os.stat(filepath).st_size
        adler_checksum = adler32(filepath)
        md5_checksum = md5(filepath)
        # Some checksums come back shorter than 8 characters (leading zeros
        # missing). zfill pads to exactly 8 for any shorter length and leaves
        # longer values untouched, instead of blindly prepending one zero.
        adler_checksum = adler_checksum.zfill(8)
        logging.info("Got size and checksum of file: %s size=%s adler32 checksum=%s md5 checksum=%s"
                     % (filepath, size, adler_checksum, md5_checksum))
    except Exception:
        # Best effort: any failure is reported as "not found", but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        logging.warning("no file found at %s" % filepath)
        return False
    return size, adler_checksum, md5_checksum
def rucio_register(self, filenames):
    """
    Register the given local files as replicas on ``self.rse`` and attach them
    to the dataset ``self.scope``:``self.dataset``.

    :param filenames: iterable of path objects; each must support ``str()`` and
                      expose ``.parts`` (presumably ``pathlib.Path`` - confirm with callers)
    """
    files = []
    for filename in filenames:
        local_path = str(filename)
        # The last path component is used as the file name and PFN suffix.
        lfn = str(filename.parts[-1])
        files.append({
            'scope': self.scope,
            'name': lfn,
            'bytes': os.stat(local_path).st_size,
            'adler32': adler32(local_path),
            'pfn': self.pfn + lfn
        })

    replica_client = ReplicaClient()
    replica_client.add_replicas(rse=self.rse, files=files)
    didclient = DIDClient()
    didclient.add_files_to_dataset(self.scope, self.dataset, files)
def upload(files, scope, metadata, rse, account, source_dir, worker_number, total_workers, dataset_lifetime, did=None, set_metadata=False):
    """
    Compute size and adler32 of every file in ``source_dir`` and physically
    upload them to ``rse``, trying the upload up to three times.

    :param files: names of the files to upload, relative to ``source_dir``
    :param scope: scope used for every file
    :param rse: name of the destination RSE
    :param source_dir: local directory containing the files
    :param worker_number: index of this worker, used in log messages
    :param total_workers: number of workers, used in log messages
    :param did: optional ``scope:name`` string, split into a dataset description
    :param metadata, account, dataset_lifetime, set_metadata: not used in the visible body

    :returns: False when the physical upload failed or raised
    """
    logging.debug('In upload')
    dsn = None
    if did:
        dsn = {'scope': did.split(':')[0], 'name': did.split(':')[1]}
    client = Client()

    list_files = []
    lfns = []
    prepend_str = 'Thread [%i/%i] : ' % (worker_number, total_workers)
    logging.debug(prepend_str + 'Looping over the files')
    for filename in files:
        fullpath = '%s/%s' % (source_dir, filename)
        size = stat(fullpath).st_size
        checksum = adler32(fullpath)
        logging.info(prepend_str + 'File %s : Size %s , adler32 %s' % (fullpath, str(size), checksum))
        list_files.append({'scope': scope, 'name': filename, 'bytes': size, 'adler32': checksum, 'meta': {'guid': generate_uuid()}})
        lfns.append({'name': filename, 'scope': scope, 'filesize': size, 'adler32': checksum, 'filename': filename})

    # Physical upload
    logging.info(prepend_str + 'Uploading physically the files %s on %s' % (str(lfns), rse))
    rse_info = rsemgr.get_rse_info(rse)
    try:
        success_upload = True
        for cnt in range(0, 3):
            # Judge every attempt on its own: a failure recorded by an earlier
            # attempt must not turn a later successful retry into a failure.
            success_upload = True
            global_status, ret = rsemgr.upload(rse_info, lfns=lfns, source_dir=source_dir)
            logging.info(prepend_str + 'Returned global status : %s, Returned : %s' % (str(global_status), str(ret)))
            if not global_status:
                for item in ret:
                    # Already existing replicas do not count as a failure.
                    if (not isinstance(ret[item], FileReplicaAlreadyExists)) and ret[item] is not True:
                        sleep(exp(cnt))  # exponential back-off before the next attempt
                        success_upload = False
                        logging.error(prepend_str + 'Problem to upload file %s with error %s' % (item, str(ret[item])))
                        break
            else:
                break
        if not success_upload:
            logging.error(prepend_str + 'Upload operation to %s failed, removing leftovers' % (rse))
            rsemgr.delete(rse_info, lfns=lfns)
            return False
    except Exception as error:
        logging.error(prepend_str + '%s' % (str(error)))
        return False
def upload(files, scope, metadata, rse, account, source_dir, worker_number, total_workers, dataset_lifetime, did=None):
    """
    Compute size and adler32 of every file in ``source_dir`` and physically
    upload them to ``rse``, trying the upload up to three times.

    :param files: names of the files to upload, relative to ``source_dir``
    :param scope: scope used for every file
    :param rse: name of the destination RSE
    :param source_dir: local directory containing the files
    :param worker_number: index of this worker, used in log messages
    :param total_workers: number of workers, used in log messages
    :param did: optional ``scope:name`` string, split into a dataset description
    :param metadata, account, dataset_lifetime: not used in the visible body

    :returns: False when the physical upload failed or raised
    """
    logging.debug('In upload')
    dsn = None
    if did:
        dsn = {'scope': did.split(':')[0], 'name': did.split(':')[1]}
    client = Client()

    list_files = []
    lfns = []
    logging.debug('Thread [%i/%i] : Looping over the files' % (worker_number, total_workers))
    for filename in files:
        fullpath = '%s/%s' % (source_dir, filename)
        size = stat(fullpath).st_size
        checksum = adler32(fullpath)
        logging.info('Thread [%i/%i] : File %s : Size %s , adler32 %s' % (worker_number, total_workers, fullpath, str(size), checksum))
        list_files.append({'scope': scope, 'name': filename, 'bytes': size, 'adler32': checksum, 'meta': {'guid': generate_uuid()}})
        lfns.append({'name': filename, 'scope': scope, 'filesize': size, 'adler32': checksum})

    # Physical upload
    logging.info('Thread [%i/%i] : Uploading physically the files %s on %s' % (worker_number, total_workers, str(lfns), rse))
    rse_info = rsemgr.get_rse_info(rse)
    try:
        success_upload = True
        for attempt in range(0, 3):
            # Judge every attempt on its own: a failure recorded by an earlier
            # attempt must not turn a later successful retry into a failure.
            success_upload = True
            gs, ret = rsemgr.upload(rse_info, lfns=lfns, source_dir=source_dir)
            logging.info('Returned global status : %s, Returned : %s' % (str(gs), str(ret)))
            if not gs:
                for key in ret:
                    # Already existing replicas do not count as a failure.
                    if (not isinstance(ret[key], FileReplicaAlreadyExists)) and ret[key] is not True:
                        sleep(exp(attempt))  # exponential back-off before the next attempt
                        success_upload = False
                        logging.error('Problem to upload file %s with error %s' % (key, str(ret[key])))
                        break
            else:
                break
        if not success_upload:
            logging.error('Thread [%i/%i] : Upload operation to %s failed, removing leftovers' % (worker_number, total_workers, rse))
            rsemgr.delete(rse_info, lfns=lfns)
            return False
    except Exception as error:
        # Report the failure instead of dropping it silently.
        logging.error('Thread [%i/%i] : %s' % (worker_number, total_workers, str(error)))
        return False
def test_put_mgr_ok_single(self):
    """(RSE/PROTOCOLS): Put a single file to storage (Success)"""
    local_path = '%s/3_rse_local_put.raw' % self.tmpdir

    # One dict describing the file; the source directory is passed positionally.
    lfn = {}
    lfn['name'] = '3_rse_local_put.raw'
    lfn['scope'] = 'user.%s' % self.user
    lfn['adler32'] = adler32(local_path)
    lfn['filesize'] = os.stat(local_path)[os.path.stat.ST_SIZE]

    mgr.upload(self.rse_settings, lfn, self.tmpdir)
def download(rse_settings, files, dest_dir='.', printstatements=False):
    """
    Copy a file from the connected storage to the local file system.
    Providing a list indicates the bulk mode.

    Files that carry an 'adler32' entry are first downloaded to a '.part' file,
    validated against that checksum and only then renamed to their final name.
    Files already present at their final location are not downloaded again.

    :param rse_settings: RSE to use
    :param files: a single dict or a list with dicts containing 'scope' and 'name'
                  if LFNs are provided and additional 'pfn' if PFNs are provided.
                  E.g. [{'name': '2_rse_remote_get.raw', 'scope': 'user.jdoe'},
                        {'name':'3_rse_remote_get.raw', 'scope': 'user.jdoe', 'pfn': 'user/jdoe/5a/98/3_rse_remote_get.raw'}]
    :param dest_dir: path to the directory where the downloaded files will be stored.
                     For each scope a separate subdirectory is created
    :param printstatements: if True, progress messages are printed

    :returns: True when exactly one result was recorded; otherwise a list
              ``[global_status, details]`` where details maps 'scope:name' to
              True or to the exception raised for that file

    :raises SourceNotFound: remote source file can not be found on storage
    :raises DestinationNotAccessible: local destination directory is not accessible
    :raises FileConsistencyMismatch: the checksum of the downloaded file does not match the provided one
    :raises ServiceUnavailable: for any other reason
    """
    ret = {}
    gs = True  # gs represents the global status which indicates if every operation worked in bulk mode

    protocol = create_protocol(rse_settings, 'read')
    protocol.connect()

    files = files if isinstance(files, list) else [files]
    try:
        for f in files:
            # list(...) keeps the lookup working where dict.values() is a view
            pfn = f['pfn'] if 'pfn' in f else list(protocol.lfns2pfns(f).values())[0]
            did_key = '%s:%s' % (f['scope'], f['name'])
            try:
                # Each scope is stored into a separate folder
                if not os.path.exists('%s/%s' % (dest_dir, f['scope'])):
                    os.makedirs('%s/%s' % (dest_dir, f['scope']))
                finalfile = '%s/%s/%s' % (dest_dir, f['scope'], f['name'])

                # Check if the file already exists, if not download and validate it
                if not os.path.isfile(finalfile):
                    if 'adler32' in f:
                        partfile = '%s/%s/%s.part' % (dest_dir, f['scope'], f['name'])
                        if os.path.isfile(partfile):
                            if printstatements:
                                print('%s already exists, probably from a failed attempt. Will remove it' % (partfile))
                            os.unlink(partfile)
                        protocol.get(pfn, partfile)
                        if printstatements:
                            print('File downloaded. Will be validated')
                        localchecksum = utils.adler32(partfile)
                        if localchecksum == f['adler32']:
                            if printstatements:
                                print('File validated')
                            os.rename(partfile, finalfile)
                        else:
                            os.unlink(partfile)
                            raise exception.FileConsistencyMismatch('Checksum mismatch : local %s vs recorded %s' % (str(localchecksum), str(f['adler32'])))
                    else:
                        # No checksum recorded: download straight to the final name
                        protocol.get(pfn, finalfile)
                ret[did_key] = True
            except Exception as e:
                gs = False
                ret[did_key] = e
    finally:
        # Release the connection even when resolving a PFN raised
        protocol.close()

    if len(ret) == 1:
        # Single result: hand back the bare outcome or re-raise its exception
        outcome = next(iter(ret.values()))
        if isinstance(outcome, Exception):
            raise outcome
        return outcome
    return [gs, ret]
def test_upload_file_with_supported_protocol_from_config(rse_factory, upload_client, mock_scope, file_factory):
    """
    Upload (CLIENT): Ensure the module associated to the first protocol supported by both the remote and local config read from rucio.cfg is called
    """
    rse_name, rse_id = rse_factory.make_rse()

    # (scheme, protocol module, priority) - registered in this order
    protocols = (('scp', 'ssh', 1), ('file', 'posix', 2), ('root', 'xrootd', 3))
    for scheme, module, priority in protocols:
        add_protocol(rse_id, {
            'scheme': scheme,
            'hostname': '%s.cern.ch' % rse_id,
            'port': 0,
            'prefix': '/test/',
            'impl': 'rucio.rse.protocols.%s.Default' % module,
            'domains': {
                'lan': {'read': priority, 'write': priority, 'delete': priority},
                'wan': {'read': priority, 'write': priority, 'delete': priority}}})

    config_add_section('upload')
    config_set('upload', 'preferred_impl', 'rclone, xrootd')

    # Of the preferred implementations only xrootd is registered on the RSE.
    supported_impl = 'xrootd'
    target = 'rucio.rse.protocols.%s.Default' % supported_impl

    path = file_factory.file_generator()
    name = os.path.basename(path)
    item = {
        'path': path,
        'rse': rse_name,
        'did_scope': str(mock_scope),
        'did_name': name,
        'guid': generate_uuid()
    }

    def fake_stat(pfn):
        # Report size and checksum of the local source file.
        return {'filesize': os.stat(path)[os.path.stat.ST_SIZE], 'adler32': adler32(path)}

    with TemporaryDirectory() as tmp_dir:
        with patch(target + '.put', side_effect=lambda pfn, dest, dir, **kw: shutil.copy(path, tmp_dir)) as mock_put, \
                patch(target + '.connect'), \
                patch(target + '.exists', side_effect=lambda pfn, **kw: False), \
                patch(target + '.delete'), \
                patch(target + '.rename'), \
                patch(target + '.stat', side_effect=fake_stat), \
                patch(target + '.close'):
            mock_put.__name__ = "mock_put"
            upload_client.upload([item])
            mock_put.assert_called()
def upload(files, scope, metadata, rse, account, source_dir, worker_number, total_workers, dataset_lifetime, did=None, set_metadata=False):
    """
    Physically upload files to an RSE and register them in Rucio.

    The physical upload is tried up to three times. If it fails, or if the
    registration afterwards raises, the files are removed from the storage again.

    :param files: names of the files to upload, relative to ``source_dir``
    :param scope: scope used for every file
    :param metadata: metadata for the dataset; only applied when ``set_metadata`` is true
    :param rse: name of the destination RSE
    :param account: account used for the rule attached to the dataset
    :param source_dir: local directory containing the files
    :param worker_number: index of this worker, used in log messages
    :param total_workers: number of workers, used in log messages
    :param dataset_lifetime: lifetime passed to ``add_dataset``
    :param did: optional ``scope:name`` of the dataset the files are attached to;
                without it the files are registered as plain replicas with a rule
    :param set_metadata: whether ``metadata`` is set on the dataset

    :returns: True on success, False on any failure
    """
    logging.debug('In upload')
    dsn = None
    if did:
        dsn = {'scope': did.split(':')[0], 'name': did.split(':')[1]}
    client = Client()

    list_files = []
    lfns = []
    prepend_str = 'Thread [%i/%i] : ' % (worker_number, total_workers)
    logging.debug(prepend_str + 'Looping over the files')
    for filename in files:
        fullpath = '%s/%s' % (source_dir, filename)
        size = stat(fullpath).st_size
        checksum = adler32(fullpath)
        logging.info(prepend_str + 'File %s : Size %s , adler32 %s' % (fullpath, str(size), checksum))
        list_files.append({'scope': scope, 'name': filename, 'bytes': size, 'adler32': checksum, 'meta': {'guid': generate_uuid()}})
        lfns.append({'name': filename, 'scope': scope, 'filesize': size, 'adler32': checksum, 'filename': filename})

    # Physical upload
    logging.info(prepend_str + 'Uploading physically the files %s on %s' % (str(lfns), rse))
    rse_info = rsemgr.get_rse_info(rse)
    try:
        success_upload = True
        for cnt in range(0, 3):
            # Judge every attempt on its own: a failure recorded by an earlier
            # attempt must not turn a later successful retry into a failure.
            success_upload = True
            rows = rsemgr.upload(rse_info, lfns=lfns, source_dir=source_dir)
            # temporary hack
            global_status, ret = rows['success'], rows[1]
            logging.info(prepend_str + 'Returned global status : %s, Returned : %s' % (str(global_status), str(ret)))
            if not global_status:
                for item in ret:
                    # Already existing replicas do not count as a failure.
                    if (not isinstance(ret[item], FileReplicaAlreadyExists)) and ret[item] is not True:
                        sleep(exp(cnt))  # exponential back-off before the next attempt
                        success_upload = False
                        logging.error(prepend_str + 'Problem to upload file %s with error %s' % (item, str(ret[item])))
                        break
            else:
                break
        if not success_upload:
            logging.error(prepend_str + 'Upload operation to %s failed, removing leftovers' % (rse))
            rsemgr.delete(rse_info, lfns=lfns)
            return False
    except Exception as error:
        logging.debug(traceback.format_exc())
        logging.error(prepend_str + '%s' % (str(error)))
        return False
    logging.info(prepend_str + 'Files successfully copied on %s' % (rse))

    # Registering DIDs and replicas in Rucio
    logging.info(prepend_str + 'Registering DIDs and replicas in Rucio')
    meta = metadata
    if not set_metadata:
        meta = None
    if dsn:
        try:
            client.add_dataset(scope=dsn['scope'], name=dsn['name'], rules=[{'account': account, 'copies': 1, 'rse_expression': rse, 'grouping': 'DATASET', 'activity': 'Functional Test'}], meta=meta, lifetime=dataset_lifetime)
            client.add_files_to_dataset(scope=dsn['scope'], name=dsn['name'], files=list_files, rse=rse)
            logging.info(prepend_str + 'Upload operation for %s:%s done' % (dsn['scope'], dsn['name']))
        except Exception as error:
            logging.debug(traceback.format_exc())
            logging.error(prepend_str + 'Failed to upload %(files)s' % locals())
            logging.error(prepend_str + '%s' % (str(error)))
            logging.error(prepend_str + 'Removing files from the Storage')
            rsemgr.delete(rse_info, lfns=lfns)
            return False
    else:
        logging.warning(prepend_str + 'No dsn is specified')
        try:
            client.add_replicas(files=list_files, rse=rse)
            client.add_replication_rule(list_files, copies=1, rse_expression=rse, activity='Functional Test')
            logging.info(prepend_str + 'Upload operation for %s done' % (str(list_files)))
        except Exception as error:
            logging.debug(traceback.format_exc())
            logging.error(prepend_str + 'Failed to upload %(files)s' % locals())
            logging.error(prepend_str + '%s' % (str(error)))
            logging.error(prepend_str + 'Removing files from the Storage')
            rsemgr.delete(rse_info, lfns=lfns)
            return False
    return True
def upload(files, scope, metadata, rse, account, source_dir, dataset_lifetime, did=None, set_metadata=False, logger=logging.log):
    """
    Physically upload files to an RSE and register them in Rucio.

    The physical upload is tried up to three times. If it fails, or if the
    registration afterwards raises, the files are removed from the storage again.

    :param files: logical names of the files; any '/' is removed to obtain the
                  name of the physical file inside ``source_dir``
    :param scope: scope used for every file
    :param metadata: metadata for the dataset; only applied when ``set_metadata`` is true
    :param rse: name of the destination RSE
    :param account: account used for the rule attached to the dataset
    :param source_dir: local directory containing the files
    :param dataset_lifetime: lifetime passed to ``add_dataset``
    :param did: optional ``scope:name`` of the dataset the files are attached to;
                without it the files are registered as plain replicas with a rule
    :param set_metadata: whether ``metadata`` is set on the dataset
    :param logger: callable taking ``(level, message, *args)``, defaults to ``logging.log``

    :returns: True on success, False on any failure
    """
    logger(logging.DEBUG, 'In upload')
    dsn = None
    if did:
        dsn = {'scope': did.split(':')[0], 'name': did.split(':')[1]}
    client = Client()

    list_files = []
    lfns = []
    for filename in files:
        # The physical file name is the logical name with all slashes removed.
        physical_fname = filename.replace('/', '')
        fullpath = '%s/%s' % (source_dir, physical_fname)
        size = stat(fullpath).st_size
        checksum = adler32(fullpath)
        logger(logging.INFO, 'File %s : Size %s , adler32 %s', fullpath, str(size), checksum)
        list_files.append({
            'scope': scope,
            'name': filename,
            'bytes': size,
            'adler32': checksum,
            'meta': {'guid': generate_uuid()}
        })
        lfns.append({
            'name': filename,
            'scope': scope,
            'filesize': size,
            'adler32': checksum,
            'filename': physical_fname
        })

    # Physical upload
    logger(logging.INFO, 'Uploading physically the files %s on %s', str(lfns), rse)
    rse_info = rsemgr.get_rse_info(rse, vo=client.vo)
    try:
        success_upload = True
        for cnt in range(0, 3):
            # Judge every attempt on its own: a failure recorded by an earlier
            # attempt must not turn a later successful retry into a failure.
            success_upload = True
            rows = rsemgr.upload(rse_info, lfns=lfns, source_dir=source_dir, logger=logger)
            # temporary hack
            global_status, ret = rows['success'], rows[1]
            logger(logging.INFO, 'Returned global status : %s, Returned : %s', str(global_status), str(ret))
            if not global_status:
                for item in ret:
                    # Already existing replicas do not count as a failure.
                    if (not isinstance(ret[item], FileReplicaAlreadyExists)) and ret[item] is not True:
                        sleep(exp(cnt))  # exponential back-off before the next attempt
                        success_upload = False
                        logger(logging.ERROR, 'Problem to upload file %s with error %s', item, str(ret[item]))
                        break
            else:
                break
        if not success_upload:
            logger(logging.ERROR, 'Upload operation to %s failed, removing leftovers', rse)
            rsemgr.delete(rse_info, lfns=lfns)
            return False
    except Exception as error:
        logger(logging.DEBUG, "Exception", exc_info=True)
        logger(logging.ERROR, '%s', str(error))
        return False
    logger(logging.INFO, 'Files successfully copied on %s', rse)

    # Registering DIDs and replicas in Rucio
    logger(logging.INFO, 'Registering DIDs and replicas in Rucio')
    meta = metadata
    if not set_metadata:
        meta = None
    if dsn:
        try:
            client.add_dataset(scope=dsn['scope'],
                               name=dsn['name'],
                               rules=[{
                                   'account': account,
                                   'copies': 1,
                                   'rse_expression': rse,
                                   'grouping': 'DATASET',
                                   'activity': 'Functional Test'
                               }],
                               meta=meta,
                               lifetime=dataset_lifetime)
            client.add_files_to_dataset(scope=dsn['scope'], name=dsn['name'], files=list_files, rse=rse)
            logger(logging.INFO, 'Upload operation for %s:%s done', dsn['scope'], dsn['name'])
        except Exception as error:
            logger(logging.DEBUG, "Exception", exc_info=True)
            logger(logging.ERROR, 'Failed to upload %s', str(list_files))
            logger(logging.ERROR, '%s', str(error))
            logger(logging.ERROR, 'removing files from the Storage')
            rsemgr.delete(rse_info, lfns=lfns)
            return False
    else:
        logger(logging.WARNING, 'No dsn is specified')
        try:
            client.add_replicas(files=list_files, rse=rse)
            client.add_replication_rule(list_files, copies=1, rse_expression=rse, activity='Functional Test')
            logger(logging.INFO, 'Upload operation for %s done', str(list_files))
        except Exception as error:
            logger(logging.DEBUG, "Exception", exc_info=True)
            logger(logging.ERROR, 'Failed to upload %s', str(list_files))
            logger(logging.ERROR, '%s', str(error))
            logger(logging.ERROR, 'Removing files from the Storage')
            rsemgr.delete(rse_info, lfns=lfns)
            return False
    return True
PREFIX = PROTO['prefix'] + '/' + OPTIONS.scope.replace('.', '/') if SCHEMA == 'srm': PREFIX = PROTO['extended_attributes']['web_service_path'] + PREFIX URL = SCHEMA + '://' + PROTO['hostname'] if PROTO['port'] != 0: URL = URL + ':' + str(PROTO['port']) #URL = URL + PREFIX + '/' + OPTIONS.name URL = SCHEMA + '://' + PROTO['hostname'] + ":" + str(PROTO['port']) + PREFIX + '/' + OPTIONS.pfn print (URL) #GFAL = Gfal2Context() try: SIZE = os.stat(PREFIX+'/'+OPTIONS.pfn).st_size CHECKSUM = adler32(PREFIX+'/'+OPTIONS.pfn) # SIZE = GFAL.stat(str(URL)).st_size # CHECKSUM = GFAL.checksum(str(URL), 'adler32') print("Registering file: pfn=%s size=%s checksum=%s" % (URL, SIZE, CHECKSUM)) #except GError: except: print("no file found at %s" % URL) exit() R = ReplicaClient() REPLICAS = list(R.list_replicas([{'scope': OPTIONS.scope, 'name': OPTIONS.name}])) if REPLICAS: REPLICAS = REPLICAS[0] if 'rses' in REPLICAS: if OPTIONS.rse in REPLICAS['rses']:
def _download_item(self, item, trace, log_prefix=''):
    """
    Downloads the given item and sends traces for success/failure.
    (This function is meant to be used as class internal only)

    The sources of the item are tried one after another; each source (PFN) is
    attempted up to two times. Unless the item sets 'ignore_checksum', a
    downloaded file is validated against the item's 'adler32' checksum, or
    against 'md5' when no adler32 is given. The file is written to
    item['temp_file_path'] and renamed to item['dest_file_path'] on success.

    :param item: dictionary that describes the item to download
    :param trace: dictionary representing a pattern of trace that will be send
    :param log_prefix: string that will be put at the beginning of every log message

    :returns: dictionary with all attributes from the input item and a clientState attribute
              ('ALREADY_DONE', 'FILE_NOT_FOUND', 'FAILED' or 'DONE')
    """
    logger = self.logger

    did_scope = item['scope']
    did_name = item['name']
    did_str = '%s:%s' % (did_scope, did_name)

    logger.info('%sPreparing download of %s' % (log_prefix, did_str))

    # scope/filename are always overwritten; the dataset fields and the size
    # are only filled in when the caller's trace does not carry them yet
    trace['scope'] = did_scope
    trace['filename'] = did_name
    trace.setdefault('datasetScope', item.get('dataset_scope', ''))
    trace.setdefault('dataset', item.get('dataset_name', ''))
    trace.setdefault('filesize', item.get('bytes'))

    # if file already exists, set state, send trace, and return
    dest_file_path = item['dest_file_path']
    if os.path.isfile(dest_file_path):
        logger.info('%sFile exists already locally: %s' % (log_prefix, did_str))
        item['clientState'] = 'ALREADY_DONE'

        trace['transferStart'] = time.time()
        trace['transferEnd'] = time.time()
        trace['clientState'] = 'ALREADY_DONE'
        send_trace(trace, self.client.host, self.client.user_agent)
        return item

    # check if file has replicas
    sources = item.get('sources')
    if not sources or not len(sources):
        logger.warning('%sNo available source found for file: %s' % (log_prefix, did_str))
        item['clientState'] = 'FILE_NOT_FOUND'

        trace['clientState'] = 'FILE_NOT_FOUND'
        send_trace(trace, self.client.host, self.client.user_agent)
        return item

    success = False
    # try different PFNs until one succeeded
    i = 0
    while not success and i < len(sources):
        pfn = sources[i]['pfn']
        rse_name = sources[i]['rse']
        i += 1
        # the scheme is whatever precedes the first ':' of the PFN
        scheme = pfn.split(':')[0]

        try:
            rse = rsemgr.get_rse_info(rse_name)
        except RSENotFound:
            # skip this source and move on to the next one
            logger.warning('%sCould not get info of RSE %s' % (log_prefix, rse_name))
            continue

        trace['remoteSite'] = rse_name
        trace['clientState'] = 'DOWNLOAD_ATTEMPT'
        trace['protocol'] = scheme

        logger.info('%sTrying to download with %s from %s: %s ' % (log_prefix, scheme, rse_name, did_str))

        try:
            protocol = rsemgr.create_protocol(rse, operation='read', scheme=scheme)
            protocol.connect()
        except Exception as error:
            # a source whose protocol cannot be set up is skipped as well
            logger.warning('%sFailed to create protocol for PFN: %s' % (log_prefix, pfn))
            logger.debug('scheme: %s, exception: %s' % (scheme, error))
            continue

        attempt = 0
        retries = 2
        # do some retries with the same PFN if the download fails
        while not success and attempt < retries:
            attempt += 1
            item['attemptnr'] = attempt

            # remove a leftover temporary file before downloading again
            temp_file_path = item['temp_file_path']
            if os.path.isfile(temp_file_path):
                logger.debug('%sDeleting existing temporary file: %s' % (log_prefix, temp_file_path))
                os.unlink(temp_file_path)

            start_time = time.time()

            try:
                protocol.get(pfn, temp_file_path, transfer_timeout=item.get('transfer_timeout'))
                success = True
            except Exception as error:
                # the exception class name becomes the client state of the trace
                logger.debug(error)
                trace['clientState'] = str(type(error).__name__)

            end_time = time.time()

            if success and not item.get('ignore_checksum', False):
                # adler32 is preferred; md5 is only used when no adler32 is given
                rucio_checksum = item.get('adler32')
                local_checksum = None
                if not rucio_checksum:
                    rucio_checksum = item.get('md5')
                    local_checksum = md5(temp_file_path)
                else:
                    local_checksum = adler32(temp_file_path)

                if rucio_checksum != local_checksum:
                    # a checksum mismatch turns the download into a failed attempt
                    success = False
                    os.unlink(temp_file_path)
                    logger.warning('%sChecksum validation failed for file: %s' % (log_prefix, did_str))
                    logger.debug('Local checksum: %s, Rucio checksum: %s' % (local_checksum, rucio_checksum))
                    try:
                        self.client.declare_suspicious_file_replicas([pfn], reason='Corrupted')
                    except Exception:
                        # best effort: a failure to report the replica is ignored
                        pass
                    trace['clientState'] = 'FAIL_VALIDATE'
            if not success:
                logger.warning('%sDownload attempt failed. Try %s/%s' % (log_prefix, attempt, retries))
                send_trace(trace, self.client.host, self.client.user_agent)

        protocol.close()

    if not success:
        logger.error('%sFailed to download file %s' % (log_prefix, did_str))
        item['clientState'] = 'FAILED'
        return item

    # success: move the temporary file to its final destination
    os.rename(temp_file_path, dest_file_path)

    trace['transferStart'] = start_time
    trace['transferEnd'] = end_time
    trace['clientState'] = 'DONE'
    item['clientState'] = 'DONE'
    send_trace(trace, self.client.host, self.client.user_agent)

    duration = round(end_time - start_time, 2)
    size = item.get('bytes')
    size_str = sizefmt(size, self.is_human_readable)
    # the rate is only reported when both size and duration are non-zero
    if size and duration:
        rate = round((size / duration) * 1e-6, 2)
        logger.info('%sFile %s successfully downloaded. %s in %s seconds = %s MBps' % (log_prefix, did_str, size_str, duration, rate))
    else:
        logger.info('%sFile %s successfully downloaded in %s seconds' % (log_prefix, did_str, duration))
    return item
def _download_items_aria2c(self, items, aria_rpc, rpc_auth, trace_custom_fields=None):
    """
    Uses aria2c to download the given items. Aria2c needs to be started
    as RPC background process first and a RPC proxy is needed.
    (This function is meant to be used as class internal only)

    Every item is updated in place: its 'clientState' is set to one of
    'ALREADY_DONE', 'FILE_NOT_FOUND', 'DONE', 'FAIL_VALIDATE' or 'FAILED',
    and a trace is sent for each of these outcomes.

    :param items: list of dictionaries containing one dict for each file to download
    :param aria_rpc: RPCProxy to the aria2c process
    :param rpc_auth: the rpc authentication token
    :param trace_custom_fields: Custom key value pairs to send with the traces (None means no extra fields)

    :returns: a list of dictionaries with an entry for each file, containing the input options, the did, and the clientState
    """
    logger = self.logger
    # build a fresh dict per call instead of sharing one mutable default
    if trace_custom_fields is None:
        trace_custom_fields = {}

    gid_to_item = {}  # maps an aria2c download id (gid) to the download item
    pfn_to_rse = {}
    items_to_queue = list(items)

    # items get removed from gid_to_item when they are complete or failed
    while gid_to_item or items_to_queue:
        num_queued = 0

        # queue up to 100 files and then check arias status
        while num_queued < 100 and items_to_queue:
            item = items_to_queue.pop()

            file_scope = item['scope']
            file_name = item['name']
            file_did_str = '%s:%s' % (file_scope, file_name)
            trace = {'scope': file_scope,
                     'filename': file_name,
                     'datasetScope': item.get('dataset_scope', ''),
                     'dataset': item.get('dataset_name', ''),
                     'protocol': 'https',
                     'remoteSite': '',
                     'filesize': item.get('bytes', None),
                     'transferStart': time.time(),
                     'transferEnd': time.time()}
            trace.update(self.trace_tpl)
            trace.update(trace_custom_fields)

            # get pfns from all replicas
            pfns = []
            for src in item['sources']:
                pfn = src['pfn']
                # the scheme test is case-insensitive, so swap the prefix by
                # position; a case-sensitive replace would miss e.g. 'DAVS://'
                if pfn[0:4].lower() == 'davs':
                    pfn = 'https' + pfn[4:]
                pfns.append(pfn)
                pfn_to_rse[pfn] = src['rse']

            # does file exist and are sources available?
            if os.path.isfile(item['dest_file_path']):
                logger.info('File exists already locally: %s' % file_did_str)
                item['clientState'] = 'ALREADY_DONE'
                trace['clientState'] = 'ALREADY_DONE'
                send_trace(trace, self.client.host, self.client.user_agent)
            elif not pfns:
                logger.warning('No available source found for file: %s' % file_did_str)
                item['clientState'] = 'FILE_NOT_FOUND'
                trace['clientState'] = 'FILE_NOT_FOUND'
                send_trace(trace, self.client.host, self.client.user_agent)
            else:
                item['trace'] = trace
                options = {'dir': item['dest_dir_path'],
                           'out': os.path.basename(item['temp_file_path'])}
                gid = aria_rpc.aria2.addUri(rpc_auth, pfns, options)
                gid_to_item[gid] = item
                num_queued += 1
                logger.debug('Queued file: %s' % file_did_str)

        # get some statistics
        aria_stat = aria_rpc.aria2.getGlobalStat(rpc_auth)
        num_active = int(aria_stat['numActive'])
        num_waiting = int(aria_stat['numWaiting'])
        num_stopped = int(aria_stat['numStoppedTotal'])

        # save start time if one of the active downloads has started
        active = aria_rpc.aria2.tellActive(rpc_auth, ['gid', 'completedLength'])
        for dlinfo in active:
            gid = dlinfo['gid']
            if int(dlinfo['completedLength']) > 0:
                gid_to_item[gid].setdefault('transferStart', time.time())

        stopped = aria_rpc.aria2.tellStopped(rpc_auth, -1, num_stopped, ['gid', 'status', 'files'])
        for dlinfo in stopped:
            gid = dlinfo['gid']
            item = gid_to_item[gid]

            file_scope = item['scope']
            file_name = item['name']
            file_did_str = '%s:%s' % (file_scope, file_name)
            temp_file_path = item['temp_file_path']
            dest_file_path = item['dest_file_path']

            # ensure we didnt miss the active state (e.g. a very fast download)
            start_time = item.setdefault('transferStart', time.time())
            end_time = item.setdefault('transferEnd', time.time())

            # get used pfn for traces
            trace = item['trace']
            for uri in dlinfo['files'][0]['uris']:
                if uri['status'].lower() == 'used':
                    trace['remoteSite'] = pfn_to_rse.get(uri['uri'], '')

            trace['transferStart'] = start_time
            trace['transferEnd'] = end_time

            # ensure file exists
            status = dlinfo.get('status', '').lower()
            if status == 'complete' and os.path.isfile(temp_file_path):
                # checksum check (only adler32 is compared on this code path)
                skip_check = item.get('ignore_checksum', False)
                rucio_checksum = 0 if skip_check else item.get('adler32')
                local_checksum = 0 if skip_check else adler32(temp_file_path)
                if rucio_checksum == local_checksum:
                    item['clientState'] = 'DONE'
                    trace['clientState'] = 'DONE'
                    # remove .part ending
                    os.rename(temp_file_path, dest_file_path)

                    # calculate duration
                    duration = round(end_time - start_time, 2)
                    duration = max(duration, 0.01)  # protect against 0 division
                    # 'bytes' may be present but None; treat that as size 0
                    # so the rate computation cannot raise after the rename
                    size = item.get('bytes') or 0
                    rate = round((size / duration) * 1e-6, 2)
                    size_str = sizefmt(size, self.is_human_readable)
                    logger.info('File %s successfully downloaded. %s in %s seconds = %s MBps' % (file_did_str,
                                                                                                   size_str,
                                                                                                   duration,
                                                                                                   rate))
                else:
                    os.unlink(temp_file_path)
                    logger.warning('Checksum validation failed for file: %s' % file_did_str)
                    logger.debug('Local checksum: %s, Rucio checksum: %s' % (local_checksum, rucio_checksum))
                    item['clientState'] = 'FAIL_VALIDATE'
                    trace['clientState'] = 'FAIL_VALIDATE'
            else:
                logger.error('Failed to download file: %s' % file_did_str)
                logger.debug('Aria2c status: %s' % status)
                item['clientState'] = 'FAILED'
                trace['clientState'] = 'DOWNLOAD_ATTEMPT'

            send_trace(trace, self.client.host, self.client.user_agent)
            del item['trace']

            # forget the finished download on both sides
            aria_rpc.aria2.removeDownloadResult(rpc_auth, gid)
            del gid_to_item[gid]

        if stopped:
            logger.info('Active: %d, Waiting: %d, Stopped: %d' % (num_active, num_waiting, num_stopped))

    return items
def download(rse_settings, files, dest_dir=None, force_scheme=None, ignore_checksum=False, printstatements=False, domain='wan', transfer_timeout=None):
    """
    Copy a file from the connected storage to the local file system.
    Providing a list indicates the bulk mode.

    Files that already exist at the destination are not downloaded again.
    When a file dict contains an 'adler32' key, the file is first written to
    '<name>.part', validated (unless ignore_checksum is set) and then renamed;
    otherwise it is downloaded straight to its final path without validation.

    :param rse_settings:     RSE to use
    :param files:            a single dict or a list with dicts containing 'scope' and 'name'
                             if LFNs are provided and additional 'pfn' if PFNs are provided.
                             Examples:
                             [
                             {'name': '2_rse_remote_get.raw', 'scope': 'user.jdoe'},
                             {'name':'3_rse_remote_get.raw', 'scope': 'user.jdoe', 'pfn': 'user/jdoe/5a/98/3_rse_remote_get.raw'}
                             ]
    :param dest_dir:         path to the directory where the downloaded files will be stored. If not given, each scope is represented by its own directory.
    :param force_scheme:     normally the scheme is dictated by the RSE object, when specifying the PFN it must be forced to the one specified in the PFN, overruling the RSE description.
    :param ignore_checksum:  do not verify the checksum - caution: should only be used for rucio download --pfn
    :param printstatements:  if True, progress messages are printed to stdout
    :param domain:           passed through to create_protocol when selecting the protocol
    :param transfer_timeout: set this timeout (in seconds) for the transfers, for protocols that support it

    :returns: if exactly one result was collected: True on success (the captured exception is raised on failure).
              Otherwise a two-element list [status, details] where status is True only if every file succeeded and
              details maps 'scope:name' to True or to the exception caught for that file.

    :raises SourceNotFound: remote source file can not be found on storage
    :raises DestinationNotAccessible: local destination directory is not accessible
    :raises FileConsistencyMismatch: the checksum of the downloaded file does not match the provided one
    :raises ServiceUnavailable: for any other reason
    """
    ret = {}
    gs = True  # gs represents the global status which indicates if every operation worked in bulk mode

    protocol = create_protocol(rse_settings, 'read', scheme=force_scheme, domain=domain)
    protocol.connect()

    # close the connection even if something outside the per-file handler
    # raises (e.g. the LFN to PFN translation)
    try:
        files = files if isinstance(files, list) else [files]
        for f in files:
            pfn = f['pfn'] if 'pfn' in f else list(protocol.lfns2pfns(f).values())[0]
            target_dir = "./%s" % f['scope'] if dest_dir is None else dest_dir
            try:
                if not os.path.exists(target_dir):
                    os.makedirs(target_dir)
                # Each scope is stored into a separate folder
                finalfile = '%s/%s' % (target_dir, f['name'])
                # Check if the file already exists, if not download and validate it
                if not os.path.isfile(finalfile):
                    if 'adler32' in f:
                        temp_path = '%s/%s.part' % (target_dir, f['name'])
                        if os.path.isfile(temp_path):
                            if printstatements:
                                print('%s already exists, probably from a failed attempt. Will remove it' % (temp_path))
                            os.unlink(temp_path)
                        protocol.get(pfn, temp_path, transfer_timeout=transfer_timeout)
                        if printstatements:
                            print('File downloaded. Will be validated')

                        if ignore_checksum:
                            if printstatements:
                                print('Skipping checksum validation')
                        else:
                            # an empty adler32 value falls back to the md5 checksum
                            ruciochecksum = f['adler32'] if f['adler32'] else f['md5']
                            localchecksum = utils.adler32(temp_path) if f['adler32'] else utils.md5(temp_path)
                            if localchecksum != ruciochecksum:
                                os.unlink(temp_path)
                                raise exception.FileConsistencyMismatch('Checksum mismatch : local %s vs recorded %s' % (str(localchecksum), str(ruciochecksum)))
                            if printstatements:
                                print('File validated')

                        os.rename(temp_path, finalfile)
                    else:
                        protocol.get(pfn, '%s/%s' % (target_dir, f['name']), transfer_timeout=transfer_timeout)
                # reached for both freshly downloaded and already present files
                ret['%s:%s' % (f['scope'], f['name'])] = True
            except Exception as e:
                gs = False
                ret['%s:%s' % (f['scope'], f['name'])] = e
    finally:
        protocol.close()

    if len(ret) == 1:
        for x in ret:
            if isinstance(ret[x], Exception):
                raise ret[x]
            else:
                return ret[x]
    return [gs, ret]
def _file_stats(filepath):
    """
    Collect the adler32 checksum and the size of a local file.

    :param filepath: path of the file to inspect

    :returns: tuple (adler32 checksum, size in bytes as int)
    """
    # checksum first, then the size lookup
    checksum = adler32(filepath)
    byte_count = int(os.stat(filepath).st_size)
    return checksum, byte_count