def test_temp_file_with_no_final_name_creates_a_tmp_file_and_keeps_it():
    with dumper.temp_file('/tmp') as (_, tmp_path):
        tmp_path = os.path.join('/tmp', tmp_path)
        assert os.path.exists(tmp_path)

    assert os.path.exists(tmp_path)
    os.unlink(tmp_path)
def consistency(rse, delta, configuration, cache_dir, results_dir):
    logger = logging.getLogger('auditor-worker')
    rsedump, rsedate = srmdumps.download_rse_dump(rse, configuration, destdir=cache_dir)
    results_path = os.path.join(results_dir, '{0}_{1}'.format(rse, rsedate.strftime('%Y%m%d')))  # pylint: disable=no-member

    if os.path.exists(results_path + '.bz2') or os.path.exists(results_path):
        logger.warning(
            'Consistency check for "%s" (dump dated %s) already done, skipping check',
            rse,
            rsedate.strftime('%Y%m%d'),  # pylint: disable=no-member
        )
        return None

    rrdump_prev = ReplicaFromHDFS.download(rse, rsedate - delta, cache_dir=cache_dir)
    rrdump_next = ReplicaFromHDFS.download(rse, rsedate + delta, cache_dir=cache_dir)
    results = Consistency.dump(
        'consistency-manual',
        rse,
        rsedump,
        rrdump_prev,
        rrdump_next,
        date=rsedate,
        cache_dir=cache_dir,
    )
    mkdir(results_dir)
    with temp_file(results_dir, results_path) as (output, _):
        for result in results:
            output.write('{0}\n'.format(result.csv()))

    return results_path
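# Hypothetical driver for the check above. The RSE name, the three-day
# window, and the paths are illustrative assumptions; the configuration is
# the RawConfigParser-like object that srmdumps.download_rse_dump() expects.
import configparser
import datetime

config = configparser.RawConfigParser()
results_path = consistency(
    rse='EXAMPLE_DATADISK',
    delta=datetime.timedelta(days=3),
    configuration=config,
    cache_dir='/tmp/auditor/cache',
    results_dir='/tmp/auditor/results',
)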
def test_temp_file_cleanup_on_exception():
    try:
        with dumper.temp_file('/tmp') as (_, tmp_path):
            tmp_path = os.path.join('/tmp', tmp_path)
            raise Exception
    except Exception:
        pass
    finally:
        assert not os.path.exists(tmp_path)
def test_temp_file_with_final_name_creates_a_tmp_file_and_then_removes_it():
    final_name = tempfile.mktemp()
    with dumper.temp_file('/tmp', final_name) as (_, tmp_path):
        tmp_path = os.path.join('/tmp', tmp_path)
        assert os.path.exists(tmp_path)
        assert not os.path.exists(final_name)

    assert os.path.exists(final_name)
    assert not os.path.exists(tmp_path)
    os.unlink(final_name)
def test_temp_file_cleanup_on_exception_with_final_name():
    final_name = tempfile.mktemp()
    try:
        with dumper.temp_file('/tmp', final_name) as (_, tmp_path):
            tmp_path = os.path.join('/tmp', tmp_path)
            raise Exception
    except Exception:
        pass
    finally:
        assert not os.path.exists(tmp_path)
        assert not os.path.exists(final_name)
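# The tests above pin down the `temp_file` contract. The sketch below is an
# assumption reconstructed from those tests, not the actual
# rucio.common.dumper implementation: the context manager yields an open
# file plus its random basename, renames the file to `final_name` on a
# clean exit (or keeps the temporary file when no final name is given), and
# deletes it if the block raises.
import contextlib
import os
import tempfile


@contextlib.contextmanager
def temp_file_sketch(directory, final_name=None, binary=False):
    fd, tmp_path = tempfile.mkstemp(dir=directory)
    tfile = os.fdopen(fd, 'wb' if binary else 'w')
    try:
        yield tfile, os.path.basename(tmp_path)
    except Exception:
        # The block failed: drop the partial file and re-raise.
        tfile.close()
        os.unlink(tmp_path)
        raise
    else:
        tfile.close()
        if final_name is not None:
            # Output is complete: move it to its final name atomically.
            # os.path.join() returns `final_name` unchanged if it is absolute.
            os.rename(tmp_path, os.path.join(directory, final_name))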
def download(cls, rse, date='latest', cache_dir=DUMPS_CACHE_DIR):
    """
    Downloads the requested dump and returns the path to the
    cached copy.
    """
    logger = logging.getLogger('auditor.data_models')
    requests_session = get_requests_session()
    if date == 'latest':
        url = ''.join((cls.BASE_URL, cls.URI, '?rse={0}'.format(rse)))
        request_headers = requests_session.head(url)
        for field in request_headers.headers['content-disposition'].split(';'):
            if field.startswith('filename='):
                date = field.split('=')[1].split('_')[-1].split('.')[0]
    else:
        assert isinstance(date, datetime.datetime)
        date = date.strftime('%d-%m-%Y')  # pylint: disable=no-member
        url = ''.join((
            cls.BASE_URL,
            cls.URI,
            '?rse={0}&date={1}'.format(rse, date),
        ))

    if not os.path.isdir(cache_dir):
        os.mkdir(cache_dir)

    filename = '{0}_{1}_{2}_{3}'.format(
        cls.__name__.lower(),
        rse,
        date,
        hashlib.sha1(url.encode()).hexdigest(),
    )
    filename = re.sub(r'\W', '-', filename)
    path = os.path.join(cache_dir, filename)

    if not os.path.exists(path):
        logger.debug('Trying to download: "%s"', url)
        response = requests_session.head(url)
        if response.status_code != 200:
            logger.error(
                'Retrieving %s returned %d status code',
                url,
                response.status_code,
            )
            raise HTTPDownloadFailed('Downloading {0} dump'.format(cls.__name__), code=response.status_code)

        with temp_file(cache_dir, final_name=filename) as (tfile, _):
            http_download_to_file(url, tfile, session=requests_session)

    return path
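# Hypothetical use of the classmethod above through a concrete data model
# subclass (`Replica` is only an illustrative name): download the latest
# dump for an RSE and iterate over the cached file.
path = Replica.download('EXAMPLE_DATADISK', date='latest')
with open(path) as dump:
    print(sum(1 for _ in dump))  # e.g. count the records in the dump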
def parse_and_filter_file(filepath, parser=lambda s: s, filter_=lambda s: s,
                          prefix=None, postfix='parsed',
                          cache_dir=DUMPS_CACHE_DIR):
    '''
    Opens `filepath` as a read-only file and, for each line for which the
    `filter_` function returns True, writes a version parsed with the
    `parser` function.

    The name of the output file is generated by appending '_' + `postfix` to
    the filename in `filepath`. If `prefix` is given, it is used instead of
    the filename in `filepath`.

    The output file (and the temporary files created while processing) are
    stored in `cache_dir`.

    Default values for the arguments:
    - `parser`: returns the same string.
    - `filter_`: returns True for any argument.
    - `prefix`: None (the name of the input file is used as prefix).
    - `postfix`: 'parsed'.
    - `cache_dir`: DUMPS_CACHE_DIR.

    The output file is created with a random name and renamed atomically
    when it is complete.

    '\n' is appended to each line, therefore if the input is 'a\nb\n' and
    `parser` is not specified, the output will be 'a\n\nb\n\n'.
    '''
    prefix = os.path.basename(filepath) if prefix is None else prefix
    output_name = '_'.join((prefix, postfix))
    output_path = os.path.join(cache_dir, output_name)

    if os.path.exists(output_path):
        return output_path

    with dumper.temp_file(cache_dir, final_name=output_name) as (output, _):
        input_ = dumper.smart_open(filepath)
        for line in input_:
            if filter_(line):
                output.write(parser(line) + '\n')
        input_.close()

    return output_path
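# Example use of parse_and_filter_file (the input path is an assumption):
# keep only non-comment lines and strip the trailing newline. Because the
# function appends '\n' itself, the parser should not leave one in place.
parsed = parse_and_filter_file(
    '/tmp/auditor/cache/dump_20150101',
    parser=lambda line: line.rstrip('\n'),
    filter_=lambda line: not line.startswith('#'),
    postfix='filtered',
)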
def download_rse_dump(rse, configuration, date='latest', destdir=DUMPS_CACHE_DIR):
    '''
    Downloads the dump for the given ddmendpoint. If this endpoint does not
    follow the standardized method to publish the dumps, it should have an
    entry in the `configuration` object describing how to download the dump.

    `rse` is the DDMEndpoint name.

    `configuration` is a RawConfigParser subclass.

    `date` is a datetime instance with the date of the desired dump, or
    'latest' to download the latest available dump.

    `destdir` is the directory where the dump will be saved (the final
    component in the path is created if it doesn't exist).

    Return value: a tuple with the path to the downloaded dump and a
    datetime instance with the date of the dump.
    '''
    logger = logging.getLogger('auditor.srmdumps')
    base_url, url_pattern = generate_url(rse, configuration)
    if date == 'latest':
        logger.debug('Looking for site dumps in: "%s"', base_url)
        links = get_links(base_url)
        url, date = get_newest(base_url, url_pattern, links)
    else:
        url = '{0}/{1}'.format(base_url, date.strftime(url_pattern))

    if not os.path.isdir(destdir):
        os.mkdir(destdir)

    filename = '{0}_{1}_{2}_{3}'.format(
        'ddmendpoint',
        rse,
        date.strftime('%d-%m-%Y'),
        hashlib.sha1(url.encode()).hexdigest(),
    )
    filename = re.sub(r'\W', '-', filename)
    path = os.path.join(destdir, filename)

    if not os.path.exists(path):
        logger.debug('Trying to download: "%s"', url)
        with temp_file(destdir, final_name=filename) as (f, _):
            download(url, f)

    return (path, date)
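# Hypothetical call for a dated dump; the RSE name and date are illustrative
# and `config` stands for the RawConfigParser described in the docstring.
import datetime

path, dump_date = download_rse_dump(
    'EXAMPLE_DATADISK',
    config,
    date=datetime.datetime(2015, 1, 1),
)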
def download(cls, rse, date, cache_dir=DUMPS_CACHE_DIR, buffer_size=65536):
    logger = logging.getLogger('auditor.hdfs')
    if not os.path.isdir(cache_dir):
        os.mkdir(cache_dir)

    tmp_dir = tempfile.mkdtemp(dir=cache_dir)

    url = cls.BASE_URL.format(date.strftime('%Y-%m-%d'), rse)
    filename = '{0}_{1}_{2}_{3}'.format(
        cls.__name__.lower(),
        rse,
        date.strftime('%d-%m-%Y'),
        hashlib.sha1(url.encode()).hexdigest(),
    )
    filename = re.sub(r'\W', '-', filename)
    path = os.path.join(cache_dir, filename)

    if os.path.exists(path):
        logger.debug('Taking Rucio Replica Dump %s for %s from cache', path, rse)
        return path

    try:
        logger.debug('Trying to download: %s for %s', url, rse)
        _hdfs_get(url, tmp_dir)
        files = (
            os.path.join(tmp_dir, file_)
            for file_ in sorted(os.listdir(tmp_dir))
        )
        # Concatenate the sorted HDFS chunk files into a single dump,
        # writing through a temporary file that is renamed on completion.
        with temp_file(cache_dir, filename, binary=True) as (full_dump, _):
            for chunk_file in files:
                with open(chunk_file, 'rb') as partial_dump:
                    while True:
                        data_chunk = partial_dump.read(buffer_size)
                        if not data_chunk:
                            break
                        full_dump.write(data_chunk)
    finally:
        shutil.rmtree(tmp_dir)

    return path
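# Hypothetical usage: fetch and reassemble the HDFS replica dump for a
# given day (the RSE name and date are illustrative). A second call for the
# same RSE and date returns the cached file without touching HDFS.
import datetime

dump_path = ReplicaFromHDFS.download(
    'EXAMPLE_DATADISK',
    datetime.datetime(2015, 1, 1),
)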