def test_delete_error(self):
  path1 = os.path.join(self.tmpdir, 'f1')
  with self.assertRaises(BeamIOError) as error:
    FileSystems.delete([path1])
  self.assertTrue(
      error.exception.message.startswith('Delete operation failed'))
  self.assertEqual(error.exception.exception_details.keys(), [path1])
def test_delete_error(self):
  path1 = os.path.join(self.tmpdir, 'f1')
  with self.assertRaisesRegex(BeamIOError,
                              r'^Delete operation failed') as error:
    FileSystems.delete([path1])
  self.assertEqual(list(error.exception.exception_details.keys()), [path1])
def process(self, unused_element, signal):
  gcs_location = self.get_destination_uri()
  match_result = FileSystems.match([gcs_location])[0].metadata_list
  logging.debug("%s: matched %s files", self.__class__.__name__,
                len(match_result))
  paths = [x.path for x in match_result]
  FileSystems.delete(paths)
def test_delete(self):
  path1 = os.path.join(self.tmpdir, 'f1')
  with open(path1, 'a') as f:
    f.write('Hello')
  self.assertTrue(FileSystems.exists(path1))
  FileSystems.delete([path1])
  self.assertFalse(FileSystems.exists(path1))
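The happy path above never exercises the error branch; when any path in the batch cannot be removed, FileSystems.delete raises BeamIOError and records per-path failures. A minimal, hedged sketch of guarding a cleanup call; the cleanup helper and tmp_paths list are illustrative and not taken from the examples here:

import logging

from apache_beam.io.filesystem import BeamIOError
from apache_beam.io.filesystems import FileSystems


def cleanup(tmp_paths):
  # Delete a batch of paths, tolerating partial failures.
  try:
    FileSystems.delete(tmp_paths)
  except BeamIOError as err:
    # exception_details maps each failing path to the underlying exception.
    for path, exc in err.exception_details.items():
      logging.warning('Could not delete %s: %s', path, exc)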
def pre_finalize(self, init_result, writer_results):
  num_shards = len(list(writer_results))
  dst_glob = self._get_final_name_glob(num_shards)
  dst_glob_files = [
      file_metadata.path for mr in FileSystems.match([dst_glob])
      for file_metadata in mr.metadata_list
  ]
  if dst_glob_files:
    logging.warn('Deleting %d existing files in target path matching: %s',
                 len(dst_glob_files), self.shard_name_glob_format)
    FileSystems.delete(dst_glob_files)
def delete_files(file_paths):
  """A function to clean up files or directories using ``FileSystems``.

  Glob is supported in file path and directories will be deleted recursively.

  Args:
    file_paths: A list of strings containing file paths or directories.
  """
  if len(file_paths) == 0:
    raise RuntimeError('Clean up failed. Invalid file path: %s.' % file_paths)
  FileSystems.delete(file_paths)
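A typical call to the helper above passes a glob so an entire output prefix is removed in one shot; the bucket and prefix below are illustrative only:

# Hypothetical usage of delete_files: the glob expands to every matching
# shard, and directories are removed recursively.
delete_files(['gs://my-bucket/tmp/results/output-*'])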
def test_should_write_multiple_entries(self):
  dict_list = [{KEY_1: VALUE_1}, {KEY_2: VALUE_2}]
  with TestPipeline() as p:
    _ = (  # flake8: noqa
        p
        | beam.Create(dict_list)
        | WritePropsToTFRecord(TFRECORDS_PATH, lambda x: [x]))

  filenames = list(find_matching_filenames(TFRECORDS_PATH + '*'))
  assert len(filenames) == 1
  records = list(iter_read_tfrecord_file_as_dict_list(filenames[0]))
  assert records == dict_list
  FileSystems.delete(filenames)
def remove_path(path):
    """
    Removes a path if it exists.
    """
    # We need to include this first if statement
    # to allow local broken symbolic links to be deleted
    # as well (which aren't matched by the Beam methods).
    if os.path.islink(path):
        os.remove(path)
    elif get_path_exists(path):
        FileSystems.delete([path])
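A quick way to see why the islink check comes first: a dangling symlink is not reported by exists-style checks, so without that branch it would never be removed. A small sketch under that assumption, exercising only the local-filesystem branch of the helper above:

# Hypothetical demonstration of the broken-symlink case handled above.
import os
import tempfile

workdir = tempfile.mkdtemp()
target = os.path.join(workdir, 'missing')   # never created
link = os.path.join(workdir, 'dangling')
os.symlink(target, link)                    # broken link: target does not exist
assert os.path.islink(link)

remove_path(link)                           # handled by the os.path.islink branch
assert not os.path.lexists(link)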
def pre_finalize(self, init_result, writer_results):
  writer_results = sorted(writer_results)
  num_shards = len(writer_results)
  existing_files = []
  for shard_num in range(len(writer_results)):
    final_name = self._get_final_name(shard_num, num_shards)
    if FileSystems.exists(final_name):
      existing_files.append(final_name)
  if existing_files:
    logging.info('Deleting existing files in target path: %d',
                 len(existing_files))
    FileSystems.delete(existing_files)
def finalize_write(self, init_result, writer_results, pre_finalize_result):
  file_path_prefix = self.file_path_prefix.get()

  shard_paths = it.chain.from_iterable(writer_results)
  path_pairs = list(self._source_dest_shard_pairs(shard_paths))
  unique_dest_dirs = {pp.split(pair[1])[0] for pair in path_pairs}

  num_shards = len(path_pairs)
  min_threads = min(num_shards, FileBasedSink._MAX_RENAME_THREADS)
  num_threads = max(1, min_threads)

  batch_size = FileSystems.get_chunk_size(file_path_prefix)
  batches = [
      path_pairs[i:i + batch_size]
      for i in six.moves.range(0, len(path_pairs), batch_size)
  ]

  logging.info(
      'Starting finalize_write threads with num_shards: %d, '
      'batches: %d, num_threads: %d', num_shards, len(batches), num_threads)
  start_time = time.time()

  if unique_dest_dirs:
    # Fix #18: run_using_threadpool raises if you pass in an empty list of
    # inputs, so if we don't have any work to do, then just skip it.
    util.run_using_threadpool(
        self._create_output_dir, unique_dest_dirs, num_threads)

  exception_batches = util.run_using_threadpool(
      self._rename_batch, batches, num_threads)

  all_exceptions = [
      e for exception_batch in exception_batches for e in exception_batch
  ]
  if all_exceptions:
    raise Exception(
        'Encountered exceptions in finalize_write: %s' % all_exceptions)

  for _, final_name in path_pairs:
    yield final_name

  logging.info('Renamed %d shards in %.2f seconds.', num_shards,
               time.time() - start_time)

  try:
    FileSystems.delete([init_result])
  except IOError:
    # May have already been removed.
    pass
def finalize_write(self, init_result, writer_results):
  # writer_results is LIST[ LIST[ TUPLE[date_ts, gcs_path] ] ]
  # so first we need to flatten it to just LIST[ TUPLE[date_ts, gcs_path] ]
  shard_paths = it.chain.from_iterable(writer_results)

  client = BigQueryWrapper()

  # start an async load table job for each date
  waiting_jobs = set(self._load_tables(client, shard_paths))

  # wait for jobs to finish
  while waiting_jobs:
    logging.info('Waiting for %s bigquery tables to load...',
                 len(waiting_jobs))
    completed_jobs = set()
    for job in waiting_jobs:
      job_id, table_ref, date_ts = job
      table_str = encode_table_ref(table_ref)
      response = client.get_job_status(self.project_id, job_id)
      if response.status.state == "DONE":
        completed_jobs.add(job)
        if response.status.errorResult:
          logging.error('Bigquery table load failed for %s', table_str)
          for error in response.status.errors:
            logging.error('%s %s %s', error.reason, error.location,
                          error.message)
          # raise exception
          raise RuntimeError(
              'Bigquery table load failed for table %s. %s' %
              (table_str, response.status.errorResult.message))
        else:
          logging.info('Bigquery table load complete for %s', table_str)
          yield table_str  # not sure what anyone is going to do with these...
      else:
        # Not done yet...
        logging.debug('Bigquery table load status %s - %s' %
                      (table_str, response.status.state))

    waiting_jobs -= completed_jobs
    time.sleep(1.0)  # wait for a bit and then check again

  try:
    FileSystems.delete([init_result])
  except IOError:
    # May have already been removed.
    pass
def remove(path):
    """
    Remove the given path, whether it is a directory, file, or link.
    """
    if parse_linked_bundle_url(path).uses_beam:
        from apache_beam.io.filesystems import FileSystems

        if not FileSystems.exists(path):
            FileSystems.delete([path])
        return
    check_isvalid(path, 'remove')
    set_write_permissions(path)  # Allow permissions
    if os.path.islink(path):
        os.unlink(path)
    elif os.path.isdir(path):
        try:
            shutil.rmtree(path)
        except shutil.Error:
            pass
    else:
        os.remove(path)
    if os.path.exists(path):
        print('Failed to remove %s' % path)
def _verify_data(self, pcol, init_size, data_size):
  read = pcol | 'read' >> ReadAllFromParquet()
  v1 = (
      read
      | 'get_number' >> Map(lambda x: x['number'])
      | 'sum_globally' >> CombineGlobally(sum)
      | 'validate_number' >> FlatMap(
          lambda x: TestParquetIT._sum_verifier(init_size, data_size, x)))
  v2 = (
      read
      | 'make_pair' >> Map(lambda x: (x['name'], x['number']))
      | 'count_per_key' >> Count.PerKey()
      | 'validate_name' >> FlatMap(
          lambda x: TestParquetIT._count_verifier(init_size, data_size, x)))
  _ = ((v1, v2, pcol)
       | 'flatten' >> Flatten()
       | 'reshuffle' >> Reshuffle()
       | 'cleanup' >> Map(lambda x: FileSystems.delete([x])))
def finalize_write(self, init_result, writer_results):
  file_path_prefix = self.file_path_prefix.get()
  file_name_suffix = self.file_name_suffix.get()
  writer_results = sorted(writer_results)
  num_shards = len(writer_results)
  min_threads = min(num_shards, FileBasedSink._MAX_RENAME_THREADS)
  num_threads = max(1, min_threads)

  source_files = []
  destination_files = []
  chunk_size = FileSystems.get_chunk_size(file_path_prefix)
  for shard_num, shard in enumerate(writer_results):
    final_name = ''.join([
        file_path_prefix,
        self.shard_name_format % dict(shard_num=shard_num,
                                      num_shards=num_shards),
        file_name_suffix
    ])
    source_files.append(shard)
    destination_files.append(final_name)

  source_file_batch = [
      source_files[i:i + chunk_size]
      for i in range(0, len(source_files), chunk_size)
  ]
  destination_file_batch = [
      destination_files[i:i + chunk_size]
      for i in range(0, len(destination_files), chunk_size)
  ]

  logging.info(
      'Starting finalize_write threads with num_shards: %d, '
      'batches: %d, num_threads: %d', num_shards, len(source_file_batch),
      num_threads)
  start_time = time.time()

  # Use a thread pool for renaming operations.
  def _rename_batch(batch):
    """_rename_batch executes batch rename operations."""
    source_files, destination_files = batch
    exceptions = []
    try:
      FileSystems.rename(source_files, destination_files)
      return exceptions
    except BeamIOError as exp:
      if exp.exception_details is None:
        raise
      for (src, dest), exception in exp.exception_details.iteritems():
        if exception:
          logging.warning('Rename not successful: %s -> %s, %s', src, dest,
                          exception)
          should_report = True
          if isinstance(exception, IOError):
            # May have already been copied.
            try:
              if FileSystems.exists(dest):
                should_report = False
            except Exception as exists_e:  # pylint: disable=broad-except
              logging.warning('Exception when checking if file %s exists: '
                              '%s', dest, exists_e)
          if should_report:
            logging.warning(('Exception in _rename_batch. src: %s, '
                             'dest: %s, err: %s'), src, dest, exception)
            exceptions.append(exception)
        else:
          logging.debug('Rename successful: %s -> %s', src, dest)
      return exceptions

  exception_batches = util.run_using_threadpool(
      _rename_batch, zip(source_file_batch, destination_file_batch),
      num_threads)

  all_exceptions = [
      e for exception_batch in exception_batches for e in exception_batch
  ]
  if all_exceptions:
    raise Exception(
        'Encountered exceptions in finalize_write: %s' % all_exceptions)

  for final_name in destination_files:
    yield final_name

  logging.info('Renamed %d shards in %.2f seconds.', num_shards,
               time.time() - start_time)

  try:
    FileSystems.delete([init_result])
  except IOError:
    # May have already been removed.
    pass
def test_delete_error(self):
  path1 = os.path.join(self.tmpdir, 'f1')
  with self.assertRaisesRegexp(BeamIOError,
                               r'^Delete operation failed') as error:
    FileSystems.delete([path1])
  self.assertEqual(list(error.exception.exception_details.keys()), [path1])
def process(self, unused_element, unused_signal, gcs_locations):
  FileSystems.delete(list(gcs_locations))
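The two-line DoFn method above receives the paths to delete as a side input. A hedged sketch of how such a cleanup step might be wired into a pipeline; the class name, transform labels, and PCollections are illustrative, and the extra signal argument from the original is dropped for brevity:

# Hypothetical wiring of a cleanup DoFn driven by a side input of paths.
import tempfile

import apache_beam as beam
from apache_beam.io.filesystems import FileSystems


class RemoveTempFiles(beam.DoFn):
  def process(self, unused_element, locations):
    FileSystems.delete(list(locations))


tmp = tempfile.NamedTemporaryFile(delete=False)  # a real file to remove
with beam.Pipeline() as p:
  temp_paths = p | 'temp_paths' >> beam.Create([tmp.name])
  _ = (
      p
      | 'start' >> beam.Create([None])
      | 'cleanup' >> beam.ParDo(
          RemoveTempFiles(), beam.pvalue.AsIter(temp_paths)))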
def tearDown(self):
  FileSystems.delete([self.gcs_tempdir + '/'])
def finalize_write(self, init_result, writer_results):
  file_path_prefix = self.file_path_prefix.get()
  file_name_suffix = self.file_name_suffix.get()
  writer_results = sorted(writer_results)
  num_shards = len(writer_results)
  min_threads = min(num_shards, FileSink._MAX_RENAME_THREADS)
  num_threads = max(1, min_threads)

  source_files = []
  destination_files = []
  chunk_size = FileSystems.get_chunk_size(file_path_prefix)
  for shard_num, shard in enumerate(writer_results):
    final_name = ''.join([
        file_path_prefix,
        self.shard_name_format % dict(shard_num=shard_num,
                                      num_shards=num_shards),
        file_name_suffix
    ])
    source_files.append(shard)
    destination_files.append(final_name)

  source_file_batch = [source_files[i:i + chunk_size]
                       for i in xrange(0, len(source_files), chunk_size)]
  destination_file_batch = [destination_files[i:i + chunk_size]
                            for i in xrange(0, len(destination_files),
                                            chunk_size)]

  logging.info(
      'Starting finalize_write threads with num_shards: %d, '
      'batches: %d, num_threads: %d', num_shards, len(source_file_batch),
      num_threads)
  start_time = time.time()

  # Use a thread pool for renaming operations.
  def _rename_batch(batch):
    """_rename_batch executes batch rename operations."""
    source_files, destination_files = batch
    exceptions = []
    try:
      FileSystems.rename(source_files, destination_files)
      return exceptions
    except BeamIOError as exp:
      if exp.exception_details is None:
        raise
      for (src, dest), exception in exp.exception_details.iteritems():
        if exception:
          logging.warning('Rename not successful: %s -> %s, %s', src, dest,
                          exception)
          should_report = True
          if isinstance(exception, IOError):
            # May have already been copied.
            try:
              if FileSystems.exists(dest):
                should_report = False
            except Exception as exists_e:  # pylint: disable=broad-except
              logging.warning('Exception when checking if file %s exists: '
                              '%s', dest, exists_e)
          if should_report:
            logging.warning(('Exception in _rename_batch. src: %s, '
                             'dest: %s, err: %s'), src, dest, exception)
            exceptions.append(exception)
        else:
          logging.debug('Rename successful: %s -> %s', src, dest)
      return exceptions

  exception_batches = util.run_using_threadpool(
      _rename_batch, zip(source_file_batch, destination_file_batch),
      num_threads)

  all_exceptions = [e for exception_batch in exception_batches
                    for e in exception_batch]
  if all_exceptions:
    raise Exception(
        'Encountered exceptions in finalize_write: %s' % all_exceptions)

  for final_name in destination_files:
    yield final_name

  logging.info('Renamed %d shards in %.2f seconds.', num_shards,
               time.time() - start_time)

  try:
    FileSystems.delete([init_result])
  except IOError:
    # May have already been removed.
    pass
def finalize_write(self, init_result, writer_results,
                   unused_pre_finalize_results):
  writer_results = sorted(writer_results)
  num_shards = len(writer_results)

  src_files, dst_files, delete_files, num_skipped = (
      self._check_state_for_finalize_write(writer_results, num_shards))
  num_skipped += len(delete_files)
  FileSystems.delete(delete_files)
  num_shards_to_finalize = len(src_files)
  min_threads = min(num_shards_to_finalize, FileBasedSink._MAX_RENAME_THREADS)
  num_threads = max(1, min_threads)
  chunk_size = FileSystems.get_chunk_size(self.file_path_prefix.get())
  source_file_batch = [
      src_files[i:i + chunk_size]
      for i in range(0, len(src_files), chunk_size)
  ]
  destination_file_batch = [
      dst_files[i:i + chunk_size]
      for i in range(0, len(dst_files), chunk_size)
  ]

  if num_shards_to_finalize:
    logging.info(
        'Starting finalize_write threads with num_shards: %d (skipped: %d), '
        'batches: %d, num_threads: %d', num_shards_to_finalize, num_skipped,
        len(source_file_batch), num_threads)
    start_time = time.time()

    # Use a thread pool for renaming operations.
    def _rename_batch(batch):
      """_rename_batch executes batch rename operations."""
      source_files, destination_files = batch
      exceptions = []
      try:
        FileSystems.rename(source_files, destination_files)
        return exceptions
      except BeamIOError as exp:
        if exp.exception_details is None:
          raise
        for (src, dst), exception in iteritems(exp.exception_details):
          if exception:
            logging.error(('Exception in _rename_batch. src: %s, '
                           'dst: %s, err: %s'), src, dst, exception)
            exceptions.append(exception)
          else:
            logging.debug('Rename successful: %s -> %s', src, dst)
        return exceptions

    exception_batches = util.run_using_threadpool(
        _rename_batch, list(zip(source_file_batch, destination_file_batch)),
        num_threads)

    all_exceptions = [
        e for exception_batch in exception_batches for e in exception_batch
    ]
    if all_exceptions:
      raise Exception(
          'Encountered exceptions in finalize_write: %s' % all_exceptions)

    for final_name in dst_files:
      yield final_name

    logging.info('Renamed %d shards in %.2f seconds.', num_shards_to_finalize,
                 time.time() - start_time)
  else:
    logging.warning(
        'No shards found to finalize. num_shards: %d, skipped: %d',
        num_shards, num_skipped)

  try:
    FileSystems.delete([init_result])
  except IOError:
    # May have already been removed.
    pass
def finalize_write(self, init_result, writer_results,
                   unused_pre_finalize_results):
  writer_results = sorted(writer_results)
  num_shards = len(writer_results)

  src_files = []
  dst_files = []
  delete_files = []
  chunk_size = FileSystems.get_chunk_size(self.file_path_prefix.get())
  num_skipped = 0
  for shard_num, shard in enumerate(writer_results):
    final_name = self._get_final_name(shard_num, num_shards)
    src = shard
    dst = final_name
    src_exists = FileSystems.exists(src)
    dst_exists = FileSystems.exists(dst)
    if not src_exists and not dst_exists:
      raise BeamIOError('src and dst files do not exist. src: %s, dst: %s' % (
          src, dst))
    if not src_exists and dst_exists:
      logging.debug('src: %s -> dst: %s already renamed, skipping', src, dst)
      num_skipped += 1
      continue
    if (src_exists and dst_exists and
        FileSystems.checksum(src) == FileSystems.checksum(dst)):
      logging.debug('src: %s == dst: %s, deleting src', src, dst)
      delete_files.append(src)
      continue

    src_files.append(src)
    dst_files.append(dst)

  num_skipped = len(delete_files)
  FileSystems.delete(delete_files)
  num_shards_to_finalize = len(src_files)
  min_threads = min(num_shards_to_finalize, FileBasedSink._MAX_RENAME_THREADS)
  num_threads = max(1, min_threads)

  source_file_batch = [src_files[i:i + chunk_size]
                       for i in range(0, len(src_files), chunk_size)]
  destination_file_batch = [dst_files[i:i + chunk_size]
                            for i in range(0, len(dst_files), chunk_size)]

  if num_shards_to_finalize:
    logging.info(
        'Starting finalize_write threads with num_shards: %d (skipped: %d), '
        'batches: %d, num_threads: %d', num_shards_to_finalize, num_skipped,
        len(source_file_batch), num_threads)
    start_time = time.time()

    # Use a thread pool for renaming operations.
    def _rename_batch(batch):
      """_rename_batch executes batch rename operations."""
      source_files, destination_files = batch
      exceptions = []
      try:
        FileSystems.rename(source_files, destination_files)
        return exceptions
      except BeamIOError as exp:
        if exp.exception_details is None:
          raise
        for (src, dst), exception in exp.exception_details.iteritems():
          if exception:
            logging.error(('Exception in _rename_batch. src: %s, '
                           'dst: %s, err: %s'), src, dst, exception)
            exceptions.append(exception)
          else:
            logging.debug('Rename successful: %s -> %s', src, dst)
        return exceptions

    exception_batches = util.run_using_threadpool(
        _rename_batch, zip(source_file_batch, destination_file_batch),
        num_threads)

    all_exceptions = [e for exception_batch in exception_batches
                      for e in exception_batch]
    if all_exceptions:
      raise Exception(
          'Encountered exceptions in finalize_write: %s' % all_exceptions)

    for final_name in dst_files:
      yield final_name

    logging.info('Renamed %d shards in %.2f seconds.', num_shards_to_finalize,
                 time.time() - start_time)
  else:
    logging.warning(
        'No shards found to finalize. num_shards: %d, skipped: %d',
        num_shards, num_skipped)

  try:
    FileSystems.delete([init_result])
  except IOError:
    # May have already been removed.
    pass
def tearDown(self):
  FileSystems.delete([self.outdir + '/'])
def delete_file(path, d_pl_options, recursive=False, r_level=0, debug=False):
    fs = FileSystems.get_filesystem(path)
    if type(fs) == GCSFileSystem:
        gcs_client = get_gcs_client()
        if debug:
            print(
                f"{'-'*(r_level)} delete_file (debug): path: {path}, recursive: {recursive}"
            )
        if recursive:
            child_paths = list_dir(path, d_pl_options, exclude_subdir=False)
            for child_path in child_paths:
                if child_path != path:
                    if debug:
                        print(
                            f"{'-'*(r_level+1)} delete_file (debug): path {path} has child: {child_path}"
                        )
                    delete_file(
                        child_path, d_pl_options, recursive=True,
                        r_level=r_level + 1
                    )
            # don't need to recurse (return, since gcsio deletes all leaves from the root)

        # not stripped, not corrected case
        blob_path = get_gcs_bucket(d_pl_options).blob(path)
        path_not_stripped_not_gcs_corrected_exists = blob_path.exists(gcs_client)
        if debug:
            print(
                f"{'-'*(r_level)} {path} (not stripped, not gcs corrected): {blob_path}, exists: {path_not_stripped_not_gcs_corrected_exists}"
            )
        if path_not_stripped_not_gcs_corrected_exists:
            blob_path_delete_result = blob_path.delete(gcs_client)
            if debug:
                print(
                    f"{'-'*(r_level)} {path} (not stripped, not gcs corrected): {blob_path}, exists: {blob_path.exists(gcs_client)} (after delete attempt)"
                )
            return blob_path_delete_result
        else:
            # not stripped, gcs corrected case
            path_not_stripped_gcs_corrected = gcs_correct_dir_path_form(
                path, d_pl_options, strip_prefix=False)
            blob_path = get_gcs_bucket(d_pl_options).blob(
                path_not_stripped_gcs_corrected)
            path_not_stripped_gcs_corrected_exists = blob_path.exists(gcs_client)
            if debug:
                print(
                    f"{'-'*(r_level)} {path_not_stripped_gcs_corrected} (not stripped, gcs corrected): {blob_path}, exists: {path_not_stripped_gcs_corrected_exists}"
                )
            if path_not_stripped_gcs_corrected_exists:
                blob_path_delete_result = blob_path.delete(gcs_client)
                if debug:
                    print(
                        f"{'-'*(r_level)} {path_not_stripped_gcs_corrected} (not stripped, gcs corrected): {blob_path}, exists: {blob_path.exists(gcs_client)} (after delete attempt)"
                    )
                return blob_path_delete_result
            else:
                # stripped, not gcs corrected case
                path_stripped_not_gcs_corrected = gcs_path_strip_prefix(
                    path, d_pl_options)
                blob_path = get_gcs_bucket(d_pl_options).blob(
                    path_stripped_not_gcs_corrected)
                path_stripped_not_gcs_corrected_exists = blob_path.exists(gcs_client)
                if debug:
                    print(
                        f"{'-'*(r_level)} {path_stripped_not_gcs_corrected} (stripped, not gcs corrected): {blob_path}, exists: {path_stripped_not_gcs_corrected_exists}"
                    )
                if path_stripped_not_gcs_corrected_exists:
                    blob_path_delete_result = blob_path.delete(gcs_client)
                    if debug:
                        print(
                            f"{'-'*(r_level)} {path_stripped_not_gcs_corrected} (stripped, not gcs corrected): {blob_path}, exists: {blob_path.exists(gcs_client)} (after delete attempt)"
                        )
                    return blob_path_delete_result
                else:
                    # stripped, gcs corrected case
                    path_stripped_gcs_corrected = gcs_correct_dir_path_form(
                        path, d_pl_options, strip_prefix=True)
                    blob_path = get_gcs_bucket(d_pl_options).blob(
                        path_stripped_gcs_corrected)
                    path_stripped_gcs_corrected_exists = blob_path.exists(gcs_client)
                    if debug:
                        print(
                            f"{'-'*(r_level)} {path_stripped_gcs_corrected} (stripped, gcs corrected): {blob_path}, exists: {path_stripped_gcs_corrected_exists}"
                        )
                    if path_stripped_gcs_corrected_exists:
                        blob_path_delete_result = blob_path.delete(gcs_client)
                        if debug:
                            print(
                                f"{'-'*(r_level)} {path_stripped_gcs_corrected} (stripped, gcs corrected): {blob_path}, exists: {blob_path.exists(gcs_client)} (after delete attempt)"
                            )
                        return blob_path_delete_result
                    else:
                        if debug:
                            print(
                                f"{'-'*(r_level)} out of options trying to delete base path {path}!"
                            )
                        return False
    else:
        return FileSystems.delete([path])