def _verify_copy(self, src, dst, dst_kms_key_name=None):
    self.assertTrue(FileSystems.exists(src), 'src does not exist: %s' % src)
    self.assertTrue(FileSystems.exists(dst), 'dst does not exist: %s' % dst)
    src_checksum = self.gcsio.checksum(src)
    dst_checksum = self.gcsio.checksum(dst)
    self.assertEqual(src_checksum, dst_checksum)
    self.assertEqual(self.gcsio.kms_key(dst), dst_kms_key_name)
def test_exists(self):
    path1 = os.path.join(self.tmpdir, 'f1')
    path2 = os.path.join(self.tmpdir, 'f2')
    with open(path1, 'a') as f:
        f.write('Hello')
    self.assertTrue(FileSystems.exists(path1))
    self.assertFalse(FileSystems.exists(path2))
def mkdirs_if_not_exists(path):
    if not FileSystems.exists(path):
        try:
            get_logger().info('attempting to create directory: %s', path)
            FileSystems.mkdirs(path)
        except IOError:
            if not FileSystems.exists(path):
                raise
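# Illustrative usage sketch (not from the original source; paths are examples
# only): the IOError handler above makes creation tolerant of a concurrent
# worker creating the same directory, since the error is re-raised only if the
# directory still does not exist afterwards.
import os
import tempfile

from apache_beam.io.filesystems import FileSystems

output_dir = os.path.join(tempfile.mkdtemp(), 'nested', 'output')
mkdirs_if_not_exists(output_dir)
mkdirs_if_not_exists(output_dir)  # second call is a no-op
assert FileSystems.exists(output_dir)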
def test_delete_files_succeeds(self):
    path = os.path.join(self.tmpdir, 'f1')
    with open(path, 'a') as f:
        f.write('test')
    assert FileSystems.exists(path)
    utils.delete_files([path])
    assert not FileSystems.exists(path)
def test_delete(self):
    path1 = os.path.join(self.tmpdir, 'f1')
    with open(path1, 'a') as f:
        f.write('Hello')
    self.assertTrue(FileSystems.exists(path1))
    FileSystems.delete([path1])
    self.assertFalse(FileSystems.exists(path1))
def _verify_copy(self, src, dst, dst_kms_key_name=None):
    self.assertTrue(FileSystems.exists(src), 'src does not exist: %s' % src)
    self.assertTrue(FileSystems.exists(dst), 'dst does not exist: %s' % dst)
    src_checksum = self.gcsio.checksum(src)
    dst_checksum = self.gcsio.checksum(dst)
    self.assertEqual(src_checksum, dst_checksum)
    actual_dst_kms_key = self.gcsio.kms_key(dst)
    if actual_dst_kms_key is None:
        self.assertEqual(actual_dst_kms_key, dst_kms_key_name)
    else:
        # GCS reports the KMS key with a key-version suffix appended, so a
        # prefix match against the configured key name is sufficient here.
        self.assertTrue(
            actual_dst_kms_key.startswith(dst_kms_key_name),
            "got: %s, wanted startswith: %s" % (actual_dst_kms_key,
                                                dst_kms_key_name))
def test_rename_directory(self):
    path_t1 = os.path.join(self.tmpdir, 't1')
    path_t2 = os.path.join(self.tmpdir, 't2')
    FileSystems.mkdirs(path_t1)

    path1 = os.path.join(path_t1, 'f1')
    path2 = os.path.join(path_t2, 'f1')
    with open(path1, 'a') as f:
        f.write('Hello')

    FileSystems.rename([path_t1], [path_t2])
    self.assertTrue(FileSystems.exists(path_t2))
    self.assertFalse(FileSystems.exists(path_t1))
    self.assertTrue(FileSystems.exists(path2))
    self.assertFalse(FileSystems.exists(path1))
def get_vcf_headers(input_file):
    # type: (str) -> vcf_header_io.VcfHeader
    """Returns VCF headers from ``input_file``.

    Args:
        input_file (str): A string specifying the path to the representative VCF
            file, i.e., the VCF file that contains a header representative of all
            VCF files matching the input_pattern of the job. It can be local or
            remote (e.g. on GCS).

    Returns:
        VCF header info.

    Raises:
        ValueError: If ``input_file`` is not a valid VCF file (e.g. bad format,
            empty, non-existent).
    """
    if not FileSystems.exists(input_file):
        raise ValueError('VCF header does not exist')
    try:
        vcf_reader = vcf.Reader(fsock=_line_generator(input_file))
    except (SyntaxError, StopIteration) as e:
        raise ValueError('Invalid VCF header in %s: %s' % (input_file, str(e)))
    return vcf_header_io.VcfHeader(
        infos=vcf_reader.infos,
        filters=vcf_reader.filters,
        alts=vcf_reader.alts,
        formats=vcf_reader.formats,
        contigs=vcf_reader.contigs,
        file_path=input_file)
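# Illustrative usage of get_vcf_headers (an assumption, not from the source:
# the path is a placeholder and the printed `infos` attribute is only an
# example of what the returned VcfHeader exposes).
try:
    header = get_vcf_headers('gs://example-bucket/variants/sample.vcf')
    print(sorted(header.infos.keys()))
except ValueError as e:
    # Raised when the file is missing or its header is malformed.
    print('Could not read VCF header: %s' % e)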
def _rename_batch(batch):
    """_rename_batch executes batch rename operations."""
    source_files, destination_files = batch
    exceptions = []
    try:
        FileSystems.rename(source_files, destination_files)
        return exceptions
    except BeamIOError as exp:
        if exp.exception_details is None:
            raise
        for (src, dest), exception in exp.exception_details.iteritems():
            if exception:
                logging.warning('Rename not successful: %s -> %s, %s',
                                src, dest, exception)
                should_report = True
                if isinstance(exception, IOError):
                    # May have already been copied.
                    try:
                        if FileSystems.exists(dest):
                            should_report = False
                    except Exception as exists_e:  # pylint: disable=broad-except
                        logging.warning('Exception when checking if file %s exists: '
                                        '%s', dest, exists_e)
                if should_report:
                    logging.warning(('Exception in _rename_batch. src: %s, '
                                     'dest: %s, err: %s'), src, dest, exception)
                    exceptions.append(exception)
            else:
                logging.debug('Rename successful: %s -> %s', src, dest)
        return exceptions
def path_exists(path, d_pl_options, is_dir):
    dir_path = path
    fs = FileSystems.get_filesystem(dir_path)
    if type(fs) == GCSFileSystem:
        dir_path = gcs_correct_dir_path_form(
            dir_path, d_pl_options, strip_prefix=False) if is_dir else path
    return FileSystems.exists(dir_path), dir_path
def run_pipeline(pipeline_args, known_args):
    """Splits images into separate directories using thresholds on randnum.

    Args:
        pipeline_args: arguments ingested by beam pipeline
        known_args: additional arguments for this project, such as the storage
            bucket, source_image_dir, and dest_image_dir.

    Returns:
        [nothing] - runs beam pipeline and copies output files to different dirs
    """
    # Specify pipeline options
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Attach bucket prefix if running on cloud
    source_images_pattern = known_args.source_image_dir + '/*'
    dest_prefix = known_args.dest_image_dir + '/'
    if known_args.cloud:
        source_images_pattern = ('gs://' + known_args.storage_bucket + '/' +
                                 source_images_pattern)
        dest_prefix = ('gs://' + known_args.storage_bucket + '/' + dest_prefix)

    # Get output directories for split images
    split_names = known_args.split_names
    split_fractions = known_args.split_fractions
    dest_images_dirs = [dest_prefix + x + '/' for x in split_names]

    # Create output directories if they do not already exist (for local runs)
    for dest_images_dir in dest_images_dirs:
        if not FileSystems.exists(dest_images_dir):
            FileSystems.mkdirs(dest_images_dir)

    # Log information on source, destination, and split fractions
    split_log_list = [
        x[0] + '(' + str(x[1]) + ')' for x in zip(split_names, split_fractions)
    ]
    logging.info('Starting ' + ' | '.join(split_log_list) +
                 ' split from images with source file pattern ' +
                 source_images_pattern)
    logging.info('Destination parent directory: ' + dest_prefix)

    with beam.Pipeline(options=pipeline_options) as p:
        # Read files and partition pipelines
        split_pipelines = (
            p
            | 'read_images' >> beam.io.Read(
                LabeledImageFileReader(source_images_pattern))
            | 'split_images' >> beam.Partition(
                generate_split_fn(split_fractions), len(split_fractions)))

        # Write each pipeline to a corresponding output directory
        for partition, split_name_and_dest_dir in enumerate(
                zip(split_names, dest_images_dirs)):
            _ = (split_pipelines[partition]
                 | 'write_' + split_name_and_dest_dir[0] >> beam.Map(
                     write_to_directory, dst_dir=split_name_and_dest_dir[1]))

    logging.info('Done splitting image sets')
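# Hypothetical invocation sketch: the attribute names on known_args are
# inferred from the docstring and the function body above; the runner and all
# values are placeholders.
from argparse import Namespace

known_args = Namespace(
    cloud=False,
    storage_bucket='example-bucket',
    source_image_dir='images/raw',
    dest_image_dir='images/split',
    split_names=['train', 'validation', 'test'],
    split_fractions=[0.8, 0.1, 0.1],
)
run_pipeline(pipeline_args=['--runner=DirectRunner'], known_args=known_args)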
def test_delete_files_fails_with_io_error(self, mocked_delete):
    f = tempfile.NamedTemporaryFile(dir=self.tmpdir, delete=False)
    assert FileSystems.exists(f.name)

    with self.assertRaises(BeamIOError):
        utils.delete_files([f.name])
    self.assertTrue(mocked_delete.called)
    self.assertEqual(mocked_delete.call_count, 4)
def pre_finalize(self, init_result, writer_results):
    writer_results = sorted(writer_results)
    num_shards = len(writer_results)
    existing_files = []
    for shard_num in range(len(writer_results)):
        final_name = self._get_final_name(shard_num, num_shards)
        if FileSystems.exists(final_name):
            existing_files.append(final_name)
    if existing_files:
        logging.info('Deleting existing files in target path: %d',
                     len(existing_files))
        FileSystems.delete(existing_files)
def run_pipeline(pipeline_args, known_args):
    """A beam pipeline to resize and pad images from urls and save to storage.

    Args:
        pipeline_args: Arguments consumed by the beam pipeline
        known_args: Extra args used to set various fields such as the dataset and
            table from which to read cat urls and labels, and the bucket and image
            directory to write processed images

    Returns:
        [nothing], just writes processed images to the image directory
    """
    # Specify pipeline options
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Determine bigquery source from dataset and table arguments
    query = ('SELECT ROW_NUMBER() OVER() as index, original_url, label, randnum'
             ' from [' + known_args.dataset + '.' + known_args.table + ']')
    bq_source = bigquery.BigQuerySource(query=query)

    logging.info('Starting image collection into directory ' +
                 known_args.output_dir)

    # Create destination directory if it doesn't exist
    output_dir = known_args.output_dir
    if known_args.cloud:
        output_dir = 'gs://' + known_args.storage_bucket + '/' + output_dir

    # Directory needs to be explicitly made on some filesystems.
    if not FileSystems.exists(output_dir):
        FileSystems.mkdirs(output_dir)

    # Run pipeline
    with beam.Pipeline(options=pipeline_options) as p:
        _ = (p
             | 'read_rows_from_cat_info_table' >> beam.io.Read(bq_source)
             | 'fetch_images_from_urls' >> beam.Map(fetch_image_from_url)
             | 'filter_bad_or_absent_images' >> beam.Filter(
                 filter_bad_or_missing_image)
             | 'resize_and_pad_images' >> beam.Map(
                 resize_and_pad, output_image_dim=known_args.output_image_dim)
             | 'write_images_to_storage' >> beam.Map(
                 write_processed_image, output_dir=output_dir))

    logging.info('Done collecting images')
def get_metadata_header_lines(input_file):
    # type: (str) -> List[str]
    """Returns header lines from the given VCF file ``input_file``.

    Only returns lines starting with ## and not #.

    Args:
        input_file: A string specifying the path to a VCF file. It can be local
            or remote (e.g. on GCS).

    Returns:
        A list containing header lines of ``input_file``.

    Raises:
        ValueError: If ``input_file`` does not exist.
    """
    if not FileSystems.exists(input_file):
        raise ValueError('{} does not exist'.format(input_file))
    return [
        line for line in _line_generator(input_file) if line.startswith('##')
    ]
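# Illustrative usage (the path is a placeholder): only '##' meta-information
# lines are returned; the final '#CHROM ...' header line is excluded.
for meta_line in get_metadata_header_lines('/tmp/sample.vcf'):
    print(meta_line.rstrip())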
def get_vcf_headers(input_file):
    if not FileSystems.exists(input_file):
        raise ValueError('VCF header does not exist')
    header = libcbcf.VariantHeader()
    lines = _header_line_generator(input_file)
    sample_line = None
    header.add_line('##fileformat=VCFv4.0\n')
    file_empty = True
    read_file_format_line = False
    for line in lines:
        if not read_file_format_line:
            read_file_format_line = True
            if line and not line.startswith(
                    vcf_header_io.FILE_FORMAT_HEADER_TEMPLATE.format(VERSION='')):
                header.add_line(
                    vcf_header_io.FILE_FORMAT_HEADER_TEMPLATE.format(VERSION='4.0'))
        if line.startswith('##'):
            header.add_line(line.strip())
            file_empty = False
        elif line.startswith('#'):
            sample_line = line.strip()
            file_empty = False
        elif line:
            # If non-empty non-header line exists, #CHROM line has to be supplied.
            if not sample_line:
                raise ValueError('Header line is missing')
        else:
            if file_empty:
                raise ValueError('File is empty')
    # If no records were found, use dummy #CHROM line for sample extraction.
    if not sample_line:
        sample_line = vcf_header_io.LAST_HEADER_LINE_PREFIX
    return vcf_header_io.VcfHeader(
        infos=header.info,
        filters=header.filters,
        alts=header.alts,
        formats=header.formats,
        contigs=header.contigs,
        samples=sample_line,
        file_path=input_file)
def upload_to_bundle_store(self, bundle: Bundle, source: Source, git: bool, unpack: bool):
    """Uploads the given source to the bundle store.
    Given arguments are the same as UploadManager.upload_to_bundle_store().
    Used when uploading from rest server."""
    try:
        # bundle_path = self._bundle_store.get_bundle_location(bundle.uuid)
        is_url, is_fileobj, filename = self._interpret_source(source)
        if is_url:
            assert isinstance(source, str)
            if git:
                bundle_path = self._update_and_get_bundle_location(
                    bundle, is_directory=True)
                self.write_git_repo(source, bundle_path)
            else:
                # If downloading from a URL, convert the source to a file object.
                is_fileobj = True
                source = (filename, urlopen_with_retry(source))
        if is_fileobj:
            source_filename, source_fileobj = cast(Tuple[str, IO[bytes]], source)
            source_ext = zip_util.get_archive_ext(source_filename)
            if unpack and zip_util.path_is_archive(filename):
                bundle_path = self._update_and_get_bundle_location(
                    bundle, is_directory=source_ext in ARCHIVE_EXTS_DIR)
                self.write_fileobj(source_ext, source_fileobj, bundle_path,
                                   unpack_archive=True)
            else:
                bundle_path = self._update_and_get_bundle_location(
                    bundle, is_directory=False)
                self.write_fileobj(source_ext, source_fileobj, bundle_path,
                                   unpack_archive=False)
    except UsageError:
        if FileSystems.exists(bundle_path):
            path_util.remove(bundle_path)
        raise
def get_vcf_headers(input_file):
    """Returns VCF headers (FORMAT and INFO) from ``input_file``.

    Args:
        input_file (str): A string specifying the path to the representative VCF
            file, i.e., the VCF file that contains a header representative of all
            VCF files matching the input_pattern of the job. It can be local or
            remote (e.g. on GCS).

    Returns:
        ``HeaderFields`` specifying header info.

    Raises:
        ValueError: If ``input_file`` is not a valid VCF file (e.g. bad format,
            empty, non-existent).
    """
    if not FileSystems.exists(input_file):
        raise ValueError('VCF header does not exist')
    try:
        vcf_reader = vcf.Reader(fsock=_line_generator(input_file))
    except (SyntaxError, StopIteration) as e:
        raise ValueError('Invalid VCF header: %s' % str(e))
    return HeaderFields(vcf_reader.infos, vcf_reader.formats)
def remove(path):
    """
    Remove the given path, whether it is a directory, file, or link.
    """
    if parse_linked_bundle_url(path).uses_beam:
        from apache_beam.io.filesystems import FileSystems

        # Only attempt the delete when the path exists; deleting a missing
        # path would raise.
        if FileSystems.exists(path):
            FileSystems.delete([path])
        return
    check_isvalid(path, 'remove')
    set_write_permissions(path)  # Allow permissions
    if os.path.islink(path):
        os.unlink(path)
    elif os.path.isdir(path):
        try:
            shutil.rmtree(path)
        except shutil.Error:
            pass
    else:
        os.remove(path)
    if os.path.exists(path):
        print('Failed to remove %s' % path)
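# Illustrative usage sketch (the path is a placeholder): `remove` dispatches to
# apache_beam FileSystems for Blob Storage bundle URLs and falls back to
# os/shutil for local directories, files, and links.
remove('/tmp/scratch/old-bundle')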
def get_path_exists(path):
    """
    Returns whether the given path exists.
    """
    return FileSystems.exists(path)
def test_delete_files_succeeds(self):
    f = tempfile.NamedTemporaryFile(dir=self.tmpdir, delete=False)
    assert FileSystems.exists(f.name)
    utils.delete_files([f.name])
    assert not FileSystems.exists(f.name)
def tftransform(
        pipeline_args,          # type: List[str]
        temp_location,          # type: str
        schema_file,            # type: str
        output_dir,             # type: str
        preprocessing_fn,       # type: Any
        training_data=None,     # type: Union[None, str]
        evaluation_data=None,   # type: Union[None, str]
        transform_fn_dir=None,  # type: Union[None, str]
        compression_type=None   # type: str
):
    # type: (...) -> PipelineState
    """
    Generic tf.transform pipeline that takes tf.{example, record} training and
    evaluation datasets and outputs transformed data together with transform
    function Saved Model.

    :param pipeline_args: un-parsed Dataflow arguments
    :param temp_location: temporary location for dataflow job working dir
    :param schema_file: path to the raw feature schema text file
    :param output_dir: output dir for transformed data and function
    :param preprocessing_fn: tf.transform preprocessing function
    :param training_data: path to the training data
    :param evaluation_data: path to the evaluation data
    :param transform_fn_dir: dir to previously saved transformation function to apply
    :param compression_type: compression type for writing of tf.records
    :return: final state of the Beam pipeline
    """
    assert_not_empty_string(temp_location)
    assert_not_empty_string(schema_file)
    assert_not_empty_string(output_dir)
    assert_not_none(preprocessing_fn)

    if compression_type is None:
        compression_type = CompressionTypes.AUTO

    raw_feature_spec = schema_txt_file_to_feature_spec(schema_file)
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)
    raw_data_coder = ExampleProtoCoder(raw_data_metadata.schema)

    transformed_train_output_dir = os.path.join(output_dir, "training")
    transformed_eval_output_dir = os.path.join(output_dir, "evaluation")

    if not any(i.startswith("--job_name") for i in pipeline_args):
        pipeline_args.append("--job_name=tf-transform-{}-{}".format(
            getpass.getuser(), int(time.time())))

    pipeline = beam.Pipeline(argv=pipeline_args)
    with beam_impl.Context(temp_dir=temp_location):
        if training_data is not None:
            # if training data is provided, transform_fn_dir will be ignored
            if transform_fn_dir is not None:
                warnings.warn(
                    "Transform_fn_dir is ignored because training_data is provided")

            transform_fn_output = os.path.join(output_dir, "transform_fn",
                                               "saved_model.pb")
            if FileSystems.exists(transform_fn_output):
                raise ValueError("Transform fn already exists at %s!"
                                 % transform_fn_output)

            # compute the transform_fn and apply to the training data
            raw_train_data = (
                pipeline
                | "ReadTrainData" >> tfrecordio.ReadFromTFRecord(
                    training_data, coder=raw_data_coder))

            ((transformed_train_data, transformed_train_metadata),
             transform_fn) = (
                 (raw_train_data, raw_data_metadata)
                 | "AnalyzeAndTransformTrainData"
                 >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

            _ = (  # noqa: F841
                transform_fn
                | "WriteTransformFn"
                >> transform_fn_io.WriteTransformFn(output_dir))

            transformed_train_coder = ExampleProtoCoder(
                transformed_train_metadata.schema)
            _ = (  # noqa: F841
                transformed_train_data
                | "WriteTransformedTrainData" >> tfrecordio.WriteToTFRecord(
                    os.path.join(transformed_train_output_dir, "part"),
                    coder=transformed_train_coder,
                    compression_type=compression_type,
                    file_name_suffix=".tfrecords"))
        else:
            if transform_fn_dir is None:
                raise ValueError(
                    "Either training_data or transformed_fn needs to be provided")
            # load the transform_fn
            transform_fn = pipeline | transform_fn_io.ReadTransformFn(
                transform_fn_dir)

        if evaluation_data is not None:
            # if evaluation_data exists, apply the transform_fn to the evaluation data
            raw_eval_data = (
                pipeline
                | "ReadEvalData" >> tfrecordio.ReadFromTFRecord(
                    evaluation_data, coder=raw_data_coder))

            (transformed_eval_data, transformed_eval_metadata) = (
                ((raw_eval_data, raw_data_metadata), transform_fn)
                | "TransformEvalData" >> beam_impl.TransformDataset())

            transformed_eval_coder = ExampleProtoCoder(
                transformed_eval_metadata.schema)
            _ = (  # noqa: F841
                transformed_eval_data
                | "WriteTransformedEvalData" >> tfrecordio.WriteToTFRecord(
                    os.path.join(transformed_eval_output_dir, "part"),
                    coder=transformed_eval_coder,
                    compression_type=compression_type,
                    file_name_suffix=".tfrecords"))

    result = pipeline.run().wait_until_finish()
    return result
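# Hypothetical call sketch: all paths are placeholders and the preprocessing
# function below is an identity transform purely for illustration.
def identity_preprocessing_fn(inputs):
    return dict(inputs)

tftransform(
    pipeline_args=['--runner=DirectRunner'],
    temp_location='gs://example-bucket/tmp',
    schema_file='gs://example-bucket/schema.pbtxt',
    output_dir='gs://example-bucket/tft-output',
    preprocessing_fn=identity_preprocessing_fn,
    training_data='gs://example-bucket/train-*.tfrecords',
    evaluation_data='gs://example-bucket/eval-*.tfrecords',
)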
def finalize_write(self, init_result, writer_results,
                   unused_pre_finalize_results):
    writer_results = sorted(writer_results)
    num_shards = len(writer_results)

    src_files = []
    dst_files = []
    delete_files = []
    chunk_size = FileSystems.get_chunk_size(self.file_path_prefix.get())
    num_skipped = 0
    for shard_num, shard in enumerate(writer_results):
        final_name = self._get_final_name(shard_num, num_shards)
        src = shard
        dst = final_name
        src_exists = FileSystems.exists(src)
        dst_exists = FileSystems.exists(dst)
        if not src_exists and not dst_exists:
            raise BeamIOError(
                'src and dst files do not exist. src: %s, dst: %s' % (src, dst))
        if not src_exists and dst_exists:
            logging.debug('src: %s -> dst: %s already renamed, skipping',
                          src, dst)
            num_skipped += 1
            continue
        if (src_exists and dst_exists and
                FileSystems.checksum(src) == FileSystems.checksum(dst)):
            logging.debug('src: %s == dst: %s, deleting src', src, dst)
            delete_files.append(src)
            continue

        src_files.append(src)
        dst_files.append(dst)

    # Accumulate (rather than overwrite) the skip count so the renamed shards
    # counted above are not lost from the log message below.
    num_skipped += len(delete_files)
    FileSystems.delete(delete_files)
    num_shards_to_finalize = len(src_files)
    min_threads = min(num_shards_to_finalize,
                      FileBasedSink._MAX_RENAME_THREADS)
    num_threads = max(1, min_threads)

    source_file_batch = [
        src_files[i:i + chunk_size]
        for i in range(0, len(src_files), chunk_size)
    ]
    destination_file_batch = [
        dst_files[i:i + chunk_size]
        for i in range(0, len(dst_files), chunk_size)
    ]

    if num_shards_to_finalize:
        logging.info(
            'Starting finalize_write threads with num_shards: %d (skipped: %d), '
            'batches: %d, num_threads: %d',
            num_shards_to_finalize, num_skipped, len(source_file_batch),
            num_threads)
        start_time = time.time()

        # Use a thread pool for renaming operations.
        def _rename_batch(batch):
            """_rename_batch executes batch rename operations."""
            source_files, destination_files = batch
            exceptions = []
            try:
                FileSystems.rename(source_files, destination_files)
                return exceptions
            except BeamIOError as exp:
                if exp.exception_details is None:
                    raise
                for (src, dst), exception in exp.exception_details.iteritems():
                    if exception:
                        logging.error(('Exception in _rename_batch. src: %s, '
                                       'dst: %s, err: %s'), src, dst, exception)
                        exceptions.append(exception)
                    else:
                        logging.debug('Rename successful: %s -> %s', src, dst)
                return exceptions

        exception_batches = util.run_using_threadpool(
            _rename_batch, zip(source_file_batch, destination_file_batch),
            num_threads)

        all_exceptions = [
            e for exception_batch in exception_batches for e in exception_batch
        ]
        if all_exceptions:
            raise Exception(
                'Encountered exceptions in finalize_write: %s' % all_exceptions)

        for final_name in dst_files:
            yield final_name

        logging.info('Renamed %d shards in %.2f seconds.',
                     num_shards_to_finalize, time.time() - start_time)
    else:
        logging.warning(
            'No shards found to finalize. num_shards: %d, skipped: %d',
            num_shards, num_skipped)

    try:
        FileSystems.delete([init_result])
    except IOError:
        # May have already been removed.
        pass
def _compute_target_info_blob(
    path: str, depth: Union[int, float], return_generators=False
) -> TargetInfo:
    """Computes target info for a file that is externalized on Blob Storage,
    meaning that it's contained within an indexed archive file.

    Args:
        path (str): The path that refers to the specified target.
        depth (Union[int, float]): Depth until which directory contents are resolved.
        return_generators (bool, optional): If set to True, the 'contents' key of
            directories is equal to a generator instead of a list. Defaults to False.

    Raises:
        PathException: Path not found or invalid.

    Returns:
        TargetInfo: Target info of specified path.
    """
    linked_bundle_path = parse_linked_bundle_url(path)
    if not FileSystems.exists(linked_bundle_path.bundle_path):
        raise PathException(linked_bundle_path.bundle_path)
    if not linked_bundle_path.is_archive:
        # Single file
        raise PathException(
            "Single files on Blob Storage are not supported; only a path within an archive file is supported."
        )
    # process_contents is used to process the value of the 'contents' key (which is
    # a generator) before it is returned. If return_generators is False, it resolves
    # the given generator into a list; otherwise, it just returns the generator
    # unchanged.
    process_contents = list if return_generators is False else lambda x: x
    with OpenIndexedArchiveFile(linked_bundle_path.bundle_path) as tf:
        islink = lambda finfo: stat.S_ISLNK(finfo.mode)
        readlink = lambda finfo: finfo.linkname
        isfile = lambda finfo: not stat.S_ISDIR(finfo.mode)
        isdir = lambda finfo: stat.S_ISDIR(finfo.mode)
        listdir = lambda path: cast(Dict[str, FileInfo], tf.listDir(path) or {})

        def _get_info(path: str, depth: Union[int, float]) -> TargetInfo:
            """This function is called to get the target info of the specified path.
            If the specified path is a directory and additional depth is requested,
            this function is recursively called to retrieve the target info of files
            within the directory, much like _compute_target_info_local.
            """
            if not path.startswith("/"):
                path = "/" + path
            finfo = cast(FileInfo, tf.getFileInfo(path))
            if finfo is None:
                # Not found
                raise PathException("File not found.")
            result: TargetInfo = {
                'name': os.path.basename(path),  # get last part of path
                'size': finfo.size,
                'perm': finfo.mode & 0o777,
                'type': '',
            }
            if islink(finfo):
                result['type'] = 'link'
                result['link'] = readlink(finfo)
            elif isfile(finfo):
                result['type'] = 'file'
            elif isdir(finfo):
                result['type'] = 'directory'
                if depth > 0:
                    result['contents'] = process_contents(
                        _get_info(path + "/" + file_name, depth - 1)
                        for file_name in listdir(path)
                        if file_name != "."
                    )
            return result

        if not linked_bundle_path.is_archive_dir:
            # Return the contents of the single .gz file.
            # The entry returned by ratarmount for a single .gz file is not
            # technically part of a tar archive and has a name hardcoded as
            # "contents," so we modify the type, name, and permissions of the
            # output accordingly.
            return cast(
                TargetInfo,
                dict(
                    _get_info("/contents", depth),
                    type="file",
                    name=linked_bundle_path.bundle_uuid,
                    perm=0o755,
                ),
            )
        if linked_bundle_path.archive_subpath:
            # Return the contents of a subpath within a directory.
            return _get_info(linked_bundle_path.archive_subpath, depth)
        else:
            # No subpath, return the entire directory with the bundle contents
            # in it. The permissions of this directory cannot be set by the user
            # (the user can only set permissions of files *within* this directory
            # that are part of the bundle itself), so we just return a placeholder
            # value of 0o755 for this directory's permissions.
            file = FileSystems.match([path])[0].metadata_list[0]
            result: TargetInfo = {
                'name': linked_bundle_path.bundle_uuid,
                'type': 'directory',
                'size': file.size_in_bytes,
                'perm': 0o755,
            }
            if depth > 0:
                result['contents'] = process_contents(
                    _get_info(file_name, depth - 1)
                    for file_name in listdir("/")
                    if file_name != "."
                )
            return result
def _file_exists(file_url):
    result = FileSystems.exists(file_url)
    LOGGER.debug('file exists: result=%s, url=%s', result, file_url)
    return result
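# Illustrative usage (paths are placeholders): FileSystems.exists dispatches on
# the URL scheme, so the same helper works for local and GCS paths alike.
_file_exists('/tmp/data.csv')                  # local filesystem
_file_exists('gs://example-bucket/data.csv')   # GCS (requires the gcp extras)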