Example #1
 def _verify_copy(self, src, dst, dst_kms_key_name=None):
   self.assertTrue(FileSystems.exists(src), 'src does not exist: %s' % src)
   self.assertTrue(FileSystems.exists(dst), 'dst does not exist: %s' % dst)
   src_checksum = self.gcsio.checksum(src)
   dst_checksum = self.gcsio.checksum(dst)
   self.assertEqual(src_checksum, dst_checksum)
   self.assertEqual(self.gcsio.kms_key(dst), dst_kms_key_name)
Example #2
 def test_exists(self):
   path1 = os.path.join(self.tmpdir, 'f1')
   path2 = os.path.join(self.tmpdir, 'f2')
   with open(path1, 'a') as f:
     f.write('Hello')
   self.assertTrue(FileSystems.exists(path1))
   self.assertFalse(FileSystems.exists(path2))
Example #3
 def test_exists(self):
     path1 = os.path.join(self.tmpdir, 'f1')
     path2 = os.path.join(self.tmpdir, 'f2')
     with open(path1, 'a') as f:
         f.write('Hello')
     self.assertTrue(FileSystems.exists(path1))
     self.assertFalse(FileSystems.exists(path2))
Example #4
def mkdirs_if_not_exists(path):
    if not FileSystems.exists(path):
        try:
            get_logger().info('attempting to create directory: %s', path)
            FileSystems.mkdirs(path)
        except IOError:
            if not FileSystems.exists(path):
                raise
Example #5
 def _verify_copy(self, src, dst, dst_kms_key_name=None):
     self.assertTrue(FileSystems.exists(src),
                     'src does not exist: %s' % src)
     self.assertTrue(FileSystems.exists(dst),
                     'dst does not exist: %s' % dst)
     src_checksum = self.gcsio.checksum(src)
     dst_checksum = self.gcsio.checksum(dst)
     self.assertEqual(src_checksum, dst_checksum)
     self.assertEqual(self.gcsio.kms_key(dst), dst_kms_key_name)
Example #6
  def test_delete_files_succeeds(self):
    path = os.path.join(self.tmpdir, 'f1')

    with open(path, 'a') as f:
      f.write('test')

    assert FileSystems.exists(path)
    utils.delete_files([path])
    assert not FileSystems.exists(path)
Example #7
    def test_delete(self):
        path1 = os.path.join(self.tmpdir, 'f1')

        with open(path1, 'a') as f:
            f.write('Hello')

        self.assertTrue(FileSystems.exists(path1))
        FileSystems.delete([path1])
        self.assertFalse(FileSystems.exists(path1))
Example #8
  def test_delete(self):
    path1 = os.path.join(self.tmpdir, 'f1')

    with open(path1, 'a') as f:
      f.write('Hello')

    self.assertTrue(FileSystems.exists(path1))
    FileSystems.delete([path1])
    self.assertFalse(FileSystems.exists(path1))
Example #9
 def _verify_copy(self, src, dst, dst_kms_key_name=None):
   self.assertTrue(FileSystems.exists(src), 'src does not exist: %s' % src)
   self.assertTrue(FileSystems.exists(dst), 'dst does not exist: %s' % dst)
   src_checksum = self.gcsio.checksum(src)
   dst_checksum = self.gcsio.checksum(dst)
   self.assertEqual(src_checksum, dst_checksum)
   actual_dst_kms_key = self.gcsio.kms_key(dst)
   if actual_dst_kms_key is None:
     self.assertEqual(actual_dst_kms_key, dst_kms_key_name)
   else:
     self.assertTrue(actual_dst_kms_key.startswith(dst_kms_key_name),
                     "got: %s, wanted startswith: %s" % (actual_dst_kms_key,
                                                         dst_kms_key_name))
Example #10
  def test_rename_directory(self):
    path_t1 = os.path.join(self.tmpdir, 't1')
    path_t2 = os.path.join(self.tmpdir, 't2')
    FileSystems.mkdirs(path_t1)

    path1 = os.path.join(path_t1, 'f1')
    path2 = os.path.join(path_t2, 'f1')
    with open(path1, 'a') as f:
      f.write('Hello')

    FileSystems.rename([path_t1], [path_t2])
    self.assertTrue(FileSystems.exists(path_t2))
    self.assertFalse(FileSystems.exists(path_t1))
    self.assertTrue(FileSystems.exists(path2))
    self.assertFalse(FileSystems.exists(path1))
Example #11
    def test_rename_directory(self):
        path_t1 = os.path.join(self.tmpdir, 't1')
        path_t2 = os.path.join(self.tmpdir, 't2')
        FileSystems.mkdirs(path_t1)

        path1 = os.path.join(path_t1, 'f1')
        path2 = os.path.join(path_t2, 'f1')
        with open(path1, 'a') as f:
            f.write('Hello')

        FileSystems.rename([path_t1], [path_t2])
        self.assertTrue(FileSystems.exists(path_t2))
        self.assertFalse(FileSystems.exists(path_t1))
        self.assertTrue(FileSystems.exists(path2))
        self.assertFalse(FileSystems.exists(path1))
Example #12
def get_vcf_headers(input_file):
    # type: (str) -> vcf_header_io.VcfHeader
    """Returns VCF headers from ``input_file``.

    Args:
      input_file (str): A string specifying the path to the representative VCF
        file, i.e., the VCF file that contains a header representative of all VCF
        files matching the input_pattern of the job. It can be local or
        remote (e.g. on GCS).
    Returns:
      VCF header info.
    Raises:
      ValueError: If ``input_file`` is not a valid VCF file (e.g. bad format,
        empty, non-existent).
    """
    if not FileSystems.exists(input_file):
        raise ValueError('VCF header does not exist')
    try:
        vcf_reader = vcf.Reader(fsock=_line_generator(input_file))
    except (SyntaxError, StopIteration) as e:
        raise ValueError('Invalid VCF header in %s: %s' % (input_file, str(e)))
    return vcf_header_io.VcfHeader(infos=vcf_reader.infos,
                                   filters=vcf_reader.filters,
                                   alts=vcf_reader.alts,
                                   formats=vcf_reader.formats,
                                   contigs=vcf_reader.contigs,
                                   file_path=input_file)
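
The `_line_generator` helper used above is not part of the snippet. A minimal sketch of what such a helper could look like, built only on the public FileSystems API (the real gcp-variant-transforms implementation may differ, e.g. in how it handles compression and encodings):

from apache_beam.io.filesystems import FileSystems

def _line_generator(file_name):
    # Hypothetical helper: yields decoded lines from a local or gs:// file.
    # FileSystems.open returns a binary file-like object for any registered
    # scheme, so the same code path covers local disk and GCS.
    f = FileSystems.open(file_name)
    try:
        for line in f:
            yield line.decode('utf-8')
    finally:
        f.close()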
Example #13
 def _rename_batch(batch):
   """_rename_batch executes batch rename operations."""
   source_files, destination_files = batch
   exceptions = []
   try:
     FileSystems.rename(source_files, destination_files)
     return exceptions
   except BeamIOError as exp:
     if exp.exception_details is None:
       raise
     for (src, dest), exception in exp.exception_details.iteritems():
       if exception:
         logging.warning('Rename not successful: %s -> %s, %s', src, dest,
                         exception)
         should_report = True
         if isinstance(exception, IOError):
           # May have already been copied.
           try:
             if FileSystems.exists(dest):
               should_report = False
           except Exception as exists_e:  # pylint: disable=broad-except
             logging.warning('Exception when checking if file %s exists: '
                             '%s', dest, exists_e)
         if should_report:
           logging.warning(('Exception in _rename_batch. src: %s, '
                            'dest: %s, err: %s'), src, dest, exception)
           exceptions.append(exception)
       else:
         logging.debug('Rename successful: %s -> %s', src, dest)
     return exceptions
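
Note that `exception_details.iteritems()` above is Python 2; under Python 3 the same dictionary is iterated with `.items()`. A minimal Python 3 sketch of the same recovery pattern, using hypothetical local paths:

from apache_beam.io.filesystem import BeamIOError
from apache_beam.io.filesystems import FileSystems

sources = ['/tmp/demo/a.txt', '/tmp/demo/b.txt']        # hypothetical paths
destinations = ['/tmp/demo/c.txt', '/tmp/demo/d.txt']   # hypothetical paths

try:
    FileSystems.rename(sources, destinations)
except BeamIOError as err:
    if err.exception_details is None:
        raise
    # exception_details maps each (src, dst) pair to its individual failure,
    # so a partially failed batch can be inspected file by file.
    for (src, dst), exception in err.exception_details.items():
        if exception and not FileSystems.exists(dst):
            print('rename failed: %s -> %s (%s)' % (src, dst, exception))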
Example #14
def path_exists(path, d_pl_options, is_dir):
    dir_path = path
    fs = FileSystems.get_filesystem(dir_path)
    if type(fs) == GCSFileSystem:
        dir_path = gcs_correct_dir_path_form(
            dir_path, d_pl_options, strip_prefix=False) if is_dir else path
    return FileSystems.exists(dir_path), dir_path
Example #15
 def _rename_batch(batch):
     """_rename_batch executes batch rename operations."""
     source_files, destination_files = batch
     exceptions = []
     try:
         FileSystems.rename(source_files, destination_files)
         return exceptions
     except BeamIOError as exp:
         if exp.exception_details is None:
             raise
         for (src,
              dest), exception in exp.exception_details.iteritems():
             if exception:
                 logging.warning('Rename not successful: %s -> %s, %s',
                                 src, dest, exception)
                 should_report = True
                 if isinstance(exception, IOError):
                     # May have already been copied.
                     try:
                         if FileSystems.exists(dest):
                             should_report = False
                     except Exception as exists_e:  # pylint: disable=broad-except
                         logging.warning(
                             'Exception when checking if file %s exists: '
                             '%s', dest, exists_e)
                 if should_report:
                     logging.warning(
                         ('Exception in _rename_batch. src: %s, '
                          'dest: %s, err: %s'), src, dest, exception)
                     exceptions.append(exception)
             else:
                 logging.debug('Rename successful: %s -> %s', src, dest)
         return exceptions
Example #16
def run_pipeline(pipeline_args, known_args):
    """Splits images into separate directories using thresholds on randnum.

    Args:
      pipeline_args: arguments ingested by beam pipeline
      known_args: additional arguments for this project, such as the storage
        bucket, source_image_dir, and dest_image_dir.

    Returns:
      [nothing] - runs beam pipeline and copies output files to different dirs
    """
    # Specify pipeline options
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Attach bucket prefix if running on cloud
    source_images_pattern = known_args.source_image_dir + '/*'
    dest_prefix = known_args.dest_image_dir + '/'
    if known_args.cloud:
        source_images_pattern = ('gs://' + known_args.storage_bucket + '/' +
                                 source_images_pattern)
        dest_prefix = ('gs://' + known_args.storage_bucket + '/' + dest_prefix)

    # Get output directories for split images
    split_names = known_args.split_names
    split_fractions = known_args.split_fractions
    dest_images_dirs = [dest_prefix + x + '/' for x in split_names]

    # Create output directories if they do not already exist (for local runs)
    for dest_images_dir in dest_images_dirs:
        if not FileSystems.exists(dest_images_dir):
            FileSystems.mkdirs(dest_images_dir)

    # Log information on source, destination, and split fractions
    split_log_list = [
        x[0] + '(' + str(x[1]) + ')' for x in zip(split_names, split_fractions)
    ]
    logging.info('Starting ' + ' | '.join(split_log_list) +
                 ' split from images with source file pattern ' +
                 source_images_pattern)
    logging.info('Destination parent directory: ' + dest_prefix)

    with beam.Pipeline(options=pipeline_options) as p:
        # Read files and partition pipelines
        split_pipelines = (
            p
            | 'read_images' >> beam.io.Read(
                LabeledImageFileReader(source_images_pattern))
            | 'split_images' >> beam.Partition(
                generate_split_fn(split_fractions), len(split_fractions)))

        # Write each pipeline to a corresponding output directory
        for partition, split_name_and_dest_dir in enumerate(
                zip(split_names, dest_images_dirs)):
            _ = (split_pipelines[partition]
                 | 'write_' + split_name_and_dest_dir[0] >> beam.Map(
                     write_to_directory, dst_dir=split_name_and_dest_dir[1]))

    logging.info('Done splitting image sets')
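
The exists-then-mkdirs guard above is needed because FileSystems.mkdirs raises when the leaf directory already exists; on object stores such as GCS, "directories" are only prefixes, so the guard mostly matters for local runs. A standalone sketch with hypothetical destination directories:

from apache_beam.io.filesystems import FileSystems

# Hypothetical destination directories; gs:// prefixes would work the same way.
dest_images_dirs = ['/tmp/images/train/', '/tmp/images/validation/', '/tmp/images/test/']

for dest_images_dir in dest_images_dirs:
    if not FileSystems.exists(dest_images_dir):
        FileSystems.mkdirs(dest_images_dir)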
Example #17
  def test_delete_files_fails_with_io_error(self, mocked_delete):
    f = tempfile.NamedTemporaryFile(dir=self.tmpdir, delete=False)
    assert FileSystems.exists(f.name)

    with self.assertRaises(BeamIOError):
      utils.delete_files([f.name])
    self.assertTrue(mocked_delete.called)
    self.assertEqual(mocked_delete.call_count, 4)
Example #18
    def test_delete_files_fails_with_io_error(self, mocked_delete):
        f = tempfile.NamedTemporaryFile(dir=self.tmpdir, delete=False)
        assert FileSystems.exists(f.name)

        with self.assertRaises(BeamIOError):
            utils.delete_files([f.name])
        self.assertTrue(mocked_delete.called)
        self.assertEqual(mocked_delete.call_count, 4)
Example #19
 def pre_finalize(self, init_result, writer_results):
   writer_results = sorted(writer_results)
   num_shards = len(writer_results)
   existing_files = []
   for shard_num in range(len(writer_results)):
     final_name = self._get_final_name(shard_num, num_shards)
     if FileSystems.exists(final_name):
       existing_files.append(final_name)
   if existing_files:
     logging.info('Deleting existing files in target path: %d',
                  len(existing_files))
     FileSystems.delete(existing_files)
Example #20
 def pre_finalize(self, init_result, writer_results):
     writer_results = sorted(writer_results)
     num_shards = len(writer_results)
     existing_files = []
     for shard_num in range(len(writer_results)):
         final_name = self._get_final_name(shard_num, num_shards)
         if FileSystems.exists(final_name):
             existing_files.append(final_name)
     if existing_files:
         logging.info('Deleting existing files in target path: %d',
                      len(existing_files))
         FileSystems.delete(existing_files)
Example #21
def run_pipeline(pipeline_args, known_args):
  """A beam pipeline to resize and pad images from urls and save to storage.

  Args:
    pipeline_args: Arguments consumed by the beam pipeline
    known_args: Extra args used to set various fields such as the dataset and
                table from which to read cat urls and labels, and the bucket
                and image directory to write processed images

  Returns:
    [nothing], just writes processed images to the image directory
  """

  # Specify pipeline options
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True

  # Determine bigquery source from dataset and table arguments
  query = ('SELECT ROW_NUMBER() OVER() as index, original_url, label, randnum'
           ' from [' + known_args.dataset + '.' + known_args.table + ']')
  bq_source = bigquery.BigQuerySource(query=query)

  logging.info('Starting image collection into directory '
               + known_args.output_dir)

  # Create destination directory if it doesn't exist
  output_dir = known_args.output_dir
  if known_args.cloud:
    output_dir = 'gs://' + known_args.storage_bucket + '/' + output_dir

  # Directory needs to be explicitly made on some filesystems.
  if not FileSystems.exists(output_dir):
    FileSystems.mkdirs(output_dir)

  # Run pipeline
  with beam.Pipeline(options=pipeline_options) as p:
    _ = (p
         | 'read_rows_from_cat_info_table'
         >> beam.io.Read(bq_source)
         | 'fetch_images_from_urls'
         >> beam.Map(fetch_image_from_url)
         | 'filter_bad_or_absent_images'
         >> beam.Filter(filter_bad_or_missing_image)
         | 'resize_and_pad_images'
         >> beam.Map(resize_and_pad,
                     output_image_dim=known_args.output_image_dim)
         | 'write_images_to_storage'
         >> beam.Map(write_processed_image,
                     output_dir=output_dir)
         )

  logging.info('Done collecting images')
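
`write_processed_image` is not shown in this example. A hypothetical sketch of such a writer built on FileSystems.create, which works for a local output_dir as well as a gs:// one (the (filename, bytes) element shape is an assumption, not the original code):

from apache_beam.io.filesystems import FileSystems

def write_processed_image(element, output_dir):
    # Hypothetical element shape: a (filename, encoded image bytes) pair.
    filename, image_bytes = element
    path = FileSystems.join(output_dir, filename)
    # FileSystems.create returns a writable file-like object; closing it
    # flushes the write (or finalizes the upload on GCS).
    with FileSystems.create(path) as f:
        f.write(image_bytes)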
Example #22
def get_metadata_header_lines(input_file):
    # type: (str) -> List[str]
    """Returns header lines from the given VCF file ``input_file``.

    Only returns lines starting with ## and not #.

    Args:
      input_file: A string specifying the path to a VCF file.
        It can be local or remote (e.g. on GCS).
    Returns:
      A list containing header lines of ``input_file``.
    Raises:
      ValueError: If ``input_file`` does not exist.
    """
    if not FileSystems.exists(input_file):
        raise ValueError('{} does not exist'.format(input_file))
    return [
        line for line in _line_generator(input_file) if line.startswith('##')
    ]
Example #23
def get_vcf_headers(input_file):

    if not FileSystems.exists(input_file):
        raise ValueError('VCF header does not exist')
    header = libcbcf.VariantHeader()
    lines = _header_line_generator(input_file)
    sample_line = None
    header.add_line('##fileformat=VCFv4.0\n')
    file_empty = True
    read_file_format_line = False
    for line in lines:
        if not read_file_format_line:
            read_file_format_line = True
            if line and not line.startswith(
                    vcf_header_io.FILE_FORMAT_HEADER_TEMPLATE.format(
                        VERSION='')):
                header.add_line(
                    vcf_header_io.FILE_FORMAT_HEADER_TEMPLATE.format(
                        VERSION='4.0'))
        if line.startswith('##'):
            header.add_line(line.strip())
            file_empty = False
        elif line.startswith('#'):
            sample_line = line.strip()
            file_empty = False
        elif line:
            # If non-empty non-header line exists, #CHROM line has to be supplied.
            if not sample_line:
                raise ValueError('Header line is missing')
        else:
            if file_empty:
                raise ValueError('File is empty')
            # If no records were found, use dummy #CHROM line for sample extraction.
            if not sample_line:
                sample_line = vcf_header_io.LAST_HEADER_LINE_PREFIX

    return vcf_header_io.VcfHeader(infos=header.info,
                                   filters=header.filters,
                                   alts=header.alts,
                                   formats=header.formats,
                                   contigs=header.contigs,
                                   samples=sample_line,
                                   file_path=input_file)
Example #24
    def upload_to_bundle_store(self, bundle: Bundle, source: Source, git: bool,
                               unpack: bool):
        """Uploads the given source to the bundle store.
        Given arguments are the same as UploadManager.upload_to_bundle_store().
        Used when uploading from rest server."""
        try:
            # bundle_path = self._bundle_store.get_bundle_location(bundle.uuid)
            is_url, is_fileobj, filename = self._interpret_source(source)
            if is_url:
                assert isinstance(source, str)
                if git:
                    bundle_path = self._update_and_get_bundle_location(
                        bundle, is_directory=True)
                    self.write_git_repo(source, bundle_path)
                else:
                    # If downloading from a URL, convert the source to a file object.
                    is_fileobj = True
                    source = (filename, urlopen_with_retry(source))
            if is_fileobj:
                source_filename, source_fileobj = cast(Tuple[str, IO[bytes]],
                                                       source)
                source_ext = zip_util.get_archive_ext(source_filename)
                if unpack and zip_util.path_is_archive(filename):
                    bundle_path = self._update_and_get_bundle_location(
                        bundle, is_directory=source_ext in ARCHIVE_EXTS_DIR)
                    self.write_fileobj(source_ext,
                                       source_fileobj,
                                       bundle_path,
                                       unpack_archive=True)
                else:
                    bundle_path = self._update_and_get_bundle_location(
                        bundle, is_directory=False)
                    self.write_fileobj(source_ext,
                                       source_fileobj,
                                       bundle_path,
                                       unpack_archive=False)

        except UsageError:
            if FileSystems.exists(bundle_path):
                path_util.remove(bundle_path)
            raise
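
The UsageError handler above illustrates a general cleanup pattern: if the write fails partway, delete whatever was written before re-raising. A minimal standalone sketch with a hypothetical bundle path:

from apache_beam.io.filesystems import FileSystems

bundle_path = '/tmp/bundles/0x1234'  # hypothetical target path

def write_bundle(data):
    try:
        with FileSystems.create(bundle_path) as f:
            f.write(data)
    except Exception:
        # Remove any partially written output before re-raising.
        if FileSystems.exists(bundle_path):
            FileSystems.delete([bundle_path])
        raise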
Example #25
def get_vcf_headers(input_file):
    """Returns VCF headers (FORMAT and INFO) from ``input_file``.

    Args:
      input_file (str): A string specifying the path to the representative VCF
        file, i.e., the VCF file that contains a header representative of all VCF
        files matching the input_pattern of the job. It can be local or remote
        (e.g. on GCS).
    Returns:
      ``HeaderFields`` specifying header info.
    Raises:
      ValueError: If ``input_file`` is not a valid VCF file (e.g. bad format,
        empty, non-existent).
    """
    if not FileSystems.exists(input_file):
        raise ValueError('VCF header does not exist')
    try:
        vcf_reader = vcf.Reader(fsock=_line_generator(input_file))
    except (SyntaxError, StopIteration) as e:
        raise ValueError('Invalid VCF header: %s' % str(e))
    return HeaderFields(vcf_reader.infos, vcf_reader.formats)
Example #26
def remove(path):
    """
    Remove the given path, whether it is a directory, file, or link.
    """
    if parse_linked_bundle_url(path).uses_beam:
        from apache_beam.io.filesystems import FileSystems

        if not FileSystems.exists(path):
            FileSystems.delete([path])
        return
    check_isvalid(path, 'remove')
    set_write_permissions(path)  # Allow permissions
    if os.path.islink(path):
        os.unlink(path)
    elif os.path.isdir(path):
        try:
            shutil.rmtree(path)
        except shutil.Error:
            pass
    else:
        os.remove(path)
    if os.path.exists(path):
        print('Failed to remove %s' % path)
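
For reference, FileSystems.delete takes a list of paths rather than a single path, and depending on the underlying filesystem a missing path can surface as a BeamIOError; a minimal sketch with a hypothetical path:

from apache_beam.io.filesystem import BeamIOError
from apache_beam.io.filesystems import FileSystems

path = '/tmp/scratch/output.txt'  # hypothetical path

try:
    FileSystems.delete([path])  # note: a list, even for a single path
except BeamIOError:
    # Failures are reported per entry in exception_details; callers that only
    # care about "gone afterwards" often guard with FileSystems.exists first.
    pass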
Example #27
def get_path_exists(path):
    """
    Returns whether the given path exists.
    """
    return FileSystems.exists(path)
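
FileSystems.exists dispatches on the path's scheme, so the same call covers local files and object stores. A minimal sketch (the gs:// line assumes the GCP extras for Beam are installed, credentials are configured, and the bucket and object names are hypothetical):

from apache_beam.io.filesystems import FileSystems

print(FileSystems.exists('/etc/hosts'))                      # local path
print(FileSystems.exists('/no/such/path'))                   # False, no exception
print(FileSystems.exists('gs://my-bucket/some/object.txt'))  # hypothetical GCS object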
Example #28
 def test_delete_files_succeeds(self):
   f = tempfile.NamedTemporaryFile(dir=self.tmpdir, delete=False)
   assert FileSystems.exists(f.name)
   utils.delete_files([f.name])
   assert not FileSystems.exists(f.name)
Example #29
 def test_delete_files_succeeds(self):
     f = tempfile.NamedTemporaryFile(dir=self.tmpdir, delete=False)
     assert FileSystems.exists(f.name)
     utils.delete_files([f.name])
     assert not FileSystems.exists(f.name)
Example #30
def tftransform(
    pipeline_args,  # type: List[str]
    temp_location,  # type: str
    schema_file,  # type: str
    output_dir,  # type: str
    preprocessing_fn,  # type: Any
    training_data=None,  # type: Union[None, str]
    evaluation_data=None,  # type: Union[None, str]
    transform_fn_dir=None,  # type: Union[None, str]
    compression_type=None  # type: str
):  # type: (...) -> PipelineState
    """
    Generic tf.transform pipeline that takes tf.{example, record} training and evaluation
    datasets and outputs transformed data together with transform function Saved Model.

    :param pipeline_args: un-parsed Dataflow arguments
    :param temp_location: temporary location for dataflow job working dir
    :param schema_file: path to the raw feature schema text file
    :param output_dir: output dir for transformed data and function
    :param preprocessing_fn: tf.transform preprocessing function
    :param training_data: path to the training data
    :param evaluation_data: path to the evaluation data
    :param transform_fn_dir: dir to previously saved transformation function to apply
    :param compression_type: compression type for writing of tf.records
    :return final state of the Beam pipeline
    """
    assert_not_empty_string(temp_location)
    assert_not_empty_string(schema_file)
    assert_not_empty_string(output_dir)
    assert_not_none(preprocessing_fn)

    if compression_type is None:
        compression_type = CompressionTypes.AUTO

    raw_feature_spec = schema_txt_file_to_feature_spec(schema_file)
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)
    raw_data_coder = ExampleProtoCoder(raw_data_metadata.schema)

    transformed_train_output_dir = os.path.join(output_dir, "training")
    transformed_eval_output_dir = os.path.join(output_dir, "evaluation")

    if not any(i.startswith("--job_name") for i in pipeline_args):
        pipeline_args.append("--job_name=tf-transform-{}-{}".format(
            getpass.getuser(), int(time.time())))

    pipeline = beam.Pipeline(argv=pipeline_args)
    with beam_impl.Context(temp_dir=temp_location):
        if training_data is not None:
            # if training data is provided, transform_fn_dir will be ignored
            if transform_fn_dir is not None:
                warnings.warn(
                    "Transform_fn_dir is ignored because training_data is provided"
                )

            transform_fn_output = os.path.join(output_dir, "transform_fn",
                                               "saved_model.pb")
            if FileSystems.exists(transform_fn_output):
                raise ValueError("Transform fn already exists at %s!" %
                                 transform_fn_output)

            # compute the transform_fn and apply to the training data
            raw_train_data = (pipeline
                              | "ReadTrainData" >> tfrecordio.ReadFromTFRecord(
                                  training_data, coder=raw_data_coder))

            ((transformed_train_data, transformed_train_metadata),
             transform_fn) = (
                 (raw_train_data, raw_data_metadata)
                 | ("AnalyzeAndTransformTrainData" >>
                    beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
             )  # noqa: E501

            _ = (  # noqa: F841
                transform_fn
                | "WriteTransformFn" >>
                transform_fn_io.WriteTransformFn(output_dir))

            transformed_train_coder = ExampleProtoCoder(
                transformed_train_metadata.schema)
            _ = (  # noqa: F841
                transformed_train_data
                | "WriteTransformedTrainData" >> tfrecordio.WriteToTFRecord(
                    os.path.join(transformed_train_output_dir,
                                 "part"),  # noqa: E501
                    coder=transformed_train_coder,  # noqa: E501
                    compression_type=compression_type,  # noqa: E501
                    file_name_suffix=".tfrecords"))  # noqa: E501
        else:
            if transform_fn_dir is None:
                raise ValueError(
                    "Either training_data or transformed_fn needs to be provided"
                )
            # load the transform_fn
            transform_fn = pipeline | transform_fn_io.ReadTransformFn(
                transform_fn_dir)

        if evaluation_data is not None:
            # if evaluation_data exists, apply the transform_fn to the evaluation data
            raw_eval_data = (pipeline
                             | "ReadEvalData" >> tfrecordio.ReadFromTFRecord(
                                 evaluation_data, coder=raw_data_coder))

            (transformed_eval_data, transformed_eval_metadata) = (
                ((raw_eval_data, raw_data_metadata), transform_fn)
                | "TransformEvalData" >> beam_impl.TransformDataset())

            transformed_eval_coder = ExampleProtoCoder(
                transformed_eval_metadata.schema)
            _ = (  # noqa: F841
                transformed_eval_data
                | "WriteTransformedEvalData" >> tfrecordio.WriteToTFRecord(
                    os.path.join(transformed_eval_output_dir,
                                 "part"),  # noqa: E501
                    coder=transformed_eval_coder,  # noqa: E501
                    compression_type=compression_type,  # noqa: E501
                    file_name_suffix=".tfrecords"))  # noqa: E501
    result = pipeline.run().wait_until_finish()

    return result
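
The FileSystems.exists check on transform_fn_output above is a cheap precondition so the job fails fast instead of overwriting a previous run's transform function. The same guard in isolation, with hypothetical paths:

import os

from apache_beam.io.filesystems import FileSystems

output_dir = 'gs://my-bucket/tft-output'  # hypothetical output location
transform_fn_output = os.path.join(output_dir, 'transform_fn', 'saved_model.pb')

if FileSystems.exists(transform_fn_output):
    raise ValueError('Transform fn already exists at %s!' % transform_fn_output)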
Example #31
    def finalize_write(self, init_result, writer_results,
                       unused_pre_finalize_results):
        writer_results = sorted(writer_results)
        num_shards = len(writer_results)

        src_files = []
        dst_files = []
        delete_files = []
        chunk_size = FileSystems.get_chunk_size(self.file_path_prefix.get())
        num_skipped = 0
        for shard_num, shard in enumerate(writer_results):
            final_name = self._get_final_name(shard_num, num_shards)
            src = shard
            dst = final_name
            src_exists = FileSystems.exists(src)
            dst_exists = FileSystems.exists(dst)
            if not src_exists and not dst_exists:
                raise BeamIOError(
                    'src and dst files do not exist. src: %s, dst: %s' %
                    (src, dst))
            if not src_exists and dst_exists:
                logging.debug('src: %s -> dst: %s already renamed, skipping',
                              src, dst)
                num_skipped += 1
                continue
            if (src_exists and dst_exists and FileSystems.checksum(src)
                    == FileSystems.checksum(dst)):
                logging.debug('src: %s == dst: %s, deleting src', src, dst)
                delete_files.append(src)
                continue

            src_files.append(src)
            dst_files.append(dst)

        num_skipped = len(delete_files)
        FileSystems.delete(delete_files)
        num_shards_to_finalize = len(src_files)
        min_threads = min(num_shards_to_finalize,
                          FileBasedSink._MAX_RENAME_THREADS)
        num_threads = max(1, min_threads)

        source_file_batch = [
            src_files[i:i + chunk_size]
            for i in range(0, len(src_files), chunk_size)
        ]
        destination_file_batch = [
            dst_files[i:i + chunk_size]
            for i in range(0, len(dst_files), chunk_size)
        ]

        if num_shards_to_finalize:
            logging.info(
                'Starting finalize_write threads with num_shards: %d (skipped: %d), '
                'batches: %d, num_threads: %d', num_shards_to_finalize,
                num_skipped, len(source_file_batch), num_threads)
            start_time = time.time()

            # Use a thread pool for renaming operations.
            def _rename_batch(batch):
                """_rename_batch executes batch rename operations."""
                source_files, destination_files = batch
                exceptions = []
                try:
                    FileSystems.rename(source_files, destination_files)
                    return exceptions
                except BeamIOError as exp:
                    if exp.exception_details is None:
                        raise
                    for (src,
                         dst), exception in exp.exception_details.iteritems():
                        if exception:
                            logging.error(
                                ('Exception in _rename_batch. src: %s, '
                                 'dst: %s, err: %s'), src, dst, exception)
                            exceptions.append(exception)
                        else:
                            logging.debug('Rename successful: %s -> %s', src,
                                          dst)
                    return exceptions

            exception_batches = util.run_using_threadpool(
                _rename_batch, zip(source_file_batch, destination_file_batch),
                num_threads)

            all_exceptions = [
                e for exception_batch in exception_batches
                for e in exception_batch
            ]
            if all_exceptions:
                raise Exception(
                    'Encountered exceptions in finalize_write: %s' %
                    all_exceptions)

            for final_name in dst_files:
                yield final_name

            logging.info('Renamed %d shards in %.2f seconds.',
                         num_shards_to_finalize,
                         time.time() - start_time)
        else:
            logging.warning(
                'No shards found to finalize. num_shards: %d, skipped: %d',
                num_shards, num_skipped)

        try:
            FileSystems.delete([init_result])
        except IOError:
            # May have already been removed.
            pass
Example #32
  def finalize_write(self, init_result, writer_results,
                     unused_pre_finalize_results):
    writer_results = sorted(writer_results)
    num_shards = len(writer_results)

    src_files = []
    dst_files = []
    delete_files = []
    chunk_size = FileSystems.get_chunk_size(self.file_path_prefix.get())
    num_skipped = 0
    for shard_num, shard in enumerate(writer_results):
      final_name = self._get_final_name(shard_num, num_shards)
      src = shard
      dst = final_name
      src_exists = FileSystems.exists(src)
      dst_exists = FileSystems.exists(dst)
      if not src_exists and not dst_exists:
        raise BeamIOError('src and dst files do not exist. src: %s, dst: %s' % (
            src, dst))
      if not src_exists and dst_exists:
        logging.debug('src: %s -> dst: %s already renamed, skipping', src, dst)
        num_skipped += 1
        continue
      if (src_exists and dst_exists and
          FileSystems.checksum(src) == FileSystems.checksum(dst)):
        logging.debug('src: %s == dst: %s, deleting src', src, dst)
        delete_files.append(src)
        continue

      src_files.append(src)
      dst_files.append(dst)

    num_skipped = len(delete_files)
    FileSystems.delete(delete_files)
    num_shards_to_finalize = len(src_files)
    min_threads = min(num_shards_to_finalize, FileBasedSink._MAX_RENAME_THREADS)
    num_threads = max(1, min_threads)

    source_file_batch = [src_files[i:i + chunk_size]
                         for i in range(0, len(src_files), chunk_size)]
    destination_file_batch = [dst_files[i:i + chunk_size]
                              for i in range(0, len(dst_files), chunk_size)]

    if num_shards_to_finalize:
      logging.info(
          'Starting finalize_write threads with num_shards: %d (skipped: %d), '
          'batches: %d, num_threads: %d',
          num_shards_to_finalize, num_skipped, len(source_file_batch),
          num_threads)
      start_time = time.time()

      # Use a thread pool for renaming operations.
      def _rename_batch(batch):
        """_rename_batch executes batch rename operations."""
        source_files, destination_files = batch
        exceptions = []
        try:
          FileSystems.rename(source_files, destination_files)
          return exceptions
        except BeamIOError as exp:
          if exp.exception_details is None:
            raise
          for (src, dst), exception in exp.exception_details.iteritems():
            if exception:
              logging.error(('Exception in _rename_batch. src: %s, '
                             'dst: %s, err: %s'), src, dst, exception)
              exceptions.append(exception)
            else:
              logging.debug('Rename successful: %s -> %s', src, dst)
          return exceptions

      exception_batches = util.run_using_threadpool(
          _rename_batch, zip(source_file_batch, destination_file_batch),
          num_threads)

      all_exceptions = [e for exception_batch in exception_batches
                        for e in exception_batch]
      if all_exceptions:
        raise Exception(
            'Encountered exceptions in finalize_write: %s' % all_exceptions)

      for final_name in dst_files:
        yield final_name

      logging.info('Renamed %d shards in %.2f seconds.', num_shards_to_finalize,
                   time.time() - start_time)
    else:
      logging.warning(
          'No shards found to finalize. num_shards: %d, skipped: %d',
          num_shards, num_skipped)

    try:
      FileSystems.delete([init_result])
    except IOError:
      # May have already been removed.
      pass
Example #33
def _compute_target_info_blob(
    path: str, depth: Union[int, float], return_generators=False
) -> TargetInfo:
    """Computes target info for a file that is externalized on Blob Storage, meaning
    that it's contained within an indexed archive file.

    Args:
        path (str): The path that refers to the specified target.
        depth (Union[int, float]): Depth until which directory contents are resolved.
        return_generators (bool, optional): If set to True, the 'contents' key of directories is equal to a generator instead of a list. Defaults to False.

    Raises:
        PathException: Path not found or invalid.

    Returns:
        TargetInfo: Target info of specified path.
    """

    linked_bundle_path = parse_linked_bundle_url(path)
    if not FileSystems.exists(linked_bundle_path.bundle_path):
        raise PathException(linked_bundle_path.bundle_path)
    if not linked_bundle_path.is_archive:
        # Single file
        raise PathException(
            "Single files on Blob Storage are not supported; only a path within an archive file is supported."
        )

    # process_contents is used to process the value of the 'contents' key (which is a generator) before it is returned.
    # If return_generators is False, it resolves the given generator into a list; otherwise, it just returns
    # the generator unchanged.
    process_contents = list if return_generators is False else lambda x: x

    with OpenIndexedArchiveFile(linked_bundle_path.bundle_path) as tf:
        islink = lambda finfo: stat.S_ISLNK(finfo.mode)
        readlink = lambda finfo: finfo.linkname
        isfile = lambda finfo: not stat.S_ISDIR(finfo.mode)
        isdir = lambda finfo: stat.S_ISDIR(finfo.mode)
        listdir = lambda path: cast(Dict[str, FileInfo], tf.listDir(path) or {})

        def _get_info(path: str, depth: Union[int, float]) -> TargetInfo:
            """This function is called to get the target info of the specified path.
            If the specified path is a directory and additional depth is requested, this
            function is recursively called to retrieve the target info of files within
            the directory, much like _compute_target_info_local.
            """
            if not path.startswith("/"):
                path = "/" + path
            finfo = cast(FileInfo, tf.getFileInfo(path))
            if finfo is None:
                # Not found
                raise PathException("File not found.")
            result: TargetInfo = {
                'name': os.path.basename(path),  # get last part of path
                'size': finfo.size,
                'perm': finfo.mode & 0o777,
                'type': '',
            }
            if islink(finfo):
                result['type'] = 'link'
                result['link'] = readlink(finfo)
            elif isfile(finfo):
                result['type'] = 'file'
            elif isdir(finfo):
                result['type'] = 'directory'
                if depth > 0:
                    result['contents'] = process_contents(
                        _get_info(path + "/" + file_name, depth - 1)
                        for file_name in listdir(path)
                        if file_name != "."
                    )
            return result

        if not linked_bundle_path.is_archive_dir:
            # Return the contents of the single .gz file.
            # The entry returned by ratarmount for a single .gz file is not technically part of a tar archive
            # and has a name hardcoded as "contents," so we modify the type, name, and permissions of
            # the output accordingly.
            return cast(
                TargetInfo,
                dict(
                    _get_info("/contents", depth),
                    type="file",
                    name=linked_bundle_path.bundle_uuid,
                    perm=0o755,
                ),
            )
        if linked_bundle_path.archive_subpath:
            # Return the contents of a subpath within a directory.
            return _get_info(linked_bundle_path.archive_subpath, depth)
        else:
            # No subpath, return the entire directory with the bundle
            # contents in it. The permissions of this directory
            # cannot be set by the user (the user can only set permissions
            # of files *within* this directory that are part of the bundle
            # itself), so we just return a placeholder value of 0o755
            # for this directory's permissions.
            file = FileSystems.match([path])[0].metadata_list[0]
            result: TargetInfo = {
                'name': linked_bundle_path.bundle_uuid,
                'type': 'directory',
                'size': file.size_in_bytes,
                'perm': 0o755,
            }
            if depth > 0:
                result['contents'] = process_contents(
                    _get_info(file_name, depth - 1)
                    for file_name in listdir("/")
                    if file_name != "."
                )
            return result
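
The size lookup above uses FileSystems.match, which returns one MatchResult per pattern, each holding FileMetadata entries with at least a path and a size_in_bytes. A minimal sketch with a hypothetical glob:

from apache_beam.io.filesystems import FileSystems

# Hypothetical glob; works the same for local paths and object stores.
match_result = FileSystems.match(['/tmp/bundles/*.tar.gz'])[0]
for metadata in match_result.metadata_list:
    print(metadata.path, metadata.size_in_bytes)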
Example #34
def _file_exists(file_url):
    result = FileSystems.exists(file_url)
    LOGGER.debug('file exists: result=%s, url=%s', result, file_url)
    return result