Example 1
def main():
    """Build and execute the Apache Beam pipeline using the commandline arguments."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        '--input_tars',
        nargs='+',
        required=True,
        help='A set of patterns specifying the paths to the input tar files.')
    parser.add_argument(
        '--input_csv',
        required=True,
        help=
        'The path to the (optionally compressed) CSV that contains the image'
        ' labels.')
    parser.add_argument('--output_jpg_dir',
                        required=False,
                        help='The directory to output the JPG files to.')
    parser.add_argument(
        '--output_bq_table',
        required=False,
        help=
        'A string of the form `project:dataset.table_name`. This table will '
        'be overwritten if it already exists.')
    parser.add_argument(
        '--output_tfrecord_dir',
        required=False,
        help='The directory to output the sharded TFRecords to.')
    parser.add_argument(
        '--output_image_shape',
        nargs='+',
        type=int,
        required=False,
        help=
        'The dimensions to resize the image to. Either HW or HWC. If this is'
        ' None, then the images will not be resized.')

    args, pipeline_args = parser.parse_known_args()
    beam_options = PipelineOptions(pipeline_args)
    beam_options.view_as(SetupOptions).save_main_session = True
    beam_options.view_as(SetupOptions).setup_file = get_setup_file()

    if args.output_image_shape is not None:
        if len(args.output_image_shape) not in (2, 3):
            parser.error('2 (HW) or 3 (HWC) integers are required for '
                         'output_image_shape')

    build_and_run_pipeline(pipeline_options=beam_options,
                           input_tars=args.input_tars,
                           input_csv=args.input_csv,
                           output_jpg_dir=args.output_jpg_dir,
                           output_bq_table=args.output_bq_table,
                           output_tfrecord_dir=args.output_tfrecord_dir,
                           output_image_shape=args.output_image_shape)
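Every entry point in this listing configures SetupOptions with a get_setup_file() helper that is not shown in the snippets. A minimal, hypothetical sketch of what it might do, assuming a setup.py ships next to this module (the tests in Examples 6 and 7 below suggest the path is resolved relative to the module, not the working directory):

import os


def get_setup_file():
    # Hypothetical sketch: resolve setup.py relative to this module so the
    # Dataflow SetupOptions still point at it when the script is launched
    # from a different working directory.
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), 'setup.py')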
Example 2
def main():
    """Build and execute the Apache Beam pipeline using the commandline arguments."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        '--gcs_input_prefix',
        nargs='+',
        required=True,
        help="""One or more paths on Google Cloud Storage beginning with gs://.
      All the files with this prefix that match the path_sub_pattern regex will
      be decompressed.""")

    parser.add_argument(
        '--path_sub_pattern',
        required=True,
        help=
        """The regular expression pattern that file paths must match in order
      to be decompressed. This regular expression is also used to generate
      the name of the new uncompressed file, by replacing the matching portion
      with path_sub_repl.""")

    parser.add_argument(
        '--path_sub_repl',
        required=True,
        help=r"""the string to replace the match in the original path with to
      create the path for the decompressed file. May contain regex
      capture groups, such as \1, \2, ...""")

    args, pipeline_args = parser.parse_known_args()
    beam_options = PipelineOptions(pipeline_args)
    # serialize and provide global imports, functions, etc. to workers.
    beam_options.view_as(SetupOptions).save_main_session = True
    beam_options.view_as(SetupOptions).setup_file = get_setup_file()

    with beam.Pipeline(options=beam_options) as p:
        _ = \
          (p
           | beam.Create(args.gcs_input_prefix)
           | 'GetPaths' >> beam.ParDo(GetPaths(validation_regex=args.path_sub_pattern))
           # Materialize and re-bundle paths with Reshuffle to enable parallelism.
           | beam.Reshuffle()
           | 'DecompressAndWrite' >> beam.ParDo(
               DecompressAndWrite(args.path_sub_pattern, args.path_sub_repl)))
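GetPaths and DecompressAndWrite are defined elsewhere in the source. As a rough, hypothetical sketch of the first stage only (its exact behaviour is an assumption based on how it is used above), GetPaths could expand each prefix and keep only the paths matching the validation regex:

import re

import apache_beam as beam
from apache_beam.io.filesystems import FileSystems


class GetPaths(beam.DoFn):
    """Hypothetical sketch: expand a GCS prefix into the matching file paths."""

    def __init__(self, validation_regex):
        self._pattern = re.compile(validation_regex)

    def process(self, prefix):
        # FileSystems.match returns one MatchResult per input pattern.
        for metadata in FileSystems.match([prefix + '*'])[0].metadata_list:
            if self._pattern.search(metadata.path):
                yield metadata.path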
Example 3
def main():
    """Build and execute the Apache Beam pipeline using the commandline arguments."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        '--input_tars',
        required=True,
        nargs='+',
        help="""One or more wildcard patterns that give the full paths to the
      input tar files on GCS.""")

    parser.add_argument(
        '--output_dir',
        required=True,
        help="""The output directory to write the untar'd files to.""")

    args, pipeline_args = parser.parse_known_args()
    beam_options = PipelineOptions(pipeline_args)
    # serialize and provide global imports, functions, etc. to workers.
    beam_options.view_as(SetupOptions).save_main_session = True
    beam_options.view_as(SetupOptions).setup_file = get_setup_file()

    if args.output_dir.endswith('/'):
        out_dir = args.output_dir[:-1]
    else:
        out_dir = args.output_dir

    def get_full_output_path(relative_path):
        if relative_path.startswith('/'):
            return out_dir + relative_path
        else:
            return '{}/{}'.format(out_dir, relative_path)

    with beam.Pipeline(options=beam_options) as p:
        _ = \
          (p
           | beam.Create(tf.io.gfile.glob(args.input_tars))
           | 'Untar' >> beam.ParDo(ReadTarFile(), get_full_output_path)
           | 'Write' >> beam.Map(write_file))
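ReadTarFile and write_file are also defined outside this snippet. A plausible sketch, assuming ReadTarFile streams each archive member and emits (output path, contents) pairs; note that the callable passed to beam.ParDo above arrives as an extra argument to process:

import tarfile

import apache_beam as beam
import tensorflow as tf


class ReadTarFile(beam.DoFn):
    """Hypothetical sketch: emit (output_path, contents) for each tar member."""

    def process(self, tar_path, get_full_output_path):
        with tf.io.gfile.GFile(tar_path, 'rb') as f:
            # 'r|*' streams the archive, so it also works for GCS objects.
            with tarfile.open(fileobj=f, mode='r|*') as archive:
                for member in archive:
                    if member.isfile():
                        contents = archive.extractfile(member).read()
                        yield get_full_output_path(member.name), contents


def write_file(path_and_contents):
    # Hypothetical sketch: write one (path, contents) pair with tf.io.gfile.
    path, contents = path_and_contents
    with tf.io.gfile.GFile(path, 'wb') as out:
        out.write(contents)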
Example 4
def get_commandline_args(description):
    """Generate command line arguments used by inference to BigQuery scripts.

  Args:
    description (str): The description of the script, which will appear at the
      top of the --help documentation.

  Returns:
    Tuple[Namespace, PipelineOptions]: 1) The commandline options with fields
      `input_tfrecord_pattern`, `keras_model`, and `bigquery_table`; 2) the
      Apache Beam pipeline options.
  """
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        '--input_tfrecord_pattern',
        type=str,
        help=
        'A file glob pattern that specifies the TFRecord files to read from.')
    parser.add_argument(
        '--keras_model',
        type=str,
        help=
        'The GCS uri, beginning with gs://, or a local path. This specifies '
        'the saved Keras model to use for inference. This model is created '
        'with `tf.keras.models.Model.save`.')
    parser.add_argument(
        '--bigquery_table',
        type=str,
        help='The table to store the labelled predictions in. This is a string '
        'of the form `project:dataset.table_name`. This table will be '
        'overwritten if it already exists.')
    args, pipeline_args = parser.parse_known_args()
    beam_options = PipelineOptions(pipeline_args)
    beam_options.view_as(SetupOptions).save_main_session = True
    beam_options.view_as(SetupOptions).setup_file = get_setup_file()
    return args, beam_options
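A hypothetical caller of this helper would forward the parsed values into its own pipeline builder (run_inference_pipeline below is an assumed name, not part of the source):

def main():
    args, beam_options = get_commandline_args(
        'Run a Keras model over TFRecords and write predictions to BigQuery.')
    # run_inference_pipeline is an assumed name for the pipeline builder.
    run_inference_pipeline(pipeline_options=beam_options,
                           input_tfrecord_pattern=args.input_tfrecord_pattern,
                           keras_model=args.keras_model,
                           bigquery_table=args.bigquery_table)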
Example 5
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    validators = list()
    validators.append(
        GcsInputPrefixValidator(
            parser.add_argument(
                '--gcs_input_prefix',
                nargs='+',
                required=True,
                help='One or more paths on Google Cloud Storage beginning with '
                'gs://. Specifies the image files to operate on.')))
    parser.add_argument('--image_format',
                        choices=['png', 'jpg'],
                        required=True,
                        help='The format of the input images.')
    parser.add_argument(
        '--output_image_shape',
        required=False,
        nargs='+',
        type=int,
        help='The dimensions to resize the image to. Either HW or HWC. If this '
        'is None, then the images will not be resized.')
    parser.add_argument(
        '--path_match_regex',
        required=False,
        help='A regex that the GCS paths of images must match if they are to be '
        'processed.')
    validators.append(
        QueryValidator(
            parser.add_argument(
                '--query',
                required=True,
                help='A StandardSQL BigQuery query that produces the labels for '
                'the images. This query must output one column named `path`, '
                'which is used to join the labels with the images (using the '
                'image\'s full GCS path, beginning with gs://). All the other '
                'columns are used as labels in the output TFRecord dataset.')))
    parser.add_argument(
        '--output_tfrecord_prefix',
        required=False,
        help='The full GCS path to the output TFRecords, including the prefix '
        'of the object names, which will have shard identifiers and '
        '.tfrecord appended to them.')
    args, pipeline_args = parser.parse_known_args()
    beam_options = PipelineOptions(pipeline_args)
    for validator in validators:
        validator.validate(
            args, project=beam_options.view_as(GoogleCloudOptions).project)
    beam_options.view_as(SetupOptions).save_main_session = True
    beam_options.view_as(SetupOptions).setup_file = get_setup_file()

    if args.output_image_shape is not None:
        if len(args.output_image_shape) not in (2, 3):
            parser.error('2 (HW) or 3 (HWC) integers are required for '
                         'output_image_shape')

    build_and_run_pipeline(pipeline_options=beam_options,
                           gcs_input_prefix=args.gcs_input_prefix,
                           image_format=args.image_format,
                           output_image_shape=args.output_image_shape,
                           path_match_regex=args.path_match_regex,
                           bigquery_query=args.query,
                           output_tfrecord_prefix=args.output_tfrecord_prefix)
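GcsInputPrefixValidator and QueryValidator wrap the argparse actions returned by add_argument and are invoked after parsing. Their implementations are not included here; a minimal, hypothetical sketch of the pattern, with the real BigQuery/GCS checks left as a placeholder:

class QueryValidator:
    """Hypothetical sketch of the validator pattern used above."""

    def __init__(self, action):
        # Keep the argparse action so errors can reference its flag and dest.
        self._action = action

    def validate(self, args, project=None):
        query = getattr(args, self._action.dest)
        # Placeholder check; the real validator presumably dry-runs the query
        # against BigQuery in the given project.
        if not query or not query.strip():
            raise ValueError('{} must be a non-empty StandardSQL query.'.format(
                self._action.option_strings[0]))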
Example 6
 def test_exists_diff_directory(self):
     os.chdir('/')
     self.assertTrue(os.path.isfile(get_setup_file()))
Example 7
 def test_exists(self):
     self.assertTrue(os.path.isfile(get_setup_file()))
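For completeness, these last two tests would typically live in a unittest.TestCase; a self-contained sketch, with a hypothetical import path:

import os
import unittest

from pipeline_utils import get_setup_file  # hypothetical module name


class GetSetupFileTest(unittest.TestCase):

    def test_exists(self):
        self.assertTrue(os.path.isfile(get_setup_file()))

    def test_exists_diff_directory(self):
        # get_setup_file should resolve setup.py relative to its own module,
        # not the current working directory.
        os.chdir('/')
        self.assertTrue(os.path.isfile(get_setup_file()))


if __name__ == '__main__':
    unittest.main()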