Example #1
    def ReadExamplesArtifact(self,
                             examples: types.Artifact,
                             num_examples: int,
                             split_name: Optional[Text] = None):
        """Read records from Examples artifact.

    Currently it assumes Examples artifact contains serialized tf.Example in
    gzipped TFRecord files.

    Args:
      examples: `Examples` artifact.
      num_examples: Number of examples to read. If the specified value is larger
          than the actual number of examples, all examples would be read.
      split_name: Name of the split to read from the Examples artifact.

    Raises:
      RuntimeError: If read twice.
    """
        if self._records:
            raise RuntimeError('Cannot read records twice.')

        if num_examples < 1:
            raise ValueError('num_examples < 1 (got {})'.format(num_examples))

        available_splits = artifact_utils.decode_split_names(
            examples.split_names)
        if not available_splits:
            raise ValueError(
                'No split_name is available in given Examples artifact.')
        if split_name is None:
            split_name = available_splits[0]
        if split_name not in available_splits:
            raise ValueError(
                'No split_name {}; available split names: {}'.format(
                    split_name, ', '.join(available_splits)))

        # ExampleGen generates artifacts under each split_name directory.
        glob_pattern = os.path.join(examples.uri, split_name, '*')
        tfxio_factory = tfxio_utils.get_tfxio_factory_from_artifact(
            examples=[examples],
            telemetry_descriptors=_TELEMETRY_DESCRIPTORS,
            schema=None,
            read_as_raw_records=True,
            raw_record_column_name=_RAW_RECORDS_COLUMN)
        try:
            filenames = fileio.glob(glob_pattern)
        except tf.errors.NotFoundError:
            filenames = []
        if not filenames:
            raise ValueError(
                'Unable to find examples matching {}.'.format(glob_pattern))

        self._payload_format = examples_utils.get_payload_format(examples)
        tfxio = tfxio_factory(filenames)

        self._ReadFromDataset(
            tfxio.TensorFlowDataset(
                dataset_options.TensorFlowDatasetOptions(
                    batch_size=num_examples)))
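The TFXIO factory above yields raw serialized records; below is a simplified sketch of the same read path that bypasses TFXIO and uses tf.data directly. It is a minimal sketch, assuming the split directories contain gzipped TFRecord files of serialized tf.Examples; read_first_n_records and its argument names are hypothetical.

import os

import tensorflow as tf


def read_first_n_records(examples_uri, split_name, num_examples):
    # Glob every file under the split directory, mirroring the example above.
    pattern = os.path.join(examples_uri, split_name, '*')
    filenames = tf.io.gfile.glob(pattern)
    if not filenames:
        raise ValueError('Unable to find examples matching {}.'.format(pattern))
    # Assumption: the files are gzipped TFRecords of serialized tf.Examples.
    dataset = tf.data.TFRecordDataset(filenames, compression_type='GZIP')
    return [record.numpy() for record in dataset.take(num_examples)]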
Example #2
    def ReadExamplesArtifact(self,
                             examples: types.Artifact,
                             num_examples: int,
                             split_name: Optional[Text] = None):
        """Read records from Examples artifact.

    Currently it assumes Examples artifact contains serialized tf.Example in
    gzipped TFRecord files.

    Args:
      examples: `Examples` artifact.
      num_examples: Number of examples to read. If the specified value is larger
          than the actual number of examples, all examples would be read.
      split_name: Name of the split to read from the Examples artifact.

    Raises:
      RuntimeError: If read twice.
    """
        if self._records:
            raise RuntimeError('Cannot read records twice.')

        if num_examples < 1:
            raise ValueError('num_examples < 1 (got {})'.format(num_examples))

        available_splits = artifact_utils.decode_split_names(
            examples.split_names)
        if not available_splits:
            raise ValueError(
                'No split_name is available in given Examples artifact.')
        if split_name is None:
            split_name = available_splits[0]
        if split_name not in available_splits:
            raise ValueError(
                'No split_name {}; available split names: {}'.format(
                    split_name, ', '.join(available_splits)))

        # ExampleGen generates artifacts under each split_name directory.
        glob_pattern = os.path.join(examples.uri, split_name, '*.gz')
        try:
            filenames = fileio.glob(glob_pattern)
        except tf.errors.NotFoundError:
            filenames = []
        if not filenames:
            raise ValueError(
                'Unable to find examples matching {}.'.format(glob_pattern))

        # Assume we have a tf.Example logical format.
        self._record_format = _LogicalFormat.TF_EXAMPLE

        self._ReadFromDataset(tf.data.TFRecordDataset(filenames,
                                                      compression_type='GZIP'),
                              num_examples=num_examples)
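Since the docstring states the records are serialized tf.Examples, they can be parsed back into protos after reading; a minimal sketch (the serialized_records list of bytes is a hypothetical input):

import tensorflow as tf


def parse_tf_examples(serialized_records):
    # Deserialize each raw record into a tf.train.Example proto.
    return [tf.train.Example.FromString(record) for record in serialized_records]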
Example #3
    def _get_results(self, path, file_name, proto_type):
        # Read every sharded, gzipped TFRecord file matching the pattern and
        # parse each record into an instance of `proto_type`.
        results = []
        filepattern = os.path.join(path, file_name) + '-?????-of-?????.gz'
        for f in fileio.glob(filepattern):
            record_iterator = tf.compat.v1.python_io.tf_record_iterator(
                path=f,
                options=tf.compat.v1.python_io.TFRecordOptions(
                    tf.compat.v1.python_io.TFRecordCompressionType.GZIP))
            for record_string in record_iterator:
                result = proto_type()
                result.MergeFromString(record_string)
                results.append(result)
        return results
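The snippet above uses the deprecated tf.compat.v1 record iterator; an equivalent sketch with the TF2 tf.data API, assuming the same sharded, gzipped file layout:

import os

import tensorflow as tf


def get_results_v2(path, file_name, proto_type):
    # Same behavior as above, expressed with tf.data instead of tf.compat.v1.
    results = []
    filepattern = os.path.join(path, file_name) + '-?????-of-?????.gz'
    dataset = tf.data.TFRecordDataset(
        tf.io.gfile.glob(filepattern), compression_type='GZIP')
    for record in dataset:
        message = proto_type()
        message.MergeFromString(record.numpy())
        results.append(message)
    return results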
Example #4
def _CsvToExample(  # pylint: disable=invalid-name
        pipeline: beam.Pipeline, exec_properties: Dict[Text, Any],
        split_pattern: Text) -> beam.pvalue.PCollection:
    """Read CSV files and transform to TF examples.

  Note that each input split will be transformed by this function separately.

  Args:
    pipeline: beam pipeline.
    exec_properties: A dict of execution properties.
      - input_base: input dir that contains CSV data. CSV must have header line.
    split_pattern: Split.pattern in Input config, glob relative file pattern
      that maps to input files with root directory given by input_base.

  Returns:
    PCollection of TF examples.

  Raises:
    RuntimeError: if split is empty or csv headers are not equal.
  """
    input_base_uri = exec_properties[standard_component_specs.INPUT_BASE_KEY]
    csv_pattern = os.path.join(input_base_uri, split_pattern)
    logging.info('Processing input csv data %s to TFExample.', csv_pattern)

    csv_files = fileio.glob(csv_pattern)
    if not csv_files:
        raise RuntimeError(
            'Split pattern {} does not match any files.'.format(csv_pattern))

    column_names = io_utils.load_csv_column_names(csv_files[0])
    for csv_file in csv_files[1:]:
        if io_utils.load_csv_column_names(csv_file) != column_names:
            raise RuntimeError(
                'Files in same split {} have different header.'.format(
                    csv_pattern))

    parsed_csv_lines = (
        pipeline
        | 'ReadFromText' >> beam.io.ReadFromText(file_pattern=csv_pattern,
                                                 skip_header_lines=1)
        | 'ParseCSVLine' >> beam.ParDo(csv_decoder.ParseCSVLine(delimiter=','))
        | 'ExtractParsedCSVLines' >> beam.Keys())
    column_infos = beam.pvalue.AsSingleton(
        parsed_csv_lines
        | 'InferColumnTypes' >> beam.CombineGlobally(
            csv_decoder.ColumnTypeInferrer(column_names,
                                           skip_blank_lines=True)))

    return (parsed_csv_lines
            |
            'ToTFExample' >> beam.ParDo(_ParsedCsvToTfExample(), column_infos))
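For reference, the ReadFromText pattern used above can be exercised outside the executor; a minimal standalone Beam sketch follows (the CSV path is a hypothetical assumption, and the real pipeline continues into CSV parsing and tf.Example conversion as shown above):

import apache_beam as beam

csv_pattern = '/tmp/input_base/*.csv'  # hypothetical input location

with beam.Pipeline() as pipeline:
    _ = (
        pipeline
        | 'ReadFromText' >> beam.io.ReadFromText(
            file_pattern=csv_pattern, skip_header_lines=1)
        | 'SplitColumns' >> beam.Map(lambda line: line.split(','))
        | 'LogRow' >> beam.Map(print))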
Example #5
    def _get_results(self, prediction_log_path):
        # Read every sharded, gzipped prediction-log TFRecord file and parse
        # each record into a PredictionLog proto.
        results = []
        filepattern = os.path.join(
            prediction_log_path,
            executor._PREDICTION_LOGS_DIR_NAME) + '-?????-of-?????.gz'
        for f in fileio.glob(filepattern):
            record_iterator = tf.compat.v1.python_io.tf_record_iterator(
                path=f,
                options=tf.compat.v1.python_io.TFRecordOptions(
                    tf.compat.v1.python_io.TFRecordCompressionType.GZIP))
            for record_string in record_iterator:
                prediction_log = prediction_log_pb2.PredictionLog()
                prediction_log.MergeFromString(record_string)
                results.append(prediction_log)
        return results
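The '-?????-of-?????.gz' suffix targets the sharded file names that Beam file sinks typically produce; a small sketch of how the glob-style pattern matches them (file names are illustrative):

import fnmatch

# Beam sinks usually write shards such as:
#   prediction_logs-00000-of-00002.gz
#   prediction_logs-00001-of-00002.gz
# so '<prefix>-?????-of-?????.gz' matches every shard regardless of shard count.
assert fnmatch.fnmatch('prediction_logs-00000-of-00002.gz',
                       'prediction_logs-?????-of-?????.gz')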
Example #6
    def setUpClass(cls):
        super(ExecutorTest, cls).setUpClass()
        source_example_dir = os.path.join(cls._SOURCE_DATA_DIR,
                                          'csv_example_gen')

        io_utils.copy_dir(source_example_dir, cls._ARTIFACT1_URI)
        io_utils.copy_dir(source_example_dir, cls._ARTIFACT2_URI)

        # Duplicate the train and eval records so that the second artifact
        # has twice as many as the first.
        artifact2_pattern = os.path.join(cls._ARTIFACT2_URI, '*', '*')
        artifact2_files = fileio.glob(artifact2_pattern)
        for filepath in artifact2_files:
            directory, filename = os.path.split(filepath)
            io_utils.copy_file(filepath,
                               os.path.join(directory, 'dup_' + filename))
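A standalone equivalent of the duplication loop, using only the standard library instead of fileio/io_utils (the artifact path and directory layout are assumptions for illustration):

import glob
import os
import shutil

artifact_uri = '/tmp/artifact2'  # hypothetical artifact location

# Match every file under every split directory and write a 'dup_'-prefixed copy
# alongside it, doubling the record count of the artifact.
for filepath in glob.glob(os.path.join(artifact_uri, '*', '*')):
    directory, filename = os.path.split(filepath)
    shutil.copyfile(filepath, os.path.join(directory, 'dup_' + filename))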
Example #7
def generate_fingerprint(split_name: Text, file_pattern: Text) -> Text:
    """Generates a fingerprint for all files that match the pattern."""
    files = fileio.glob(file_pattern)
    total_bytes = 0
    # Checksum used here is based on timestamp (mtime).
    # Checksums are xor'ed and sum'ed over the files so that they are order-
    # independent.
    xor_checksum = 0
    sum_checksum = 0
    for f in files:
        stat = fileio.stat(f)
        total_bytes += stat.length
        # Take mtime only up to second-granularity.
        mtime = int(stat.mtime_nsec / NANO_PER_SEC)
        xor_checksum ^= mtime
        sum_checksum += mtime

    return 'split:%s,num_files:%d,total_bytes:%d,xor_checksum:%d,sum_checksum:%d' % (
        split_name, len(files), total_bytes, xor_checksum, sum_checksum)
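A standalone sketch of the same fingerprint computed with the standard library instead of fileio (mtime is truncated to whole seconds as above; xor and sum make the result independent of file ordering):

import glob
import os


def generate_fingerprint_local(split_name, file_pattern):
    files = glob.glob(file_pattern)
    total_bytes = 0
    xor_checksum = 0
    sum_checksum = 0
    for f in files:
        stat = os.stat(f)
        total_bytes += stat.st_size
        mtime = int(stat.st_mtime)  # second granularity
        xor_checksum ^= mtime       # order-independent: a ^ b == b ^ a
        sum_checksum += mtime       # order-independent: a + b == b + a
    return 'split:%s,num_files:%d,total_bytes:%d,xor_checksum:%d,sum_checksum:%d' % (
        split_name, len(files), total_bytes, xor_checksum, sum_checksum)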
Example #8
    def expand(
            self, pipeline: beam.Pipeline
    ) -> beam.pvalue.PCollection[tf.train.Example]:
        logging.info('Processing input csv data %s to TFExample.',
                     self._csv_pattern)

        csv_files = fileio.glob(self._csv_pattern)
        if not csv_files:
            raise RuntimeError(
                'Split pattern {} does not match any files.'.format(
                    self._csv_pattern))

        column_names = io_utils.load_csv_column_names(csv_files[0])
        for csv_file in csv_files[1:]:
            if io_utils.load_csv_column_names(csv_file) != column_names:
                raise RuntimeError(
                    'Files in same split {} have different header.'.format(
                        self._csv_pattern))

        # Read each CSV file while maintaining order. This is done in order to group
        # together multi-line string fields.
        parsed_csv_lines = (
            pipeline
            | 'CreateFilenames' >> beam.Create(csv_files)
            | 'ReadFromText' >> beam.ParDo(_ReadCsvRecordsFromTextFile())
            | 'ParseCSVLine' >> beam.ParDo(
                csv_decoder.ParseCSVLine(delimiter=','))
            | 'ExtractParsedCSVLines' >> beam.Keys())
        column_infos = beam.pvalue.AsSingleton(
            parsed_csv_lines
            | 'InferColumnTypes' >> beam.CombineGlobally(
                csv_decoder.ColumnTypeInferrer(column_names,
                                               skip_blank_lines=True)))

        return (
            parsed_csv_lines
            |
            'ToTFExample' >> beam.ParDo(_ParsedCsvToTfExample(), column_infos))
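The header-consistency check above relies on io_utils.load_csv_column_names; a standalone sketch of the same check using the csv module (file paths are hypothetical):

import csv


def load_header(path):
    with open(path, newline='') as f:
        return next(csv.reader(f))


csv_files = ['/tmp/data/part-0.csv', '/tmp/data/part-1.csv']  # assumption
column_names = load_header(csv_files[0])
for csv_file in csv_files[1:]:
    if load_header(csv_file) != column_names:
        raise RuntimeError('Files in the same split have different headers.')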
Example #9
    def _verify_transform_outputs(self,
                                  materialize=True,
                                  store_cache=True,
                                  multiple_example_inputs=False,
                                  compute_statistics=False):
        expected_outputs = ['transformed_graph']

        if store_cache:
            expected_outputs.append('CACHE')
            self.assertNotEqual(
                0,
                len(fileio.listdir(self._updated_analyzer_cache_artifact.uri)))

        example_artifacts = self._example_artifacts[:1]
        transformed_example_artifacts = self._transformed_example_artifacts[:1]
        if multiple_example_inputs:
            example_artifacts = self._example_artifacts
            transformed_example_artifacts = self._transformed_example_artifacts

        if materialize:
            expected_outputs.append('transformed_examples')

            assert len(example_artifacts) == len(transformed_example_artifacts)
            for example, transformed_example in zip(
                    example_artifacts, transformed_example_artifacts):
                examples_train_files = fileio.glob(
                    os.path.join(example.uri, 'Split-train', '*'))
                transformed_train_files = fileio.glob(
                    os.path.join(transformed_example.uri, 'Split-train', '*'))
                self.assertGreater(len(transformed_train_files), 0)

                examples_eval_files = fileio.glob(
                    os.path.join(example.uri, 'Split-eval', '*'))
                transformed_eval_files = fileio.glob(
                    os.path.join(transformed_example.uri, 'Split-eval', '*'))
                self.assertGreater(len(transformed_eval_files), 0)

                # Construct datasets and count number of records in each split.
                examples_train_count = _get_dataset_size(examples_train_files)
                transformed_train_count = _get_dataset_size(
                    transformed_train_files)
                examples_eval_count = _get_dataset_size(examples_eval_files)
                transformed_eval_count = _get_dataset_size(
                    transformed_eval_files)

                # Check that each split contains the same number of records in
                # the input artifact as in the output artifact (i.e. the 1-to-1
                # mapping is preserved).
                self.assertEqual(examples_train_count, transformed_train_count)
                self.assertEqual(examples_eval_count, transformed_eval_count)
                self.assertGreater(transformed_train_count,
                                   transformed_eval_count)

        path_to_pre_transform_statistics = os.path.join(
            self._transformed_output.uri,
            tft.TFTransformOutput.PRE_TRANSFORM_FEATURE_STATS_PATH)
        path_to_post_transform_statistics = os.path.join(
            self._transformed_output.uri,
            tft.TFTransformOutput.POST_TRANSFORM_FEATURE_STATS_PATH)
        if compute_statistics:
            self.assertTrue(fileio.exists(path_to_pre_transform_statistics))
            self.assertTrue(fileio.exists(path_to_post_transform_statistics))
        else:
            self.assertFalse(fileio.exists(path_to_pre_transform_statistics))
            self.assertFalse(fileio.exists(path_to_post_transform_statistics))

        # Depending on `materialize` and `store_cache`, check that
        # expected outputs are exactly correct. If either flag is False, its
        # respective output should not be present.
        self.assertCountEqual(expected_outputs,
                              fileio.listdir(self._output_data_dir))

        path_to_saved_model = os.path.join(
            self._transformed_output.uri,
            tft.TFTransformOutput.TRANSFORM_FN_DIR,
            tf.saved_model.SAVED_MODEL_FILENAME_PB)
        self.assertTrue(fileio.exists(path_to_saved_model))
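_get_dataset_size is defined elsewhere in the test module; a plausible sketch, assuming each split consists of gzipped TFRecord files:

import tensorflow as tf


def _get_dataset_size(files):
    # Count the records across all shards of a split.
    dataset = tf.data.TFRecordDataset(files, compression_type='GZIP')
    return sum(1 for _ in dataset)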
Example #10
def _get_target_span_version(
    uri: str,
    split: example_gen_pb2.Input.Split,
    range_config: Optional[range_config_pb2.RangeConfig] = None
) -> Tuple[Optional[int], Optional[int]]:
    """Retrieves a  target span and version for a given split pattern.

  If both Span and Version spec occur in the split pattern, searches for and
  returns both the target Span and Version. If only Span exists in the split
  pattern, searches for the target Span, and Version is returned as None.
  If Version is present, but not Span, an error is raised. If neither Span
  nor Version is present, returns both as None.

  Additonally, supports parsing span number from date stamps using the Date.
  specs. Once the calendar date is parsed from the Date specs, it is converted
  into a span number by counting the number of days since 01/01/1970.

  Args:
    uri: The base path from which files will be searched.
    split: An example_gen_pb2.Input.Split object which contains a split pattern,
      to be searched on.
    range_config: An instance of range_config_pb2.RangeConfig, which specifies
      which spans to consider when finding the most recent span and version. If
      unset, search for latest span number with no restrictions.

  Returns:
    Tuple of two ints, Span (optional) and Version (optional). Note
      that this function will update the {SPAN} or Date tags as well as the
      {VERSION} tags in the split config to actual Span and Version numbers.

  Raises:
    ValueError: if any of the following occurs:
      - If either Span or Version spec is occurs in the split pattern
        more than once.
      - If Version spec is provided, but Span spec is not present.
      - If Span or Version found is not an integer.
      - If a matching cannot be found for split pattern provided.
  """
    is_match_span, is_match_date, is_match_version = verify_split_pattern_specs(
        split)

    if not is_match_span and not is_match_date:
        return (None, None)

    split_glob_pattern, split_regex_pattern = _create_matching_glob_and_regex(
        uri=uri,
        split=split,
        is_match_span=is_match_span,
        is_match_date=is_match_date,
        is_match_version=is_match_version,
        range_config=range_config)

    logging.info('Glob pattern for split %s: %s', split.name,
                 split_glob_pattern)
    logging.info('Regex pattern for split %s: %s', split.name,
                 split_regex_pattern)

    latest_span_tokens = None
    latest_span_int = None
    latest_version = None
    latest_version_int = None

    files = fileio.glob(split_glob_pattern)
    for file_path in files:
        match_span_tokens, match_span_int, match_version, match_version_int = (
            _find_matched_span_version_from_path(file_path,
                                                 split_regex_pattern,
                                                 is_match_span, is_match_date,
                                                 is_match_version))

        if latest_span_int is None or match_span_int > latest_span_int:
            # Uses the str form instead of int to preserve zero-padded digits.
            latest_span_tokens = match_span_tokens
            latest_span_int = match_span_int
            latest_version = match_version
            latest_version_int = match_version_int
        elif (latest_span_int == match_span_int
              and (latest_version is None
                   or match_version_int >= latest_version_int)):
            latest_version = match_version
            latest_version_int = match_version_int

    if latest_span_int is None or (is_match_version
                                   and latest_version is None):
        raise ValueError('Cannot find matching for split %s based on %s' %
                         (split.name, split.pattern))

    # Update split pattern so executor can find the files to ingest.
    if is_match_span:
        split.pattern = re.sub(SPAN_FULL_REGEX, latest_span_tokens[0],
                               split.pattern)
    elif is_match_date:
        for spec, value in zip(DATE_SPECS, latest_span_tokens):
            split.pattern = split.pattern.replace(spec, value)

    if is_match_version:
        split.pattern = re.sub(VERSION_FULL_REGEX, latest_version,
                               split.pattern)

    return latest_span_int, latest_version_int
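A minimal standalone sketch of the glob-then-regex span matching performed above (the paths, the translated pattern, and the regex are illustrative assumptions; the real code derives them from the split pattern's {SPAN} spec):

import re

paths = [
    '/data/span-01/train/data.tfrecord',
    '/data/span-02/train/data.tfrecord',
]
# Regex standing in for split_regex_pattern, with the span captured as a group.
span_regex = re.compile(r'/data/span-(\d+)/train/data\.tfrecord')

latest_span_token = None  # str form keeps any zero padding
latest_span_int = None
for path in paths:
    match = span_regex.fullmatch(path)
    if not match:
        continue
    token = match.group(1)
    if latest_span_int is None or int(token) > latest_span_int:
        latest_span_token, latest_span_int = token, int(token)

print(latest_span_token, latest_span_int)  # -> 02 2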