Beispiel #1
0
def calculate_splits_fingerprint_span_and_version(
    input_base_uri: Text, splits: Iterable[example_gen_pb2.Input.Split]
) -> Tuple[Text, int, Optional[int]]:
    """Calculates the fingerprint of files in a URI matching split patterns.

    If a pattern has the {SPAN} placeholder or the Date spec placeholders,
    {YYYY}, {MM}, and {DD}, and optionally, the {VERSION} placeholder, attempts
    to find aligned values that results in all splits having the most recent
    span and most recent version for that span.

    Args:
      input_base_uri: The base path from which files will be searched.
      splits: An iterable collection of example_gen_pb2.Input.Split objects.

    Returns:
      A Tuple of [fingerprint, select_span, select_version]. `fingerprint` is
      the per-split fingerprints joined with newlines, in iteration order.
      `select_span` is the span reported for the splits, or 0 when the helper
      reports no span. `select_version` is the version reported for the
      splits, or None when the helper reports no version. For an empty
      `splits` the result is ('', 0, None).

      NOTE(review): earlier documentation stated that this function rewrites
      the {SPAN}/Date and {VERSION} tags in the split configs. No replacement
      happens in this body, so that is presumably done inside
      `_retrieve_latest_span_version` -- confirm against that helper.

    Raises:
      ValueError: If the latest span or the latest version of any split differs
        from the one found for the first split.
    """
    split_fingerprints = []
    select_span = 0
    select_version = None
    # Track the first iteration explicitly. Using the initial values
    # (0, None) as the "nothing selected yet" marker is ambiguous because a
    # split can legitimately resolve to (0, None); a later split would then
    # silently overwrite the selection instead of being reported as a mismatch.
    is_first_split = True

    # Calculate the fingerprint of files under input_base_uri.
    for split in splits:
        logging.info('select span and version = (%s, %s)', select_span,
                     select_version)
        # Find most recent span and version for this split.
        latest_span, latest_version = _retrieve_latest_span_version(
            input_base_uri, split)

        # TODO(b/162622803): add default behavior for when version spec not present.
        latest_span = latest_span or 0

        logging.info('latest span and version = (%s, %s)', latest_span,
                     latest_version)

        if is_first_split:
            # The first split defines the span/version every other split must
            # agree with.
            select_span = latest_span
            select_version = latest_version
            is_first_split = False
        elif select_span != latest_span:
            raise ValueError('Latest span should be the same for each split')
        elif select_version != latest_version:
            raise ValueError(
                'Latest version should be the same for each split')

        # Calculate fingerprint.
        pattern = os.path.join(input_base_uri, split.pattern)
        split_fingerprint = io_utils.generate_fingerprint(split.name, pattern)
        split_fingerprints.append(split_fingerprint)

    fingerprint = '\n'.join(split_fingerprints)
    return fingerprint, select_span, select_version
Beispiel #2
0
    def _prepare_input_for_processing(
        self,
        input_dict: Dict[Text, List[types.TfxArtifact]],
        exec_properties: Dict[Text, Any],
    ) -> Dict[Text, List[types.TfxArtifact]]:
        """Resolves artifacts for external inputs.

        For every input artifact, a fingerprint is computed from the split
        patterns of the input config and stored as the FINGERPRINT custom
        property. Artifacts already registered under the same URI with an equal
        fingerprint are reused (the one with the highest id wins); otherwise the
        input is published as a new artifact with span 1.

        Args:
          input_dict: Maps input names to lists of artifacts. The artifacts are
            updated in place.
          exec_properties: Execution properties; 'input_config' must hold the
            JSON form of an example_gen_pb2.Input message.

        Returns:
          The same `input_dict` that was passed in.
        """
        parsed_config = example_gen_pb2.Input()
        json_format.Parse(exec_properties['input_config'], parsed_config)

        for artifact_group in input_dict.values():
            for current in artifact_group:
                tf.logging.info('Processing input {}.'.format(current.uri))
                tf.logging.info('single_input {}.'.format(current))
                tf.logging.info('single_input.artifact {}.'.format(
                    current.artifact))

                # One fingerprint line per split, in config order.
                fingerprint = '\n'.join([
                    io_utils.generate_fingerprint(
                        split_cfg.name,
                        os.path.join(current.uri, split_cfg.pattern))
                    for split_cfg in parsed_config.splits
                ])
                current.set_string_custom_property(FINGERPRINT, fingerprint)

                # Previously registered artifacts at this URI whose recorded
                # fingerprint equals the one just computed.
                candidates = [
                    known
                    for known in self._metadata_handler.get_artifacts_by_uri(
                        current.uri)
                    if known.custom_properties[FINGERPRINT].string_value ==
                    fingerprint
                ]

                if not candidates:
                    # TODO(jyzhao): support span.
                    current.span = 1
                    # TODO(jyzhao): whether driver should be read-only for metadata.
                    [published] = self._metadata_handler.publish_artifacts(
                        [current])  # pylint: disable=unbalanced-tuple-unpacking
                    tf.logging.info(
                        'Registered new input: {}'.format(published))
                    current.set_artifact(published)
                    continue

                # If there are multiple matches, get the latest one for caching.
                # Using id because spans are the same for matched artifacts.
                newest = max(candidates, key=lambda candidate: candidate.id)
                tf.logging.info('latest_artifact {}.'.format(newest))
                tf.logging.info('type(latest_artifact) {}.'.format(
                    type(newest)))
                current.set_artifact(newest)

        return input_dict
Beispiel #3
0
 def testGeneratesFingerprint(self):
     """Checks the fingerprint string produced for two files with fixed mtimes."""
     fp_dir = os.path.join(self._base_dir, 'fp')
     # (file name, file contents, modification time)
     fixtures = (
         ('data1', 'testing', 1),
         ('data2', 'testing2', 3),
     )
     for file_name, contents, mtime in fixtures:
         file_path = os.path.join(fp_dir, file_name)
         io_utils.write_string_file(file_path, contents)
         # Pin atime to 0 and mtime to a known value.
         os.utime(file_path, (0, mtime))

     actual = io_utils.generate_fingerprint('split',
                                            os.path.join(fp_dir, '*'))
     # The expected numbers are consistent with 7 + 8 content bytes and with
     # the mtimes 1 and 3 (1 ^ 3 == 2, 1 + 3 == 4).
     expected = (
         'split:split,num_files:2,total_bytes:15,xor_checksum:2,sum_checksum:4')
     self.assertEqual(expected, actual)
Beispiel #4
0
 def testGeneratesFingerprint(self):
   """Checks the fingerprint string produced for two files with fixed mtimes."""
   # File name -> contents, all created under the 'fp' directory.
   contents_by_name = {'data1': 'testing', 'data2': 'testing2'}
   self.createFiles({'fp': contents_by_name})

   # Pin atime to 0 and mtime to a known value for each file.
   for file_name, mtime in (('data1', 1), ('data2', 3)):
     os.utime(self.relpath('fp', file_name), (0, mtime))

   glob_pattern = os.path.join(self.relpath('fp'), '*')
   actual = io_utils.generate_fingerprint('split', glob_pattern)
   # The expected numbers are consistent with 7 + 8 content bytes and with
   # the mtimes 1 and 3 (1 ^ 3 == 2, 1 + 3 == 4).
   expected = (
       'split:split,num_files:2,total_bytes:15,xor_checksum:2,sum_checksum:4')
   self.assertEqual(expected, actual)
Beispiel #5
0
Datei: utils.py Projekt: zvrr/tfx
def calculate_splits_fingerprint_and_span(
    input_base_uri: Text, splits: Iterable[example_gen_pb2.Input.Split]
) -> Tuple[Text, Optional[Text]]:
    """Calculates the fingerprint of files in a URI matching split patterns.

    If a pattern has the {SPAN} placeholder, attempts to find an identical
    value across splits that results in all splits having the most recently
    updated files.

    Args:
      input_base_uri: The base path from which files will be searched.
      splits: An iterable collection of example_gen_pb2.Input.Split objects.
        Note that this function replaces SPAN_SPEC in each split's pattern
        with the selected span, i.e. the split configs are modified in place.

    Returns:
      A Tuple of [fingerprint, select_span]. `fingerprint` is the per-split
      fingerprints joined with newlines. `select_span` is the span selected
      for the splits, or '0' when the first split processed carries no
      SPAN_SPEC. It is None only when `splits` is empty.

    Raises:
      ValueError: If a split containing SPAN_SPEC resolves to a latest span
        that differs from the span already selected.
    """
    span = None
    fingerprints = []
    # Calculate the fingerprint of files under input_base_uri.
    for current_split in splits:
        logging.info('select span = %s', span)
        if SPAN_SPEC not in current_split.pattern:
            # NOTE: once a split without SPAN_SPEC pins the span to '0', any
            # later split that does contain SPAN_SPEC must also resolve to '0'
            # or the mismatch check below raises.
            if span is None:
                span = '0'
        else:
            newest_span = _retrieve_latest_span(input_base_uri, current_split)
            logging.info('latest span = %s', newest_span)
            if span is None:
                span = newest_span
            elif span != newest_span:
                raise ValueError(
                    'Latest span should be the same for each split: %s != %s' %
                    (span, newest_span))
            current_split.pattern = current_split.pattern.replace(
                SPAN_SPEC, span)

        # Fingerprint the files matched by the (possibly rewritten) pattern.
        fingerprints.append(
            io_utils.generate_fingerprint(
                current_split.name,
                os.path.join(input_base_uri, current_split.pattern)))

    return '\n'.join(fingerprints), span
Beispiel #6
0
    def resolve_input_artifacts(
        self,
        input_channels: Dict[Text, types.Channel],
        exec_properties: Dict[Text, Any],
        driver_args: data_types.DriverArgs,
        pipeline_info: data_types.PipelineInfo,
    ) -> Dict[Text, List[types.Artifact]]:
        """Overrides BaseDriver.resolve_input_artifacts().

        For every input artifact, resolves the span placeholder of each split
        pattern, computes a fingerprint over the splits and stores fingerprint
        and span as custom properties. An artifact already registered under
        the same URI with equal fingerprint and span is reused (highest id
        wins); otherwise the input is published as a new artifact.

        Args:
          input_channels: Maps input names to channels; unwrapped into lists
            of artifacts that are updated in place.
          exec_properties: Execution properties. 'input_config' must hold the
            JSON form of an example_gen_pb2.Input message and is overwritten
            with the JSON of the config after span substitution.
          driver_args: Unused.
          pipeline_info: Unused.

        Returns:
          The dict of unwrapped input artifacts, keyed like `input_channels`.

        Raises:
          ValueError: If splits containing the span placeholder resolve to
            different latest spans for the same input.
        """
        del driver_args, pipeline_info  # unused

        parsed_config = example_gen_pb2.Input()
        json_format.Parse(exec_properties['input_config'], parsed_config)

        resolved = channel_utils.unwrap_channel_dict(input_channels)
        for artifact_group in resolved.values():
            for current in artifact_group:
                absl.logging.debug('Processing input %s.' % current.uri)
                absl.logging.debug('single_input %s.' % current)
                absl.logging.debug('single_input.artifact %s.' %
                                   current.artifact)

                # Set the fingerprint of input.
                # NOTE(review): the split patterns of the shared config are
                # rewritten below, so for any input after the first one the
                # placeholder has already been substituted -- confirm whether
                # more than one input is expected here.
                span = None
                per_split = []
                for split_cfg in parsed_config.splits:
                    # If SPAN is specified, pipeline will process the latest span, note
                    # that this span number must be the same for all splits and it will
                    # be stored in metadata as the span of input artifact.
                    if _SPAN_SPEC in split_cfg.pattern:
                        newest_span = self._retrieve_latest_span(
                            current.uri, split_cfg)
                        if span is None:
                            span = newest_span
                        if span != newest_span:
                            raise ValueError(
                                'Latest span should be the same for each split: %s != %s'
                                % (span, newest_span))
                        split_cfg.pattern = split_cfg.pattern.replace(
                            _SPAN_SPEC, span)

                    per_split.append(
                        io_utils.generate_fingerprint(
                            split_cfg.name,
                            os.path.join(current.uri, split_cfg.pattern)))

                fingerprint = '\n'.join(per_split)
                current.set_string_custom_property(_FINGERPRINT, fingerprint)
                # No split carried the span placeholder: fall back to '0'.
                span = '0' if span is None else span
                current.set_string_custom_property(_SPAN, span)

                # Previously registered artifacts at this URI whose recorded
                # fingerprint and span both equal the values just computed.
                candidates = [
                    known
                    for known in self._metadata_handler.get_artifacts_by_uri(
                        current.uri)
                    if known.custom_properties[_FINGERPRINT].string_value ==
                    fingerprint and
                    known.custom_properties[_SPAN].string_value == span
                ]

                if not candidates:
                    # TODO(jyzhao): whether driver should be read-only for metadata.
                    [published] = self._metadata_handler.publish_artifacts(
                        [current])  # pylint: disable=unbalanced-tuple-unpacking
                    absl.logging.debug('Registered new input: %s' %
                                       (published))
                    current.set_artifact(published)
                    continue

                # TODO(b/138845899): consider use span instead of id.
                # If there are multiple matches, get the latest one for caching.
                # Using id because spans are the same for matched artifacts.
                newest = max(candidates, key=lambda candidate: candidate.id)
                absl.logging.debug('latest_artifact %s.' % (newest))
                absl.logging.debug('type(latest_artifact) %s.' % type(newest))
                current.set_artifact(newest)

        # Persist the config with substituted spans back into the properties.
        exec_properties['input_config'] = json_format.MessageToJson(
            parsed_config, sort_keys=True)
        return resolved