def calculate_splits_fingerprint_span_and_version(
    input_base_uri: Text, splits: Iterable[example_gen_pb2.Input.Split]
) -> Tuple[Text, int, Optional[int]]:
    """Calculates the fingerprint of files in a URI matching split patterns.

    If a pattern has the {SPAN} placeholder or the Date spec placeholders,
    {YYYY}, {MM}, and {DD}, and optionally, the {VERSION} placeholder, attempts
    to find aligned values that results in all splits having the most recent
    span and most recent version for that span.

    The span/version of the first split is taken as the reference and every
    later split must resolve to exactly the same pair. The comparison is
    order-independent: a first split that resolves to span 0 with no version
    is a valid reference and is not overwritten by a later split.

    Args:
      input_base_uri: The base path from which files will be searched.
      splits: An iterable collection of example_gen_pb2.Input.Split objects.

    Returns:
      A Tuple of [fingerprint, select_span, select_version], where
      fingerprint is the newline-joined per-split fingerprints, select_span
      is the span resolved for the splits (a missing span is mapped to 0), and
      select_version is the version resolved for the splits, or None if none
      was resolved. For an empty `splits`, returns ('', 0, None).

      Note that this function is documented to update the {SPAN} or Date tags
      as well as the {VERSION} tags in the split configs to actual Span and
      Version numbers; that substitution is not performed in this body and is
      presumably done by _retrieve_latest_span_version (verify there).

    Raises:
      ValueError: If the splits do not all resolve to the same span, or do not
        all resolve to the same version.
    """
    split_fingerprints = []
    select_span = 0
    select_version = None
    # Explicit marker instead of treating (0, None) as "nothing selected yet":
    # (0, None) is also what a split without span/version resolves to.
    is_first_split = True

    # Calculate the fingerprint of files under input_base_uri.
    for split in splits:
        logging.info('select span and version = (%s, %s)', select_span,
                     select_version)
        # Find most recent span and version for this split.
        latest_span, latest_version = _retrieve_latest_span_version(
            input_base_uri, split)

        # TODO(b/162622803): add default behavior for when version spec not
        # present.
        latest_span = latest_span or 0

        logging.info('latest span and version = (%s, %s)', latest_span,
                     latest_version)

        if is_first_split:
            select_span = latest_span
            select_version = latest_version
            is_first_split = False

        # Check if latest span and version are the same over all splits.
        if select_span != latest_span:
            raise ValueError('Latest span should be the same for each split')
        if select_version != latest_version:
            raise ValueError(
                'Latest version should be the same for each split')

        # Calculate fingerprint.
        pattern = os.path.join(input_base_uri, split.pattern)
        split_fingerprint = io_utils.generate_fingerprint(split.name, pattern)
        split_fingerprints.append(split_fingerprint)

    fingerprint = '\n'.join(split_fingerprints)
    return fingerprint, select_span, select_version
def _prepare_input_for_processing(
    self,
    input_dict: Dict[Text, List[types.TfxArtifact]],
    exec_properties: Dict[Text, Any],
) -> Dict[Text, List[types.TfxArtifact]]:
    """Resolves artifacts for external inputs.

    For every artifact in `input_dict`, a fingerprint is built from the
    splits listed in exec_properties['input_config'] and stored on the
    artifact as the FINGERPRINT custom property. If the metadata handler
    already knows artifacts at the same uri carrying that fingerprint, the one
    with the highest id is attached to the input; otherwise the input is
    published as a new artifact with span 1.

    Args:
      input_dict: Mapping from input key to the list of artifacts to resolve.
        The artifacts are updated in place.
      exec_properties: Execution properties; 'input_config' must hold a JSON
        serialized example_gen_pb2.Input.

    Returns:
      The same `input_dict` object, with every artifact resolved.
    """
    input_config = example_gen_pb2.Input()
    json_format.Parse(exec_properties['input_config'], input_config)

    for artifacts in input_dict.values():
        for current in artifacts:
            tf.logging.info('Processing input {}.'.format(current.uri))
            tf.logging.info('single_input {}.'.format(current))
            tf.logging.info('single_input.artifact {}.'.format(
                current.artifact))

            # Set the fingerprint of input: one entry per split, newline
            # separated.
            per_split = [
                io_utils.generate_fingerprint(
                    split.name, os.path.join(current.uri, split.pattern))
                for split in input_config.splits
            ]
            fingerprint = '\n'.join(per_split)
            current.set_string_custom_property(FINGERPRINT, fingerprint)

            known = self._metadata_handler.get_artifacts_by_uri(current.uri)
            same_fingerprint = [
                stored for stored in known
                if stored.custom_properties[FINGERPRINT].string_value ==
                fingerprint
            ]

            if not same_fingerprint:
                # Nothing to reuse: register this input as a new artifact.
                # TODO(jyzhao): support span.
                current.span = 1
                # TODO(jyzhao): whether driver should be read-only for
                # metadata.
                [published] = self._metadata_handler.publish_artifacts(
                    [current])  # pylint: disable=unbalanced-tuple-unpacking
                tf.logging.info('Registered new input: {}'.format(published))
                current.set_artifact(published)
                continue

            # If there are multiple matches, get the latest one for caching.
            # Using id because spans are the same for matched artifacts.
            newest = max(same_fingerprint,
                         key=lambda candidate: candidate.id)
            tf.logging.info('latest_artifact {}.'.format(newest))
            tf.logging.info('type(latest_artifact) {}.'.format(type(newest)))
            current.set_artifact(newest)

    return input_dict
def testGeneratesFingerprint(self):
    """Checks the fingerprint string produced for two files under 'fp'."""
    fp_dir = os.path.join(self._base_dir, 'fp')
    # (file name, contents, (atime, mtime)) - the timestamps are pinned so
    # the expected fingerprint below is reproducible (the checksums are
    # presumably derived from the modification times; verify in io_utils).
    fixtures = (
        ('data1', 'testing', (0, 1)),
        ('data2', 'testing2', (0, 3)),
    )
    for file_name, contents, times in fixtures:
        file_path = os.path.join(fp_dir, file_name)
        io_utils.write_string_file(file_path, contents)
        os.utime(file_path, times)

    expected = (
        'split:split,num_files:2,total_bytes:15,xor_checksum:2,sum_checksum:4')
    actual = io_utils.generate_fingerprint('split',
                                           os.path.join(fp_dir, '*'))
    self.assertEqual(expected, actual)
def testGeneratesFingerprint(self):
    """Checks the fingerprint string produced for two files under 'fp'."""
    self.createFiles({'fp': {'data1': 'testing', 'data2': 'testing2'}})
    # Pin (atime, mtime) so the expected fingerprint below is reproducible
    # (the checksums are presumably derived from the modification times;
    # verify in io_utils).
    for file_name, mtime in (('data1', 1), ('data2', 3)):
        os.utime(self.relpath('fp', file_name), (0, mtime))

    glob_pattern = os.path.join(self.relpath('fp'), '*')
    actual = io_utils.generate_fingerprint('split', glob_pattern)
    expected = (
        'split:split,num_files:2,total_bytes:15,xor_checksum:2,sum_checksum:4')
    self.assertEqual(expected, actual)
def calculate_splits_fingerprint_and_span(
    input_base_uri: Text, splits: Iterable[example_gen_pb2.Input.Split]
) -> Tuple[Text, Optional[Text]]:
    """Calculates the fingerprint of files in a URI matching split patterns.

    If a pattern has the {SPAN} placeholder, attempts to find an identical
    value across splits that results in all splits having the most recently
    updated files. Only splits whose pattern contains the placeholder take
    part in that comparison; splits without it are fingerprinted as-is, and
    the result does not depend on the order in which the two kinds of split
    appear.

    Args:
      input_base_uri: The base path from which files will be searched.
      splits: An iterable collection of example_gen_pb2.Input.Split objects.
        Note that this function will update the {SPAN} in this split config
        to actual Span number.

    Returns:
      A Tuple of [fingerprint, select_span], where fingerprint is the
      newline-joined per-split fingerprints and select_span is either the
      value matched with the {SPAN} placeholder, or '0' if no split specified
      the placeholder.

    Raises:
      ValueError: If two splits with the {SPAN} placeholder resolve to
        different latest spans.
    """
    split_fingerprints = []
    select_span = None

    # Calculate the fingerprint of files under input_base_uri.
    for split in splits:
        logging.info('select span = %s', select_span)
        if SPAN_SPEC in split.pattern:
            latest_span = _retrieve_latest_span(input_base_uri, split)
            logging.info('latest span = %s', latest_span)
            if select_span is None:
                select_span = latest_span
            if select_span != latest_span:
                raise ValueError(
                    'Latest span should be the same for each split: %s != %s'
                    % (select_span, latest_span))
            split.pattern = split.pattern.replace(SPAN_SPEC, select_span)

        # Calculate fingerprint
        pattern = os.path.join(input_base_uri, split.pattern)
        split_fingerprint = io_utils.generate_fingerprint(split.name, pattern)
        split_fingerprints.append(split_fingerprint)

    # Apply the default only once every split has been seen. Defaulting
    # inside the loop would make a split without {SPAN} pin the span to '0'
    # and cause a spurious mismatch error for any later split with {SPAN}.
    if select_span is None:
        select_span = '0'

    fingerprint = '\n'.join(split_fingerprints)
    return fingerprint, select_span
def resolve_input_artifacts(
    self,
    input_channels: Dict[Text, types.Channel],
    exec_properties: Dict[Text, Any],
    driver_args: data_types.DriverArgs,
    pipeline_info: data_types.PipelineInfo,
) -> Dict[Text, List[types.Artifact]]:
    """Overrides BaseDriver.resolve_input_artifacts().

    Each input artifact gets a fingerprint (one entry per configured split,
    newline separated) and a span stored as custom properties. An artifact
    already known to the metadata handler at the same uri with the same
    fingerprint and span is reused (highest id wins); otherwise the input is
    published as a new artifact. Split patterns containing the span
    placeholder are rewritten with the resolved span, and the updated config
    is written back to exec_properties['input_config'].

    Args:
      input_channels: Input channels, unwrapped into artifact lists here.
      exec_properties: Execution properties; 'input_config' must hold a JSON
        serialized example_gen_pb2.Input and is overwritten on return.
      driver_args: Unused.
      pipeline_info: Unused.

    Returns:
      Mapping from input key to the list of resolved artifacts.

    Raises:
      ValueError: If splits carrying the span placeholder resolve to
        different latest spans.
    """
    del driver_args  # unused
    del pipeline_info  # unused

    input_config = example_gen_pb2.Input()
    json_format.Parse(exec_properties['input_config'], input_config)

    input_dict = channel_utils.unwrap_channel_dict(input_channels)
    for artifacts in input_dict.values():
        for current in artifacts:
            absl.logging.debug('Processing input %s.' % current.uri)
            absl.logging.debug('single_input %s.' % current)
            absl.logging.debug('single_input.artifact %s.' % current.artifact)

            # Set the fingerprint of input.
            resolved_span = None
            per_split = []
            for split in input_config.splits:
                # If SPAN is specified, pipeline will process the latest
                # span, note that this span number must be the same for all
                # splits and it will be stored in metadata as the span of
                # input artifact.
                if _SPAN_SPEC in split.pattern:
                    newest_span = self._retrieve_latest_span(
                        current.uri, split)
                    if resolved_span is None:
                        resolved_span = newest_span
                    elif resolved_span != newest_span:
                        raise ValueError(
                            'Latest span should be the same for each split: %s != %s'
                            % (resolved_span, newest_span))
                    split.pattern = split.pattern.replace(
                        _SPAN_SPEC, resolved_span)

                per_split.append(
                    io_utils.generate_fingerprint(
                        split.name,
                        os.path.join(current.uri, split.pattern)))

            fingerprint = '\n'.join(per_split)
            current.set_string_custom_property(_FINGERPRINT, fingerprint)

            # No split carried the span placeholder: record span '0'.
            span_value = '0' if resolved_span is None else resolved_span
            current.set_string_custom_property(_SPAN, span_value)

            known = self._metadata_handler.get_artifacts_by_uri(current.uri)
            reusable = [
                stored for stored in known
                if stored.custom_properties[_FINGERPRINT].string_value ==
                fingerprint and
                stored.custom_properties[_SPAN].string_value == span_value
            ]

            if not reusable:
                # TODO(jyzhao): whether driver should be read-only for
                # metadata.
                [published] = self._metadata_handler.publish_artifacts(
                    [current])  # pylint: disable=unbalanced-tuple-unpacking
                absl.logging.debug('Registered new input: %s' % (published))
                current.set_artifact(published)
                continue

            # TODO(b/138845899): consider use span instead of id.
            # If there are multiple matches, get the latest one for caching.
            # Using id because spans are the same for matched artifacts.
            newest = max(reusable, key=lambda candidate: candidate.id)
            absl.logging.debug('latest_artifact %s.' % (newest))
            absl.logging.debug('type(latest_artifact) %s.' % type(newest))
            current.set_artifact(newest)

    exec_properties['input_config'] = json_format.MessageToJson(
        input_config, sort_keys=True)
    return input_dict