def _import_artifacts(self, source_uri: List[Text], reimport: bool,
                      destination_channel: types.Channel,
                      split_names: List[Text]) -> List[types.Artifact]:
    """Imports external resources into MLMD, one artifact per source uri.

    `source_uri` and `split_names` are walked in lockstep with `zip`, so any
    extra entries in the longer of the two lists are ignored.

    For every (uri, split) pair a fresh artifact of the destination channel's
    type is created. Artifacts already stored under the same uri are looked up
    through the metadata handler; when the output type lists SPLIT_KEY among
    its PROPERTIES, only stored artifacts whose 'split_names' property equals
    the encoded split name are kept as reuse candidates. If at least one
    candidate remains and `reimport` is False, the candidate with the highest
    id is reused; otherwise the new artifact is published.

    Args:
      source_uri: Uris of the external resources to import.
      reimport: If True, always publish a new artifact, even when a matching
        one is already stored.
      destination_channel: Channel whose `type` is instantiated to build each
        output artifact.
      split_names: Split name for each uri; a falsy entry is treated as the
        empty split name.

    Returns:
      The list of artifacts, in the order the (uri, split) pairs were
      processed.
    """
    imported = []
    for uri, split in zip(source_uri, split_names):
        absl.logging.info('Processing source uri: %s, split: %s' %
                          (uri, split or 'NO_SPLIT'))

        result = destination_channel.type()

        # TODO(ccy): refactor importer to treat split name just like any other
        # property.
        stored_artifacts = self._metadata_handler.get_artifacts_by_uri(uri)
        encoded_split = artifact_utils.encode_split_names([split or ''])

        # Collect the stored artifacts that are eligible for reuse.
        reusable = []
        for stored in stored_artifacts:
            if result.PROPERTIES and SPLIT_KEY in result.PROPERTIES:
                # Split-aware type: skip stored artifacts whose recorded split
                # names are missing or differ from the requested ones.
                stored_split = stored.properties.get('split_names', None)
                split_matches = (
                    bool(stored_split)
                    and stored_split.string_value == encoded_split)
                if not split_matches:
                    continue
            # Either the type has no split property, or the split matched.
            reusable.append(stored)

        if SPLIT_KEY in result.artifact_type.properties:
            result.split_names = encoded_split
        result.uri = uri

        # Reuse the most recent matching artifact unless the caller asked for
        # a re-import; otherwise register the external resource in MLMD using
        # the type information of the destination channel.
        if reusable and not reimport:
            absl.logging.info('Reusing existing artifact')
            mlmd_artifact = max(reusable, key=lambda m: m.id)
        else:
            [mlmd_artifact] = self._metadata_handler.publish_artifacts(
                [result])
            absl.logging.info('Registered new artifact: %s' % mlmd_artifact)
        result.set_mlmd_artifact(mlmd_artifact)

        imported.append(result)

    return imported
def _prepare_artifact(
        self,
        uri: Text,
        properties: Dict[Text, Any],
        custom_properties: Dict[Text, Any],
        reimport: bool,
        destination_channel: types.Channel) -> types.Artifact:
    """Builds the Importer's output artifact, reusing a stored match if any.

    Artifacts already stored in MLMD under `uri` are compared against the
    requested `properties` and `custom_properties`. When at least one of them
    matches on every requested value and `reimport` is False, the returned
    artifact is bound to the matching MLMD artifact with the highest id.
    Otherwise the returned artifact only carries the requested uri and
    property values.

    Args:
      uri: The uri of the artifact.
      properties: The properties of the artifact, given as a dictionary from
        string keys to integer / string values. Each key is read and written
        as an attribute of the destination channel's output type.
      custom_properties: The custom properties of the artifact, given as a
        dictionary from string keys to integer / string values.
      reimport: If set to True, an already stored artifact is never reused.
      destination_channel: Destination channel for the imported artifact; its
        `type` is instantiated to build the result.

    Returns:
      An Artifact object representing the imported artifact.

    Raises:
      ValueError: If a custom property value is neither an int nor a
        text / bytes string.
    """
    absl.logging.info(
        'Processing source uri: %s, properties: %s, custom_properties: %s' %
        (uri, properties, custom_properties))

    # Only integer and string-like custom property values are supported.
    for name, val in custom_properties.items():
        if not isinstance(val, (int, Text, bytes)):
            raise ValueError((
                'Custom property value for key %r must be a string or integer '
                '(got %r instead)') % (name, val))

    def custom_property_differs(artifact, name, expected):
        # Compares through the getter that corresponds to the expected
        # value's type.
        if isinstance(expected, int):
            return artifact.get_int_custom_property(name) != expected
        if isinstance(expected, (Text, bytes)):
            # NOTE(review): if the getter returns text, a bytes `expected`
            # would never compare equal on Python 3 - confirm whether bytes
            # values are meant to be reusable.
            return artifact.get_string_custom_property(name) != expected
        return False

    stored_artifacts = self._metadata_handler.get_artifacts_by_uri(uri)

    # A stored artifact is a reuse candidate only if every requested property
    # and custom property has the same value on it.
    reusable = []
    for stored in stored_artifacts:
        wrapped = destination_channel.type()
        wrapped.set_mlmd_artifact(stored)
        properties_differ = any(
            getattr(wrapped, name) != expected
            for name, expected in properties.items())
        custom_properties_differ = any(
            custom_property_differs(wrapped, name, expected)
            for name, expected in custom_properties.items())
        if not (properties_differ or custom_properties_differ):
            reusable.append(stored)

    # Populate a fresh artifact with the requested uri and property values.
    result = destination_channel.type()
    result.uri = uri
    for name, val in properties.items():
        setattr(result, name, val)
    for name, val in custom_properties.items():
        if isinstance(val, int):
            result.set_int_custom_property(name, val)
        elif isinstance(val, (Text, bytes)):
            result.set_string_custom_property(name, val)

    # Nothing to reuse, or the user explicitly asked for a re-import.
    if reimport or not reusable:
        return result

    absl.logging.info('Reusing existing artifact')
    result.set_mlmd_artifact(max(reusable, key=lambda m: m.id))
    return result