def validate_platform(cls, values: Dict) -> Dict:
    # Derive and validate the "platform" setting from the configured path specs.
    value = values.get("platform")
    if value is not None and value != "":
        return values
    if not values.get("path_specs") and not values.get("path_spec"):
        raise ValueError("Either path_specs or path_spec needs to be specified")
    if values.get("path_specs") and values.get("path_spec"):
        raise ValueError(
            "Either path_specs or path_spec needs to be specified, but not both"
        )
    if values.get("path_spec"):
        logger.warning(
            "The path_spec config property is deprecated; please use path_specs instead."
        )
        values["path_specs"] = [values.get("path_spec")]
    bucket_name: str = ""
    for path_spec in values.get("path_specs", []):
        if path_spec.is_s3():
            platform = "s3"
        else:
            if values.get("use_s3_object_tags") or values.get("use_s3_bucket_tags"):
                raise ValueError(
                    "Cannot grab s3 tags for platform != s3. Remove the flag or use s3."
                )
            platform = "file"
        if values.get("platform", "") != "":
            if values["platform"] != platform:
                raise ValueError("All path_specs must belong to the same platform")
        else:
            values["platform"] = platform
            logger.debug(f'Setting config "platform": {values.get("platform")}')
        if platform == "s3":
            if bucket_name == "":
                bucket_name = get_bucket_name(path_spec.include)
            elif bucket_name != get_bucket_name(path_spec.include):
                raise ValueError("All path_specs must reference the same s3 bucket")
    return values
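
# Sketch (assumption, not from this file): minimal pydantic v1 wiring for a
# values-level validator like validate_platform. The model and fields below
# are hypothetical stand-ins for the real source config.
import pydantic
from typing import Dict, List, Optional


class _ExampleConfig(pydantic.BaseModel):
    platform: str = ""
    path_spec: Optional[str] = None
    path_specs: Optional[List[str]] = None

    @pydantic.root_validator(skip_on_failure=True)
    def _derive_platform(cls, values: Dict) -> Dict:
        # Stand-in for validate_platform: derive "platform" when unset.
        if not values.get("platform") and values.get("path_specs"):
            first = values["path_specs"][0]
            values["platform"] = "s3" if first.startswith("s3://") else "file"
        return values


# _ExampleConfig(path_specs=["s3://bucket/data/*.parquet"]).platform == "s3"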
def get_s3_tags() -> Optional[GlobalTagsClass]:
    # Nested helper: closes over `self`, `table`, and `dataset_urn` from the
    # enclosing scope.
    bucket_name = s3_util.get_bucket_name(table["StorageDescriptor"]["Location"])
    tags_to_add = []
    if self.source_config.use_s3_bucket_tags:
        try:
            bucket_tags = self.s3_client.get_bucket_tagging(Bucket=bucket_name)
            tags_to_add.extend(
                make_tag_urn(f'{tag["Key"]}:{tag["Value"]}')
                for tag in bucket_tags["TagSet"]
            )
        except self.s3_client.exceptions.ClientError:
            logger.warning(f"No tags found for bucket={bucket_name}")
    if self.source_config.use_s3_object_tags:
        key_prefix = s3_util.get_key_prefix(table["StorageDescriptor"]["Location"])
        object_tagging = self.s3_client.get_object_tagging(
            Bucket=bucket_name, Key=key_prefix
        )
        tag_set = object_tagging["TagSet"]
        if tag_set:
            tags_to_add.extend(
                make_tag_urn(f'{tag["Key"]}:{tag["Value"]}') for tag in tag_set
            )
        else:
            # Unlike bucket tags, an object without tags yields an empty
            # TagSet rather than an exception.
            logger.warning(f"No tags found for bucket={bucket_name} key={key_prefix}")
    if len(tags_to_add) == 0:
        return None
    if self.ctx.graph is not None:
        logger.debug("Connected to DatahubApi, grabbing current tags to maintain.")
        current_tags: Optional[GlobalTagsClass] = self.ctx.graph.get_aspect_v2(
            entity_urn=dataset_urn,
            aspect="globalTags",
            aspect_type=GlobalTagsClass,
        )
        if current_tags:
            tags_to_add.extend(current_tag.tag for current_tag in current_tags.tags)
    else:
        logger.warning("Could not connect to DatahubApi. No current tags to maintain.")
    # Remove duplicate tags.
    tags_to_add = list(set(tags_to_add))
    return GlobalTagsClass(
        tags=[TagAssociationClass(tag_to_add) for tag_to_add in tags_to_add]
    )
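
# Standalone sketch (assumption) of the two boto3 calls get_s3_tags depends
# on, handy for checking tag permissions outside the connector. Bucket and
# key below are placeholders.
import boto3


def _probe_s3_tags(bucket: str, key: str) -> None:
    s3 = boto3.client("s3")
    try:
        # get_bucket_tagging raises ClientError (NoSuchTagSet) when the
        # bucket has no tags.
        for tag in s3.get_bucket_tagging(Bucket=bucket)["TagSet"]:
            print(f'bucket tag {tag["Key"]}:{tag["Value"]}')
    except s3.exceptions.ClientError:
        print(f"No tags found for bucket={bucket}")
    # get_object_tagging returns an empty TagSet instead of raising when
    # the object has no tags.
    for tag in s3.get_object_tagging(Bucket=bucket, Key=key)["TagSet"]:
        print(f'object tag {tag["Key"]}:{tag["Value"]}')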
def s3_browser(self) -> Iterable[tuple]:
    if self.source_config.aws_config is None:
        raise ValueError("aws_config not set. Cannot browse s3")
    s3 = self.source_config.aws_config.get_s3_resource()
    bucket_name = get_bucket_name(self.source_config.path_spec.include)
    logger.debug(f"Scanning bucket: {bucket_name}")
    bucket = s3.Bucket(bucket_name)
    prefix = self.get_prefix(
        get_bucket_relative_path(self.source_config.path_spec.include)
    )
    logger.debug(f"Scanning objects with prefix: {prefix}")
    for obj in bucket.objects.filter(Prefix=prefix).page_size(1000):
        s3_path = f"s3://{obj.bucket_name}/{obj.key}"
        yield s3_path, obj.last_modified
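
# Minimal standalone equivalent (assumption) of the listing loop above, for
# experimenting outside the connector; bucket and prefix are placeholders.
import boto3


def _browse_s3(bucket_name: str, prefix: str):
    bucket = boto3.resource("s3").Bucket(bucket_name)
    # page_size(1000) only tunes how many keys each underlying list request
    # returns; the collection still iterates across pages transparently.
    for obj in bucket.objects.filter(Prefix=prefix).page_size(1000):
        yield f"s3://{obj.bucket_name}/{obj.key}", obj.last_modified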
def create_container_hierarchy(
    self, table_data: TableData, dataset_urn: str
) -> Iterable[MetadataWorkUnit]:
    logger.debug(f"Creating containers for {dataset_urn}")
    base_full_path = table_data.table_path
    parent_key = None
    if table_data.is_s3:
        bucket_name = get_bucket_name(table_data.table_path)
        bucket_key = self.gen_bucket_key(bucket_name)
        yield from self.create_emit_containers(
            container_key=bucket_key,
            name=bucket_name,
            sub_types=["S3 bucket"],
            parent_container_key=None,
        )
        parent_key = bucket_key
        base_full_path = get_bucket_relative_path(table_data.table_path)
    parent_folder_path = (
        base_full_path[: base_full_path.rfind("/")]
        if base_full_path.rfind("/") != -1
        else ""
    )
    for folder in parent_folder_path.split("/"):
        abs_path = folder
        if parent_key:
            prefix: str = ""
            if isinstance(parent_key, S3BucketKey):
                prefix = parent_key.bucket_name
            elif isinstance(parent_key, FolderKey):
                prefix = parent_key.folder_abs_path
            abs_path = prefix + "/" + folder
        folder_key = self.gen_folder_key(abs_path)
        yield from self.create_emit_containers(
            container_key=folder_key,
            name=folder,
            sub_types=["Folder"],
            parent_container_key=parent_key,
        )
        parent_key = folder_key
    if parent_key is None:
        logger.warning(f"Failed to associate dataset ({dataset_urn}) with a container")
        return
    yield from add_dataset_to_container(parent_key, dataset_urn)
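
# Sketch (not from the source) isolating the container-path logic above:
# every ancestor folder of the table path becomes one container, keyed by
# its absolute path so same-named folders under different parents stay
# distinct.
def _ancestor_folders(base_full_path: str) -> List[str]:
    parent = (
        base_full_path[: base_full_path.rfind("/")] if "/" in base_full_path else ""
    )
    folders, abs_path = [], ""
    for folder in parent.split("/") if parent else []:
        abs_path = f"{abs_path}/{folder}" if abs_path else folder
        folders.append(abs_path)
    return folders


# _ancestor_folders("data/year=2021/month=01/part-0000.parquet")
# -> ["data", "data/year=2021", "data/year=2021/month=01"]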
def ingest_table(
    self, table_data: TableData, path_spec: PathSpec
) -> Iterable[MetadataWorkUnit]:
    logger.info(f"Extracting table schema from file: {table_data.full_path}")
    browse_path: str = (
        strip_s3_prefix(table_data.table_path)
        if table_data.is_s3
        else table_data.table_path.strip("/")
    )
    data_platform_urn = make_data_platform_urn(self.source_config.platform)
    logger.info(f"Creating dataset urn with name: {browse_path}")
    dataset_urn = make_dataset_urn_with_platform_instance(
        self.source_config.platform,
        browse_path,
        self.source_config.platform_instance,
        self.source_config.env,
    )
    dataset_snapshot = DatasetSnapshot(
        urn=dataset_urn,
        aspects=[],
    )
    dataset_properties = DatasetPropertiesClass(
        description="",
        name=table_data.display_name,
        customProperties={
            "number_of_files": str(table_data.number_of_files),
            "size_in_bytes": str(table_data.size_in_bytes),
        },
    )
    dataset_snapshot.aspects.append(dataset_properties)
    fields = self.get_fields(table_data, path_spec)
    schema_metadata = SchemaMetadata(
        schemaName=table_data.display_name,
        platform=data_platform_urn,
        version=0,
        hash="",
        fields=fields,
        platformSchema=OtherSchemaClass(rawSchema=""),
    )
    dataset_snapshot.aspects.append(schema_metadata)
    if self.source_config.use_s3_bucket_tags or self.source_config.use_s3_object_tags:
        bucket = get_bucket_name(table_data.table_path)
        key_prefix = (
            get_key_prefix(table_data.table_path)
            if table_data.full_path == table_data.table_path
            else None
        )
        s3_tags = self.get_s3_tags(bucket, key_prefix, dataset_urn)
        if s3_tags is not None:
            dataset_snapshot.aspects.append(s3_tags)
    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    wu = MetadataWorkUnit(id=table_data.table_path, mce=mce)
    self.report.report_workunit(wu)
    yield wu
    yield from self.create_container_hierarchy(table_data, dataset_urn)
    if self.source_config.profiling.enabled:
        yield from self.get_table_profile(table_data, dataset_urn)
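
# Illustrative check (assumption; names and values are placeholders) of the
# dataset URN shape ingest_table builds for an s3 table.
from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance

_example_urn = make_dataset_urn_with_platform_instance(
    platform="s3",
    name="my-bucket/data/year=2021",
    platform_instance=None,
    env="PROD",
)
# -> "urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/data/year=2021,PROD)"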