def ingest_table(self, table_data: TableData) -> Iterable[MetadataWorkUnit]:
    logger.info(f"Extracting table schema from file: {table_data.full_path}")
    # S3 paths keep their bucket-relative form; local paths drop surrounding slashes
    browse_path: str = (
        strip_s3_prefix(table_data.table_path)
        if table_data.is_s3
        else table_data.table_path.strip("/")
    )

    data_platform_urn = make_data_platform_urn(self.source_config.platform)
    logger.info(f"Creating dataset urn with name: {browse_path}")
    dataset_urn = make_dataset_urn_with_platform_instance(
        self.source_config.platform,
        browse_path,
        self.source_config.platform_instance,
        self.source_config.env,
    )

    dataset_snapshot = DatasetSnapshot(
        urn=dataset_urn,
        aspects=[],
    )

    dataset_properties = DatasetPropertiesClass(
        description="",
        name=table_data.display_name,
        customProperties={},
    )
    dataset_snapshot.aspects.append(dataset_properties)

    # infer the schema from the file contents and attach it as an aspect
    fields = self.get_fields(table_data)
    schema_metadata = SchemaMetadata(
        schemaName=table_data.display_name,
        platform=data_platform_urn,
        version=0,
        hash="",
        fields=fields,
        platformSchema=OtherSchemaClass(rawSchema=""),
    )
    dataset_snapshot.aspects.append(schema_metadata)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    wu = MetadataWorkUnit(id=table_data.table_path, mce=mce)
    self.report.report_workunit(wu)
    yield wu

    yield from self.create_container_hierarchy(table_data, dataset_urn)

    if self.source_config.profiling.enabled:
        yield from self.get_table_profile(table_data, dataset_urn)
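# TableData is referenced throughout this file but defined elsewhere. A minimal
# sketch of the shape it is assumed to have, based only on the attributes these
# methods actually read; the real class very likely carries more fields.
from dataclasses import dataclass


@dataclass
class _TableDataSketch:  # hypothetical stand-in, not the real TableData
    display_name: str
    is_s3: bool
    full_path: str
    table_path: str
    number_of_files: int  # read by the customProperties variant of ingest_table below
    size_in_bytes: int  # read by the customProperties variant of ingest_table below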
def get_workunits_s3(self) -> Iterable[MetadataWorkUnit]:
    plain_base_path = strip_s3_prefix(self.source_config.base_path)
    # append a trailing slash if it's not there so prefix filtering works
    if not plain_base_path.endswith("/"):
        plain_base_path = plain_base_path + "/"

    if self.source_config.aws_config is None:
        raise ValueError("AWS config is required for S3 file sources")

    s3 = self.source_config.aws_config.get_s3_resource()
    bucket = s3.Bucket(plain_base_path.split("/")[0])

    base_obj_paths = []

    for obj in bucket.objects.filter(
        Prefix=plain_base_path.split("/", maxsplit=1)[1]
    ):
        s3_path = f"s3://{obj.bucket_name}/{obj.key}"

        # if schema patterns do not allow this file, skip it
        if not self.source_config.schema_patterns.allowed(s3_path):
            continue

        # if the key is a directory marker, skip it
        if obj.key.endswith("/"):
            continue

        file = os.path.basename(obj.key)

        if self.source_config.ignore_dotfiles and file.startswith("."):
            continue

        base_obj_path = f"{obj.bucket_name}/{obj.key}"
        base_obj_paths.append(base_obj_path)

    for aws_file in sorted(base_obj_paths):
        relative_path = "./" + aws_file[len(plain_base_path):]

        # pass in the same relative_path as the full_path for S3 files
        yield from self.ingest_table(aws_file, relative_path, is_aws=True)
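# A quick, self-contained illustration (hypothetical helper, not part of this
# module) of how the base path is partitioned above: the first "/" component
# of the stripped path is the bucket name, and the remainder is the key prefix
# handed to bucket.objects.filter().
def _split_bucket_and_prefix(plain_base_path: str) -> tuple:
    bucket_name, _, key_prefix = plain_base_path.partition("/")
    return bucket_name, key_prefix


assert _split_bucket_and_prefix("my-bucket/raw/events/") == ("my-bucket", "raw/events/")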
def get_workunits_s3(self) -> Iterable[MetadataWorkUnit]:
    plain_base_path = strip_s3_prefix(self.source_config.base_path)
    # append a trailing slash if it's not there so prefix filtering works
    if not plain_base_path.endswith("/"):
        plain_base_path = plain_base_path + "/"

    if self.source_config.aws_config is None:
        raise ValueError("AWS config is required for S3 file sources")

    s3 = self.source_config.aws_config.get_s3_resource()
    bucket = s3.Bucket(plain_base_path.split("/")[0])

    base_obj_paths: List[Tuple[str, Dict[str, str]]] = []

    for obj in bucket.objects.filter(
        Prefix=plain_base_path.split("/", maxsplit=1)[1]
    ):
        s3_path = f"s3://{obj.bucket_name}/{obj.key}"

        # if schema patterns do not allow this file, skip it
        if not self.source_config.schema_patterns.allowed(s3_path):
            continue

        # if the key is a directory marker, skip it
        if obj.key.endswith("/"):
            continue

        file = os.path.basename(obj.key)

        if self.source_config.ignore_dotfiles and file.startswith("."):
            continue

        base_obj_path = f"{obj.bucket_name}/{obj.key}"
        # capture object-level metadata to surface as dataset properties
        properties = {
            "owner": str(obj.owner) if obj.owner else "",
            "e_tag": str(obj.e_tag) if obj.e_tag else "",
            "last_modified": str(obj.last_modified) if obj.last_modified else "",
            "size": str(obj.size) if obj.size else "",
            "storage_class": str(obj.storage_class) if obj.storage_class else "",
            "service_name": str(obj.meta.service_name)
            if obj.meta and obj.meta.service_name
            else "",
        }
        logger.debug(f"Adding file {base_obj_path} for ingestion")
        base_obj_paths.append((base_obj_path, properties))

    for aws_file in sorted(base_obj_paths, key=lambda a: a[0]):
        path = aws_file[0]
        properties = aws_file[1]
        relative_path = "./" + path[len(plain_base_path):]

        # pass in the same relative_path as the full_path for S3 files
        yield from self.ingest_table(
            path, relative_path, is_aws=True, properties=properties
        )
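# strip_s3_prefix is imported from elsewhere; this sketch only shows the
# behavior the code above relies on: dropping the URI scheme so the remainder
# is "<bucket>/<key...>". The scheme list here is an assumption; the real
# helper may accept a different set.
def _strip_s3_prefix_sketch(s3_uri: str) -> str:
    for prefix in ("s3://", "s3n://", "s3a://"):  # assumed schemes
        if s3_uri.startswith(prefix):
            return s3_uri[len(prefix):]
    return s3_uri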
def ingest_table(
    self, table_data: TableData, path_spec: PathSpec
) -> Iterable[MetadataWorkUnit]:
    logger.info(f"Extracting table schema from file: {table_data.full_path}")
    browse_path: str = (
        strip_s3_prefix(table_data.table_path)
        if table_data.is_s3
        else table_data.table_path.strip("/")
    )

    data_platform_urn = make_data_platform_urn(self.source_config.platform)
    logger.info(f"Creating dataset urn with name: {browse_path}")
    dataset_urn = make_dataset_urn_with_platform_instance(
        self.source_config.platform,
        browse_path,
        self.source_config.platform_instance,
        self.source_config.env,
    )

    dataset_snapshot = DatasetSnapshot(
        urn=dataset_urn,
        aspects=[],
    )

    dataset_properties = DatasetPropertiesClass(
        description="",
        name=table_data.display_name,
        customProperties={
            "number_of_files": str(table_data.number_of_files),
            "size_in_bytes": str(table_data.size_in_bytes),
        },
    )
    dataset_snapshot.aspects.append(dataset_properties)

    fields = self.get_fields(table_data, path_spec)
    schema_metadata = SchemaMetadata(
        schemaName=table_data.display_name,
        platform=data_platform_urn,
        version=0,
        hash="",
        fields=fields,
        platformSchema=OtherSchemaClass(rawSchema=""),
    )
    dataset_snapshot.aspects.append(schema_metadata)

    if self.source_config.use_s3_bucket_tags or self.source_config.use_s3_object_tags:
        bucket = get_bucket_name(table_data.table_path)
        # only look up object tags when the table path points at a single object
        key_prefix = (
            get_key_prefix(table_data.table_path)
            if table_data.full_path == table_data.table_path
            else None
        )
        s3_tags = self.get_s3_tags(bucket, key_prefix, dataset_urn)
        if s3_tags is not None:
            dataset_snapshot.aspects.append(s3_tags)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    wu = MetadataWorkUnit(id=table_data.table_path, mce=mce)
    self.report.report_workunit(wu)
    yield wu

    yield from self.create_container_hierarchy(table_data, dataset_urn)

    if self.source_config.profiling.enabled:
        yield from self.get_table_profile(table_data, dataset_urn)
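# A minimal, hypothetical driver showing how the MetadataWorkUnit generators
# above are typically consumed; "source", "table_data", and "path_spec" are
# placeholders supplied by the caller, and any real sink is out of scope here.
def _run_example(source, table_data, path_spec) -> None:
    for workunit in source.ingest_table(table_data, path_spec):
        # each work unit wraps one MetadataChangeEvent, keyed by the table path
        logger.info(f"Emitted workunit: {workunit.id}")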