Code example #1
File: __init__.py  Project: arunvasudevan/datahub
    def ingest_table(self,
                     table_data: TableData) -> Iterable[MetadataWorkUnit]:

        logger.info(
            f"Extracting table schema from file: {table_data.full_path}")
        browse_path: str = (strip_s3_prefix(table_data.table_path)
                            if table_data.is_s3 else
                            table_data.table_path.strip("/"))

        data_platform_urn = make_data_platform_urn(self.source_config.platform)
        logger.info(f"Creating dataset urn with name: {browse_path}")
        dataset_urn = make_dataset_urn_with_platform_instance(
            self.source_config.platform,
            browse_path,
            self.source_config.platform_instance,
            self.source_config.env,
        )

        dataset_snapshot = DatasetSnapshot(
            urn=dataset_urn,
            aspects=[],
        )

        dataset_properties = DatasetPropertiesClass(
            description="",
            name=table_data.display_name,
            customProperties={},
        )
        dataset_snapshot.aspects.append(dataset_properties)

        fields = self.get_fields(table_data)
        schema_metadata = SchemaMetadata(
            schemaName=table_data.display_name,
            platform=data_platform_urn,
            version=0,
            hash="",
            fields=fields,
            platformSchema=OtherSchemaClass(rawSchema=""),
        )
        dataset_snapshot.aspects.append(schema_metadata)

        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = MetadataWorkUnit(id=table_data.table_path, mce=mce)
        self.report.report_workunit(wu)
        yield wu

        yield from self.create_container_hierarchy(table_data, dataset_urn)

        if self.source_config.profiling.enabled:
            yield from self.get_table_profile(table_data, dataset_urn)
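Note: ingest_table only reads a handful of attributes off TableData. A minimal sketch of the assumed shape, as a plain dataclass (the real DataHub class is likely to carry more fields than this):

from dataclasses import dataclass

@dataclass
class TableData:
    # Only the attributes the snippet above actually touches.
    display_name: str  # human-readable dataset name
    full_path: str     # e.g. "s3://bucket/prefix/file.csv"
    table_path: str    # path the browse path / dataset urn is built from
    is_s3: bool        # True when the table lives in S3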
Code example #2
File: __init__.py  Project: swaroopjagadish/datahub
    def get_workunits_s3(self) -> Iterable[MetadataWorkUnit]:

        plain_base_path = strip_s3_prefix(self.source_config.base_path)

        # append a trailing slash if it's not there so prefix filtering works
        if not plain_base_path.endswith("/"):
            plain_base_path = plain_base_path + "/"

        if self.source_config.aws_config is None:
            raise ValueError("AWS config is required for S3 file sources")

        s3 = self.source_config.aws_config.get_s3_resource()
        bucket = s3.Bucket(plain_base_path.split("/")[0])

        base_obj_paths = []

        for obj in bucket.objects.filter(
                Prefix=plain_base_path.split("/", maxsplit=1)[1]):

            s3_path = f"s3://{obj.bucket_name}/{obj.key}"

            # if table patterns do not allow this file, skip
            if not self.source_config.schema_patterns.allowed(s3_path):
                continue

            # if the file is a directory, skip it
            if obj.key.endswith("/"):
                continue

            file = os.path.basename(obj.key)

            if self.source_config.ignore_dotfiles and file.startswith("."):
                continue

            base_obj_path = f"{obj.bucket_name}/{obj.key}"

            base_obj_paths.append(base_obj_path)

        for aws_file in sorted(base_obj_paths):

            relative_path = "./" + aws_file[len(plain_base_path):]

            # pass in the same relative_path as the full_path for S3 files
            yield from self.ingest_table(aws_file, relative_path, is_aws=True)
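Note: the bucket/prefix handling above is easy to misread, since both the bucket name and the listing filter come out of plain_base_path. A small worked example with a hypothetical base path, including a minimal stand-in for strip_s3_prefix (assumed here to do nothing more than drop the URI scheme):

def strip_s3_prefix(s3_uri: str) -> str:
    # Stand-in for the DataHub helper: drop a leading s3 scheme prefix.
    for scheme in ("s3://", "s3n://", "s3a://"):
        if s3_uri.startswith(scheme):
            return s3_uri[len(scheme):]
    return s3_uri

plain_base_path = strip_s3_prefix("s3://my-bucket/data/tables")
if not plain_base_path.endswith("/"):
    plain_base_path += "/"

# the first path segment is the bucket, the rest is the Prefix filter
assert plain_base_path.split("/")[0] == "my-bucket"
assert plain_base_path.split("/", maxsplit=1)[1] == "data/tables/"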
Code example #3
File: __init__.py  Project: arunvasudevan/datahub
    def get_workunits_s3(self) -> Iterable[MetadataWorkUnit]:

        plain_base_path = strip_s3_prefix(self.source_config.base_path)

        # append a trailing slash if it's not there so prefix filtering works
        if not plain_base_path.endswith("/"):
            plain_base_path = plain_base_path + "/"

        if self.source_config.aws_config is None:
            raise ValueError("AWS config is required for S3 file sources")

        s3 = self.source_config.aws_config.get_s3_resource()
        bucket = s3.Bucket(plain_base_path.split("/")[0])

        base_obj_paths: List[Tuple[str, Dict[str, str]]] = []

        for obj in bucket.objects.filter(
                Prefix=plain_base_path.split("/", maxsplit=1)[1]):

            s3_path = f"s3://{obj.bucket_name}/{obj.key}"

            # if table patterns do not allow this file, skip
            if not self.source_config.schema_patterns.allowed(s3_path):
                continue

            # if the file is a directory, skip it
            if obj.key.endswith("/"):
                continue

            file = os.path.basename(obj.key)

            if self.source_config.ignore_dotfiles and file.startswith("."):
                continue

            base_obj_path = f"{obj.bucket_name}/{obj.key}"

            properties = {
                "owner": str(obj.owner) if obj.owner else "",
                "e_tag": str(obj.e_tag) if obj.e_tag else "",
                "last_modified": (str(obj.last_modified)
                                  if obj.last_modified else ""),
                "size": str(obj.size) if obj.size else "",
                "storage_class": (str(obj.storage_class)
                                  if obj.storage_class else ""),
                "service_name": (str(obj.meta.service_name)
                                 if obj.meta and obj.meta.service_name
                                 else ""),
            }
            logger.debug(f"Adding file {base_obj_path} for ingestion")
            base_obj_paths.append((base_obj_path, properties))

        for path, properties in sorted(base_obj_paths, key=lambda a: a[0]):
            relative_path = "./" + path[len(plain_base_path):]

            # pass in the same relative_path as the full_path for S3 files
            yield from self.ingest_table(path,
                                         relative_path,
                                         is_aws=True,
                                         properties=properties)
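For a hypothetical S3 object, a collected entry would look roughly like this once everything is stringified (values are illustrative, not real output):

example_entry = (
    "my-bucket/data/tables/part-0000.csv",
    {
        "owner": "{'DisplayName': 'data-team', 'ID': 'abc123'}",
        "e_tag": '"9b2cf535f27731c974343645a3985328"',
        "last_modified": "2022-01-15 08:30:00+00:00",
        "size": "10485760",
        "storage_class": "STANDARD",
        "service_name": "s3",
    },
)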
Code example #4
    def ingest_table(self, table_data: TableData,
                     path_spec: PathSpec) -> Iterable[MetadataWorkUnit]:

        logger.info(
            f"Extracting table schema from file: {table_data.full_path}")
        browse_path: str = (strip_s3_prefix(table_data.table_path)
                            if table_data.is_s3 else
                            table_data.table_path.strip("/"))

        data_platform_urn = make_data_platform_urn(self.source_config.platform)
        logger.info(f"Creating dataset urn with name: {browse_path}")
        dataset_urn = make_dataset_urn_with_platform_instance(
            self.source_config.platform,
            browse_path,
            self.source_config.platform_instance,
            self.source_config.env,
        )

        dataset_snapshot = DatasetSnapshot(
            urn=dataset_urn,
            aspects=[],
        )

        dataset_properties = DatasetPropertiesClass(
            description="",
            name=table_data.display_name,
            customProperties={
                "number_of_files": str(table_data.number_of_files),
                "size_in_bytes": str(table_data.size_in_bytes),
            },
        )
        dataset_snapshot.aspects.append(dataset_properties)

        fields = self.get_fields(table_data, path_spec)
        schema_metadata = SchemaMetadata(
            schemaName=table_data.display_name,
            platform=data_platform_urn,
            version=0,
            hash="",
            fields=fields,
            platformSchema=OtherSchemaClass(rawSchema=""),
        )
        dataset_snapshot.aspects.append(schema_metadata)
        if (self.source_config.use_s3_bucket_tags
                or self.source_config.use_s3_object_tags):
            bucket = get_bucket_name(table_data.table_path)
            key_prefix = (get_key_prefix(table_data.table_path)
                          if table_data.full_path == table_data.table_path else
                          None)
            s3_tags = self.get_s3_tags(bucket, key_prefix, dataset_urn)
            if s3_tags is not None:
                dataset_snapshot.aspects.append(s3_tags)

        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = MetadataWorkUnit(id=table_data.table_path, mce=mce)
        self.report.report_workunit(wu)
        yield wu

        yield from self.create_container_hierarchy(table_data, dataset_urn)

        if self.source_config.profiling.enabled:
            yield from self.get_table_profile(table_data, dataset_urn)
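get_bucket_name and get_key_prefix are used above without being shown; they are assumed to split an s3:// URI into its bucket and key parts. A minimal sketch of that contract (not the actual DataHub helpers):

def get_bucket_name(s3_uri: str) -> str:
    # "s3://bucket/key/prefix" -> "bucket"
    return s3_uri.split("://", 1)[-1].split("/")[0]

def get_key_prefix(s3_uri: str) -> str:
    # "s3://bucket/key/prefix" -> "key/prefix"
    return s3_uri.split("://", 1)[-1].split("/", maxsplit=1)[1]

assert get_bucket_name("s3://my-bucket/data/orders.csv") == "my-bucket"
assert get_key_prefix("s3://my-bucket/data/orders.csv") == "data/orders.csv"

Note the key_prefix guard in the snippet: object-level tags are only looked up when the dataset maps to a single file (full_path == table_path); for a table rooted at a prefix, key_prefix stays None and only bucket tags can apply.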