Code example #1
File: config.py  Project: hsheth2/datahub
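    # Infers the "platform" config value ("s3" or "file") from the configured
    # path_specs, normalizes the deprecated path_spec field into path_specs, and
    # enforces that every path_spec targets the same platform and S3 bucket.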
    def validate_platform(cls, values: Dict) -> Dict:
        value = values.get("platform")
        if value is not None and value != "":
            return values

        if not values.get("path_specs") and not values.get("path_spec"):
            raise ValueError("Either path_specs or path_spec needs to be specified")

        if values.get("path_specs") and values.get("path_spec"):
            raise ValueError(
                "Either path_specs or path_spec needs to be specified but not both"
            )

        if values.get("path_spec"):
            logger.warning(
                "path_spec config property is deprecated, please use path_specs instead of it."
            )
            values["path_specs"] = [values.get("path_spec")]

        bucket_name: str = ""
        for path_spec in values.get("path_specs", []):
            if path_spec.is_s3():
                platform = "s3"
            else:
                if values.get("use_s3_object_tags") or values.get("use_s3_bucket_tags"):
                    raise ValueError(
                        "cannot grab s3 tags for platform != s3. Remove the flag or use s3."
                    )

                platform = "file"

            if values.get("platform", "") != "":
                if values["platform"] != platform:
                    raise ValueError("all path_spec should belong to the same platform")
            else:
                values["platform"] = platform
                logger.debug(f'Setting config "platform": {values.get("platform")}')

            if platform == "s3":
                if bucket_name == "":
                    bucket_name = get_bucket_name(path_spec.include)
                else:
                    if bucket_name != get_bucket_name(path_spec.include):
                        raise ValueError(
                            "all path_spec should reference the same s3 bucket"
                        )

        return values
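
The helpers get_bucket_name and get_bucket_relative_path used in this and the following examples split an "s3://bucket/key" style URI into its bucket and key parts. Below is a minimal sketch of that behavior, assuming plain string parsing; the names mirror the project's s3_util helpers, but this is not the project's actual implementation.

# Illustrative sketch only: hypothetical stand-ins for the s3_util helpers,
# assuming simple string parsing of "s3://bucket/key" style URIs.
def get_bucket_name(path: str) -> str:
    # "s3://my-bucket/data/2024/file.csv" -> "my-bucket"
    return path.replace("s3://", "", 1).split("/", 1)[0]


def get_bucket_relative_path(path: str) -> str:
    # "s3://my-bucket/data/2024/file.csv" -> "data/2024/file.csv"
    parts = path.replace("s3://", "", 1).split("/", 1)
    return parts[1] if len(parts) > 1 else ""
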
Code example #2
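        # Collects S3 bucket and/or object tags for the table's storage location,
        # merges them with any tags already attached to the dataset in DataHub
        # (when a graph client is available), and returns a GlobalTagsClass aspect.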
        def get_s3_tags() -> Optional[GlobalTagsClass]:
            bucket_name = s3_util.get_bucket_name(
                table["StorageDescriptor"]["Location"])
            tags_to_add = []
            if self.source_config.use_s3_bucket_tags:
                try:
                    bucket_tags = self.s3_client.get_bucket_tagging(
                        Bucket=bucket_name)
                    tags_to_add.extend([
                        make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""")
                        for tag in bucket_tags["TagSet"]
                    ])
                except self.s3_client.exceptions.ClientError:
                    logger.warn(f"No tags found for bucket={bucket_name}")
            if self.source_config.use_s3_object_tags:
                key_prefix = s3_util.get_key_prefix(
                    table["StorageDescriptor"]["Location"])
                object_tagging = self.s3_client.get_object_tagging(
                    Bucket=bucket_name, Key=key_prefix)
                tag_set = object_tagging["TagSet"]
                if tag_set:
                    tags_to_add.extend([
                        make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""")
                        for tag in tag_set
                    ])
                else:
                    # Unlike bucket tags, if an object does not have tags, it will just return an empty array
                    # as opposed to an exception.
                    logger.warning(
                        f"No tags found for bucket={bucket_name} key={key_prefix}"
                    )
            if len(tags_to_add) == 0:
                return None
            if self.ctx.graph is not None:
                logger.debug(
                    "Connected to DatahubApi, grabbing current tags to maintain."
                )
                current_tags: Optional[
                    GlobalTagsClass] = self.ctx.graph.get_aspect_v2(
                        entity_urn=dataset_urn,
                        aspect="globalTags",
                        aspect_type=GlobalTagsClass,
                    )
                if current_tags:
                    tags_to_add.extend(
                        [current_tag.tag for current_tag in current_tags.tags])
            else:
                logger.warning(
                    "Could not connect to DatahubApi. No current tags to maintain"
                )

            # Remove duplicate tags
            tags_to_add = list(set(tags_to_add))
            new_tags = GlobalTagsClass(tags=[
                TagAssociationClass(tag_to_add) for tag_to_add in tags_to_add
            ])
            return new_tags
Code example #3
File: __init__.py  Project: arunvasudevan/datahub
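    # Lists every object under the path_spec's bucket-relative prefix and yields
    # (s3 URI, last-modified timestamp) tuples for downstream processing.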
    def s3_browser(self) -> Iterable[tuple]:
        if self.source_config.aws_config is None:
            raise ValueError("aws_config not set. Cannot browse s3")
        s3 = self.source_config.aws_config.get_s3_resource()
        bucket_name = get_bucket_name(self.source_config.path_spec.include)
        logger.debug(f"Scanning bucket : {bucket_name}")
        bucket = s3.Bucket(bucket_name)
        prefix = self.get_prefix(
            get_bucket_relative_path(self.source_config.path_spec.include))
        logger.debug(f"Scanning objects with prefix:{prefix}")
        for obj in bucket.objects.filter(Prefix=prefix).page_size(1000):
            s3_path = f"s3://{obj.bucket_name}/{obj.key}"
            yield s3_path, obj.last_modified
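
A hedged usage sketch for the generator above; the source variable stands in for an already-configured instance of the class that defines s3_browser() and is hypothetical here.

# Hypothetical usage: consume the (s3_path, last_modified) tuples yielded by
# s3_browser() on an already-configured source instance.
for s3_path, last_modified in source.s3_browser():
    print(f"{s3_path} (last modified: {last_modified})")
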
Code example #4
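    # Emits container entities for the S3 bucket (when applicable) and for each
    # folder level above the dataset, then attaches the dataset to the innermost
    # container.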
    def create_container_hierarchy(
            self, table_data: TableData,
            dataset_urn: str) -> Iterable[MetadataWorkUnit]:
        logger.debug(f"Creating containers for {dataset_urn}")
        base_full_path = table_data.table_path
        parent_key = None
        if table_data.is_s3:
            bucket_name = get_bucket_name(table_data.table_path)
            bucket_key = self.gen_bucket_key(bucket_name)
            yield from self.create_emit_containers(
                container_key=bucket_key,
                name=bucket_name,
                sub_types=["S3 bucket"],
                parent_container_key=None,
            )
            parent_key = bucket_key
            base_full_path = get_bucket_relative_path(table_data.table_path)

        parent_folder_path = (base_full_path[:base_full_path.rfind("/")]
                              if base_full_path.rfind("/") != -1 else "")
        for folder in parent_folder_path.split("/"):
            abs_path = folder
            if parent_key:
                prefix: str = ""
                if isinstance(parent_key, S3BucketKey):
                    prefix = parent_key.bucket_name
                elif isinstance(parent_key, FolderKey):
                    prefix = parent_key.folder_abs_path
                abs_path = prefix + "/" + folder
            folder_key = self.gen_folder_key(abs_path)
            yield from self.create_emit_containers(
                container_key=folder_key,
                name=folder,
                sub_types=["Folder"],
                parent_container_key=parent_key,
            )
            parent_key = folder_key
        if parent_key is None:
            logger.warning(
                f"Failed to associate Dataset ({dataset_urn}) with container")
            return
        yield from add_dataset_to_container(parent_key, dataset_urn)
Code example #5
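    # Builds the dataset snapshot for a scanned table: dataset properties, schema
    # metadata, optional S3 tags, the container hierarchy, and an optional profile.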
    def ingest_table(self, table_data: TableData,
                     path_spec: PathSpec) -> Iterable[MetadataWorkUnit]:

        logger.info(
            f"Extracting table schema from file: {table_data.full_path}")
        browse_path: str = (strip_s3_prefix(table_data.table_path)
                            if table_data.is_s3 else
                            table_data.table_path.strip("/"))

        data_platform_urn = make_data_platform_urn(self.source_config.platform)
        logger.info(f"Creating dataset urn with name: {browse_path}")
        dataset_urn = make_dataset_urn_with_platform_instance(
            self.source_config.platform,
            browse_path,
            self.source_config.platform_instance,
            self.source_config.env,
        )

        dataset_snapshot = DatasetSnapshot(
            urn=dataset_urn,
            aspects=[],
        )

        dataset_properties = DatasetPropertiesClass(
            description="",
            name=table_data.display_name,
            customProperties={
                "number_of_files": str(table_data.number_of_files),
                "size_in_bytes": str(table_data.size_in_bytes),
            },
        )
        dataset_snapshot.aspects.append(dataset_properties)

        fields = self.get_fields(table_data, path_spec)
        schema_metadata = SchemaMetadata(
            schemaName=table_data.display_name,
            platform=data_platform_urn,
            version=0,
            hash="",
            fields=fields,
            platformSchema=OtherSchemaClass(rawSchema=""),
        )
        dataset_snapshot.aspects.append(schema_metadata)
        if (self.source_config.use_s3_bucket_tags
                or self.source_config.use_s3_object_tags):
            bucket = get_bucket_name(table_data.table_path)
            key_prefix = (get_key_prefix(table_data.table_path)
                          if table_data.full_path == table_data.table_path else
                          None)
            s3_tags = self.get_s3_tags(bucket, key_prefix, dataset_urn)
            if s3_tags is not None:
                dataset_snapshot.aspects.append(s3_tags)

        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = MetadataWorkUnit(id=table_data.table_path, mce=mce)
        self.report.report_workunit(wu)
        yield wu

        yield from self.create_container_hierarchy(table_data, dataset_urn)

        if self.source_config.profiling.enabled:
            yield from self.get_table_profile(table_data, dataset_urn)