Code example #1
    def _extract_mcps(self,
                      index: str) -> Iterable[MetadataChangeProposalWrapper]:
        logger.debug(f"index = {index}")
        raw_index = self.client.indices.get(index=index)
        raw_index_metadata = raw_index[index]

        # 0. Dedup data_streams.
        data_stream = raw_index_metadata.get("data_stream")
        if data_stream:
            index = data_stream
            self.data_stream_partition_count[index] += 1
            if self.data_stream_partition_count[index] > 1:
                # This is a duplicate, skip processing it further.
                return

        # 1. Construct and emit the schemaMetadata aspect
        # 1.1 Generate the schema fields from ES mappings.
        index_mappings = raw_index_metadata["mappings"]
        index_mappings_json_str: str = json.dumps(index_mappings)
        md5_hash = md5(index_mappings_json_str.encode()).hexdigest()
        schema_fields = list(
            ElasticToSchemaFieldConverter.get_schema_fields(index_mappings))

        # 1.2 Generate the SchemaMetadata aspect
        schema_metadata = SchemaMetadata(
            schemaName=index,
            platform=make_data_platform_urn(self.platform),
            version=0,
            hash=md5_hash,
            platformSchema=OtherSchemaClass(rawSchema=index_mappings_json_str),
            fields=schema_fields,
        )

        # 1.3 Emit the mcp
        dataset_urn: str = make_dataset_urn(self.platform, index,
                                            self.source_config.env)
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName="schemaMetadata",
            aspect=schema_metadata,
            changeType=ChangeTypeClass.UPSERT,
        )

        # 2. Construct and emit the status aspect.
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName="status",
            aspect=StatusClass(removed=False),
            changeType=ChangeTypeClass.UPSERT,
        )

        # 3. Construct and emit subtype
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName="subTypes",
            aspect=SubTypesClass(
                typeNames=["Index" if not data_stream else "DataStream"]),
            changeType=ChangeTypeClass.UPSERT,
        )

        # 4. Construct and emit properties if needed
        index_aliases = raw_index_metadata.get("aliases", {}).keys()
        if index_aliases:
            yield MetadataChangeProposalWrapper(
                entityType="dataset",
                entityUrn=dataset_urn,
                aspectName="datasetProperties",
                aspect=DatasetPropertiesClass(
                    customProperties={"aliases": ",".join(index_aliases)}),
                changeType=ChangeTypeClass.UPSERT,
            )
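
A minimal standalone sketch of the data-stream dedup counter used in step 0 above. It assumes data_stream_partition_count is a defaultdict(int) (its initialization is not shown in the excerpt); the index names are made up.

from collections import defaultdict
from typing import List, Tuple

# Assumption: in the source class this counter is a defaultdict(int).
data_stream_partition_count: defaultdict = defaultdict(int)

def should_process(index: str, data_stream: str) -> bool:
    """Always process a plain index; process only the first backing index of a data stream."""
    if not data_stream:
        return True
    data_stream_partition_count[data_stream] += 1
    return data_stream_partition_count[data_stream] == 1

# Hypothetical backing indices: only the first one per data stream gets through.
indices: List[Tuple[str, str]] = [
    (".ds-logs-000001", "logs"),
    (".ds-logs-000002", "logs"),
    ("my-plain-index", ""),
]
print([should_process(name, ds) for name, ds in indices])  # [True, False, True]
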
Code example #2
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        platform = "mongodb"

        database_names: List[str] = self.mongo_client.list_database_names()

        # traverse databases in sorted order so output is consistent
        for database_name in sorted(database_names):
            if database_name in DENY_DATABASE_LIST:
                continue
            if not self.config.database_pattern.allowed(database_name):
                self.report.report_dropped(database_name)
                continue

            database = self.mongo_client[database_name]
            collection_names: List[str] = database.list_collection_names()

            # traverse collections in sorted order so output is consistent
            for collection_name in sorted(collection_names):
                dataset_name = f"{database_name}.{collection_name}"

                if not self.config.collection_pattern.allowed(dataset_name):
                    self.report.report_dropped(dataset_name)
                    continue

                dataset_snapshot = DatasetSnapshot(
                    urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.config.env})",
                    aspects=[],
                )

                dataset_properties = DatasetPropertiesClass(
                    tags=[],
                    customProperties={},
                )
                dataset_snapshot.aspects.append(dataset_properties)

                if self.config.enableSchemaInference:

                    collection_schema = construct_schema_pymongo(
                        database[collection_name],
                        delimiter=".",
                        sample_size=self.config.schemaSamplingSize,
                    )

                    # initialize the schema for the collection
                    canonical_schema: List[SchemaField] = []

                    # append each schema field (sort so output is consistent)
                    for schema_field in sorted(
                        collection_schema.values(), key=lambda x: x["delimited_name"]
                    ):
                        field = SchemaField(
                            fieldPath=schema_field["delimited_name"],
                            nativeDataType=self.get_pymongo_type_string(
                                schema_field["type"], dataset_name
                            ),
                            type=self.get_field_type(
                                schema_field["type"], dataset_name
                            ),
                            description=None,
                            nullable=schema_field["nullable"],
                            recursive=False,
                        )
                        canonical_schema.append(field)

                    # create schema metadata object for collection
                    actor = "urn:li:corpuser:etl"
                    sys_time = int(time.time() * 1000)
                    schema_metadata = SchemaMetadata(
                        schemaName=collection_name,
                        platform=f"urn:li:dataPlatform:{platform}",
                        version=0,
                        hash="",
                        platformSchema=SchemalessClass(),
                        created=AuditStamp(time=sys_time, actor=actor),
                        lastModified=AuditStamp(time=sys_time, actor=actor),
                        fields=canonical_schema,
                    )

                    dataset_snapshot.aspects.append(schema_metadata)

                # TODO: use list_indexes() or index_information() to get index information
                # See https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes.

                mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
                wu = MetadataWorkUnit(id=dataset_name, mce=mce)
                self.report.report_workunit(wu)
                yield wu
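
For reference, a small sketch of the dataset URN string this snippet builds inline for DatasetSnapshot.urn; the database and collection names are illustrative.

def make_mongodb_dataset_urn(database: str, collection: str, env: str = "PROD") -> str:
    # Mirrors the f-string used for DatasetSnapshot.urn above.
    dataset_name = f"{database}.{collection}"
    return f"urn:li:dataset:(urn:li:dataPlatform:mongodb,{dataset_name},{env})"

print(make_mongodb_dataset_urn("sales", "orders"))
# urn:li:dataset:(urn:li:dataPlatform:mongodb,sales.orders,PROD)
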
Code example #3
File: kafka.py Project: northwesternmutual/datahub
    def _extract_record(self, topic: str) -> MetadataChangeEvent:
        logger.debug(f"topic = {topic}")
        platform = "kafka"
        dataset_name = topic

        dataset_snapshot = DatasetSnapshot(
            urn=
            f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.source_config.env})",
            aspects=[],  # we append to this list later on
        )
        dataset_snapshot.aspects.append(Status(removed=False))
        # Fetch schema from the registry.
        schema: Optional[Schema] = None
        try:
            registered_schema = self.schema_registry_client.get_latest_version(
                topic + "-value")
            schema = registered_schema.schema
        except Exception as e:
            self.report.report_warning(topic,
                                       f"failed to get value schema: {e}")

        # Parse the schema
        fields: List[SchemaField] = []
        if schema and schema.schema_type == "AVRO":
            # "value.id" or "value.[type=string]id"
            fields = schema_util.avro_schema_to_mce_fields(schema.schema_str)
        elif schema is not None:
            self.report.report_warning(
                topic,
                f"Parsing kafka schema type {schema.schema_type} is currently not implemented",
            )
        # Fetch key schema from the registry
        key_schema: Optional[Schema] = None
        try:
            registered_schema = self.schema_registry_client.get_latest_version(
                topic + "-key")
            key_schema = registered_schema.schema
        except Exception as e:
            # do not report warnings because it is okay to not have key schemas
            logger.debug(f"{topic}: no key schema found. {e}")

        # Parse the key schema
        key_fields: List[SchemaField] = []
        if key_schema and key_schema.schema_type == "AVRO":
            key_fields = schema_util.avro_schema_to_mce_fields(
                key_schema.schema_str, is_key_schema=True)
        elif key_schema is not None:
            self.report.report_warning(
                topic,
                f"Parsing kafka schema type {key_schema.schema_type} is currently not implemented",
            )

        key_schema_str: Optional[str] = None
        if schema is not None or key_schema is not None:
            # Create a merged string from the combined schemas and compute an md5 hash across it.
            # (The original assignment dropped the value schema whenever no key schema was present.)
            schema_as_string = (schema.schema_str if schema is not None else "") + (
                key_schema.schema_str if key_schema is not None else "")
            md5_hash = md5(schema_as_string.encode()).hexdigest()

            if key_schema:
                key_schema_str = key_schema.schema_str

            schema_metadata = SchemaMetadata(
                schemaName=topic,
                version=0,
                hash=md5_hash,
                platform=f"urn:li:dataPlatform:{platform}",
                platformSchema=KafkaSchema(
                    documentSchema=schema.schema_str
                    if schema is not None else "",
                    keySchema=key_schema_str,
                ),
                fields=key_fields + fields,
            )
            dataset_snapshot.aspects.append(schema_metadata)

        browse_path = BrowsePathsClass(
            [f"/{self.source_config.env.lower()}/{platform}/{topic}"])
        dataset_snapshot.aspects.append(browse_path)

        metadata_record = MetadataChangeEvent(
            proposedSnapshot=dataset_snapshot)
        return metadata_record
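
A toy sketch of the combined value+key schema hash computed above; the schema strings are stand-ins for what the schema registry would return.

from hashlib import md5

# Stand-in schema strings; in the source they come from the schema registry client.
value_schema_str = '{"type": "record", "name": "Value", "fields": []}'
key_schema_str = '{"type": "string"}'

schema_as_string = (value_schema_str or "") + (key_schema_str or "")
print(md5(schema_as_string.encode()).hexdigest())
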
Code example #4
File: __init__.py Project: swaroopjagadish/datahub
    def get_table_schema(self, file_path: str, table_name: str,
                         is_aws: bool) -> Iterable[MetadataWorkUnit]:

        data_platform_urn = make_data_platform_urn(self.source_config.platform)
        dataset_urn = make_dataset_urn(self.source_config.platform, table_name,
                                       self.source_config.env)

        dataset_name = os.path.basename(file_path)

        dataset_snapshot = DatasetSnapshot(
            urn=dataset_urn,
            aspects=[],
        )

        dataset_properties = DatasetPropertiesClass(
            description="",
            customProperties={},
        )
        dataset_snapshot.aspects.append(dataset_properties)

        if is_aws:
            if self.source_config.aws_config is None:
                raise ValueError("AWS config is required for S3 file sources")

            s3_client = self.source_config.aws_config.get_s3_client()

            file = smart_open(f"s3://{file_path}",
                              "rb",
                              transport_params={"client": s3_client})

        else:

            file = open(file_path, "rb")

        fields = []

        try:
            if file_path.endswith(".parquet"):
                fields = parquet.ParquetInferrer().infer_schema(file)
            elif file_path.endswith(".csv"):
                fields = csv_tsv.CsvInferrer(
                    max_rows=self.source_config.max_rows).infer_schema(file)
            elif file_path.endswith(".tsv"):
                fields = csv_tsv.TsvInferrer(
                    max_rows=self.source_config.max_rows).infer_schema(file)
            elif file_path.endswith(".json"):
                fields = json.JsonInferrer().infer_schema(file)
            elif file_path.endswith(".avro"):
                fields = avro.AvroInferrer().infer_schema(file)
            else:
                self.report.report_warning(
                    file_path, f"file {file_path} has unsupported extension")
        except Exception as e:
            self.report.report_warning(
                file_path, f"could not infer schema for file {file_path}: {e}")
        finally:
            file.close()

        fields = sorted(fields, key=lambda f: f.fieldPath)
        schema_metadata = SchemaMetadata(
            schemaName=dataset_name,
            platform=data_platform_urn,
            version=0,
            hash="",
            fields=fields,
            platformSchema=OtherSchemaClass(rawSchema=""),
        )

        dataset_snapshot.aspects.append(schema_metadata)

        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = MetadataWorkUnit(id=file_path, mce=mce)
        self.report.report_workunit(wu)
        yield wu
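
The snippet dispatches on the file extension with an if/elif chain; the sketch below shows the same idea as a lookup table, with a stand-in inferrer callable instead of the real parquet/csv_tsv/json/avro inferrers.

import io
import os
from typing import IO, Callable, Dict, List

def _infer_csv(f: IO[bytes]) -> List[str]:
    # Stand-in for csv_tsv.CsvInferrer(...).infer_schema(file): just reads the header row.
    header = f.readline().decode().strip()
    return header.split(",")

INFERRERS: Dict[str, Callable[[IO[bytes]], List[str]]] = {
    ".csv": _infer_csv,
    # ".parquet", ".tsv", ".json", ".avro" would map to their inferrers here.
}

def infer_fields(file_path: str, file: IO[bytes]) -> List[str]:
    _, ext = os.path.splitext(file_path)
    inferrer = INFERRERS.get(ext)
    if inferrer is None:
        print(f"file {file_path} has unsupported extension")
        return []
    return inferrer(file)

print(infer_fields("data.csv", io.BytesIO(b"a,b\n1,2\n")))  # ['a', 'b']
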
Code example #5
File: mongodb.py Project: swaroopjagadish/datahub
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        platform = "mongodb"

        database_names: List[str] = self.mongo_client.list_database_names()

        # traverse databases in sorted order so output is consistent
        for database_name in sorted(database_names):
            if database_name in DENY_DATABASE_LIST:
                continue
            if not self.config.database_pattern.allowed(database_name):
                self.report.report_dropped(database_name)
                continue

            database = self.mongo_client[database_name]
            collection_names: List[str] = database.list_collection_names()

            # traverse collections in sorted order so output is consistent
            for collection_name in sorted(collection_names):
                dataset_name = f"{database_name}.{collection_name}"

                if not self.config.collection_pattern.allowed(dataset_name):
                    self.report.report_dropped(dataset_name)
                    continue

                dataset_urn = f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.config.env})"

                dataset_snapshot = DatasetSnapshot(
                    urn=dataset_urn,
                    aspects=[],
                )

                dataset_properties = DatasetPropertiesClass(
                    tags=[],
                    customProperties={},
                )
                dataset_snapshot.aspects.append(dataset_properties)

                if self.config.enableSchemaInference:
                    assert self.config.maxDocumentSize is not None
                    collection_schema = construct_schema_pymongo(
                        database[collection_name],
                        delimiter=".",
                        use_random_sampling=self.config.useRandomSampling,
                        max_document_size=self.config.maxDocumentSize,
                        is_version_gte_4_4=self.is_server_version_gte_4_4(),
                        sample_size=self.config.schemaSamplingSize,
                    )

                    # initialize the schema for the collection
                    canonical_schema: List[SchemaField] = []
                    max_schema_size = self.config.maxSchemaSize
                    collection_schema_size = len(collection_schema.values())
                    collection_fields: Union[
                        List[SchemaDescription], ValuesView[
                            SchemaDescription]] = collection_schema.values()
                    assert max_schema_size is not None
                    if collection_schema_size > max_schema_size:
                        # downsample the schema, using frequency as the sort key
                        self.report.report_warning(
                            key=dataset_urn,
                            reason=
                            f"Downsampling the collection schema because it has {collection_schema_size} fields. Threshold is {max_schema_size}",
                        )
                        collection_fields = sorted(
                            collection_schema.values(),
                            key=lambda x: x["count"],
                            reverse=True,
                        )[0:max_schema_size]
                        # Add this information to the custom properties so user can know they are looking at downsampled schema
                        dataset_properties.customProperties[
                            "schema.downsampled"] = "True"
                        dataset_properties.customProperties[
                            "schema.totalFields"] = f"{collection_schema_size}"

                    logger.debug(
                        f"Size of collection fields = {len(collection_fields)}"
                    )
                    # append each schema field (sort so output is consistent)
                    for schema_field in sorted(
                            collection_fields,
                            key=lambda x: x["delimited_name"]):
                        field = SchemaField(
                            fieldPath=schema_field["delimited_name"],
                            nativeDataType=self.get_pymongo_type_string(
                                schema_field["type"], dataset_name),
                            type=self.get_field_type(schema_field["type"],
                                                     dataset_name),
                            description=None,
                            nullable=schema_field["nullable"],
                            recursive=False,
                        )
                        canonical_schema.append(field)

                    # create schema metadata object for collection
                    schema_metadata = SchemaMetadata(
                        schemaName=collection_name,
                        platform=f"urn:li:dataPlatform:{platform}",
                        version=0,
                        hash="",
                        platformSchema=SchemalessClass(),
                        fields=canonical_schema,
                    )

                    dataset_snapshot.aspects.append(schema_metadata)

                # TODO: use list_indexes() or index_information() to get index information
                # See https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes.

                mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
                wu = MetadataWorkUnit(id=dataset_name, mce=mce)
                self.report.report_workunit(wu)
                yield wu
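
A standalone sketch of the downsampling step above: keep the most frequent fields, then sort the survivors by name so output stays deterministic. The field dicts imitate only the keys the snippet reads from construct_schema_pymongo's output ("delimited_name", "count").

from typing import Dict, List

# Toy schema description keyed by field name.
collection_schema: Dict[str, Dict] = {
    "a": {"delimited_name": "a", "count": 90},
    "b": {"delimited_name": "b", "count": 10},
    "c": {"delimited_name": "c", "count": 50},
}
MAX_SCHEMA_SIZE = 2  # stands in for self.config.maxSchemaSize

collection_fields: List[Dict] = list(collection_schema.values())
if len(collection_fields) > MAX_SCHEMA_SIZE:
    # Downsample, using frequency as the sort key.
    collection_fields = sorted(
        collection_fields, key=lambda x: x["count"], reverse=True)[:MAX_SCHEMA_SIZE]

for schema_field in sorted(collection_fields, key=lambda x: x["delimited_name"]):
    print(schema_field["delimited_name"])  # prints: a, c
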
Code example #6
    def ingest_table(self, table_data: TableData,
                     path_spec: PathSpec) -> Iterable[MetadataWorkUnit]:

        logger.info(
            f"Extracting table schema from file: {table_data.full_path}")
        browse_path: str = (strip_s3_prefix(table_data.table_path)
                            if table_data.is_s3 else
                            table_data.table_path.strip("/"))

        data_platform_urn = make_data_platform_urn(self.source_config.platform)
        logger.info(f"Creating dataset urn with name: {browse_path}")
        dataset_urn = make_dataset_urn_with_platform_instance(
            self.source_config.platform,
            browse_path,
            self.source_config.platform_instance,
            self.source_config.env,
        )

        dataset_snapshot = DatasetSnapshot(
            urn=dataset_urn,
            aspects=[],
        )

        dataset_properties = DatasetPropertiesClass(
            description="",
            name=table_data.display_name,
            customProperties={
                "number_of_files": str(table_data.number_of_files),
                "size_in_bytes": str(table_data.size_in_bytes),
            },
        )
        dataset_snapshot.aspects.append(dataset_properties)

        fields = self.get_fields(table_data, path_spec)
        schema_metadata = SchemaMetadata(
            schemaName=table_data.display_name,
            platform=data_platform_urn,
            version=0,
            hash="",
            fields=fields,
            platformSchema=OtherSchemaClass(rawSchema=""),
        )
        dataset_snapshot.aspects.append(schema_metadata)
        if (self.source_config.use_s3_bucket_tags
                or self.source_config.use_s3_object_tags):
            bucket = get_bucket_name(table_data.table_path)
            key_prefix = (get_key_prefix(table_data.table_path)
                          if table_data.full_path == table_data.table_path else
                          None)
            s3_tags = self.get_s3_tags(bucket, key_prefix, dataset_urn)
            if s3_tags is not None:
                dataset_snapshot.aspects.append(s3_tags)

        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = MetadataWorkUnit(id=table_data.table_path, mce=mce)
        self.report.report_workunit(wu)
        yield wu

        yield from self.create_container_hierarchy(table_data, dataset_urn)

        if self.source_config.profiling.enabled:
            yield from self.get_table_profile(table_data, dataset_urn)
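
A small sketch of the browse-path logic at the top of ingest_table; strip_s3_prefix here is a simplified stand-in for DataHub's helper of the same name (the real one may strip other S3 URI prefixes as well), and the paths are made up.

def strip_s3_prefix(path: str) -> str:
    # Simplified stand-in for the real helper.
    return path[len("s3://"):] if path.startswith("s3://") else path

def browse_path(table_path: str, is_s3: bool) -> str:
    return strip_s3_prefix(table_path) if is_s3 else table_path.strip("/")

print(browse_path("s3://my-bucket/events/2023/", True))  # my-bucket/events/2023/
print(browse_path("/data/events/2023/", False))          # data/events/2023
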