def _extract_mcps(self, index: str) -> Iterable[MetadataChangeProposalWrapper]:
    logger.debug(f"index = {index}")
    raw_index = self.client.indices.get(index=index)
    raw_index_metadata = raw_index[index]

    # 0. Dedup data_streams.
    data_stream = raw_index_metadata.get("data_stream")
    if data_stream:
        index = data_stream
        self.data_stream_partition_count[index] += 1
        if self.data_stream_partition_count[index] > 1:
            # This is a duplicate, skip processing it further.
            return

    # 1. Construct and emit the schemaMetadata aspect
    # 1.1 Generate the schema fields from ES mappings.
    index_mappings = raw_index_metadata["mappings"]
    index_mappings_json_str: str = json.dumps(index_mappings)
    md5_hash = md5(index_mappings_json_str.encode()).hexdigest()
    schema_fields = list(
        ElasticToSchemaFieldConverter.get_schema_fields(index_mappings)
    )

    # 1.2 Generate the SchemaMetadata aspect
    schema_metadata = SchemaMetadata(
        schemaName=index,
        platform=make_data_platform_urn(self.platform),
        version=0,
        hash=md5_hash,
        platformSchema=OtherSchemaClass(rawSchema=index_mappings_json_str),
        fields=schema_fields,
    )

    # 1.3 Emit the mcp
    dataset_urn: str = make_dataset_urn(self.platform, index, self.source_config.env)
    yield MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=dataset_urn,
        aspectName="schemaMetadata",
        aspect=schema_metadata,
        changeType=ChangeTypeClass.UPSERT,
    )

    # 2. Construct and emit the status aspect.
    yield MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=dataset_urn,
        aspectName="status",
        aspect=StatusClass(removed=False),
        changeType=ChangeTypeClass.UPSERT,
    )

    # 3. Construct and emit subtype
    yield MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=dataset_urn,
        aspectName="subTypes",
        aspect=SubTypesClass(typeNames=["Index" if not data_stream else "DataStream"]),
        changeType=ChangeTypeClass.UPSERT,
    )

    # 4. Construct and emit properties if needed
    index_aliases = raw_index_metadata.get("aliases", {}).keys()
    if index_aliases:
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName="datasetProperties",
            aspect=DatasetPropertiesClass(
                customProperties={"aliases": ",".join(index_aliases)}
            ),
            changeType=ChangeTypeClass.UPSERT,
        )
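# A minimal sketch (assumption, not part of the source above) of how the MCPs
# yielded by _extract_mcps could be sent to a DataHub instance. The GMS URL,
# the `source` instance, and the index name are hypothetical; DatahubRestEmitter
# and emit_mcp come from the DataHub client library (datahub.emitter.rest_emitter).
from datahub.emitter.rest_emitter import DatahubRestEmitter

emitter = DatahubRestEmitter(gms_server="http://localhost:8080")  # assumed local GMS
for mcp in source._extract_mcps("my-index"):  # hypothetical source instance and index
    emitter.emit_mcp(mcp)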
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    platform = "mongodb"

    database_names: List[str] = self.mongo_client.list_database_names()

    # traverse databases in sorted order so output is consistent
    for database_name in sorted(database_names):
        if database_name in DENY_DATABASE_LIST:
            continue
        if not self.config.database_pattern.allowed(database_name):
            self.report.report_dropped(database_name)
            continue

        database = self.mongo_client[database_name]
        collection_names: List[str] = database.list_collection_names()

        # traverse collections in sorted order so output is consistent
        for collection_name in sorted(collection_names):
            dataset_name = f"{database_name}.{collection_name}"

            if not self.config.collection_pattern.allowed(dataset_name):
                self.report.report_dropped(dataset_name)
                continue

            dataset_snapshot = DatasetSnapshot(
                urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.config.env})",
                aspects=[],
            )

            dataset_properties = DatasetPropertiesClass(
                tags=[],
                customProperties={},
            )
            dataset_snapshot.aspects.append(dataset_properties)

            if self.config.enableSchemaInference:
                collection_schema = construct_schema_pymongo(
                    database[collection_name],
                    delimiter=".",
                    sample_size=self.config.schemaSamplingSize,
                )

                # initialize the schema for the collection
                canonical_schema: List[SchemaField] = []

                # append each schema field (sort so output is consistent)
                for schema_field in sorted(
                    collection_schema.values(), key=lambda x: x["delimited_name"]
                ):
                    field = SchemaField(
                        fieldPath=schema_field["delimited_name"],
                        nativeDataType=self.get_pymongo_type_string(
                            schema_field["type"], dataset_name
                        ),
                        type=self.get_field_type(schema_field["type"], dataset_name),
                        description=None,
                        nullable=schema_field["nullable"],
                        recursive=False,
                    )
                    canonical_schema.append(field)

                # create schema metadata object for collection
                actor = "urn:li:corpuser:etl"
                sys_time = int(time.time() * 1000)
                schema_metadata = SchemaMetadata(
                    schemaName=collection_name,
                    platform=f"urn:li:dataPlatform:{platform}",
                    version=0,
                    hash="",
                    platformSchema=SchemalessClass(),
                    created=AuditStamp(time=sys_time, actor=actor),
                    lastModified=AuditStamp(time=sys_time, actor=actor),
                    fields=canonical_schema,
                )
                dataset_snapshot.aspects.append(schema_metadata)

            # TODO: use list_indexes() or index_information() to get index information
            # See https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes.

            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            wu = MetadataWorkUnit(id=dataset_name, mce=mce)
            self.report.report_workunit(wu)
            yield wu
def _extract_record(self, topic: str) -> MetadataChangeEvent:
    logger.debug(f"topic = {topic}")
    platform = "kafka"
    dataset_name = topic

    dataset_snapshot = DatasetSnapshot(
        urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.source_config.env})",
        aspects=[],  # we append to this list later on
    )
    dataset_snapshot.aspects.append(Status(removed=False))

    # Fetch schema from the registry.
    schema: Optional[Schema] = None
    try:
        registered_schema = self.schema_registry_client.get_latest_version(
            topic + "-value"
        )
        schema = registered_schema.schema
    except Exception as e:
        self.report.report_warning(topic, f"failed to get value schema: {e}")

    # Parse the schema
    fields: List[SchemaField] = []
    if schema and schema.schema_type == "AVRO":
        # "value.id" or "value.[type=string]id"
        fields = schema_util.avro_schema_to_mce_fields(schema.schema_str)
    elif schema is not None:
        self.report.report_warning(
            topic,
            f"Parsing kafka schema type {schema.schema_type} is currently not implemented",
        )

    # Fetch key schema from the registry
    key_schema: Optional[Schema] = None
    try:
        registered_schema = self.schema_registry_client.get_latest_version(
            topic + "-key"
        )
        key_schema = registered_schema.schema
    except Exception as e:
        # do not report warnings because it is okay to not have key schemas
        logger.debug(f"{topic}: no key schema found. {e}")

    # Parse the key schema
    key_fields: List[SchemaField] = []
    if key_schema and key_schema.schema_type == "AVRO":
        key_fields = schema_util.avro_schema_to_mce_fields(
            key_schema.schema_str, is_key_schema=True
        )
    elif key_schema is not None:
        self.report.report_warning(
            topic,
            f"Parsing kafka schema type {key_schema.schema_type} is currently not implemented",
        )

    key_schema_str: Optional[str] = None
    if schema is not None or key_schema is not None:
        # create a merged string for the combined schemas and compute an md5 hash across
        schema_as_string = schema.schema_str if schema is not None else ""
        if key_schema is not None:
            # append (rather than overwrite) so the value schema still contributes
            # to the hash when only one of the two schemas is present
            schema_as_string += key_schema.schema_str
        md5_hash = md5(schema_as_string.encode()).hexdigest()

        if key_schema:
            key_schema_str = key_schema.schema_str

        schema_metadata = SchemaMetadata(
            schemaName=topic,
            version=0,
            hash=md5_hash,
            platform=f"urn:li:dataPlatform:{platform}",
            platformSchema=KafkaSchema(
                documentSchema=schema.schema_str if schema is not None else "",
                keySchema=key_schema_str,
            ),
            fields=key_fields + fields,
        )
        dataset_snapshot.aspects.append(schema_metadata)

    browse_path = BrowsePathsClass(
        [f"/{self.source_config.env.lower()}/{platform}/{topic}"]
    )
    dataset_snapshot.aspects.append(browse_path)

    metadata_record = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return metadata_record
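# A minimal sketch (assumption, not taken from the source above) of the schema
# registry client that _extract_record relies on. The registry URL and topic are
# hypothetical; SchemaRegistryClient and get_latest_version come from
# confluent_kafka.schema_registry, and subjects follow the "<topic>-value" /
# "<topic>-key" naming convention used above.
from confluent_kafka.schema_registry import SchemaRegistryClient

schema_registry_client = SchemaRegistryClient({"url": "http://localhost:8081"})
registered = schema_registry_client.get_latest_version("my-topic-value")  # hypothetical subject
print(registered.schema.schema_type, registered.schema.schema_str)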
def get_table_schema(
    self, file_path: str, table_name: str, is_aws: bool
) -> Iterable[MetadataWorkUnit]:
    data_platform_urn = make_data_platform_urn(self.source_config.platform)
    dataset_urn = make_dataset_urn(
        self.source_config.platform, table_name, self.source_config.env
    )

    dataset_name = os.path.basename(file_path)

    dataset_snapshot = DatasetSnapshot(
        urn=dataset_urn,
        aspects=[],
    )

    dataset_properties = DatasetPropertiesClass(
        description="",
        customProperties={},
    )
    dataset_snapshot.aspects.append(dataset_properties)

    if is_aws:
        if self.source_config.aws_config is None:
            raise ValueError("AWS config is required for S3 file sources")
        s3_client = self.source_config.aws_config.get_s3_client()
        file = smart_open(
            f"s3://{file_path}", "rb", transport_params={"client": s3_client}
        )
    else:
        file = open(file_path, "rb")

    fields = []

    try:
        if file_path.endswith(".parquet"):
            fields = parquet.ParquetInferrer().infer_schema(file)
        elif file_path.endswith(".csv"):
            fields = csv_tsv.CsvInferrer(
                max_rows=self.source_config.max_rows
            ).infer_schema(file)
        elif file_path.endswith(".tsv"):
            fields = csv_tsv.TsvInferrer(
                max_rows=self.source_config.max_rows
            ).infer_schema(file)
        elif file_path.endswith(".json"):
            fields = json.JsonInferrer().infer_schema(file)
        elif file_path.endswith(".avro"):
            fields = avro.AvroInferrer().infer_schema(file)
        else:
            self.report.report_warning(
                file_path, f"file {file_path} has unsupported extension"
            )
        file.close()
    except Exception as e:
        self.report.report_warning(
            file_path, f"could not infer schema for file {file_path}: {e}"
        )
        file.close()

    fields = sorted(fields, key=lambda f: f.fieldPath)
    schema_metadata = SchemaMetadata(
        schemaName=dataset_name,
        platform=data_platform_urn,
        version=0,
        hash="",
        fields=fields,
        platformSchema=OtherSchemaClass(rawSchema=""),
    )
    dataset_snapshot.aspects.append(schema_metadata)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    wu = MetadataWorkUnit(id=file_path, mce=mce)
    self.report.report_workunit(wu)
    yield wu
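# A minimal sketch (assumption, not from the source above) of how smart_open
# streams an S3 object with an explicit boto3 client, mirroring the is_aws
# branch in get_table_schema. The bucket/key path is hypothetical and credentials
# are assumed to be available in the environment.
import boto3
from smart_open import open as smart_open

s3_client = boto3.client("s3")
with smart_open(
    "s3://my-bucket/path/to/table.csv", "rb", transport_params={"client": s3_client}
) as f:
    header = f.readline()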
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    platform = "mongodb"

    database_names: List[str] = self.mongo_client.list_database_names()

    # traverse databases in sorted order so output is consistent
    for database_name in sorted(database_names):
        if database_name in DENY_DATABASE_LIST:
            continue
        if not self.config.database_pattern.allowed(database_name):
            self.report.report_dropped(database_name)
            continue

        database = self.mongo_client[database_name]
        collection_names: List[str] = database.list_collection_names()

        # traverse collections in sorted order so output is consistent
        for collection_name in sorted(collection_names):
            dataset_name = f"{database_name}.{collection_name}"

            if not self.config.collection_pattern.allowed(dataset_name):
                self.report.report_dropped(dataset_name)
                continue

            dataset_urn = f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.config.env})"
            dataset_snapshot = DatasetSnapshot(
                urn=dataset_urn,
                aspects=[],
            )

            dataset_properties = DatasetPropertiesClass(
                tags=[],
                customProperties={},
            )
            dataset_snapshot.aspects.append(dataset_properties)

            if self.config.enableSchemaInference:
                assert self.config.maxDocumentSize is not None
                collection_schema = construct_schema_pymongo(
                    database[collection_name],
                    delimiter=".",
                    use_random_sampling=self.config.useRandomSampling,
                    max_document_size=self.config.maxDocumentSize,
                    is_version_gte_4_4=self.is_server_version_gte_4_4(),
                    sample_size=self.config.schemaSamplingSize,
                )

                # initialize the schema for the collection
                canonical_schema: List[SchemaField] = []
                max_schema_size = self.config.maxSchemaSize
                collection_schema_size = len(collection_schema.values())
                collection_fields: Union[
                    List[SchemaDescription], ValuesView[SchemaDescription]
                ] = collection_schema.values()
                assert max_schema_size is not None
                if collection_schema_size > max_schema_size:
                    # downsample the schema, using frequency as the sort key
                    self.report.report_warning(
                        key=dataset_urn,
                        reason=f"Downsampling the collection schema because it has {collection_schema_size} fields. Threshold is {max_schema_size}",
                    )
                    collection_fields = sorted(
                        collection_schema.values(),
                        key=lambda x: x["count"],
                        reverse=True,
                    )[0:max_schema_size]
                    # Add this information to the custom properties so user can know they are looking at downsampled schema
                    dataset_properties.customProperties["schema.downsampled"] = "True"
                    dataset_properties.customProperties[
                        "schema.totalFields"
                    ] = f"{collection_schema_size}"

                logger.debug(f"Size of collection fields = {len(collection_fields)}")

                # append each schema field (sort so output is consistent)
                for schema_field in sorted(
                    collection_fields, key=lambda x: x["delimited_name"]
                ):
                    field = SchemaField(
                        fieldPath=schema_field["delimited_name"],
                        nativeDataType=self.get_pymongo_type_string(
                            schema_field["type"], dataset_name
                        ),
                        type=self.get_field_type(schema_field["type"], dataset_name),
                        description=None,
                        nullable=schema_field["nullable"],
                        recursive=False,
                    )
                    canonical_schema.append(field)

                # create schema metadata object for collection
                schema_metadata = SchemaMetadata(
                    schemaName=collection_name,
                    platform=f"urn:li:dataPlatform:{platform}",
                    version=0,
                    hash="",
                    platformSchema=SchemalessClass(),
                    fields=canonical_schema,
                )
                dataset_snapshot.aspects.append(schema_metadata)

            # TODO: use list_indexes() or index_information() to get index information
            # See https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes.

            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            wu = MetadataWorkUnit(id=dataset_name, mce=mce)
            self.report.report_workunit(wu)
            yield wu
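# A minimal sketch (assumption, not part of the source above) of the pymongo
# calls this MongoDB source depends on: MongoClient, list_database_names, and
# list_collection_names. The connection string is hypothetical.
from pymongo import MongoClient

mongo_client = MongoClient("mongodb://localhost:27017")  # assumed local instance
for db_name in sorted(mongo_client.list_database_names()):
    for coll_name in sorted(mongo_client[db_name].list_collection_names()):
        print(f"{db_name}.{coll_name}")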
def ingest_table(
    self, table_data: TableData, path_spec: PathSpec
) -> Iterable[MetadataWorkUnit]:
    logger.info(f"Extracting table schema from file: {table_data.full_path}")

    browse_path: str = (
        strip_s3_prefix(table_data.table_path)
        if table_data.is_s3
        else table_data.table_path.strip("/")
    )

    data_platform_urn = make_data_platform_urn(self.source_config.platform)
    logger.info(f"Creating dataset urn with name: {browse_path}")
    dataset_urn = make_dataset_urn_with_platform_instance(
        self.source_config.platform,
        browse_path,
        self.source_config.platform_instance,
        self.source_config.env,
    )

    dataset_snapshot = DatasetSnapshot(
        urn=dataset_urn,
        aspects=[],
    )

    dataset_properties = DatasetPropertiesClass(
        description="",
        name=table_data.display_name,
        customProperties={
            "number_of_files": str(table_data.number_of_files),
            "size_in_bytes": str(table_data.size_in_bytes),
        },
    )
    dataset_snapshot.aspects.append(dataset_properties)

    fields = self.get_fields(table_data, path_spec)
    schema_metadata = SchemaMetadata(
        schemaName=table_data.display_name,
        platform=data_platform_urn,
        version=0,
        hash="",
        fields=fields,
        platformSchema=OtherSchemaClass(rawSchema=""),
    )
    dataset_snapshot.aspects.append(schema_metadata)

    if self.source_config.use_s3_bucket_tags or self.source_config.use_s3_object_tags:
        bucket = get_bucket_name(table_data.table_path)
        key_prefix = (
            get_key_prefix(table_data.table_path)
            if table_data.full_path == table_data.table_path
            else None
        )
        s3_tags = self.get_s3_tags(bucket, key_prefix, dataset_urn)
        if s3_tags is not None:
            dataset_snapshot.aspects.append(s3_tags)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    wu = MetadataWorkUnit(id=table_data.table_path, mce=mce)
    self.report.report_workunit(wu)
    yield wu

    yield from self.create_container_hierarchy(table_data, dataset_urn)

    if self.source_config.profiling.enabled:
        yield from self.get_table_profile(table_data, dataset_urn)
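# A minimal sketch (assumption, not from the source above) of the URN helper
# used in ingest_table. make_dataset_urn_with_platform_instance comes from
# datahub.emitter.mce_builder; the platform, path, instance, and env values
# below are hypothetical, and the expected URN shape is noted as an assumption.
from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance

urn = make_dataset_urn_with_platform_instance(
    platform="s3",
    name="my-bucket/events/2024",            # hypothetical browse path
    platform_instance="analytics-account",   # hypothetical instance; may be None
    env="PROD",
)
# Expected to resemble (assumption):
# urn:li:dataset:(urn:li:dataPlatform:s3,analytics-account.my-bucket/events/2024,PROD)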