def avro_schema_to_mce_fields(avro_schema_string: str) -> List[SchemaField]:
    """Converts an avro schema into a schema compatible with MCE"""

    # Handle some library compatibility issues.
    if hasattr(avro.schema, "parse"):
        schema_parse_fn = avro.schema.parse
    else:
        schema_parse_fn = avro.schema.Parse

    parsed_schema: avro.schema.RecordSchema = schema_parse_fn(avro_schema_string)

    fields: List[SchemaField] = []
    for parsed_field in parsed_schema.fields:
        field = SchemaField(
            fieldPath=parsed_field.name,
            nativeDataType=str(parsed_field.type),
            type=_get_column_type(parsed_field.type),
            description=parsed_field.props.get("doc", None),
            recursive=False,
            nullable=(parsed_field.type == "null"),
        )
        fields.append(field)

    return fields
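A minimal usage sketch, assuming the function above and its avro/SchemaField dependencies are importable; the record schema below is purely illustrative.

# Hypothetical driver showing how the converter might be exercised.
example_avro_schema = """
{
  "type": "record",
  "name": "User",
  "fields": [
    {"name": "id", "type": "long", "doc": "Primary identifier"},
    {"name": "email", "type": ["null", "string"]}
  ]
}
"""
for mce_field in avro_schema_to_mce_fields(example_avro_schema):
    print(mce_field.fieldPath, mce_field.nativeDataType)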
def get_schema_metadata_for_custom_sql(
        self, columns: List[dict]) -> Optional[SchemaMetadata]:
    schema_metadata = None
    # Datasource fields
    fields = []
    for field in columns:
        nativeDataType = field.get("remoteType", "UNKNOWN")
        TypeClass = FIELD_TYPE_MAPPING.get(nativeDataType, NullTypeClass)
        schema_field = SchemaField(
            fieldPath=field.get("name", ""),
            type=SchemaFieldDataType(type=TypeClass()),
            nativeDataType=nativeDataType,
            description=field.get("description", ""),
        )
        fields.append(schema_field)

    if fields:
        schema_metadata = SchemaMetadata(
            schemaName="test",
            platform=f"urn:li:dataPlatform:{self.platform}",
            version=0,
            fields=fields,
            hash="",
            platformSchema=OtherSchema(rawSchema=""),
        )
    return schema_metadata
def get_schema_metadata(report: SourceReport, node: DBTNode,
                        platform: str) -> SchemaMetadata:
    canonical_schema: List[SchemaField] = []
    for column in node.columns:
        field = SchemaField(
            fieldPath=column.name,
            nativeDataType=column.data_type,
            type=get_column_type(report, node.dbt_name, column.data_type),
            description=column.comment,
            nullable=False,  # TODO: actually autodetect this
            recursive=False,
        )
        canonical_schema.append(field)

    actor, sys_time = "urn:li:corpuser:dbt_executor", int(time.time() * 1000)

    last_modified = sys_time
    if node.max_loaded_at is not None:
        last_modified = int(
            dateutil.parser.parse(node.max_loaded_at).timestamp() * 1000)

    return SchemaMetadata(
        schemaName=node.dbt_name,
        platform=f"urn:li:dataPlatform:{platform}",
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        created=AuditStamp(time=sys_time, actor=actor),
        lastModified=AuditStamp(time=last_modified, actor=actor),
        fields=canonical_schema,
    )
def _get_schema_fields(
        self, elastic_schema_dict: Dict[str, Any]
) -> Generator[SchemaField, None, None]:
    # emit a schema field for each column
    for columnName, column in elastic_schema_dict.items():
        elastic_type: Optional[str] = column.get("type")
        nested_props: Optional[Dict[str, Any]] = column.get("properties")
        if elastic_type is not None:
            self._prefix_name_stack.append(f"[type={elastic_type}].{columnName}")
            schema_field_data_type = self.get_column_type(elastic_type)
            schema_field = SchemaField(
                fieldPath=self._get_cur_field_path(),
                nativeDataType=elastic_type,
                type=schema_field_data_type,
                description=None,
                nullable=True,
                recursive=False,
            )
            yield schema_field
            self._prefix_name_stack.pop()
        elif nested_props:
            self._prefix_name_stack.append(f"[type={columnName}]")
            yield from self._get_schema_fields(nested_props)
            self._prefix_name_stack.pop()
        else:
            # Unexpected! Log a warning.
            logger.warning(
                f"Elastic schema does not have either 'type' or 'properties'!"
                f" Schema={json.dumps(elastic_schema_dict)}")
            continue
def infer_schema(self, file: IO[bytes]) -> List[SchemaField]:
    datastore = ujson.load(file)

    if not isinstance(datastore, list):
        datastore = [datastore]

    schema = construct_schema(datastore, delimiter=".")
    fields: List[SchemaField] = []

    for schema_field in sorted(schema.values(),
                               key=lambda x: x["delimited_name"]):
        mapped_type = _field_type_mapping.get(schema_field["type"], NullTypeClass)

        native_type = schema_field["type"]
        if isinstance(native_type, type):
            native_type = native_type.__name__

        field = SchemaField(
            fieldPath=schema_field["delimited_name"],
            nativeDataType=native_type,
            type=SchemaFieldDataType(type=mapped_type()),
            nullable=schema_field["nullable"],
            recursive=False,
        )
        fields.append(field)

    return fields
def _get_fields_and_primary_keys(
    view_fields: List[ViewField],
    reporter: SourceReport,
    tag_measures_and_dimensions: bool = True,
) -> Tuple[List[SchemaField], List[str]]:
    primary_keys: List = []
    fields = []
    for field in view_fields:
        schema_field = SchemaField(
            fieldPath=field.name,
            type=LookerUtil._get_field_type(field.type, reporter),
            nativeDataType=field.type,
            description=f"{field.description}"
            if tag_measures_and_dimensions is True
            else f"{field.field_type.value}. {field.description}",
            globalTags=LookerUtil._get_tags_from_field_type(
                field.field_type, reporter
            )
            if tag_measures_and_dimensions is True
            else None,
            isPartOfKey=field.is_primary_key,
        )
        fields.append(schema_field)
        if field.is_primary_key:
            primary_keys.append(schema_field.fieldPath)
    return fields, primary_keys
def set_metadata(dataset_name: str, fields: List,
                 platform: str = "api") -> SchemaMetadata:
    canonical_schema: List[SchemaField] = []
    for column in fields:
        field = SchemaField(
            fieldPath=column,
            nativeDataType="str",
            type=SchemaFieldDataTypeClass(type=StringTypeClass()),
            description="",
            recursive=False,
        )
        canonical_schema.append(field)

    actor = "urn:li:corpuser:etl"
    sys_time = int(time.time() * 1000)
    schema_metadata = SchemaMetadata(
        schemaName=dataset_name,
        platform=f"urn:li:dataPlatform:{platform}",
        version=0,
        hash="",
        platformSchema=OtherSchemaClass(rawSchema=""),
        created=AuditStamp(time=sys_time, actor=actor),
        lastModified=AuditStamp(time=sys_time, actor=actor),
        fields=canonical_schema,
    )
    return schema_metadata
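A minimal usage sketch for set_metadata, assuming it is importable as defined above; the dataset and field names are made up for illustration.

# Hypothetical call producing a SchemaMetadata aspect for an API-backed dataset.
example_schema = set_metadata(
    dataset_name="payments.transactions",
    fields=["transaction_id", "amount", "currency"],
    platform="api",
)
print(example_schema.schemaName)
print([f.fieldPath for f in example_schema.fields])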
def get_schema_metadata(glue_source: GlueSource) -> SchemaMetadata:
    # `table`, `table_name`, and `sys_time` are expected to be available from
    # the enclosing scope.
    schema = table["StorageDescriptor"]["Columns"]
    fields: List[SchemaField] = []
    for field in schema:
        schema_field = SchemaField(
            fieldPath=field["Name"],
            nativeDataType=field["Type"],
            type=get_column_type(
                glue_source, field["Type"], table_name, field["Name"]
            ),
            description=field.get("Comment"),
            recursive=False,
            nullable=True,
        )
        fields.append(schema_field)

    return SchemaMetadata(
        schemaName=table_name,
        version=0,
        fields=fields,
        platform="urn:li:dataPlatform:glue",
        created=AuditStamp(time=sys_time, actor="urn:li:corpuser:etl"),
        lastModified=AuditStamp(time=sys_time, actor="urn:li:corpuser:etl"),
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
    )
def get_schema_metadata(sql_report: SQLSourceReport, dataset_name: str,
                        platform: str, columns: List[dict]) -> SchemaMetadata:
    canonical_schema: List[SchemaField] = []
    for column in columns:
        field = SchemaField(
            fieldPath=column["name"],
            type=get_column_type(sql_report, dataset_name, column["type"]),
            nativeDataType=column.get("full_type", repr(column["type"])),
            description=column.get("comment", None),
            nullable=column["nullable"],
            recursive=False,
        )
        canonical_schema.append(field)

    actor = "urn:li:corpuser:etl"
    sys_time = get_sys_time()
    schema_metadata = SchemaMetadata(
        schemaName=dataset_name,
        platform=f"urn:li:dataPlatform:{platform}",
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        created=AuditStamp(time=sys_time, actor=actor),
        lastModified=AuditStamp(time=sys_time, actor=actor),
        fields=canonical_schema,
    )
    return schema_metadata
def _create_schema_field(path: List[str],
                         field: FieldDescriptor) -> _PathAndField:
    field_path = ".".join(path)
    schema_field = SchemaField(
        fieldPath=field_path,
        nativeDataType=_get_simple_native_type(field),
        # Protobuf fields are always nullable
        nullable=True,
        type=_get_column_type(field),
    )
    return _PathAndField(field_path, schema_field)
def get_schema_fields_for_column(
        self, dataset_name: str, column: dict,
        pk_constraints: dict = None) -> List[SchemaField]:
    field = SchemaField(
        fieldPath=column["name"],
        type=get_column_type(self.report, dataset_name, column["type"]),
        nativeDataType=column.get("full_type", repr(column["type"])),
        description=column.get("comment", None),
        nullable=column["nullable"],
        recursive=False,
    )
    if (pk_constraints is not None
            and isinstance(pk_constraints, dict)  # some dialects (hive) return list
            and column["name"] in pk_constraints.get("constrained_columns", [])):
        field.isPartOfKey = True
    return [field]
def create_metadata_work_unit(timestamp):
    dataset_snapshot = DatasetSnapshot(
        urn="urn:li:dataset:(urn:li:dataPlatform:glue,datalake_grilled.Barbeque,PROD)",
        aspects=[],
    )

    dataset_snapshot.aspects.append(Status(removed=False))

    dataset_snapshot.aspects.append(
        OwnershipClass(
            owners=[
                OwnerClass(
                    owner="urn:li:corpuser:Susan", type=OwnershipTypeClass.DATAOWNER
                )
            ],
            lastModified=AuditStampClass(
                time=timestamp, actor="urn:li:corpuser:datahub"
            ),
        )
    )

    dataset_snapshot.aspects.append(
        DatasetPropertiesClass(
            description="Grilled Food",
            customProperties={},
            uri=None,
            tags=[],
        )
    )

    fields = [
        SchemaField(
            fieldPath="Size",
            nativeDataType="int",
            type=SchemaFieldDataType(type=NumberTypeClass()),
            description="Maximum attendees permitted",
            nullable=True,
            recursive=False,
        )
    ]

    schema_metadata = SchemaMetadata(
        schemaName="datalake_grilled.Barbeque",
        version=0,
        fields=fields,
        platform="urn:li:dataPlatform:glue",
        created=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        lastModified=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
    )
    dataset_snapshot.aspects.append(schema_metadata)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return MetadataWorkUnit(id="glue-datalake_grilled.Barbeque", mce=mce)
def get_schema_metadata(report: SourceReport, node: DBTNode,
                        platform: str) -> SchemaMetadata:
    canonical_schema: List[SchemaField] = []
    for column in node.columns:
        description = None

        if (column.comment and column.description
                and column.comment != column.description):
            description = f"{platform} comment: {column.comment}\n\ndbt model description: {column.description}"
        elif column.comment:
            description = column.comment
        elif column.description:
            description = column.description

        globalTags = None
        if column.tags:
            globalTags = GlobalTagsClass(tags=[
                TagAssociationClass(f"urn:li:tag:{tag}") for tag in column.tags
            ])

        field = SchemaField(
            fieldPath=column.name,
            nativeDataType=column.data_type,
            type=get_column_type(report, node.dbt_name, column.data_type),
            description=description,
            nullable=False,  # TODO: actually autodetect this
            recursive=False,
            globalTags=globalTags,
        )
        canonical_schema.append(field)

    last_modified = None
    if node.max_loaded_at is not None:
        actor = "urn:li:corpuser:dbt_executor"
        last_modified = AuditStamp(
            time=int(
                dateutil.parser.parse(node.max_loaded_at).timestamp() * 1000),
            actor=actor,
        )

    return SchemaMetadata(
        schemaName=node.dbt_name,
        platform=f"urn:li:dataPlatform:{platform}",
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        lastModified=last_modified,
        fields=canonical_schema,
    )
def get_schema_metadata(
    report: SourceReport, node: DBTNode, platform: str
) -> SchemaMetadata:
    canonical_schema: List[SchemaField] = []
    for column in node.columns:
        field = SchemaField()
        field.fieldPath = column.name
        field.nativeDataType = column.data_type
        field.type = get_column_type(report, node.dbt_name, column.data_type)
        field.description = column.comment
        canonical_schema.append(field)

    actor, sys_time = "urn:li:corpuser:dbt_executor", int(time.time()) * 1000
    schema_metadata = SchemaMetadata(
        schemaName=node.dbt_name,
        platform=f"urn:li:dataPlatform:{platform}",
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        created=AuditStamp(time=sys_time, actor=actor),
        lastModified=AuditStamp(time=sys_time, actor=actor),
        fields=canonical_schema,
    )
    return schema_metadata
def get_schema_metadata(glue_source: GlueSource) -> SchemaMetadata:
    # `table`, `table_name`, and `self` are expected to be available from the
    # enclosing scope.
    schema = table["StorageDescriptor"]["Columns"]
    fields: List[SchemaField] = []
    for field in schema:
        schema_field = SchemaField(
            fieldPath=field["Name"],
            nativeDataType=field["Type"],
            type=get_column_type(glue_source, field["Type"], table_name,
                                 field["Name"]),
            description=field.get("Comment"),
            recursive=False,
            nullable=True,
        )
        fields.append(schema_field)

    partition_keys = table.get("PartitionKeys", [])
    for partition_key in partition_keys:
        schema_field = SchemaField(
            fieldPath=partition_key["Name"],
            nativeDataType=partition_key["Type"],
            type=get_column_type(
                glue_source,
                partition_key["Type"],
                table_name,
                partition_key["Name"],
            ),
            recursive=False,
            nullable=False,
        )
        fields.append(schema_field)

    return SchemaMetadata(
        schemaName=table_name,
        version=0,
        fields=fields,
        platform=f"urn:li:dataPlatform:{self.get_underlying_platform()}",
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
    )
def get_schema_metadata(
    sql_report: SQLSourceReport,
    dataset_name: str,
    platform: str,
    columns: List[dict],
    pk_constraints: dict = None,
    foreign_keys: List[ForeignKeyConstraint] = None,
) -> SchemaMetadata:
    canonical_schema: List[SchemaField] = []
    for column in columns:
        field = SchemaField(
            fieldPath=column["name"],
            type=get_column_type(sql_report, dataset_name, column["type"]),
            nativeDataType=column.get("full_type", repr(column["type"])),
            description=column.get("comment", None),
            nullable=column["nullable"],
            recursive=False,
        )
        if (pk_constraints is not None
                and isinstance(pk_constraints, dict)  # some dialects (hive) return list
                and column["name"] in pk_constraints.get("constrained_columns", [])):
            field.isPartOfKey = True
        canonical_schema.append(field)

    schema_metadata = SchemaMetadata(
        schemaName=dataset_name,
        platform=f"urn:li:dataPlatform:{platform}",
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        fields=canonical_schema,
    )
    if foreign_keys is not None and foreign_keys != []:
        schema_metadata.foreignKeys = foreign_keys

    return schema_metadata
def emit(self) -> Generator[SchemaField, None, None]:
    if (not isinstance(
            self._actual_schema,
            (
                avro.schema.ArraySchema,
                avro.schema.Field,
                avro.schema.MapSchema,
                avro.schema.RecordSchema,
            ),
    ) and self._converter._fields_stack):
        # We are in the context of a non-nested (simple) field or the special-cased union.
        yield from self._converter._gen_from_last_field()
    else:
        # Just emit the SchemaField from the schema provided in the ctor.
        schema = self._schema
        actual_schema = self._actual_schema
        if isinstance(schema, avro.schema.Field):
            # A Field's schema is actually its type.
            schema = schema.type
            actual_schema = (
                self._converter._get_underlying_type_if_option_as_union(
                    schema, schema))

        description = self._description
        if description is None:
            description = schema.props.get("doc", None)

        native_data_type = self._converter._prefix_name_stack[-1]
        if isinstance(schema, (avro.schema.Field, avro.schema.UnionSchema)):
            native_data_type = self._converter._prefix_name_stack[-2]
        type_prefix = "[type="
        if native_data_type.startswith(type_prefix):
            native_data_type = native_data_type[slice(
                len(type_prefix), len(native_data_type) - 1)]

        field = SchemaField(
            fieldPath=self._converter._get_cur_field_path(),
            # Populate it with the simple native type for now.
            nativeDataType=native_data_type,
            type=self._converter._get_column_type(actual_schema.type),
            description=description,
            recursive=False,
            nullable=self._converter._is_nullable(schema),
            isPartOfKey=self._converter._is_key_schema,
        )
        yield field
def emit_upstream_tables(self) -> Iterable[MetadataWorkUnit]:
    for (table_urn, (columns, path, is_embedded)) in self.upstream_tables.items():
        if not is_embedded and not self.config.ingest_tables_external:
            logger.error(
                f"Skipping external table {table_urn} as ingest_tables_external is set to False"
            )
            continue

        dataset_snapshot = DatasetSnapshot(
            urn=table_urn,
            aspects=[],
        )
        if path:
            # Browse path
            browse_paths = BrowsePathsClass(
                paths=[f"/{self.config.env.lower()}/{self.platform}/{path}"]
            )
            dataset_snapshot.aspects.append(browse_paths)
        else:
            logger.debug(f"Browse path not set for table {table_urn}")

        schema_metadata = None
        if columns:
            fields = []
            for field in columns:
                nativeDataType = field.get("remoteType", "UNKNOWN")
                TypeClass = FIELD_TYPE_MAPPING.get(nativeDataType, NullTypeClass)
                schema_field = SchemaField(
                    fieldPath=field["name"],
                    type=SchemaFieldDataType(type=TypeClass()),
                    description="",
                    nativeDataType=nativeDataType,
                )
                fields.append(schema_field)

            schema_metadata = SchemaMetadata(
                schemaName="test",
                platform=f"urn:li:dataPlatform:{self.platform}",
                version=0,
                fields=fields,
                hash="",
                platformSchema=OtherSchema(rawSchema=""),
            )
        if schema_metadata is not None:
            dataset_snapshot.aspects.append(schema_metadata)

        yield self.get_metadata_change_event(dataset_snapshot)
def _get_fields_and_primary_keys(
    self, looker_view: LookerView
) -> Tuple[List[SchemaField], List[str]]:
    fields: List[SchemaField] = []
    primary_keys: List = []
    for field in looker_view.fields:
        schema_field = SchemaField(
            fieldPath=field.name,
            type=self._get_field_type(field.type),
            nativeDataType=field.type,
            description=f"{field.field_type.value}. {field.description}",
        )
        fields.append(schema_field)
        if field.is_primary_key:
            primary_keys.append(schema_field.fieldPath)
    return fields, primary_keys
def _recordschema_to_mce_fields(
        schema: avro.schema.RecordSchema) -> List[SchemaField]:
    fields: List[SchemaField] = []
    for parsed_field in schema.fields:
        field = SchemaField(
            fieldPath=parsed_field.name,
            nativeDataType=str(parsed_field.type),
            type=_get_column_type(parsed_field.type),
            description=parsed_field.props.get("doc", None),
            recursive=False,
            nullable=_is_nullable(parsed_field.type),
        )
        fields.append(field)
    return fields
def get_table_schema_fields(table: Table, max_rows: int) -> List[SchemaField]:
    table.infer(limit=max_rows)
    fields: List[SchemaField] = []
    for raw_field in table.schema.fields:
        mapped_type: Type = tableschema_type_map.get(raw_field.type, NullTypeClass)
        field = SchemaField(
            fieldPath=raw_field.name,
            type=SchemaFieldDataType(mapped_type()),
            nativeDataType=str(raw_field.type),
            recursive=False,
        )
        fields.append(field)
    return fields
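A minimal usage sketch, assuming get_table_schema_fields and its tableschema/SchemaField dependencies are importable; the CSV path below is a placeholder.

# Hypothetical driver: let tableschema infer column types from the first rows of a CSV.
from tableschema import Table

events_table = Table("events.csv")  # placeholder path
for inferred in get_table_schema_fields(events_table, max_rows=100):
    print(inferred.fieldPath, inferred.nativeDataType)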
def _genericschema_to_mce_fields(
        schema: avro.schema.Schema) -> List[SchemaField]:
    fields: List[SchemaField] = []
    # In the generic (non-RecordSchema) case, only a single SchemaField will be
    # returned and the fieldPath will be set to empty to signal that the type
    # refers to the whole object.
    field = SchemaField(
        fieldPath="",
        nativeDataType=str(schema.type),
        type=_get_column_type(schema.type),
        description=schema.props.get("doc", None),
        recursive=False,
        nullable=_is_nullable(schema),
    )
    fields.append(field)
    return fields
def _get_schema_metadata_for_datasource(
    self, datasource_fields: List[dict]
) -> Optional[SchemaMetadata]:
    fields = []
    schema_metadata = None
    for field in datasource_fields:
        # check datasource - custom sql relations from a field being referenced
        self._track_custom_sql_ids(field)

        nativeDataType = field.get("dataType", "UNKNOWN")
        TypeClass = FIELD_TYPE_MAPPING.get(nativeDataType, NullTypeClass)
        schema_field = SchemaField(
            fieldPath=field["name"],
            type=SchemaFieldDataType(type=TypeClass()),
            description=make_description_from_params(
                field.get("description", ""), field.get("formula")
            ),
            nativeDataType=nativeDataType,
            globalTags=get_tags_from_params(
                [
                    field.get("role", ""),
                    field.get("__typename", ""),
                    field.get("aggregation", ""),
                ]
            )
            if self.config.ingest_tags
            else None,
        )
        fields.append(schema_field)

    if fields:
        schema_metadata = SchemaMetadata(
            schemaName="test",
            platform=f"urn:li:dataPlatform:{self.platform}",
            version=0,
            fields=fields,
            hash="",
            platformSchema=OtherSchema(rawSchema=""),
        )
    return schema_metadata
def _recordschema_to_mce_fields(
        schema: avro.schema.RecordSchema) -> List[SchemaField]:
    fields: List[SchemaField] = []
    for parsed_field in schema.fields:
        description: Optional[str] = parsed_field.doc
        if parsed_field.has_default:
            description = description if description else "No description available."
            description = f"{description}\nField default value: {parsed_field.default}"

        field = SchemaField(
            fieldPath=parsed_field.name,
            nativeDataType=str(parsed_field.type),
            type=_get_column_type(parsed_field.type),
            description=description,
            recursive=False,
            nullable=_is_nullable(parsed_field.type),
        )
        fields.append(field)
    return fields
def infer_schema(self, file: IO[bytes]) -> List[SchemaField]:
    # infer the schema of a parquet file without reading the whole file:
    # only the schema stored in the file's footer metadata is read
    schema = pyarrow.parquet.read_schema(file, memory_map=True)

    fields: List[SchemaField] = []
    for name, pyarrow_type in zip(schema.names, schema.types):
        mapped_type = map_pyarrow_type(pyarrow_type)

        field = SchemaField(
            fieldPath=name,
            type=SchemaFieldDataType(mapped_type()),
            nativeDataType=str(pyarrow_type),
            recursive=False,
        )
        fields.append(field)

    return fields
def emit_upstream_tables(self) -> Iterable[MetadataWorkUnit]:
    for (table_urn, (columns, path)) in self.upstream_tables.items():
        dataset_snapshot = DatasetSnapshot(
            urn=table_urn,
            aspects=[],
        )
        # Browse path
        browse_paths = BrowsePathsClass(
            paths=[f"/{self.config.env.lower()}/{self.platform}/{path}"])
        dataset_snapshot.aspects.append(browse_paths)

        fields = []
        for field in columns:
            nativeDataType = field.get("remoteType", "UNKNOWN")
            TypeClass = FIELD_TYPE_MAPPING.get(nativeDataType, NullTypeClass)
            schema_field = SchemaField(
                fieldPath=field["name"],
                type=SchemaFieldDataType(type=TypeClass()),
                description="",
                nativeDataType=nativeDataType,
            )
            fields.append(schema_field)

        schema_metadata = SchemaMetadata(
            schemaName="test",
            platform=f"urn:li:dataPlatform:{self.platform}",
            version=0,
            fields=fields,
            hash="",
            platformSchema=OtherSchema(rawSchema=""),
        )
        if schema_metadata is not None:
            dataset_snapshot.aspects.append(schema_metadata)

        yield self.get_metadata_change_event(dataset_snapshot)
def _schema_fields_from_dag(
    graph: nx.DiGraph, is_key_schema: bool
) -> List[SchemaField]:
    generations: List = list(nx.algorithms.dag.topological_generations(graph))
    fields: Dict = {}

    if generations and generations[0]:
        roots = generations[0]
        leafs: List = []
        for node in graph:
            if graph.out_degree(node) == 0:
                leafs.append(node)

        type_of_nodes: Dict = nx.get_node_attributes(graph, "node_type")

        for root in roots:
            root_type = type_of_nodes[root]
            for leaf in leafs:
                paths = list(nx.all_simple_edge_paths(graph, root, leaf))
                if paths:
                    for path in paths:
                        stack: List[str] = ["[version=2.0]"]
                        if is_key_schema:
                            stack.append("[key=True]")
                        stack.append(root_type)
                        if len(roots) > 1:
                            stack.append(re.sub(r"^.*\.", "", root))
                        root_path = ".".join(stack)
                        fields[root_path] = SchemaField(
                            fieldPath=root_path,
                            nativeDataType="message",
                            type=SchemaFieldDataType(type=RecordTypeClass()),
                        )
                        for field in _traverse_path(graph, path, stack):
                            fields[field.path] = field.field

    return sorted(fields.values(), key=lambda sf: sf.fieldPath)
def emit(self) -> Generator[SchemaField, None, None]:
    if (not isinstance(
            self._actual_schema,
            (
                avro.schema.ArraySchema,
                avro.schema.Field,
                avro.schema.MapSchema,
                avro.schema.RecordSchema,
            ),
    ) and self._converter._fields_stack):
        # We are in the context of a non-nested (simple) field or the special-cased union.
        yield from self._converter._gen_from_last_field()
    else:
        # Just emit the SchemaField from the schema provided in the ctor.
        schema = self._schema
        actual_schema = self._actual_schema
        if isinstance(schema, avro.schema.Field):
            # A Field's schema is actually its type.
            schema = schema.type
            actual_schema = (
                self._converter._get_underlying_type_if_option_as_union(
                    schema, schema))

        description = self._description
        if description is None:
            description = schema.props.get("doc", None)

        native_data_type = self._converter._prefix_name_stack[-1]
        if isinstance(schema, (avro.schema.Field, avro.schema.UnionSchema)):
            native_data_type = self._converter._prefix_name_stack[-2]
        type_prefix = "[type="
        if native_data_type.startswith(type_prefix):
            native_data_type = native_data_type[slice(
                len(type_prefix), len(native_data_type) - 1)]
        native_data_type = actual_schema.props.get(
            "native_data_type", native_data_type)

        field_path = self._converter._get_cur_field_path()

        merged_props = {}
        merged_props.update(self._schema.other_props)
        merged_props.update(schema.other_props)

        tags = None
        if "deprecated" in merged_props:
            # Guard against a missing doc string so the concatenation below
            # does not fail when description is None.
            description = (
                f"<span style=\"color:red\">DEPRECATED: {merged_props['deprecated']}</span>\n"
                + (description or ""))
            tags = GlobalTagsClass(
                tags=[TagAssociationClass(tag="urn:li:tag:Deprecated")])

        field = SchemaField(
            fieldPath=field_path,
            # Populate it with the simple native type for now.
            nativeDataType=native_data_type,
            type=self._converter._get_column_type(
                actual_schema.type, actual_schema.props.get("logicalType")),
            description=description,
            recursive=False,
            nullable=self._converter._is_nullable(schema),
            isPartOfKey=self._converter._is_key_schema,
            globalTags=tags,
            jsonProps=json.dumps(merged_props) if merged_props else None,
        )
        yield field
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    platform = "mongodb"

    database_names: List[str] = self.mongo_client.list_database_names()

    # traverse databases in sorted order so output is consistent
    for database_name in sorted(database_names):
        if database_name in DENY_DATABASE_LIST:
            continue
        if not self.config.database_pattern.allowed(database_name):
            self.report.report_dropped(database_name)
            continue

        database = self.mongo_client[database_name]
        collection_names: List[str] = database.list_collection_names()

        # traverse collections in sorted order so output is consistent
        for collection_name in sorted(collection_names):
            dataset_name = f"{database_name}.{collection_name}"

            if not self.config.collection_pattern.allowed(dataset_name):
                self.report.report_dropped(dataset_name)
                continue

            dataset_urn = f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.config.env})"

            dataset_snapshot = DatasetSnapshot(
                urn=dataset_urn,
                aspects=[],
            )

            dataset_properties = DatasetPropertiesClass(
                tags=[],
                customProperties={},
            )
            dataset_snapshot.aspects.append(dataset_properties)

            if self.config.enableSchemaInference:
                assert self.config.maxDocumentSize is not None
                collection_schema = construct_schema_pymongo(
                    database[collection_name],
                    delimiter=".",
                    use_random_sampling=self.config.useRandomSampling,
                    max_document_size=self.config.maxDocumentSize,
                    is_version_gte_4_4=self.is_server_version_gte_4_4(),
                    sample_size=self.config.schemaSamplingSize,
                )

                # initialize the schema for the collection
                canonical_schema: List[SchemaField] = []
                max_schema_size = self.config.maxSchemaSize
                collection_schema_size = len(collection_schema.values())
                collection_fields: Union[
                    List[SchemaDescription],
                    ValuesView[SchemaDescription]] = collection_schema.values()
                assert max_schema_size is not None
                if collection_schema_size > max_schema_size:
                    # downsample the schema, using frequency as the sort key
                    self.report.report_warning(
                        key=dataset_urn,
                        reason=f"Downsampling the collection schema because it has {collection_schema_size} fields. Threshold is {max_schema_size}",
                    )
                    collection_fields = sorted(
                        collection_schema.values(),
                        key=lambda x: x["count"],
                        reverse=True,
                    )[0:max_schema_size]
                    # Add this information to the custom properties so user can know they are looking at downsampled schema
                    dataset_properties.customProperties[
                        "schema.downsampled"] = "True"
                    dataset_properties.customProperties[
                        "schema.totalFields"] = f"{collection_schema_size}"

                logger.debug(
                    f"Size of collection fields = {len(collection_fields)}")
                # append each schema field (sort so output is consistent)
                for schema_field in sorted(
                        collection_fields,
                        key=lambda x: x["delimited_name"]):
                    field = SchemaField(
                        fieldPath=schema_field["delimited_name"],
                        nativeDataType=self.get_pymongo_type_string(
                            schema_field["type"], dataset_name),
                        type=self.get_field_type(schema_field["type"],
                                                 dataset_name),
                        description=None,
                        nullable=schema_field["nullable"],
                        recursive=False,
                    )
                    canonical_schema.append(field)

                # create schema metadata object for collection
                schema_metadata = SchemaMetadata(
                    schemaName=collection_name,
                    platform=f"urn:li:dataPlatform:{platform}",
                    version=0,
                    hash="",
                    platformSchema=SchemalessClass(),
                    fields=canonical_schema,
                )
                dataset_snapshot.aspects.append(schema_metadata)

            # TODO: use list_indexes() or index_information() to get index information
            # See https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes.

            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            wu = MetadataWorkUnit(id=dataset_name, mce=mce)
            self.report.report_workunit(wu)
            yield wu
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    platform = "mongodb"

    database_names: List[str] = self.mongo_client.list_database_names()

    # traverse databases in sorted order so output is consistent
    for database_name in sorted(database_names):
        if database_name in DENY_DATABASE_LIST:
            continue
        if not self.config.database_pattern.allowed(database_name):
            self.report.report_dropped(database_name)
            continue

        database = self.mongo_client[database_name]
        collection_names: List[str] = database.list_collection_names()

        # traverse collections in sorted order so output is consistent
        for collection_name in sorted(collection_names):
            dataset_name = f"{database_name}.{collection_name}"

            if not self.config.collection_pattern.allowed(dataset_name):
                self.report.report_dropped(dataset_name)
                continue

            dataset_snapshot = DatasetSnapshot(
                urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.config.env})",
                aspects=[],
            )

            dataset_properties = DatasetPropertiesClass(
                tags=[],
                customProperties={},
            )
            dataset_snapshot.aspects.append(dataset_properties)

            if self.config.enableSchemaInference:
                collection_schema = construct_schema_pymongo(
                    database[collection_name],
                    delimiter=".",
                    sample_size=self.config.schemaSamplingSize,
                )

                # initialize the schema for the collection
                canonical_schema: List[SchemaField] = []

                # append each schema field (sort so output is consistent)
                for schema_field in sorted(
                    collection_schema.values(), key=lambda x: x["delimited_name"]
                ):
                    field = SchemaField(
                        fieldPath=schema_field["delimited_name"],
                        nativeDataType=self.get_pymongo_type_string(
                            schema_field["type"], dataset_name
                        ),
                        type=self.get_field_type(
                            schema_field["type"], dataset_name
                        ),
                        description=None,
                        nullable=schema_field["nullable"],
                        recursive=False,
                    )
                    canonical_schema.append(field)

                # create schema metadata object for collection
                actor = "urn:li:corpuser:etl"
                sys_time = int(time.time() * 1000)
                schema_metadata = SchemaMetadata(
                    schemaName=collection_name,
                    platform=f"urn:li:dataPlatform:{platform}",
                    version=0,
                    hash="",
                    platformSchema=SchemalessClass(),
                    created=AuditStamp(time=sys_time, actor=actor),
                    lastModified=AuditStamp(time=sys_time, actor=actor),
                    fields=canonical_schema,
                )
                dataset_snapshot.aspects.append(schema_metadata)

            # TODO: use list_indexes() or index_information() to get index information
            # See https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes.

            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            wu = MetadataWorkUnit(id=dataset_name, mce=mce)
            self.report.report_workunit(wu)
            yield wu