def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    env: str = "PROD"
    platform = self.platform
    nodes = loadManifestAndCatalog(
        self.config.manifest_path, self.config.catalog_path, platform, env
    )
    for node in nodes:
        mce = MetadataChangeEvent()
        dataset_snapshot = DatasetSnapshot()
        dataset_snapshot.urn = node.datahub_urn

        custom_properties = get_custom_properties(node)
        dbt_properties = DatasetPropertiesClass()
        dbt_properties.description = node.dbt_name
        dbt_properties.customProperties = custom_properties
        dataset_snapshot.aspects.append(dbt_properties)

        upstreams = get_upstream_lineage(node.upstream_urns)
        if upstreams is not None:
            dataset_snapshot.aspects.append(upstreams)

        schema_metadata = get_schema_metadata(self.report, node, platform)
        dataset_snapshot.aspects.append(schema_metadata)

        mce.proposedSnapshot = dataset_snapshot

        wu = MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
        self.report.report_workunit(wu)
        yield wu
def get_workunits(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
    try:
        self.inspect_version()
    except Exception as e:
        self.report.report_failure("version", f"Error: {e}")
        return
    for wu in super().get_workunits():
        yield wu
        if (
            isinstance(wu, SqlWorkUnit)
            and isinstance(wu.metadata, MetadataChangeEvent)
            and isinstance(wu.metadata.proposedSnapshot, DatasetSnapshot)
        ):
            lineage_mcp = None
            lineage_properties_aspect: Optional[DatasetPropertiesClass] = None

            dataset_snapshot: DatasetSnapshotClass = wu.metadata.proposedSnapshot
            assert dataset_snapshot

            if self.config.include_table_lineage:
                lineage_mcp, lineage_properties_aspect = self.get_lineage_mcp(
                    wu.metadata.proposedSnapshot.urn
                )

            if lineage_mcp is not None:
                lineage_wu = MetadataWorkUnit(
                    id=f"redshift-{lineage_mcp.entityUrn}-{lineage_mcp.aspectName}",
                    mcp=lineage_mcp,
                )
                self.report.report_workunit(lineage_wu)
                yield lineage_wu

            if lineage_properties_aspect:
                aspects = dataset_snapshot.aspects
                if aspects is None:
                    aspects = []

                dataset_properties_aspect: Optional[DatasetPropertiesClass] = None
                for aspect in aspects:
                    if isinstance(aspect, DatasetPropertiesClass):
                        dataset_properties_aspect = aspect
                if dataset_properties_aspect is None:
                    dataset_properties_aspect = DatasetPropertiesClass()
                    aspects.append(dataset_properties_aspect)

                custom_properties = (
                    {
                        **dataset_properties_aspect.customProperties,
                        **lineage_properties_aspect.customProperties,
                    }
                    if dataset_properties_aspect.customProperties
                    else lineage_properties_aspect.customProperties
                )
                dataset_properties_aspect.customProperties = custom_properties
                dataset_snapshot.aspects = aspects
def get_workunits(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
    for wu in super().get_workunits():
        if (
            self.config.include_table_lineage
            and isinstance(wu, MetadataWorkUnit)
            and isinstance(wu.metadata, MetadataChangeEvent)
            and isinstance(wu.metadata.proposedSnapshot, DatasetSnapshot)
        ):
            dataset_snapshot: DatasetSnapshot = wu.metadata.proposedSnapshot
            assert dataset_snapshot
            # Join the workunit stream from super with the lineage info using the urn.
            lineage_info = self._get_upstream_lineage_info(dataset_snapshot.urn)
            if lineage_info is not None:
                # Emit the lineage work unit
                upstream_lineage, upstream_column_props = lineage_info
                lineage_mcpw = MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_snapshot.urn,
                    aspectName="upstreamLineage",
                    aspect=upstream_lineage,
                )
                lineage_wu = MetadataWorkUnit(
                    id=f"{self.platform}-{lineage_mcpw.entityUrn}-{lineage_mcpw.aspectName}",
                    mcp=lineage_mcpw,
                )
                self.report.report_workunit(lineage_wu)
                yield lineage_wu

                # Update the super's workunit to include the column-lineage in the
                # custom properties. We need to follow the RCU semantics for both the
                # aspects & customProperties in order to preserve the changes made by super.
                aspects = dataset_snapshot.aspects
                if aspects is None:
                    aspects = []

                dataset_properties_aspect: Optional[DatasetPropertiesClass] = None
                for aspect in aspects:
                    if isinstance(aspect, DatasetPropertiesClass):
                        dataset_properties_aspect = aspect
                if dataset_properties_aspect is None:
                    dataset_properties_aspect = DatasetPropertiesClass()
                    aspects.append(dataset_properties_aspect)

                custom_properties = (
                    {
                        **dataset_properties_aspect.customProperties,
                        **upstream_column_props,
                    }
                    if dataset_properties_aspect.customProperties
                    else upstream_column_props
                )
                dataset_properties_aspect.customProperties = custom_properties
                dataset_snapshot.aspects = aspects

        # Emit the work unit from super.
        yield wu
def get_workunits(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
    for wu in super().get_workunits():
        if (
            self.config.include_table_lineage
            and isinstance(wu, SqlWorkUnit)
            and isinstance(wu.metadata, MetadataChangeEvent)
            and isinstance(wu.metadata.proposedSnapshot, DatasetSnapshot)
        ):
            dataset_snapshot: DatasetSnapshotClass = wu.metadata.proposedSnapshot
            assert dataset_snapshot

            lineage_mcp, lineage_properties_aspect = self.get_lineage_mcp(
                wu.metadata.proposedSnapshot.urn
            )

            if lineage_mcp is not None:
                lineage_wu = MetadataWorkUnit(
                    id=f"{self.platform}-{lineage_mcp.entityUrn}-{lineage_mcp.aspectName}",
                    mcp=lineage_mcp,
                )
                self.report.report_workunit(lineage_wu)
                yield lineage_wu

            if lineage_properties_aspect:
                aspects = dataset_snapshot.aspects
                if aspects is None:
                    aspects = []

                dataset_properties_aspect: Optional[DatasetPropertiesClass] = None
                for aspect in aspects:
                    if isinstance(aspect, DatasetPropertiesClass):
                        dataset_properties_aspect = aspect
                if dataset_properties_aspect is None:
                    dataset_properties_aspect = DatasetPropertiesClass()
                    aspects.append(dataset_properties_aspect)

                custom_properties = (
                    {
                        **dataset_properties_aspect.customProperties,
                        **lineage_properties_aspect.customProperties,
                    }
                    if dataset_properties_aspect.customProperties
                    else lineage_properties_aspect.customProperties
                )
                dataset_properties_aspect.customProperties = custom_properties
                dataset_snapshot.aspects = aspects

        # Emit the work unit from super.
        yield wu
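# The three get_workunits overrides above repeat the same read-copy-update merge of
# customProperties into an existing DatasetPropertiesClass aspect. A minimal sketch of
# that merge in isolation, assuming datahub's schema classes; the helper name
# merge_custom_properties is hypothetical and not part of any source above.
from typing import Dict, List, Optional

from datahub.metadata.schema_classes import DatasetPropertiesClass


def merge_custom_properties(aspects: List, extra_props: Dict[str, str]) -> List:
    # Reuse the last DatasetPropertiesClass aspect if one exists, else create one.
    props: Optional[DatasetPropertiesClass] = None
    for aspect in aspects:
        if isinstance(aspect, DatasetPropertiesClass):
            props = aspect
    if props is None:
        props = DatasetPropertiesClass()
        aspects.append(props)
    # Later keys win, so the lineage-derived properties override existing ones on collision.
    props.customProperties = {**(props.customProperties or {}), **extra_props}
    return aspects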
def _create_dataset_properties_aspect(
    self, node: DBTNode, additional_custom_props_filtered: Dict[str, str]
) -> DatasetPropertiesClass:
    description = None
    if self.config.disable_dbt_node_creation:
        if node.comment and node.description and node.comment != node.description:
            description = f"{self.config.target_platform} comment: {node.comment}\n\ndbt model description: {node.description}"
        elif node.comment:
            description = node.comment
        elif node.description:
            description = node.description
    else:
        description = node.description

    custom_props = {
        **get_custom_properties(node),
        **additional_custom_props_filtered,
    }
    dbt_properties = DatasetPropertiesClass(
        description=description,
        customProperties=custom_props,
        tags=node.tags,
        name=node.name,
    )
    return dbt_properties
def get_workunits(self) -> Iterable[SqlWorkUnit]:
    sql_config = self.config
    if logger.isEnabledFor(logging.DEBUG):
        # If debug logging is enabled, we also want to echo each SQL query issued.
        sql_config.options["echo"] = True

    url = sql_config.get_sql_alchemy_url()
    logger.debug(f"sql_alchemy_url={url}")
    engine = create_engine(url, **sql_config.options)
    inspector = reflection.Inspector.from_engine(engine)
    for schema in inspector.get_schema_names():
        if not sql_config.schema_pattern.allowed(schema):
            self.report.report_dropped(schema)
            continue

        for table in inspector.get_table_names(schema):
            schema, table = sql_config.standardize_schema_table_names(schema, table)
            dataset_name = sql_config.get_identifier(schema, table)
            self.report.report_table_scanned(dataset_name)

            if not sql_config.table_pattern.allowed(dataset_name):
                self.report.report_dropped(dataset_name)
                continue

            columns = inspector.get_columns(table, schema)
            try:
                table_info: dict = inspector.get_table_comment(table, schema)
            except NotImplementedError:
                description: Optional[str] = None
                properties: Dict[str, str] = {}
            else:
                description = table_info["text"]
                # The "properties" field is a non-standard addition to SQLAlchemy's interface.
                properties = table_info.get("properties", {})

            # TODO: capture inspector.get_pk_constraint
            # TODO: capture inspector.get_sorted_table_and_fkc_names

            dataset_snapshot = DatasetSnapshot(
                urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{dataset_name},{self.config.env})",
                aspects=[],
            )
            if description is not None or properties:
                dataset_properties = DatasetPropertiesClass(
                    description=description,
                    customProperties=properties,
                    # uri=dataset_name,
                )
                dataset_snapshot.aspects.append(dataset_properties)

            schema_metadata = get_schema_metadata(
                self.report, dataset_name, self.platform, columns
            )
            dataset_snapshot.aspects.append(schema_metadata)

            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            wu = SqlWorkUnit(id=dataset_name, mce=mce)
            self.report.report_workunit(wu)
            yield wu
def loop_tables(
    self,
    inspector: Inspector,
    schema: str,
    sql_config: SQLAlchemyConfig,
) -> Iterable[SqlWorkUnit]:
    for table in inspector.get_table_names(schema):
        schema, table = self.standardize_schema_table_names(schema=schema, entity=table)
        dataset_name = self.get_identifier(schema=schema, entity=table, inspector=inspector)
        self.report.report_entity_scanned(dataset_name, ent_type="table")

        if not sql_config.table_pattern.allowed(dataset_name):
            self.report.report_dropped(dataset_name)
            continue

        columns = inspector.get_columns(table, schema)
        if len(columns) == 0:
            self.report.report_warning(dataset_name, "missing column information")

        try:
            # SQLAlchemy stubs are incomplete and missing this method.
            # PR: https://github.com/dropbox/sqlalchemy-stubs/pull/223.
            table_info: dict = inspector.get_table_comment(table, schema)  # type: ignore
        except NotImplementedError:
            description: Optional[str] = None
            properties: Dict[str, str] = {}
        else:
            description = table_info["text"]
            # The "properties" field is a non-standard addition to SQLAlchemy's interface.
            properties = table_info.get("properties", {})

        # TODO: capture inspector.get_pk_constraint
        # TODO: capture inspector.get_sorted_table_and_fkc_names

        dataset_snapshot = DatasetSnapshot(
            urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{dataset_name},{self.config.env})",
            aspects=[],
        )
        if description is not None or properties:
            dataset_properties = DatasetPropertiesClass(
                description=description,
                customProperties=properties,
            )
            dataset_snapshot.aspects.append(dataset_properties)

        schema_metadata = get_schema_metadata(
            self.report, dataset_name, self.platform, columns
        )
        dataset_snapshot.aspects.append(schema_metadata)

        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = SqlWorkUnit(id=dataset_name, mce=mce)
        self.report.report_workunit(wu)
        yield wu
def get_initial_mce() -> MetadataChangeEventClass: return MetadataChangeEventClass( proposedSnapshot=DatasetSnapshotClass( urn="urn:li:dataset:(urn:li:dataPlatform:test_platform,test,PROD)", aspects=[DatasetPropertiesClass(description="test.description", )], ), systemMetadata=SystemMetadata(lastObserved=1586847600000, runId="pipeline_test"), )
def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventClass:
    # Legacy transformers should not receive metadata change proposal events.
    assert not isinstance(mce, MetadataChangeProposalWrapper)
    if isinstance(mce, MetadataChangeEventClass):
        assert isinstance(mce.proposedSnapshot, DatasetSnapshotClass)
        mce.proposedSnapshot.aspects.append(
            DatasetPropertiesClass(description="Old Transformer was here")
        )
    return mce
def get_workunits(self) -> Iterable[SqlWorkUnit]:
    env: str = "PROD"
    sql_config = self.config
    platform = self.platform
    url = sql_config.get_sql_alchemy_url()
    logger.debug(f"sql_alchemy_url={url}")
    engine = create_engine(url, **sql_config.options)
    inspector = reflection.Inspector.from_engine(engine)
    for schema in inspector.get_schema_names():
        if not sql_config.schema_pattern.allowed(schema):
            self.report.report_dropped(schema)
            continue

        for table in inspector.get_table_names(schema):
            schema, table = sql_config.standardize_schema_table_names(schema, table)
            dataset_name = sql_config.get_identifier(schema, table)
            self.report.report_table_scanned(dataset_name)

            if not sql_config.table_pattern.allowed(dataset_name):
                self.report.report_dropped(dataset_name)
                continue

            columns = inspector.get_columns(table, schema)
            try:
                description: Optional[str] = inspector.get_table_comment(table, schema)["text"]
            except NotImplementedError:
                description = None

            # TODO: capture inspector.get_pk_constraint
            # TODO: capture inspector.get_sorted_table_and_fkc_names

            dataset_snapshot = DatasetSnapshot(
                urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{env})",
                aspects=[],
            )
            if description is not None:
                dataset_properties = DatasetPropertiesClass(
                    description=description,
                    tags=[],
                    customProperties={},
                    # uri=dataset_name,
                )
                dataset_snapshot.aspects.append(dataset_properties)

            schema_metadata = get_schema_metadata(
                self.report, dataset_name, platform, columns
            )
            dataset_snapshot.aspects.append(schema_metadata)

            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            wu = SqlWorkUnit(id=dataset_name, mce=mce)
            self.report.report_workunit(wu)
            yield wu
def create_metadata_work_unit(timestamp):
    dataset_snapshot = DatasetSnapshot(
        urn="urn:li:dataset:(urn:li:dataPlatform:glue,datalake_grilled.Barbeque,PROD)",
        aspects=[],
    )

    dataset_snapshot.aspects.append(Status(removed=False))

    dataset_snapshot.aspects.append(
        OwnershipClass(
            owners=[
                OwnerClass(
                    owner="urn:li:corpuser:Susan", type=OwnershipTypeClass.DATAOWNER
                )
            ],
            lastModified=AuditStampClass(
                time=timestamp, actor="urn:li:corpuser:datahub"
            ),
        )
    )

    dataset_snapshot.aspects.append(
        DatasetPropertiesClass(
            description="Grilled Food",
            customProperties={},
            uri=None,
            tags=[],
        )
    )

    fields = [
        SchemaField(
            fieldPath="Size",
            nativeDataType="int",
            type=SchemaFieldDataType(type=NumberTypeClass()),
            description="Maximum attendees permitted",
            nullable=True,
            recursive=False,
        )
    ]

    schema_metadata = SchemaMetadata(
        schemaName="datalake_grilled.Barbeque",
        version=0,
        fields=fields,
        platform="urn:li:dataPlatform:glue",
        created=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        lastModified=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
    )
    dataset_snapshot.aspects.append(schema_metadata)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return MetadataWorkUnit(id="glue-datalake_grilled.Barbeque", mce=mce)
def __to_datahub_dataset(
    self, dataset: Optional[PowerBiAPI.Dataset]
) -> List[MetadataChangeProposalWrapper]:
    """
    Map a PowerBi dataset to DataHub datasets. Each table of the PowerBi dataset
    is mapped to a DataHub dataset. A PowerBi tile has a single dataset, but the
    corresponding DataHub chart may have many input sources.
    """
    dataset_mcps: List[MetadataChangeProposalWrapper] = []
    if dataset is None:
        return dataset_mcps

    # We only support relational PowerBi data sources.
    if (
        dataset.datasource is None
        or dataset.datasource.metadata.is_relational is False
    ):
        LOGGER.warning(
            "Dataset {}({}) is not created from a relational datasource".format(
                dataset.name, dataset.id
            )
        )
        return dataset_mcps

    LOGGER.info(
        "Converting dataset={}(id={}) to datahub dataset".format(
            dataset.name, dataset.id
        )
    )

    for table in dataset.tables:
        # Create a URN for the dataset
        ds_urn = builder.make_dataset_urn(
            platform=self.__config.dataset_type_mapping[dataset.datasource.type],
            name="{}.{}.{}".format(
                dataset.datasource.database, table.schema_name, table.name
            ),
            env=self.__config.env,
        )
        LOGGER.info("{}={}".format(Constant.Dataset_URN, ds_urn))

        # Create the datasetProperties mcp
        ds_properties = DatasetPropertiesClass(description=table.name)
        info_mcp = self.new_mcp(
            entity_type=Constant.DATASET,
            entity_urn=ds_urn,
            aspect_name=Constant.DATASET_PROPERTIES,
            aspect=ds_properties,
        )

        # Create the status mcp
        status_mcp = self.new_mcp(
            entity_type=Constant.DATASET,
            entity_urn=ds_urn,
            aspect_name=Constant.STATUS,
            aspect=StatusClass(removed=False),
        )

        dataset_mcps.extend([info_mcp, status_mcp])

    return dataset_mcps
def loop_views(
    self,
    inspector: Any,
    schema: str,
    sql_config: SQLAlchemyConfig,
) -> Iterable[SqlWorkUnit]:
    for view in inspector.get_view_names(schema):
        schema, view = sql_config.standardize_schema_table_names(schema, view)
        dataset_name = sql_config.get_identifier(schema, view)
        self.report.report_entity_scanned(dataset_name, ent_type="view")

        if not sql_config.view_pattern.allowed(dataset_name):
            self.report.report_dropped(dataset_name)
            continue

        columns = inspector.get_columns(view, schema)
        try:
            view_info: dict = inspector.get_table_comment(view, schema)
        except NotImplementedError:
            description: Optional[str] = None
            properties: Dict[str, str] = {}
        else:
            description = view_info["text"]
            # The "properties" field is a non-standard addition to SQLAlchemy's interface.
            properties = view_info.get("properties", {})

        view_definition = inspector.get_view_definition(view)
        if view_definition is None:
            view_definition = ""
        properties["view_definition"] = view_definition
        properties["is_view"] = "True"

        dataset_snapshot = DatasetSnapshot(
            urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{dataset_name},{self.config.env})",
            aspects=[],
        )
        if description is not None or properties:
            dataset_properties = DatasetPropertiesClass(
                description=description,
                customProperties=properties,
                # uri=dataset_name,
            )
            dataset_snapshot.aspects.append(dataset_properties)

        schema_metadata = get_schema_metadata(
            self.report, dataset_name, self.platform, columns
        )
        dataset_snapshot.aspects.append(schema_metadata)

        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = SqlWorkUnit(id=dataset_name, mce=mce)
        self.report.report_workunit(wu)
        yield wu
def _get_custom_properties(self, looker_view: LookerView) -> DatasetPropertiesClass:
    file_path = str(
        pathlib.Path(looker_view.absolute_file_path).resolve()
    ).replace(str(self.source_config.base_folder.resolve()), "")

    custom_properties = {
        # Grab a limited slice of characters from the file.
        "looker.file.content": looker_view.raw_file_content[0:512000],
        "looker.file.path": file_path,
    }
    dataset_props = DatasetPropertiesClass(customProperties=custom_properties)

    if self.source_config.github_info is not None:
        github_file_url = self.source_config.github_info.get_url_for_file_path(
            file_path
        )
        dataset_props.externalUrl = github_file_url

    return dataset_props
def _get_data_stream_index_count_mcps(
    self,
) -> Iterable[MetadataChangeProposalWrapper]:
    for data_stream, count in self.data_stream_partition_count.items():
        dataset_urn: str = make_dataset_urn(
            self.platform, data_stream, self.source_config.env
        )
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName="datasetProperties",
            aspect=DatasetPropertiesClass(
                customProperties={"numPartitions": str(count)}
            ),
            changeType=ChangeTypeClass.UPSERT,
        )
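# A hedged, self-contained sketch of how a datasetProperties MCP like the one yielded
# above could be emitted directly over REST; the data-stream name and the GMS address
# are placeholders, not values taken from the sources above.
from datahub.emitter.mce_builder import make_dataset_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import ChangeTypeClass, DatasetPropertiesClass

mcp = MetadataChangeProposalWrapper(
    entityType="dataset",
    entityUrn=make_dataset_urn("elasticsearch", "my-data-stream", "PROD"),
    aspectName="datasetProperties",
    aspect=DatasetPropertiesClass(customProperties={"numPartitions": "3"}),
    changeType=ChangeTypeClass.UPSERT,
)

emitter = DatahubRestEmitter("http://localhost:8080")  # assumes a local DataHub GMS
emitter.emit_mcp(mcp)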
def get_dataset_properties() -> DatasetPropertiesClass:
    return DatasetPropertiesClass(
        description=table.get("Description"),
        customProperties={
            **table.get("Parameters", {}),
            **{
                k: str(v)
                for k, v in table["StorageDescriptor"].items()
                if k not in ["Columns", "Parameters"]
            },
        },
        uri=table.get("Location"),
        tags=[],
    )
def _to_mce(  # noqa: C901
    self,
    config: LookerCommonConfig,
    reporter: SourceReport,
) -> Optional[MetadataChangeEvent]:
    # We only generate MCE-s for explores that contain from clauses and do NOT contain joins.
    # All other explores (passthrough explores and joins) end in correct resolution of
    # lineage, and don't need additional nodes in the graph.
    dataset_snapshot = DatasetSnapshot(
        urn=self.get_explore_urn(config),
        aspects=[],  # we append to this list later on
    )
    browse_paths = BrowsePathsClass(paths=[self.get_explore_browse_path(config)])
    dataset_snapshot.aspects.append(browse_paths)
    dataset_snapshot.aspects.append(StatusClass(removed=False))

    custom_properties = {"looker.type": "explore"}
    if self.label is not None:
        custom_properties["looker.explore.label"] = str(self.label)
    dataset_props = DatasetPropertiesClass(
        description=self.description,
        customProperties=custom_properties,
    )
    dataset_snapshot.aspects.append(dataset_props)

    if self.upstream_views is not None:
        assert self.project_name is not None
        upstreams = [
            UpstreamClass(
                dataset=LookerViewId(
                    project_name=self.project_name,
                    model_name=self.model_name,
                    view_name=view_name,
                ).get_urn(config),
                type=DatasetLineageTypeClass.VIEW,
            )
            for view_name in self.upstream_views
        ]
        upstream_lineage = UpstreamLineage(upstreams=upstreams)
        dataset_snapshot.aspects.append(upstream_lineage)

    if self.fields is not None:
        schema_metadata = LookerUtil._get_schema(
            platform_name=config.platform_name,
            schema_name=self.name,
            view_fields=self.fields,
            reporter=reporter,
        )
        dataset_snapshot.aspects.append(schema_metadata)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return mce
def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventClass:
    if not isinstance(mce.proposedSnapshot, DatasetSnapshotClass):
        return mce
    properties_to_add = self.config.add_properties_resolver_class(  # type: ignore
        **self.resolver_args
    ).get_properties_to_add(mce.proposedSnapshot)
    if properties_to_add:
        properties = builder.get_or_add_aspect(
            mce, DatasetPropertiesClass(customProperties={})
        )
        properties.customProperties.update(properties_to_add)
    return mce
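# The transformer above delegates to a resolver class whose interface is only implied
# by the call site: it is constructed with **resolver_args and must expose
# get_properties_to_add(snapshot) -> Dict[str, str]. A hypothetical resolver
# illustrating that contract; the class name and the property it adds are invented.
from typing import Dict

from datahub.metadata.schema_classes import DatasetSnapshotClass


class UrnEchoResolver:
    def __init__(self, **resolver_args: str) -> None:
        self.resolver_args = resolver_args

    def get_properties_to_add(self, snapshot: DatasetSnapshotClass) -> Dict[str, str]:
        # Record the URN the properties were resolved from.
        return {"resolved_from_urn": snapshot.urn}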
def ingest_table(self, table_data: TableData) -> Iterable[MetadataWorkUnit]:
    logger.info(f"Extracting table schema from file: {table_data.full_path}")
    browse_path: str = (
        strip_s3_prefix(table_data.table_path)
        if table_data.is_s3
        else table_data.table_path.strip("/")
    )

    data_platform_urn = make_data_platform_urn(self.source_config.platform)
    logger.info(f"Creating dataset urn with name: {browse_path}")
    dataset_urn = make_dataset_urn_with_platform_instance(
        self.source_config.platform,
        browse_path,
        self.source_config.platform_instance,
        self.source_config.env,
    )

    dataset_snapshot = DatasetSnapshot(
        urn=dataset_urn,
        aspects=[],
    )

    dataset_properties = DatasetPropertiesClass(
        description="",
        name=table_data.disaply_name,
        customProperties={},
    )
    dataset_snapshot.aspects.append(dataset_properties)

    fields = self.get_fields(table_data)
    schema_metadata = SchemaMetadata(
        schemaName=table_data.disaply_name,
        platform=data_platform_urn,
        version=0,
        hash="",
        fields=fields,
        platformSchema=OtherSchemaClass(rawSchema=""),
    )
    dataset_snapshot.aspects.append(schema_metadata)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    wu = MetadataWorkUnit(id=table_data.table_path, mce=mce)
    self.report.report_workunit(wu)
    yield wu

    yield from self.create_container_hierarchy(table_data, dataset_urn)

    if self.source_config.profiling.enabled:
        yield from self.get_table_profile(table_data, dataset_urn)
def get_initial_mce() -> MetadataChangeEventClass: return MetadataChangeEventClass( proposedSnapshot=DatasetSnapshotClass( urn="urn:li:dataset:(urn:li:dataPlatform:test_platform,test,PROD)", aspects=[ DatasetPropertiesClass( description="test.description", customProperties={}, uri=None, tags=[], ) ], ) )
def test_old_transformers_working_as_before(mock_time):
    dataset_mce = make_generic_dataset()
    dataset_mcp = make_generic_dataset_mcp()
    transformer = OldMCETransformer.create(
        {},
        PipelineContext(run_id="test-old-transformer"),
    )

    outputs = list(
        transformer.transform(
            [
                RecordEnvelope(input, metadata={})
                for input in [dataset_mce, dataset_mcp, EndOfStream()]
            ]
        )
    )
    assert len(outputs) == 3

    # The MCE is transformed in place.
    assert outputs[0].record == dataset_mce
    # Check that the dataset properties description was added.
    props_aspect = builder.get_aspect_if_available(
        outputs[0].record, DatasetPropertiesClass
    )
    assert props_aspect
    assert props_aspect.description == "Old Transformer was here"

    # The MCP comes back untouched.
    assert outputs[1].record == dataset_mcp

    assert isinstance(outputs[-1].record, EndOfStream)

    # MCP-only stream
    dataset_mcps = [
        make_generic_dataset_mcp(),
        make_generic_dataset_mcp(
            aspect=DatasetPropertiesClass(description="Another test MCP")
        ),
        EndOfStream(),
    ]
    transformer = OldMCETransformer.create(
        {},
        PipelineContext(run_id="test-old-transformer"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(input, metadata={}) for input in dataset_mcps]
        )
    )
    assert len(outputs) == 3

    # MCP-s come back untouched.
    assert outputs[0].record == dataset_mcps[0]
    assert outputs[1].record == dataset_mcps[1]
    assert isinstance(outputs[-1].record, EndOfStream)
def _get_custom_properties(self, looker_view: LookerView) -> DatasetPropertiesClass:
    custom_properties = {
        # Grab a limited slice of characters from the file.
        "looker.file.content": looker_view.raw_file_content[0:512000],
        "looker.file.path": str(
            pathlib.Path(looker_view.absolute_file_path).resolve()
        ).replace(str(self.source_config.base_folder.resolve()), ""),
    }
    dataset_props = DatasetPropertiesClass(customProperties=custom_properties)
    return dataset_props
def construct_dataset_workunits(
    self,
    dataset_platform: str,
    dataset_name: str,
    dataset_urn: Optional[str] = None,
    external_url: Optional[str] = None,
    datasetProperties: Optional[Dict[str, str]] = None,
) -> Iterable[MetadataWorkUnit]:
    if not dataset_urn:
        dataset_urn = builder.make_dataset_urn(
            dataset_platform, dataset_name, self.config.env
        )

    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=dataset_urn,
        changeType=ChangeTypeClass.UPSERT,
        aspectName="dataPlatformInstance",
        aspect=DataPlatformInstanceClass(
            platform=builder.make_data_platform_urn(dataset_platform)
        ),
    )
    platform = (
        dataset_platform[dataset_platform.rindex(":") + 1 :]
        if dataset_platform.startswith("urn:")
        else dataset_platform
    )
    wu = MetadataWorkUnit(id=f"{platform}.{dataset_name}.{mcp.aspectName}", mcp=mcp)
    if wu.id not in self.report.workunit_ids:
        self.report.report_workunit(wu)
        yield wu

    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=dataset_urn,
        changeType=ChangeTypeClass.UPSERT,
        aspectName="datasetProperties",
        aspect=DatasetPropertiesClass(
            externalUrl=external_url, customProperties=datasetProperties
        ),
    )
    wu = MetadataWorkUnit(id=f"{platform}.{dataset_name}.{mcp.aspectName}", mcp=mcp)
    if wu.id not in self.report.workunit_ids:
        self.report.report_workunit(wu)
        yield wu
def get_lineage_mcp(
    self, dataset_urn: str
) -> Tuple[Optional[MetadataChangeProposalWrapper], Optional[DatasetPropertiesClass]]:
    dataset_key = mce_builder.dataset_urn_to_key(dataset_urn)
    if dataset_key is None:
        return None, None

    if not self._lineage_map:
        self._populate_lineage()
    assert self._lineage_map is not None

    upstream_lineage: List[UpstreamClass] = []
    custom_properties: Dict[str, str] = {}

    if dataset_key.name in self._lineage_map:
        item = self._lineage_map[dataset_key.name]
        for upstream in item.upstreams:
            upstream_table = UpstreamClass(
                dataset=builder.make_dataset_urn_with_platform_instance(
                    upstream.platform.value,
                    upstream.path,
                    self.config.platform_instance,
                    self.config.env,
                ),
                type=item.dataset_lineage_type,
            )
            upstream_lineage.append(upstream_table)

    properties = None
    if custom_properties:
        properties = DatasetPropertiesClass(customProperties=custom_properties)

    if not upstream_lineage:
        return None, properties

    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_urn,
        aspectName="upstreamLineage",
        aspect=UpstreamLineage(upstreams=upstream_lineage),
    )
    return mcp, properties
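# For reference, the upstreamLineage MCP built by get_lineage_mcp can be reproduced
# standalone. This sketch uses placeholder dataset names and a COPY lineage type,
# none of which come from the sources above.
from datahub.emitter.mce_builder import make_dataset_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import (
    ChangeTypeClass,
    DatasetLineageTypeClass,
    UpstreamClass,
    UpstreamLineageClass,
)

lineage_mcp = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=make_dataset_urn("redshift", "analytics.orders", "PROD"),
    aspectName="upstreamLineage",
    aspect=UpstreamLineageClass(
        upstreams=[
            UpstreamClass(
                dataset=make_dataset_urn("s3", "raw/orders", "PROD"),
                type=DatasetLineageTypeClass.COPY,
            )
        ]
    ),
)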
def init_dataset(
    self, endpoint_k: str, endpoint_dets: dict
) -> Tuple[DatasetSnapshot, str]:
    config = self.config

    dataset_name = endpoint_k[1:].replace("/", ".")
    if len(dataset_name) > 0:
        if dataset_name[-1] == ".":
            dataset_name = dataset_name[:-1]
    else:
        dataset_name = "root"

    dataset_snapshot = DatasetSnapshot(
        urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{config.name}.{dataset_name},PROD)",
        aspects=[],
    )

    # adding description
    dataset_properties = DatasetPropertiesClass(
        description=endpoint_dets["description"], customProperties={}
    )
    dataset_snapshot.aspects.append(dataset_properties)

    # adding tags
    tags_str = [make_tag_urn(t) for t in endpoint_dets["tags"]]
    tags_tac = [TagAssociationClass(t) for t in tags_str]
    gtc = GlobalTagsClass(tags_tac)
    dataset_snapshot.aspects.append(gtc)

    # the link will appear in the "documentation"
    link_url = clean_url(config.url + self.url_basepath + endpoint_k)
    link_description = "Link to call for the dataset."
    creation = AuditStampClass(
        time=int(time.time()), actor="urn:li:corpuser:etl", impersonator=None
    )
    link_metadata = InstitutionalMemoryMetadataClass(
        url=link_url, description=link_description, createStamp=creation
    )
    inst_memory = InstitutionalMemoryClass([link_metadata])
    dataset_snapshot.aspects.append(inst_memory)

    return dataset_snapshot, dataset_name
def test_supression_works():
    dataset_mce = make_generic_dataset()
    dataset_mcp = make_generic_dataset_mcp(
        aspect_name="datasetProperties",
        aspect=DatasetPropertiesClass(description="supressable description"),
    )

    transformer = SuppressingTransformer.create(
        {},
        PipelineContext(run_id="test-suppress-transformer"),
    )

    outputs = list(
        transformer.transform(
            [
                RecordEnvelope(input, metadata={})
                for input in [dataset_mce, dataset_mcp, EndOfStream()]
            ]
        )
    )

    assert len(outputs) == 2  # MCP will be dropped
def get_workunits(self) -> Iterable[MetadataWorkUnit]: env = "PROD" platform = "mongodb" database_names: List[str] = self.mongo_client.list_database_names() for database_name in database_names: if database_name in DENY_DATABASE_LIST: continue if not self.config.database_pattern.allowed(database_name): self.report.report_dropped(database_name) continue database = self.mongo_client[database_name] collection_names: List[str] = database.list_collection_names() for collection_name in collection_names: dataset_name = f"{database_name}.{collection_name}" if not self.config.collection_pattern.allowed(dataset_name): self.report.report_dropped(dataset_name) continue dataset_snapshot = DatasetSnapshot( urn= f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{env})", aspects=[], ) dataset_properties = DatasetPropertiesClass( tags=[], customProperties={}, ) dataset_snapshot.aspects.append(dataset_properties) # TODO: Guess the schema via sampling # State of the art seems to be https://github.com/variety/variety. # TODO: use list_indexes() or index_information() to get index information # See https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes. mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot) wu = MetadataWorkUnit(id=dataset_name, mce=mce) self.report.report_workunit(wu) yield wu
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    platform = self.platform
    nodes = loadManifestAndCatalog(
        self.config.manifest_path,
        self.config.catalog_path,
        self.config.sources_path,
        self.config.load_schemas,
        self.config.target_platform,
        self.config.env,
        self.config.node_type_pattern,
    )

    for node in nodes:
        dataset_snapshot = DatasetSnapshot(
            urn=node.datahub_urn,
            aspects=[],
        )
        custom_properties = get_custom_properties(node)
        dbt_properties = DatasetPropertiesClass(
            description=node.dbt_name,
            customProperties=custom_properties,
            tags=[],
        )
        dataset_snapshot.aspects.append(dbt_properties)

        upstreams = get_upstream_lineage(node.upstream_urns)
        if upstreams is not None:
            dataset_snapshot.aspects.append(upstreams)

        if self.config.load_schemas:
            schema_metadata = get_schema_metadata(self.report, node, platform)
            dataset_snapshot.aspects.append(schema_metadata)

        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
        self.report.report_workunit(wu)
        yield wu
def loop_views(
    self,
    inspector: Inspector,
    schema: str,
    sql_config: SQLAlchemyConfig,
) -> Iterable[SqlWorkUnit]:
    for view in inspector.get_view_names(schema):
        schema, view = sql_config.standardize_schema_table_names(schema, view)
        dataset_name = sql_config.get_identifier(schema, view)
        self.report.report_entity_scanned(dataset_name, ent_type="view")

        if not sql_config.view_pattern.allowed(dataset_name):
            self.report.report_dropped(dataset_name)
            continue

        try:
            columns = inspector.get_columns(view, schema)
        except KeyError:
            # For certain types of views, we are unable to fetch the list of columns.
            self.report.report_warning(dataset_name, "unable to get schema for this view")
            schema_metadata = None
        else:
            schema_metadata = get_schema_metadata(
                self.report, dataset_name, self.platform, columns
            )

        try:
            # SQLAlchemy stubs are incomplete and missing this method.
            # PR: https://github.com/dropbox/sqlalchemy-stubs/pull/223.
            view_info: dict = inspector.get_table_comment(view, schema)  # type: ignore
        except NotImplementedError:
            description: Optional[str] = None
            properties: Dict[str, str] = {}
        else:
            description = view_info["text"]
            # The "properties" field is a non-standard addition to SQLAlchemy's interface.
            properties = view_info.get("properties", {})

        try:
            view_definition = inspector.get_view_definition(view, schema)
            if view_definition is None:
                view_definition = ""
            else:
                # Some dialects return a TextClause instead of a raw string,
                # so we need to convert them to a string.
                view_definition = str(view_definition)
        except NotImplementedError:
            view_definition = ""
        properties["view_definition"] = view_definition
        properties["is_view"] = "True"

        dataset_snapshot = DatasetSnapshot(
            urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{dataset_name},{self.config.env})",
            aspects=[],
        )
        if description is not None or properties:
            dataset_properties = DatasetPropertiesClass(
                description=description,
                customProperties=properties,
                # uri=dataset_name,
            )
            dataset_snapshot.aspects.append(dataset_properties)

        if schema_metadata:
            dataset_snapshot.aspects.append(schema_metadata)

        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = SqlWorkUnit(id=dataset_name, mce=mce)
        self.report.report_workunit(wu)
        yield wu
def _to_metadata_events(  # noqa: C901
    self,
    config: LookerCommonConfig,
    reporter: SourceReport,
    base_url: str,
) -> Optional[List[Union[MetadataChangeEvent, MetadataChangeProposalWrapper]]]:
    # We only generate MCE-s for explores that contain from clauses and do NOT contain joins.
    # All other explores (passthrough explores and joins) end in correct resolution of
    # lineage, and don't need additional nodes in the graph.
    dataset_snapshot = DatasetSnapshot(
        urn=self.get_explore_urn(config),
        aspects=[],  # we append to this list later on
    )
    browse_paths = BrowsePathsClass(paths=[self.get_explore_browse_path(config)])
    dataset_snapshot.aspects.append(browse_paths)
    dataset_snapshot.aspects.append(StatusClass(removed=False))

    custom_properties = {}
    if self.label is not None:
        custom_properties["looker.explore.label"] = str(self.label)
    if self.source_file is not None:
        custom_properties["looker.explore.file"] = str(self.source_file)
    dataset_props = DatasetPropertiesClass(
        description=self.description,
        customProperties=custom_properties,
    )
    dataset_props.externalUrl = self._get_url(base_url)
    dataset_snapshot.aspects.append(dataset_props)

    if self.upstream_views is not None:
        assert self.project_name is not None
        upstreams = [
            UpstreamClass(
                dataset=LookerViewId(
                    project_name=self.project_name,
                    model_name=self.model_name,
                    view_name=view_name,
                ).get_urn(config),
                type=DatasetLineageTypeClass.VIEW,
            )
            for view_name in sorted(self.upstream_views)
        ]
        upstream_lineage = UpstreamLineage(upstreams=upstreams)
        dataset_snapshot.aspects.append(upstream_lineage)

    if self.fields is not None:
        schema_metadata = LookerUtil._get_schema(
            platform_name=config.platform_name,
            schema_name=self.name,
            view_fields=self.fields,
            reporter=reporter,
        )
        if schema_metadata is not None:
            dataset_snapshot.aspects.append(schema_metadata)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_snapshot.urn,
        aspectName="subTypes",
        aspect=SubTypesClass(typeNames=["explore"]),
    )

    return [mce, mcp]