def pretty_id(id: Optional[str]) -> str:
    if not id:
        return ""
    if id.startswith("urn:li:datasetField:") or id.startswith("urn:li:schemaField:"):
        # Parse as a schema field urn.
        schema_field_key = schema_field_urn_to_key(
            id.replace("urn:li:datasetField", "urn:li:schemaField")
        )
        if schema_field_key:
            field_path = schema_field_key.fieldPath
            return f"{colored('field', 'cyan')}:{colored(pretty_field_path(field_path), 'white')}"
    if id.startswith("[version=2.0]"):
        return f"{colored('field', 'cyan')}:{colored(pretty_field_path(id), 'white')}"
    if id.startswith("urn:li:dataset"):
        # Parse as a dataset urn.
        dataset_key = dataset_urn_to_key(id)
        if dataset_key:
            return f"{colored('dataset', 'cyan')}:{colored(dataset_key.platform, 'white')}:{colored(dataset_key.name, 'white')}"
    # Failed to prettify; return the original id.
    return id
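A quick usage sketch for the helper above. The import locations are assumptions based on the names the snippet uses (dataset_urn_to_key and schema_field_urn_to_key live in datahub.emitter.mce_builder, colored comes from termcolor, and pretty_field_path is a sibling helper defined alongside pretty_id); the URN is illustrative.

from datahub.emitter.mce_builder import dataset_urn_to_key, schema_field_urn_to_key
from termcolor import colored  # supplies the colored() calls used in pretty_id

urn = "urn:li:dataset:(urn:li:dataPlatform:bigquery,proj.ds.table,PROD)"
print(pretty_id(urn))   # -> "dataset:<platform>:<name>", wrapped in ANSI color codes
print(pretty_id(None))  # -> "" — falsy ids short-circuit to an empty string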
def get_lineage_mcp(
    self, dataset_urn: str
) -> Optional[MetadataChangeProposalWrapper]:
    if self.lineage_metadata is None:
        logger.debug("No lineage metadata so skipping getting mcp")
        return None
    dataset_key: Optional[DatasetKey] = mce_builder.dataset_urn_to_key(dataset_urn)
    if dataset_key is None:
        logger.debug(f"No dataset_key for {dataset_urn} so skipping getting mcp")
        return None
    project_id, dataset_name, tablename = dataset_key.name.split(".")
    bq_table = BigQueryTableRef(project_id, dataset_name, tablename)
    if str(bq_table) in self.lineage_metadata:
        upstream_list: List[UpstreamClass] = []
        # Sort the upstream lineage events to avoid creating multiple aspects
        # in the backend when the lineage is the same but the order differs.
        for upstream_table in sorted(
            self.get_upstream_tables(str(bq_table), tables_seen=[])
        ):
            upstream_table_class = UpstreamClass(
                mce_builder.make_dataset_urn_with_platform_instance(
                    self.platform,
                    "{project}.{database}.{table}".format(
                        project=upstream_table.project,
                        database=upstream_table.dataset,
                        table=upstream_table.table,
                    ),
                    self.config.platform_instance,
                    self.config.env,
                ),
                DatasetLineageTypeClass.TRANSFORMED,
            )
            if self.config.upstream_lineage_in_report:
                current_lineage_map: Set = self.report.upstream_lineage.get(
                    str(bq_table), set()
                )
                current_lineage_map.add(str(upstream_table))
                self.report.upstream_lineage[str(bq_table)] = current_lineage_map
            upstream_list.append(upstream_table_class)
        if upstream_list:
            upstream_lineage = UpstreamLineageClass(upstreams=upstream_list)
            mcp = MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="upstreamLineage",
                aspect=upstream_lineage,
            )
            return mcp
    return None
def _get_upstream_lineage_info(
    self, dataset_urn: str
) -> Optional[Tuple[UpstreamLineage, Dict[str, str]]]:
    dataset_key = builder.dataset_urn_to_key(dataset_urn)
    if dataset_key is None:
        logger.warning(f"Invalid dataset urn {dataset_urn}. Could not get key!")
        return None
    if self._lineage_map is None:
        self._populate_lineage()
    assert self._lineage_map is not None
    dataset_name = dataset_key.name
    lineage = self._lineage_map.get(dataset_name, None)
    if lineage is None:
        logger.debug(f"No lineage found for {dataset_name}")
        return None
    upstream_tables: List[UpstreamClass] = []
    column_lineage: Dict[str, str] = {}
    for lineage_entry in lineage:
        # Update the table-level lineage.
        upstream_table_name = lineage_entry[0]
        if not self._is_dataset_allowed(upstream_table_name):
            continue
        upstream_table = UpstreamClass(
            dataset=builder.make_dataset_urn(
                self.platform, upstream_table_name, self.config.env
            ),
            type=DatasetLineageTypeClass.TRANSFORMED,
        )
        upstream_tables.append(upstream_table)
        # Update the column-level lineage for each downstream column.
        upstream_columns = [
            d["columnName"].lower() for d in json.loads(lineage_entry[1])
        ]
        downstream_columns = [
            d["columnName"].lower() for d in json.loads(lineage_entry[2])
        ]
        upstream_column_str = (
            f"{upstream_table_name}({', '.join(sorted(upstream_columns))})"
        )
        downstream_column_str = (
            f"{dataset_name}({', '.join(sorted(downstream_columns))})"
        )
        column_lineage_key = f"column_lineage[{upstream_table_name}]"
        column_lineage_value = (
            f"{{{upstream_column_str} -> {downstream_column_str}}}"
        )
        column_lineage[column_lineage_key] = column_lineage_value
        logger.debug(f"{column_lineage_key}:{column_lineage_value}")
    if upstream_tables:
        return UpstreamLineage(upstreams=upstream_tables), column_lineage
    return None
def get_lineage_mcp(
    self, dataset_urn: str
) -> Tuple[
    Optional[MetadataChangeProposalWrapper], Optional[DatasetPropertiesClass]
]:
    dataset_key = mce_builder.dataset_urn_to_key(dataset_urn)
    if dataset_key is None:
        return None, None
    if not self._lineage_map:
        self._populate_lineage()
    assert self._lineage_map is not None
    upstream_lineage: List[UpstreamClass] = []
    custom_properties: Dict[str, str] = {}
    if dataset_key.name in self._lineage_map:
        item = self._lineage_map[dataset_key.name]
        for upstream in item.upstreams:
            upstream_table = UpstreamClass(
                dataset=builder.make_dataset_urn_with_platform_instance(
                    upstream.platform.value,
                    upstream.path,
                    self.config.platform_instance,
                    self.config.env,
                ),
                type=item.dataset_lineage_type,
            )
            upstream_lineage.append(upstream_table)
    properties = None
    if custom_properties:
        properties = DatasetPropertiesClass(customProperties=custom_properties)
    if not upstream_lineage:
        return None, properties
    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_urn,
        aspectName="upstreamLineage",
        aspect=UpstreamLineage(upstreams=upstream_lineage),
    )
    return mcp, properties
def get_lineage_mcp(
    self, dataset_urn: str
) -> Optional[MetadataChangeProposalWrapper]:
    if self.lineage_metadata is None:
        return None
    dataset_key: Optional[DatasetKey] = mce_builder.dataset_urn_to_key(dataset_urn)
    if dataset_key is None:
        return None
    project_id, dataset_name, tablename = dataset_key.name.split(".")
    bq_table = BigQueryTableRef(project_id, dataset_name, tablename)
    if str(bq_table) in self.lineage_metadata:
        upstream_list: List[UpstreamClass] = []
        # Sort the upstream lineage events to avoid creating multiple aspects
        # in the backend when the lineage is the same but the order differs.
        for ref_table in sorted(self.lineage_metadata[str(bq_table)]):
            upstream_table = BigQueryTableRef.from_string_name(ref_table)
            upstream_table_class = UpstreamClass(
                mce_builder.make_dataset_urn(
                    self.platform,
                    "{project}.{database}.{table}".format(
                        project=upstream_table.project,
                        database=upstream_table.dataset,
                        table=upstream_table.table,
                    ),
                    self.config.env,
                ),
                DatasetLineageTypeClass.TRANSFORMED,
            )
            upstream_list.append(upstream_table_class)
        if upstream_list:
            upstream_lineage = UpstreamLineageClass(upstreams=upstream_list)
            mcp = MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="upstreamLineage",
                aspect=upstream_lineage,
            )
            return mcp
    return None
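A hedged sketch of the BigQueryTableRef round trip the two BigQuery examples above rely on. The import path is an assumption (in DataHub versions of this vintage, BigQueryTableRef lived in the BigQuery usage source module); the exact serialized string format is whatever str()/from_string_name agree on, which is all the lineage map keys above depend on.

from datahub.ingestion.source.usage.bigquery_usage import BigQueryTableRef  # assumed path

ref = BigQueryTableRef("my_project", "my_dataset", "my_table")
serialized = str(ref)  # the key format used for self.lineage_metadata lookups above
round_tripped = BigQueryTableRef.from_string_name(serialized)
assert (round_tripped.project, round_tripped.dataset, round_tripped.table) == (
    "my_project",
    "my_dataset",
    "my_table",
)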
def get_lineage_mcp(
    self, dataset_urn: str
) -> Optional[MetadataChangeProposalWrapper]:
    dataset_key = mce_builder.dataset_urn_to_key(dataset_urn)
    if dataset_key is None:
        return None
    db_name, schemaname, tablename = dataset_key.name.split(".")
    if db_name in self.catalog_metadata:
        if schemaname in self.catalog_metadata[db_name]:
            external_db_params = self.catalog_metadata[db_name][schemaname]
            upstream_lineage = UpstreamLineageClass(
                upstreams=[
                    UpstreamClass(
                        mce_builder.make_dataset_urn(
                            self.eskind_to_platform[external_db_params["eskind"]],
                            "{database}.{table}".format(
                                database=external_db_params["external_database"],
                                table=tablename,
                            ),
                            self.config.env,
                        ),
                        DatasetLineageTypeClass.COPY,
                    )
                ]
            )
            mcp = MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="upstreamLineage",
                aspect=upstream_lineage,
            )
            return mcp
    return None
def _get_upstream_lineage_info(
    self, dataset_urn: str
) -> Optional[Tuple[UpstreamLineage, Dict[str, str]]]:
    dataset_key = builder.dataset_urn_to_key(dataset_urn)
    if dataset_key is None:
        logger.warning(f"Invalid dataset urn {dataset_urn}. Could not get key!")
        return None
    if self._lineage_map is None:
        self._populate_lineage()
        self._populate_view_lineage()
    if self._external_lineage_map is None:
        self._populate_external_lineage()
    assert self._lineage_map is not None
    assert self._external_lineage_map is not None
    dataset_name = dataset_key.name
    lineage = self._lineage_map[dataset_name]
    external_lineage = self._external_lineage_map[dataset_name]
    if not (lineage or external_lineage):
        logger.debug(f"No lineage found for {dataset_name}")
        return None
    upstream_tables: List[UpstreamClass] = []
    column_lineage: Dict[str, str] = {}
    for lineage_entry in lineage:
        # Update the table-level lineage.
        upstream_table_name = lineage_entry[0]
        if not self._is_dataset_allowed(upstream_table_name):
            continue
        upstream_table = UpstreamClass(
            dataset=builder.make_dataset_urn_with_platform_instance(
                self.platform,
                upstream_table_name,
                self.config.platform_instance,
                self.config.env,
            ),
            type=DatasetLineageTypeClass.TRANSFORMED,
        )
        upstream_tables.append(upstream_table)
        # Update the column-level lineage for each downstream column.
        upstream_columns = [
            d["columnName"].lower() for d in json.loads(lineage_entry[1])
        ]
        downstream_columns = [
            d["columnName"].lower() for d in json.loads(lineage_entry[2])
        ]
        upstream_column_str = (
            f"{upstream_table_name}({', '.join(sorted(upstream_columns))})"
        )
        downstream_column_str = (
            f"{dataset_name}({', '.join(sorted(downstream_columns))})"
        )
        column_lineage_key = f"column_lineage[{upstream_table_name}]"
        column_lineage_value = (
            f"{{{upstream_column_str} -> {downstream_column_str}}}"
        )
        column_lineage[column_lineage_key] = column_lineage_value
        logger.debug(f"{column_lineage_key}:{column_lineage_value}")
    for external_lineage_entry in external_lineage:
        # For now, populate external lineage only for S3 locations.
        if external_lineage_entry.startswith("s3://"):
            external_upstream_table = UpstreamClass(
                dataset=make_s3_urn(external_lineage_entry, self.config.env),
                type=DatasetLineageTypeClass.COPY,
            )
            upstream_tables.append(external_upstream_table)
    if upstream_tables:
        logger.debug(
            f"Upstream lineage of '{dataset_name}': {[u.dataset for u in upstream_tables]}"
        )
        if self.config.report_upstream_lineage:
            self.report.upstream_lineage[dataset_name] = [
                u.dataset for u in upstream_tables
            ]
        return UpstreamLineage(upstreams=upstream_tables), column_lineage
    return None
def get_lineage_mcp(
    self, dataset_urn: str
) -> Tuple[
    Optional[MetadataChangeProposalWrapper], Optional[DatasetPropertiesClass]
]:
    dataset_key = mce_builder.dataset_urn_to_key(dataset_urn)
    if dataset_key is None:
        return None, None
    if self._lineage_map is None:
        logger.debug("Populating lineage")
        self._populate_lineage()
    assert self._lineage_map is not None
    upstream_lineage: List[UpstreamClass] = []
    custom_properties: Dict[str, str] = {}
    if dataset_key.name in self._lineage_map:
        item = self._lineage_map[dataset_key.name]
        if (
            self.config.capture_lineage_query_parser_failures
            and item.query_parser_failed_sqls
        ):
            custom_properties["lineage_sql_parser_failed_queries"] = ",".join(
                item.query_parser_failed_sqls
            )
        for upstream in item.upstreams:
            upstream_table = UpstreamClass(
                dataset=builder.make_dataset_urn_with_platform_instance(
                    upstream.platform.value,
                    upstream.path,
                    platform_instance=self.config.platform_instance_map.get(
                        upstream.platform.value
                    )
                    if self.config.platform_instance_map
                    else None,
                    env=self.config.env,
                ),
                type=item.dataset_lineage_type,
            )
            upstream_lineage.append(upstream_table)
    db_name, schemaname, tablename = dataset_key.name.split(".")
    if db_name in self.catalog_metadata:
        if schemaname in self.catalog_metadata[db_name]:
            external_db_params = self.catalog_metadata[db_name][schemaname]
            upstream_platform = self.eskind_to_platform[external_db_params["eskind"]]
            catalog_upstream = UpstreamClass(
                mce_builder.make_dataset_urn_with_platform_instance(
                    upstream_platform,
                    "{database}.{table}".format(
                        database=external_db_params["external_database"],
                        table=tablename,
                    ),
                    platform_instance=self.config.platform_instance_map.get(
                        upstream_platform
                    )
                    if self.config.platform_instance_map
                    else None,
                    env=self.config.env,
                ),
                DatasetLineageTypeClass.COPY,
            )
            upstream_lineage.append(catalog_upstream)
    properties = None
    if custom_properties:
        properties = DatasetPropertiesClass(customProperties=custom_properties)
    if upstream_lineage:
        self.report.upstream_lineage[dataset_urn] = [
            u.dataset for u in upstream_lineage
        ]
    else:
        return None, properties
    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_urn,
        aspectName="upstreamLineage",
        aspect=UpstreamLineage(upstreams=upstream_lineage),
    )
    return mcp, properties
def create_platform_mces(
    self,
    dbt_nodes: List[DBTNode],
    additional_custom_props_filtered: Dict[str, str],
    manifest_nodes_raw: Dict[str, Dict[str, Any]],
    mce_platform: str,
) -> Iterable[MetadataWorkUnit]:
    """
    Create MCEs from dbt nodes. Since dbt ingestion creates both "dbt" nodes
    and nodes for the underlying platform, this function is called twice and
    takes specific actions based on the mce_platform parameter.

    If disable_dbt_node_creation = True:
        Create empty entities of the underlying platform with only the
        lineage/key aspect, and create dbt entities with all metadata
        information.
    If disable_dbt_node_creation = False:
        Create platform entities with all metadata information.
    """
    action_processor = OperationProcessor(
        self.config.meta_mapping,
        self.config.tag_prefix,
        "SOURCE_CONTROL",
        self.config.strip_user_ids_from_email,
    )
    for node in dbt_nodes:
        node_datahub_urn = get_urn_from_dbtNode(
            node.database,
            node.schema,
            node.name,
            mce_platform,
            self.config.env,
        )
        meta_aspects: Dict[str, Any] = {}
        if self.config.enable_meta_mapping and node.meta:
            meta_aspects = action_processor.process(node.meta)
        aspects = self._generate_base_aspects(
            node, additional_custom_props_filtered, mce_platform, meta_aspects
        )
        if mce_platform == DBT_PLATFORM:
            # Add the upstream lineage aspect.
            upstream_lineage_class = self._create_lineage_aspect_for_dbt_node(
                node, manifest_nodes_raw
            )
            if upstream_lineage_class:
                aspects.append(upstream_lineage_class)
            # Add the view properties aspect.
            if node.raw_sql:
                view_prop_aspect = self._create_view_properties_aspect(node)
                aspects.append(view_prop_aspect)
            # Emit the subtype MCP.
            sub_type_wu = self._create_subType_wu(node, node_datahub_urn)
            if sub_type_wu:
                yield sub_type_wu
                self.report.report_workunit(sub_type_wu)
        else:
            if not self.config.disable_dbt_node_creation:
                # If dbt node creation is enabled, create an empty node for the
                # platform and only add the lineage/key aspect.
                aspects = []
                if node.materialization == "ephemeral" or node.node_type == "test":
                    continue
                # This block runs when we are generating entities of platform
                # type. We do not link the platform node to the dbt node for
                # type "source", because in that case the platform table
                # existed first.
                if node.node_type != "source":
                    upstream_dbt_urn = get_urn_from_dbtNode(
                        node.database,
                        node.schema,
                        node.name,
                        DBT_PLATFORM,
                        self.config.env,
                    )
                    upstreams_lineage_class = get_upstream_lineage([upstream_dbt_urn])
                    aspects.append(upstreams_lineage_class)
                else:
                    dataset_key = mce_builder.dataset_urn_to_key(node_datahub_urn)
                    assert dataset_key is not None
                    key_aspect = DatasetKeyClass(
                        "urn:li:dataPlatform:" + dataset_key.platform,
                        dataset_key.name,
                        dataset_key.origin,
                    )
                    aspects.append(key_aspect)
            else:
                # Add the upstream lineage aspect.
                platform_upstream_aspect = (
                    self._create_lineage_aspect_for_platform_node(
                        node, manifest_nodes_raw
                    )
                )
                if platform_upstream_aspect:
                    aspects.append(platform_upstream_aspect)
        dataset_snapshot = DatasetSnapshot(urn=node_datahub_urn, aspects=aspects)
        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        if self.config.write_semantics == "PATCH":
            mce = self.get_patched_mce(mce)
        wu = MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
        self.report.report_workunit(wu)
        yield wu
def _get_lightweight_repr(dataset_urn: str) -> str:
    """Reduces the amount of text in the URNs for smaller state footprint."""
    SEP = KafkaCheckpointState._get_separator()
    key = dataset_urn_to_key(dataset_urn)
    assert key is not None
    return f"{key.platform}{SEP}{key.name}{SEP}{key.origin}"
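The checkpoint helper above leans on the urn-to-key round trip that nearly every example in this section starts with. A minimal, self-contained sketch of that round trip (the platform, dataset name, and environment values are illustrative):

from datahub.emitter.mce_builder import dataset_urn_to_key, make_dataset_urn

# Build a dataset URN, then recover its key parts.
urn = make_dataset_urn(
    platform="bigquery", name="my_project.my_dataset.my_table", env="PROD"
)
key = dataset_urn_to_key(urn)
assert key is not None
# key.name holds the dot-separated path; the BigQuery/Redshift examples above
# split it into project/database, schema, and table with key.name.split(".").
project_id, dataset_name, table_name = key.name.split(".")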
def dataplatform2instance_func(
    instance: str,
    platform: str,
    dry_run: bool,
    env: str,
    force: bool,
    hard: bool,
    keep: bool,
) -> None:
    click.echo(
        f"Starting migration: platform:{platform}, instance={instance}, force={force}, dry-run={dry_run}"
    )
    run_id: str = f"migrate-{uuid.uuid4()}"
    migration_report = MigrationReport(run_id, dry_run, keep)
    system_metadata = SystemMetadataClass(runId=run_id)
    all_aspects = [
        "schemaMetadata",
        "datasetProperties",
        "viewProperties",
        "subTypes",
        "editableDatasetProperties",
        "ownership",
        "datasetDeprecation",
        "institutionalMemory",
        "editableSchemaMetadata",
        "globalTags",
        "glossaryTerms",
        "upstreamLineage",
        "datasetUpstreamLineage",
        "status",
    ]
    if not dry_run:
        rest_emitter = DatahubRestEmitter(
            gms_server=cli_utils.get_session_and_host()[1]
        )

    urns_to_migrate = []
    # First, compute all the urns we will be migrating.
    for src_entity_urn in cli_utils.get_urns_by_filter(platform=platform, env=env):
        key = dataset_urn_to_key(src_entity_urn)
        assert key
        # Does this urn already have a platform instance associated with it?
        response = cli_utils.get_aspects_for_entity(
            entity_urn=src_entity_urn, aspects=["dataPlatformInstance"], typed=True
        )
        if "dataPlatformInstance" in response:
            assert isinstance(
                response["dataPlatformInstance"], DataPlatformInstanceClass
            )
            data_platform_instance: DataPlatformInstanceClass = response[
                "dataPlatformInstance"
            ]
            if data_platform_instance.instance:
                log.debug("This is already an instance-specific urn, will skip")
                continue
        else:
            log.debug(
                f"{src_entity_urn} is not an instance specific urn. {response}"
            )
        urns_to_migrate.append(src_entity_urn)

    if not force and not dry_run:
        # Get a confirmation from the operator before proceeding, since this is not a dry run.
        sampled_urns_to_migrate = random.choices(
            urns_to_migrate, k=min(10, len(urns_to_migrate))
        )
        sampled_new_urns: List[str] = [
            make_dataset_urn_with_platform_instance(
                platform=key.platform,
                name=key.name,
                platform_instance=instance,
                env=str(key.origin),
            )
            for key in [dataset_urn_to_key(x) for x in sampled_urns_to_migrate]
            if key
        ]
        click.echo(
            f"Will migrate {len(urns_to_migrate)} urns such as {sampled_urns_to_migrate}"
        )
        click.echo(f"New urns will look like {sampled_new_urns}")
        click.confirm("Ok to proceed?", abort=True)

    for src_entity_urn in progressbar.progressbar(
        urns_to_migrate, redirect_stdout=True
    ):
        key = dataset_urn_to_key(src_entity_urn)
        assert key
        new_urn = make_dataset_urn_with_platform_instance(
            platform=key.platform,
            name=key.name,
            platform_instance=instance,
            env=str(key.origin),
        )
        log.debug(f"Will migrate {src_entity_urn} to {new_urn}")
        relationships = migration_utils.get_incoming_relationships_dataset(
            src_entity_urn
        )

        for mcp in migration_utils.clone_aspect(
            src_entity_urn,
            aspect_names=all_aspects,
            dst_urn=new_urn,
            dry_run=dry_run,
            run_id=run_id,
        ):
            if not dry_run:
                rest_emitter.emit_mcp(mcp)
            migration_report.on_entity_create(mcp.entityUrn, mcp.aspectName)  # type: ignore

        if not dry_run:
            rest_emitter.emit_mcp(
                MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=new_urn,
                    aspectName="dataPlatformInstance",
                    aspect=DataPlatformInstanceClass(
                        platform=make_data_platform_urn(platform),
                        instance=make_dataplatform_instance_urn(platform, instance),
                    ),
                    systemMetadata=system_metadata,
                )
            )
        migration_report.on_entity_create(new_urn, "dataPlatformInstance")

        for relationship in relationships:
            target_urn = relationship["entity"]
            entity_type = _get_type_from_urn(target_urn)
            relationshipType = relationship["type"]
            aspect_name = (
                migration_utils.get_aspect_name_from_relationship_type_and_entity(
                    relationshipType, entity_type
                )
            )
            aspect_map = cli_utils.get_aspects_for_entity(
                target_urn, aspects=[aspect_name], typed=True
            )
            if aspect_name in aspect_map:
                aspect = aspect_map[aspect_name]
                assert isinstance(aspect, DictWrapper)
                aspect = migration_utils.modify_urn_list_for_aspect(
                    aspect_name, aspect, relationshipType, src_entity_urn, new_urn
                )
                # Use an MCP wrapper to upsert the modified aspect.
                mcp = MetadataChangeProposalWrapper(
                    entityType=entity_type,
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=target_urn,
                    aspectName=aspect_name,
                    aspect=aspect,
                )
                if not dry_run:
                    rest_emitter.emit_mcp(mcp)
                migration_report.on_entity_affected(mcp.entityUrn, mcp.aspectName)  # type: ignore
            else:
                log.debug(f"Didn't find aspect {aspect_name} for urn {target_urn}")

        if not dry_run and not keep:
            log.info(f"will {'hard' if hard else 'soft'} delete {src_entity_urn}")
            delete_cli._delete_one_urn(src_entity_urn, soft=not hard, run_id=run_id)
        migration_report.on_entity_migrated(src_entity_urn, "status")  # type: ignore

    print(f"{migration_report}")
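For context, a hedged sketch of how the MetadataChangeProposalWrapper objects built throughout these examples are typically emitted to DataHub. The GMS endpoint and dataset URNs are placeholders; the keyword-argument style matches the wrapper construction used in the examples above.

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import (
    ChangeTypeClass,
    DatasetLineageTypeClass,
    UpstreamClass,
    UpstreamLineageClass,
)

# Placeholder endpoint; point this at your GMS instance.
emitter = DatahubRestEmitter(gms_server="http://localhost:8080")

upstream = UpstreamClass(
    dataset="urn:li:dataset:(urn:li:dataPlatform:bigquery,proj.ds.src_table,PROD)",
    type=DatasetLineageTypeClass.TRANSFORMED,
)
mcp = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:bigquery,proj.ds.dst_table,PROD)",
    aspectName="upstreamLineage",
    aspect=UpstreamLineageClass(upstreams=[upstream]),
)
emitter.emit_mcp(mcp)  # upserts the upstreamLineage aspect on the target dataset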