def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
    current_dashboard_page = 0
    # we will set total dashboards to the actual number after we get the response
    total_dashboards = PAGE_SIZE

    while current_dashboard_page * PAGE_SIZE <= total_dashboards:
        dashboard_response = self.session.get(
            f"{self.config.connect_uri}/api/v1/dashboard",
            params=f"q=(page:{current_dashboard_page},page_size:{PAGE_SIZE})",
        )

        payload = dashboard_response.json()
        total_dashboards = payload.get("count") or 0

        current_dashboard_page += 1

        for dashboard_data in payload["result"]:
            dashboard_snapshot = self.construct_dashboard_from_api_data(dashboard_data)
            mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)

            wu = MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
            self.report.report_workunit(wu)

            yield wu
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    client = self._get_looker_client()
    dashboard_ids = [
        dashboard_base.id
        for dashboard_base in client.all_dashboards(fields="id")
        if dashboard_base.id is not None
    ]

    for dashboard_id in dashboard_ids:
        self.reporter.report_dashboards_scanned()
        if not self.source_config.dashboard_pattern.allowed(dashboard_id):
            self.reporter.report_dashboards_dropped(dashboard_id)
            continue
        try:
            fields = ["id", "title", "dashboard_elements", "dashboard_filters"]
            dashboard_object = client.dashboard(
                dashboard_id=dashboard_id, fields=",".join(fields)
            )
        except SDKError:
            # A Looker dashboard could be deleted in between the list and the get
            logger.warning(
                f"Error occurred while loading dashboard {dashboard_id}. Skipping."
            )
            continue

        looker_dashboard = self._get_looker_dashboard(dashboard_object)
        mces = self._make_dashboard_and_chart_mces(looker_dashboard)
        for mce in mces:
            workunit = MetadataWorkUnit(
                id=f"looker-{mce.proposedSnapshot.urn}", mce=mce
            )
            self.reporter.report_workunit(workunit)
            yield workunit
def construct_flow_workunit(
    self, connector: ConnectorManifest
) -> Iterable[MetadataWorkUnit]:
    connector_name = connector.name
    connector_type = connector.type
    connector_class = connector.config.get("connector.class")
    # connector_url = connector.url  # NOTE: this will expose connector credential when used
    flow_urn = builder.make_data_flow_urn(
        "kafka-connect", connector_name, self.config.env
    )
    flow_property_bag: Optional[Dict[str, str]] = None
    mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=connector_name,
                    description=f"{connector_type.capitalize()} connector using `{connector_class}` plugin.",
                    customProperties=flow_property_bag,
                    # externalUrl=connector_url,  # NOTE: this will expose connector credential when used
                ),
                # ownership,
                # tags,
            ],
        )
    )

    for c in [connector_name]:
        wu = MetadataWorkUnit(id=c, mce=mce)
        self.report.report_workunit(wu)
        yield wu
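# Illustrative sketch, not part of the connector code above: the dataFlow URN produced by
# builder.make_data_flow_urn() follows the pattern urn:li:dataFlow:(<orchestrator>,<flow_id>,<env>).
# The connector name used below is a made-up example.
import datahub.emitter.mce_builder as builder

example_flow_urn = builder.make_data_flow_urn("kafka-connect", "my-sink-connector", "PROD")
# example_flow_urn == "urn:li:dataFlow:(kafka-connect,my-sink-connector,PROD)"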
def get_datajob_wu(self, node: Dict[str, Any], job: Dict[str, Any]) -> MetadataWorkUnit:
    """
    Generate a DataJob workunit for a component (node) in a Glue job.

    Parameters
    ----------
        node:
            Node from process_dataflow_graph()
        job:
            Job object from get_all_jobs()
    """
    mce = MetadataChangeEventClass(
        proposedSnapshot=DataJobSnapshotClass(
            urn=node["urn"],
            aspects=[
                DataJobInfoClass(
                    name=f"{job['Name']}:{node['NodeType']}-{node['Id']}",
                    type="GLUE",
                    customProperties={
                        **{x["Name"]: x["Value"] for x in node["Args"]},
                        "transformType": node["NodeType"],
                        "nodeId": node["Id"],
                    },
                ),
                DataJobInputOutputClass(
                    inputDatasets=node["inputDatasets"],
                    outputDatasets=node["outputDatasets"],
                    inputDatajobs=node["inputDatajobs"],
                ),
            ],
        )
    )

    return MetadataWorkUnit(id=f'{job["Name"]}-{node["Id"]}', mce=mce)
def get_dataflow_wu(self, flow_urn: str, job: Dict[str, Any]) -> MetadataWorkUnit:
    """
    Generate a DataFlow workunit for a Glue job.

    Parameters
    ----------
        flow_urn:
            URN for the flow
        job:
            Job object from get_all_jobs()
    """
    mce = MetadataChangeEventClass(
        proposedSnapshot=DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                DataFlowInfoClass(
                    name=job["Name"],
                    description=job["Description"],
                    # specify a few Glue-specific properties
                    customProperties={
                        "role": job["Role"],
                        "created": str(job["CreatedOn"]),
                        "modified": str(job["LastModifiedOn"]),
                        "command": job["Command"]["ScriptLocation"],
                    },
                ),
            ],
        )
    )

    return MetadataWorkUnit(id=job["Name"], mce=mce)
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    env: str = "PROD"
    platform = self.platform
    nodes = loadManifestAndCatalog(
        self.config.manifest_path, self.config.catalog_path, platform, env
    )

    for node in nodes:
        mce = MetadataChangeEvent()

        dataset_snapshot = DatasetSnapshot()
        dataset_snapshot.urn = node.datahub_urn
        custom_properties = get_custom_properties(node)
        dbt_properties = DatasetPropertiesClass()
        dbt_properties.description = node.dbt_name
        dbt_properties.customProperties = custom_properties
        dataset_snapshot.aspects.append(dbt_properties)

        upstreams = get_upstream_lineage(node.upstream_urns)
        if upstreams is not None:
            dataset_snapshot.aspects.append(upstreams)

        schema_metadata = get_schema_metadata(self.report, node, platform)
        dataset_snapshot.aspects.append(schema_metadata)

        mce.proposedSnapshot = dataset_snapshot

        wu = MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
        self.report.report_workunit(wu)

        yield wu
def handle_user(self, dn: str, attrs: Dict[str, Any]) -> Iterable[MetadataWorkUnit]:
    """
    Handle a DN and attributes by adding manager info and constructing a
    work unit based on the information.
    """
    manager_ldap = None
    if "manager" in attrs:
        try:
            m_cn = attrs["manager"][0].decode()
            manager_msgid = self.ldap_client.search_ext(
                m_cn,
                ldap.SCOPE_BASE,
                self.config.filter,
                serverctrls=[self.lc],
            )
            _m_dn, m_attrs = self.ldap_client.result3(manager_msgid)[1][0]
            manager_ldap = guess_person_ldap(m_attrs)
        except ldap.LDAPError as e:
            self.report.report_warning(
                dn, "manager LDAP search failed: {}".format(e)
            )
    mce = self.build_corp_user_mce(dn, attrs, manager_ldap)
    if mce:
        wu = MetadataWorkUnit(dn, mce)
        self.report.report_workunit(wu)
        yield wu
    else:
        self.report.report_dropped(dn)
def handle_user(self, dn, attrs) -> Iterable[MetadataWorkUnit]:
    """
    Handle a DN and attributes by adding manager info and constructing a
    work unit based on the information.
    """
    manager_ldap = None
    if "manager" in attrs:
        try:
            m_cn = attrs["manager"][0].split(b",")[0]
            manager_msgid = self.ldap_client.search_ext(
                self.config.base_dn,
                ldap.SCOPE_SUBTREE,
                f"({m_cn.decode()})",
                serverctrls=[self.lc],
            )
            m_dn, m_attrs = self.ldap_client.result3(manager_msgid)[1][0]
            manager_ldap = guess_person_ldap(m_dn, m_attrs)
        except ldap.LDAPError as e:
            self.report.report_warning(
                dn, "manager LDAP search failed: {}".format(e)
            )
    mce = self.build_corp_user_mce(dn, attrs, manager_ldap)
    if mce:
        wu = MetadataWorkUnit(dn, mce)
        self.report.report_workunit(wu)
        yield wu
    yield from []
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    for i, mce in enumerate(iterate_mce_file(self.config.filename)):
        if not mce.validate():
            raise ValueError(f"failed to parse into valid MCE: {mce} (index {i})")
        wu = MetadataWorkUnit(f"file://{self.config.filename}:{i}", mce)
        self.report.report_workunit(wu)
        yield wu
def handle_group(self, dn: str, attrs: Dict[str, Any]) -> Iterable[MetadataWorkUnit]:
    """Creates a workunit for LDAP groups."""
    mce = self.build_corp_group_mce(attrs)
    if mce:
        wu = MetadataWorkUnit(dn, mce)
        self.report.report_workunit(wu)
        yield wu
    yield from []
def create_metadata_work_unit(timestamp):
    dataset_snapshot = DatasetSnapshot(
        urn="urn:li:dataset:(urn:li:dataPlatform:glue,datalake_grilled.Barbeque,PROD)",
        aspects=[],
    )

    dataset_snapshot.aspects.append(Status(removed=False))

    dataset_snapshot.aspects.append(
        OwnershipClass(
            owners=[
                OwnerClass(
                    owner="urn:li:corpuser:Susan", type=OwnershipTypeClass.DATAOWNER
                )
            ],
            lastModified=AuditStampClass(
                time=timestamp, actor="urn:li:corpuser:datahub"
            ),
        )
    )

    dataset_snapshot.aspects.append(
        DatasetPropertiesClass(
            description="Grilled Food",
            customProperties={},
            uri=None,
            tags=[],
        )
    )

    fields = [
        SchemaField(
            fieldPath="Size",
            nativeDataType="int",
            type=SchemaFieldDataType(type=NumberTypeClass()),
            description="Maximum attendees permitted",
            nullable=True,
            recursive=False,
        )
    ]

    schema_metadata = SchemaMetadata(
        schemaName="datalake_grilled.Barbeque",
        version=0,
        fields=fields,
        platform="urn:li:dataPlatform:glue",
        created=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        lastModified=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
    )
    dataset_snapshot.aspects.append(schema_metadata)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return MetadataWorkUnit(id="glue-datalake_grilled.Barbeque", mce=mce)
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    def get_all_tables() -> List[dict]:
        def get_tables_from_database(database_name: str, tables: List) -> List[dict]:
            kwargs = {"DatabaseName": database_name}
            while True:
                data = self.glue_client.get_tables(**kwargs)
                tables += data["TableList"]
                if "NextToken" in data:
                    kwargs["NextToken"] = data["NextToken"]
                else:
                    break
            return tables

        def get_tables_from_all_databases() -> List[dict]:
            tables = []
            kwargs: Dict = {}
            while True:
                data = self.glue_client.search_tables(**kwargs)
                tables += data["TableList"]
                if "NextToken" in data:
                    kwargs["NextToken"] = data["NextToken"]
                else:
                    break
            return tables

        if self.source_config.database_pattern.is_fully_specified_allow_list():
            all_tables: List[dict] = []
            database_names = self.source_config.database_pattern.get_allowed_list()
            for database in database_names:
                # get_tables_from_database() extends and returns the same list,
                # so reassign rather than += to avoid double-counting tables
                all_tables = get_tables_from_database(database, all_tables)
        else:
            all_tables = get_tables_from_all_databases()
        return all_tables

    tables = get_all_tables()

    for table in tables:
        database_name = table["DatabaseName"]
        table_name = table["Name"]
        full_table_name = f"{database_name}.{table_name}"
        self.report.report_table_scanned()
        if not self.source_config.database_pattern.allowed(
            database_name
        ) or not self.source_config.table_pattern.allowed(full_table_name):
            self.report.report_table_dropped(full_table_name)
            continue
        mce = self._extract_record(table, full_table_name)
        workunit = MetadataWorkUnit(id=f"glue-{full_table_name}", mce=mce)
        self.report.report_workunit(workunit)
        yield workunit
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    with open(self.config.filename, "r") as f:
        mce_obj_list = json.load(f)
    if not isinstance(mce_obj_list, list):
        mce_obj_list = [mce_obj_list]

    for i, obj in enumerate(mce_obj_list):
        mce: MetadataChangeEvent = MetadataChangeEvent.from_obj(obj)
        wu = MetadataWorkUnit(f"file://{self.config.filename}:{i}", mce)
        self.report.report_workunit(wu)
        yield wu
def get_entity_wu(self, ingest_table, ingest_entity):
    """
    Generate an MLPrimaryKey workunit for a Feast entity.

    Parameters
    ----------
        ingest_table:
            ingested Feast table
        ingest_entity:
            ingested Feast entity
    """
    # create snapshot instance for the entity
    entity_snapshot = MLPrimaryKeySnapshot(
        urn=builder.make_ml_primary_key_urn(
            ingest_table["name"], ingest_entity["name"]
        ),
        aspects=[],
    )

    entity_sources = []

    if ingest_entity["batch_source"] is not None:
        entity_sources.append(
            builder.make_dataset_urn(
                ingest_entity["batch_source_platform"],
                ingest_entity["batch_source_name"],
                self.config.env,
            )
        )

    if ingest_entity["stream_source"] is not None:
        entity_sources.append(
            builder.make_dataset_urn(
                ingest_entity["stream_source_platform"],
                ingest_entity["stream_source_name"],
                self.config.env,
            )
        )

    # append entity name and type
    entity_snapshot.aspects.append(
        MLPrimaryKeyPropertiesClass(
            description=ingest_entity["description"],
            dataType=self.get_field_type(
                ingest_entity["type"], ingest_entity["name"]
            ),
            sources=entity_sources,
        )
    )

    # make the MCE and workunit
    mce = MetadataChangeEvent(proposedSnapshot=entity_snapshot)
    return MetadataWorkUnit(id=ingest_entity["name"], mce=mce)
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    viewfile_loader = LookerViewFileLoader(self.source_config.base_folder)

    model_files = sorted(
        f
        for f in glob.glob(
            f"{self.source_config.base_folder}/**/*.model.lkml", recursive=True
        )
    )

    for file_path in model_files:
        model_name = Path(file_path).stem
        self.reporter.report_models_scanned()

        if not self.source_config.model_pattern.allowed(model_name):
            self.reporter.report_models_dropped(model_name)
            continue
        try:
            model = self._load_model(file_path)
        except Exception:
            self.reporter.report_warning(
                "LookML", f"unable to parse Looker model: {file_path}"
            )
            continue

        for include in model.resolved_includes:
            is_view_seen = viewfile_loader.is_view_seen(include)
            if is_view_seen:
                continue

            looker_viewfile = viewfile_loader.load_viewfile(include, model.connection)
            if looker_viewfile is not None:
                for raw_view in looker_viewfile.views:
                    maybe_looker_view = LookerView.from_looker_dict(
                        raw_view,
                        model.connection,
                        looker_viewfile,
                        viewfile_loader,
                        self.source_config.parse_table_names_from_sql,
                    )
                    if maybe_looker_view:
                        self.reporter.report_views_scanned()
                        if self.source_config.view_pattern.allowed(
                            maybe_looker_view.view_name
                        ):
                            mce = self._build_dataset_mce(maybe_looker_view)
                            workunit = MetadataWorkUnit(
                                id=f"lookml-{maybe_looker_view.view_name}", mce=mce
                            )
                            self.reporter.report_workunit(workunit)
                            yield workunit
                        else:
                            self.reporter.report_views_dropped(
                                maybe_looker_view.view_name
                            )
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    topics = self.consumer.list_topics().topics
    for t in topics:
        self.report.report_topic_scanned(t)

        if self.source_config.topic_patterns.allowed(t):
            mce = self._extract_record(t)
            wu = MetadataWorkUnit(id=f"kafka-{t}", mce=mce)
            self.report.report_workunit(wu)
            yield wu
        else:
            self.report.report_dropped(t)
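# Minimal sketch of the filtering used above (an assumption about configuration, not shown in
# this snippet): topic_patterns is an AllowDenyPattern from datahub.configuration.common, i.e.
# regex allow/deny lists queried through .allowed(). The patterns below are illustrative only.
from datahub.configuration.common import AllowDenyPattern

topic_patterns = AllowDenyPattern(allow=["orders_.*"], deny=["_internal.*"])
assert topic_patterns.allowed("orders_shipped")
assert not topic_patterns.allowed("_internal_metrics")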
def get_feature_wu(self, ingest_table, ingest_feature):
    """
    Generate an MLFeature workunit for a Feast feature.

    Parameters
    ----------
        ingest_table:
            ingested Feast table
        ingest_feature:
            ingested Feast feature
    """
    # create snapshot instance for the feature
    feature_snapshot = MLFeatureSnapshot(
        urn=builder.make_ml_feature_urn(
            ingest_table["name"], ingest_feature["name"]
        ),
        aspects=[],
    )

    feature_sources = []

    if ingest_feature["batch_source"] is not None:
        feature_sources.append(
            builder.make_dataset_urn(
                ingest_feature["batch_source_platform"],
                ingest_feature["batch_source_name"],
                self.config.env,
            )
        )

    if ingest_feature["stream_source"] is not None:
        feature_sources.append(
            builder.make_dataset_urn(
                ingest_feature["stream_source_platform"],
                ingest_feature["stream_source_name"],
                self.config.env,
            )
        )

    # append feature name and type
    feature_snapshot.aspects.append(
        MLFeaturePropertiesClass(
            dataType=self.get_field_type(
                ingest_feature["type"], ingest_feature["name"]
            ),
            sources=feature_sources,
        )
    )

    # make the MCE and workunit
    mce = MetadataChangeEvent(proposedSnapshot=feature_snapshot)
    return MetadataWorkUnit(id=ingest_feature["name"], mce=mce)
def get_workunits(self) -> Iterable[MetadataWorkUnit]: with open(self.config.filename, "r") as f: mce_obj_list = json.load(f) if not isinstance(mce_obj_list, list): mce_obj_list = [mce_obj_list] for i, obj in enumerate(mce_obj_list): mce: MetadataChangeEvent = MetadataChangeEvent.from_obj(obj) if not mce.validate(): raise ValueError(f"failed to parse into valid MCE: {obj}") wu = MetadataWorkUnit(f"file://{self.config.filename}:{i}", mce) self.report.report_workunit(wu) yield wu
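# Illustrative input only (an assumption, not taken from the code above): the file sources here
# read a JSON array of serialized MetadataChangeEvent objects. A hypothetical minimal mces.json
# might look like the following; the snapshot class name and URN are examples.
#
# [
#   {
#     "proposedSnapshot": {
#       "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
#         "urn": "urn:li:dataset:(urn:li:dataPlatform:kafka,example_topic,PROD)",
#         "aspects": []
#       }
#     }
#   }
# ]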
def construct_job_workunits(
    self, connector: ConnectorManifest
) -> Iterable[MetadataWorkUnit]:
    connector_name = connector.name
    flow_urn = builder.make_data_flow_urn(
        "kafka-connect", connector_name, self.config.env
    )

    job_property_bag: Optional[Dict[str, str]] = None

    lineages = connector.lineages
    if lineages:
        for lineage in lineages:
            source_dataset = lineage.source_dataset
            source_platform = lineage.source_platform
            target_dataset = lineage.target_dataset
            target_platform = lineage.target_platform

            job_urn = builder.make_data_job_urn_with_flow(flow_urn, source_dataset)

            inlets = [builder.make_dataset_urn(source_platform, source_dataset)]
            outlets = [builder.make_dataset_urn(target_platform, target_dataset)]

            mce = models.MetadataChangeEventClass(
                proposedSnapshot=models.DataJobSnapshotClass(
                    urn=job_urn,
                    aspects=[
                        models.DataJobInfoClass(
                            name=f"{connector_name}:{source_dataset}",
                            type="COMMAND",
                            description=None,
                            customProperties=job_property_bag,
                            # externalUrl=job_url,
                        ),
                        models.DataJobInputOutputClass(
                            inputDatasets=inlets or [],
                            outputDatasets=outlets or [],
                        ),
                        # ownership,
                        # tags,
                    ],
                )
            )

            wu = MetadataWorkUnit(id=source_dataset, mce=mce)
            self.report.report_workunit(wu)
            yield wu
def get_workunits(self) -> Iterable[Union[MetadataWorkUnit, UsageStatsWorkUnit]]:
    for i, obj in enumerate(iterate_generic_file(self.config.filename)):
        if not obj.validate():
            raise ValueError(f"failed to parse: {obj} (index {i})")
        wu: Union[MetadataWorkUnit, UsageStatsWorkUnit]
        if isinstance(obj, UsageAggregationClass):
            wu = UsageStatsWorkUnit(f"file://{self.config.filename}:{i}", obj)
        else:
            wu = MetadataWorkUnit(f"file://{self.config.filename}:{i}", obj)
        self.report.report_workunit(wu)
        yield wu
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    def get_all_tables() -> List[dict]:
        def get_tables_from_database(database_name: str) -> List[dict]:
            new_tables = []
            paginator = self.glue_client.get_paginator("get_tables")
            for page in paginator.paginate(DatabaseName=database_name):
                new_tables += page["TableList"]
            return new_tables

        def get_database_names() -> List[str]:
            database_names = []
            paginator = self.glue_client.get_paginator("get_databases")
            for page in paginator.paginate():
                for db in page["DatabaseList"]:
                    if self.source_config.database_pattern.allowed(db["Name"]):
                        database_names.append(db["Name"])
            return database_names

        if self.source_config.database_pattern.is_fully_specified_allow_list():
            database_names = self.source_config.database_pattern.get_allowed_list()
        else:
            database_names = get_database_names()

        all_tables: List[dict] = []
        for database in database_names:
            all_tables += get_tables_from_database(database)
        return all_tables

    tables = get_all_tables()

    for table in tables:
        database_name = table["DatabaseName"]
        table_name = table["Name"]
        full_table_name = f"{database_name}.{table_name}"
        self.report.report_table_scanned()
        if not self.source_config.database_pattern.allowed(
            database_name
        ) or not self.source_config.table_pattern.allowed(full_table_name):
            self.report.report_table_dropped(full_table_name)
            continue
        mce = self._extract_record(table, full_table_name)
        workunit = MetadataWorkUnit(id=f"glue-{full_table_name}", mce=mce)
        self.report.report_workunit(workunit)
        yield workunit
def get_workunits(self) -> Iterable[MetadataWorkUnit]: env = "PROD" platform = "mongodb" database_names: List[str] = self.mongo_client.list_database_names() for database_name in database_names: if database_name in DENY_DATABASE_LIST: continue if not self.config.database_pattern.allowed(database_name): self.report.report_dropped(database_name) continue database = self.mongo_client[database_name] collection_names: List[str] = database.list_collection_names() for collection_name in collection_names: dataset_name = f"{database_name}.{collection_name}" if not self.config.collection_pattern.allowed(dataset_name): self.report.report_dropped(dataset_name) continue dataset_snapshot = DatasetSnapshot( urn= f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{env})", aspects=[], ) dataset_properties = DatasetPropertiesClass( tags=[], customProperties={}, ) dataset_snapshot.aspects.append(dataset_properties) # TODO: Guess the schema via sampling # State of the art seems to be https://github.com/variety/variety. # TODO: use list_indexes() or index_information() to get index information # See https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes. mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot) wu = MetadataWorkUnit(id=dataset_name, mce=mce) self.report.report_workunit(wu) yield wu
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    platform = self.platform
    nodes = loadManifestAndCatalog(
        self.config.manifest_path,
        self.config.catalog_path,
        self.config.sources_path,
        self.config.load_schemas,
        self.config.target_platform,
        self.config.env,
        self.config.node_type_pattern,
    )

    for node in nodes:
        dataset_snapshot = DatasetSnapshot(
            urn=node.datahub_urn,
            aspects=[],
        )
        custom_properties = get_custom_properties(node)
        dbt_properties = DatasetPropertiesClass(
            description=node.dbt_name,
            customProperties=custom_properties,
            tags=[],
        )
        dataset_snapshot.aspects.append(dbt_properties)

        upstreams = get_upstream_lineage(node.upstream_urns)
        if upstreams is not None:
            dataset_snapshot.aspects.append(upstreams)

        if self.config.load_schemas:
            schema_metadata = get_schema_metadata(self.report, node, platform)
            dataset_snapshot.aspects.append(schema_metadata)

        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
        self.report.report_workunit(wu)

        yield wu
def construct_lineage_workunits(
    self, connector: ConnectorManifest
) -> Iterable[MetadataWorkUnit]:
    lineages = connector.lineages
    if lineages:
        for lineage in lineages:
            source_dataset = lineage.source_dataset
            source_platform = lineage.source_platform
            target_dataset = lineage.target_dataset
            target_platform = lineage.target_platform

            mce = models.MetadataChangeEventClass(
                proposedSnapshot=models.DatasetSnapshotClass(
                    urn=builder.make_dataset_urn(
                        target_platform, target_dataset, self.config.env
                    ),
                    aspects=[
                        models.UpstreamLineageClass(
                            upstreams=[
                                models.UpstreamClass(
                                    dataset=builder.make_dataset_urn(
                                        source_platform,
                                        source_dataset,
                                        self.config.env,
                                    ),
                                    type=models.DatasetLineageTypeClass.TRANSFORMED,
                                    auditStamp=models.AuditStampClass(
                                        time=builder.get_sys_time(),
                                        actor="urn:li:corpuser:datahub",
                                    ),
                                )
                            ]
                        )
                    ],
                )
            )

            wu = MetadataWorkUnit(id=source_dataset, mce=mce)
            self.report.report_workunit(wu)
            yield wu
def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
    current_chart_page = 0
    # we will set total charts to the actual number after we get the response
    total_charts = PAGE_SIZE

    while current_chart_page * PAGE_SIZE <= total_charts:
        chart_response = self.session.get(
            f"{self.config.connect_uri}/api/v1/chart",
            params=f"q=(page:{current_chart_page},page_size:{PAGE_SIZE})",
        )
        current_chart_page += 1

        payload = chart_response.json()
        total_charts = payload["count"]
        for chart_data in payload["result"]:
            chart_snapshot = self.construct_chart_from_chart_data(chart_data)

            mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)

            wu = MetadataWorkUnit(id=chart_snapshot.urn, mce=mce)
            self.report.report_workunit(wu)

            yield wu
def get_feature_table_wu(self, ingest_table):
    """
    Generate an MLFeatureTable workunit for a Feast feature table.

    Parameters
    ----------
        ingest_table:
            ingested Feast table
    """
    featuretable_snapshot = MLFeatureTableSnapshot(
        urn=builder.make_ml_feature_table_urn("feast", ingest_table["name"]),
        aspects=[],
    )

    featuretable_snapshot.aspects.append(
        MLFeatureTablePropertiesClass(
            mlFeatures=[
                builder.make_ml_feature_urn(
                    ingest_table["name"],
                    feature["name"],
                )
                for feature in ingest_table["features"]
            ],
            # a feature table can have multiple primary keys, which then act as a composite key
            mlPrimaryKeys=[
                builder.make_ml_primary_key_urn(
                    ingest_table["name"], entity["name"]
                )
                for entity in ingest_table["entities"]
            ],
        )
    )

    # make the MCE and workunit
    mce = MetadataChangeEvent(proposedSnapshot=featuretable_snapshot)
    return MetadataWorkUnit(id=ingest_table["name"], mce=mce)
def get_workunits(self) -> Iterable[MetadataWorkUnit]: platform = "mongodb" database_names: List[str] = self.mongo_client.list_database_names() # traverse databases in sorted order so output is consistent for database_name in sorted(database_names): if database_name in DENY_DATABASE_LIST: continue if not self.config.database_pattern.allowed(database_name): self.report.report_dropped(database_name) continue database = self.mongo_client[database_name] collection_names: List[str] = database.list_collection_names() # traverse collections in sorted order so output is consistent for collection_name in sorted(collection_names): dataset_name = f"{database_name}.{collection_name}" if not self.config.collection_pattern.allowed(dataset_name): self.report.report_dropped(dataset_name) continue dataset_snapshot = DatasetSnapshot( urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.config.env})", aspects=[], ) dataset_properties = DatasetPropertiesClass( tags=[], customProperties={}, ) dataset_snapshot.aspects.append(dataset_properties) if self.config.enableSchemaInference: collection_schema = construct_schema_pymongo( database[collection_name], delimiter=".", sample_size=self.config.schemaSamplingSize, ) # initialize the schema for the collection canonical_schema: List[SchemaField] = [] # append each schema field (sort so output is consistent) for schema_field in sorted( collection_schema.values(), key=lambda x: x["delimited_name"] ): field = SchemaField( fieldPath=schema_field["delimited_name"], nativeDataType=self.get_pymongo_type_string( schema_field["type"], dataset_name ), type=self.get_field_type( schema_field["type"], dataset_name ), description=None, nullable=schema_field["nullable"], recursive=False, ) canonical_schema.append(field) # create schema metadata object for collection actor = "urn:li:corpuser:etl" sys_time = int(time.time() * 1000) schema_metadata = SchemaMetadata( schemaName=collection_name, platform=f"urn:li:dataPlatform:{platform}", version=0, hash="", platformSchema=SchemalessClass(), created=AuditStamp(time=sys_time, actor=actor), lastModified=AuditStamp(time=sys_time, actor=actor), fields=canonical_schema, ) dataset_snapshot.aspects.append(schema_metadata) # TODO: use list_indexes() or index_information() to get index information # See https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes. mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot) wu = MetadataWorkUnit(id=dataset_name, mce=mce) self.report.report_workunit(wu) yield wu
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    tables = self.get_all_tables()

    for table in tables:
        database_name = table["DatabaseName"]
        table_name = table["Name"]
        full_table_name = f"{database_name}.{table_name}"
        self.report.report_table_scanned()
        if not self.source_config.database_pattern.allowed(
            database_name
        ) or not self.source_config.table_pattern.allowed(full_table_name):
            self.report.report_table_dropped(full_table_name)
            continue
        mce = self._extract_record(table, full_table_name)
        workunit = MetadataWorkUnit(id=f"glue-{full_table_name}", mce=mce)
        self.report.report_workunit(workunit)
        yield workunit

    if self.extract_transforms:

        dags = {}
        flow_jobs = {}

        for job in self.get_all_jobs():

            flow_urn = mce_builder.make_data_flow_urn("glue", job["Name"], self.env)

            flow_wu = self.get_dataflow_wu(flow_urn, job)
            self.report.report_workunit(flow_wu)
            yield flow_wu

            dag = self.get_dataflow_graph(job["Command"]["ScriptLocation"])

            dags[flow_urn] = dag
            # remember which job produced each flow so the datajob workunits below
            # reference the correct job rather than the last one iterated
            flow_jobs[flow_urn] = job

        # run a first pass to pick up s3 bucket names and formats
        # in Glue, it's possible for two buckets to have files of different extensions
        # if this happens, we append the extension in the URN so the sources can be distinguished
        # see process_dataflow_node() for details
        s3_formats: typing.DefaultDict[str, Set[Union[str, None]]] = defaultdict(
            lambda: set()
        )

        for dag in dags.values():
            for s3_name, extension in self.get_dataflow_s3_names(dag):
                s3_formats[s3_name].add(extension)

        # run second pass to generate node workunits
        for flow_urn, dag in dags.items():

            nodes, new_dataset_ids, new_dataset_mces = self.process_dataflow_graph(
                dag, flow_urn, s3_formats
            )

            for node in nodes.values():

                if node["NodeType"] not in ["DataSource", "DataSink"]:
                    job_wu = self.get_datajob_wu(node, flow_jobs[flow_urn])
                    self.report.report_workunit(job_wu)
                    yield job_wu

            for dataset_id, dataset_mce in zip(new_dataset_ids, new_dataset_mces):
                dataset_wu = MetadataWorkUnit(id=dataset_id, mce=dataset_mce)
                self.report.report_workunit(dataset_wu)
                yield dataset_wu
def __init__(self):
    self.source_report = SourceReport()
    self.work_units: List[MetadataWorkUnit] = [
        MetadataWorkUnit(id="workunit-1", mce=get_initial_mce())
    ]