class DatahubRestSink(Sink):
    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: SinkReport

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        self.emitter = DatahubRestEmitter(
            self.config.server,
            self.config.token,
            connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
            read_timeout_sec=self.config.timeout_sec,
        )
        self.emitter.test_connection()

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "DatahubRestSink":
        config = DatahubRestSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        pass

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        pass

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[
            Union[
                MetadataChangeEvent,
                MetadataChangeProposal,
                MetadataChangeProposalWrapper,
                UsageAggregation,
            ]
        ],
        write_callback: WriteCallback,
    ) -> None:
        record = record_envelope.record
        try:
            self.emitter.emit(record)
            self.report.report_record_written(record_envelope)
            write_callback.on_success(record_envelope, {})
        except OperationalError as e:
            self.report.report_failure({"error": e.message, "info": e.info})
            write_callback.on_failure(record_envelope, e, e.info)
        except Exception as e:
            self.report.report_failure({"e": e})
            write_callback.on_failure(record_envelope, e, {})

    def get_report(self) -> SinkReport:
        return self.report

    def close(self):
        pass
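# A minimal usage sketch for the sink above (not taken from the source).
# The import paths and the exact config keys ("server", "token", "timeout_sec")
# are assumptions based on the class and field names in the snippet.
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.common import PipelineContext, RecordEnvelope
from datahub.ingestion.api.sink import NoopWriteCallback
from datahub.ingestion.sink.datahub_rest import DatahubRestSink
from datahub.metadata.schema_classes import ChangeTypeClass, StatusClass

ctx = PipelineContext(run_id="rest-sink-example")
sink = DatahubRestSink.create(
    {"server": "http://localhost:8080", "token": None, "timeout_sec": 30}, ctx
)

# Any MCE/MCP/usage record works; a simple "status" aspect upsert is used here.
record = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,realestate_db.sales,PROD)",
    aspectName="status",
    aspect=StatusClass(removed=False),
)
sink.write_record_async(RecordEnvelope(record, metadata={}), NoopWriteCallback())
print(sink.get_report())
sink.close()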
def test_datahub_rest_emitter(requests_mock, record, path, snapshot):
    def match_request_text(request: requests.Request) -> bool:
        requested_snapshot = request.json()
        assert (
            requested_snapshot == snapshot
        ), f"Expected snapshot to be {json.dumps(snapshot)}, got {json.dumps(requested_snapshot)}"
        return True

    requests_mock.post(
        f"{MOCK_GMS_ENDPOINT}{path}",
        request_headers={"X-RestLi-Protocol-Version": "2.0.0"},
        additional_matcher=match_request_text,
    )

    emitter = DatahubRestEmitter(MOCK_GMS_ENDPOINT)
    emitter.emit(record)
# Construct an assertion platform object.
assertion_dataPlatformInstance = DataPlatformInstance(
    platform=builder.make_data_platform_urn("great-expectations")
)

# Construct a MetadataChangeProposalWrapper object for the assertion platform aspect.
assertion_dataPlatformInstance_mcp = MetadataChangeProposalWrapper(
    entityType="assertion",
    changeType=ChangeType.UPSERT,
    entityUrn=assertionUrn(assertion_maxVal),
    aspectName="dataPlatformInstance",
    aspect=assertion_dataPlatformInstance,
)
# Emit the assertion entity platform aspect!
emitter.emit(assertion_dataPlatformInstance_mcp)

# Construct a batch assertion result object for the partition 1 batch.
assertionResult_maxVal_batch_partition1 = AssertionRunEvent(
    timestampMillis=int(time.time() * 1000),
    assertionUrn=assertionUrn(assertion_maxVal),
    asserteeUrn=datasetUrn("bazTable"),
    partitionSpec=PartitionSpec(partition=json.dumps([{"country": "IN"}])),
    runId="uuid1",
    status=AssertionRunStatus.COMPLETE,
    result=AssertionResult(
        type=AssertionResultType.SUCCESS,
        externalUrl="http://example.com/uuid1",
        actualAggValue=90,
    ),
)
log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Point a REST emitter at the GMS server
gms_endpoint = "http://localhost:8080"
rest_emitter = DatahubRestEmitter(gms_server=gms_endpoint)

dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")

term_to_add = make_term_urn("Classification.HighlyConfidential")
term_association_to_add = GlossaryTermAssociationClass(urn=term_to_add)
# an audit stamp that says we have no idea when these terms were added to this dataset;
# use int(time.time() * 1000) instead if you want to record the time this code runs
unknown_audit_stamp = AuditStampClass(time=0, actor="urn:li:corpuser:ingestion")

# create a brand-new terms aspect (this replaces any terms already attached;
# see the read-modify-write sketch below)
terms_aspect = GlossaryTermsClass(
    terms=[term_association_to_add],
    auditStamp=unknown_audit_stamp,
)

event: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=dataset_urn,
    aspectName="glossaryTerms",
    aspect=terms_aspect,
)
rest_emitter.emit(event)
log.info(f"Attached term {term_to_add} to dataset {dataset_urn}")
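# Read-modify-write sketch: fetch the existing glossaryTerms aspect first and
# append to it instead of overwriting. This assumes the DataHubGraph client and
# its get_aspect_v2 helper (names and signature taken from contemporary versions
# of the library, so treat them as assumptions rather than guarantees).
from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig

graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))
current_terms = graph.get_aspect_v2(
    entity_urn=dataset_urn,
    aspect_type=GlossaryTermsClass,
    aspect="glossaryTerms",
)
if current_terms is None:
    current_terms = GlossaryTermsClass(terms=[], auditStamp=unknown_audit_stamp)
if term_to_add not in [assoc.urn for assoc in current_terms.terms]:
    current_terms.terms.append(term_association_to_add)

rest_emitter.emit(
    MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_urn,
        aspectName="glossaryTerms",
        aspect=current_terms,
    )
)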
class DatahubRestSink(Sink):
    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: SinkReport
    treat_errors_as_warnings: bool = False

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        self.emitter = DatahubRestEmitter(
            self.config.server,
            self.config.token,
            connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
            read_timeout_sec=self.config.timeout_sec,
            extra_headers=self.config.extra_headers,
        )
        self.emitter.test_connection()

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "DatahubRestSink":
        config = DatahubRestSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        if isinstance(workunit, MetadataWorkUnit):
            mwu: MetadataWorkUnit = cast(MetadataWorkUnit, workunit)
            self.treat_errors_as_warnings = mwu.treat_errors_as_warnings

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        pass

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[
            Union[
                MetadataChangeEvent,
                MetadataChangeProposal,
                MetadataChangeProposalWrapper,
                UsageAggregation,
            ]
        ],
        write_callback: WriteCallback,
    ) -> None:
        record = record_envelope.record
        try:
            self.emitter.emit(record)
            self.report.report_record_written(record_envelope)
            write_callback.on_success(record_envelope, {})
        except OperationalError as e:
            # only OperationalErrors should be ignored
            if not self.treat_errors_as_warnings:
                self.report.report_failure({"error": e.message, "info": e.info})
            else:
                # trim exception stacktraces when reporting warnings
                if "stackTrace" in e.info:
                    try:
                        e.info["stackTrace"] = "\n".join(
                            e.info["stackTrace"].split("\n")[0:2]
                        )
                    except Exception:
                        # ignore failures in trimming
                        pass
                if isinstance(record, MetadataChangeProposalWrapper):
                    # include information about the entity that failed
                    entity_id = cast(MetadataChangeProposalWrapper, record).entityUrn
                    e.info["id"] = entity_id
                else:
                    entity_id = None
                self.report.report_warning({"warning": e.message, "info": e.info})
            write_callback.on_failure(record_envelope, e, e.info)
        except Exception as e:
            self.report.report_failure({"e": e})
            write_callback.on_failure(record_envelope, e, {})

    def get_report(self) -> SinkReport:
        return self.report

    def close(self):
        pass
def generate(
    schema_files: List[str],
    server: Optional[str],
    file: Optional[str],
    dot: Optional[str],
    png: Optional[str],
    extra_docs: Optional[str],
) -> None:
    logger.info(f"server = {server}")
    logger.info(f"file = {file}")
    logger.info(f"dot = {dot}")
    logger.info(f"png = {png}")

    entity_extra_docs = {}
    if extra_docs:
        for path in glob.glob(f"{extra_docs}/**/*.md", recursive=True):
            m = re.search("/docs/entities/(.*)/*.md", path)
            if m:
                entity_name = m.group(1)
                with open(path, "r") as doc_file:
                    file_contents = doc_file.read()
                final_markdown = preprocess_markdown(file_contents)
                entity_extra_docs[entity_name] = final_markdown

    for schema_file in schema_files:
        if schema_file.endswith(".yml") or schema_file.endswith(".yaml"):
            # registry file
            load_registry_file(schema_file)
        else:
            # schema file
            load_schema_file(schema_file)

    if entity_extra_docs:
        for entity_name in entity_extra_docs:
            entity_registry.get(entity_name).doc_file_contents = entity_extra_docs[
                entity_name
            ]

    relationship_graph = RelationshipGraph()
    events = generate_stitched_record(relationship_graph)

    generated_docs_dir = "../docs/generated/metamodel"
    import shutil

    shutil.rmtree(f"{generated_docs_dir}/entities", ignore_errors=True)
    entity_names = [(x, entity_registry.get(x)) for x in generated_documentation]
    sorted_entity_names = get_sorted_entity_names(entity_names)

    index = 0
    for category, sorted_entities in sorted_entity_names:
        for entity_name in sorted_entities:
            entity_def = entity_registry.get(entity_name)
            entity_category = entity_def.category
            entity_dir = f"{generated_docs_dir}/entities/"
            import os

            os.makedirs(entity_dir, exist_ok=True)
            with open(f"{entity_dir}/{entity_name}.md", "w") as fp:
                fp.write("---\n")
                fp.write(f"sidebar_position: {index}\n")
                fp.write("---\n")
                fp.write(generated_documentation[entity_name])
            index += 1

    if file:
        logger.info(f"Will write events to {file}")
        Path(file).parent.mkdir(parents=True, exist_ok=True)
        fileSink = FileSink(
            PipelineContext(run_id="generated-metaModel"),
            FileSinkConfig(filename=file),
        )
        for e in events:
            fileSink.write_record_async(
                RecordEnvelope(e, metadata={}), write_callback=NoopWriteCallback()
            )
        fileSink.close()
        pipeline_config = {
            "source": {
                "type": "file",
                "config": {"filename": file},
            },
            "sink": {
                "type": "datahub-rest",
                "config": {
                    "server": "${DATAHUB_SERVER:-http://localhost:8080}",
                    "token": "${DATAHUB_TOKEN:-}",
                },
            },
            "run_id": "modeldoc-generated",
        }
        pipeline_file = Path(file).parent.absolute() / "pipeline.yml"
        with open(pipeline_file, "w") as f:
            json.dump(pipeline_config, f, indent=2)
        logger.info(f"Wrote pipeline to {pipeline_file}")

    if server:
        logger.info(f"Will send events to {server}")
        assert server.startswith("http://"), "server address must start with http://"
        emitter = DatahubRestEmitter(gms_server=server)
        emitter.test_connection()
        for e in events:
            emitter.emit(e)

    if dot:
        logger.info(f"Will write dot file to {dot}")
        import pydot

        graph = pydot.Dot("my_graph", graph_type="graph")
        for node, adjacency in relationship_graph.map.items():
            my_node = pydot.Node(
                node,
                label=node,
                shape="box",
            )
            graph.add_node(my_node)
            if adjacency.self_loop:
                for relnship in adjacency.self_loop:
                    graph.add_edge(
                        pydot.Edge(
                            src=relnship.src, dst=relnship.dst, label=relnship.name
                        )
                    )
            if adjacency.outgoing:
                for relnship in adjacency.outgoing:
                    graph.add_edge(
                        pydot.Edge(
                            src=relnship.src, dst=relnship.dst, label=relnship.name
                        )
                    )
        Path(dot).parent.mkdir(parents=True, exist_ok=True)
        graph.write_raw(dot)

    if png:
        try:
            graph.write_png(png)
        except Exception as e:
            logger.error("Failed to create png file. Do you have graphviz installed?")
            raise e
def generate(
    schema_files: List[str],
    server: Optional[str],
    file: Optional[str],
    dot: Optional[str],
    png: Optional[str],
) -> None:
    logger.info(f"server = {server}")
    logger.info(f"file = {file}")
    logger.info(f"dot = {dot}")
    logger.info(f"png = {png}")

    for schema_file in schema_files:
        if schema_file.endswith(".yml") or schema_file.endswith(".yaml"):
            # registry file
            load_registry_file(schema_file)
        else:
            # schema file
            load_schema_file(schema_file)

    relationship_graph = RelationshipGraph()
    events = generate_stitched_record(relationship_graph)

    if file:
        logger.info(f"Will write events to {file}")
        Path(file).parent.mkdir(parents=True, exist_ok=True)
        fileSink = FileSink(
            PipelineContext(run_id="generated-metaModel"),
            FileSinkConfig(filename=file),
        )
        for e in events:
            fileSink.write_record_async(
                RecordEnvelope(e, metadata={}), write_callback=NoopWriteCallback()
            )
        fileSink.close()
        pipeline_config = {
            "source": {
                "type": "file",
                "config": {"filename": file},
            },
            "sink": {
                "type": "datahub-rest",
                "config": {
                    "server": "${DATAHUB_SERVER:-http://localhost:8080}",
                    "token": "${DATAHUB_TOKEN:-}",
                },
            },
            "run_id": "modeldoc-generated",
        }
        pipeline_file = Path(file).parent.absolute() / "pipeline.yml"
        with open(pipeline_file, "w") as f:
            json.dump(pipeline_config, f, indent=2)
        logger.info(f"Wrote pipeline to {pipeline_file}")

    if server:
        logger.info(f"Will send events to {server}")
        assert server.startswith("http://"), "server address must start with http://"
        emitter = DatahubRestEmitter(gms_server=server)
        emitter.test_connection()
        for e in events:
            emitter.emit(e)

    if dot:
        logger.info(f"Will write dot file to {dot}")
        import pydot

        graph = pydot.Dot("my_graph", graph_type="graph")
        for node, adjacency in relationship_graph.map.items():
            my_node = pydot.Node(
                node,
                label=node,
                shape="box",
            )
            graph.add_node(my_node)
            if adjacency.self_loop:
                for relnship in adjacency.self_loop:
                    graph.add_edge(
                        pydot.Edge(
                            src=relnship.src, dst=relnship.dst, label=relnship.name
                        )
                    )
            if adjacency.outgoing:
                for relnship in adjacency.outgoing:
                    graph.add_edge(
                        pydot.Edge(
                            src=relnship.src, dst=relnship.dst, label=relnship.name
                        )
                    )
        Path(dot).parent.mkdir(parents=True, exist_ok=True)
        graph.write_raw(dot)

    if png:
        try:
            graph.write_png(png)
        except Exception as e:
            logger.error("Failed to create png file. Do you have graphviz installed?")
            raise e
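# A sketch of how `generate` might be exposed as a command-line entry point.
# The click decorators and option names below are assumptions; the original
# script's actual CLI wiring is not shown in this snippet.
import click


@click.command()
@click.argument("schema_files", type=click.Path(exists=True), nargs=-1, required=True)
@click.option("--server", type=str, default=None, help="GMS server to push events to")
@click.option("--file", type=str, default=None, help="File to write events to")
@click.option("--dot", type=str, default=None, help="Path for a graphviz .dot export")
@click.option("--png", type=str, default=None, help="Path for a rendered .png")
def main(schema_files, server, file, dot, png):
    generate(list(schema_files), server, file, dot, png)


if __name__ == "__main__":
    main()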