class DatahubRestSink(Sink):
    """Sink that sends each record to a DataHub server through the REST emitter.

    Despite the ``write_record_async`` name, the emit call in this class is
    made inline: the record is emitted, the outcome is recorded on
    ``self.report`` and the write callback is notified before the method
    returns.
    """

    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: SinkReport

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        """Store the config, create the emitter and call its connection test."""
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        emitter_timeouts = dict(
            # reuse timeout_sec for connect timeout
            connect_timeout_sec=self.config.timeout_sec,
            read_timeout_sec=self.config.timeout_sec,
        )
        self.emitter = DatahubRestEmitter(
            self.config.server, self.config.token, **emitter_timeouts
        )
        self.emitter.test_connection()

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "DatahubRestSink":
        """Alternate constructor: parse ``config_dict`` and build the sink."""
        return cls(ctx, DatahubRestSinkConfig.parse_obj(config_dict))

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        """No per-work-unit setup is needed by this sink."""

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        """No per-work-unit teardown is needed by this sink."""

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[
            Union[
                MetadataChangeEvent,
                MetadataChangeProposal,
                MetadataChangeProposalWrapper,
                UsageAggregation,
            ]
        ],
        write_callback: WriteCallback,
    ) -> None:
        """Emit the enveloped record and report success or failure.

        An ``OperationalError`` is reported with its ``message`` and ``info``;
        any other exception is reported as-is. In both cases the callback's
        ``on_failure`` is invoked instead of re-raising.
        """
        payload = record_envelope.record
        try:
            self.emitter.emit(payload)
            # Kept inside the try block: an exception raised while reporting or
            # in the success callback is routed to the handlers below as well.
            self.report.report_record_written(record_envelope)
            write_callback.on_success(record_envelope, {})
        except OperationalError as op_err:
            failure = {"error": op_err.message, "info": op_err.info}
            self.report.report_failure(failure)
            write_callback.on_failure(record_envelope, op_err, op_err.info)
        except Exception as err:
            self.report.report_failure({"e": err})
            write_callback.on_failure(record_envelope, err, {})

    def get_report(self) -> SinkReport:
        """Return the report object accumulated by this sink."""
        return self.report

    def close(self):
        """Nothing to release in this implementation."""
class DatahubRestSink(Sink):
    """Sink that sends records to a DataHub server over REST using a thread pool.

    Each record is submitted to ``self.executor``; when the emit call finishes
    ``_write_done_callback`` records the outcome on ``self.report`` and
    notifies the write callback.
    """

    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: DataHubRestSinkReport
    # Copied from the current MetadataWorkUnit in handle_work_unit_start().
    treat_errors_as_warnings: bool = False

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        """Create the emitter, verify connectivity and start the worker pool.

        Raises:
            ConfigurationError: if the emitter's connection test raises. The
                original exception is passed as the second argument and is
                also chained as ``__cause__``.
        """
        super().__init__(ctx)
        self.config = config
        self.report = DataHubRestSinkReport()
        self.emitter = DatahubRestEmitter(
            self.config.server,
            self.config.token,
            connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
            read_timeout_sec=self.config.timeout_sec,
            retry_status_codes=self.config.retry_status_codes,
            retry_max_times=self.config.retry_max_times,
            extra_headers=self.config.extra_headers,
            ca_certificate_path=self.config.ca_certificate_path,
        )
        try:
            gms_config = self.emitter.test_connection()
        except Exception as exc:
            # The token value itself is never put in the message.
            raise ConfigurationError(
                f"💥 Failed to connect to DataHub@{self.config.server} (token:{'XXX-redacted' if self.config.token else 'empty'}) over REST",
                exc,
            ) from exc

        self.report.gms_version = (
            gms_config.get("versions", {})
            .get("linkedin/datahub", {})
            .get("version", "")
        )
        logger.debug("Setting env variables to override config")
        set_env_variables_override_config(self.config.server, self.config.token)
        logger.debug("Setting gms config")
        set_gms_config(gms_config)
        self.executor = concurrent.futures.ThreadPoolExecutor(
            max_workers=self.config.max_threads
        )

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "DatahubRestSink":
        """Alternate constructor: parse ``config_dict`` and build the sink."""
        config = DatahubRestSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        """Pick up the error-handling mode of a ``MetadataWorkUnit``."""
        if isinstance(workunit, MetadataWorkUnit):
            self.treat_errors_as_warnings = workunit.treat_errors_as_warnings

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        """No per-work-unit teardown is needed by this sink."""

    def _write_done_callback(
        self,
        record_envelope: RecordEnvelope,
        write_callback: WriteCallback,
        future: concurrent.futures.Future,
    ) -> None:
        """Report the outcome of one submitted emit call.

        Args:
            record_envelope: The envelope whose record was submitted.
            write_callback: Receives ``on_success`` or ``on_failure``.
            future: The future returned by ``self.executor.submit``.
        """
        if future.cancelled():
            self.report.report_failure({"error": "future was cancelled"})
            write_callback.on_failure(
                record_envelope, OperationalError("future was cancelled"), {}
            )
        elif future.done():
            e = future.exception()
            if e is None:
                self.report.report_record_written(record_envelope)
                # The emit call's result is unpacked as a (start, end) pair.
                start_time, end_time = future.result()
                self.report.report_downstream_latency(start_time, end_time)
                write_callback.on_success(record_envelope, {})
            elif isinstance(e, OperationalError):
                # only OperationalErrors should be ignored
                # NOTE(review): the flag is read when the future completes, not
                # when the record was submitted - confirm this is intended.
                if not self.treat_errors_as_warnings:
                    self.report.report_failure({"error": e.message, "info": e.info})
                else:
                    # trim exception stacktraces when reporting warnings
                    if "stackTrace" in e.info:
                        try:
                            e.info["stackTrace"] = "\n".join(
                                e.info["stackTrace"].split("\n")[:2]
                            )
                        except Exception:
                            # ignore failures in trimming
                            pass
                    record = record_envelope.record
                    if isinstance(record, MetadataChangeProposalWrapper):
                        # include information about the entity that failed
                        e.info["id"] = record.entityUrn
                    self.report.report_warning({"warning": e.message, "info": e.info})
                # The callback is told about the failure in both modes.
                write_callback.on_failure(record_envelope, e, e.info)
            else:
                self.report.report_failure({"e": e})
                # future.exception() may hold a BaseException; it is wrapped
                # in an Exception before being handed to the callback.
                write_callback.on_failure(record_envelope, Exception(e), {})

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[
            Union[
                MetadataChangeEvent,
                MetadataChangeProposal,
                MetadataChangeProposalWrapper,
                UsageAggregation,
            ]
        ],
        write_callback: WriteCallback,
    ) -> None:
        """Submit the record to the thread pool and attach the done-callback."""
        record = record_envelope.record
        write_future = self.executor.submit(self.emitter.emit, record)
        write_future.add_done_callback(
            functools.partial(
                self._write_done_callback, record_envelope, write_callback
            )
        )

    def get_report(self) -> SinkReport:
        """Return the report object accumulated by this sink."""
        return self.report

    def close(self):
        """Wait for all submitted writes to finish and shut the pool down."""
        self.executor.shutdown(wait=True)
def generate(
    schema_files: List[str],
    server: Optional[str],
    file: Optional[str],
    dot: Optional[str],
    png: Optional[str],
    extra_docs: Optional[str],
) -> None:
    """Load the model files, write per-entity docs and export the events.

    Args:
        schema_files: Files to load. Names ending in ``.yml`` / ``.yaml`` go
            to ``load_registry_file``; everything else to ``load_schema_file``.
        server: If set, every event is emitted to this server through
            ``DatahubRestEmitter``. Must start with ``http://``.
        file: If set, events are written to this path through ``FileSink`` and
            a ``pipeline.yml`` is written next to it.
        dot: If set, the relationship graph is written to this path with pydot.
        png: If set, the graph is also rendered to this path. Only honoured
            when ``dot`` is set, because the graph is built in that branch.
        extra_docs: If set, directory searched recursively for ``*.md`` files
            whose contents are attached to the matching registry entities.

    Raises:
        AssertionError: if ``server`` is set and does not start with
            ``http://``.
    """
    # Function-scope imports, as before; now done once rather than per entity.
    import os
    import shutil

    logger.info(f"server = {server}")
    logger.info(f"file = {file}")
    logger.info(f"dot = {dot}")
    logger.info(f"png = {png}")

    entity_extra_docs = {}
    if extra_docs:
        for path in glob.glob(f"{extra_docs}/**/*.md", recursive=True):
            # NOTE(review): this is a regex, not a glob - "/*" means "zero or
            # more slashes" and "." matches any character. Left unchanged
            # because the intended directory layout is not visible here.
            m = re.search("/docs/entities/(.*)/*.md", path)
            if m:
                with open(path, "r") as doc_file:
                    file_contents = doc_file.read()
                entity_extra_docs[m.group(1)] = preprocess_markdown(file_contents)

    for schema_file in schema_files:
        if schema_file.endswith((".yml", ".yaml")):
            # registry file
            load_registry_file(schema_file)
        else:
            # schema file
            load_schema_file(schema_file)

    # Attach the extra markdown only after the registry has been loaded.
    for entity_name, extra_markdown in entity_extra_docs.items():
        entity_registry.get(entity_name).doc_file_contents = extra_markdown

    relationship_graph = RelationshipGraph()
    events = generate_stitched_record(relationship_graph)

    generated_docs_dir = "../docs/generated/metamodel"
    shutil.rmtree(f"{generated_docs_dir}/entities", ignore_errors=True)
    entity_names = [(x, entity_registry.get(x)) for x in generated_documentation]
    sorted_entity_names = get_sorted_entity_names(entity_names)

    entity_dir = f"{generated_docs_dir}/entities/"
    os.makedirs(entity_dir, exist_ok=True)
    # sidebar_position is a running counter across all categories.
    index = 0
    for _category, sorted_entities in sorted_entity_names:
        for entity_name in sorted_entities:
            with open(f"{entity_dir}/{entity_name}.md", "w") as fp:
                fp.write("---\n")
                fp.write(f"sidebar_position: {index}\n")
                fp.write("---\n")
                fp.write(generated_documentation[entity_name])
            index += 1

    if file:
        logger.info(f"Will write events to {file}")
        Path(file).parent.mkdir(parents=True, exist_ok=True)
        file_sink = FileSink(
            PipelineContext(run_id="generated-metaModel"),
            FileSinkConfig(filename=file),
        )
        try:
            for e in events:
                file_sink.write_record_async(
                    RecordEnvelope(e, metadata={}),
                    write_callback=NoopWriteCallback(),
                )
        finally:
            # Close the sink even when writing an event fails.
            file_sink.close()

        pipeline_config = {
            "source": {
                "type": "file",
                "config": {"filename": file},
            },
            "sink": {
                "type": "datahub-rest",
                "config": {
                    "server": "${DATAHUB_SERVER:-http://localhost:8080}",
                    "token": "${DATAHUB_TOKEN:-}",
                },
            },
            "run_id": "modeldoc-generated",
        }
        pipeline_file = Path(file).parent.absolute() / "pipeline.yml"
        # The recipe is serialised with json.dump into the .yml file.
        with open(pipeline_file, "w") as f:
            json.dump(pipeline_config, f, indent=2)
        logger.info(f"Wrote pipeline to {pipeline_file}")

    if server:
        logger.info(f"Will send events to {server}")
        # Raised explicitly so the check also runs under ``python -O``;
        # AssertionError is kept so existing callers see the same type.
        if not server.startswith("http://"):
            raise AssertionError("server address must start with http://")
        emitter = DatahubRestEmitter(gms_server=server)
        emitter.test_connection()
        for e in events:
            emitter.emit(e)

    if dot:
        logger.info(f"Will write dot file to {dot}")

        import pydot

        graph = pydot.Dot("my_graph", graph_type="graph")
        for node, adjacency in relationship_graph.map.items():
            graph.add_node(pydot.Node(node, label=node, shape="box"))
            # Self loops first, then outgoing edges; empty groups are skipped.
            for relnships in (adjacency.self_loop, adjacency.outgoing):
                if relnships:
                    for relnship in relnships:
                        graph.add_edge(
                            pydot.Edge(
                                src=relnship.src,
                                dst=relnship.dst,
                                label=relnship.name,
                            )
                        )
        Path(dot).parent.mkdir(parents=True, exist_ok=True)
        graph.write_raw(dot)
        if png:
            try:
                graph.write_png(png)
            except Exception:
                logger.error(
                    "Failed to create png file. Do you have graphviz installed?"
                )
                raise
class DatahubRestSink(Sink):
    """Sink that sends each record to a DataHub server through the REST emitter.

    The emit call is made inline in ``write_record_async``. An
    ``OperationalError`` is reported as a failure or, when the current work
    unit asked for it, as a warning.
    """

    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: SinkReport
    # Copied from the current MetadataWorkUnit in handle_work_unit_start().
    treat_errors_as_warnings: bool = False

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        """Store the config, create the emitter and call its connection test."""
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        self.emitter = DatahubRestEmitter(
            self.config.server,
            self.config.token,
            connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
            read_timeout_sec=self.config.timeout_sec,
            extra_headers=self.config.extra_headers,
        )
        self.emitter.test_connection()

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "DatahubRestSink":
        """Alternate constructor: parse ``config_dict`` and build the sink."""
        config = DatahubRestSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        """Pick up the error-handling mode of a ``MetadataWorkUnit``."""
        if isinstance(workunit, MetadataWorkUnit):
            self.treat_errors_as_warnings = workunit.treat_errors_as_warnings

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        """No per-work-unit teardown is needed by this sink."""

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[
            Union[
                MetadataChangeEvent,
                MetadataChangeProposal,
                MetadataChangeProposalWrapper,
                UsageAggregation,
            ]
        ],
        write_callback: WriteCallback,
    ) -> None:
        """Emit the enveloped record and report success, warning or failure.

        Exceptions are not re-raised; they are passed to
        ``write_callback.on_failure``.
        """
        record = record_envelope.record
        try:
            self.emitter.emit(record)
            self.report.report_record_written(record_envelope)
            write_callback.on_success(record_envelope, {})
        except OperationalError as e:
            # only OperationalErrors should be ignored
            if not self.treat_errors_as_warnings:
                self.report.report_failure({"error": e.message, "info": e.info})
            else:
                # trim exception stacktraces when reporting warnings
                if "stackTrace" in e.info:
                    try:
                        e.info["stackTrace"] = "\n".join(
                            e.info["stackTrace"].split("\n")[:2]
                        )
                    except Exception:
                        # ignore failures in trimming
                        pass
                if isinstance(record, MetadataChangeProposalWrapper):
                    # include information about the entity that failed
                    e.info["id"] = record.entityUrn
                self.report.report_warning({"warning": e.message, "info": e.info})
            # The callback is told about the failure in both modes.
            write_callback.on_failure(record_envelope, e, e.info)
        except Exception as e:
            self.report.report_failure({"e": e})
            write_callback.on_failure(record_envelope, e, {})

    def get_report(self) -> SinkReport:
        """Return the report object accumulated by this sink."""
        return self.report

    def close(self):
        """Nothing to release in this implementation."""
def generate(
    schema_files: List[str],
    server: Optional[str],
    file: Optional[str],
    dot: Optional[str],
    png: Optional[str],
) -> None:
    """Load the model files and export the stitched events.

    Args:
        schema_files: Files to load. Names ending in ``.yml`` / ``.yaml`` go
            to ``load_registry_file``; everything else to ``load_schema_file``.
        server: If set, every event is emitted to this server through
            ``DatahubRestEmitter``. Must start with ``http://``.
        file: If set, events are written to this path through ``FileSink`` and
            a ``pipeline.yml`` is written next to it.
        dot: If set, the relationship graph is written to this path with pydot.
        png: If set, the graph is also rendered to this path. Only honoured
            when ``dot`` is set, because the graph is built in that branch.

    Raises:
        AssertionError: if ``server`` is set and does not start with
            ``http://``.
    """
    logger.info(f"server = {server}")
    logger.info(f"file = {file}")
    logger.info(f"dot = {dot}")
    logger.info(f"png = {png}")

    for schema_file in schema_files:
        if schema_file.endswith((".yml", ".yaml")):
            # registry file
            load_registry_file(schema_file)
        else:
            # schema file
            load_schema_file(schema_file)

    relationship_graph = RelationshipGraph()
    events = generate_stitched_record(relationship_graph)

    if file:
        logger.info(f"Will write events to {file}")
        Path(file).parent.mkdir(parents=True, exist_ok=True)
        file_sink = FileSink(
            PipelineContext(run_id="generated-metaModel"),
            FileSinkConfig(filename=file),
        )
        try:
            for e in events:
                file_sink.write_record_async(
                    RecordEnvelope(e, metadata={}),
                    write_callback=NoopWriteCallback(),
                )
        finally:
            # Close the sink even when writing an event fails.
            file_sink.close()

        pipeline_config = {
            "source": {
                "type": "file",
                "config": {"filename": file},
            },
            "sink": {
                "type": "datahub-rest",
                "config": {
                    "server": "${DATAHUB_SERVER:-http://localhost:8080}",
                    "token": "${DATAHUB_TOKEN:-}",
                },
            },
            "run_id": "modeldoc-generated",
        }
        pipeline_file = Path(file).parent.absolute() / "pipeline.yml"
        # The recipe is serialised with json.dump into the .yml file.
        with open(pipeline_file, "w") as f:
            json.dump(pipeline_config, f, indent=2)
        logger.info(f"Wrote pipeline to {pipeline_file}")

    if server:
        logger.info(f"Will send events to {server}")
        # Raised explicitly so the check also runs under ``python -O``;
        # AssertionError is kept so existing callers see the same type.
        if not server.startswith("http://"):
            raise AssertionError("server address must start with http://")
        emitter = DatahubRestEmitter(gms_server=server)
        emitter.test_connection()
        for e in events:
            emitter.emit(e)

    if dot:
        logger.info(f"Will write dot file to {dot}")

        import pydot

        graph = pydot.Dot("my_graph", graph_type="graph")
        for node, adjacency in relationship_graph.map.items():
            graph.add_node(pydot.Node(node, label=node, shape="box"))
            # Self loops first, then outgoing edges; empty groups are skipped.
            for relnships in (adjacency.self_loop, adjacency.outgoing):
                if relnships:
                    for relnship in relnships:
                        graph.add_edge(
                            pydot.Edge(
                                src=relnship.src,
                                dst=relnship.dst,
                                label=relnship.name,
                            )
                        )
        Path(dot).parent.mkdir(parents=True, exist_ok=True)
        graph.write_raw(dot)
        if png:
            try:
                graph.write_png(png)
            except Exception:
                logger.error(
                    "Failed to create png file. Do you have graphviz installed?"
                )
                raise