Example #1
class DatahubRestSink(Sink):
    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: SinkReport

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        self.emitter = DatahubRestEmitter(
            self.config.server,
            self.config.token,
            connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
            read_timeout_sec=self.config.timeout_sec,
        )
        self.emitter.test_connection()

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "DatahubRestSink":
        config = DatahubRestSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        pass

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        pass

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[
            Union[
                MetadataChangeEvent,
                MetadataChangeProposal,
                MetadataChangeProposalWrapper,
                UsageAggregation,
            ]
        ],
        write_callback: WriteCallback,
    ) -> None:
        record = record_envelope.record

        try:
            self.emitter.emit(record)
            self.report.report_record_written(record_envelope)
            write_callback.on_success(record_envelope, {})
        except OperationalError as e:
            self.report.report_failure({"error": e.message, "info": e.info})
            write_callback.on_failure(record_envelope, e, e.info)
        except Exception as e:
            self.report.report_failure({"e": e})
            write_callback.on_failure(record_envelope, e, {})

    def get_report(self) -> SinkReport:
        return self.report

    def close(self):
        pass
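
For orientation, a minimal sketch of driving this sink by hand, outside a full ingestion pipeline. It assumes a local GMS at http://localhost:8080; the config keys mirror the DatahubRestSinkConfig fields used above, and PipelineContext needs only a run_id here.

from datahub.ingestion.api.common import PipelineContext

# Hypothetical standalone usage of the sink defined above;
# assumes an unauthenticated GMS at localhost:8080.
config_dict = {
    "server": "http://localhost:8080",
    "token": "",
}
ctx = PipelineContext(run_id="rest-sink-demo")
sink = DatahubRestSink.create(config_dict, ctx)  # parses the config and tests the connection
# ... hand records to sink.write_record_async(record_envelope, write_callback) ...
print(sink.get_report())
sink.close()
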
Example #2
def test_datahub_rest_emitter(requests_mock, record, path, snapshot):
    def match_request_text(request: requests.Request) -> bool:
        requested_snapshot = request.json()
        assert (
            requested_snapshot == snapshot
        ), f"Expected snapshot to be {json.dumps(snapshot)}, got {json.dumps(requested_snapshot)}"
        return True

    requests_mock.post(
        f"{MOCK_GMS_ENDPOINT}{path}",
        request_headers={"X-RestLi-Protocol-Version": "2.0.0"},
        additional_matcher=match_request_text,
    )

    emitter = DatahubRestEmitter(MOCK_GMS_ENDPOINT)
    emitter.emit(record)
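
The record, path, and snapshot arguments suggest this test is driven by pytest parametrization; a sketch of what that could look like, with purely illustrative case data (the actual records and snapshots are not part of this example):

import pytest

@pytest.mark.parametrize(
    "record,path,snapshot",
    [
        # Each case supplies the record to emit, the RestLi path it should be
        # POSTed to, and the JSON body expected on the wire, e.g. (hypothetical):
        # (basic_mce, "/entities?action=ingest", basic_mce_json),
    ],
)
def test_datahub_rest_emitter(requests_mock, record, path, snapshot):
    ...  # body as in the example above
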
Example #3
# Construct an assertion platform object.
assertion_dataPlatformInstance = DataPlatformInstance(
    platform=builder.make_data_platform_urn("great-expectations")
)

# Construct a MetadataChangeProposalWrapper object for assertion platform
assertion_dataPlatformInstance_mcp = MetadataChangeProposalWrapper(
    entityType="assertion",
    changeType=ChangeType.UPSERT,
    entityUrn=assertionUrn(assertion_maxVal),
    aspectName="dataPlatformInstance",
    aspect=assertion_dataPlatformInstance,
)
# Emit Assertion entity platform aspect!
emitter.emit(assertion_dataPlatformInstance_mcp)


# Construct batch assertion result object for partition 1 batch
assertionResult_maxVal_batch_partition1 = AssertionRunEvent(
    timestampMillis=int(time.time() * 1000),
    assertionUrn=assertionUrn(assertion_maxVal),
    asserteeUrn=datasetUrn("bazTable"),
    partitionSpec=PartitionSpec(partition=json.dumps([{"country": "IN"}])),
    runId="uuid1",
    status=AssertionRunStatus.COMPLETE,
    result=AssertionResult(
        type=AssertionResultType.SUCCESS,
        externalUrl="http://example.com/uuid1",
        actualAggValue=90,
    ),
)

Example #4

import logging

from datahub.emitter.mce_builder import make_dataset_urn, make_term_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import (
    AuditStampClass,
    ChangeTypeClass,
    GlossaryTermAssociationClass,
    GlossaryTermsClass,
)

log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# First we get the current terms
gms_endpoint = "http://localhost:8080"
rest_emitter = DatahubRestEmitter(gms_server=gms_endpoint)

dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")

term_to_add = make_term_urn("Classification.HighlyConfidential")
term_association_to_add = GlossaryTermAssociationClass(urn=term_to_add)
# An audit stamp recording that we don't know when these terms were added to this dataset.
# Use int(time.time() * 1000) instead if you want to record the moment this code runs.
unknown_audit_stamp = AuditStampClass(time=0, actor="urn:li:corpuser:ingestion")

# create a brand new terms aspect
terms_aspect = GlossaryTermsClass(
    terms=[term_association_to_add],
    auditStamp=unknown_audit_stamp,
)

event: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=dataset_urn,
    aspectName="glossaryTerms",
    aspect=terms_aspect,
)
rest_emitter.emit(event)
log.info(f"Attached term {term_to_add} to dataset {dataset_urn}")
Example #5

class DatahubRestSink(Sink):
    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: SinkReport
    treat_errors_as_warnings: bool = False

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        self.emitter = DatahubRestEmitter(
            self.config.server,
            self.config.token,
            connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
            read_timeout_sec=self.config.timeout_sec,
            extra_headers=self.config.extra_headers,
        )
        self.emitter.test_connection()

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "DatahubRestSink":
        config = DatahubRestSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        if isinstance(workunit, MetadataWorkUnit):
            mwu: MetadataWorkUnit = cast(MetadataWorkUnit, workunit)
            self.treat_errors_as_warnings = mwu.treat_errors_as_warnings

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        pass

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[
            Union[
                MetadataChangeEvent,
                MetadataChangeProposal,
                MetadataChangeProposalWrapper,
                UsageAggregation,
            ]
        ],
        write_callback: WriteCallback,
    ) -> None:
        record = record_envelope.record

        try:
            self.emitter.emit(record)
            self.report.report_record_written(record_envelope)
            write_callback.on_success(record_envelope, {})
        except OperationalError as e:
            # only OperationalErrors should be ignored
            if not self.treat_errors_as_warnings:
                self.report.report_failure({
                    "error": e.message,
                    "info": e.info
                })
            else:
                # trim exception stacktraces when reporting warnings
                if "stackTrace" in e.info:
                    try:
                        e.info["stackTrace"] = "\n".join(
                            e.info["stackTrace"].split("\n")[0:2])
                    except Exception:
                        # ignore failures in trimming
                        pass
                if isinstance(record, MetadataChangeProposalWrapper):
                    # include information about the entity that failed
                    entity_id = cast(MetadataChangeProposalWrapper, record).entityUrn
                    e.info["id"] = entity_id
                else:
                    entity_id = None
                self.report.report_warning({
                    "warning": e.message,
                    "info": e.info
                })
            write_callback.on_failure(record_envelope, e, e.info)
        except Exception as e:
            self.report.report_failure({"e": e})
            write_callback.on_failure(record_envelope, e, {})

    def get_report(self) -> SinkReport:
        return self.report

    def close(self):
        pass
Example #6
def generate(
    schema_files: List[str],
    server: Optional[str],
    file: Optional[str],
    dot: Optional[str],
    png: Optional[str],
    extra_docs: Optional[str],
) -> None:
    logger.info(f"server = {server}")
    logger.info(f"file = {file}")
    logger.info(f"dot = {dot}")
    logger.info(f"png = {png}")

    entity_extra_docs = {}
    if extra_docs:
        for path in glob.glob(f"{extra_docs}/**/*.md", recursive=True):
            m = re.search("/docs/entities/(.*)/*.md", path)
            if m:
                entity_name = m.group(1)
                with open(path, "r") as doc_file:
                    file_contents = doc_file.read()
                    final_markdown = preprocess_markdown(file_contents)
                    entity_extra_docs[entity_name] = final_markdown

    for schema_file in schema_files:
        if schema_file.endswith(".yml") or schema_file.endswith(".yaml"):
            # registry file
            load_registry_file(schema_file)
        else:
            # schema file
            load_schema_file(schema_file)

    if entity_extra_docs:
        for entity_name in entity_extra_docs:
            entity_registry.get(entity_name).doc_file_contents = entity_extra_docs[
                entity_name
            ]

    relationship_graph = RelationshipGraph()
    events = generate_stitched_record(relationship_graph)

    generated_docs_dir = "../docs/generated/metamodel"
    import shutil

    shutil.rmtree(f"{generated_docs_dir}/entities", ignore_errors=True)
    entity_names = [(x, entity_registry.get(x))
                    for x in generated_documentation]

    sorted_entity_names = get_sorted_entity_names(entity_names)

    index = 0
    for category, sorted_entities in sorted_entity_names:
        for entity_name in sorted_entities:
            entity_def = entity_registry.get(entity_name)

            entity_category = entity_def.category
            entity_dir = f"{generated_docs_dir}/entities/"
            import os

            os.makedirs(entity_dir, exist_ok=True)

            with open(f"{entity_dir}/{entity_name}.md", "w") as fp:
                fp.write("---\n")
                fp.write(f"sidebar_position: {index}\n")
                fp.write("---\n")
                fp.write(generated_documentation[entity_name])
                index += 1

    if file:
        logger.info(f"Will write events to {file}")
        Path(file).parent.mkdir(parents=True, exist_ok=True)
        fileSink = FileSink(
            PipelineContext(run_id="generated-metaModel"),
            FileSinkConfig(filename=file),
        )
        for e in events:
            fileSink.write_record_async(RecordEnvelope(e, metadata={}),
                                        write_callback=NoopWriteCallback())
        fileSink.close()
        pipeline_config = {
            "source": {
                "type": "file",
                "config": {
                    "filename": file
                },
            },
            "sink": {
                "type": "datahub-rest",
                "config": {
                    "server": "${DATAHUB_SERVER:-http://localhost:8080}",
                    "token": "${DATAHUB_TOKEN:-}",
                },
            },
            "run_id": "modeldoc-generated",
        }
        pipeline_file = Path(file).parent.absolute() / "pipeline.yml"
        with open(pipeline_file, "w") as f:
            json.dump(pipeline_config, f, indent=2)
            logger.info(f"Wrote pipeline to {pipeline_file}")

    if server:
        logger.info(f"Will send events to {server}")
        assert server.startswith("http://"), "server address must start with http://"
        emitter = DatahubRestEmitter(gms_server=server)
        emitter.test_connection()
        for e in events:
            emitter.emit(e)

    if dot:
        logger.info(f"Will write dot file to {dot}")

        import pydot

        graph = pydot.Dot("my_graph", graph_type="graph")
        for node, adjacency in relationship_graph.map.items():
            my_node = pydot.Node(
                node,
                label=node,
                shape="box",
            )
            graph.add_node(my_node)
            if adjacency.self_loop:
                for relnship in adjacency.self_loop:
                    graph.add_edge(
                        pydot.Edge(src=relnship.src,
                                   dst=relnship.dst,
                                   label=relnship.name))
            if adjacency.outgoing:
                for relnship in adjacency.outgoing:
                    graph.add_edge(
                        pydot.Edge(src=relnship.src,
                                   dst=relnship.dst,
                                   label=relnship.name))
        Path(dot).parent.mkdir(parents=True, exist_ok=True)
        graph.write_raw(dot)
        if png:
            try:
                graph.write_png(png)
            except Exception as e:
                logger.error(
                    "Failed to create png file. Do you have graphviz installed?"
                )
                raise e
Example #7
def generate(
    schema_files: List[str],
    server: Optional[str],
    file: Optional[str],
    dot: Optional[str],
    png: Optional[str],
) -> None:
    logger.info(f"server = {server}")
    logger.info(f"file = {file}")
    logger.info(f"dot = {dot}")
    logger.info(f"png = {png}")

    for schema_file in schema_files:
        if schema_file.endswith(".yml") or schema_file.endswith(".yaml"):
            # registry file
            load_registry_file(schema_file)
        else:
            # schema file
            load_schema_file(schema_file)

    relationship_graph = RelationshipGraph()
    events = generate_stitched_record(relationship_graph)

    if file:
        logger.info(f"Will write events to {file}")
        Path(file).parent.mkdir(parents=True, exist_ok=True)
        fileSink = FileSink(
            PipelineContext(run_id="generated-metaModel"),
            FileSinkConfig(filename=file),
        )
        for e in events:
            fileSink.write_record_async(RecordEnvelope(e, metadata={}),
                                        write_callback=NoopWriteCallback())
        fileSink.close()
        pipeline_config = {
            "source": {
                "type": "file",
                "config": {
                    "filename": file
                },
            },
            "sink": {
                "type": "datahub-rest",
                "config": {
                    "server": "${DATAHUB_SERVER:-http://localhost:8080}",
                    "token": "${DATAHUB_TOKEN:-}",
                },
            },
            "run_id": "modeldoc-generated",
        }
        pipeline_file = Path(file).parent.absolute() / "pipeline.yml"
        with open(pipeline_file, "w") as f:
            json.dump(pipeline_config, f, indent=2)
            logger.info(f"Wrote pipeline to {pipeline_file}")

    if server:
        logger.info(f"Will send events to {server}")
        assert server.startswith("http://"), "server address must start with http://"
        emitter = DatahubRestEmitter(gms_server=server)
        emitter.test_connection()
        for e in events:
            emitter.emit(e)

    if dot:
        logger.info(f"Will write dot file to {dot}")

        import pydot

        graph = pydot.Dot("my_graph", graph_type="graph")
        for node, adjacency in relationship_graph.map.items():
            my_node = pydot.Node(
                node,
                label=node,
                shape="box",
            )
            graph.add_node(my_node)
            if adjacency.self_loop:
                for relnship in adjacency.self_loop:
                    graph.add_edge(
                        pydot.Edge(src=relnship.src,
                                   dst=relnship.dst,
                                   label=relnship.name))
            if adjacency.outgoing:
                for relnship in adjacency.outgoing:
                    graph.add_edge(
                        pydot.Edge(src=relnship.src,
                                   dst=relnship.dst,
                                   label=relnship.name))
        Path(dot).parent.mkdir(parents=True, exist_ok=True)
        graph.write_raw(dot)
        if png:
            try:
                graph.write_png(png)
            except Exception as e:
                logger.error(
                    "Failed to create png file. Do you have graphviz installed?"
                )
                raise e