Exemple #1
0
def create_global_tags_aspect_mce(directive: Directive) -> MetadataChangeEventClass:
    """Build an MCE that attaches an empty global-tags aspect to a dataset.

    The dataset URN is derived from ``directive.table`` through
    ``dataset_name_to_urn``. The proposed snapshot carries exactly one
    aspect: a ``GlobalTagsClass`` whose ``tags`` list is empty.
    """
    dataset_urn = dataset_name_to_urn(directive.table)
    empty_tags = GlobalTagsClass(tags=[])
    snapshot = DatasetSnapshotClass(urn=dataset_urn, aspects=[empty_tags])
    return MetadataChangeEventClass(proposedSnapshot=snapshot)
Exemple #2
0
    def get_dataflow_wu(self, flow_urn: str, job: Dict[str, Any]) -> MetadataWorkUnit:
        """
        Generate a DataFlow workunit for a Glue job.

        ``Name`` and ``Role`` are read unconditionally from ``job``. The
        remaining fields (``Description``, ``CreatedOn``, ``LastModifiedOn``
        and ``Command.ScriptLocation``) are treated as optional: a missing
        description is passed through as ``None`` and a missing custom
        property is simply left out instead of raising ``KeyError``.

        Parameters
        ----------
            flow_urn:
                URN for the flow
            job:
                Job object from get_all_jobs()

        Returns
        -------
            A ``MetadataWorkUnit`` whose id is the job name and whose MCE
            holds a ``DataFlowSnapshotClass`` with one ``DataFlowInfoClass``.
        """
        # specify a few Glue-specific properties
        custom_props = {
            "role": job["Role"],
        }

        if job.get("CreatedOn") is not None:
            custom_props["created"] = str(job["CreatedOn"])

        if job.get("LastModifiedOn") is not None:
            custom_props["modified"] = str(job["LastModifiedOn"])

        # ``Command`` itself may be absent or None, so fall back to an empty dict.
        command = (job.get("Command") or {}).get("ScriptLocation")
        if command is not None:
            custom_props["command"] = command

        mce = MetadataChangeEventClass(
            proposedSnapshot=DataFlowSnapshotClass(
                urn=flow_urn,
                aspects=[
                    DataFlowInfoClass(
                        name=job["Name"],
                        description=job.get("Description"),
                        customProperties=custom_props,
                    ),
                ],
            )
        )

        return MetadataWorkUnit(id=job["Name"], mce=mce)
Exemple #3
0
    def get_datajob_wu(self, node: Dict[str, Any],
                       job: Dict[str, Any]) -> MetadataWorkUnit:
        """
        Generate a DataJob workunit for a component (node) in a Glue job.

        The snapshot carries two aspects: a ``DataJobInfoClass`` describing
        the node (its arguments become custom properties) and a
        ``DataJobInputOutputClass`` listing the node's inputs and outputs.

        Parameters
        ----------
            node:
                Node from process_dataflow_graph()
            job:
                Job object from get_all_jobs()
        """
        node_urn = node["urn"]
        job_name = job["Name"]
        node_type = node["NodeType"]
        node_id = node["Id"]

        # Node arguments go in first; the two fixed keys are written
        # afterwards so they win over an argument with the same name.
        custom_props = {arg["Name"]: arg["Value"] for arg in node["Args"]}
        custom_props["transformType"] = node_type
        custom_props["nodeId"] = node_id

        job_info = DataJobInfoClass(
            name=f"{job_name}:{node_type}-{node_id}",
            type="GLUE",
            customProperties=custom_props,
        )
        job_io = DataJobInputOutputClass(
            inputDatasets=node["inputDatasets"],
            outputDatasets=node["outputDatasets"],
            inputDatajobs=node["inputDatajobs"],
        )

        snapshot = DataJobSnapshotClass(urn=node_urn, aspects=[job_info, job_io])
        mce = MetadataChangeEventClass(proposedSnapshot=snapshot)

        return MetadataWorkUnit(id=f"{job_name}-{node_id}", mce=mce)
Exemple #4
0
def make_lineage_mce(
    upstream_urns: List[str],
    downstream_urn: str,
    actor: str = make_user_urn("datahub"),
    lineage_type: str = DatasetLineageTypeClass.TRANSFORMED,
) -> MetadataChangeEventClass:
    """Build an MCE declaring ``upstream_urns`` as upstreams of ``downstream_urn``.

    ``get_sys_time()`` is read once and that value is used in the audit
    stamp of every upstream edge, together with ``actor``. Each edge is
    typed with ``lineage_type``. The default ``actor`` is computed once,
    when the function is defined.
    """
    now = get_sys_time()

    upstream_edges = []
    for source_urn in upstream_urns:
        stamp = AuditStampClass(time=now, actor=actor)
        upstream_edges.append(
            UpstreamClass(auditStamp=stamp, dataset=source_urn, type=lineage_type)
        )

    lineage = UpstreamLineageClass(upstreams=upstream_edges)
    snapshot = DatasetSnapshotClass(urn=downstream_urn, aspects=[lineage])
    return MetadataChangeEventClass(proposedSnapshot=snapshot)
Exemple #5
0
def test_serde_to_avro(pytestconfig, json_filename):
    """Round-trip MCEs through Avro and check that nothing changes.

    Steps: load MCE objects from the JSON file, serialize them to Avro in
    an in-memory buffer, read them back into MCE objects, and compare the
    string form of each original against its round-tripped counterpart.
    """
    source_file = pytestconfig.rootpath / json_filename
    original_mces = list(iterate_mce_file(str(source_file)))

    # MCE objects -> Avro bytes.
    avro_schema = fastavro.parse_schema(json.loads(SCHEMA_JSON_STR))
    buffer = io.BytesIO()
    serialized = [event.to_obj(tuples=True) for event in original_mces]
    fastavro.writer(buffer, avro_schema, serialized)

    # Avro bytes -> MCE objects; rewind first so the reader starts at byte 0.
    buffer.seek(0)
    decoded_records = list(fastavro.reader(buffer))
    restored_mces = [
        MetadataChangeEventClass.from_obj(decoded, tuples=True)
        for decoded in decoded_records
    ]

    # Same count, and pairwise identical string representations.
    assert len(original_mces) == len(restored_mces)
    for before, after in zip(original_mces, restored_mces):
        assert str(before) == str(after)
Exemple #6
0
def get_initial_mce() -> MetadataChangeEventClass:
    """Return a fixed MCE fixture for the ``test_platform`` dataset.

    The snapshot holds one ``DatasetPropertiesClass`` aspect with a canned
    description, and the event carries system metadata with a fixed
    ``lastObserved`` value and the run id ``pipeline_test``.
    """
    properties = DatasetPropertiesClass(description="test.description")
    snapshot = DatasetSnapshotClass(
        urn="urn:li:dataset:(urn:li:dataPlatform:test_platform,test,PROD)",
        aspects=[properties],
    )
    metadata = SystemMetadata(lastObserved=1586847600000, runId="pipeline_test")
    return MetadataChangeEventClass(
        proposedSnapshot=snapshot,
        systemMetadata=metadata,
    )
Exemple #7
0
def create_owner_entity_mce(owner: str) -> MetadataChangeEventClass:
    """Build an MCE that registers ``owner`` as an active corp user.

    The raw ``owner`` string is used for both display and full name. The
    result of ``clean_owner_name(owner)`` is used to derive the user URN
    (via ``owner_name_to_urn``) and the email value.
    """
    clean_name = clean_owner_name(owner)
    user_urn = owner_name_to_urn(clean_name)

    # NOTE(review): the "[email protected]" text in this literal looks like an
    # e-mail-obfuscation artifact from wherever this snippet was copied;
    # confirm the intended "@domain" suffix before relying on this value.
    user_info = CorpUserInfoClass(
        active=True,
        displayName=owner,
        fullName=owner,
        email=f"{clean_name}[email protected]",
    )

    snapshot = CorpUserSnapshotClass(urn=user_urn, aspects=[user_info])
    return MetadataChangeEventClass(proposedSnapshot=snapshot)
Exemple #8
0
def get_initial_mce() -> MetadataChangeEventClass:
    """Return a fixed MCE fixture for the ``test_platform`` dataset.

    The snapshot holds one ``DatasetPropertiesClass`` aspect with a canned
    description, empty custom properties, no URI and no tags.
    """
    properties = DatasetPropertiesClass(
        description="test.description",
        customProperties={},
        uri=None,
        tags=[],
    )
    snapshot = DatasetSnapshotClass(
        urn="urn:li:dataset:(urn:li:dataPlatform:test_platform,test,PROD)",
        aspects=[properties],
    )
    return MetadataChangeEventClass(proposedSnapshot=snapshot)
 def transform_aspect(
         self, entity_urn: str, aspect_name: str,
         aspect: Optional[Aspect]) -> Optional[Aspect]:
     """Default ``transform_aspect`` that delegates to ``transform_one``.

     The aspect is wrapped in a throwaway dataset-snapshot MCE so that
     transformers written against the MCE interface keep working. The
     first aspect of the transformed MCE is returned, or ``None`` when the
     transformer returned no aspects.
     """
     # Deliberately not marked as @abstractmethod, to avoid impacting
     # transformers that extend this class.
     wrapped_aspects = [aspect] if aspect else []
     fake_mce: MetadataChangeEventClass = MetadataChangeEventClass(
         proposedSnapshot=DatasetSnapshotClass(
             urn=entity_urn,
             aspects=wrapped_aspects,  # type: ignore
         ))
     transformed_mce = self.transform_one(fake_mce)
     assert transformed_mce.proposedSnapshot
     returned_aspects = transformed_mce.proposedSnapshot.aspects
     assert (
         len(returned_aspects) <= 1
     ), "This implementation assumes that transformers will return at most 1 aspect value back"
     if not returned_aspects:
         return None
     return returned_aspects[0]  # type: ignore
Exemple #10
0
    def generate_mce(self) -> MetadataChangeEventClass:
        """Build the DataFlow MCE for this object.

        The snapshot's aspects are a ``DataFlowInfoClass`` built from this
        object's id, description, properties and url, followed by whatever
        ``generate_ownership_aspect()`` and ``generate_tags_aspect()``
        yield, in that order.
        """
        flow_urn = str(self.urn)
        flow_info = DataFlowInfoClass(
            name=self.id,
            description=self.description,
            customProperties=self.properties,
            externalUrl=self.url,
        )
        all_aspects = [
            flow_info,
            *self.generate_ownership_aspect(),
            *self.generate_tags_aspect(),
        ]
        snapshot = DataFlowSnapshotClass(urn=flow_urn, aspects=all_aspects)
        return MetadataChangeEventClass(proposedSnapshot=snapshot)
Exemple #11
0
def make_lineage_mce(
    upstream_urns: List[str],
    downstream_urn: str,
    lineage_type: str = DatasetLineageTypeClass.TRANSFORMED,
) -> MetadataChangeEventClass:
    """Build an MCE declaring ``upstream_urns`` as upstreams of ``downstream_urn``.

    One ``UpstreamClass`` edge of type ``lineage_type`` is created per
    upstream URN; all edges go into a single ``UpstreamLineageClass``
    aspect on the downstream dataset's snapshot.
    """
    upstream_edges = []
    for source_urn in upstream_urns:
        upstream_edges.append(UpstreamClass(dataset=source_urn, type=lineage_type))

    lineage = UpstreamLineageClass(upstreams=upstream_edges)
    snapshot = DatasetSnapshotClass(urn=downstream_urn, aspects=[lineage])
    return MetadataChangeEventClass(proposedSnapshot=snapshot)
Exemple #12
0
def create_lineage_aspect_mce(
        directive: Directive) -> MetadataChangeEventClass:
    """Build an upstream-lineage MCE for the table named in ``directive``.

    Every entry of ``directive.depends_on`` becomes a ``TRANSFORMED``
    upstream edge. Each edge gets its own audit stamp: the wall-clock time
    in milliseconds at the moment the edge is built, attributed to the
    ``datahub`` corp user.
    """
    target_urn = dataset_name_to_urn(directive.table)

    upstream_edges = []
    for upstream in directive.depends_on:
        upstream_edges.append(
            UpstreamClass(
                dataset=dataset_name_to_urn(upstream),
                type=DatasetLineageTypeClass.TRANSFORMED,
                auditStamp=AuditStampClass(
                    time=int(time.time() * 1000),
                    actor="urn:li:corpuser:datahub",
                ),
            )
        )

    lineage = UpstreamLineageClass(upstreams=upstream_edges)
    snapshot = DatasetSnapshotClass(urn=target_urn, aspects=[lineage])
    return MetadataChangeEventClass(proposedSnapshot=snapshot)
Exemple #13
0
def assert_entity_mce_aspect(entity_urn: str, aspect: Any, aspect_type: Type,
                             file: str) -> int:
    """Assert that every ``aspect_type`` aspect emitted for an entity equals ``aspect``.

    Loads the JSON output in ``file`` (which must be a list), keeps the
    entries that pass the MCE filter for the entity's type and whose URN
    equals ``entity_urn``, and converts them to MCE objects. Then checks
    each aspect of type ``aspect_type`` in those MCEs against ``aspect``.

    Returns the number of aspects that were checked.
    """
    test_output = load_json_file(file)
    entity_type = Urn.create_from_string(entity_urn).get_type()
    assert isinstance(test_output, list)

    # Phase 1: select and deserialize the MCEs that belong to this entity.
    entity_mces: List[MetadataChangeEventClass] = []
    for raw_event in test_output:
        is_entity_mce = _get_filter(mce=True, entity_type=entity_type)(raw_event)
        if is_entity_mce and _get_element(
                raw_event, _get_mce_urn_path_spec(entity_type)) == entity_urn:
            entity_mces.append(MetadataChangeEventClass.from_obj(raw_event))

    # Phase 2: compare every aspect of the requested type.
    matches = 0
    for event in entity_mces:
        for candidate in event.proposedSnapshot.aspects:
            if not isinstance(candidate, aspect_type):
                continue
            assert candidate == aspect
            matches += 1
    return matches
Exemple #14
0
def create_editable_schema_info_aspect_mce(
    directive: Directive,
) -> MetadataChangeEventClass:
    """Build an MCE carrying an empty editable-schema aspect for a table.

    The aspect has no field info. Its ``created`` and ``lastModified``
    audit stamps are each built from their own clock read (milliseconds)
    and attributed to the ``datahub`` corp user.
    """

    def _stamp_now() -> AuditStampClass:
        # One fresh clock read per stamp.
        return AuditStampClass(
            time=int(time.time() * 1000),
            actor="urn:li:corpuser:datahub",
        )

    target_urn = dataset_name_to_urn(directive.table)
    editable_schema = EditableSchemaMetadataClass(
        created=_stamp_now(),
        lastModified=_stamp_now(),
        editableSchemaFieldInfo=[],
    )
    snapshot = DatasetSnapshotClass(urn=target_urn, aspects=[editable_schema])
    return MetadataChangeEventClass(proposedSnapshot=snapshot)
Exemple #15
0
    def get_dataflow_wu(self, flow_urn: str,
                        job: Dict[str, Any]) -> MetadataWorkUnit:
        """
        Generate a DataFlow workunit for a Glue job.

        The flow's external URL points at the job's graph in the Glue
        Studio console for the configured AWS region. ``created``,
        ``modified`` and ``command`` custom properties are only set when
        the job object provides the corresponding values.

        Parameters
        ----------
            flow_urn:
                URN for the flow
            job:
                Job object from get_all_jobs()
        """

        region = self.source_config.aws_region

        # specify a few Glue-specific properties
        custom_props = {"role": job["Role"]}

        # Timestamps are optional; copy each one across only when present.
        optional_timestamps = (
            ("CreatedOn", "created"),
            ("LastModifiedOn", "modified"),
        )
        for job_key, prop_name in optional_timestamps:
            if job.get(job_key) is not None:
                custom_props[prop_name] = str(job[job_key])

        script_location = job.get("Command", {}).get("ScriptLocation")
        if script_location is not None:
            custom_props["command"] = script_location

        job_name = job["Name"]
        flow_info = DataFlowInfoClass(
            name=job_name,
            description=job.get("Description"),
            externalUrl=f"https://{region}.console.aws.amazon.com/gluestudio/home?region={region}#/editor/job/{job_name}/graph",
            customProperties=custom_props,
        )
        snapshot = DataFlowSnapshotClass(urn=flow_urn, aspects=[flow_info])
        mce = MetadataChangeEventClass(proposedSnapshot=snapshot)

        return MetadataWorkUnit(id=job_name, mce=mce)
Exemple #16
0
def create_ownership_aspect_mce(
        directive: Directive) -> MetadataChangeEventClass:
    """Build an ownership MCE for the table named in ``directive``.

    Each name in ``directive.owners`` is cleaned, converted to a URN and
    recorded as a ``DATAOWNER``. The aspect's ``lastModified`` stamp uses
    the current wall-clock time in milliseconds and the ``datahub`` corp
    user as actor.
    """
    target_urn = dataset_name_to_urn(directive.table)

    owner_entries = []
    for owner in directive.owners:
        owner_urn = owner_name_to_urn(clean_owner_name(owner))
        owner_entries.append(
            OwnerClass(owner=owner_urn, type=OwnershipTypeClass.DATAOWNER)
        )

    modified_stamp = AuditStampClass(
        time=int(time.time() * 1000),
        actor="urn:li:corpuser:datahub",
    )
    ownership = OwnershipClass(owners=owner_entries, lastModified=modified_stamp)
    snapshot = DatasetSnapshotClass(urn=target_urn, aspects=[ownership])
    return MetadataChangeEventClass(proposedSnapshot=snapshot)
Exemple #17
0
    def get_datajob_wu(
        self, node: Dict[str, Any], job: Dict[str, Any]
    ) -> MetadataWorkUnit:
        """
        Generate a DataJob workunit for a component (node) in a Glue job.

        The snapshot carries a ``DataJobInfoClass`` (node arguments become
        custom properties) and a ``DataJobInputOutputClass`` listing the
        node's inputs and outputs.

        Parameters
        ----------
            node:
                Node from process_dataflow_graph()
            job:
                Job object from get_all_jobs()
        """

        region = self.source_config.aws_region
        node_urn = node["urn"]
        job_name = job["Name"]
        node_type = node["NodeType"]
        node_id = node["Id"]

        # there's no way to view an individual job node by link, so just show the graph
        graph_url = f"https://{region}.console.aws.amazon.com/gluestudio/home?region={region}#/editor/job/{job_name}/graph"

        # Node arguments go in first; the two fixed keys are written
        # afterwards so they win over an argument with the same name.
        custom_props = {arg["Name"]: arg["Value"] for arg in node["Args"]}
        custom_props["transformType"] = node_type
        custom_props["nodeId"] = node_id

        job_info = DataJobInfoClass(
            name=f"{job_name}:{node_type}-{node_id}",
            type="GLUE",
            externalUrl=graph_url,
            customProperties=custom_props,
        )
        job_io = DataJobInputOutputClass(
            inputDatasets=node["inputDatasets"],
            outputDatasets=node["outputDatasets"],
            inputDatajobs=node["inputDatajobs"],
        )

        snapshot = DataJobSnapshotClass(urn=node_urn, aspects=[job_info, job_io])
        mce = MetadataChangeEventClass(proposedSnapshot=snapshot)

        return MetadataWorkUnit(id=f"{job_name}-{node_id}", mce=mce)
Exemple #18
0
    def generate_mce(self) -> MetadataChangeEventClass:
        """Build the DataJob MCE for this object.

        The snapshot's aspects are, in order: a ``DataJobInfoClass`` (the
        job's name, falling back to its id when no name is set), a
        ``DataJobInputOutputClass`` with the stringified inlets, outlets
        and upstream job URNs, then whatever ``generate_ownership_aspect()``
        and ``generate_tags_aspect()`` yield.
        """
        job_urn = str(self.urn)

        display_name = self.id if self.name is None else self.name
        job_info = DataJobInfoClass(
            name=display_name,
            type=AzkabanJobTypeClass.COMMAND,
            description=self.description,
            customProperties=self.properties,
            externalUrl=self.url,
        )

        job_io = DataJobInputOutputClass(
            inputDatasets=[str(inlet) for inlet in self.inlets],
            outputDatasets=[str(outlet) for outlet in self.outlets],
            inputDatajobs=[str(upstream) for upstream in self.upstream_urns],
        )

        all_aspects = [
            job_info,
            job_io,
            *self.generate_ownership_aspect(),
            *self.generate_tags_aspect(),
        ]
        snapshot = DataJobSnapshotClass(urn=job_urn, aspects=all_aspects)
        return MetadataChangeEventClass(proposedSnapshot=snapshot)
Exemple #19
0
def create_editable_schema_info_aspect_mce(
        directive: Directive) -> MetadataChangeEventClass:
    """Build an MCE carrying an empty editable-schema aspect for a table.

    The dataset URN comes from ``directive.table``; the single aspect is
    an ``EditableSchemaMetadataClass`` with no field info.
    """
    target_urn = dataset_name_to_urn(directive.table)
    editable_schema = EditableSchemaMetadataClass(editableSchemaFieldInfo=[])
    snapshot = DatasetSnapshotClass(urn=target_urn, aspects=[editable_schema])
    return MetadataChangeEventClass(proposedSnapshot=snapshot)
Exemple #20
0
def read_mces(path: os.PathLike) -> List[MetadataChangeEventClass]:
    """Load a JSON file holding a list of serialized MCEs.

    Parameters
    ----------
        path:
            Location of the JSON file; its top-level value is iterated, so
            it is expected to be a list of MCE objects.

    Returns
    -------
        One ``MetadataChangeEventClass`` per entry, in file order.
    """
    # Read as UTF-8 explicitly instead of depending on the platform's
    # locale encoding, which is not UTF-8 everywhere.
    with open(path, encoding="utf-8") as f:
        objs = json.load(f)
    # The file is fully parsed at this point, so it need not stay open
    # while the objects are converted.
    return [MetadataChangeEventClass.from_obj(obj) for obj in objs]
Exemple #21
0
def generate_stitched_record(relnships_graph: RelationshipGraph) -> List[Any]:
    """Turn every registered entity into a dataset description plus events.

    For each entry of the module-level ``entity_registry`` that has at
    least one aspect present in ``aspect_registry``:

    * the aspect schemas are stitched into one Avro record schema;
    * that schema is converted to schema fields, with a synthetic ``urn``
      key field placed first;
    * fields are tagged (``Aspect``, ``Temporal``, ``Searchable``) from the
      annotations found in their ``jsonProps``;
    * ``Relationship`` annotations become foreign-key constraints and are
      also recorded as edges on ``relnships_graph`` (which is mutated);
    * a ``DatasetSnapshotClass`` is assembled for the entity.

    Afterwards each snapshot receives a ``DatasetPropertiesClass`` whose
    description comes from ``make_entity_docs``, and two events are emitted
    per dataset: an MCE wrapping the snapshot and an MCP that upserts the
    ``subTypes`` aspect with the type name ``entity``.

    Parameters
    ----------
        relnships_graph:
            Graph that collects entity-to-entity relationship edges; it is
            filled in while fields are processed and read again when the
            entity docs are generated.

    Returns
    -------
        The list of events, two per dataset (MCE first, then MCP).
    """

    def strip_types(field_path: str) -> str:
        """Remove ``[type=...].`` segments and a leading ``[version=2.0].`` marker."""
        final_path = field_path
        final_path = re.sub(r"(\[type=[a-zA-Z]+\]\.)", "", final_path)
        final_path = re.sub(r"^\[version=2.0\]\.", "", final_path)
        return final_path

    datasets: List[DatasetSnapshotClass] = []

    for entity_name, entity_def in entity_registry.items():
        entity_display_name = entity_def.display_name
        entity_fields = []
        for aspect_name in entity_def.aspects:
            if aspect_name not in aspect_registry:
                print(
                    f"Did not find aspect name: {aspect_name} in aspect_registry"
                )
                continue

            # all aspects should have a schema
            aspect_schema = aspect_registry[aspect_name].schema
            assert aspect_schema
            entity_fields.append({
                "type": aspect_schema.to_json(),
                "name": aspect_name,
            })

        if entity_fields:
            names = avro.schema.Names()
            field_objects = []
            for f in entity_fields:
                field = avro.schema.Field(
                    type=f["type"],
                    name=f["name"],
                    has_default=False,
                )
                field_objects.append(field)

            # The record is created with no fields while ``Names.add_name``
            # is swapped for the module-level ``add_name``; the real field
            # objects are attached afterwards through ``set_prop``.
            with unittest.mock.patch("avro.schema.Names.add_name", add_name):
                entity_avro_schema = avro.schema.RecordSchema(
                    name=entity_name,
                    namespace="datahub.metadata.model",
                    names=names,
                    fields=[],
                )
                entity_avro_schema.set_prop("fields", field_objects)
            rawSchema = json.dumps(entity_avro_schema.to_json())
            # always add the URN which is the primary key
            urn_field = SchemaField(
                fieldPath="urn",
                type=SchemaFieldDataTypeClass(type=StringTypeClass()),
                nativeDataType="string",
                nullable=False,
                isPartOfKey=True,
                description=
                f"The primary identifier for the {entity_name} entity. See the {entity_def.keyAspect} field to understand the structure of this urn.",
            )
            schema_fields: List[SchemaField] = [
                urn_field
            ] + avro_schema_to_mce_fields(rawSchema)
            foreign_keys: List[ForeignKeyConstraintClass] = []
            source_dataset_urn = make_dataset_urn(
                platform=make_data_platform_urn("datahub"),
                name=f"{entity_display_name}",
            )
            for f_field in schema_fields:
                if f_field.jsonProps:
                    json_dict = json.loads(f_field.jsonProps)
                    if "Aspect" in json_dict:
                        aspect_info = json_dict["Aspect"]
                        f_field.globalTags = f_field.globalTags or GlobalTagsClass(
                            tags=[])
                        f_field.globalTags.tags.append(
                            TagAssociationClass(tag="urn:li:tag:Aspect"))
                        # if this is the key aspect, also add primary-key
                        if entity_def.keyAspect == aspect_info.get("name"):
                            f_field.isPartOfKey = True

                        if "timeseries" == aspect_info.get("type", ""):
                            # globalTags was initialised just above, so it
                            # is safe to append to it directly here.
                            f_field.globalTags.tags.append(
                                TagAssociationClass(tag="urn:li:tag:Temporal"))
                    if "Searchable" in json_dict:
                        f_field.globalTags = f_field.globalTags or GlobalTagsClass(
                            tags=[])
                        f_field.globalTags.tags.append(
                            TagAssociationClass(tag="urn:li:tag:Searchable"))
                    if "Relationship" in json_dict:
                        relationship_info = json_dict["Relationship"]
                        # detect if we have relationship specified at leaf level or thru path specs
                        if "entityTypes" not in relationship_info:
                            # path spec
                            assert (
                                len(relationship_info.keys()) == 1
                            ), "We should never have more than one path spec assigned to a relationship annotation"
                            # Unwrap the single path-spec entry; its key is
                            # not needed, only the nested annotation.
                            final_info = None
                            for v in relationship_info.values():
                                final_info = v
                            relationship_info = final_info

                        assert "entityTypes" in relationship_info

                        entity_types: List[str] = relationship_info.get(
                            "entityTypes", [])
                        relnship_name = relationship_info.get("name", None)
                        for entity_type in entity_types:
                            destination_entity_name = capitalize_first(
                                entity_type)

                            foreign_dataset_urn = make_dataset_urn(
                                platform=make_data_platform_urn("datahub"),
                                name=destination_entity_name,
                            )
                            fkey = ForeignKeyConstraintClass(
                                name=relnship_name,
                                foreignDataset=foreign_dataset_urn,
                                foreignFields=[
                                    f"urn:li:schemaField:({foreign_dataset_urn}, urn)"
                                ],
                                sourceFields=[
                                    f"urn:li:schemaField:({source_dataset_urn},{f_field.fieldPath})"
                                ],
                            )
                            foreign_keys.append(fkey)
                            relnships_graph.add_edge(
                                entity_display_name,
                                destination_entity_name,
                                fkey.name,
                                f" via `{strip_types(f_field.fieldPath)}`",
                                edge_id=
                                f"{entity_display_name}:{fkey.name}:{destination_entity_name}:{strip_types(f_field.fieldPath)}",
                            )

            schemaMetadata = SchemaMetadataClass(
                schemaName=f"{entity_name}",
                platform=make_data_platform_urn("datahub"),
                platformSchema=OtherSchemaClass(rawSchema=rawSchema),
                fields=schema_fields,
                version=0,
                hash="",
                foreignKeys=foreign_keys if foreign_keys else None,
            )

            dataset = DatasetSnapshotClass(
                urn=make_dataset_urn(
                    platform=make_data_platform_urn("datahub"),
                    name=f"{entity_display_name}",
                ),
                aspects=[
                    schemaMetadata,
                    GlobalTagsClass(
                        tags=[TagAssociationClass(tag="urn:li:tag:Entity")]),
                    BrowsePathsClass(
                        [f"/prod/datahub/entities/{entity_display_name}"]),
                ],
            )
            datasets.append(dataset)

    events: List[Union[MetadataChangeEventClass,
                       MetadataChangeProposalWrapper]] = []

    # Docs are generated only after every entity has been processed, so the
    # relationship graph already holds all edges when it is read here.
    for d in datasets:
        # Take the second comma-separated part of the URN's last
        # colon-separated segment and use it as the entity name.
        entity_name = d.urn.split(":")[-1].split(",")[1]
        d.aspects.append(
            DatasetPropertiesClass(
                description=make_entity_docs(entity_name, relnships_graph)))

        mce = MetadataChangeEventClass(proposedSnapshot=d, )
        events.append(mce)

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=d.urn,
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=["entity"]),
        )
        events.append(mcp)
    return events