Example #1
    def get_ownership_to_set(
        graph: DataHubGraph, urn: str, mce_ownership: Optional[OwnershipClass]
    ) -> Optional[OwnershipClass]:
        if not mce_ownership or not mce_ownership.owners:
            # nothing to add, no need to consult server
            return None
        assert mce_ownership
        server_ownership = graph.get_ownership(entity_urn=urn)
        if server_ownership:
            # compute patch
            # we only include owners that are not already present in the server ownership
            # if owner ids match but the ownership type differs, we prefer the transformer's opinion
            owners_to_add: List[OwnerClass] = []
            needs_update = False
            server_owner_ids = [o.owner for o in server_ownership.owners]
            for owner in mce_ownership.owners:
                if owner.owner not in server_owner_ids:
                    owners_to_add.append(owner)
                else:
                    # we need to check if the type matches, and if it doesn't, update it
                    for server_owner in server_ownership.owners:
                        if (
                            owner.owner == server_owner.owner
                            and owner.type != server_owner.type
                        ):
                            server_owner.type = owner.type
                            needs_update = True

            if owners_to_add or needs_update:
                mce_ownership.owners = server_ownership.owners + owners_to_add
                return mce_ownership
            else:
                return None
        else:
            return mce_ownership
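
A hedged usage sketch for this helper; the graph, dataset_urn, and proposed_ownership values below are illustrative assumptions, not part of the original snippet:

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
from datahub.metadata.schema_classes import (
    OwnerClass,
    OwnershipClass,
    OwnershipTypeClass,
)

graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))
dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:hive,realestate_db.sales,PROD)"
proposed_ownership = OwnershipClass(
    owners=[OwnerClass(owner="urn:li:corpuser:jdoe", type=OwnershipTypeClass.DATAOWNER)]
)

# returns None when the server already has everything; otherwise the merged aspect
patched = get_ownership_to_set(graph, dataset_urn, proposed_ownership)
if patched is not None:
    graph.emit(MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=patched))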
Example #2
 def __init__(
     self,
     run_id: str,
     datahub_api: Optional[DatahubClientConfig] = None,
     pipeline_name: Optional[str] = None,
     dry_run: bool = False,
     preview_mode: bool = False,
 ) -> None:
     self.run_id = run_id
     self.graph = (
         DataHubGraph(datahub_api) if datahub_api is not None else None
     )
     self.pipeline_name = pipeline_name
     self.dry_run_mode = dry_run
     self.preview_mode = preview_mode
     self.reporters: Dict[str, Committable] = dict()
     self.checkpointers: Dict[str, Committable] = dict()
     self._set_dataset_urn_to_lower_if_needed()
Example #4
class PipelineContext:
    def __init__(
        self,
        run_id: str,
        datahub_api: Optional[DatahubClientConfig] = None,
        pipeline_name: Optional[str] = None,
        dry_run: bool = False,
        preview_mode: bool = False,
    ) -> None:
        self.run_id = run_id
        self.graph = (
            DataHubGraph(datahub_api) if datahub_api is not None else None
        )
        self.pipeline_name = pipeline_name
        self.dry_run_mode = dry_run
        self.preview_mode = preview_mode
        self.reporters: Dict[str, Committable] = dict()
        self.checkpointers: Dict[str, Committable] = dict()
        self._set_dataset_urn_to_lower_if_needed()

    def _set_dataset_urn_to_lower_if_needed(self) -> None:
        # TODO: Get rid of this function once lower-casing is the standard.
        if self.graph:
            server_config = self.graph.get_config()
            if server_config and server_config.get("datasetUrnNameCasing"):
                set_dataset_urn_to_lower(True)

    def register_checkpointer(self, committable: Committable) -> None:
        if committable.name in self.checkpointers:
            raise IndexError(
                f"Checkpointing provider {committable.name} already registered."
            )
        self.checkpointers[committable.name] = committable

    def register_reporter(self, committable: Committable) -> None:
        if committable.name in self.reporters:
            raise IndexError(
                f"Reporting provider {committable.name} already registered."
            )
        self.reporters[committable.name] = committable

    def get_reporters(self) -> Iterable[Committable]:
        yield from self.reporters.values()

    def get_committables(self) -> Iterable[Tuple[str, Committable]]:
        yield from self.reporters.items()
        yield from self.checkpointers.items()
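
A minimal usage sketch, assuming hypothetical values (the run id, server URL, and my_committable object are all made up for illustration):

from datahub.ingestion.graph.client import DatahubClientConfig

ctx = PipelineContext(
    run_id="demo-run",  # hypothetical run id
    datahub_api=DatahubClientConfig(server="http://localhost:8080"),
    pipeline_name="demo_pipeline",
    dry_run=True,
)
# my_committable is a hypothetical Committable implementation
ctx.register_checkpointer(my_committable)
for name, committable in ctx.get_committables():
    print(name, committable)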
Example #5
 @classmethod
 def create(cls, config_dict: Dict[str, Any],
            ctx: PipelineContext) -> IngestionStateProvider:
     if ctx.graph:
         return cls(ctx.graph)
     elif config_dict is None:
         raise ConfigurationError("Missing provider configuration")
     else:
         provider_config = DatahubIngestionStateProviderConfig.parse_obj(
             config_dict
         )
         if provider_config.datahub_api:
             graph = DataHubGraph(provider_config.datahub_api)
             return cls(graph)
         else:
             raise ConfigurationError(
                 "Missing datahub_api. Provide either a global one or under the state_provider."
             )
Example #6
 @classmethod
 def create(cls, config_dict: Dict[str, Any], ctx: PipelineContext,
            name: str) -> IngestionCheckpointingProviderBase:
     if ctx.graph:
         # Use the pipeline-level graph if set
         return cls(ctx.graph, name)
     elif config_dict is None:
         raise ConfigurationError("Missing provider configuration.")
     else:
         provider_config = DatahubIngestionStateProviderConfig.parse_obj(
             config_dict
         )
         if provider_config.datahub_api:
             graph = DataHubGraph(provider_config.datahub_api)
             return cls(graph, name)
         else:
             raise ConfigurationError(
                 "Missing datahub_api. Provide either a global one or under the state_provider."
             )
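
A hedged usage sketch; the concrete provider class name DatahubIngestionCheckpointingProvider and the config values below are assumptions for illustration:

# Without a pipeline-level graph, the provider builds its own from config_dict
ctx = PipelineContext(run_id="demo-run")  # ctx.graph is None here
provider = DatahubIngestionCheckpointingProvider.create(
    config_dict={"datahub_api": {"server": "http://localhost:8080"}},
    ctx=ctx,
    name="datahub",
)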
Example #7
import logging
from typing import Optional

from datahub.emitter.mce_builder import make_dataset_urn, make_user_urn
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
from datahub.metadata.schema_classes import (
    OwnerClass,
    OwnershipClass,
    OwnershipTypeClass,
)

logging.basicConfig(level=logging.INFO)

# Inputs -> owner, ownership_type, dataset
owner_to_add = make_user_urn("jdoe")
ownership_type = OwnershipTypeClass.DATAOWNER
dataset_urn = make_dataset_urn(platform="hive",
                               name="realestate_db.sales",
                               env="PROD")

# Some objects to help with conditional pathways later
owner_class_to_add = OwnerClass(owner=owner_to_add, type=ownership_type)
ownership_to_add = OwnershipClass(owners=[owner_class_to_add])

# First we get the current owners
gms_endpoint = "http://localhost:8080"
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))

current_owners: Optional[OwnershipClass] = graph.get_aspect_v2(
    entity_urn=dataset_urn,
    aspect="ownership",
    aspect_type=OwnershipClass,
)

need_write = False
if current_owners:
    if (owner_to_add, ownership_type) not in [
        (x.owner, x.type) for x in current_owners.owners
    ]:
        # owners exist, but this owner is not present in the current owners
        current_owners.owners.append(owner_class_to_add)
        need_write = True
else:
    # ownership aspect does not exist on the server yet, create it from scratch
    current_owners = ownership_to_add
    need_write = True
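
A sketch of the likely write-back step (assumed continuation; the excerpt ends before it):

from datahub.emitter.mcp import MetadataChangeProposalWrapper

# emit the merged ownership aspect only when something actually changed
if need_write:
    graph.emit(
        MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=current_owners)
    )
else:
    logging.info(f"Owner {owner_to_add} already exists, omitting write")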
Example #8
        path = ".".join(tokens)
        return path
    else:
        # not a v2, we assume this is a simple path
        return field_path


# Inputs -> the column, dataset and the tag to set
column = "address.zipcode"
dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")
tag_to_add = make_tag_urn("location")


# First we get the current editable schema metadata
gms_endpoint = "http://localhost:8080"
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))


current_editable_schema_metadata = graph.get_aspect_v2(
    entity_urn=dataset_urn,
    aspect="editableSchemaMetadata",
    aspect_type=EditableSchemaMetadataClass,
)


# Some pre-built objects to help all the conditional pathways
tag_association_to_add = TagAssociationClass(tag=tag_to_add)
tags_aspect_to_set = GlobalTagsClass(tags=[tag_association_to_add])
field_info_to_set = EditableSchemaFieldInfoClass(
    fieldPath=column, globalTags=tags_aspect_to_set
)
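
A quick sanity check of the helper on a made-up v2-style path:

v2_path = "[version=2.0].[type=struct].[type=struct].address.[type=string].zipcode"
assert get_simple_field_path_from_v2_field_path(v2_path) == "address.zipcode"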
Example #9
import time

from datahub.emitter.mce_builder import make_dataset_urn
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
from datahub.metadata.schema_classes import (
    AuditStampClass,
    EditableDatasetPropertiesClass,
    InstitutionalMemoryMetadataClass,
)

# Inputs -> the documentation and link to add; the excerpt starts mid-script,
# so the two values below are placeholders standing in for the originals
documentation_to_add = "## The Real Estate Sales Dataset\nInformation about real estate sales, organized by address."  # placeholder
link_to_add = "https://example.com/real_estate"  # placeholder URL
link_description = "This is the definition of what real estate means"
dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")

# Some helpful variables to fill out objects later
now = int(time.time() * 1000)  # milliseconds since epoch
current_timestamp = AuditStampClass(time=now, actor="urn:li:corpuser:ingestion")
institutional_memory_element = InstitutionalMemoryMetadataClass(
    url=link_to_add,
    description=link_description,
    createStamp=current_timestamp,
)


# First we get the current editable properties
gms_endpoint = "http://localhost:8080"
graph = DataHubGraph(config=DatahubClientConfig(server=gms_endpoint))

current_editable_properties = graph.get_aspect_v2(
    entity_urn=dataset_urn,
    aspect="editableDatasetProperties",
    aspect_type=EditableDatasetPropertiesClass,
)

need_write = False
if current_editable_properties:
    if documentation_to_add != current_editable_properties.description:
        current_editable_properties.description = documentation_to_add
        need_write = True
else:
    # create a brand new editable dataset properties aspect
    current_editable_properties = EditableDatasetPropertiesClass(
        created=current_timestamp,
        description=documentation_to_add,
    )
    need_write = True
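
As in the ownership example above, the assumed continuation emits the aspect only when it changed:

from datahub.emitter.mcp import MetadataChangeProposalWrapper

if need_write:
    graph.emit(
        MetadataChangeProposalWrapper(
            entityUrn=dataset_urn, aspect=current_editable_properties
        )
    )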
Example #10
import logging
from typing import Optional

from datahub.emitter.mce_builder import make_dataset_urn, make_term_urn
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph

# Imports for metadata model classes
from datahub.metadata.schema_classes import (
    AuditStampClass,
    ChangeTypeClass,
    GlossaryTermAssociationClass,
    GlossaryTermsClass,
)

log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# First we get the current terms
gms_endpoint = "http://localhost:8080"
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))

dataset_urn = make_dataset_urn(platform="hive",
                               name="realestate_db.sales",
                               env="PROD")

current_terms: Optional[GlossaryTermsClass] = graph.get_aspect_v2(
    entity_urn=dataset_urn,
    aspect="glossaryTerms",
    aspect_type=GlossaryTermsClass,
)

term_to_add = make_term_urn("Classification.HighlyConfidential")
term_association_to_add = GlossaryTermAssociationClass(urn=term_to_add)
# an audit stamp that says we have no idea when these terms were added to this dataset;
# use int(time.time() * 1000) instead of 0 to record the time of the current run
unknown_audit_stamp = AuditStampClass(time=0, actor="urn:li:corpuser:ingestion")
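
A sketch of the likely continuation (assumed; the excerpt ends here): append the term only when it is missing, create the aspect from scratch otherwise, and emit on change.

from datahub.emitter.mcp import MetadataChangeProposalWrapper

need_write = False
if current_terms:
    if term_to_add not in [x.urn for x in current_terms.terms]:
        # terms exist, but this term is not among them
        current_terms.terms.append(term_association_to_add)
        need_write = True
else:
    # create a brand new glossary terms aspect
    current_terms = GlossaryTermsClass(
        terms=[term_association_to_add],
        auditStamp=unknown_audit_stamp,
    )
    need_write = True

if need_write:
    graph.emit(
        MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=current_terms)
    )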