Code example #1
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        env: str = "PROD"
        platform = self.platform
        nodes = loadManifestAndCatalog(
            self.config.manifest_path, self.config.catalog_path, platform, env
        )

        for node in nodes:
            mce = MetadataChangeEvent()

            dataset_snapshot = DatasetSnapshot()
            dataset_snapshot.urn = node.datahub_urn
            custom_properties = get_custom_properties(node)

            dbt_properties = DatasetPropertiesClass()
            dbt_properties.description = node.dbt_name
            dbt_properties.customProperties = custom_properties

            dataset_snapshot.aspects.append(dbt_properties)

            upstreams = get_upstream_lineage(node.upstream_urns)
            if upstreams is not None:
                dataset_snapshot.aspects.append(upstreams)

            schema_metadata = get_schema_metadata(self.report, node, platform)
            dataset_snapshot.aspects.append(schema_metadata)

            mce.proposedSnapshot = dataset_snapshot
            wu = MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
            self.report.report_workunit(wu)

            yield wu
Code example #2
File: redshift.py Project: shirshanka/datahub
    def get_workunits(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
        try:
            self.inspect_version()
        except Exception as e:
            self.report.report_failure("version", f"Error: {e}")
            return

        for wu in super().get_workunits():
            yield wu
            if (isinstance(wu, SqlWorkUnit)
                    and isinstance(wu.metadata, MetadataChangeEvent) and
                    isinstance(wu.metadata.proposedSnapshot, DatasetSnapshot)):
                lineage_mcp = None
                lineage_properties_aspect: Optional[
                    DatasetPropertiesClass] = None

                dataset_snapshot: DatasetSnapshotClass = wu.metadata.proposedSnapshot
                assert dataset_snapshot

                if self.config.include_table_lineage:
                    lineage_mcp, lineage_properties_aspect = self.get_lineage_mcp(
                        wu.metadata.proposedSnapshot.urn)

                if lineage_mcp is not None:
                    lineage_wu = MetadataWorkUnit(
                        id=f"redshift-{lineage_mcp.entityUrn}-{lineage_mcp.aspectName}",
                        mcp=lineage_mcp,
                    )
                    self.report.report_workunit(lineage_wu)

                    yield lineage_wu

                if lineage_properties_aspect:
                    aspects = dataset_snapshot.aspects
                    if aspects is None:
                        aspects = []

                    dataset_properties_aspect: Optional[
                        DatasetPropertiesClass] = None

                    for aspect in aspects:
                        if isinstance(aspect, DatasetPropertiesClass):
                            dataset_properties_aspect = aspect

                    if dataset_properties_aspect is None:
                        dataset_properties_aspect = DatasetPropertiesClass()
                        aspects.append(dataset_properties_aspect)

                    # Merge the lineage-derived properties into the existing
                    # custom properties (read-copy-update); the aspect is
                    # already in `aspects`, so no second append is needed.
                    custom_properties = (
                        {
                            **dataset_properties_aspect.customProperties,
                            **lineage_properties_aspect.customProperties,
                        }
                        if dataset_properties_aspect.customProperties
                        else lineage_properties_aspect.customProperties
                    )
                    dataset_properties_aspect.customProperties = custom_properties
                    dataset_snapshot.aspects = aspects
Code example #3
File: snowflake.py Project: swaroopjagadish/datahub
    def get_workunits(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
        for wu in super().get_workunits():
            if (self.config.include_table_lineage
                    and isinstance(wu, MetadataWorkUnit)
                    and isinstance(wu.metadata, MetadataChangeEvent) and
                    isinstance(wu.metadata.proposedSnapshot, DatasetSnapshot)):
                dataset_snapshot: DatasetSnapshot = wu.metadata.proposedSnapshot
                assert dataset_snapshot
                # Join the workunit stream from super with the lineage info using the urn.
                lineage_info = self._get_upstream_lineage_info(
                    dataset_snapshot.urn)
                if lineage_info is not None:
                    # Emit the lineage work unit
                    upstream_lineage, upstream_column_props = lineage_info
                    lineage_mcpw = MetadataChangeProposalWrapper(
                        entityType="dataset",
                        changeType=ChangeTypeClass.UPSERT,
                        entityUrn=dataset_snapshot.urn,
                        aspectName="upstreamLineage",
                        aspect=upstream_lineage,
                    )
                    lineage_wu = MetadataWorkUnit(
                        id=f"{self.platform}-{lineage_mcpw.entityUrn}-{lineage_mcpw.aspectName}",
                        mcp=lineage_mcpw,
                    )
                    self.report.report_workunit(lineage_wu)
                    yield lineage_wu

                    # Update the super's workunit to include the column-lineage in the custom properties. We need to follow
                    # the RCU semantics for both the aspects & customProperties in order to preserve the changes made by super.
                    aspects = dataset_snapshot.aspects
                    if aspects is None:
                        aspects = []
                    dataset_properties_aspect: Optional[
                        DatasetPropertiesClass] = None
                    for aspect in aspects:
                        if isinstance(aspect, DatasetPropertiesClass):
                            dataset_properties_aspect = aspect
                    if dataset_properties_aspect is None:
                        dataset_properties_aspect = DatasetPropertiesClass()
                        aspects.append(dataset_properties_aspect)

                    custom_properties = (
                        {
                            **dataset_properties_aspect.customProperties,
                            **upstream_column_props,
                        }
                        if dataset_properties_aspect.customProperties
                        else upstream_column_props
                    )
                    dataset_properties_aspect.customProperties = custom_properties
                    dataset_snapshot.aspects = aspects

            # Emit the work unit from super.
            yield wu
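
The customProperties merge above (and in examples #2 and #4) follows read-copy-update semantics: start from whatever super() already emitted, overlay the lineage-derived properties, and write the result back. A minimal standalone sketch of just that merge step; the helper name is ours, not DataHub's:

from typing import Dict, Optional

def merge_custom_properties(
    existing: Optional[Dict[str, str]],
    incoming: Dict[str, str],
) -> Dict[str, str]:
    # Keep everything already present; incoming (lineage-derived)
    # properties win on key collisions.
    if not existing:
        return incoming
    return {**existing, **incoming}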
Code example #4
File: clickhouse.py Project: hsheth2/datahub
    def get_workunits(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
        for wu in super().get_workunits():
            if (self.config.include_table_lineage
                    and isinstance(wu, SqlWorkUnit)
                    and isinstance(wu.metadata, MetadataChangeEvent) and
                    isinstance(wu.metadata.proposedSnapshot, DatasetSnapshot)):
                dataset_snapshot: DatasetSnapshotClass = wu.metadata.proposedSnapshot
                assert dataset_snapshot

                lineage_mcp, lineage_properties_aspect = self.get_lineage_mcp(
                    wu.metadata.proposedSnapshot.urn)

                if lineage_mcp is not None:
                    lineage_wu = MetadataWorkUnit(
                        id=f"{self.platform}-{lineage_mcp.entityUrn}-{lineage_mcp.aspectName}",
                        mcp=lineage_mcp,
                    )
                    self.report.report_workunit(lineage_wu)

                    yield lineage_wu

                if lineage_properties_aspect:
                    aspects = dataset_snapshot.aspects
                    if aspects is None:
                        aspects = []

                    dataset_properties_aspect: Optional[
                        DatasetPropertiesClass] = None

                    for aspect in aspects:
                        if isinstance(aspect, DatasetPropertiesClass):
                            dataset_properties_aspect = aspect

                    if dataset_properties_aspect is None:
                        dataset_properties_aspect = DatasetPropertiesClass()
                        aspects.append(dataset_properties_aspect)

                    # Merge the lineage-derived properties into the existing
                    # custom properties (read-copy-update); the aspect is
                    # already in `aspects`, so no second append is needed.
                    custom_properties = (
                        {
                            **dataset_properties_aspect.customProperties,
                            **lineage_properties_aspect.customProperties,
                        }
                        if dataset_properties_aspect.customProperties
                        else lineage_properties_aspect.customProperties
                    )
                    dataset_properties_aspect.customProperties = custom_properties
                    dataset_snapshot.aspects = aspects

            # Emit the work unit from super.
            yield wu
Code example #5
    def _create_dataset_properties_aspect(
        self, node: DBTNode, additional_custom_props_filtered: Dict[str, str]
    ) -> DatasetPropertiesClass:
        description = None
        if self.config.disable_dbt_node_creation:
            if node.comment and node.description and node.comment != node.description:
                description = f"{self.config.target_platform} comment: {node.comment}\n\ndbt model description: {node.description}"
            elif node.comment:
                description = node.comment
            elif node.description:
                description = node.description
        else:
            description = node.description

        custom_props = {
            **get_custom_properties(node),
            **additional_custom_props_filtered,
        }
        dbt_properties = DatasetPropertiesClass(
            description=description,
            customProperties=custom_props,
            tags=node.tags,
            name=node.name,
        )
        return dbt_properties
Code example #6
File: sql_common.py Project: NathanFaught/datahub
    def get_workunits(self) -> Iterable[SqlWorkUnit]:
        sql_config = self.config
        if logger.isEnabledFor(logging.DEBUG):
            # If debug logging is enabled, we also want to echo each SQL query issued.
            sql_config.options["echo"] = True

        url = sql_config.get_sql_alchemy_url()
        logger.debug(f"sql_alchemy_url={url}")
        engine = create_engine(url, **sql_config.options)
        inspector = reflection.Inspector.from_engine(engine)
        for schema in inspector.get_schema_names():
            if not sql_config.schema_pattern.allowed(schema):
                self.report.report_dropped(schema)
                continue

            for table in inspector.get_table_names(schema):
                schema, table = sql_config.standardize_schema_table_names(schema, table)
                dataset_name = sql_config.get_identifier(schema, table)
                self.report.report_table_scanned(dataset_name)

                if not sql_config.table_pattern.allowed(dataset_name):
                    self.report.report_dropped(dataset_name)
                    continue

                columns = inspector.get_columns(table, schema)
                try:
                    table_info: dict = inspector.get_table_comment(table, schema)
                except NotImplementedError:
                    description: Optional[str] = None
                    properties: Dict[str, str] = {}
                else:
                    description = table_info["text"]

                    # The "properties" field is a non-standard addition to SQLAlchemy's interface.
                    properties = table_info.get("properties", {})

                # TODO: capture inspector.get_pk_constraint
                # TODO: capture inspector.get_sorted_table_and_fkc_names

                dataset_snapshot = DatasetSnapshot(
                    urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{dataset_name},{self.config.env})",
                    aspects=[],
                )
                if description is not None or properties:
                    dataset_properties = DatasetPropertiesClass(
                        description=description,
                        customProperties=properties,
                        # uri=dataset_name,
                    )
                    dataset_snapshot.aspects.append(dataset_properties)
                schema_metadata = get_schema_metadata(
                    self.report, dataset_name, self.platform, columns
                )
                dataset_snapshot.aspects.append(schema_metadata)

                mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
                wu = SqlWorkUnit(id=dataset_name, mce=mce)
                self.report.report_workunit(wu)
                yield wu
Code example #7
    def loop_tables(
        self,
        inspector: Inspector,
        schema: str,
        sql_config: SQLAlchemyConfig,
    ) -> Iterable[SqlWorkUnit]:
        for table in inspector.get_table_names(schema):
            schema, table = self.standardize_schema_table_names(schema=schema,
                                                                entity=table)
            dataset_name = self.get_identifier(schema=schema,
                                               entity=table,
                                               inspector=inspector)
            self.report.report_entity_scanned(dataset_name, ent_type="table")

            if not sql_config.table_pattern.allowed(dataset_name):
                self.report.report_dropped(dataset_name)
                continue

            columns = inspector.get_columns(table, schema)
            if len(columns) == 0:
                self.report.report_warning(dataset_name,
                                           "missing column information")

            try:
                # SQLALchemy stubs are incomplete and missing this method.
                # PR: https://github.com/dropbox/sqlalchemy-stubs/pull/223.
                table_info: dict = inspector.get_table_comment(
                    table, schema)  # type: ignore
            except NotImplementedError:
                description: Optional[str] = None
                properties: Dict[str, str] = {}
            else:
                description = table_info["text"]

                # The "properties" field is a non-standard addition to SQLAlchemy's interface.
                properties = table_info.get("properties", {})

            # TODO: capture inspector.get_pk_constraint
            # TODO: capture inspector.get_sorted_table_and_fkc_names

            dataset_snapshot = DatasetSnapshot(
                urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{dataset_name},{self.config.env})",
                aspects=[],
            )
            if description is not None or properties:
                dataset_properties = DatasetPropertiesClass(
                    description=description,
                    customProperties=properties,
                )
                dataset_snapshot.aspects.append(dataset_properties)
            schema_metadata = get_schema_metadata(self.report, dataset_name,
                                                  self.platform, columns)
            dataset_snapshot.aspects.append(schema_metadata)

            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            wu = SqlWorkUnit(id=dataset_name, mce=mce)
            self.report.report_workunit(wu)
            yield wu
Code example #8
def get_initial_mce() -> MetadataChangeEventClass:
    return MetadataChangeEventClass(
        proposedSnapshot=DatasetSnapshotClass(
            urn="urn:li:dataset:(urn:li:dataPlatform:test_platform,test,PROD)",
            aspects=[DatasetPropertiesClass(description="test.description")],
        ),
        systemMetadata=SystemMetadata(lastObserved=1586847600000,
                                      runId="pipeline_test"),
    )
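
An MCE like the one above is typically handed straight to an emitter. A minimal sketch, assuming a DataHub GMS reachable at localhost:8080:

from datahub.emitter.rest_emitter import DatahubRestEmitter

# Hypothetical endpoint; point this at your own GMS instance.
emitter = DatahubRestEmitter("http://localhost:8080")
emitter.emit_mce(get_initial_mce())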
Code example #9
    def transform_one(
            self, mce: MetadataChangeEventClass) -> MetadataChangeEventClass:
        # legacy transformers should not receive metadata change proposal events
        assert not isinstance(mce, MetadataChangeProposalWrapper)
        if isinstance(mce, MetadataChangeEventClass):
            assert isinstance(mce.proposedSnapshot, DatasetSnapshotClass)
            mce.proposedSnapshot.aspects.append(
                DatasetPropertiesClass(description="Old Transformer was here"))

        return mce
Code example #10
    def get_workunits(self) -> Iterable[SqlWorkUnit]:
        env: str = "PROD"
        sql_config = self.config
        platform = self.platform
        url = sql_config.get_sql_alchemy_url()
        logger.debug(f"sql_alchemy_url={url}")
        engine = create_engine(url, **sql_config.options)
        inspector = reflection.Inspector.from_engine(engine)
        for schema in inspector.get_schema_names():
            if not sql_config.schema_pattern.allowed(schema):
                self.report.report_dropped(schema)
                continue

            for table in inspector.get_table_names(schema):
                schema, table = sql_config.standardize_schema_table_names(
                    schema, table)
                dataset_name = sql_config.get_identifier(schema, table)
                self.report.report_table_scanned(dataset_name)

                if not sql_config.table_pattern.allowed(dataset_name):
                    self.report.report_dropped(dataset_name)
                    continue

                columns = inspector.get_columns(table, schema)
                try:
                    description: Optional[str] = inspector.get_table_comment(
                        table, schema)["text"]
                except NotImplementedError:
                    description = None

                # TODO: capture inspector.get_pk_constraint
                # TODO: capture inspector.get_sorted_table_and_fkc_names

                dataset_snapshot = DatasetSnapshot(
                    urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{env})",
                    aspects=[],
                )
                if description is not None:
                    dataset_properties = DatasetPropertiesClass(
                        description=description,
                        tags=[],
                        customProperties={},
                        # uri=dataset_name,
                    )
                    dataset_snapshot.aspects.append(dataset_properties)
                schema_metadata = get_schema_metadata(self.report,
                                                      dataset_name, platform,
                                                      columns)
                dataset_snapshot.aspects.append(schema_metadata)

                mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
                wu = SqlWorkUnit(id=dataset_name, mce=mce)
                self.report.report_workunit(wu)
                yield wu
Code example #11
def create_metadata_work_unit(timestamp):
    dataset_snapshot = DatasetSnapshot(
        urn="urn:li:dataset:(urn:li:dataPlatform:glue,datalake_grilled.Barbeque,PROD)",
        aspects=[],
    )

    dataset_snapshot.aspects.append(Status(removed=False))

    dataset_snapshot.aspects.append(
        OwnershipClass(
            owners=[
                OwnerClass(
                    owner="urn:li:corpuser:Susan", type=OwnershipTypeClass.DATAOWNER
                )
            ],
            lastModified=AuditStampClass(
                time=timestamp, actor="urn:li:corpuser:datahub"
            ),
        )
    )

    dataset_snapshot.aspects.append(
        DatasetPropertiesClass(
            description="Grilled Food",
            customProperties={},
            uri=None,
            tags=[],
        )
    )

    fields = [
        SchemaField(
            fieldPath="Size",
            nativeDataType="int",
            type=SchemaFieldDataType(type=NumberTypeClass()),
            description="Maximum attendees permitted",
            nullable=True,
            recursive=False,
        )
    ]

    schema_metadata = SchemaMetadata(
        schemaName="datalake_grilled.Barbeque",
        version=0,
        fields=fields,
        platform="urn:li:dataPlatform:glue",
        created=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        lastModified=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
    )
    dataset_snapshot.aspects.append(schema_metadata)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return MetadataWorkUnit(id="glue-datalake_grilled.Barbeque", mce=mce)
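
Note that the timestamp flows into AuditStamp/AuditStampClass, which take epoch milliseconds (compare lastObserved=1586847600000 in example #8). A sketch of calling the helper with the current time:

import time

wu = create_metadata_work_unit(int(time.time() * 1000))  # epoch millis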
Code example #12
File: powerbi.py Project: hsheth2/datahub
    def __to_datahub_dataset(
        self, dataset: Optional[PowerBiAPI.Dataset]
    ) -> List[MetadataChangeProposalWrapper]:
        """
        Map PowerBi dataset to datahub dataset. Here we are mapping each table of PowerBi Dataset to Datahub dataset.
        In PowerBi Tile would be having single dataset, However corresponding Datahub's chart might have many input sources.
        """

        dataset_mcps: List[MetadataChangeProposalWrapper] = []
        if dataset is None:
            return dataset_mcps

        # We only support relational PowerBI data sources
        if (dataset.datasource is None
                or dataset.datasource.metadata.is_relational is False):
            LOGGER.warning(
                "Dataset {}({}) is not created from relational datasource".
                format(dataset.name, dataset.id))
            return dataset_mcps

        LOGGER.info("Converting dataset={}(id={}) to datahub dataset".format(
            dataset.name, dataset.id))

        for table in dataset.tables:
            # Create an URN for dataset
            ds_urn = builder.make_dataset_urn(
                platform=self.__config.dataset_type_mapping[
                    dataset.datasource.type],
                name="{}.{}.{}".format(dataset.datasource.database,
                                       table.schema_name, table.name),
                env=self.__config.env,
            )
            LOGGER.info("{}={}".format(Constant.Dataset_URN, ds_urn))
            # Create datasetProperties mcp
            ds_properties = DatasetPropertiesClass(description=table.name)

            info_mcp = self.new_mcp(
                entity_type=Constant.DATASET,
                entity_urn=ds_urn,
                aspect_name=Constant.DATASET_PROPERTIES,
                aspect=ds_properties,
            )

            # Status mcp marking the dataset as present (removed=False)
            status_mcp = self.new_mcp(
                entity_type=Constant.DATASET,
                entity_urn=ds_urn,
                aspect_name=Constant.STATUS,
                aspect=StatusClass(removed=False),
            )

            dataset_mcps.extend([info_mcp, status_mcp])

        return dataset_mcps
Code example #13
    def loop_views(
        self,
        inspector: Any,
        schema: str,
        sql_config: SQLAlchemyConfig,
    ) -> Iterable[SqlWorkUnit]:
        for view in inspector.get_view_names(schema):
            schema, view = sql_config.standardize_schema_table_names(
                schema, view)
            dataset_name = sql_config.get_identifier(schema, view)
            self.report.report_entity_scanned(dataset_name, ent_type="view")

            if not sql_config.view_pattern.allowed(dataset_name):
                self.report.report_dropped(dataset_name)
                continue

            columns = inspector.get_columns(view, schema)
            try:
                view_info: dict = inspector.get_table_comment(view, schema)
            except NotImplementedError:
                description: Optional[str] = None
                properties: Dict[str, str] = {}
            else:
                description = view_info["text"]

                # The "properties" field is a non-standard addition to SQLAlchemy's interface.
                properties = view_info.get("properties", {})

            view_definition = inspector.get_view_definition(view)
            if view_definition is None:
                view_definition = ""
            properties["view_definition"] = view_definition
            properties["is_view"] = "True"

            dataset_snapshot = DatasetSnapshot(
                urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{dataset_name},{self.config.env})",
                aspects=[],
            )
            if description is not None or properties:
                dataset_properties = DatasetPropertiesClass(
                    description=description,
                    customProperties=properties,
                    # uri=dataset_name,
                )
                dataset_snapshot.aspects.append(dataset_properties)
            schema_metadata = get_schema_metadata(self.report, dataset_name,
                                                  self.platform, columns)
            dataset_snapshot.aspects.append(schema_metadata)

            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            wu = SqlWorkUnit(id=dataset_name, mce=mce)
            self.report.report_workunit(wu)
            yield wu
Code example #14
File: lookml.py Project: swaroopjagadish/datahub
    def _get_custom_properties(
            self, looker_view: LookerView) -> DatasetPropertiesClass:
        file_path = str(
            pathlib.Path(looker_view.absolute_file_path).resolve()).replace(
                str(self.source_config.base_folder.resolve()), "")

        custom_properties = {
            "looker.file.content": looker_view.raw_file_content[
                0:512000],  # grab a limited slice of characters from the file
            "looker.file.path": file_path,
        }
        dataset_props = DatasetPropertiesClass(
            customProperties=custom_properties)

        if self.source_config.github_info is not None:
            github_file_url = self.source_config.github_info.get_url_for_file_path(
                file_path)
            dataset_props.externalUrl = github_file_url

        return dataset_props
Code example #15
    def _get_data_stream_index_count_mcps(
        self,
    ) -> Iterable[MetadataChangeProposalWrapper]:
        for data_stream, count in self.data_stream_partition_count.items():
            dataset_urn: str = make_dataset_urn(
                self.platform, data_stream, self.source_config.env
            )
            yield MetadataChangeProposalWrapper(
                entityType="dataset",
                entityUrn=dataset_urn,
                aspectName="datasetProperties",
                aspect=DatasetPropertiesClass(
                    customProperties={"numPartitions": str(count)}
                ),
                changeType=ChangeTypeClass.UPSERT,
            )
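
Since this helper yields MetadataChangeProposalWrapper objects rather than work units, the caller still has to wrap and report them. A sketch of a hypothetical caller inside the same source class, mirroring the id scheme used elsewhere in this section:

        for mcp in self._get_data_stream_index_count_mcps():
            wu = MetadataWorkUnit(id=f"{mcp.entityUrn}-{mcp.aspectName}", mcp=mcp)
            self.report.report_workunit(wu)
            yield wu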
Code example #16
def get_dataset_properties() -> DatasetPropertiesClass:
    # `table` is the Glue table dict from the enclosing scope.
    return DatasetPropertiesClass(
        description=table.get("Description"),
        customProperties={
            **table.get("Parameters", {}),
            **{
                k: str(v)
                for k, v in table["StorageDescriptor"].items()
                if k not in ["Columns", "Parameters"]
            },
        },
        uri=table.get("Location"),
        tags=[],
    )
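
For concreteness, here is a hypothetical Glue table dict (not taken from a real catalog) and the customProperties the merge above would produce:

# Hypothetical input, shaped like glue_client.get_table(...)["Table"]
table = {
    "Description": "Grilled food inventory",
    "Parameters": {"classification": "parquet"},
    "StorageDescriptor": {
        "Columns": [],  # dropped by the key filter
        "Location": "s3://datalake/grilled/",
        "Compressed": False,
    },
    "Location": "s3://datalake/grilled/",
}
# get_dataset_properties().customProperties would then be:
# {"classification": "parquet", "Location": "s3://datalake/grilled/", "Compressed": "False"}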
Code example #17
    def _to_mce(  # noqa: C901
        self,
        config: LookerCommonConfig,
        reporter: SourceReport,
    ) -> Optional[MetadataChangeEvent]:
        # We only generate MCE-s for explores that contain from clauses and do NOT contain joins
        # All other explores (passthrough explores and joins) end in correct resolution of lineage, and don't need additional nodes in the graph.

        dataset_snapshot = DatasetSnapshot(
            urn=self.get_explore_urn(config),
            aspects=[],  # we append to this list later on
        )
        browse_paths = BrowsePathsClass(paths=[self.get_explore_browse_path(config)])
        dataset_snapshot.aspects.append(browse_paths)
        dataset_snapshot.aspects.append(StatusClass(removed=False))

        custom_properties = {"looker.type": "explore"}
        if self.label is not None:
            custom_properties["looker.explore.label"] = str(self.label)
        dataset_props = DatasetPropertiesClass(
            description=self.description,
            customProperties=custom_properties,
        )
        dataset_snapshot.aspects.append(dataset_props)
        if self.upstream_views is not None:
            assert self.project_name is not None
            upstreams = [
                UpstreamClass(
                    dataset=LookerViewId(
                        project_name=self.project_name,
                        model_name=self.model_name,
                        view_name=view_name,
                    ).get_urn(config),
                    type=DatasetLineageTypeClass.VIEW,
                )
                for view_name in self.upstream_views
            ]
            upstream_lineage = UpstreamLineage(upstreams=upstreams)
            dataset_snapshot.aspects.append(upstream_lineage)
        if self.fields is not None:
            schema_metadata = LookerUtil._get_schema(
                platform_name=config.platform_name,
                schema_name=self.name,
                view_fields=self.fields,
                reporter=reporter,
            )
            dataset_snapshot.aspects.append(schema_metadata)

        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        return mce
Code example #18
    def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventClass:
        if not isinstance(mce.proposedSnapshot, DatasetSnapshotClass):
            return mce

        properties_to_add = self.config.add_properties_resolver_class(  # type: ignore
            **self.resolver_args
        ).get_properties_to_add(mce.proposedSnapshot)
        if properties_to_add:
            properties = builder.get_or_add_aspect(
                mce, DatasetPropertiesClass(customProperties={})
            )
            properties.customProperties.update(properties_to_add)

        return mce
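
The resolver class plugged in via add_properties_resolver_class only needs a get_properties_to_add(snapshot) method returning a dict. A minimal hypothetical resolver, just to show the shape of the contract:

from typing import Dict

from datahub.metadata.schema_classes import DatasetSnapshotClass

class TeamPropertiesResolver:
    """Hypothetical resolver: stamp every dataset with a fixed team name."""

    def __init__(self, **resolver_args: str) -> None:
        self.resolver_args = resolver_args

    def get_properties_to_add(
        self, snapshot: DatasetSnapshotClass
    ) -> Dict[str, str]:
        # Anything derivable from the snapshot can go here; we just
        # record the urn plus a static team property.
        return {"team": "data-platform", "source_urn": snapshot.urn}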
Code example #19
File: __init__.py Project: arunvasudevan/datahub
    def ingest_table(self,
                     table_data: TableData) -> Iterable[MetadataWorkUnit]:

        logger.info(
            f"Extracting table schema from file: {table_data.full_path}")
        browse_path: str = (strip_s3_prefix(table_data.table_path)
                            if table_data.is_s3 else
                            table_data.table_path.strip("/"))

        data_platform_urn = make_data_platform_urn(self.source_config.platform)
        logger.info(f"Creating dataset urn with name: {browse_path}")
        dataset_urn = make_dataset_urn_with_platform_instance(
            self.source_config.platform,
            browse_path,
            self.source_config.platform_instance,
            self.source_config.env,
        )

        dataset_snapshot = DatasetSnapshot(
            urn=dataset_urn,
            aspects=[],
        )

        dataset_properties = DatasetPropertiesClass(
            description="",
            name=table_data.disaply_name,
            customProperties={},
        )
        dataset_snapshot.aspects.append(dataset_properties)

        fields = self.get_fields(table_data)
        schema_metadata = SchemaMetadata(
            schemaName=table_data.disaply_name,
            platform=data_platform_urn,
            version=0,
            hash="",
            fields=fields,
            platformSchema=OtherSchemaClass(rawSchema=""),
        )
        dataset_snapshot.aspects.append(schema_metadata)

        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = MetadataWorkUnit(id=table_data.table_path, mce=mce)
        self.report.report_workunit(wu)
        yield wu

        yield from self.create_container_hierarchy(table_data, dataset_urn)

        if self.source_config.profiling.enabled:
            yield from self.get_table_profile(table_data, dataset_urn)
Code example #20
def get_initial_mce() -> MetadataChangeEventClass:
    return MetadataChangeEventClass(
        proposedSnapshot=DatasetSnapshotClass(
            urn="urn:li:dataset:(urn:li:dataPlatform:test_platform,test,PROD)",
            aspects=[
                DatasetPropertiesClass(
                    description="test.description",
                    customProperties={},
                    uri=None,
                    tags=[],
                )
            ],
        )
    )
Code example #21
def test_old_transformers_working_as_before(mock_time):

    dataset_mce = make_generic_dataset()
    dataset_mcp = make_generic_dataset_mcp()
    transformer = OldMCETransformer.create(
        {},
        PipelineContext(run_id="test-old-transformer"),
    )

    outputs = list(
        transformer.transform([
            RecordEnvelope(input, metadata={})
            for input in [dataset_mce, dataset_mcp,
                          EndOfStream()]
        ]))

    assert len(outputs) == 3  # MCP will come back untouched

    assert outputs[0].record == dataset_mce
    # Check that the dataset properties aspect was added.
    props_aspect = builder.get_aspect_if_available(outputs[0].record,
                                                   DatasetPropertiesClass)
    assert props_aspect
    assert props_aspect.description == "Old Transformer was here"

    assert outputs[1].record == dataset_mcp

    assert isinstance(outputs[-1].record, EndOfStream)

    # MCP only stream
    dataset_mcps = [
        make_generic_dataset_mcp(),
        make_generic_dataset_mcp(aspect=DatasetPropertiesClass(
            description="Another test MCP")),
        EndOfStream(),
    ]
    transformer = OldMCETransformer.create(
        {},
        PipelineContext(run_id="test-old-transformer"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(input, metadata={}) for input in dataset_mcps]))

    assert len(outputs) == 3  # MCP-s will come back untouched

    assert outputs[0].record == dataset_mcps[0]
    assert outputs[1].record == dataset_mcps[1]
    assert isinstance(outputs[-1].record, EndOfStream)
Code example #22
    def _get_custom_properties(
            self, looker_view: LookerView) -> DatasetPropertiesClass:
        custom_properties = {
            # grab a limited slice of characters from the file
            "looker.file.content": looker_view.raw_file_content[0:512000],
            "looker.file.path": str(
                pathlib.Path(looker_view.absolute_file_path).resolve()
            ).replace(str(self.source_config.base_folder.resolve()), ""),
        }
        dataset_props = DatasetPropertiesClass(
            customProperties=custom_properties)
        return dataset_props
Code example #23
File: nifi.py Project: swaroopjagadish/datahub
    def construct_dataset_workunits(
        self,
        dataset_platform: str,
        dataset_name: str,
        dataset_urn: Optional[str] = None,
        external_url: Optional[str] = None,
        datasetProperties: Optional[Dict[str, str]] = None,
    ) -> Iterable[MetadataWorkUnit]:

        if not dataset_urn:
            dataset_urn = builder.make_dataset_urn(
                dataset_platform, dataset_name, self.config.env
            )

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            changeType=ChangeTypeClass.UPSERT,
            aspectName="dataPlatformInstance",
            aspect=DataPlatformInstanceClass(
                platform=builder.make_data_platform_urn(dataset_platform)
            ),
        )
        platform = (
            dataset_platform[dataset_platform.rindex(":") + 1 :]
            if dataset_platform.startswith("urn:")
            else dataset_platform
        )
        wu = MetadataWorkUnit(id=f"{platform}.{dataset_name}.{mcp.aspectName}", mcp=mcp)
        if wu.id not in self.report.workunit_ids:
            self.report.report_workunit(wu)
            yield wu

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            changeType=ChangeTypeClass.UPSERT,
            aspectName="datasetProperties",
            aspect=DatasetPropertiesClass(
                externalUrl=external_url, customProperties=datasetProperties
            ),
        )

        wu = MetadataWorkUnit(id=f"{platform}.{dataset_name}.{mcp.aspectName}", mcp=mcp)
        if wu.id not in self.report.workunit_ids:
            self.report.report_workunit(wu)
            yield wu
Code example #24
File: clickhouse.py Project: hsheth2/datahub
    def get_lineage_mcp(
        self, dataset_urn: str
    ) -> Tuple[Optional[MetadataChangeProposalWrapper],
               Optional[DatasetPropertiesClass]]:
        dataset_key = mce_builder.dataset_urn_to_key(dataset_urn)
        if dataset_key is None:
            return None, None

        if not self._lineage_map:
            self._populate_lineage()
        assert self._lineage_map is not None

        upstream_lineage: List[UpstreamClass] = []
        custom_properties: Dict[str, str] = {}

        if dataset_key.name in self._lineage_map:
            item = self._lineage_map[dataset_key.name]
            for upstream in item.upstreams:
                upstream_table = UpstreamClass(
                    dataset=builder.make_dataset_urn_with_platform_instance(
                        upstream.platform.value,
                        upstream.path,
                        self.config.platform_instance,
                        self.config.env,
                    ),
                    type=item.dataset_lineage_type,
                )
                upstream_lineage.append(upstream_table)

        properties = None
        if custom_properties:
            properties = DatasetPropertiesClass(
                customProperties=custom_properties)

        if not upstream_lineage:
            return None, properties

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="upstreamLineage",
            aspect=UpstreamLineage(upstreams=upstream_lineage),
        )

        return mcp, properties
Code example #25
File: openapi.py Project: swaroopjagadish/datahub
    def init_dataset(
        self, endpoint_k: str, endpoint_dets: dict
    ) -> Tuple[DatasetSnapshot, str]:
        config = self.config

        dataset_name = endpoint_k[1:].replace("/", ".")

        if len(dataset_name) > 0:
            if dataset_name[-1] == ".":
                dataset_name = dataset_name[:-1]
        else:
            dataset_name = "root"

        dataset_snapshot = DatasetSnapshot(
            urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{config.name}.{dataset_name},PROD)",
            aspects=[],
        )

        # adding description
        dataset_properties = DatasetPropertiesClass(
            description=endpoint_dets["description"], customProperties={}
        )
        dataset_snapshot.aspects.append(dataset_properties)

        # adding tags
        tags_str = [make_tag_urn(t) for t in endpoint_dets["tags"]]
        tags_tac = [TagAssociationClass(t) for t in tags_str]
        gtc = GlobalTagsClass(tags_tac)
        dataset_snapshot.aspects.append(gtc)

        # the link will appear in the "documentation"
        link_url = clean_url(config.url + self.url_basepath + endpoint_k)
        link_description = "Link to call for the dataset."
        creation = AuditStampClass(
            time=int(time.time() * 1000),  # audit stamp times are epoch milliseconds
            actor="urn:li:corpuser:etl",
            impersonator=None,
        )
        link_metadata = InstitutionalMemoryMetadataClass(
            url=link_url, description=link_description, createStamp=creation
        )
        inst_memory = InstitutionalMemoryClass([link_metadata])
        dataset_snapshot.aspects.append(inst_memory)

        return dataset_snapshot, dataset_name
Code example #26
def test_suppression_works():
    dataset_mce = make_generic_dataset()
    dataset_mcp = make_generic_dataset_mcp(
        aspect_name="datasetProperties",
        aspect=DatasetPropertiesClass(description="suppressable description"),
    )
    transformer = SuppressingTransformer.create(
        {},
        PipelineContext(run_id="test-suppress-transformer"),
    )

    outputs = list(
        transformer.transform([
            RecordEnvelope(input, metadata={})
            for input in [dataset_mce, dataset_mcp,
                          EndOfStream()]
        ]))

    assert len(outputs) == 2  # MCP will be dropped
Code example #27
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        env = "PROD"
        platform = "mongodb"

        database_names: List[str] = self.mongo_client.list_database_names()
        for database_name in database_names:
            if database_name in DENY_DATABASE_LIST:
                continue
            if not self.config.database_pattern.allowed(database_name):
                self.report.report_dropped(database_name)
                continue

            database = self.mongo_client[database_name]
            collection_names: List[str] = database.list_collection_names()
            for collection_name in collection_names:
                dataset_name = f"{database_name}.{collection_name}"
                if not self.config.collection_pattern.allowed(dataset_name):
                    self.report.report_dropped(dataset_name)
                    continue

                dataset_snapshot = DatasetSnapshot(
                    urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{env})",
                    aspects=[],
                )

                dataset_properties = DatasetPropertiesClass(
                    tags=[],
                    customProperties={},
                )
                dataset_snapshot.aspects.append(dataset_properties)

                # TODO: Guess the schema via sampling
                # State of the art seems to be https://github.com/variety/variety.

                # TODO: use list_indexes() or index_information() to get index information
                # See https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes.

                mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
                wu = MetadataWorkUnit(id=dataset_name, mce=mce)
                self.report.report_workunit(wu)
                yield wu
Code example #28
File: dbt.py Project: emailstonl/datahub
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        platform = self.platform
        nodes = loadManifestAndCatalog(
            self.config.manifest_path,
            self.config.catalog_path,
            self.config.sources_path,
            self.config.load_schemas,
            self.config.target_platform,
            self.config.env,
            self.config.node_type_pattern,
        )

        for node in nodes:

            dataset_snapshot = DatasetSnapshot(
                urn=node.datahub_urn,
                aspects=[],
            )
            custom_properties = get_custom_properties(node)

            dbt_properties = DatasetPropertiesClass(
                description=node.dbt_name,
                customProperties=custom_properties,
                tags=[],
            )
            dataset_snapshot.aspects.append(dbt_properties)

            upstreams = get_upstream_lineage(node.upstream_urns)
            if upstreams is not None:
                dataset_snapshot.aspects.append(upstreams)

            if self.config.load_schemas:
                schema_metadata = get_schema_metadata(self.report, node,
                                                      platform)
                dataset_snapshot.aspects.append(schema_metadata)

            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            wu = MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
            self.report.report_workunit(wu)

            yield wu
Code example #29
    def loop_views(
        self,
        inspector: Inspector,
        schema: str,
        sql_config: SQLAlchemyConfig,
    ) -> Iterable[SqlWorkUnit]:
        for view in inspector.get_view_names(schema):
            schema, view = sql_config.standardize_schema_table_names(
                schema, view)
            dataset_name = sql_config.get_identifier(schema, view)
            self.report.report_entity_scanned(dataset_name, ent_type="view")

            if not sql_config.view_pattern.allowed(dataset_name):
                self.report.report_dropped(dataset_name)
                continue

            try:
                columns = inspector.get_columns(view, schema)
            except KeyError:
                # For certain types of views, we are unable to fetch the list of columns.
                self.report.report_warning(
                    dataset_name, "unable to get schema for this view")
                schema_metadata = None
            else:
                schema_metadata = get_schema_metadata(self.report,
                                                      dataset_name,
                                                      self.platform, columns)

            try:
                # SQLALchemy stubs are incomplete and missing this method.
                # PR: https://github.com/dropbox/sqlalchemy-stubs/pull/223.
                view_info: dict = inspector.get_table_comment(
                    view, schema)  # type: ignore
            except NotImplementedError:
                description: Optional[str] = None
                properties: Dict[str, str] = {}
            else:
                description = view_info["text"]

                # The "properties" field is a non-standard addition to SQLAlchemy's interface.
                properties = view_info.get("properties", {})

            try:
                view_definition = inspector.get_view_definition(view, schema)
                if view_definition is None:
                    view_definition = ""
                else:
                    # Some dialects return a TextClause instead of a raw string,
                    # so we need to convert them to a string.
                    view_definition = str(view_definition)
            except NotImplementedError:
                view_definition = ""
            properties["view_definition"] = view_definition
            properties["is_view"] = "True"

            dataset_snapshot = DatasetSnapshot(
                urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{dataset_name},{self.config.env})",
                aspects=[],
            )
            if description is not None or properties:
                dataset_properties = DatasetPropertiesClass(
                    description=description,
                    customProperties=properties,
                    # uri=dataset_name,
                )
                dataset_snapshot.aspects.append(dataset_properties)

            if schema_metadata:
                dataset_snapshot.aspects.append(schema_metadata)

            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            wu = SqlWorkUnit(id=dataset_name, mce=mce)
            self.report.report_workunit(wu)
            yield wu
Code example #30
    def _to_metadata_events(  # noqa: C901
        self, config: LookerCommonConfig, reporter: SourceReport,
        base_url: str) -> Optional[List[Union[MetadataChangeEvent,
                                              MetadataChangeProposalWrapper]]]:
        # We only generate MCE-s for explores that contain from clauses and do NOT contain joins
        # All other explores (passthrough explores and joins) end in correct resolution of lineage, and don't need additional nodes in the graph.

        dataset_snapshot = DatasetSnapshot(
            urn=self.get_explore_urn(config),
            aspects=[],  # we append to this list later on
        )
        browse_paths = BrowsePathsClass(
            paths=[self.get_explore_browse_path(config)])
        dataset_snapshot.aspects.append(browse_paths)
        dataset_snapshot.aspects.append(StatusClass(removed=False))

        custom_properties = {}
        if self.label is not None:
            custom_properties["looker.explore.label"] = str(self.label)
        if self.source_file is not None:
            custom_properties["looker.explore.file"] = str(self.source_file)
        dataset_props = DatasetPropertiesClass(
            description=self.description,
            customProperties=custom_properties,
        )
        dataset_props.externalUrl = self._get_url(base_url)

        dataset_snapshot.aspects.append(dataset_props)
        if self.upstream_views is not None:
            assert self.project_name is not None
            upstreams = [
                UpstreamClass(
                    dataset=LookerViewId(
                        project_name=self.project_name,
                        model_name=self.model_name,
                        view_name=view_name,
                    ).get_urn(config),
                    type=DatasetLineageTypeClass.VIEW,
                ) for view_name in sorted(self.upstream_views)
            ]
            upstream_lineage = UpstreamLineage(upstreams=upstreams)
            dataset_snapshot.aspects.append(upstream_lineage)
        if self.fields is not None:
            schema_metadata = LookerUtil._get_schema(
                platform_name=config.platform_name,
                schema_name=self.name,
                view_fields=self.fields,
                reporter=reporter,
            )
            if schema_metadata is not None:
                dataset_snapshot.aspects.append(schema_metadata)

        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_snapshot.urn,
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=["explore"]),
        )

        return [mce, mcp]