def test_upstream_table_generation_with_temporary_table_with_multiple_temp_upstream():
    """Check that upstreams reached through chained intermediate tables are found.

    Lineage under test: ``a <- b``, ``b <- {c, d}``, ``d <- e``. Tables ``b``
    and ``d`` live in datasets whose names start with an underscore
    (presumably what ``is_temporary_table`` keys on -- confirm against
    ``BigQueryTableRef``), so the expected upstreams of ``a`` are ``c`` and
    ``e``.
    """
    from datahub.ingestion.api.common import PipelineContext
    from datahub.ingestion.source.sql.bigquery import BigQueryConfig, BigQuerySource
    from datahub.ingestion.source.usage.bigquery_usage import BigQueryTableRef

    a: BigQueryTableRef = BigQueryTableRef(
        project="test-project", dataset="test-dataset", table="a"
    )
    b: BigQueryTableRef = BigQueryTableRef(
        project="test-project", dataset="_temp-dataset", table="b"
    )
    c: BigQueryTableRef = BigQueryTableRef(
        project="test-project", dataset="test-dataset", table="c"
    )
    d: BigQueryTableRef = BigQueryTableRef(
        project="test-project", dataset="_test-dataset", table="d"
    )
    e: BigQueryTableRef = BigQueryTableRef(
        project="test-project", dataset="test-dataset", table="e"
    )

    config = BigQueryConfig.parse_obj(
        {
            "project_id": "test-project",
        }
    )
    source = BigQuerySource(config=config, ctx=PipelineContext(run_id="test"))
    source.lineage_metadata = {
        str(a): {str(b)},
        str(b): {str(c), str(d)},
        str(d): {str(e)},
    }
    upstreams = source.get_upstream_tables(str(a), [])
    # list.sort() sorts in place and returns None, so comparing the results of
    # two .sort() calls is always `None == None`. Compare as sets instead so
    # the assertion actually checks the content, independent of ordering.
    assert set(upstreams) == {c, e}
Exemple #2
0
def test_simple_upstream_table_generation():
    """A table with one recorded upstream resolves to exactly that upstream."""
    project = "test-project"
    dataset = "test-dataset"

    a: BigQueryTableRef = BigQueryTableRef(
        project=project, dataset=dataset, table="a"
    )
    b: BigQueryTableRef = BigQueryTableRef(
        project=project, dataset=dataset, table="b"
    )

    source = BigQuerySource(
        config=BigQueryConfig.parse_obj({"project_id": "test-project"}),
        ctx=PipelineContext(run_id="test"),
    )
    # Lineage maps the string form of a table to the string forms of its
    # direct upstreams.
    source.lineage_metadata = {str(a): {str(b)}}

    resolved = source.get_upstream_tables(str(a), [])
    assert list(resolved) == [b]
Exemple #3
0
    def normalise_dataset_name(self, dataset_name: str) -> str:
        """Return ``dataset_name`` with its table part run through ``remove_extras``.

        ``dataset_name`` must consist of exactly three dot-separated parts
        (project, schema, table); any other shape raises ``ValueError`` from
        the tuple unpacking below.
        """
        project_id, schema, table = dataset_name.split(".")

        spec = {
            "projectId": project_id,
            "datasetId": schema,
            "tableId": table,
        }
        cleaned_ref = BigQueryTableRef.from_spec_obj(spec).remove_extras()
        return f"{project_id}.{schema}.{cleaned_ref.table}"
Exemple #4
0
    def get_lineage_mcp(
            self, dataset_urn: str) -> Optional[MetadataChangeProposalWrapper]:
        """Build the ``upstreamLineage`` change proposal for ``dataset_urn``.

        Returns ``None`` when no lineage metadata is loaded, when the URN
        cannot be converted to a dataset key, when the table has no entry in
        ``self.lineage_metadata``, or when that entry yields no upstreams.
        """
        if self.lineage_metadata is None:
            return None

        dataset_key: Optional[DatasetKey] = mce_builder.dataset_urn_to_key(
            dataset_urn)
        if dataset_key is None:
            return None

        project_id, dataset_name, tablename = dataset_key.name.split(".")
        lineage_key = str(BigQueryTableRef(project_id, dataset_name, tablename))
        if lineage_key not in self.lineage_metadata:
            return None

        upstream_list: List[UpstreamClass] = []
        # Iterate in sorted order so that identical lineage always produces an
        # identical aspect; otherwise a different ordering could create
        # multiple aspects in the backend.
        for ref_table in sorted(self.lineage_metadata[lineage_key]):
            upstream_table = BigQueryTableRef.from_string_name(ref_table)
            upstream_name = (
                f"{upstream_table.project}"
                f".{upstream_table.dataset}"
                f".{upstream_table.table}"
            )
            upstream_urn = mce_builder.make_dataset_urn(
                self.platform,
                upstream_name,
                self.config.env,
            )
            upstream_list.append(
                UpstreamClass(upstream_urn,
                              DatasetLineageTypeClass.TRANSFORMED))

        if not upstream_list:
            return None

        return MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="upstreamLineage",
            aspect=UpstreamLineageClass(upstreams=upstream_list),
        )
def test_simple_upstream_table_generation():
    """A table with one recorded upstream resolves to exactly that upstream."""
    from datahub.ingestion.api.common import PipelineContext
    from datahub.ingestion.source.sql.bigquery import BigQueryConfig, BigQuerySource
    from datahub.ingestion.source.usage.bigquery_usage import BigQueryTableRef

    downstream: BigQueryTableRef = BigQueryTableRef(
        project="test-project", dataset="test-dataset", table="a"
    )
    upstream: BigQueryTableRef = BigQueryTableRef(
        project="test-project", dataset="test-dataset", table="b"
    )

    ctx = PipelineContext(run_id="test")
    config = BigQueryConfig.parse_obj({"project_id": "test-project"})
    source = BigQuerySource(config=config, ctx=ctx)

    # Lineage maps the string form of a table to the string forms of its
    # direct upstreams.
    source.lineage_metadata = {str(downstream): {str(upstream)}}

    found = list(source.get_upstream_tables(str(downstream), []))
    assert found == [upstream]
Exemple #6
0
    def get_lineage_mcp(
            self, dataset_urn: str) -> Optional[MetadataChangeProposalWrapper]:
        """Build the ``upstreamLineage`` change proposal for ``dataset_urn``.

        Upstreams are resolved through ``self.get_upstream_tables``. When
        ``self.config.upstream_lineage_in_report`` is set, every resolved
        upstream is also recorded in ``self.report.upstream_lineage`` under
        the string form of the table.

        Returns ``None`` when no lineage metadata is loaded, when the URN
        cannot be converted to a dataset key, when the table has no entry in
        ``self.lineage_metadata``, or when no upstreams are resolved.
        """
        if self.lineage_metadata is None:
            logger.debug("No lineage metadata so skipping getting mcp")
            return None

        dataset_key: Optional[DatasetKey] = mce_builder.dataset_urn_to_key(
            dataset_urn)
        if dataset_key is None:
            logger.debug(
                f"No dataset_key for {dataset_urn} so skipping getting mcp")
            return None

        project_id, dataset_name, tablename = dataset_key.name.split(".")
        lineage_key = str(BigQueryTableRef(project_id, dataset_name, tablename))
        if lineage_key not in self.lineage_metadata:
            return None

        upstream_list: List[UpstreamClass] = []
        # Iterate in sorted order so that identical lineage always produces an
        # identical aspect; otherwise a different ordering could create
        # multiple aspects in the backend.
        resolved = self.get_upstream_tables(lineage_key, tables_seen=[])
        for upstream_table in sorted(resolved):
            upstream_name = (
                f"{upstream_table.project}"
                f".{upstream_table.dataset}"
                f".{upstream_table.table}"
            )
            upstream_entry = UpstreamClass(
                mce_builder.make_dataset_urn_with_platform_instance(
                    self.platform,
                    upstream_name,
                    self.config.platform_instance,
                    self.config.env,
                ),
                DatasetLineageTypeClass.TRANSFORMED,
            )
            if self.config.upstream_lineage_in_report:
                # Read-modify-write so the report entry is (re)assigned even
                # when the key did not exist yet.
                recorded: Set = self.report.upstream_lineage.get(
                    lineage_key, set())
                recorded.add(str(upstream_table))
                self.report.upstream_lineage[lineage_key] = recorded
            upstream_list.append(upstream_entry)

        if not upstream_list:
            return None

        return MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="upstreamLineage",
            aspect=UpstreamLineageClass(upstreams=upstream_list),
        )
Exemple #7
0
 def get_identifier(
     self,
     *,
     schema: str,
     entity: str,
     inspector: Inspector,
     **kwargs: Any,
 ) -> str:
     """Return the ``project.schema.table`` identifier for ``entity``.

     The project id is obtained from ``inspector`` via
     ``self._get_project_id``; the table part is the ``table`` attribute of
     the ``BigQueryTableRef`` built from the project, schema and entity.
     """
     assert inspector
     project_id = self._get_project_id(inspector)
     spec = {
         "projectId": project_id,
         "datasetId": schema,
         "tableId": entity,
     }
     table_ref = BigQueryTableRef.from_spec_obj(spec)
     return f"{project_id}.{schema}.{table_ref.table}"
Exemple #8
0
 def get_upstream_tables(
         self,
         bq_table: str,
         tables_seen: Optional[List[str]] = None) -> Set[BigQueryTableRef]:
     """Collect the non-temporary upstream tables of ``bq_table``.

     Each direct upstream listed in ``self.lineage_metadata`` is either added
     to the result (when it is not a temporary table) or, when it is
     temporary, replaced by its own upstreams through a recursive call.

     Args:
         bq_table: Key of the table in ``self.lineage_metadata``. A missing
             key raises ``KeyError``.
         tables_seen: Temporary tables already visited. The list is extended
             in place and passed on to recursive calls so a table is never
             expanded twice. Defaults to a fresh empty list per call.

     Returns:
         The set of resolved upstream table references.
     """
     # A literal `[]` default would be created once and shared by every call
     # that omits the argument, so tables recorded by one call would be
     # skipped by all later ones. Create the list per call instead.
     if tables_seen is None:
         tables_seen = []
     upstreams: Set[BigQueryTableRef] = set()
     assert self.lineage_metadata
     for ref_table in self.lineage_metadata[str(bq_table)]:
         upstream_table = BigQueryTableRef.from_string_name(ref_table)
         if upstream_table.is_temporary_table():
             # making sure we don't process a table twice and not get into a recursive loop
             if ref_table in tables_seen:
                 logger.debug(
                     f"Skipping table {ref_table} because it was seen already"
                 )
                 continue
             tables_seen.append(ref_table)
             # A temporary table without its own lineage entry contributes
             # nothing to the result.
             if ref_table in self.lineage_metadata:
                 upstreams = upstreams.union(
                     self.get_upstream_tables(ref_table,
                                              tables_seen=tables_seen))
         else:
             upstreams.add(upstream_table)
     return upstreams
def test_remove_extras(test_input, expected):
    """``remove_extras`` must turn the table name ``test_input`` into ``expected``."""
    cleaned = BigQueryTableRef(
        "test_project", "test_dataset", test_input
    ).remove_extras()
    assert cleaned.table == expected