# Example 1
from datahub.ingestion.extractor import schema_util
from datahub.ingestion.source.sql.sql_common import (
    BasicSQLAlchemyConfig,
    SQLAlchemySource,
    register_custom_type,
)
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
    DateTypeClass,
    NullTypeClass,
    NumberTypeClass,
    SchemaField,
    TimeTypeClass,
)

# Map Hive-specific SQLAlchemy types onto DataHub's schema type classes so
# schema extraction can resolve them.
# NOTE(review): HiveDate/HiveTimestamp/HiveDecimal are presumably imported
# from the PyHive dialect above this fragment — confirm.
register_custom_type(HiveDate, DateTypeClass)
register_custom_type(HiveTimestamp, TimeTypeClass)
register_custom_type(HiveDecimal, NumberTypeClass)


class HiveConfig(BasicSQLAlchemyConfig):
    """Ingestion config for Hive: pins the SQLAlchemy scheme and disables views."""

    # defaults
    scheme = "hive"

    # Hive SQLAlchemy connector returns views as tables.
    # See https://github.com/dropbox/PyHive/blob/b21c507a24ed2f2b0cf15b0b6abb1c43f31d3ee0/pyhive/sqlalchemy_hive.py#L270-L273.
    # Disabling views helps us prevent this duplication.
    include_views = False


class HiveSource(SQLAlchemySource):
# Example 2
    query = textwrap.dedent(query) + audit_log_filter

    return textwrap.dedent(query)


def get_view_definition(self, connection, view_name, schema=None, **kw):
    """Return the SQL text that defines *view_name*.

    Looks the view up through the dialect's ``_get_table`` helper and reads
    the ``view_query`` attribute off the reflected table object.
    """
    return self._get_table(connection, view_name, schema).view_query


# Monkey-patch: the pybigquery dialect lacks get_view_definition, so attach
# our implementation onto the dialect class.
pybigquery.sqlalchemy_bigquery.BigQueryDialect.get_view_definition = get_view_definition

# Handle the GEOGRAPHY type. We will temporarily patch the _type_map
# in the get_workunits method of the source.
GEOGRAPHY = make_sqlalchemy_type("GEOGRAPHY")
register_custom_type(GEOGRAPHY)
# Sanity check that the private _type_map we patch later still exists.
assert pybigquery.sqlalchemy_bigquery._type_map


class BigQueryCredential(ConfigModel):
    """Service-account credential fields mirroring a GCP JSON key file."""

    project_id: str
    private_key_id: str
    private_key: str
    client_email: str
    client_id: str
    auth_uri: str = "https://accounts.google.com/o/oauth2/auth"
    token_uri: str = "https://oauth2.googleapis.com/token"
    auth_provider_x509_cert_url: str = "https://www.googleapis.com/oauth2/v1/certs"
    type: str = "service_account"
    # NOTE(review): no explicit default — pydantic v1 treats a plain Optional
    # annotation as defaulting to None; confirm that is the intent.
    client_x509_cert_url: Optional[str]
# Example 3
    SqlWorkUnit,
    TimeTypeClass,
    register_custom_type,
)
from datahub.ingestion.source_config.sql.snowflake import SnowflakeConfig
from datahub.ingestion.source_report.sql.snowflake import SnowflakeReport
from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
    DatasetLineageTypeClass,
    UpstreamClass,
    UpstreamLineage,
)
from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
from datahub.metadata.schema_classes import ChangeTypeClass, DatasetPropertiesClass

# Snowflake-specific SQLAlchemy types → DataHub schema type classes.
register_custom_type(custom_types.TIMESTAMP_TZ, TimeTypeClass)
register_custom_type(custom_types.TIMESTAMP_LTZ, TimeTypeClass)
register_custom_type(custom_types.TIMESTAMP_NTZ, TimeTypeClass)
# VARIANT holds semi-structured data, so surface it as a record type.
register_custom_type(custom_types.VARIANT, RecordTypeClass)

logger: logging.Logger = logging.getLogger(__name__)

# GEOGRAPHY is unknown to the Snowflake dialect; map it to NullType so
# reflection does not fail on such columns.
snowdialect.ischema_names["GEOGRAPHY"] = sqltypes.NullType


class SnowflakeSource(SQLAlchemySource):
    def __init__(self, config: SnowflakeConfig, ctx: PipelineContext):
        """Initialize the Snowflake source and its lazily-built lineage caches."""
        super().__init__(config, ctx, "snowflake")
        # Both maps start as None and are populated on first use.
        self._lineage_map: Optional[Dict[str, List[Tuple[str, str, str]]]] = None
        self._external_lineage_map: Optional[Dict[str, Set[str]]] = None
# Example 4
    config_class,
    platform_name,
    support_status,
)
from datahub.ingestion.source.sql.sql_common import (
    BasicSQLAlchemyConfig,
    SQLAlchemySource,
    register_custom_type,
)
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
    ArrayTypeClass,
    BytesTypeClass,
    MapTypeClass,
)

# Postgres-specific SQLAlchemy types → DataHub schema type classes.
register_custom_type(custom_types.ARRAY, ArrayTypeClass)
# JSON/JSONB are surfaced as opaque bytes rather than parsed structures.
register_custom_type(custom_types.JSON, BytesTypeClass)
register_custom_type(custom_types.JSONB, BytesTypeClass)
register_custom_type(custom_types.HSTORE, MapTypeClass)


class PostgresConfig(BasicSQLAlchemyConfig):
    # defaults
    scheme = Field(default="postgresql+psycopg2",
                   description="database scheme")
    schema_pattern = Field(default=AllowDenyPattern(
        deny=["information_schema"]))

    def get_identifier(self: BasicSQLAlchemyConfig, schema: str,
                       table: str) -> str:
        regular = f"{schema}.{table}"
# Example 5
        duration = relativedelta(hours=1)
        if not partition_datetime:
            partition_datetime = datetime.datetime.strptime(
                partition_id, "%Y%m%d%H")
    else:
        raise ValueError(
            f"check your partition_id {partition_id}. It must be yearly/monthly/daily/hourly."
        )
    upper_bound_partition_datetime = partition_datetime + duration
    return partition_datetime, upper_bound_partition_datetime


# Handle the GEOGRAPHY type. We will temporarily patch the _type_map
# in the get_workunits method of the source.
GEOGRAPHY = make_sqlalchemy_type("GEOGRAPHY")
register_custom_type(GEOGRAPHY)
# Sanity check that the private _type_map we patch later still exists.
assert sqlalchemy_bigquery._types._type_map
# STRUCT is a custom sqlalchemy data type defined by the sqlalchemy_bigquery library
# https://github.com/googleapis/python-bigquery-sqlalchemy/blob/934e25f705fd9f226e438d075c7e00e495cce04e/sqlalchemy_bigquery/_types.py#L47
register_custom_type(sqlalchemy_bigquery.STRUCT, output=RecordTypeClass)


@dataclass
class BigQueryPartitionColumn:
    """Describes one partition column of a BigQuery table.

    NOTE(review): field names look like they mirror INFORMATION_SCHEMA
    partition metadata columns — confirm against the query that builds these.
    """

    table_catalog: str
    table_schema: str
    table_name: str
    column_name: str
    data_type: str
    partition_id: str
# Example 6
    MapTypeClass,
    NumberTypeClass,
    RecordTypeClass,
    SchemaField,
)

if sys.version_info >= (3, 7):  # noqa: C901
    # This import verifies that the dependencies are available.
    import sqlalchemy_trino  # noqa: F401
    from sqlalchemy import exc, sql
    from sqlalchemy.engine import reflection
    from sqlalchemy.sql import sqltypes
    from sqlalchemy_trino import datatype, error
    from sqlalchemy_trino.dialect import TrinoDialect

    register_custom_type(datatype.ROW, RecordTypeClass)
    register_custom_type(datatype.MAP, MapTypeClass)
    register_custom_type(datatype.DOUBLE, NumberTypeClass)

    # Read only table names and skip view names, as view names will also be returned
    # from get_view_names
    @reflection.cache  # type: ignore
    def get_table_names(self, connection, schema: str = None, **kw):  # type: ignore
        schema = schema or self._get_default_schema_name(connection)
        if schema is None:
            raise exc.NoSuchTableError("schema is required")
        query = dedent(
            """
            SELECT "table_name"
            FROM "information_schema"."tables"
            WHERE "table_schema" = :schema and "table_type" != 'VIEW'
# Example 7
# Teach the SQLAlchemy dialect about ClickHouse type names it does not know,
# so table reflection can resolve them.

# DateTime64 with sub-second precision 3 through 9 all reflect as DATETIME.
for _precision in range(3, 10):
    base.ischema_names[f"DateTime64({_precision})"] = DATETIME

base.ischema_names["Date32"] = DATE
base.ischema_names["Bool"] = BOOLEAN
base.ischema_names["Nothing"] = sqltypes.NullType

# Wide integers (128/256-bit, signed and unsigned) all reflect as INTEGER.
for _width in (128, 256):
    base.ischema_names[f"Int{_width}"] = INTEGER
    base.ischema_names[f"UInt{_width}"] = INTEGER

# ClickHouse-specific SQLAlchemy types → DataHub schema type classes.
register_custom_type(custom_types.common.Array, ArrayTypeClass)
register_custom_type(custom_types.ip.IPv4, NumberTypeClass)
register_custom_type(custom_types.ip.IPv6, StringTypeClass)
register_custom_type(custom_types.common.Map, MapTypeClass)
register_custom_type(custom_types.common.Tuple, UnionTypeClass)


class LineageCollectorType(Enum):
    """Kind of database object a lineage edge was collected from."""

    TABLE = "table"
    VIEW = "view"
    MATERIALIZED_VIEW = "materialized_view"


class LineageDatasetPlatform(Enum):
    CLICKHOUSE = "clickhouse"
# Example 8
import pymysql  # noqa: F401
from sqlalchemy.dialects.mysql import base

from datahub.ingestion.source.sql.sql_common import (
    BasicSQLAlchemyConfig,
    SQLAlchemySource,
    make_sqlalchemy_type,
    register_custom_type,
)

# MySQL spatial types have no SQLAlchemy equivalents: synthesize type classes
# for them. These module-level names are part of this module's interface.
GEOMETRY = make_sqlalchemy_type("GEOMETRY")
POINT = make_sqlalchemy_type("POINT")
LINESTRING = make_sqlalchemy_type("LINESTRING")
POLYGON = make_sqlalchemy_type("POLYGON")

# Register each spatial type with DataHub and teach the MySQL dialect to
# reflect its lowercase column-type name onto it.
_SPATIAL_TYPES = {
    "geometry": GEOMETRY,
    "point": POINT,
    "linestring": LINESTRING,
    "polygon": POLYGON,
}
for _name, _sa_type in _SPATIAL_TYPES.items():
    register_custom_type(_sa_type)
    base.ischema_names[_name] = _sa_type


class MySQLConfig(BasicSQLAlchemyConfig):
    # defaults
    host_port = "localhost:3306"
    scheme = "mysql+pymysql"