Example #1
    def test_get_column_type_contains_key(self):

        field_type = "char"
        data_type = get_column_type(self.glue_source, field_type, "a_table", "a_field")
        self.assertEqual(
            data_type.to_obj(), SchemaFieldDataType(type=StringTypeClass()).to_obj()
        )
Example #2
    def infer_schema(self, file: IO[bytes]) -> List[SchemaField]:

        datastore = ujson.load(file)

        if not isinstance(datastore, list):
            datastore = [datastore]

        schema = construct_schema(datastore, delimiter=".")
        fields: List[SchemaField] = []

        for schema_field in sorted(schema.values(),
                                   key=lambda x: x["delimited_name"]):
            mapped_type = _field_type_mapping.get(schema_field["type"],
                                                  NullTypeClass)

            native_type = schema_field["type"]

            if isinstance(native_type, type):
                native_type = native_type.__name__

            field = SchemaField(
                fieldPath=schema_field["delimited_name"],
                nativeDataType=native_type,
                type=SchemaFieldDataType(type=mapped_type()),
                nullable=schema_field["nullable"],
                recursive=False,
            )
            fields.append(field)

        return fields
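A minimal usage sketch for the JSON inference above, assuming the enclosing class is the DataHub JSON inferrer (the class name JsonInferrer, the sample payload, and the resulting types are illustrative, not taken from the source):

import io

# hypothetical input: a JSON array of objects passed as a binary stream
sample = io.BytesIO(b'[{"id": 1, "name": "a"}, {"id": 2, "name": null}]')
fields = JsonInferrer().infer_schema(sample)
for f in fields:
    # e.g. "id" would likely map to a number type and "name" to a string
    # type, depending on what _field_type_mapping contains
    print(f.fieldPath, f.nativeDataType, f.nullable)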
Example #3
    def get_schema_metadata_for_custom_sql(
            self, columns: List[dict]) -> Optional[SchemaMetadata]:
        schema_metadata = None
        # Datasource fields
        fields = []
        for field in columns:
            nativeDataType = field.get("remoteType", "UNKNOWN")
            TypeClass = FIELD_TYPE_MAPPING.get(nativeDataType, NullTypeClass)
            schema_field = SchemaField(
                fieldPath=field.get("name", ""),
                type=SchemaFieldDataType(type=TypeClass()),
                nativeDataType=nativeDataType,
                description=field.get("description", ""),
            )
            fields.append(schema_field)

        if fields:
            schema_metadata = SchemaMetadata(
                schemaName="test",
                platform=f"urn:li:dataPlatform:{self.platform}",
                version=0,
                fields=fields,
                hash="",
                platformSchema=OtherSchema(rawSchema=""),
            )
        return schema_metadata
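A hedged illustration of the input shape this method expects, based on the keys it reads; source is assumed to be an instance of the enclosing class, and the column dicts and remoteType values below are made up:

columns = [
    {"name": "customer_id", "remoteType": "INTEGER", "description": "primary key"},
    {"name": "email", "remoteType": "SOMETHING_ODD", "description": ""},
]
# each dict contributes one SchemaField; an unrecognised remoteType
# falls back to NullTypeClass via FIELD_TYPE_MAPPING.get(..., NullTypeClass)
schema_metadata = source.get_schema_metadata_for_custom_sql(columns)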
Example #4
def test_get_column_type_contains_map():

    field_type = "map_hehe"
    data_type = get_column_type(glue_source(), field_type, "a_table",
                                "a_field")
    assert data_type.to_obj() == SchemaFieldDataType(
        type=MapTypeClass()).to_obj()
Example #5
def test_get_column_type_contains_set():

    field_type = "set_yolo"
    data_type = get_column_type(glue_source(), field_type, "a_table",
                                "a_field")
    assert data_type.to_obj() == SchemaFieldDataType(
        type=ArrayTypeClass()).to_obj()
Example #6
    def test_get_column_type_contains_map(self):

        field_type = "map_hehe"
        data_type = get_column_type(self.glue_source, field_type, "a_table", "a_field")
        self.assertEqual(
            data_type.to_obj(), SchemaFieldDataType(type=MapTypeClass()).to_obj()
        )
Example #7
def test_get_column_type_contains_key():

    field_type = "char"
    data_type = get_column_type(glue_source(), field_type, "a_table",
                                "a_field")
    assert data_type.to_obj() == SchemaFieldDataType(
        type=StringTypeClass()).to_obj()
Example #8
    def test_get_column_type_contains_set(self):

        field_type = "set_yolo"
        data_type = get_column_type(self.glue_source, field_type, "a_table", "a_field")
        self.assertEqual(
            data_type.to_obj(), SchemaFieldDataType(type=ArrayTypeClass()).to_obj()
        )
Example #9
def get_column_type(
    report: SourceReport, dataset_name: str, column_type: str
) -> SchemaFieldDataType:
    """
    Maps known DBT types to datahub types
    """
    column_type_stripped = ""

    # keep the leading run of word characters and spaces, dropping anything
    # from the first other character onward (e.g. a "(precision)" suffix)
    pattern = re.compile(r"[\w ]+")
    match = pattern.match(column_type)
    if match is not None:
        column_type_stripped = match.group()

    TypeClass: Any = _field_type_mapping.get(column_type_stripped)

    if TypeClass is None:
        report.report_warning(
            dataset_name, f"unable to map type {column_type} to metadata schema"
        )
        TypeClass = NullTypeClass

    return SchemaFieldDataType(type=TypeClass())
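The mapping itself is outside this excerpt; a plausible sketch of _field_type_mapping, modeled on the explicit Glue mapping in Example #13 (the exact keys here are an assumption):

_field_type_mapping = {
    "boolean": BooleanTypeClass,
    "date": DateTypeClass,
    "integer": NumberTypeClass,
    "numeric": NumberTypeClass,
    "text": StringTypeClass,
    "timestamp": TimeTypeClass,
    # unmapped names fall through to the warning-and-NullTypeClass path above
}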
Example #10
def get_column_type(
    sql_report: SQLSourceReport, dataset_name: str, column_type: Any
) -> SchemaFieldDataType:
    """
    Maps SQLAlchemy types (https://docs.sqlalchemy.org/en/13/core/type_basics.html) to corresponding schema types
    """

    TypeClass: Optional[Type] = None
    for sql_type in _field_type_mapping.keys():
        if isinstance(column_type, sql_type):
            TypeClass = _field_type_mapping[sql_type]
            break
    if TypeClass is None:
        for sql_type in _known_unknown_field_types:
            if isinstance(column_type, sql_type):
                TypeClass = NullTypeClass
                break

    if TypeClass is None:
        sql_report.report_warning(
            dataset_name, f"unable to map type {column_type!r} to metadata schema"
        )
        TypeClass = NullTypeClass

    return SchemaFieldDataType(type=TypeClass())
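Because the lookup above uses isinstance rather than equality, the mapping is keyed by SQLAlchemy type classes, not strings. A hedged sketch of its shape (entries are illustrative, not the full table):

from sqlalchemy import types

_field_type_mapping = {
    types.Integer: NumberTypeClass,
    types.Numeric: NumberTypeClass,
    types.Boolean: BooleanTypeClass,
    types.String: StringTypeClass,
    types.Date: DateTypeClass,
    types.DateTime: TimeTypeClass,
}
# iteration order matters: more specific SQLAlchemy types should appear
# before their base classes, since the first isinstance match wins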
Example #11
    def _get_column_type(self, field_type: Union[str,
                                                 dict]) -> SchemaFieldDataType:
        tp = field_type
        if hasattr(tp, "type"):
            tp = tp.type  # type: ignore
        tp = str(tp)
        # default to NullTypeClass so an unmapped type name doesn't raise a
        # TypeError when instantiated below
        TypeClass: Any = self.field_type_mapping.get(tp, NullTypeClass)
        dt = SchemaFieldDataType(type=TypeClass())
        return dt
Example #12
def create_metadata_work_unit(timestamp):
    dataset_snapshot = DatasetSnapshot(
        urn="urn:li:dataset:(urn:li:dataPlatform:glue,datalake_grilled.Barbeque,PROD)",
        aspects=[],
    )

    dataset_snapshot.aspects.append(Status(removed=False))

    dataset_snapshot.aspects.append(
        OwnershipClass(
            owners=[
                OwnerClass(
                    owner="urn:li:corpuser:Susan", type=OwnershipTypeClass.DATAOWNER
                )
            ],
            lastModified=AuditStampClass(
                time=timestamp, actor="urn:li:corpuser:datahub"
            ),
        )
    )

    dataset_snapshot.aspects.append(
        DatasetPropertiesClass(
            description="Grilled Food",
            customProperties={},
            uri=None,
            tags=[],
        )
    )

    fields = [
        SchemaField(
            fieldPath="Size",
            nativeDataType="int",
            type=SchemaFieldDataType(type=NumberTypeClass()),
            description="Maximum attendees permitted",
            nullable=True,
            recursive=False,
        )
    ]

    schema_metadata = SchemaMetadata(
        schemaName="datalake_grilled.Barbeque",
        version=0,
        fields=fields,
        platform="urn:li:dataPlatform:glue",
        created=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        lastModified=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
    )
    dataset_snapshot.aspects.append(schema_metadata)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return MetadataWorkUnit(id="glue-datalake_grilled.Barbeque", mce=mce)
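A minimal call of the helper above; DataHub audit stamps carry epoch-millisecond timestamps, so a caller would pass something like:

import time

wu = create_metadata_work_unit(int(time.time() * 1000))
assert wu.id == "glue-datalake_grilled.Barbeque"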
Example #13
def get_column_type(
    glue_source: GlueSource, field_type: str, table_name: str, field_name: str
) -> SchemaFieldDataType:
    field_type_mapping = {
        "array": ArrayTypeClass,
        "bigint": NumberTypeClass,
        "binary": BytesTypeClass,
        "boolean": BooleanTypeClass,
        "char": StringTypeClass,
        "date": DateTypeClass,
        "decimal": NumberTypeClass,
        "double": NumberTypeClass,
        "float": NumberTypeClass,
        "int": NumberTypeClass,
        "integer": NumberTypeClass,
        "interval": TimeTypeClass,
        "long": NumberTypeClass,
        "map": MapTypeClass,
        "null": NullTypeClass,
        "set": ArrayTypeClass,
        "smallint": NumberTypeClass,
        "string": StringTypeClass,
        "struct": MapTypeClass,
        "timestamp": TimeTypeClass,
        "tinyint": NumberTypeClass,
        "union": UnionTypeClass,
        "varchar": StringTypeClass,
    }

    field_starts_type_mapping = {
        "array": ArrayTypeClass,
        "set": ArrayTypeClass,
        "map": MapTypeClass,
        "struct": MapTypeClass,
        "varchar": StringTypeClass,
        "decimal": NumberTypeClass,
    }

    type_class = None
    if field_type in field_type_mapping:
        type_class = field_type_mapping[field_type]
    else:
        for key in field_starts_type_mapping:
            if field_type.startswith(key):
                type_class = field_starts_type_mapping[key]
                break

    if type_class is None:
        glue_source.report.report_warning(
            field_type,
            f"The type '{field_type}' is not recognised for field '{field_name}' in table '{table_name}', setting as StringTypeClass.",
        )
        type_class = StringTypeClass
    data_type = SchemaFieldDataType(type=type_class())
    return data_type
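The two dictionaries give exact-match and prefix-match behavior, so parameterized types like "decimal(10,2)" still resolve. A short usage sketch, reusing the glue_source() fixture from the tests above:

source = glue_source()
# exact match: "boolean" -> BooleanTypeClass
get_column_type(source, "boolean", "a_table", "a_field")
# prefix match: "decimal(10,2)" starts with "decimal" -> NumberTypeClass
get_column_type(source, "decimal(10,2)", "a_table", "a_field")
# no match: warns and falls back to StringTypeClass
get_column_type(source, "bad_column_type", "a_table", "a_field")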
Example #14
def _get_column_type(field_type) -> SchemaFieldDataType:
    tp = field_type
    if hasattr(tp, "type"):
        tp = tp.type
    tp = str(tp)
    # default to NullTypeClass so an unmapped type name doesn't raise a TypeError
    TypeClass: Any = _field_type_mapping.get(tp, NullTypeClass)
    # Note: we could populate the nestedTypes field for unions and similar fields
    # for the other types as well. However, since we already populate the nativeDataType
    # field below, it is mostly ok to leave this as not fully initialized.
    dt = SchemaFieldDataType(type=TypeClass())
    return dt
Example #15
def test_get_column_type_not_contained():

    glue_source_instance = glue_source()

    field_type = "bad_column_type"
    data_type = get_column_type(glue_source_instance, field_type, "a_table", "a_field")
    assert data_type.to_obj() == SchemaFieldDataType(type=StringTypeClass()).to_obj()
    assert glue_source_instance.report.warnings["bad_column_type"] == [
        "The type 'bad_column_type' is not recognised for field 'a_field' in table 'a_table', "
        "setting as StringTypeClass."
    ]
Example #16
    def get_column_type(elastic_column_type: str) -> SchemaFieldDataType:

        type_class: Optional[Type] = (
            ElasticToSchemaFieldConverter._field_type_to_schema_field_type.get(
                elastic_column_type
            )
        )
        if type_class is None:
            logger.warning(
                f"Cannot map {elastic_column_type!r} to SchemaFieldDataType, using NullTypeClass."
            )
            type_class = NullTypeClass

        return SchemaFieldDataType(type=type_class())
Example #17
    def _get_column_type(self, field_type: Union[str, dict],
                         logical_type: Optional[str]) -> SchemaFieldDataType:
        tp = field_type
        if hasattr(tp, "type"):
            tp = tp.type  # type: ignore
        tp = str(tp)
        # default to NullTypeClass so an unmapped type name doesn't raise a
        # TypeError; a recognised logical type overrides the physical type
        TypeClass: Any = self.field_type_mapping.get(tp, NullTypeClass)
        if logical_type is not None:
            TypeClass = self.field_logical_type_mapping.get(
                logical_type, TypeClass)
        dt = SchemaFieldDataType(type=TypeClass())
        return dt
Example #18
    def test_get_column_type_not_contained(self):

        field_type = "bad_column_type"
        data_type = get_column_type(self.glue_source, field_type, "a_table", "a_field")
        self.assertEqual(
            data_type.to_obj(), SchemaFieldDataType(type=StringTypeClass()).to_obj()
        )
        self.assertEqual(
            self.glue_source.report.warnings["bad_column_type"],
            [
                "The type 'bad_column_type' is not recognised for field 'a_field' in table 'a_table', "
                "setting as StringTypeClass."
            ],
        )
Example #19
    def emit_upstream_tables(self) -> Iterable[MetadataWorkUnit]:
        for (table_urn, (columns, path, is_embedded)) in self.upstream_tables.items():
            if not is_embedded and not self.config.ingest_tables_external:
                logger.error(
                    f"Skipping external table {table_urn} as ingest_tables_external is set to False"
                )
                continue

            dataset_snapshot = DatasetSnapshot(
                urn=table_urn,
                aspects=[],
            )
            if path:
                # Browse path
                browse_paths = BrowsePathsClass(
                    paths=[f"/{self.config.env.lower()}/{self.platform}/{path}"]
                )
                dataset_snapshot.aspects.append(browse_paths)
            else:
                logger.debug(f"Browse path not set for table {table_urn}")
            schema_metadata = None
            if columns:
                fields = []
                for field in columns:
                    nativeDataType = field.get("remoteType", "UNKNOWN")
                    TypeClass = FIELD_TYPE_MAPPING.get(nativeDataType, NullTypeClass)

                    schema_field = SchemaField(
                        fieldPath=field["name"],
                        type=SchemaFieldDataType(type=TypeClass()),
                        description="",
                        nativeDataType=nativeDataType,
                    )

                    fields.append(schema_field)

                schema_metadata = SchemaMetadata(
                    schemaName="test",
                    platform=f"urn:li:dataPlatform:{self.platform}",
                    version=0,
                    fields=fields,
                    hash="",
                    platformSchema=OtherSchema(rawSchema=""),
                )
            if schema_metadata is not None:
                dataset_snapshot.aspects.append(schema_metadata)

            yield self.get_metadata_change_event(dataset_snapshot)
Example #20
    def get_column_type(data_type: str) -> SchemaFieldDataType:
        type_class: Type = NullTypeClass
        if CatalogSource.int_pattern.match(data_type) is not None:
            type_class = NumberTypeClass
        elif CatalogSource.text_pattern.match(data_type) is not None:
            type_class = StringTypeClass
        elif CatalogSource.byte_pattern.match(data_type) is not None:
            type_class = BytesTypeClass
        elif CatalogSource.date_pattern.match(data_type) is not None:
            type_class = DateTypeClass
        elif CatalogSource.time_pattern.match(data_type) is not None:
            type_class = TimeTypeClass
        elif CatalogSource.timestamp_pattern.match(data_type) is not None:
            type_class = TimeTypeClass

        return SchemaFieldDataType(type=type_class())
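The class-level patterns are not part of this excerpt; one plausible way they could be defined (the exact expressions are an assumption, and re.match anchors at the start of the string):

import re

class CatalogSource:
    int_pattern = re.compile(r"(tiny|small|big)?int", re.IGNORECASE)
    text_pattern = re.compile(r"(var)?char|string|text", re.IGNORECASE)
    byte_pattern = re.compile(r"binary|byte", re.IGNORECASE)
    date_pattern = re.compile(r"date", re.IGNORECASE)
    time_pattern = re.compile(r"time", re.IGNORECASE)
    timestamp_pattern = re.compile(r"timestamp", re.IGNORECASE)
    # note: in the elif chain above, time_pattern is tried before
    # timestamp_pattern; both map to TimeTypeClass, so the overlap is benign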
Example #21
def get_table_schema_fields(table: Table, max_rows: int) -> List[SchemaField]:
    table.infer(limit=max_rows)

    fields: List[SchemaField] = []

    for raw_field in table.schema.fields:
        mapped_type: Type = tableschema_type_map.get(raw_field.type,
                                                     NullTypeClass)

        field = SchemaField(
            fieldPath=raw_field.name,
            type=SchemaFieldDataType(mapped_type()),
            nativeDataType=str(raw_field.type),
            recursive=False,
        )
        fields.append(field)

    return fields
Example #22
    def _get_field_type(self, native_type: str) -> SchemaFieldDataType:
        field_type_mapping = {
            "date": DateTypeClass,
            "date_time": TimeTypeClass,
            "distance": NumberTypeClass,
            "duration": NumberTypeClass,
            "location": UnionTypeClass,
            "number": NumberTypeClass,
            "string": StringTypeClass,
            "tier": EnumTypeClass,
            "time": TimeTypeClass,
            "unquoted": StringTypeClass,
            "yesno": BooleanTypeClass,
            "zipcode": EnumTypeClass,
            "int": NumberTypeClass,
            "average": NumberTypeClass,
            "average_distinct": NumberTypeClass,
            "count": NumberTypeClass,
            "count_distinct": NumberTypeClass,
            "list": ArrayTypeClass,
            "max": NumberTypeClass,
            "median": NumberTypeClass,
            "median_distinct": NumberTypeClass,
            "min": NumberTypeClass,
            "percent_of_previous": NumberTypeClass,
            "percent_of_total": NumberTypeClass,
            "percentile": NumberTypeClass,
            "percentile_distinct": NumberTypeClass,
            "running_total": NumberTypeClass,
            "sum": NumberTypeClass,
            "sum_distinct": NumberTypeClass,
        }

        if native_type in field_type_mapping:
            type_class = field_type_mapping[native_type]
        else:
            self.reporter.report_warning(
                native_type,
                f"The type '{native_type}' is not recognised for field type, setting as NullTypeClass.",
            )
            type_class = NullTypeClass
        data_type = SchemaFieldDataType(type=type_class())
        return data_type
Example #23
    def _get_field_type(self, native_type: str) -> SchemaFieldDataType:

        type_class = field_type_mapping.get(native_type)

        if type_class is None:
            # attempt Postgres modified type
            type_class = resolve_postgres_modified_type(native_type)

        # if still not found, report a warning
        if type_class is None:
            self.reporter.report_warning(
                native_type,
                f"The type '{native_type}' is not recognized for field type, setting as NullTypeClass.",
            )
            type_class = NullTypeClass

        data_type = SchemaFieldDataType(type=type_class())
        return data_type
Example #24
    def _get_field_type(native_type: str,
                        reporter: SourceReport) -> SchemaFieldDataType:

        type_class = LookerUtil.field_type_mapping.get(native_type)

        if type_class is None:
            # attempt Postgres modified type
            type_class = resolve_postgres_modified_type(native_type)

        # if still not found, log and continue
        if type_class is None:
            logger.info(
                f"The type '{native_type}' is not recognized for field type, setting as NullTypeClass.",
            )
            type_class = NullTypeClass

        data_type = SchemaFieldDataType(type=type_class())
        return data_type
Example #25
def _get_column_type(descriptor: DescriptorBase) -> SchemaFieldDataType:
    native_type: str = _get_simple_native_type(descriptor)
    type_class: Any
    if getattr(descriptor, "label", None) == FieldDescriptor.LABEL_REPEATED:
        type_class = ArrayTypeClass(nestedType=[native_type])
    elif getattr(descriptor, "type", None) == FieldDescriptor.TYPE_ENUM:
        type_class = EnumTypeClass()
    #
    # TODO: Find a better way to detect maps
    #
    # elif simple_type == "map":
    #    type_class = MapTypeClass(
    #        keyType=descriptor.key_type,
    #        valueType=descriptor.val_type,
    #    )
    else:
        type_class = _native_type_to_typeclass.get(native_type, RecordTypeClass)()

    return SchemaFieldDataType(type=type_class)
Example #26
def get_column_type(report: SourceReport, dataset_name: str,
                    column_type: str) -> SchemaFieldDataType:
    """
    Maps known DBT types to datahub types
    """
    TypeClass: Any = _field_type_mapping.get(column_type)

    if TypeClass is None:
        # attempt Postgres modified type
        TypeClass = resolve_postgres_modified_type(column_type)

    # if still not found, report the warning
    if TypeClass is None:
        report.report_warning(
            dataset_name,
            f"unable to map type {column_type} to metadata schema")
        TypeClass = NullTypeClass

    return SchemaFieldDataType(type=TypeClass())
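resolve_postgres_modified_type is not shown here; conceptually it handles parameterized Postgres type names that will not exact-match the table. A simplified, assumed sketch:

import re

def resolve_postgres_modified_type(column_type: str) -> Any:
    # strip a trailing "(precision[, scale])" modifier, e.g.
    # "character varying(256)" -> "character varying"
    base_type = re.sub(r"\(.*\)$", "", column_type).strip()
    return _field_type_mapping.get(base_type)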
Example #27
    def _get_schema_metadata_for_datasource(
        self, datasource_fields: List[dict]
    ) -> Optional[SchemaMetadata]:
        fields = []
        schema_metadata = None
        for field in datasource_fields:
            # check datasource - custom sql relations from a field being referenced
            self._track_custom_sql_ids(field)

            nativeDataType = field.get("dataType", "UNKNOWN")
            TypeClass = FIELD_TYPE_MAPPING.get(nativeDataType, NullTypeClass)

            schema_field = SchemaField(
                fieldPath=field["name"],
                type=SchemaFieldDataType(type=TypeClass()),
                description=make_description_from_params(
                    field.get("description", ""), field.get("formula")
                ),
                nativeDataType=nativeDataType,
                globalTags=get_tags_from_params(
                    [
                        field.get("role", ""),
                        field.get("__typename", ""),
                        field.get("aggregation", ""),
                    ]
                )
                if self.config.ingest_tags
                else None,
            )
            fields.append(schema_field)

        if fields:
            schema_metadata = SchemaMetadata(
                schemaName="test",
                platform=f"urn:li:dataPlatform:{self.platform}",
                version=0,
                fields=fields,
                hash="",
                platformSchema=OtherSchema(rawSchema=""),
            )

        return schema_metadata
Example #28
def get_column_type(report: SourceReport, dataset_name: str,
                    column_type: Any) -> SchemaFieldDataType:
    """
    Maps known Spark types to datahub types
    """
    TypeClass: Any = None

    for field_type, type_class in _field_type_mapping.items():
        if isinstance(column_type, field_type):
            TypeClass = type_class
            break

    # if still not found, report the warning
    if TypeClass is None:
        report.report_warning(
            dataset_name,
            f"unable to map type {column_type} to metadata schema")
        TypeClass = NullTypeClass

    return SchemaFieldDataType(type=TypeClass())
Example #29
    def infer_schema(self, file: IO[bytes]) -> List[SchemaField]:
        # infer schema of a parquet file without reading the whole file

        # Parquet keeps its schema in the file footer, so this reads only the
        # footer metadata rather than the row data
        schema = pyarrow.parquet.read_schema(file, memory_map=True)

        fields: List[SchemaField] = []

        for name, pyarrow_type in zip(schema.names, schema.types):
            mapped_type = map_pyarrow_type(pyarrow_type)

            field = SchemaField(
                fieldPath=name,
                type=SchemaFieldDataType(mapped_type()),
                nativeDataType=str(pyarrow_type),
                recursive=False,
            )

            fields.append(field)

        return fields
Example #30
    def get_field_type(self, field_type: Union[Type, str],
                       collection_name: str) -> SchemaFieldDataType:
        """
        Maps types encountered in PyMongo to corresponding schema types.

        Parameters
        ----------
            field_type:
                type of a Python object
            collection_name:
                name of collection (for logging)
        """
        TypeClass: Optional[Type] = _field_type_mapping.get(field_type)

        if TypeClass is None:
            self.report.report_warning(
                collection_name,
                f"unable to map type {field_type} to metadata schema")
            TypeClass = NullTypeClass

        return SchemaFieldDataType(type=TypeClass())
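A hedged usage note: since the types come from PyMongo-inferred documents, _field_type_mapping here plausibly keys on Python types (and perhaps a few string names), so calls might look like (source is assumed to be an instance of the enclosing class):

data_type = source.get_field_type(int, "users")  # a number type, assuming int is mapped
data_type = source.get_field_type("unknown_bson_type", "users")  # warning + NullTypeClass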