Example #1
def avro_schema_to_mce_fields(avro_schema_string: str) -> List[SchemaField]:
    """Converts an avro schema into a schema compatible with MCE"""

    # Handle some library compatibility issues.
    if hasattr(avro.schema, "parse"):
        schema_parse_fn = avro.schema.parse
    else:
        schema_parse_fn = avro.schema.Parse

    parsed_schema: avro.schema.RecordSchema = schema_parse_fn(avro_schema_string)

    fields: List[SchemaField] = []
    for parsed_field in parsed_schema.fields:
        field = SchemaField(
            fieldPath=parsed_field.name,
            nativeDataType=str(parsed_field.type),
            type=_get_column_type(parsed_field.type),
            description=parsed_field.props.get("doc", None),
            recursive=False,
            nullable=(parsed_field.type == "null"),
        )

        fields.append(field)

    return fields
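
A minimal usage sketch for the converter above, assuming the example's DataHub imports (SchemaField, _get_column_type) are in scope; the Avro schema string is purely illustrative:

# Hedged usage sketch: parse a two-field record and inspect the result.
example_schema = """
{
  "type": "record",
  "name": "User",
  "fields": [
    {"name": "id", "type": "long", "doc": "primary identifier"},
    {"name": "nickname", "type": ["null", "string"], "default": null}
  ]
}
"""

for f in avro_schema_to_mce_fields(example_schema):
    print(f.fieldPath, f.nativeDataType, f.description)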
Example #2
    def get_schema_metadata_for_custom_sql(
            self, columns: List[dict]) -> Optional[SchemaMetadata]:
        schema_metadata = None
        # Datasource fields: accumulate across all columns. (Re-initializing
        # the list inside the loop would keep only the last column.)
        fields = []
        for field in columns:
            nativeDataType = field.get("remoteType", "UNKNOWN")
            TypeClass = FIELD_TYPE_MAPPING.get(nativeDataType, NullTypeClass)
            schema_field = SchemaField(
                fieldPath=field.get("name", ""),
                type=SchemaFieldDataType(type=TypeClass()),
                nativeDataType=nativeDataType,
                description=field.get("description", ""),
            )
            fields.append(schema_field)

        if fields:
            schema_metadata = SchemaMetadata(
                schemaName="test",
                platform=f"urn:li:dataPlatform:{self.platform}",
                version=0,
                fields=fields,
                hash="",
                platformSchema=OtherSchema(rawSchema=""),
            )
        return schema_metadata
Example #3
def get_schema_metadata(report: SourceReport, node: DBTNode,
                        platform: str) -> SchemaMetadata:
    canonical_schema: List[SchemaField] = []
    for column in node.columns:
        field = SchemaField(
            fieldPath=column.name,
            nativeDataType=column.data_type,
            type=get_column_type(report, node.dbt_name, column.data_type),
            description=column.comment,
            nullable=False,  # TODO: actually autodetect this
            recursive=False,
        )

        canonical_schema.append(field)

    actor, sys_time = "urn:li:corpuser:dbt_executor", int(time.time() * 1000)

    last_modified = sys_time

    if node.max_loaded_at is not None:
        last_modified = int(
            dateutil.parser.parse(node.max_loaded_at).timestamp() * 1000)

    return SchemaMetadata(
        schemaName=node.dbt_name,
        platform=f"urn:li:dataPlatform:{platform}",
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        created=AuditStamp(time=sys_time, actor=actor),
        lastModified=AuditStamp(time=last_modified, actor=actor),
        fields=canonical_schema,
    )
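
A small sketch of the timestamp handling above, assuming python-dateutil is installed; the input string is illustrative:

# Hedged sketch: dbt reports max_loaded_at as an ISO-8601 string, while
# DataHub audit stamps expect epoch milliseconds.
import dateutil.parser

max_loaded_at = "2021-06-01T12:30:00Z"  # illustrative value
last_modified = int(dateutil.parser.parse(max_loaded_at).timestamp() * 1000)
print(last_modified)  # 1622550600000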
Example #4
 def _get_schema_fields(
     self,
     elastic_schema_dict: Dict[str,
                               Any]) -> Generator[SchemaField, None, None]:
     # emit a schema field for each column; recurse into nested properties
     for columnName, column in elastic_schema_dict.items():
         elastic_type: Optional[str] = column.get("type")
         nested_props: Optional[Dict[str, Any]] = column.get("properties")
         if elastic_type is not None:
             self._prefix_name_stack.append(
                 f"[type={elastic_type}].{columnName}")
             schema_field_data_type = self.get_column_type(elastic_type)
             schema_field = SchemaField(
                 fieldPath=self._get_cur_field_path(),
                 nativeDataType=elastic_type,
                 type=schema_field_data_type,
                 description=None,
                 nullable=True,
                 recursive=False,
             )
             yield schema_field
             self._prefix_name_stack.pop()
         elif nested_props:
             self._prefix_name_stack.append(f"[type={columnName}]")
             yield from self._get_schema_fields(nested_props)
             self._prefix_name_stack.pop()
         else:
             # Unexpected! Log a warning.
             logger.warning(
                 f"Elastic schema does not have either 'type' or 'properties'!"
                 f" Schema={json.dumps(elastic_schema_dict)}")
             continue
Example #5
    def infer_schema(self, file: IO[bytes]) -> List[SchemaField]:

        datastore = ujson.load(file)

        if not isinstance(datastore, list):
            datastore = [datastore]

        schema = construct_schema(datastore, delimiter=".")
        fields: List[SchemaField] = []

        for schema_field in sorted(schema.values(),
                                   key=lambda x: x["delimited_name"]):
            mapped_type = _field_type_mapping.get(schema_field["type"],
                                                  NullTypeClass)

            native_type = schema_field["type"]

            if isinstance(native_type, type):
                native_type = native_type.__name__

            field = SchemaField(
                fieldPath=schema_field["delimited_name"],
                nativeDataType=native_type,
                type=SchemaFieldDataType(type=mapped_type()),
                nullable=schema_field["nullable"],
                recursive=False,
            )
            fields.append(field)

        return fields
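
A usage sketch for the JSON inference above, assuming an instance of the enclosing class; the file path is hypothetical:

# Hedged usage sketch: infer fields from a JSON file of records.
with open("data/example.json", "rb") as f:  # hypothetical path
    fields = inferrer.infer_schema(f)       # inferrer: instance of the class above
print([field.fieldPath for field in fields])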
Example #6
 def _get_fields_and_primary_keys(
     view_fields: List[ViewField],
     reporter: SourceReport,
     tag_measures_and_dimensions: bool = True,
 ) -> Tuple[List[SchemaField], List[str]]:
     primary_keys: List = []
     fields = []
     for field in view_fields:
         schema_field = SchemaField(
             fieldPath=field.name,
             type=LookerUtil._get_field_type(field.type, reporter),
             nativeDataType=field.type,
             description=f"{field.description}"
             if tag_measures_and_dimensions
             else f"{field.field_type.value}. {field.description}",
             globalTags=LookerUtil._get_tags_from_field_type(
                 field.field_type, reporter
             )
             if tag_measures_and_dimensions
             else None,
             isPartOfKey=field.is_primary_key,
         )
         fields.append(schema_field)
         if field.is_primary_key:
             primary_keys.append(schema_field.fieldPath)
     return fields, primary_keys
Example #7
def set_metadata(dataset_name: str,
                 fields: List,
                 platform: str = "api") -> SchemaMetadata:
    canonical_schema: List[SchemaField] = []

    for column in fields:
        field = SchemaField(
            fieldPath=column,
            nativeDataType="str",
            type=SchemaFieldDataTypeClass(type=StringTypeClass()),
            description="",
            recursive=False,
        )
        canonical_schema.append(field)

    actor = "urn:li:corpuser:etl"
    sys_time = int(time.time() * 1000)
    schema_metadata = SchemaMetadata(
        schemaName=dataset_name,
        platform=f"urn:li:dataPlatform:{platform}",
        version=0,
        hash="",
        platformSchema=OtherSchemaClass(rawSchema=""),
        created=AuditStamp(time=sys_time, actor=actor),
        lastModified=AuditStamp(time=sys_time, actor=actor),
        fields=canonical_schema,
    )
    return schema_metadata
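
A short usage sketch, assuming set_metadata and its DataHub imports from the example above; the dataset and column names are made up:

# Hedged usage sketch: build a string-typed schema aspect for an API-backed dataset.
schema_aspect = set_metadata(
    dataset_name="petstore.orders",   # hypothetical dataset name
    fields=["order_id", "status"],    # column names only; every type defaults to str
)
assert schema_aspect.platform == "urn:li:dataPlatform:api"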
Example #8
 def get_schema_metadata(glue_source: GlueSource) -> SchemaMetadata:
     schema = table["StorageDescriptor"]["Columns"]
     fields: List[SchemaField] = []
     for field in schema:
         schema_field = SchemaField(
             fieldPath=field["Name"],
             nativeDataType=field["Type"],
             type=get_column_type(
                 glue_source, field["Type"], table_name, field["Name"]
             ),
             description=field.get("Comment"),
             recursive=False,
             nullable=True,
         )
         fields.append(schema_field)
     return SchemaMetadata(
         schemaName=table_name,
         version=0,
         fields=fields,
         platform="urn:li:dataPlatform:glue",
         created=AuditStamp(time=sys_time, actor="urn:li:corpuser:etl"),
         lastModified=AuditStamp(time=sys_time, actor="urn:li:corpuser:etl"),
         hash="",
         platformSchema=MySqlDDL(tableSchema=""),
     )
Example #9
def get_schema_metadata(sql_report: SQLSourceReport, dataset_name: str,
                        platform: str, columns: List[dict]) -> SchemaMetadata:
    canonical_schema: List[SchemaField] = []
    for column in columns:
        field = SchemaField(
            fieldPath=column["name"],
            type=get_column_type(sql_report, dataset_name, column["type"]),
            nativeDataType=column.get("full_type", repr(column["type"])),
            description=column.get("comment", None),
            nullable=column["nullable"],
            recursive=False,
        )
        canonical_schema.append(field)

    actor = "urn:li:corpuser:etl"
    sys_time = get_sys_time()
    schema_metadata = SchemaMetadata(
        schemaName=dataset_name,
        platform=f"urn:li:dataPlatform:{platform}",
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        created=AuditStamp(time=sys_time, actor=actor),
        lastModified=AuditStamp(time=sys_time, actor=actor),
        fields=canonical_schema,
    )
    return schema_metadata
Example #10
def _create_schema_field(path: List[str], field: FieldDescriptor) -> _PathAndField:
    field_path = ".".join(path)
    schema_field = SchemaField(
        fieldPath=".".join(path),
        nativeDataType=_get_simple_native_type(field),
        # Protobuf fields are always nullable
        nullable=True,
        type=_get_column_type(field),
    )
    return _PathAndField(field_path, schema_field)
Example #11
 def get_schema_fields_for_column(
         self,
         dataset_name: str,
         column: dict,
         pk_constraints: Optional[dict] = None) -> List[SchemaField]:
     field = SchemaField(
         fieldPath=column["name"],
         type=get_column_type(self.report, dataset_name, column["type"]),
         nativeDataType=column.get("full_type", repr(column["type"])),
         description=column.get("comment", None),
         nullable=column["nullable"],
         recursive=False,
     )
     if (pk_constraints is not None and isinstance(
             pk_constraints, dict)  # some dialects (hive) return list
             and column["name"] in pk_constraints.get(
                 "constrained_columns", [])):
         field.isPartOfKey = True
     return [field]
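
For reference, a hedged sketch of the inputs this method expects — a SQLAlchemy-style column dict and the inspector's get_pk_constraint result — with illustrative values:

# Hedged sketch; `source` stands in for an instance of the enclosing SQL source,
# and the column values are illustrative rather than real inspector output.
column = {"name": "id", "type": "INTEGER", "nullable": False}
pk_constraints = {"constrained_columns": ["id"], "name": "pk_users"}
[field] = source.get_schema_fields_for_column("db.users", column, pk_constraints)
assert field.isPartOfKey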
Example #12
def create_metadata_work_unit(timestamp):
    dataset_snapshot = DatasetSnapshot(
        urn="urn:li:dataset:(urn:li:dataPlatform:glue,datalake_grilled.Barbeque,PROD)",
        aspects=[],
    )

    dataset_snapshot.aspects.append(Status(removed=False))

    dataset_snapshot.aspects.append(
        OwnershipClass(
            owners=[
                OwnerClass(
                    owner="urn:li:corpuser:Susan", type=OwnershipTypeClass.DATAOWNER
                )
            ],
            lastModified=AuditStampClass(
                time=timestamp, actor="urn:li:corpuser:datahub"
            ),
        )
    )

    dataset_snapshot.aspects.append(
        DatasetPropertiesClass(
            description="Grilled Food",
            customProperties={},
            uri=None,
            tags=[],
        )
    )

    fields = [
        SchemaField(
            fieldPath="Size",
            nativeDataType="int",
            type=SchemaFieldDataType(type=NumberTypeClass()),
            description="Maximum attendees permitted",
            nullable=True,
            recursive=False,
        )
    ]

    schema_metadata = SchemaMetadata(
        schemaName="datalake_grilled.Barbeque",
        version=0,
        fields=fields,
        platform="urn:li:dataPlatform:glue",
        created=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        lastModified=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
    )
    dataset_snapshot.aspects.append(schema_metadata)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return MetadataWorkUnit(id="glue-datalake_grilled.Barbeque", mce=mce)
Example #13
def get_schema_metadata(report: SourceReport, node: DBTNode,
                        platform: str) -> SchemaMetadata:
    canonical_schema: List[SchemaField] = []
    for column in node.columns:

        description = None

        if (column.comment and column.description
                and column.comment != column.description):
            description = f"{platform} comment: {column.comment}\n\ndbt model description: {column.description}"
        elif column.comment:
            description = column.comment
        elif column.description:
            description = column.description

        globalTags = None
        if column.tags:
            globalTags = GlobalTagsClass(tags=[
                TagAssociationClass(f"urn:li:tag:{tag}") for tag in column.tags
            ])

        field = SchemaField(
            fieldPath=column.name,
            nativeDataType=column.data_type,
            type=get_column_type(report, node.dbt_name, column.data_type),
            description=description,
            nullable=False,  # TODO: actually autodetect this
            recursive=False,
            globalTags=globalTags,
        )

        canonical_schema.append(field)

    last_modified = None
    if node.max_loaded_at is not None:
        actor = "urn:li:corpuser:dbt_executor"
        last_modified = AuditStamp(
            time=int(
                dateutil.parser.parse(node.max_loaded_at).timestamp() * 1000),
            actor=actor,
        )

    return SchemaMetadata(
        schemaName=node.dbt_name,
        platform=f"urn:li:dataPlatform:{platform}",
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        lastModified=last_modified,
        fields=canonical_schema,
    )
Example #14
def get_schema_metadata(
    report: SourceReport, node: DBTNode, platform: str
) -> SchemaMetadata:
    canonical_schema: List[SchemaField] = []
    for column in node.columns:
        field = SchemaField()
        field.fieldPath = column.name
        field.nativeDataType = column.data_type
        field.type = get_column_type(report, node.dbt_name, column.data_type)
        field.description = column.comment

        canonical_schema.append(field)

    actor, sys_time = "urn:li:corpuser:dbt_executor", int(time.time() * 1000)
    schema_metadata = SchemaMetadata(
        schemaName=node.dbt_name,
        platform=f"urn:li:dataPlatform:{platform}",
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        created=AuditStamp(time=sys_time, actor=actor),
        lastModified=AuditStamp(time=sys_time, actor=actor),
        fields=canonical_schema,
    )
    return schema_metadata
Example #15
        def get_schema_metadata(glue_source: GlueSource) -> SchemaMetadata:
            schema = table["StorageDescriptor"]["Columns"]
            fields: List[SchemaField] = []
            for field in schema:
                schema_field = SchemaField(
                    fieldPath=field["Name"],
                    nativeDataType=field["Type"],
                    type=get_column_type(glue_source, field["Type"],
                                         table_name, field["Name"]),
                    description=field.get("Comment"),
                    recursive=False,
                    nullable=True,
                )
                fields.append(schema_field)

            partition_keys = table.get("PartitionKeys", [])
            for partition_key in partition_keys:
                schema_field = SchemaField(
                    fieldPath=partition_key["Name"],
                    nativeDataType=partition_key["Type"],
                    type=get_column_type(
                        glue_source,
                        partition_key["Type"],
                        table_name,
                        partition_key["Name"],
                    ),
                    recursive=False,
                    nullable=False,
                )
                fields.append(schema_field)

            return SchemaMetadata(
                schemaName=table_name,
                version=0,
                fields=fields,
                platform=f"urn:li:dataPlatform:{self.get_underlying_platform()}",
                hash="",
                platformSchema=MySqlDDL(tableSchema=""),
            )
Example #16
def get_schema_metadata(
    sql_report: SQLSourceReport,
    dataset_name: str,
    platform: str,
    columns: List[dict],
    pk_constraints: Optional[dict] = None,
    foreign_keys: Optional[List[ForeignKeyConstraint]] = None,
) -> SchemaMetadata:
    canonical_schema: List[SchemaField] = []

    for column in columns:
        field = SchemaField(
            fieldPath=column["name"],
            type=get_column_type(sql_report, dataset_name, column["type"]),
            nativeDataType=column.get("full_type", repr(column["type"])),
            description=column.get("comment", None),
            nullable=column["nullable"],
            recursive=False,
        )
        if (pk_constraints is not None and isinstance(
                pk_constraints, dict)  # some dialects (hive) return list
                and column["name"] in pk_constraints.get(
                    "constrained_columns", [])):
            field.isPartOfKey = True
        canonical_schema.append(field)

    schema_metadata = SchemaMetadata(
        schemaName=dataset_name,
        platform=f"urn:li:dataPlatform:{platform}",
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        fields=canonical_schema,
    )
    if foreign_keys:
        schema_metadata.foreignKeys = foreign_keys

    return schema_metadata
Example #17
        def emit(self) -> Generator[SchemaField, None, None]:
            if (not isinstance(
                    self._actual_schema,
                (
                    avro.schema.ArraySchema,
                    avro.schema.Field,
                    avro.schema.MapSchema,
                    avro.schema.RecordSchema,
                ),
            ) and self._converter._fields_stack):
                # We are in the context of a non-nested(simple) field or the special-cased union.
                yield from self._converter._gen_from_last_field()
            else:
                # Just emit the SchemaField from schema provided in the Ctor.

                schema = self._schema
                actual_schema = self._actual_schema
                if isinstance(schema, avro.schema.Field):
                    # Field's schema is actually its type.
                    schema = schema.type
                    actual_schema = (self._converter.
                                     _get_underlying_type_if_option_as_union(
                                         schema, schema))

                description = self._description
                if description is None:
                    description = schema.props.get("doc", None)

                native_data_type = self._converter._prefix_name_stack[-1]
                if isinstance(schema,
                              (avro.schema.Field, avro.schema.UnionSchema)):
                    native_data_type = self._converter._prefix_name_stack[-2]
                type_prefix = "[type="
                if native_data_type.startswith(type_prefix):
                    native_data_type = native_data_type[slice(
                        len(type_prefix),
                        len(native_data_type) - 1)]

                field = SchemaField(
                    fieldPath=self._converter._get_cur_field_path(),
                    # Populate it with the simple native type for now.
                    nativeDataType=native_data_type,
                    type=self._converter._get_column_type(actual_schema.type),
                    description=description,
                    recursive=False,
                    nullable=self._converter._is_nullable(schema),
                    isPartOfKey=self._converter._is_key_schema,
                )
                yield field
Example #18
    def emit_upstream_tables(self) -> Iterable[MetadataWorkUnit]:
        for (table_urn, (columns, path, is_embedded)) in self.upstream_tables.items():
            if not is_embedded and not self.config.ingest_tables_external:
                logger.error(
                    f"Skipping external table {table_urn} as ingest_tables_external is set to False"
                )
                continue

            dataset_snapshot = DatasetSnapshot(
                urn=table_urn,
                aspects=[],
            )
            if path:
                # Browse path
                browse_paths = BrowsePathsClass(
                    paths=[f"/{self.config.env.lower()}/{self.platform}/{path}"]
                )
                dataset_snapshot.aspects.append(browse_paths)
            else:
                logger.debug(f"Browse path not set for table {table_urn}")
            schema_metadata = None
            if columns:
                fields = []
                for field in columns:
                    nativeDataType = field.get("remoteType", "UNKNOWN")
                    TypeClass = FIELD_TYPE_MAPPING.get(nativeDataType, NullTypeClass)

                    schema_field = SchemaField(
                        fieldPath=field["name"],
                        type=SchemaFieldDataType(type=TypeClass()),
                        description="",
                        nativeDataType=nativeDataType,
                    )

                    fields.append(schema_field)

                schema_metadata = SchemaMetadata(
                    schemaName="test",
                    platform=f"urn:li:dataPlatform:{self.platform}",
                    version=0,
                    fields=fields,
                    hash="",
                    platformSchema=OtherSchema(rawSchema=""),
                )
            if schema_metadata is not None:
                dataset_snapshot.aspects.append(schema_metadata)

            yield self.get_metadata_change_event(dataset_snapshot)
Example #19
 def _get_fields_and_primary_keys(
     self, looker_view: LookerView
 ) -> Tuple[List[SchemaField], List[str]]:
     fields: List[SchemaField] = []
     primary_keys: List = []
     for field in looker_view.fields:
         schema_field = SchemaField(
             fieldPath=field.name,
             type=self._get_field_type(field.type),
             nativeDataType=field.type,
             description=f"{field.field_type.value}. {field.description}",
         )
         fields.append(schema_field)
         if field.is_primary_key:
             primary_keys.append(schema_field.fieldPath)
     return fields, primary_keys
Example #20
def _recordschema_to_mce_fields(
        schema: avro.schema.RecordSchema) -> List[SchemaField]:
    fields: List[SchemaField] = []

    for parsed_field in schema.fields:
        field = SchemaField(
            fieldPath=parsed_field.name,
            nativeDataType=str(parsed_field.type),
            type=_get_column_type(parsed_field.type),
            description=parsed_field.props.get("doc", None),
            recursive=False,
            nullable=_is_nullable(parsed_field.type),
        )

        fields.append(field)

    return fields
Example #21
def get_table_schema_fields(table: Table, max_rows: int) -> List[SchemaField]:
    table.infer(limit=max_rows)

    fields: List[SchemaField] = []

    for raw_field in table.schema.fields:
        mapped_type: Type = tableschema_type_map.get(raw_field.type,
                                                     NullTypeClass)

        field = SchemaField(
            fieldPath=raw_field.name,
            type=SchemaFieldDataType(mapped_type()),
            nativeDataType=str(raw_field.type),
            recursive=False,
        )
        fields.append(field)

    return fields
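
A usage sketch, assuming the tableschema package provides the Table used above; the CSV path is hypothetical:

# Hedged usage sketch: infer a schema from the first 100 rows of a CSV.
from tableschema import Table

table = Table("data/example.csv")  # hypothetical path
for f in get_table_schema_fields(table, max_rows=100):
    print(f.fieldPath, f.nativeDataType)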
Example #22
def _genericschema_to_mce_fields(
        schema: avro.schema.Schema) -> List[SchemaField]:
    fields: List[SchemaField] = []

    # In the generic (non-RecordSchema) case, only a single SchemaField will be returned
    # and the fieldPath will be set to empty to signal that the type refers to
    # the whole object.
    field = SchemaField(
        fieldPath="",
        nativeDataType=str(schema.type),
        type=_get_column_type(schema.type),
        description=schema.props.get("doc", None),
        recursive=False,
        nullable=_is_nullable(schema),
    )
    fields.append(field)

    return fields
Example #23
    def _get_schema_metadata_for_datasource(
        self, datasource_fields: List[dict]
    ) -> Optional[SchemaMetadata]:
        fields = []
        schema_metadata = None
        for field in datasource_fields:
            # check datasource - custom sql relations from a field being referenced
            self._track_custom_sql_ids(field)

            nativeDataType = field.get("dataType", "UNKNOWN")
            TypeClass = FIELD_TYPE_MAPPING.get(nativeDataType, NullTypeClass)

            schema_field = SchemaField(
                fieldPath=field["name"],
                type=SchemaFieldDataType(type=TypeClass()),
                description=make_description_from_params(
                    field.get("description", ""), field.get("formula")
                ),
                nativeDataType=nativeDataType,
                globalTags=get_tags_from_params(
                    [
                        field.get("role", ""),
                        field.get("__typename", ""),
                        field.get("aggregation", ""),
                    ]
                )
                if self.config.ingest_tags
                else None,
            )
            fields.append(schema_field)

        if fields:
            schema_metadata = SchemaMetadata(
                schemaName="test",
                platform=f"urn:li:dataPlatform:{self.platform}",
                version=0,
                fields=fields,
                hash="",
                platformSchema=OtherSchema(rawSchema=""),
            )

        return schema_metadata
Example #24
def _recordschema_to_mce_fields(
        schema: avro.schema.RecordSchema) -> List[SchemaField]:
    fields: List[SchemaField] = []

    for parsed_field in schema.fields:
        description: Optional[str] = parsed_field.doc
        if parsed_field.has_default:
            description = description if description else "No description available."
            description = f"{description}\nField default value: {parsed_field.default}"
        field = SchemaField(
            fieldPath=parsed_field.name,
            nativeDataType=str(parsed_field.type),
            type=_get_column_type(parsed_field.type),
            description=description,
            recursive=False,
            nullable=_is_nullable(parsed_field.type),
        )

        fields.append(field)

    return fields
Example #25
    def infer_schema(self, file: IO[bytes]) -> List[SchemaField]:
        # infer schema of a parquet file without reading the whole file

        # read only the schema from the parquet footer, via a memory map
        schema = pyarrow.parquet.read_schema(file, memory_map=True)

        fields: List[SchemaField] = []

        for name, pyarrow_type in zip(schema.names, schema.types):
            mapped_type = map_pyarrow_type(pyarrow_type)

            field = SchemaField(
                fieldPath=name,
                type=SchemaFieldDataType(mapped_type()),
                nativeDataType=str(pyarrow_type),
                recursive=False,
            )

            fields.append(field)

        return fields
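
A usage sketch for the Parquet inference above, assuming an instance of the enclosing class; the file path is hypothetical:

# Hedged usage sketch: infer fields from a local parquet file.
with open("data/example.parquet", "rb") as f:  # hypothetical path
    for field in inferrer.infer_schema(f):     # inferrer: instance of the class above
        print(field.fieldPath, field.nativeDataType)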
Example #26
    def emit_upstream_tables(self) -> Iterable[MetadataWorkUnit]:
        for (table_urn, (columns, path)) in self.upstream_tables.items():
            dataset_snapshot = DatasetSnapshot(
                urn=table_urn,
                aspects=[],
            )
            # Browse path
            browse_paths = BrowsePathsClass(
                paths=[f"/{self.config.env.lower()}/{self.platform}/{path}"])
            dataset_snapshot.aspects.append(browse_paths)

            fields = []
            for field in columns:
                nativeDataType = field.get("remoteType", "UNKNOWN")
                TypeClass = FIELD_TYPE_MAPPING.get(nativeDataType,
                                                   NullTypeClass)

                schema_field = SchemaField(
                    fieldPath=field["name"],
                    type=SchemaFieldDataType(type=TypeClass()),
                    description="",
                    nativeDataType=nativeDataType,
                )

                fields.append(schema_field)

            schema_metadata = SchemaMetadata(
                schemaName="test",
                platform=f"urn:li:dataPlatform:{self.platform}",
                version=0,
                fields=fields,
                hash="",
                platformSchema=OtherSchema(rawSchema=""),
            )
            dataset_snapshot.aspects.append(schema_metadata)

            yield self.get_metadata_change_event(dataset_snapshot)
Example #27
def _schema_fields_from_dag(
    graph: nx.DiGraph, is_key_schema: bool
) -> List[SchemaField]:
    generations: List = list(nx.algorithms.dag.topological_generations(graph))
    fields: Dict = {}

    if generations and generations[0]:
        roots = generations[0]
        leafs: List = []
        for node in graph:
            if graph.out_degree(node) == 0:
                leafs.append(node)

        type_of_nodes: Dict = nx.get_node_attributes(graph, "node_type")

        for root in roots:
            root_type = type_of_nodes[root]
            for leaf in leafs:
                paths = list(nx.all_simple_edge_paths(graph, root, leaf))
                if paths:
                    for path in paths:
                        stack: List[str] = ["[version=2.0]"]
                        if is_key_schema:
                            stack.append("[key=True]")
                        stack.append(root_type)
                        if len(roots) > 1:
                            stack.append(re.sub(r"^.*\.", "", root))
                            root_path = ".".join(stack)
                            fields[root_path] = SchemaField(
                                fieldPath=root_path,
                                nativeDataType="message",
                                type=SchemaFieldDataType(type=RecordTypeClass()),
                            )
                        for field in _traverse_path(graph, path, stack):
                            fields[field.path] = field.field

    return sorted(fields.values(), key=lambda sf: sf.fieldPath)
Example #28
        def emit(self) -> Generator[SchemaField, None, None]:
            if (not isinstance(
                    self._actual_schema,
                (
                    avro.schema.ArraySchema,
                    avro.schema.Field,
                    avro.schema.MapSchema,
                    avro.schema.RecordSchema,
                ),
            ) and self._converter._fields_stack):
                # We are in the context of a non-nested(simple) field or the special-cased union.
                yield from self._converter._gen_from_last_field()
            else:
                # Just emit the SchemaField from schema provided in the Ctor.

                schema = self._schema
                actual_schema = self._actual_schema

                if isinstance(schema, avro.schema.Field):
                    # Field's schema is actually its type.
                    schema = schema.type
                    actual_schema = (self._converter.
                                     _get_underlying_type_if_option_as_union(
                                         schema, schema))

                description = self._description
                if description is None:
                    description = schema.props.get("doc", None)

                native_data_type = self._converter._prefix_name_stack[-1]
                if isinstance(schema,
                              (avro.schema.Field, avro.schema.UnionSchema)):
                    native_data_type = self._converter._prefix_name_stack[-2]
                type_prefix = "[type="
                if native_data_type.startswith(type_prefix):
                    native_data_type = native_data_type[slice(
                        len(type_prefix),
                        len(native_data_type) - 1)]
                native_data_type = actual_schema.props.get(
                    "native_data_type", native_data_type)

                field_path = self._converter._get_cur_field_path()
                merged_props = {}
                merged_props.update(self._schema.other_props)
                merged_props.update(schema.other_props)

                tags = None
                if "deprecated" in merged_props:
                    description = (
                        f"<span style=\"color:red\">DEPRECATED: {merged_props['deprecated']}</span>\n"
                        + description)
                    tags = GlobalTagsClass(tags=[
                        TagAssociationClass(tag="urn:li:tag:Deprecated")
                    ])

                field = SchemaField(
                    fieldPath=field_path,
                    # Populate it with the simple native type for now.
                    nativeDataType=native_data_type,
                    type=self._converter._get_column_type(
                        actual_schema.type,
                        actual_schema.props.get("logicalType")),
                    description=description,
                    recursive=False,
                    nullable=self._converter._is_nullable(schema),
                    isPartOfKey=self._converter._is_key_schema,
                    globalTags=tags,
                    jsonProps=json.dumps(merged_props)
                    if merged_props else None,
                )
                yield field
Example #29
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        platform = "mongodb"

        database_names: List[str] = self.mongo_client.list_database_names()

        # traverse databases in sorted order so output is consistent
        for database_name in sorted(database_names):
            if database_name in DENY_DATABASE_LIST:
                continue
            if not self.config.database_pattern.allowed(database_name):
                self.report.report_dropped(database_name)
                continue

            database = self.mongo_client[database_name]
            collection_names: List[str] = database.list_collection_names()

            # traverse collections in sorted order so output is consistent
            for collection_name in sorted(collection_names):
                dataset_name = f"{database_name}.{collection_name}"

                if not self.config.collection_pattern.allowed(dataset_name):
                    self.report.report_dropped(dataset_name)
                    continue

                dataset_urn = f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.config.env})"

                dataset_snapshot = DatasetSnapshot(
                    urn=dataset_urn,
                    aspects=[],
                )

                dataset_properties = DatasetPropertiesClass(
                    tags=[],
                    customProperties={},
                )
                dataset_snapshot.aspects.append(dataset_properties)

                if self.config.enableSchemaInference:
                    assert self.config.maxDocumentSize is not None
                    collection_schema = construct_schema_pymongo(
                        database[collection_name],
                        delimiter=".",
                        use_random_sampling=self.config.useRandomSampling,
                        max_document_size=self.config.maxDocumentSize,
                        is_version_gte_4_4=self.is_server_version_gte_4_4(),
                        sample_size=self.config.schemaSamplingSize,
                    )

                    # initialize the schema for the collection
                    canonical_schema: List[SchemaField] = []
                    max_schema_size = self.config.maxSchemaSize
                    collection_schema_size = len(collection_schema.values())
                    collection_fields: Union[
                        List[SchemaDescription], ValuesView[
                            SchemaDescription]] = collection_schema.values()
                    assert max_schema_size is not None
                    if collection_schema_size > max_schema_size:
                        # downsample the schema, using frequency as the sort key
                        self.report.report_warning(
                            key=dataset_urn,
                            reason=f"Downsampling the collection schema because it has {collection_schema_size} fields. Threshold is {max_schema_size}",
                        )
                        collection_fields = sorted(
                            collection_schema.values(),
                            key=lambda x: x["count"],
                            reverse=True,
                        )[0:max_schema_size]
                        # Add this information to the custom properties so user can know they are looking at downsampled schema
                        dataset_properties.customProperties[
                            "schema.downsampled"] = "True"
                        dataset_properties.customProperties[
                            "schema.totalFields"] = f"{collection_schema_size}"

                    logger.debug(
                        f"Size of collection fields = {len(collection_fields)}"
                    )
                    # append each schema field (sort so output is consistent)
                    for schema_field in sorted(
                            collection_fields,
                            key=lambda x: x["delimited_name"]):
                        field = SchemaField(
                            fieldPath=schema_field["delimited_name"],
                            nativeDataType=self.get_pymongo_type_string(
                                schema_field["type"], dataset_name),
                            type=self.get_field_type(schema_field["type"],
                                                     dataset_name),
                            description=None,
                            nullable=schema_field["nullable"],
                            recursive=False,
                        )
                        canonical_schema.append(field)

                    # create schema metadata object for collection
                    schema_metadata = SchemaMetadata(
                        schemaName=collection_name,
                        platform=f"urn:li:dataPlatform:{platform}",
                        version=0,
                        hash="",
                        platformSchema=SchemalessClass(),
                        fields=canonical_schema,
                    )

                    dataset_snapshot.aspects.append(schema_metadata)

                # TODO: use list_indexes() or index_information() to get index information
                # See https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes.

                mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
                wu = MetadataWorkUnit(id=dataset_name, mce=mce)
                self.report.report_workunit(wu)
                yield wu
Example #30
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        platform = "mongodb"

        database_names: List[str] = self.mongo_client.list_database_names()

        # traverse databases in sorted order so output is consistent
        for database_name in sorted(database_names):
            if database_name in DENY_DATABASE_LIST:
                continue
            if not self.config.database_pattern.allowed(database_name):
                self.report.report_dropped(database_name)
                continue

            database = self.mongo_client[database_name]
            collection_names: List[str] = database.list_collection_names()

            # traverse collections in sorted order so output is consistent
            for collection_name in sorted(collection_names):
                dataset_name = f"{database_name}.{collection_name}"

                if not self.config.collection_pattern.allowed(dataset_name):
                    self.report.report_dropped(dataset_name)
                    continue

                dataset_snapshot = DatasetSnapshot(
                    urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.config.env})",
                    aspects=[],
                )

                dataset_properties = DatasetPropertiesClass(
                    tags=[],
                    customProperties={},
                )
                dataset_snapshot.aspects.append(dataset_properties)

                if self.config.enableSchemaInference:

                    collection_schema = construct_schema_pymongo(
                        database[collection_name],
                        delimiter=".",
                        sample_size=self.config.schemaSamplingSize,
                    )

                    # initialize the schema for the collection
                    canonical_schema: List[SchemaField] = []

                    # append each schema field (sort so output is consistent)
                    for schema_field in sorted(
                        collection_schema.values(), key=lambda x: x["delimited_name"]
                    ):
                        field = SchemaField(
                            fieldPath=schema_field["delimited_name"],
                            nativeDataType=self.get_pymongo_type_string(
                                schema_field["type"], dataset_name
                            ),
                            type=self.get_field_type(
                                schema_field["type"], dataset_name
                            ),
                            description=None,
                            nullable=schema_field["nullable"],
                            recursive=False,
                        )
                        canonical_schema.append(field)

                    # create schema metadata object for collection
                    actor = "urn:li:corpuser:etl"
                    sys_time = int(time.time() * 1000)
                    schema_metadata = SchemaMetadata(
                        schemaName=collection_name,
                        platform=f"urn:li:dataPlatform:{platform}",
                        version=0,
                        hash="",
                        platformSchema=SchemalessClass(),
                        created=AuditStamp(time=sys_time, actor=actor),
                        lastModified=AuditStamp(time=sys_time, actor=actor),
                        fields=canonical_schema,
                    )

                    dataset_snapshot.aspects.append(schema_metadata)

                # TODO: use list_indexes() or index_information() to get index information
                # See https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes.

                mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
                wu = MetadataWorkUnit(id=dataset_name, mce=mce)
                self.report.report_workunit(wu)
                yield wu