Example #1
    def _coerce_decimal(cls, vector: pa.Array, field: pa.Field) -> pa.Array:

        # Loss of precision is allowed, but loss of data is not
        # Arrow will raise an error if cast() results in loss of data

        try:

            # For decimal values, arrow will raise an error on loss of precision
            # Round explicitly to the required scale so there is no loss of precision in cast()
            if pa.types.is_decimal(vector.type):
                rounded = pc.round(vector, ndigits=field.type.scale)  # noqa
                return pc.cast(rounded, field.type)

            # Floats and integers can always be coerced to decimal, so long as there is no data loss
            elif pa.types.is_floating(vector.type) or pa.types.is_integer(
                    vector.type):
                return pc.cast(vector, field.type)

            else:
                error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE,
                                                  vector, field)
                cls.__log.error(error_message)
                raise _ex.EDataConformance(error_message)

        except pa.ArrowInvalid as e:

            error_message = cls._format_error(cls.__E_DATA_LOSS_DID_OCCUR,
                                              vector, field, e)
            cls.__log.error(error_message)
            raise _ex.EDataConformance(error_message) from e
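
The round-then-cast pattern above can be exercised on its own. A minimal sketch, assuming a recent pyarrow where pc.round supports decimal arrays; the values and target type are invented for illustration:

    import decimal
    import pyarrow as pa
    import pyarrow.compute as pc

    values = pa.array([decimal.Decimal("1.23456"), decimal.Decimal("2.5")],
                      type=pa.decimal128(38, 5))
    target = pa.decimal128(38, 2)

    # A direct pc.cast(values, target) would raise ArrowInvalid,
    # because rescaling from scale 5 to scale 2 discards digits
    rounded = pc.round(values, ndigits=2)
    coerced = pc.cast(rounded, target)  # precision was dropped explicitly, so the cast is safe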
Example #2
    def _coerce_timestamp(cls, vector: pa.Array, field: pa.Field) -> pa.Array:

        try:

            if pa.types.is_timestamp(vector.type):

                if not isinstance(field.type, pa.TimestampType):
                    raise _ex.EUnexpected()

                if vector.type.tz != field.type.tz:
                    error_message = cls._format_error(
                        cls.__E_TIMEZONE_DOES_NOT_MATCH, vector, field)
                    cls.__log.error(error_message)
                    raise _ex.EDataConformance(error_message)

                # The cast() function applied to timestamp on Windows does not correctly detect overflows / under-flows
                # To get consistent behavior, this custom implementation casts to int64, the underlying type
                # Then performs the required scaling on the int64 vector, which does throw for overflows
                # Bug exists in Arrow 7.0.0 as of May 2022

                # This also avoids the need for timezone lookup on Windows
                # Although zone conversion is not supported, a tz database is still required
                # When casting timestamps with source and target type in the same zone

                if platform.system().lower().startswith("win"):
                    return cls._coerce_timestamp_windows(vector, field)

                if field.type.unit == "s":
                    rounding_unit = "second"
                elif field.type.unit == "ms":
                    rounding_unit = "millisecond"
                elif field.type.unit == "us":
                    rounding_unit = "microsecond"
                elif field.type.unit == "ns":
                    rounding_unit = "nanosecond"
                else:
                    raise _ex.EUnexpected()

                # Loss of precision is allowed, loss of data is not
                # Rounding will prevent errors in cast() due to loss of precision
                # cast() will fail if the source value is outside the range of the target type

                rounded_vector = pc.round_temporal(vector,
                                                   unit=rounding_unit)  # noqa
                return pc.cast(rounded_vector, field.type)

            else:
                error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE,
                                                  vector, field)
                cls.__log.error(error_message)
                raise _ex.EDataConformance(error_message)

        except pa.ArrowInvalid as e:

            error_message = cls._format_error(cls.__E_DATA_LOSS_DID_OCCUR,
                                              vector, field, e)
            cls.__log.error(error_message)
            raise _ex.EDataConformance(error_message) from e
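
On non-Windows platforms the same round-then-cast approach can be shown in isolation. A minimal sketch with an invented value, assuming a recent pyarrow that provides pc.round_temporal:

    import datetime as dt
    import pyarrow as pa
    import pyarrow.compute as pc

    source = pa.array([dt.datetime(2022, 5, 1, 12, 30, 0, 123456)],
                      type=pa.timestamp("us"))
    target = pa.timestamp("ms")

    # Rounding first removes the sub-millisecond precision,
    # so the safe cast below cannot fail with a precision error
    rounded = pc.round_temporal(source, unit="millisecond")
    coerced = pc.cast(rounded, target)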
Example #3
    def _coerce_float(cls, vector: pa.Array,
                      field: pa.Field) -> pa.FloatingPointArray:

        try:

            # Coercing between float types
            if pa.types.is_floating(vector.type):

                # Casting floats to a wider type is allowed
                # Casting to a narrower type does not raise exceptions when values do not fit
                # So we need an explicit check on which casts are allowed

                source_bit_width = vector.type.bit_width
                target_bit_width = field.type.bit_width

                if source_bit_width == target_bit_width:
                    return vector  # noqa

                # cast() is available for float32 -> float64, but not for float16 -> float32/float64
                elif source_bit_width == 32 and target_bit_width == 64:
                    return pc.cast(vector, field.type)

                elif source_bit_width > target_bit_width:
                    error_message = cls._format_error(
                        cls.__E_DATA_LOSS_WILL_OCCUR, vector, field)
                    cls.__log.error(error_message)
                    raise _ex.EDataConformance(error_message)

            # All integer types can be coerced to float32 or float64
            if pa.types.is_integer(
                    vector.type) and not pa.types.is_float16(field.type):
                return pc.cast(vector, field.type)

            if pa.types.is_integer(
                    vector.type) and vector.type.bit_width <= 16:
                return pc.cast(vector, field.type)

            error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector,
                                              field)
            cls.__log.error(error_message)
            raise _ex.EDataConformance(error_message)

        except pa.ArrowInvalid as e:

            error_message = cls._format_error(cls.__E_DATA_LOSS_DID_OCCUR,
                                              vector, field, e)
            cls.__log.error(error_message)
            raise _ex.EDataConformance(error_message) from e
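
The checks above exist because Arrow performs narrowing float casts without reporting lost values. A small sketch of the casts the method does allow; the types and values are illustrative:

    import pyarrow as pa
    import pyarrow.compute as pc

    f32 = pa.array([1.5, 2.25], type=pa.float32())
    widened = pc.cast(f32, pa.float64())   # float32 -> float64 is always safe

    i64 = pa.array([1, 2, 3], type=pa.int64())
    as_float = pc.cast(i64, pa.float64())  # integer types coerce to a floating target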
Example #4
    def _check_duplicate_fields(cls, schema: pa.Schema, schema_or_table: bool):

        check = {}

        for field in schema.names:
            field_lower = field.lower()
            if field_lower not in check:
                check[field_lower] = []
            check[field_lower].append(field)

        duplicate_fields = dict(
            filter(lambda f_fs: len(f_fs[1]) > 1, check.items()))

        if any(duplicate_fields):

            duplicate_info = []

            for field_lower, fields in duplicate_fields.items():
                if all(map(lambda f: f == fields[0], fields)):
                    duplicate_info.append(f"[{fields[0]}]")
                else:
                    duplicate_info.append(
                        f"[{field_lower}] ({', '.join(fields)} differ only by case)"
                    )

            source = "Schema" if schema_or_table else "Data"

            error_message = f"{source} contains duplicate fields: " + ", ".join(
                duplicate_info)
            cls.__log.error(error_message)
            raise _ex.EDataConformance(error_message)
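
The grouping logic reduces to bucketing field names by their lower-cased form. A simplified, standalone sketch with an invented schema:

    import pyarrow as pa

    schema = pa.schema([("Account_ID", pa.int64()), ("account_id", pa.string())])

    check = {}
    for name in schema.names:
        check.setdefault(name.lower(), []).append(name)

    duplicates = {k: v for k, v in check.items() if len(v) > 1}
    # {'account_id': ['Account_ID', 'account_id']} - differ only by case, so rejected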
Example #5
    def _coerce_boolean(cls, vector: pa.Array,
                        field: pa.Field) -> pa.BooleanArray:

        if pa.types.is_boolean(vector.type):
            return vector  # noqa

        error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector,
                                          field)
        cls.__log.error(error_message)
        raise _ex.EDataConformance(error_message)
Example #6
    def _coerce_integer(cls, vector: pa.Array,
                        field: pa.Field) -> pa.IntegerArray:

        try:

            if pa.types.is_integer(vector.type):
                return pc.cast(vector, field.type)

            else:
                error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE,
                                                  vector, field)
                cls.__log.error(error_message)
                raise _ex.EDataConformance(error_message)

        except pa.ArrowInvalid as e:

            error_message = cls._format_error(cls.__E_DATA_LOSS_DID_OCCUR,
                                              vector, field, e)
            cls.__log.error(error_message)
            raise _ex.EDataConformance(error_message) from e
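
The method leans entirely on Arrow's safe cast, which performs bounds checking when narrowing integers. A quick sketch with illustrative values:

    import pyarrow as pa
    import pyarrow.compute as pc

    small = pa.array([1, 2, 3], type=pa.int64())
    ok = pc.cast(small, pa.int32())    # values fit, the cast succeeds

    big = pa.array([2 ** 40], type=pa.int64())
    try:
        pc.cast(big, pa.int32())       # out of range for int32, safe cast raises
    except pa.ArrowInvalid:
        pass                           # surfaces as EDataConformance in the method above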
Example #7
    def read_table(self, source: tp.BinaryIO,
                   schema: tp.Optional[pa.Schema]) -> pa.Table:

        # For CSV data, if there is no schema then type inference will do unpredictable things!

        if schema is None or len(schema.names) == 0 or len(schema.types) == 0:
            raise _ex.EDataConformance(
                "An explicit schema is required to load CSV data")

        if self._use_lenient_parser:
            return self._read_table_lenient(source, schema)
        else:
            return self._read_table_arrow(source, schema)
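
Typical usage, as a hedged sketch: csv_codec below stands in for an instance of the enclosing class, and the schema and payload are invented for illustration:

    import io
    import pyarrow as pa

    schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
    raw = io.BytesIO(b"id,name\n1,alice\n2,bob\n")

    table = csv_codec.read_table(raw, schema)  # an explicit schema is mandatory for CSV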
Example #8
    def _coerce_vector(cls,
                       vector: pa.Array,
                       field: pa.Field,
                       pandas_type=None) -> pa.Array:

        if pa.types.is_null(vector.type):

            if field.nullable:
                # Produce an all-null vector of the required length in the target type
                return pa.nulls(len(vector), type=field.type)
            else:
                raise _ex.EDataConformance(
                    f"All null values in non-null field [{field.name}]")

        if pa.types.is_boolean(field.type):
            return cls._coerce_boolean(vector, field)

        if pa.types.is_integer(field.type):
            return cls._coerce_integer(vector, field)

        if pa.types.is_floating(field.type):
            return cls._coerce_float(vector, field)

        if pa.types.is_decimal(field.type):
            return cls._coerce_decimal(vector, field)

        if pa.types.is_string(field.type) or pa.types.is_large_string(
                field.type):
            return cls._coerce_string(vector, field)

        if pa.types.is_date(field.type):
            return cls._coerce_date(vector, field, pandas_type)

        if pa.types.is_timestamp(field.type):
            return cls._coerce_timestamp(vector, field)

        error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector,
                                          field)
        cls.__log.error(error_message)
        raise _ex.EDataConformance(error_message)
Example #9
    def _coerce_string(cls, vector: pa.Array, field: pa.Field) -> pa.Array:

        if pa.types.is_string(field.type):
            if pa.types.is_string(vector.type):
                return vector

        if pa.types.is_large_string(field.type):
            if pa.types.is_large_string(vector.type):
                return vector
            # Allow up-casting string -> large_string
            if pa.types.is_string(vector.type):
                return pc.cast(vector, field.type)

        error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector,
                                          field)
        cls.__log.error(error_message)

        raise _ex.EDataConformance(error_message)
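
The only real conversion here is the widening from string to large_string, which a standalone call can demonstrate (values invented):

    import pyarrow as pa
    import pyarrow.compute as pc

    small = pa.array(["a", "b"], type=pa.string())
    large = pc.cast(small, pa.large_string())  # string -> large_string up-cast is allowed

    # The reverse direction is rejected by the method above, because large_string
    # data may exceed the 32-bit offsets of the regular string type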
Example #10
    def _coerce_date(cls,
                     vector: pa.Array,
                     field: pa.Field,
                     pandas_type=None) -> pa.Array:

        # Allow casting date32 -> date64, both range and precision are greater so there is no data loss
        if pa.types.is_date(vector.type):
            if field.type.bit_width >= vector.type.bit_width:
                return pc.cast(vector, field.type)

        # Special handling for Pandas/NumPy date values
        # These are encoded as np.datetime64[ns] in Pandas -> pa.timestamp('ns') in Arrow
        # Only allow this conversion if the vector is coming from Pandas with datetime type
        if pandas_type == DataMapping.pandas_datetime_type():
            if pa.types.is_timestamp(vector.type) and vector.type.unit == "ns":
                return pc.cast(vector, field.type)

        error_message = cls._format_error(cls.__E_WRONG_DATA_TYPE, vector,
                                          field)
        cls.__log.error(error_message)

        raise _ex.EDataConformance(error_message)
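
The date32 -> date64 widening case can be checked on its own. A minimal sketch with an invented value:

    import datetime as dt
    import pyarrow as pa
    import pyarrow.compute as pc

    d32 = pa.array([dt.date(2022, 5, 1)], type=pa.date32())
    d64 = pc.cast(d32, pa.date64())  # wider range and precision, so no data loss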
Example #11
    def _convert_python_value(self, raw_value: tp.Any, python_type: type,
                              row: int, col: int) -> tp.Any:

        try:

            if raw_value is None:
                return None

            if isinstance(raw_value, python_type):
                return raw_value

            if python_type == bool:
                if isinstance(raw_value, str):
                    if raw_value.lower() in self.__TRUE_VALUES:
                        return True
                    if raw_value.lower() in self.__FALSE_VALUES:
                        return False
                if isinstance(raw_value, int) or isinstance(raw_value, float):
                    if raw_value == 1:
                        return True
                    if raw_value == 0:
                        return False

            if python_type == int:
                if isinstance(raw_value, float):
                    return int(raw_value)
                if isinstance(raw_value, str):
                    return int(raw_value)

            if python_type == float:
                if isinstance(raw_value, int):
                    return float(raw_value)
                if isinstance(raw_value, str):
                    return float(raw_value)

            if python_type == decimal.Decimal:
                if isinstance(raw_value, int):
                    return decimal.Decimal.from_float(float(raw_value))
                if isinstance(raw_value, float):
                    return decimal.Decimal.from_float(raw_value)
                if isinstance(raw_value, str):
                    return decimal.Decimal(raw_value)

            if python_type == str:
                return str(raw_value)

            if python_type == dt.date:
                if isinstance(raw_value, str):
                    return dt.date.fromisoformat(raw_value)

            if python_type == dt.datetime:
                if isinstance(raw_value, str):
                    return dt.datetime.fromisoformat(raw_value)

            msg = f"CSV data does not match the schema and cannot be converted" \
                + f" (row = {row}, col = {col}, expected type = {python_type}, value = [{str(raw_value)}])"

            self._log.error(msg)
            raise _ex.EDataConformance(msg)

        except Exception as e:

            msg = f"CSV data does not match the schema and cannot be converted" \
                + f" (row = {row}, col = {col}, expected type = {python_type}, value = [{str(raw_value)}])" \
                + f": {str(e)}"

            self._log.exception(msg)
            raise _ex.EDataConformance(msg) from e
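
Most branches delegate to standard-library constructors; a standalone sketch of the same conversions, with invented cell values:

    import datetime as dt
    import decimal

    int("42")                                         # str -> int
    float("3.14")                                     # str -> float
    decimal.Decimal("1.50")                           # str -> Decimal, no float round-trip
    dt.date.fromisoformat("2022-05-01")               # ISO date string -> date
    dt.datetime.fromisoformat("2022-05-01T12:30:00")  # ISO datetime string -> datetime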
Example #12
    def _read_table_lenient(self, source: tp.BinaryIO,
                            schema: tp.Optional[pa.Schema]) -> pa.Table:

        try:

            stream_reader = codecs.getreader('utf-8')
            text_source = stream_reader(source)

            csv_params = {"skipinitialspace": True, "doublequote": True}

            csv_reader = csv.reader(text_source, **csv_params)
            header = next(csv_reader)

            header_lower = list(map(str.lower, header))
            missing_columns = list(
                filter(lambda col_: col_.lower() not in header_lower,
                       schema.names))

            if any(missing_columns):
                msg = f"CSV data is missing one or more columns: [{', '.join(missing_columns)}]"
                self._log.error(msg)
                raise _ex.EDataConformance(msg)

            schema_columns = {
                col.lower(): index
                for index, col in enumerate(schema.names)
            }
            col_mapping = [schema_columns.get(col) for col in header_lower]
            python_types = list(
                map(_data.DataMapping.arrow_to_python_type, schema.types))

            data = [[] for _ in range(len(schema.names))]
            csv_row = 1  # Allowing for header
            csv_col = 0

            for row in csv_reader:

                for raw_value in row:

                    output_col = col_mapping[csv_col]

                    if output_col is not None:
                        python_type = python_types[output_col]
                        python_value = self._convert_python_value(
                            raw_value, python_type, csv_row, csv_col)
                        data[output_col].append(python_value)

                    csv_col += 1

                csv_col = 0
                csv_row += 1

            data_dict = dict(zip(schema.names, data))
            table = pa.Table.from_pydict(data_dict, schema)  # noqa

            return table

        except UnicodeDecodeError as e:
            err = f"CSV decoding failed, content is garbled"
            self._log.exception(err)
            raise _ex.EDataCorruption(err) from e
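
The case-insensitive column mapping at the heart of the lenient reader can be illustrated separately; the header and schema names below are invented:

    schema_names = ["id", "Name", "Amount"]
    header = ["AMOUNT", "id", "extra", "name"]

    schema_columns = {col.lower(): index for index, col in enumerate(schema_names)}
    col_mapping = [schema_columns.get(col.lower()) for col in header]
    # [2, 0, None, 1] - 'extra' has no target column, so its values are ignored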
Example #13
    def conform_to_schema(
            cls, table: pa.Table, schema: pa.Schema,
            pandas_types=None, warn_extra_columns=True) \
            -> pa.Table:
        """
        Align an Arrow table to an Arrow schema.

        Columns are matched case-insensitively, and columns not present in the schema will be dropped.
        The resulting table will have the field order and case defined in the schema.

        Where column types do not match exactly, type coercion will be applied if possible.
        In some cases type coercion may result in overflows,
        for example casting int64 -> int32 will fail if any values are greater than the maximum int32 value.

        If the incoming data has been converted from Pandas, there are some conversions that can be applied
        if the original Pandas dtype is known. These dtypes can be supplied via the pandas_types parameter
        and should line up with the data in the table (i.e. dtypes are for the source data, not the target schema).

        The method will return a dataset whose schema exactly matches the requested schema.
        If it is not possible to make the data conform to the schema for any reason, EDataConformance will be raised.

        :param table: The data to be conformed
        :param schema: The schema to conform to
        :param pandas_types: Pandas dtypes for the table, if the table has been converted from Pandas
        :param warn_extra_columns: Whether to log warnings if the table contains columns not in the schema
        :return: The conformed data, whose schema will exactly match the supplied schema parameter
        :raises: _ex.EDataConformance if conformance is not possible for any reason
        """

        # If Pandas types are supplied they must match the table, i.e. table has been converted from Pandas
        if pandas_types is not None and len(pandas_types) != len(
                table.schema.types):
            raise _ex.EUnexpected()

        cls._check_duplicate_fields(schema, True)
        cls._check_duplicate_fields(table.schema, False)

        table_indices = {
            f.lower(): i
            for (i, f) in enumerate(table.schema.names)
        }
        conformed_data = []
        conformance_errors = []

        # Coerce types to match expected schema where possible
        for schema_index in range(len(schema.names)):

            try:
                schema_field = schema.field(schema_index)
                table_index = table_indices.get(schema_field.name.lower())

                if table_index is None:
                    message = cls.__E_FIELD_MISSING.format(
                        field_name=schema_field.name)
                    cls.__log.error(message)
                    raise _ex.EDataConformance(message)

                table_column: pa.Array = table.column(table_index)

                pandas_type = pandas_types[table_index] \
                    if pandas_types is not None \
                    else None

                if table_column.type == schema_field.type:
                    conformed_column = table_column
                else:
                    conformed_column = cls._coerce_vector(
                        table_column, schema_field, pandas_type)

                if not schema_field.nullable and table_column.null_count > 0:
                    message = f"Null values present in non-null field [{schema_field.name}]"
                    cls.__log.error(message)
                    raise _ex.EDataConformance(message)

                conformed_data.append(conformed_column)

            except _ex.EDataConformance as e:
                conformance_errors.append(e)

        # Columns not defined in the schema will not be included in the conformed output
        if warn_extra_columns and table.num_columns > len(schema.types):

            schema_columns = set(map(str.lower, schema.names))
            extra_columns = [
                f"[{col}]" for col in table.schema.names
                if col.lower() not in schema_columns
            ]

            message = f"Columns not defined in the schema will be dropped: {', '.join(extra_columns)}"
            cls.__log.warning(message)

        if any(conformance_errors):
            if len(conformance_errors) == 1:
                raise conformance_errors[0]
            else:
                cls.__log.error("There were multiple data conformance errors")
                raise _ex.EDataConformance(
                    "There were multiple data conformance errors",
                    conformance_errors)

        return pa.Table.from_arrays(conformed_data, schema=schema)  # noqa
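
A usage sketch for the method above. The class name DataConformance and the sample schema and table are assumptions for illustration; any table whose columns match the schema case-insensitively and are coercible will do:

    import pyarrow as pa

    schema = pa.schema([("account_id", pa.int64()), ("balance", pa.decimal128(38, 2))])

    table = pa.table({
        "Account_ID": pa.array([1, 2], type=pa.int32()),     # int32 -> int64 widens
        "BALANCE": pa.array([10.5, 20.25], type=pa.float64()),  # float -> decimal coerces
        "comment": pa.array(["extra column", "will be dropped"]),
    })

    conformed = DataConformance.conform_to_schema(table, schema)
    assert conformed.schema == schema  # names, order, case and types all match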