Ejemplo n.º 1
0
    def encode_value(cls, value: tp.Any) -> _meta.Value:
        """Encode a native Python value as a TRAC metadata Value.

        :raises ETracInternal: If the value is None or its type is not supported
        """

        if value is None:
            raise _ex.ETracInternal("Cannot encode a null value")

        # Ordered (python type, basic type, value factory) triples
        # Order matters: bool is a subclass of int, and dt.datetime is a
        # subclass of dt.date, so the more specific types must be tested first
        encoders = [
            (bool, _meta.BasicType.BOOLEAN,
             lambda td, v: _meta.Value(td, booleanValue=v)),
            (int, _meta.BasicType.INTEGER,
             lambda td, v: _meta.Value(td, integerValue=v)),
            (float, _meta.BasicType.FLOAT,
             lambda td, v: _meta.Value(td, floatValue=v)),
            (decimal.Decimal, _meta.BasicType.DECIMAL,
             lambda td, v: _meta.Value(td, decimalValue=_meta.DecimalValue(str(v)))),
            (str, _meta.BasicType.STRING,
             lambda td, v: _meta.Value(td, stringValue=v)),
            (dt.datetime, _meta.BasicType.DATETIME,
             lambda td, v: _meta.Value(td, datetimeValue=_meta.DatetimeValue(v.isoformat()))),
            (dt.date, _meta.BasicType.DATE,
             lambda td, v: _meta.Value(td, dateValue=_meta.DateValue(v.isoformat()))),
        ]

        for python_type, basic_type, make_value in encoders:
            if isinstance(value, python_type):
                return make_value(_meta.TypeDescriptor(basic_type), value)

        raise _ex.ETracInternal(f"Encoding value type [{type(value)}] is not supported yet")
Ejemplo n.º 2
0
    def view_to_pandas(cls, view: DataView, part: DataPartKey) -> pd.DataFrame:
        """Collapse all deltas for one part of a data view into a pandas DataFrame.

        :param view: The data view to read from (its arrow_schema must be set)
        :param part: The part of the view to materialize
        :raises ETracInternal: If the view schema is not set or the part holds no deltas
        """

        deltas = view.parts.get(part)

        # Sanity checks

        if not view.arrow_schema:
            raise _ex.ETracInternal("Data view schema not set")

        if not deltas:
            raise _ex.ETracInternal(
                f"Data view for part [{part.opaque_key}] does not contain any items"
            )

        # Single-delta fast path, no need to re-assemble batches
        if len(deltas) == 1:
            return cls.item_to_pandas(deltas[0])

        # Bug fix: collect batches into a list, not a set - a set has no defined
        # ordering (so rows could be shuffled across deltas) and would silently
        # drop duplicate record batches
        batches = [
            batch
            for delta in deltas
            for batch in (delta.batches if delta.batches else delta.table.to_batches())
        ]

        table = pa.Table.from_batches(batches)  # noqa
        return table.to_pandas()
Ejemplo n.º 3
0
    def decode_value(value: _meta.Value) -> tp.Any:
        """Decode a TRAC metadata Value into the corresponding native Python value.

        :raises ETracInternal: If the value is missing/invalid, has no type
            information, or uses an unsupported basic type
        """

        if value is None or not isinstance(value, _meta.Value):
            raise _ex.ETracInternal()

        type_info = value.type

        if type_info is None or \
           type_info.basicType is None or \
           type_info.basicType == _meta.BasicType.BASIC_TYPE_NOT_SET:

            raise _ex.ETracInternal("Missing type information")

        basic_type = type_info.basicType

        # Dispatch table, one decoder per supported basic type
        decoders = {
            _meta.BasicType.BOOLEAN: lambda v: v.booleanValue,
            _meta.BasicType.INTEGER: lambda v: v.integerValue,
            _meta.BasicType.FLOAT: lambda v: v.floatValue,
            _meta.BasicType.DECIMAL: lambda v: decimal.Decimal(v.decimalValue.decimal),
            _meta.BasicType.STRING: lambda v: v.stringValue,
            _meta.BasicType.DATE: lambda v: dt.date.fromisoformat(v.dateValue.isoDate),
            _meta.BasicType.DATETIME: lambda v: dt.datetime.fromisoformat(v.datetimeValue.isoDatetime),
        }

        decoder = decoders.get(basic_type)

        if decoder is None:
            raise _ex.ETracInternal(f"Decoding value type [{basic_type}] is not supported yet")

        return decoder(value)
Ejemplo n.º 4
0
    def lookup(self, node_id: NodeId[__T]) -> __T:
        """Look up the completed result of a node in the execution context.

        Uses internal errors if any of the checks fail on a result lookup;
        the engine should guarantee these conditions hold before a node runs.
        """

        engine_node = self.__nodes.get(node_id)

        # All failure messages share the same "node in context" shape
        def node_error(detail):
            return _ex.ETracInternal(
                f"Node [{node_id.name}] {detail} in execution context [{node_id.namespace}]")

        if engine_node is None:
            raise node_error("does not exist")

        if not engine_node.complete:
            raise node_error("still pending")

        if engine_node.error:
            raise node_error("failed")

        if not NodeProcessor.result_matches_type(engine_node.result, node_id.result_type):

            expected_type = node_id.result_type or type(None)
            result_type = type(engine_node.result)

            err = f"Wrong type for node [{node_id.name}] in execution context [{node_id.namespace}]" \
                + f" (expected [{expected_type}], got [{result_type}])"

            raise _ex.ETracInternal(err)

        return engine_node.result
Ejemplo n.º 5
0
    def arrow_to_python_type(arrow_type: pa.DataType) -> type:
        """Map an Arrow data type to the equivalent native Python type.

        :raises ETracInternal: If no mapping is available for the Arrow type
        """

        # Ordered (predicate, python type) pairs, checked in turn
        type_checks = [
            (pa.types.is_boolean, bool),
            (pa.types.is_integer, int),
            (pa.types.is_floating, float),
            (pa.types.is_decimal, decimal.Decimal),
            (pa.types.is_string, str),
            (pa.types.is_date, dt.date),
            (pa.types.is_timestamp, dt.datetime),
        ]

        for type_check, python_type in type_checks:
            if type_check(arrow_type):
                return python_type

        raise _ex.ETracInternal(
            f"No Python type mapping available for Arrow type [{arrow_type}]")
Ejemplo n.º 6
0
    def _update_results(self, updates: tp.Dict[NodeId, _EngineNode]):
        """Apply a batch of node updates to the engine graph and re-check job status.

        Each updated node is removed from the active (or pending) set and added
        to either the failed or succeeded set, and any processor registered for
        the node is stopped. A new graph context is then built from the updated
        state and installed before the overall job status is re-evaluated.
        """

        # Overlay the updated nodes on top of the current node map
        nodes = {**self.graph.nodes, **updates}

        # Copy the state sets so the current graph object is not mutated in place
        pending_nodes = cp.copy(self.graph.pending_nodes)
        active_nodes = cp.copy(self.graph.active_nodes)
        succeeded_nodes = cp.copy(self.graph.succeeded_nodes)
        failed_nodes = cp.copy(self.graph.failed_nodes)

        for node_id, node in updates.items():

            # An updated node must have been active or pending; anything else
            # indicates an inconsistency in the engine state
            if node_id in active_nodes:
                active_nodes.remove(node_id)
            elif node_id in pending_nodes:
                pending_nodes.remove(
                    node_id
                )  # TODO: check pending node ID is part of main node id bundle
            else:
                raise _ex.ETracInternal()

            # Record the node outcome
            if node.error:
                failed_nodes.add(node_id)
            else:
                succeeded_nodes.add(node_id)

            # Stop the processor actor for this node, if one is registered
            if node_id in self.processors:
                node_ref = self.processors.pop(node_id)
                self.actors().stop(node_ref)

        # Install the new graph context, then re-evaluate the job as a whole
        graph = _EngineContext(nodes, pending_nodes, active_nodes,
                               succeeded_nodes, failed_nodes)

        self.graph = graph
        self.check_job_status()
Ejemplo n.º 7
0
    def python_to_arrow_type(cls, python_type: type) -> pa.DataType:
        """Map a native Python type to the Arrow data type used to represent it.

        :raises ETracInternal: If no mapping is available for the Python type
        """

        # Simple types map straight to a no-arg Arrow type factory
        simple_factories = {
            bool: pa.bool_,
            int: pa.int64,
            float: pa.float64,
            str: pa.utf8,
            dt.date: pa.date32,
        }

        factory = simple_factories.get(python_type)

        if factory is not None:
            return factory()

        # Decimal and timestamp types take TRAC's standard precision / timezone settings
        if python_type == decimal.Decimal:
            return pa.decimal128(cls.__TRAC_DECIMAL_PRECISION,
                                 cls.__TRAC_DECIMAL_SCALE)

        if python_type == dt.datetime:
            return pa.timestamp(cls.__TRAC_TIMESTAMP_UNIT,
                                cls.__TRAC_TIMESTAMP_ZONE)

        raise _ex.ETracInternal(
            f"No Arrow type mapping available for Python type [{python_type}]")
Ejemplo n.º 8
0
    def trac_to_python_basic_type(cls, trac_basic_type: _meta.BasicType) -> type:
        """Look up the native Python type for a TRAC basic type.

        :raises ETracInternal: If no mapping is available for the TRAC type
        """

        python_type = cls.__TRAC_TO_PYTHON_BASIC_TYPE.get(trac_basic_type)

        if python_type is not None:
            return python_type

        raise _ex.ETracInternal(f"No Python type mapping available for TRAC type [{trac_basic_type}]")
Ejemplo n.º 9
0
    def python_to_trac_basic_type(cls, python_type: type) -> _meta.BasicType:
        """Look up the TRAC basic type for a native Python type.

        :raises ETracInternal: If no mapping is available for the Python type
        """

        basic_type = cls.__PYTHON_TO_TRAC_BASIC_TYPE.get(python_type)

        if basic_type is not None:
            return basic_type

        raise _ex.ETracInternal(f"No TRAC type mapping available for Python type [{python_type}]")
Ejemplo n.º 10
0
    def convert_boolean_value(raw_value: tp.Any) -> _meta.Value:
        """Convert a raw value to a TRAC BOOLEAN value (no implicit coercion).

        :raises ETracInternal: If the raw value is not a bool
        """

        if not isinstance(raw_value, bool):
            msg = f"Value of type [{type(raw_value)}] cannot be converted to {_meta.BasicType.BOOLEAN.name}"
            raise _ex.ETracInternal(msg)

        type_desc = _meta.TypeDescriptor(_meta.BasicType.BOOLEAN)
        return _meta.Value(type_desc, booleanValue=raw_value)
Ejemplo n.º 11
0
    def _check_result_type(self, result):
        """Raise an internal error if a node result does not match its declared type.

        Node functions should only ever return the expected type, so a mismatch
        is reported as an internal error rather than a user-facing one.
        """

        expected_type = self.node.node.id.result_type or self.__NONE_TYPE
        result_type = type(result)

        if self.result_matches_type(result, expected_type):
            return

        err = (
            f"Node result is the wrong type, "
            f"expected [{expected_type.__name__}], got [{result_type.__name__}]")

        raise _ex.ETracInternal(err)
Ejemplo n.º 12
0
    def trac_to_arrow_basic_type(
            cls, trac_basic_type: _meta.BasicType) -> pa.DataType:
        """Look up the Arrow data type for a TRAC basic type.

        :raises ETracInternal: If no mapping is available for the TRAC type
        """

        arrow_type = cls.__TRAC_TO_ARROW_BASIC_TYPE_MAPPING.get(trac_basic_type)

        if arrow_type is not None:
            return arrow_type

        raise _ex.ETracInternal(
            f"No Arrow type mapping available for TRAC type [{trac_basic_type}]"
        )
Ejemplo n.º 13
0
    def convert_decimal_value(raw_value: tp.Any) -> _meta.Value:
        """Convert a raw value to a TRAC DECIMAL value.

        Accepts decimal.Decimal directly; int and float values are coerced
        via their string representation.

        :raises ETracInternal: If the raw value cannot be converted
        """

        type_desc = _meta.TypeDescriptor(_meta.BasicType.DECIMAL)

        if isinstance(raw_value, decimal.Decimal):
            return _meta.Value(type_desc, decimalValue=_meta.DecimalValue(str(raw_value)))

        # Bug fix: bool is a subclass of int, so the original int check accepted
        # bools and stored str(True) / str(False), which is not a parseable
        # decimal representation - reject bools explicitly instead
        if isinstance(raw_value, (int, float)) and not isinstance(raw_value, bool):
            return _meta.Value(type_desc, decimalValue=_meta.DecimalValue(str(raw_value)))

        msg = f"Value of type [{type(raw_value)}] cannot be converted to {_meta.BasicType.DECIMAL.name}"
        raise _ex.ETracInternal(msg)
Ejemplo n.º 14
0
    def convert_float_value(raw_value: tp.Any) -> _meta.Value:
        """Convert a raw value to a TRAC FLOAT value, widening int if needed.

        :raises ETracInternal: If the raw value cannot be converted
        """

        type_desc = _meta.TypeDescriptor(_meta.BasicType.FLOAT)

        if isinstance(raw_value, (float, int)):
            # Floats pass through unchanged, ints are widened to float
            float_value = raw_value if isinstance(raw_value, float) else float(raw_value)
            return _meta.Value(type_desc, floatValue=float_value)

        msg = f"Value of type [{type(raw_value)}] cannot be converted to {_meta.BasicType.FLOAT.name}"
        raise _ex.ETracInternal(msg)
Ejemplo n.º 15
0
    def convert_integer_value(raw_value: tp.Any) -> _meta.Value:
        """Convert a raw value to a TRAC INTEGER value.

        Floats are accepted only when they hold an exact whole number.

        :raises ETracInternal: If the raw value cannot be converted
        """

        type_desc = _meta.TypeDescriptor(_meta.BasicType.INTEGER)

        integer_value = None

        if isinstance(raw_value, int):
            integer_value = raw_value
        elif isinstance(raw_value, float) and raw_value.is_integer():
            integer_value = int(raw_value)

        if integer_value is not None:
            return _meta.Value(type_desc, integerValue=integer_value)

        msg = f"Value of type [{type(raw_value)}] cannot be converted to {_meta.BasicType.INTEGER.name}"
        raise _ex.ETracInternal(msg)
Ejemplo n.º 16
0
    def convert_datetime_value(raw_value: tp.Any) -> _meta.Value:
        """Convert a raw value to a TRAC DATETIME value.

        Accepts dt.datetime directly, or an ISO-format string which is parsed
        and re-normalized through isoformat().

        :raises ETracInternal: If the raw value cannot be converted
        """

        type_desc = _meta.TypeDescriptor(_meta.BasicType.DATETIME)

        if isinstance(raw_value, dt.datetime):
            datetime_value = raw_value
        elif isinstance(raw_value, str):
            datetime_value = dt.datetime.fromisoformat(raw_value)
        else:
            msg = f"Value of type [{type(raw_value)}] cannot be converted to {_meta.BasicType.DATETIME.name}"
            raise _ex.ETracInternal(msg)

        iso_datetime = datetime_value.isoformat()
        return _meta.Value(type_desc, datetimeValue=_meta.DatetimeValue(isoDatetime=iso_datetime))
Ejemplo n.º 17
0
    def trac_to_arrow_schema(cls,
                             trac_schema: _meta.SchemaDefinition) -> pa.Schema:
        """Build an Arrow schema from a TRAC TABLE schema definition.

        :raises ETracInternal: If the schema is not a TABLE schema
        """

        # Only tabular schemas have an Arrow representation
        if trac_schema.schemaType != _meta.SchemaType.TABLE:
            raise _ex.ETracInternal(
                f"Schema type [{trac_schema.schemaType}] cannot be converted for Apache Arrow"
            )

        arrow_fields = []
        for field in trac_schema.table.fields:
            arrow_type = cls.trac_to_arrow_basic_type(field.fieldType)
            arrow_fields.append((field.fieldName, arrow_type))

        return pa.schema(arrow_fields, metadata={})
Ejemplo n.º 18
0
    def item_to_pandas(cls, item: DataItem) -> pd.DataFrame:
        """Materialize a single data item as a pandas DataFrame.

        Precedence is pandas content, then an Arrow table, then record batches.

        :raises ETracInternal: If the item holds no usable data
        """

        # Copy so the caller cannot mutate the item's stored dataframe
        if item.pandas is not None:
            return item.pandas.copy()

        if item.table is not None:
            return cls.arrow_to_pandas(item.table)

        if item.batches is not None:
            assembled = pa.Table.from_batches(item.batches, item.schema)  # noqa
            return cls.arrow_to_pandas(assembled)

        raise _ex.ETracInternal(f"Data item does not contain any usable data")
Ejemplo n.º 19
0
    def convert_string_value(raw_value: tp.Any) -> _meta.Value:
        """Convert a raw value to a TRAC STRING value.

        bool, int, float and decimal values are stringified via str().

        :raises ETracInternal: If the raw value cannot be converted
        """

        type_desc = _meta.TypeDescriptor(_meta.BasicType.STRING)

        if isinstance(raw_value, str):
            return _meta.Value(type_desc, stringValue=raw_value)

        stringifiable_types = (bool, int, float, decimal.Decimal)

        if isinstance(raw_value, stringifiable_types):
            return _meta.Value(type_desc, stringValue=str(raw_value))

        msg = f"Value of type [{type(raw_value)}] cannot be converted to {_meta.BasicType.STRING.name}"
        raise _ex.ETracInternal(msg)
Ejemplo n.º 20
0
    def read_table(
            self, storage_path: str, storage_format: str,
            schema: tp.Optional[pa.Schema],
            storage_options: tp.Dict[str, tp.Any] = None) \
            -> pa.Table:
        """Read a table from file storage as an Arrow table.

        :param storage_path: Path of the file (or single-file directory) to read
        :param storage_format: Format code used to look up the format implementation
        :param schema: Optional schema to conform the data to after reading
        :param storage_options: Format-specific options passed to the format implementation
        :raises: EStorage / EData with added context on storage or data errors,
            ETracInternal for unexpected errors
        """

        try:

            format_impl = FormatManager.get_data_format(
                storage_format, storage_options)

            stat = self.__file_storage.stat(storage_path)

            # A directory is only readable if it contains exactly one entry;
            # full directory storage formats are not implemented yet
            if stat.file_type == FileType.DIRECTORY:

                dir_content = self.__file_storage.ls(storage_path)

                if len(dir_content) == 1:
                    storage_path = storage_path.rstrip(
                        "/\\") + "/" + dir_content[0]
                else:
                    raise NotImplementedError(
                        "Directory storage format not available yet")

            with self.__file_storage.read_byte_stream(
                    storage_path) as byte_stream:
                table = format_impl.read_table(byte_stream, schema)

            if schema is not None:
                # Apply conformance, in case the format was not able to apply it fully on read
                # It is fine to silently ignore extra columns of an input
                return _data.DataConformance.conform_to_schema(
                    table, schema, warn_extra_columns=False)
            else:
                return table

        except (_ex.EStorage, _ex.EData) as e:
            # Re-raise storage / data errors with added context, keeping the error type
            err = f"Failed to read table [{storage_path}]: {str(e)}"
            self.__log.error(err)
            raise type(e)(err) from e

        except Exception as e:
            # Anything unexpected is logged in full and wrapped as an internal error
            err = f"Failed to read table [{storage_path}]: An unexpected error occurred"
            self.__log.error(err)
            self.__log.exception(str(e))
            raise _ex.ETracInternal(err) from e
Ejemplo n.º 21
0
    def write_table(self,
                    storage_path: str,
                    storage_format: str,
                    table: pa.Table,
                    storage_options: tp.Dict[str, tp.Any] = None,
                    overwrite: bool = False):
        """Write an Arrow table to file storage in the requested format.

        :param storage_path: Target path; if it lacks the format's extension it is
            treated as a directory and a single chunk file is written inside it
        :param storage_format: Format code used to look up the format implementation
        :param table: The Arrow table to write
        :param storage_options: Format-specific options passed to the format implementation
        :param overwrite: Whether existing content at the target may be replaced
        :raises: EStorage / EData with added context on storage or data errors,
            ETracInternal for unexpected errors
        """

        try:

            format_impl = FormatManager.get_data_format(
                storage_format, storage_options)
            format_extension = FormatManager.extension_for_format(
                storage_format)

            # TODO: Full handling of directory storage formats

            # A path without the format extension is treated as a directory
            # target, holding a single chunk file
            if not storage_path.endswith(format_extension):
                parent_dir_ = storage_path
                storage_path_ = storage_path.rstrip(
                    "/\\") + f"/chunk-0.{format_extension}"
                self.__file_storage.mkdir(parent_dir_,
                                          True,
                                          exists_ok=overwrite)
            else:
                parent_dir_ = str(pathlib.PurePath(storage_path).parent)
                storage_path_ = storage_path
                self.__file_storage.mkdir(parent_dir_, True, True)

            with self.__file_storage.write_byte_stream(
                    storage_path_, overwrite=overwrite) as byte_stream:
                format_impl.write_table(byte_stream, table)

        except (_ex.EStorage, _ex.EData) as e:
            # Re-raise storage / data errors with added context, keeping the error type
            err = f"Failed to write table [{storage_path}]: {str(e)}"
            self.__log.error(err)
            raise type(e)(err) from e

        except Exception as e:
            # Anything unexpected is logged in full and wrapped as an internal error
            err = f"Failed to write table [{storage_path}]: An unexpected error occurred"
            self.__log.error(err)
            self.__log.exception(str(e))
            raise _ex.ETracInternal(err) from e
Ejemplo n.º 22
0
    def result_matches_type(cls, result, expected_type) -> bool:
        """Check a node result against an expected (possibly generic) type.

        Supports None, Any and plain classes, plus List[T] and Dict[K, V]
        with element-wise recursive checks.

        :raises ETracInternal: For generic types other than list and dict
        """

        # Expecting "no result" means the result must literally be None
        if expected_type is None or expected_type == cls.__NONE_TYPE:
            return result is None

        if expected_type == tp.Any:
            return True

        generic_type = _util.get_origin(expected_type)

        # Non-generic types use a plain isinstance check
        if generic_type is None:
            return isinstance(result, expected_type)

        if generic_type == list:
            item_type = _util.get_args(expected_type)[0]
            return isinstance(result, generic_type) and \
                all(cls.result_matches_type(item, item_type) for item in result)

        if generic_type == dict:
            type_args = _util.get_args(expected_type)
            key_type = type_args[0]
            value_type = type_args[1]
            return isinstance(result, generic_type) and \
                all(isinstance(key, key_type) and cls.result_matches_type(value, value_type)
                    for key, value in result.items())

        raise _ex.ETracInternal(
            f"Cannot enforce type check for generic type [{str(generic_type)}]"
        )
Ejemplo n.º 23
0
    def convert_value(cls, raw_value: tp.Any, type_desc: _meta.TypeDescriptor):
        """Dispatch a raw value conversion based on the target basic type.

        :raises ETracInternal: If the target basic type is not supported
        """

        # One converter per supported basic type
        converters = {
            _meta.BasicType.BOOLEAN: cls.convert_boolean_value,
            _meta.BasicType.INTEGER: cls.convert_integer_value,
            _meta.BasicType.FLOAT: cls.convert_float_value,
            _meta.BasicType.DECIMAL: cls.convert_decimal_value,
            _meta.BasicType.STRING: cls.convert_string_value,
            _meta.BasicType.DATE: cls.convert_date_value,
            _meta.BasicType.DATETIME: cls.convert_datetime_value,
        }

        converter = converters.get(type_desc.basicType)

        if converter is None:
            raise _ex.ETracInternal(f"Conversion to value type [{type_desc.basicType.name}] is not supported yet")

        return converter(raw_value)