Beispiel #1
0
class Hub(SatelliteOwner, register_as="hub"):
    """Satellite owner registered under the name ``"hub"``.

    A hub is its own (and only) entry in ``hubs`` and takes its key type
    from the ``key_type`` entry of its definition.
    """

    # Consumed by ``SatelliteOwner.star_table_name``.
    star_prefix = "dim"

    # Raw type string; exposed as a ``ColumnType`` through ``key_type``.
    _key_type: str = wysdom.UserProperty(str, name="key_type")

    static_columns: Dict[str, SatelliteColumn] = wysdom.UserProperty(
        wysdom.SchemaDict(SatelliteColumn), persist_defaults=True, default={})

    @property
    def key_type(self) -> ColumnType:
        """The configured key type wrapped in a ``ColumnType``."""
        return ColumnType(self._key_type)

    @property
    def key_length(self) -> int:
        """The ``serialized_length`` reported by ``key_type``."""
        return self.key_type.serialized_length

    @property
    def hubs(self) -> Dict[str, VaultObject]:
        """A single-entry mapping of this hub's name to itself."""
        return {self.name: self}

    @property
    def satellites_containing_keys(self) -> Dict[str, VaultObject]:
        """Project satellites that relate to this hub.

        A satellite is included when its parent is this hub, when its
        parent is one of this hub's ``links``, or when this hub's name
        appears in the satellite's ``referenced_hubs``.
        """
        # Built once: ``self.links`` scans every link in the project, so
        # evaluating it per satellite would repeat that scan needlessly.
        link_keys = [link.key for link in self.links.values()]
        return {
            key: sat
            for key, sat in self.project.satellites.items()
            if sat.parent.key == self.key
            or sat.parent.key in link_keys
            or self.name in sat.referenced_hubs
        }

    @property
    def links(self) -> Dict[str, VaultObject]:
        """Project links whose ``unique_hubs`` contain this hub's name."""
        return {
            key: link
            for key, link in self.project.links.items()
            if self.name in link.unique_hubs
        }

    def hub_key_columns(self,
                        satellite: Satellite) -> Dict[str, List[HubKeyColumn]]:
        """Return the key column of this hub for the given satellite.

        :param satellite: Satellite whose name is used to build the
                          ``sat_<name>`` source label.
        :return: Mapping of this hub's name to a one-element list holding
                 a ``HubKeyColumn`` for ``key_column_name``.
        """
        return {
            self.name:
            [HubKeyColumn(self.key_column_name, f'sat_{satellite.name}')]
        }

    def generate_key(self, from_table):
        """Return the column of ``from_table`` named ``key_name``."""
        return from_table.c[self.key_name]

    # TODO: Should this be in HubModel?
    def prepare_key_for_link(self, alias, from_table):
        """Return the aliased key column passed through the key type's
        ``serialize_column_expression``.

        :param alias: Alias used to derive the column name via
                      ``alias_key_name``.
        :param from_table: Object exposing its columns through ``.c``.
        """
        key_column = from_table.c[self.alias_key_name(alias)]
        return self.key_type.serialize_column_expression(key_column)

    @property
    def link_key_columns(self):
        """Always an empty list for a hub."""
        return []

    def validate(self) -> None:
        """No hub-specific validation is performed."""
        pass
Beispiel #2
0
class VaultObjectReference(wysdom.UserObject):
    """Reference to a vault object, given as a ``type`` and a ``name``."""

    type: str = wysdom.UserProperty(str)
    name: str = wysdom.UserProperty(str)

    @LazyProperty
    def key(self) -> VaultObjectKey:
        """The ``VaultObjectKey`` built from this reference's type and name."""
        object_type, object_name = self.type, self.name
        return VaultObjectKey(object_type, object_name)
Beispiel #3
0
class Column(wysdom.UserObject):
    """Column definition whose ``type`` entry is exposed as a ``ColumnType``."""

    # Raw type string stored under the "type" key of the definition.
    _type: str = wysdom.UserProperty(str, name="type")

    @property
    def type(self) -> ColumnType:
        """The raw type string wrapped in a ``ColumnType``."""
        raw_type = self._type
        return ColumnType(raw_type)
Beispiel #4
0
class SatelliteSQLPipeline(SatellitePipeline, register_as="sql"):
    """Satellite pipeline registered as ``"sql"`` and configured with SQL text."""

    type: str = wysdom.UserProperty(wysdom.SchemaConst('sql'))
    sql: str = wysdom.UserProperty(str, name="sql")
    load_dt: Optional[str] = wysdom.UserProperty(str, optional=True)
    deleted_ind: Optional[str] = wysdom.UserProperty(str, optional=True)
    dependencies: List[SatellitePipelineDependency] = wysdom.UserProperty(
        wysdom.SchemaArray(SatellitePipelineDependency), default=[])

    @property
    def key_columns(self) -> Dict[str, str]:
        """Mapping of hub alias to key column name.

        The explicitly configured ``key_columns`` win when non-empty;
        otherwise every hub alias of the satellite's parent maps to itself.
        """
        configured = self._key_columns
        if configured:
            return configured
        parent_hubs = self.satellite.parent.hubs
        return {alias: alias for alias in parent_hubs.keys()}
class SatellitePipelineDependency(wysdom.UserObject, Generic[DependencyType]):
    """A pipeline dependency on another project object, by type and name."""

    name: str = wysdom.UserProperty(str)
    type: str = wysdom.UserProperty(str)
    view: Optional[str] = wysdom.UserProperty(str, optional=True)

    @property
    def object_reference_key(self) -> VaultObjectKey:
        """The ``VaultObjectKey`` of the object this dependency points at."""
        dependency_type, dependency_name = self.type, self.name
        return VaultObjectKey(dependency_type, dependency_name)

    @property
    def project(self) -> Project:
        """The ``project`` attribute of the document containing this object."""
        containing_document = wysdom.document(self)
        return containing_document.project

    @property
    def object_reference(self) -> DependencyType:
        """The object found in the project under ``(type, name)``."""
        owning_project = self.project
        return owning_project[self.type, self.name]

    def validate(self) -> None:
        """Check that the referenced object exists in the project.

        :raises KeyError: If ``object_reference_key`` is not in the project.
        """
        reference_key = self.object_reference_key
        if reference_key in self.project:
            return
        raise KeyError(f"Cannot find {reference_key} in project.")
Beispiel #6
0
class SatelliteSourcePipeline(
    SatellitePipeline,
    register_as="source"
):
    """Satellite pipeline registered as ``"source"``, fed by one named source."""

    type: str = wysdom.UserProperty(wysdom.SchemaConst('source'))
    # Name of the source object; resolved to the object itself by ``source``.
    _source: str = wysdom.UserProperty(str, name="source")

    @property
    def key_columns(self) -> Dict[str, str]:
        """Mapping of hub alias to source column name.

        The explicitly configured ``key_columns`` win when non-empty.
        Otherwise the parent's hub aliases are paired positionally with
        the source's column names; pairing stops at the shorter of the two.
        """
        if self._key_columns:
            return self._key_columns
        return dict(zip(
            self.satellite.parent.hubs.keys(),
            self.source.columns.keys()
        ))

    @property
    def source(self) -> Source:
        """The ``Source`` object this pipeline loads from.

        :raises TypeError: If the project object stored under
                           ``("source", <name>)`` is not a ``Source``.
        """
        # TODO: Refactor so this definitely returns Source, not VaultObject
        source_obj = self.project["source", self._source]
        # Raised explicitly rather than asserted: ``assert`` statements are
        # removed when Python runs with -O, which would let a wrong object
        # through. This also mirrors ``SatellitePipeline.satellite``.
        if not isinstance(source_obj, Source):
            raise TypeError(
                f"Object ('source', {self._source!r}) "
                f"is not an instance of Source")
        return source_obj

    @property
    def dependencies(self) -> List[SatellitePipelineDependency]:
        """A single dependency on the configured source.

        The dependency is created on each access and attached to this
        pipeline's document with this pipeline as its parent.
        """
        return [
            SatellitePipelineDependency(
                {'name': self._source, 'type': 'source'},
                json_dom_info=wysdom.dom.DOMInfo(
                    document=wysdom.document(self), parent=self)
            )
        ]
Beispiel #7
0
class Config(wysdom.UserObject, wysdom.ReadsJSON, wysdom.ReadsYAML):
    """Application configuration, persisted as YAML in the user's home directory.

    The configuration file is ``~/.jetavator/config.yml`` (see
    ``config_dir`` and ``config_file``).
    """

    # Defaults to the current working directory at the time it is evaluated.
    model_path: str = ConfigProperty(str,
                                     default_function=lambda self: os.getcwd())
    schema: str = ConfigProperty(str)
    skip_deploy: bool = ConfigProperty(bool, default=False)
    environment_type: str = ConfigProperty(str, default="local_spark")
    session: SessionConfig = ConfigProperty(SessionConfig,
                                            default={},
                                            persist_defaults=True)
    services: Dict[str, ServiceConfig] = ConfigProperty(
        wysdom.SchemaDict(ServiceConfig), default={}, persist_defaults=True)
    compute: str = ConfigProperty(str)
    registry: str = ConfigProperty(str)
    drop_schema_if_exists: bool = ConfigProperty(bool, default=False)

    @LazyProperty
    def secret_lookup(self) -> SecretLookup:
        """The ``SecretLookup`` subclass instance registered under the
        configured ``secret_lookup`` name."""
        return SecretLookup.registered_subclass_instance(
            self._secret_lookup_name)

    _secret_lookup_name: str = wysdom.UserProperty(str, name="secret_lookup")

    def reset_session(self):
        """Clear all entries of the ``session`` configuration."""
        self.session.clear()

    def __str__(self):
        """Render the configuration as block-style YAML."""
        return yaml.dump(self.to_builtin(), default_flow_style=False)

    @classmethod
    def config_dir(cls):
        """Return the path of the ``.jetavator`` directory in the user's home."""
        return os.path.join(str(Path.home()), '.jetavator')

    @classmethod
    def config_file(cls):
        """Return the path of ``config.yml`` inside ``config_dir()``."""
        return os.path.join(cls.config_dir(), 'config.yml')

    @classmethod
    def make_config_dir(cls):
        """Create the configuration directory if it does not exist yet."""
        # exist_ok avoids the window between an existence check and the
        # creation in which another process could create the directory.
        os.makedirs(cls.config_dir(), exist_ok=True)

    def save(self):
        """Write the configuration, minus session data, to ``config_file()``.

        The configuration directory is created first when necessary and
        an existing file is overwritten.
        """
        config_dict = self.to_builtin()
        # Don't save session specific config info
        config_dict.pop('session', None)
        self.make_config_dir()
        with open(self.config_file(), 'w') as f:
            f.write(yaml.dump(config_dict, default_flow_style=False))
class SatellitePipeline(wysdom.UserObject, RegistersSubclasses, ABC):
    """Abstract base for the pipeline definition nested inside a satellite."""

    type: str = wysdom.UserProperty(str)
    performance_hints: PerformanceHints = wysdom.UserProperty(
        PerformanceHints, persist_defaults=True, default={})
    # Explicit key column mapping from the definition; subclasses decide
    # what ``key_columns`` returns when this is empty.
    _key_columns: Dict[str, str] = wysdom.UserProperty(
        wysdom.SchemaDict(str),
        name="key_columns",
        persist_defaults=True,
        default={})

    @property
    def satellite(self) -> SatelliteABC:
        """The satellite that contains this pipeline.

        :raises TypeError: If the wysdom parent is not a ``SatelliteABC``.
        """
        # TODO: Improve the type checking here?
        owner = wysdom.parent(self)
        if not isinstance(owner, SatelliteABC):
            raise TypeError('Parent is not a subclass of SatelliteABC')
        return owner

    @property
    def project(self) -> Project:
        """The project of the owning satellite."""
        owning_satellite = self.satellite
        return owning_satellite.project

    @property
    @abstractmethod
    def dependencies(self) -> List[SatellitePipelineDependency]:
        """The objects this pipeline depends on."""
        raise NotImplementedError

    @property
    @abstractmethod
    def key_columns(self) -> Dict[str, str]:
        """Mapping of key names to column names, defined by subclasses."""

    def validate(self) -> None:
        """Validate every dependency in turn, stopping at the first failure."""
        for dependency in self.dependencies:
            dependency.validate()
Beispiel #9
0
class VaultObject(wysdom.UserObject, wysdom.RegistersSubclasses, ABC):
    """Abstract base for project objects backed by an ``ObjectDefinition``."""

    name: str = wysdom.UserProperty(str)
    type: str = wysdom.UserProperty(str)

    optional_yaml_properties = []

    def __init__(self, project: ProjectABC,
                 sqlalchemy_object: ObjectDefinition) -> None:
        """Bind the object to its project and stored definition.

        :param project: Project this object belongs to.
        :param sqlalchemy_object: Stored definition; its ``definition``
                                  attribute initialises the wysdom object.
        """
        # Both attributes are set before the base initialiser runs because
        # ``self.definition`` reads ``_sqlalchemy_object``.
        self.project = project
        self._sqlalchemy_object = sqlalchemy_object
        super().__init__(self.definition)

    def __repr__(self) -> str:
        return f'{type(self).__name__}({self.name})'

    @classmethod
    def subclass_instance(cls, project: ProjectABC,
                          definition: ObjectDefinition) -> VaultObject:
        """Instantiate the subclass registered under ``definition.type``.

        :param project: Project passed on to the subclass constructor.
        :param definition: Definition passed on to the subclass constructor.
        """
        registered_type = definition.type
        return cls.registered_subclass_instance(
            registered_type, project, definition)

    @LazyProperty
    def key(self) -> VaultObjectKey:
        """The ``VaultObjectKey`` made of this object's type and name."""
        object_type, object_name = self.type, self.name
        return VaultObjectKey(object_type, object_name)

    @property
    def definition(self) -> Dict[str, Any]:
        """The ``definition`` attribute of the stored ``ObjectDefinition``."""
        stored = self._sqlalchemy_object
        return stored.definition

    def export_sqlalchemy_object(self) -> ObjectDefinition:
        """Stamp the stored definition with the current time and return it.

        :raises ValueError: If the definition's version differs from the
                            string form of the project version.
        """
        stored = self._sqlalchemy_object
        project_version = str(self.project.version)
        if stored.version != project_version:
            raise ValueError(
                "ObjectDefinition version must match project version "
                "and cannot be updated.")
        stored.deploy_dt = str(datetime.now())
        return stored

    @abstractmethod
    def validate(self) -> None:
        """Validate this object; implemented by subclasses."""

    @property
    def compute_service(self) -> ComputeServiceABC:
        """The compute service of the owning project."""
        owning_project = self.project
        return owning_project.compute_service

    @property
    def full_name(self) -> str:
        """``<type>_<name>``."""
        return '_'.join((f'{self.type}', f'{self.name}'))

    @property
    def checksum(self) -> str:
        """The stored definition's checksum as a string."""
        raw_checksum = self._sqlalchemy_object.checksum
        return str(raw_checksum)

    @property
    def dependent_satellites(self) -> List[VaultObject]:
        """Project satellites whose pipeline depends on this object.

        A satellite is listed once when any of its pipeline dependencies
        has this object's type and name.
        """
        dependents = []
        for satellite in self.project.satellites.values():
            for dependency in satellite.pipeline.dependencies:
                if (dependency.type == self.type
                        and dependency.name == self.name):
                    dependents.append(satellite)
                    # One match is enough; do not list a satellite twice.
                    break
        return dependents
Beispiel #10
0
class SatelliteOwner(VaultObject, ABC, register_as="satellite_owner"):
    """Abstract base for vault objects that satellites can be attached to.

    Provides the naming helpers for key and hash columns and for tables,
    and the lookups of the satellites that belong to this owner.
    """

    key_length: int = None
    options: List[str] = wysdom.UserProperty(wysdom.SchemaArray(str),
                                             default=[])
    exclude_from_star_schema: bool = wysdom.UserProperty(bool, default=False)

    @property
    @abstractmethod
    def hubs(self) -> Dict[str, VaultObject]:
        """Hubs associated with this owner, keyed by alias."""
        pass

    @property
    def satellites(self) -> Dict[str, SatelliteABC]:
        """Project satellites whose parent is this object, keyed by name."""
        return {
            satellite.name: satellite
            for satellite in self.project.satellites.values()
            if satellite.parent.key == self.key
        }

    @property
    def star_satellites(self) -> Dict[str, SatelliteABC]:
        """Own satellites that are not flagged ``exclude_from_star_schema``."""
        return {
            satellite.name: satellite
            for satellite in self.satellites.values()
            if not satellite.exclude_from_star_schema
        }

    @property
    @abstractmethod
    def satellites_containing_keys(self) -> Dict[str, SatelliteABC]:
        """Satellites considered to contain keys for this owner."""
        pass

    @property
    def satellite_columns(self) -> Dict[str, SatelliteColumn]:
        """Columns of all ``star_satellites`` merged into one mapping.

        Columns are keyed by name only, so a column appearing later with
        the same name replaces an earlier one.
        """
        return {
            column_name: column
            for satellite in self.star_satellites.values()
            for column_name, column in satellite.columns.items()
        }

    @property
    def key_column_name(self) -> str:
        """``<type>_<name>_key``."""
        return f"{self.type}_{self.name}_key"

    @property
    def hash_column_name(self) -> str:
        """``<type>_<name>_hash``."""
        return f"{self.type}_{self.name}_hash"

    @property
    def hashed_columns(self) -> Dict[str, SatelliteColumn]:
        """Same mapping as ``satellite_columns``."""
        return self.satellite_columns

    def hub_key_columns(self, satellite) -> Dict[str, List[HubKeyColumn]]:
        """Return the hub key columns for ``satellite``, grouped by hub name.

        Not implemented here. The annotation uses lists of
        ``HubKeyColumn``, which is the shape ``Hub.hub_key_columns``
        returns and ``Satellite.hub_key_columns`` appends to.

        :raises NotImplementedError: Always, unless overridden.
        """
        raise NotImplementedError

    def option(self, option_name: str) -> bool:
        """Return whether ``option_name`` is listed in ``options``."""
        return option_name in self.options

    @abstractmethod
    def validate(self) -> None:
        """Validate this object; implemented by subclasses."""
        pass

    def alias_key_name(self, alias):
        """Return ``<type>_<alias>_key``."""
        return f"{self.type}_{alias}_key"

    def alias_hash_key_name(self, alias):
        """Return ``<type>_<alias>_hash``."""
        return f"{self.type}_{alias}_hash"

    @property
    def key_name(self):
        """The key column name using this object's own name as alias."""
        return self.alias_key_name(self.name)

    @property
    def hash_key_name(self):
        """The hash column name using this object's own name as alias."""
        return self.alias_hash_key_name(self.name)

    def alias_primary_key_name(self, alias):
        """Return the hash key name when the ``hash_key`` option is set,
        otherwise the plain key name."""
        if self.option("hash_key"):
            return self.alias_hash_key_name(alias)
        else:
            return self.alias_key_name(alias)

    @abstractmethod
    def generate_key(self, from_table):
        """Build the key expression from the columns of ``from_table``."""
        pass

    @property
    @abstractmethod
    def link_key_columns(self):
        """Key columns contributed to a link; defined by subclasses."""
        pass

    @property
    @abstractmethod
    def key_type(self) -> ColumnType:
        """Column type of this owner's key."""
        pass

    # TODO: Move SQLAlchemy column generation to sql_model
    def alias_key_column(self, alias):
        """Return a non-nullable ``Column`` named ``alias_key_name(alias)``
        with the key type's ``sqlalchemy_type``."""
        return Column(self.alias_key_name(alias),
                      self.key_type.sqlalchemy_type,
                      nullable=False)

    def alias_hash_key_column(self, alias):
        """Return a non-nullable ``CHAR(32)`` ``Column`` named
        ``alias_hash_key_name(alias)``."""
        return Column(self.alias_hash_key_name(alias),
                      CHAR(32),
                      nullable=False)

    def alias_key_columns(self, alias):
        """Return the key columns for ``alias`` as a list.

        With the ``hash_key`` option the list is the hash column followed
        by the key column; otherwise it holds the key column only.
        """
        if self.option("hash_key"):
            return [
                self.alias_hash_key_column(alias),
                self.alias_key_column(alias)
            ]
        else:
            return [self.alias_key_column(alias)]

    def alias_primary_key_column(self, alias):
        """Return the hash column when the ``hash_key`` option is set,
        otherwise the key column."""
        if self.option("hash_key"):
            return self.alias_hash_key_column(alias)
        else:
            return self.alias_key_column(alias)

    @property
    def table_name(self) -> str:
        """``vault_<type>_<name>``."""
        return f"vault_{self.type}_{self.name}"

    @property
    def star_table_name(self) -> str:
        """``star_<star_prefix>_<name>``."""
        return f"star_{self.star_prefix}_{self.name}"

    @property
    @abstractmethod
    def star_prefix(self):
        """Prefix used in ``star_table_name``; supplied by subclasses."""
        pass
Beispiel #11
0
class SourceColumn(Column):
    """Column definition with a required ``nullable`` flag and a ``pk`` flag."""

    # No default is given, so the definition has to supply this value.
    nullable: bool = wysdom.UserProperty(bool)
    # Defaults to False when the definition omits it.
    pk: Optional[bool] = wysdom.UserProperty(bool, default=False)
Beispiel #12
0
class Satellite(SatelliteABC, register_as="satellite"):
    """Vault object registered as ``"satellite"``.

    A satellite names its parent object, defines a set of columns and
    carries the pipeline that populates it.
    """

    # Reference to the parent object; resolved against the project by
    # ``parent``.
    _parent: VaultObjectReference = wysdom.UserProperty(
        VaultObjectReference, name="parent")

    columns: Dict[str, SatelliteColumn] = wysdom.UserProperty(
        wysdom.SchemaDict(SatelliteColumn))
    pipeline: SatellitePipeline = wysdom.UserProperty(SatellitePipeline)
    exclude_from_star_schema: bool = wysdom.UserProperty(bool, default=False)

    @property
    def parent(self) -> SatelliteOwner:
        """The project object stored under the parent reference's key."""
        parent_key = self._parent.key
        return self.project[parent_key]

    @property
    def hub_reference_columns(self) -> Dict[str, SatelliteColumn]:
        """The columns whose ``hub_reference`` is truthy, keyed by name."""
        referencing = {}
        for column_name, column in self.columns.items():
            if column.hub_reference:
                referencing[column_name] = column
        return referencing

    @property
    def referenced_hubs(self) -> Dict[str, SatelliteOwner]:
        """Hubs named by ``hub_reference_columns``, keyed by hub name."""
        hub_names = VaultObjectSet(
            column.hub_reference
            for column in self.hub_reference_columns.values())
        return {
            hub_name: self.project["hub", hub_name]
            for hub_name in hub_names
        }

    @property
    def full_name(self) -> str:
        """``sat_<name>``."""
        return 'sat_' + f'{self.name}'

    @property
    def hub_key_columns(self) -> Dict[str, List[HubKeyColumn]]:
        """Hub key columns of this satellite, grouped by hub name.

        Starts from the parent's key columns. Hub reference columns are
        appended under their hub's name unless the pipeline sets the
        ``no_update_referenced_hubs`` performance hint.
        """
        # check if this can be safely refactored to
        # a function hub_key_columns(self, hub_name)
        columns = self.parent.hub_key_columns(self)
        reference_columns = self.hub_reference_columns
        if (not reference_columns
                or self.pipeline.performance_hints.no_update_referenced_hubs):
            return columns
        for column_name, column in reference_columns.items():
            hub_name = column.hub_reference
            key_column = HubKeyColumn(column_name, f'hub_{hub_name}')
            columns.setdefault(hub_name, []).append(key_column)
        return columns

    @LazyProperty
    def input_keys(self) -> VaultObjectSet[SatelliteOwner]:
        """The ``output_keys`` of every satellite this pipeline depends on."""

        def upstream_owners():
            for dependency in self.pipeline.dependencies:
                upstream = dependency.object_reference
                if isinstance(upstream, Satellite):
                    yield from upstream.output_keys

        return VaultObjectSet(upstream_owners())

    @LazyProperty
    def produced_keys(self) -> VaultObjectSet[SatelliteOwner]:
        """Owners whose keys this satellite itself produces.

        Contains the hubs named in ``hub_key_columns`` unless the
        ``no_update_hubs`` hint is set, plus the parent when it is
        registered as ``'link'`` and ``no_update_links`` is not set.
        """
        hints = self.pipeline.performance_hints
        if hints.no_update_hubs:
            keys = VaultObjectSet()
        else:
            project_hubs = self.project.hubs
            keys = VaultObjectSet(
                project_hubs[name] for name in self.hub_key_columns)
        parent_is_link = self.parent.registered_name == 'link'
        if parent_is_link and not hints.no_update_links:
            keys.add(self.parent)
        return keys

    @LazyProperty
    def output_keys(self) -> VaultObjectSet[SatelliteOwner]:
        """Union of ``produced_keys`` and ``input_keys``."""
        produced = self.produced_keys
        return produced | self.input_keys

    def dependent_satellites_by_owner(self,
                                      satellite_owner) -> List[Satellite]:
        """Return the upstream satellites that output ``satellite_owner``.

        :param satellite_owner: Owner looked for, by identity, among the
                                ``output_keys`` of each upstream satellite.
        :return: One entry per matching output key of each satellite
                 dependency of this pipeline.
        """
        matches = []
        for dependency in self.pipeline.dependencies:
            upstream = dependency.object_reference
            if not isinstance(upstream, Satellite):
                continue
            for output_key in upstream.output_keys:
                if output_key is satellite_owner:
                    matches.append(upstream)
        return matches

    def validate(self) -> None:
        """Check the parent exists in the project, then validate the pipeline.

        :raises KeyError: If the parent reference's key is not in the project.
        """
        parent_key = self._parent.key
        if parent_key not in self.project:
            raise KeyError(f"Could not find parent object {parent_key}")
        self.pipeline.validate()

    @property
    def satellite_columns(self):
        """One nullable ``Column`` per entry of ``columns``, in order."""
        table_columns = []
        for column_name, column in self.columns.items():
            table_columns.append(
                Column(column_name,
                       column.type.sqlalchemy_type,
                       nullable=True))
        return table_columns

    @property
    def table_name(self):
        """``vault_sat_<name>``."""
        return 'vault_sat_' + f'{self.name}'
Beispiel #13
0
class PerformanceHints(wysdom.UserObject):
    """Boolean switches read by ``Satellite`` when deciding which keys to update.

    Every flag defaults to ``False``. The annotations are ``bool`` to match
    the schema type passed to ``wysdom.UserProperty``.
    """

    # When set, ``Satellite.produced_keys`` starts from an empty set
    # instead of the hubs named in ``hub_key_columns``.
    no_update_hubs: bool = wysdom.UserProperty(bool, default=False)
    # When set, ``Satellite.produced_keys`` does not add a link parent.
    no_update_links: bool = wysdom.UserProperty(bool, default=False)
    # When set, ``Satellite.hub_key_columns`` skips hub reference columns.
    no_update_referenced_hubs: bool = wysdom.UserProperty(bool, default=False)
Beispiel #14
0
class SatelliteColumn(Column):
    """Column definition for a satellite, with an optional hub reference."""

    # Defaults to True when the definition omits it.
    nullable: bool = wysdom.UserProperty(bool, default=True)
    # Name of a hub this column refers to; optional in the definition.
    hub_reference: Optional[str] = wysdom.UserProperty(str, optional=True)
    # Defaults to False when the definition omits it.
    index: bool = wysdom.UserProperty(bool, default=False)
class SparkDeltaStorageConfig(StorageServiceConfig):
    """Storage service configuration whose ``type`` is fixed to ``'spark_delta'``."""

    # Schema constant: the only accepted value is 'spark_delta'.
    type: str = wysdom.UserProperty(wysdom.SchemaConst('spark_delta'))
Beispiel #16
0
class Link(SatelliteOwner, register_as="link"):
    """Satellite owner registered as ``"link"`` that joins several hubs.

    The ``link_hubs`` entry of the definition maps an alias to a hub name;
    the same hub may appear under more than one alias.
    """

    # Consumed by ``SatelliteOwner.star_table_name``.
    star_prefix = "fact"

    # TODO: Rename link_hubs to hubs
    _link_hubs: Dict[str, str] = wysdom.UserProperty(
        wysdom.SchemaDict(str), name='link_hubs')

    @property
    def hubs(self) -> Dict[str, Hub]:
        """The hub objects of this link, keyed by alias."""
        return {
            k: self.project['hub', v]
            for k, v in self._link_hubs.items()
        }

    @property
    def satellites_containing_keys(self) -> Dict[str, Satellite]:
        """Same mapping as ``star_satellites``."""
        return self.star_satellites

    @property
    def key_length(self) -> int:
        """Sum of the hub key lengths plus one per gap between them."""
        # Each hub contributes its key plus one extra position; the final
        # ``- 1`` removes the extra position counted for the last hub.
        return sum(
            hub.key_length + 1
            for hub in self.hubs.values()
        ) - 1

    @property
    def key_type(self) -> ColumnType:
        """A ``CHAR`` column type of width ``key_length``."""
        return ColumnType(f"CHAR({self.key_length})")

    @property
    def unique_hubs(self) -> Dict[str, Hub]:
        """The distinct hubs of this link, keyed by hub name.

        Hubs are listed in the order their names first appear in ``hubs``.
        ``dict.fromkeys`` is used for de-duplication instead of a set so
        that the order does not depend on string hashing, which varies
        between interpreter runs.
        """
        distinct_names = dict.fromkeys(
            hub.name for hub in self.hubs.values())
        return {
            hub_name: self.project["hub", hub_name]
            for hub_name in distinct_names
        }

    # The return annotation is quoted so it is not evaluated at import
    # time: ``List`` may not be imported in this module.
    def hub_key_columns(self, satellite) -> "Dict[str, List[HubKeyColumn]]":
        """Return one ``HubKeyColumn`` per alias, grouped by hub name.

        :param satellite: Unused by this implementation.
        :return: Mapping of hub name to the list of ``HubKeyColumn``
                 objects for the aliases pointing at that hub.
        """
        columns = {}
        for alias, hub in self.hubs.items():
            columns.setdefault(hub.name, []).append(
                HubKeyColumn(f'hub_{alias}_key', f'hub_{hub.name}'))
        return columns

    def generate_key(self, from_table):
        """Build the composite key expression of this link.

        The prepared key of each hub alias is concatenated in ``hubs``
        order, with ``func.char(literal_column(str(SEPARATOR)))`` between
        consecutive parts.

        :param from_table: Table handed to ``prepare_key_for_link`` of
                           every hub.
        :raises ValueError: If the link has no hubs. Previously a bare
                            ``StopIteration`` escaped in that case.
        """
        key_components = [
            hub.prepare_key_for_link(hub_alias, from_table)
            for hub_alias, hub in self.hubs.items()
        ]
        if not key_components:
            raise ValueError(
                f"Link {self.name} has no hubs to generate a key from")
        composite_key, *remaining_components = key_components
        for column in remaining_components:
            composite_key = composite_key.concat(
                func.char(literal_column(str(SEPARATOR)))
            ).concat(column)
        return composite_key

    @property
    def link_key_columns(self):
        """The aliased key column of every hub, in ``hubs`` order."""
        return [
            hub.alias_key_column(hub_alias)
            for hub_alias, hub in self.hubs.items()
        ]

    def validate(self) -> None:
        """Check that every hub named in ``link_hubs`` exists in the project.

        :raises KeyError: For the first hub name not found in the project.
        """
        for hub_name in self._link_hubs.values():
            if ('hub', hub_name) not in self.project:
                raise KeyError(
                    f"Cannot find referenced hub {hub_name} "
                    f"in object {self.key}"
                )
Beispiel #17
0
class Source(VaultObject, register_as="source"):
    """Vault object registered as ``"source"``: a table of source columns.

    Besides the user-defined columns, the table gets two system columns
    named by ``LOAD_TIMESTAMP_SYSTEM_COLUMN`` and
    ``DELETED_INDICATOR_SYSTEM_COLUMN``.
    """

    DELETED_INDICATOR_SYSTEM_COLUMN = "jetavator_deleted_ind"
    LOAD_TIMESTAMP_SYSTEM_COLUMN = "jetavator_load_dt"

    columns: Dict[str, SourceColumn] = wysdom.UserProperty(
        wysdom.SchemaDict(SourceColumn))
    # Class-level default; ``load_csvs`` rebinds it per instance.
    csv_files: List[str] = []
    deleted_indicator_column: Optional[str] = wysdom.UserProperty(
        str, optional=True)
    load_timestamp_column: Optional[str] = wysdom.UserProperty(str,
                                                               optional=True)
    date_format: Optional[str] = wysdom.UserProperty(str, optional=True)
    timestamp_format: Optional[str] = wysdom.UserProperty(str, optional=True)

    @property
    def primary_key_columns(self) -> Dict[str, SourceColumn]:
        """The columns whose ``pk`` flag is truthy, keyed by name."""
        return {k: v for k, v in self.columns.items() if v.pk}

    def validate(self) -> None:
        """No source-specific validation is performed."""
        pass

    @LazyProperty
    def create_table_statement(self) -> CreateTable:
        """A ``CreateTable`` construct for ``table``."""
        return CreateTable(self.table)

    # TODO: Move to sql_model?
    @LazyProperty
    def table(self) -> Table:
        """A ``Table`` named ``full_name`` in a fresh ``MetaData``."""
        return Table(self.full_name, MetaData(), *self._table_columns())

    def load_csvs(self, csv_files: List[FilePath]) -> None:
        """Loads a list of CSV files into a single named Source

        Only the list is recorded on this object; no file is read here.

        :param csv_files:  List of paths on disk of the CSV files
        """
        self.csv_files = csv_files

    def _table_columns(self) -> List[Column]:
        """Return the source columns followed by the system columns."""
        # TODO: Spark/Hive does not allow PKs. Make this configurable per engine?
        use_primary_key = False
        return [
            *self._source_columns(use_primary_key),
            *self._date_columns(use_primary_key)
        ]

    def _source_columns(self, use_primary_key: bool = True) -> List[Column]:
        """Return one nullable ``Column`` per user-defined column.

        :param use_primary_key: When true, columns flagged ``pk`` are
                                marked as primary key.
        """
        # NOTE(review): every column is created nullable regardless of
        # the column's own ``nullable`` flag - confirm this is intended.
        return [
            Column(column_name,
                   column.type.sqlalchemy_type,
                   nullable=True,
                   primary_key=(use_primary_key and column.pk))
            for column_name, column in self.columns.items()
        ]

    @staticmethod
    def _date_columns(use_primary_key: bool = True) -> List[Column]:
        """Return the load timestamp and deleted indicator system columns.

        The names come from the class constants rather than repeated
        literals, so the two cannot drift apart.

        :param use_primary_key: When true, the load timestamp column is
                                marked as primary key.
        """
        return [
            Column(Source.LOAD_TIMESTAMP_SYSTEM_COLUMN,
                   DateTime(),
                   nullable=True,
                   primary_key=use_primary_key),
            Column(
                Source.DELETED_INDICATOR_SYSTEM_COLUMN,
                # TODO: Loading as integer saves space in CSVs.
                #       Does this make sense for other file formats?
                #       Is there a more general solution?
                Integer(),
                nullable=True,
                default=0)
        ]
        ]