Esempio n. 1
0
class Hub(SatelliteOwner, register_as="hub"):
    """Satellite owner registered under the name ``"hub"``.

    A hub has a single key whose column type comes from the ``key_type``
    user property. Wherever a mapping of hubs is expected it presents
    itself as the only entry.
    """

    star_prefix = "dim"

    # Raw column type string from the definition; wrapped by ``key_type``.
    _key_type: str = wysdom.UserProperty(str, name="key_type")

    static_columns: Dict[str, SatelliteColumn] = wysdom.UserProperty(
        wysdom.SchemaDict(SatelliteColumn), persist_defaults=True, default={})

    @property
    def key_type(self) -> ColumnType:
        """The configured key type wrapped in a ``ColumnType``."""
        return ColumnType(self._key_type)

    @property
    def key_length(self) -> int:
        """Serialized length of the key, as reported by ``key_type``."""
        return self.key_type.serialized_length

    @property
    def hubs(self) -> Dict[str, VaultObject]:
        """Single-entry mapping of this hub's name to the hub itself."""
        return {self.name: self}

    @property
    def satellites_containing_keys(self) -> Dict[str, VaultObject]:
        """Project satellites that carry this hub's key.

        A satellite qualifies when its parent is this hub, when its parent
        is one of the links returned by ``links``, or when this hub's name
        appears among the satellite's ``referenced_hubs``.
        """
        # Loop-invariant: collect the link keys once rather than rebuilding
        # the list (and re-scanning the project's links) for every satellite.
        link_keys = [link.key for link in self.links.values()]
        return {
            key: sat
            for key, sat in self.project.satellites.items()
            if sat.parent.key == self.key
            or sat.parent.key in link_keys
            or self.name in sat.referenced_hubs
        }

    @property
    def links(self) -> Dict[str, VaultObject]:
        """Project links whose ``unique_hubs`` include this hub's name."""
        return {
            key: link
            for key, link in self.project.links.items()
            if self.name in link.unique_hubs
        }

    def hub_key_columns(self,
                        satellite: Satellite) -> Dict[str, List[HubKeyColumn]]:
        """Key columns a satellite owned by this hub contributes.

        :param satellite: Satellite whose name is used to build the
                          ``sat_<name>`` label of the returned column
        :return: Single-entry mapping of this hub's name to a one-element
                 list of ``HubKeyColumn``
        """
        return {
            self.name:
            [HubKeyColumn(self.key_column_name, f'sat_{satellite.name}')]
        }

    def generate_key(self, from_table):
        """Return the column of ``from_table`` named by ``self.key_name``."""
        return from_table.c[self.key_name]

    # TODO: Should this be in HubModel?
    def prepare_key_for_link(self, alias, from_table):
        """Return the aliased key column of ``from_table`` in serialized form.

        :param alias:      Alias passed to ``alias_key_name`` to find the
                           column name
        :param from_table: Table-like object exposing its columns as ``.c``
        :return: The result of ``key_type.serialize_column_expression`` on
                 that column
        """
        key_column = from_table.c[self.alias_key_name(alias)]
        return self.key_type.serialize_column_expression(key_column)

    @property
    def link_key_columns(self):
        """Always an empty list for a hub."""
        return []

    def validate(self) -> None:
        """No validation is performed for a hub."""
        pass
Esempio n. 2
0
class ComputeServiceConfig(ServiceConfig):
    # Configuration for a service whose ``service_type`` is the constant
    # 'compute'. All members are declarative schema properties.

    # Restricted by the schema to the constant value 'compute'.
    service_type: str = ConfigProperty(wysdom.SchemaConst('compute'))
    # StorageServiceConfig objects keyed by string (presumably the service
    # name - confirm against callers). Defaults to an empty dict, and the
    # default is persisted.
    storage_services: Dict[str, StorageServiceConfig] = ConfigProperty(
        wysdom.SchemaDict(StorageServiceConfig),
        default={},
        persist_defaults=True)
    # Nested StorageConfig object; no default is declared here.
    storage: StorageConfig = ConfigProperty(StorageConfig)

    # Defaults for the two properties below are computed by helper functions
    # defined elsewhere in this module.
    schema: str = ConfigProperty(str, default_function=_get_default_schema)
    drop_schema_if_exists: bool = ConfigProperty(
        bool, default_function=_get_default_drop_schema_if_exists)
Esempio n. 3
0
class Config(wysdom.UserObject, wysdom.ReadsJSON, wysdom.ReadsYAML):
    """Top-level configuration object.

    Mixes in ``wysdom.ReadsJSON`` and ``wysdom.ReadsYAML``, and can write
    itself back to a YAML file under the user's home directory with
    ``save``.
    """

    # Defaults to the working directory at the time the default is computed.
    model_path: str = ConfigProperty(str,
                                     default_function=lambda self: os.getcwd())
    schema: str = ConfigProperty(str)
    skip_deploy: bool = ConfigProperty(bool, default=False)
    environment_type: str = ConfigProperty(str, default="local_spark")
    session: SessionConfig = ConfigProperty(SessionConfig,
                                            default={},
                                            persist_defaults=True)
    services: Dict[str, ServiceConfig] = ConfigProperty(
        wysdom.SchemaDict(ServiceConfig), default={}, persist_defaults=True)
    compute: str = ConfigProperty(str)
    registry: str = ConfigProperty(str)
    drop_schema_if_exists: bool = ConfigProperty(bool, default=False)

    @LazyProperty
    def secret_lookup(self) -> SecretLookup:
        """``SecretLookup`` subclass instance named by ``secret_lookup``."""
        return SecretLookup.registered_subclass_instance(
            self._secret_lookup_name)

    # Stored in the configuration under the key "secret_lookup".
    _secret_lookup_name: str = wysdom.UserProperty(str, name="secret_lookup")

    def reset_session(self):
        """Clear the ``session`` section of this configuration."""
        self.session.clear()

    def __str__(self):
        """Return the configuration rendered as block-style YAML."""
        return yaml.dump(self.to_builtin(), default_flow_style=False)

    @classmethod
    def config_dir(cls):
        """Path of the ``.jetavator`` directory in the user's home."""
        return os.path.join(str(Path.home()), '.jetavator')

    @classmethod
    def config_file(cls):
        """Path of ``config.yml`` inside ``config_dir()``."""
        return os.path.join(cls.config_dir(), 'config.yml')

    @classmethod
    def make_config_dir(cls):
        """Create ``config_dir()`` if it does not already exist."""
        # exist_ok avoids the check-then-create race: another process may
        # create the directory between an existence test and makedirs.
        os.makedirs(cls.config_dir(), exist_ok=True)

    def save(self):
        """Write this configuration to ``config_file()`` as YAML.

        The ``session`` section is omitted from the saved file. The
        configuration directory is created first if necessary.
        """
        config_dict = self.to_builtin()
        # Don't save session specific config info
        config_dict.pop('session', None)
        self.make_config_dir()
        with open(self.config_file(), 'w') as f:
            f.write(yaml.dump(config_dict, default_flow_style=False))
Esempio n. 4
0
class SatellitePipeline(wysdom.UserObject, RegistersSubclasses, ABC):
    """Abstract base for the pipeline definition attached to a satellite.

    Concrete subclasses supply ``dependencies`` and ``key_columns``.
    """

    type: str = wysdom.UserProperty(str)
    performance_hints: PerformanceHints = wysdom.UserProperty(
        PerformanceHints, default={}, persist_defaults=True)
    _key_columns: Dict[str, str] = wysdom.UserProperty(
        wysdom.SchemaDict(str),
        name="key_columns",
        default={},
        persist_defaults=True)

    @property
    def satellite(self) -> SatelliteABC:
        """The satellite that contains this pipeline.

        :raises TypeError: if the wysdom parent is not a ``SatelliteABC``
        """
        # TODO: Improve the type checking here?
        owner = wysdom.parent(self)
        if not isinstance(owner, SatelliteABC):
            raise TypeError('Parent is not a subclass of SatelliteABC')
        return owner

    @property
    def project(self) -> Project:
        """The project of the owning satellite."""
        return self.satellite.project

    @property
    @abstractmethod
    def dependencies(self) -> List[SatellitePipelineDependency]:
        """Dependencies of this pipeline; subclasses must provide them."""
        raise NotImplementedError

    @property
    @abstractmethod
    def key_columns(self) -> Dict[str, str]:
        """String-to-string mapping of key columns; subclasses must provide it."""
        pass

    def validate(self) -> None:
        """Validate each dependency in turn."""
        for dependency in self.dependencies:
            dependency.validate()
Esempio n. 5
0
class Link(SatelliteOwner, register_as="link"):
    """Satellite owner registered under the name ``"link"``.

    A link refers to one or more hubs, each under an alias. Its key is
    the hubs' serialized keys joined by a separator character.
    """

    star_prefix = "fact"

    # TODO: Rename link_hubs to hubs
    # Maps each alias to the name of the hub it refers to.
    _link_hubs: Dict[str, str] = wysdom.UserProperty(
        wysdom.SchemaDict(str), name='link_hubs')

    @property
    def hubs(self) -> Dict[str, Hub]:
        """Hubs of this link keyed by alias, looked up in the project."""
        return {
            alias: self.project['hub', hub_name]
            for alias, hub_name in self._link_hubs.items()
        }

    @property
    def satellites_containing_keys(self) -> Dict[str, Satellite]:
        """Same mapping as ``star_satellites``."""
        return self.star_satellites

    @property
    def key_length(self) -> int:
        """Length of the composite key.

        Each hub contributes its own key length plus one separator
        character; the final ``- 1`` removes the separator that would
        otherwise be counted after the last hub.
        """
        return sum(hub.key_length + 1 for hub in self.hubs.values()) - 1

    @property
    def key_type(self) -> ColumnType:
        """``CHAR`` column type sized to ``key_length``."""
        return ColumnType(f"CHAR({self.key_length})")

    @property
    def unique_hubs(self) -> Dict[str, Hub]:
        """Distinct hubs of this link keyed by hub name.

        A hub referenced under several aliases appears once. Entries
        follow the order in which the hubs are first declared.
        """
        # dict.fromkeys de-duplicates while preserving insertion order; a
        # set of strings iterates in an order that can differ between
        # interpreter runs because of hash randomisation.
        hub_names = dict.fromkeys(hub.name for hub in self.hubs.values())
        return {
            hub_name: self.project["hub", hub_name]
            for hub_name in hub_names
        }

    def hub_key_columns(self, satellite) -> "Dict[str, List[HubKeyColumn]]":
        """Key columns per hub name, one column for each alias.

        :param satellite: Not used by this implementation
        :return: Mapping of hub name to a list of ``HubKeyColumn``, one
                 for each alias under which that hub is referenced
        """
        columns = {}
        for alias, hub in self.hubs.items():
            columns.setdefault(hub.name, []).append(
                HubKeyColumn(f'hub_{alias}_key', f'hub_{hub.name}'))
        return columns

    def generate_key(self, from_table):
        """Build the composite key expression for this link.

        Each hub's prepared key is concatenated in alias order, with
        ``char(SEPARATOR)`` between consecutive components.

        :param from_table: Table-like object passed to each hub's
                           ``prepare_key_for_link``
        :return: The concatenated column expression
        :raises StopIteration: if the link has no hubs
        """
        key_components = iter([
            hub.prepare_key_for_link(hub_alias, from_table)
            for hub_alias, hub in self.hubs.items()
        ])
        composite_key = next(key_components)
        for column in key_components:
            composite_key = composite_key.concat(
                func.char(literal_column(str(SEPARATOR)))
            ).concat(column)
        return composite_key

    @property
    def link_key_columns(self):
        """Aliased key column of every hub, in alias order."""
        return [
            hub.alias_key_column(hub_alias)
            for hub_alias, hub in self.hubs.items()
        ]

    def validate(self) -> None:
        """Check that every referenced hub exists in the project.

        :raises KeyError: for the first referenced hub that is missing
        """
        for hub_name in self._link_hubs.values():
            if ('hub', hub_name) not in self.project:
                raise KeyError(
                    f"Cannot find referenced hub {hub_name} "
                    f"in object {self.key}"
                )
Esempio n. 6
0
class Satellite(SatelliteABC, register_as="satellite"):
    """Vault object registered under the name ``"satellite"``.

    A satellite has a parent owner, a set of columns and a pipeline. The
    properties below derive the hub keys it reads and produces.
    """

    _parent: VaultObjectReference = wysdom.UserProperty(
        VaultObjectReference, name="parent")

    columns: Dict[str, SatelliteColumn] = wysdom.UserProperty(
        wysdom.SchemaDict(SatelliteColumn))
    pipeline: SatellitePipeline = wysdom.UserProperty(SatellitePipeline)
    exclude_from_star_schema: bool = wysdom.UserProperty(bool, default=False)

    @property
    def parent(self) -> SatelliteOwner:
        """The owner that the ``parent`` reference resolves to."""
        parent_key = self._parent.key
        return self.project[parent_key]

    @property
    def hub_reference_columns(self) -> Dict[str, SatelliteColumn]:
        """Columns that have a truthy ``hub_reference``, keyed by name."""
        return {
            column_name: column
            for column_name, column in self.columns.items()
            if column.hub_reference
        }

    @property
    def referenced_hubs(self) -> Dict[str, SatelliteOwner]:
        """Hubs referenced by ``hub_reference_columns``, keyed by hub name."""
        hub_names = VaultObjectSet(
            column.hub_reference
            for column in self.hub_reference_columns.values())
        return {name: self.project["hub", name] for name in hub_names}

    @property
    def full_name(self) -> str:
        """The satellite name prefixed with ``sat_``."""
        return f'sat_{self.name}'

    @property
    def hub_key_columns(self) -> Dict[str, List[HubKeyColumn]]:
        """Hub key columns from the parent plus any hub-reference columns.

        Reference columns are skipped when the pipeline's
        ``no_update_referenced_hubs`` hint is set.
        """
        # check if this can be safely refactored to
        # a function hub_key_columns(self, hub_name)
        key_columns = self.parent.hub_key_columns(self)
        reference_columns = self.hub_reference_columns
        if not reference_columns:
            return key_columns
        if self.pipeline.performance_hints.no_update_referenced_hubs:
            return key_columns
        for column_name, column in reference_columns.items():
            hub_name = column.hub_reference
            key_columns.setdefault(hub_name, []).append(
                HubKeyColumn(column_name, f'hub_{hub_name}'))
        return key_columns

    @LazyProperty
    def input_keys(self) -> VaultObjectSet[SatelliteOwner]:
        """Output keys of every satellite this satellite's pipeline depends on."""
        upstream_satellites = (
            dependency.object_reference
            for dependency in self.pipeline.dependencies
            if isinstance(dependency.object_reference, Satellite))
        return VaultObjectSet(
            owner
            for upstream in upstream_satellites
            for owner in upstream.output_keys)

    @LazyProperty
    def produced_keys(self) -> VaultObjectSet[SatelliteOwner]:
        """Owners whose keys this satellite itself produces.

        These are the hubs named in ``hub_key_columns`` unless
        ``no_update_hubs`` is set. The parent is added when it is a link
        and ``no_update_links`` is not set.
        """
        hints = self.pipeline.performance_hints
        if not hints.no_update_hubs:
            produced = VaultObjectSet(
                self.project.hubs[hub_name]
                for hub_name in self.hub_key_columns)
        else:
            produced = VaultObjectSet()
        owner = self.parent
        if owner.registered_name == 'link' and not hints.no_update_links:
            produced.add(owner)
        return produced

    @LazyProperty
    def output_keys(self) -> VaultObjectSet[SatelliteOwner]:
        """Union of ``produced_keys`` and ``input_keys``."""
        produced = self.produced_keys
        inherited = self.input_keys
        return produced | inherited

    def dependent_satellites_by_owner(self,
                                      satellite_owner) -> List[Satellite]:
        """Dependency satellites whose output keys include ``satellite_owner``.

        :param satellite_owner: Owner compared by identity against each
                                dependency's output keys
        :return: One list entry per matching output key
        """
        matches = []
        for dependency in self.pipeline.dependencies:
            if not isinstance(dependency.object_reference, Satellite):
                continue
            for output_key in dependency.object_reference.output_keys:
                if output_key is satellite_owner:
                    matches.append(dependency.object_reference)
        return matches

    def validate(self) -> None:
        """Check that the parent exists, then validate the pipeline.

        :raises KeyError: if the parent key is not in the project
        """
        parent_key = self._parent.key
        if parent_key not in self.project:
            raise KeyError(f"Could not find parent object {parent_key}")
        self.pipeline.validate()

    @property
    def satellite_columns(self):
        """Nullable ``Column`` objects, one per entry in ``columns``."""
        table_columns = []
        for column_name, column in self.columns.items():
            table_columns.append(
                Column(column_name, column.type.sqlalchemy_type,
                       nullable=True))
        return table_columns

    @property
    def table_name(self):
        """The satellite name prefixed with ``vault_sat_``."""
        return f"vault_sat_{self.name}"
Esempio n. 7
0
class Source(VaultObject, register_as="source"):
    """Vault object registered under the name ``"source"``.

    Describes the columns of a source and builds the ``Table`` used to
    hold its rows, including two system columns.
    """

    # Names of the system columns appended to every source table.
    DELETED_INDICATOR_SYSTEM_COLUMN = "jetavator_deleted_ind"
    LOAD_TIMESTAMP_SYSTEM_COLUMN = "jetavator_load_dt"

    columns: Dict[str, SourceColumn] = wysdom.UserProperty(
        wysdom.SchemaDict(SourceColumn))
    # Class-level default; load_csvs assigns a new value on the instance.
    csv_files: List[str] = []
    deleted_indicator_column: Optional[str] = wysdom.UserProperty(
        str, optional=True)
    load_timestamp_column: Optional[str] = wysdom.UserProperty(str,
                                                               optional=True)
    date_format: Optional[str] = wysdom.UserProperty(str, optional=True)
    timestamp_format: Optional[str] = wysdom.UserProperty(str, optional=True)

    @property
    def primary_key_columns(self) -> Dict[str, SourceColumn]:
        """Columns whose ``pk`` attribute is truthy, keyed by column name."""
        return {k: v for k, v in self.columns.items() if v.pk}

    def validate(self) -> None:
        """No validation is performed for a source."""
        pass

    @LazyProperty
    def create_table_statement(self) -> CreateTable:
        """``CreateTable`` construct for ``table``."""
        return CreateTable(self.table)

    # TODO: Move to sql_model?
    @LazyProperty
    def table(self) -> Table:
        """``Table`` named ``full_name`` with the source and system columns."""
        return Table(self.full_name, MetaData(), *self._table_columns())

    def load_csvs(self, csv_files: List[FilePath]) -> None:
        """Record the CSV files associated with this Source

        The paths are only stored on the instance; no file is read here.

        :param csv_files:  List of paths on disk of the CSV files
        """
        self.csv_files = csv_files

    def _table_columns(self) -> List[Column]:
        """All table columns: source columns followed by system columns."""
        # TODO: Spark/Hive does not allow PKs. Make this configurable per engine?
        use_primary_key = False
        return [
            *self._source_columns(use_primary_key),
            *self._date_columns(use_primary_key)
        ]

    def _source_columns(self, use_primary_key: bool = True) -> List[Column]:
        """Nullable ``Column`` objects, one per entry in ``columns``.

        :param use_primary_key: When true, columns flagged ``pk`` are
                                marked as primary key columns
        """
        return [
            Column(column_name,
                   column.type.sqlalchemy_type,
                   nullable=True,
                   primary_key=(use_primary_key and column.pk))
            for column_name, column in self.columns.items()
        ]

    @staticmethod
    def _date_columns(use_primary_key: bool = True) -> List[Column]:
        """The load timestamp and deleted indicator system columns.

        :param use_primary_key: When true, the load timestamp column is
                                marked as a primary key column
        """
        return [
            # Use the class constants so the column names are defined in
            # exactly one place.
            Column(Source.LOAD_TIMESTAMP_SYSTEM_COLUMN,
                   DateTime(),
                   nullable=True,
                   primary_key=use_primary_key),
            Column(
                Source.DELETED_INDICATOR_SYSTEM_COLUMN,
                # TODO: Loading as integer saves space in CSVs.
                #       Does this make sense for other file formats?
                #       Is there a more general solution?
                Integer(),
                nullable=True,
                default=0)
        ]
        ]