def __init__(self, **kwargs):
    self._logger = self._get_logger()
    self._format_date = kwargs.get('format_date')
    self._source: SourceSinkDescriptor = SourceSinkDescriptor()
    self._sink: SourceSinkDescriptor = SourceSinkDescriptor()
    self._job_id = None
    self._run_id = None
    self._manifest_name = None
    self._ddl_file = kwargs.get('ddl_file')
    self._env = kwargs.get('connection', 'state_manager')
    self._conn = pf.create(key=kwargs.get('dao'), configuration={
        'connection': kwargs.get('connection', 'state_manager')
    })

    # TODO: Pass table name as args
    metadata = schema.MetaData(bind=self._conn.engine)
    metadata.reflect()
    if ProjectConfig.state_manager_table_name() not in metadata.tables.keys():
        with open(self._ddl_file, 'r') as stream:
            ddl = stream.read()
        with self._conn.connection as conn:
            conn.execute(ddl)
        metadata = schema.MetaData(bind=self._conn.engine)
        metadata.reflect()
    self._table: schema.Table = metadata.tables[ProjectConfig.state_manager_table_name()]
def __init__(self, **kwargs):
    self._engine = None

    # Connection timeout management
    self._max_attempts = ProjectConfig.connection_max_attempts()
    self._timeout = ProjectConfig.connection_timeout()
    self._timeout_factor = self._timeout
    super().__init__(**kwargs)
def _create_engine(self) -> Any:
    with open(f"{ProjectConfig.hdm_home()}/{ProjectConfig.profile_path()}", 'r') as stream:
        conn_conf = yaml.safe_load(stream)[ProjectConfig.hdm_env()][self._connection_name]
    return create_engine(
        f"mysql+mysqlconnector://{conn_conf['user']}:{conn_conf['password']}@{conn_conf['host']}:"
        f"{conn_conf['port']}/{conn_conf['database']}")
def _validate_configuration(self) -> bool:
    with open(f"{ProjectConfig.hdm_home()}/{ProjectConfig.profile_path()}", 'r') as stream:
        conn_conf = yaml.safe_load(stream)[ProjectConfig.hdm_env()][self._connection_name]

    required_keys = ['host', 'port', 'database', 'user', 'password', 'driver']
    is_valid = all(key in conn_conf for key in required_keys)
    if is_valid:
        required_keys = ['name', 'path']
        return all(key in conn_conf['driver'] for key in required_keys)
    return is_valid
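# Illustrative sketch only: an assumed hdm_profiles.yml entry that would satisfy the
# _validate_configuration checks above. The environment name 'unit-test' matches the
# ProjectConfig tests further down; every other value (connection name, host, credentials,
# driver path) is hypothetical.
import yaml

_example_profile = """
unit-test:
  my_mysql_source:
    host: localhost
    port: 3306
    database: sales
    user: hdm_user
    password: change_me
    driver:
      name: mysqlconnector
      path: /usr/local/lib/mysql-connector
"""

conn_conf = yaml.safe_load(_example_profile)['unit-test']['my_mysql_source']
assert all(key in conn_conf for key in ['host', 'port', 'database', 'user', 'password', 'driver'])
assert all(key in conn_conf['driver'] for key in ['name', 'path'])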
def _create_stage(self, stage_name, source_directory, connection):
    # TODO: Check with John. How should the Azure connection info be obtained - hardcoded below, manifest or profile yml?
    with open(f"{ProjectConfig.hdm_home()}/{ProjectConfig.profile_path()}", 'r') as stream:
        # Fai: Pull the Azure env name from the manifest key "azure_env" here instead of hardcoding it
        conn_conf = yaml.safe_load(stream)[ProjectConfig.hdm_env()]['azure']
    url = conn_conf['url'].replace("https", "azure") + self._stage_directory
    self.__external_stage_params = f"URL = '{url}' CREDENTIALS = ( AZURE_SAS_TOKEN = '{conn_conf['sas']}')"

    cursor = connection.cursor()

    # Create the external stage
    self._logger.info("Creating stage: %s", self._stage_name)
    create_stage_sql = f"CREATE OR REPLACE STAGE {self._stage_name}"
    create_stage_sql += f" {self.__external_stage_params}"
    cursor.execute(create_stage_sql)
    cursor.close()
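# Illustrative sketch only: the statement _create_stage ends up executing, with a
# hypothetical stage name, Azure container path and placeholder SAS token. The https -> azure
# scheme swap mirrors the .replace("https", "azure") call above.
create_stage_sql = (
    "CREATE OR REPLACE STAGE my_landing_stage "
    "URL = 'azure://myaccount.blob.core.windows.net/landing/raw' "
    "CREDENTIALS = ( AZURE_SAS_TOKEN = '<sas-token-placeholder>')"
)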
def _get_connection(self):
    """Obtain an Azure Blob Storage connection

    Returns:
        Azure BlobServiceClient connection
    Raises:
        ConnectionError: Azure connection could not be established
    """
    with open(f"{ProjectConfig.hdm_home()}/{ProjectConfig.profile_path()}", 'r') as stream:
        conn_conf = yaml.safe_load(stream)[ProjectConfig.hdm_env()][self._connection_name]
    self._container = conn_conf['container_name']
    connection = BlobServiceClient(account_url=conn_conf['url'], credential=conn_conf['sas'])
    self._test_blob_container_existence(connection)
    return connection
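# Illustrative sketch only: one way the container existence check referenced above
# (_test_blob_container_existence, not shown here) could be performed with the
# azure-storage-blob v12 API. The account URL, SAS token and container name are hypothetical.
from azure.storage.blob import BlobServiceClient

client = BlobServiceClient(account_url="https://myaccount.blob.core.windows.net",
                           credential="<sas-token-placeholder>")
if not client.get_container_client("landing").exists():
    raise ConnectionError("Blob container 'landing' does not exist")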
def _create_engine(self) -> Any:
    with open(f"{ProjectConfig.hdm_home()}/{ProjectConfig.profile_path()}", 'r') as stream:
        conn_conf = yaml.safe_load(stream)[ProjectConfig.hdm_env()][self._connection_name]

    # Resolve the directory that holds the SQLite file; default to the current directory
    if not conn_conf.get('dbpath', os.curdir):
        db_path = os.path.abspath(os.curdir)
    else:
        db_path = os.path.abspath(conn_conf.get('dbpath', os.curdir))

    # Append the database file name, adding a .db extension when one is not supplied
    database = str(conn_conf['database'])
    if database.rfind(".db") == -1:
        database = f"{database}.db"
    db_path = os.path.join(db_path, database)
    return create_engine(f"sqlite:///{db_path}")
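# Illustrative sketch only: how the path resolution above plays out for hypothetical
# profile values dbpath='/tmp/hdm' and database='state_store'.
import os

db_path = os.path.join(os.path.abspath('/tmp/hdm'), 'state_store.db')
engine_url = f"sqlite:///{db_path}"  # -> sqlite:////tmp/hdm/state_store.db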
def _get_connection(self):
    """Obtain a context managed Netezza connection

    Returns:
        Netezza connection
    Raises:
        ConnectionError: Netezza connection could not be established
    """
    connection = None
    with open(f"{ProjectConfig.hdm_home()}/{ProjectConfig.profile_path()}", 'r') as stream:
        conn_conf = yaml.safe_load(stream)[ProjectConfig.hdm_env()][self._connection_name]
    connection_config = self._get_connection_config(config=conn_conf)

    connection_invalid = True
    connection_attempt_count = 0
    timeout = self._timeout
    while connection_attempt_count < self._max_attempts:
        connection = self._connect_by_connector(connection_config)

        # If the connection is valid, mark it as such and stop retrying
        if self._test_connection(connection):
            connection_invalid = False
            break

        # Otherwise sleep and back off before the next attempt
        connection_attempt_count += 1
        if connection_attempt_count < self._max_attempts:
            time.sleep(timeout)
            timeout *= self._timeout_factor

    if connection_invalid:
        raise ConnectionError('Unable to connect to Netezza. Please try again.')

    yield connection
    connection.close()
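# A minimal, standalone sketch of the retry/backoff pattern used above, decoupled from
# Netezza. connect and test stand in for _connect_by_connector and _test_connection; the
# defaults of 3 attempts and a 3 second timeout mirror the ProjectConfig tests further down,
# so the waits are 3s after the first failed attempt and 9s after the second.
import time

def connect_with_backoff(connect, test, max_attempts=3, timeout=3, timeout_factor=3):
    wait = timeout
    for attempt in range(1, max_attempts + 1):
        conn = connect()
        if test(conn):
            return conn
        if attempt < max_attempts:
            time.sleep(wait)
            wait *= timeout_factor
    raise ConnectionError('Unable to connect. Please try again.')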
def __build_query(self) -> None:
    """Builds the extraction query. By default no checksum is configured and a random
    number is used as the check_sum column.

    checksum_methods:
        default method: generates random numbers as the checksum value
        hash method:
            ** Important: the hash functions require the IBM Netezza SQL Extensions toolkit to be installed
            checksum_method: hash
            hash_column: the column to be hashed
            supported hash_function values:
                hash4 (returns the 32 bit checksum hash of the input data)
                hash8 (returns the 64 bit hash of the input data)
                hash  (returns the hashed input data)
            ** Important: hash() is much slower to calculate than hash4() and hash8()

    Returns:
        None
    """
    self.__query = f"INSERT INTO {self.__external_table_name} " \
                   f"SELECT * , CAST(random()* 100000 AS INT) as ck_sum FROM {self.__table}"

    # Checksum
    if self.__checksum and self.__checksum['function'] and self.__checksum['column']:
        self.__query = self._generate_checksum_select_query(
            checksum=self.__checksum,
            table_name=self.__table,
            external_table_name=self.__external_table_name)

    # Watermark
    if self.__watermark and self.__watermark['column'] and self.__watermark['offset']:
        where_clause = self._generate_watermarked_where_clause(
            watermark=self.__watermark,
            last_data_pulled=self._last_record_pulled)
        self.__query = " ".join([self.__query, where_clause])

    if ProjectConfig.query_limit():
        self.__query += f" LIMIT {ProjectConfig.query_limit()}"
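# Illustrative sketch only: what __build_query produces when no checksum or watermark is
# configured, assuming a hypothetical external table 'ext_orders', source table 'orders'
# and ProjectConfig.query_limit() == 1000.
default_query = (
    "INSERT INTO ext_orders "
    "SELECT * , CAST(random()* 100000 AS INT) as ck_sum FROM orders LIMIT 1000"
)

# Assumed shape of the checksum and watermark configuration the method checks for; only the
# key names come from the code above, the values here are hypothetical.
checksum = {'function': 'hash8', 'column': 'order_id'}
watermark = {'column': 'updated_at', 'offset': 1}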
def _create_engine(self) -> Any:
    """Get the ODBC connection string from the Azure portal, e.g.

        'Driver={ODBC Driver 13 for SQL Server};Server=tcp:yourDBServerName.database.windows.net,1433;
         Database=dbname;Uid=username;Pwd=xxx;Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30;'

    and populate the values in hdm_profiles.yml.

    Important: {ODBC Driver 17 for SQL Server} works even though the Azure portal connection
    string says ODBC Driver 13 for SQL Server.
    """
    with open(f"{ProjectConfig.hdm_home()}/{ProjectConfig.profile_path()}", 'r') as stream:
        conn_conf = yaml.safe_load(stream)[ProjectConfig.hdm_env()][self._connection_name]
    conn = f"Driver={{{conn_conf['driver']}}};Server=tcp:{conn_conf['host']}.database.windows.net,{conn_conf['port']};" \
           f"Database={conn_conf['database']};Uid={conn_conf['user']};Pwd={conn_conf['password']};Encrypt=yes;" \
           f"TrustServerCertificate=no;Connection Timeout=30;"
    params = urllib.parse.quote_plus(conn)
    conn_str = f"mssql+pyodbc:///?odbc_connect={params}"
    return create_engine(conn_str)
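# Illustrative sketch only: the SQLAlchemy engine URL produced for a hypothetical Azure SQL
# server; only the quoting mechanics are taken from the code above, the credentials are made up.
import urllib.parse

odbc = ("Driver={ODBC Driver 17 for SQL Server};Server=tcp:myserver.database.windows.net,1433;"
        "Database=dbname;Uid=username;Pwd=xxx;Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30;")
engine_url = f"mssql+pyodbc:///?odbc_connect={urllib.parse.quote_plus(odbc)}"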
def _get_connection(self, **kwargs):
    """Gets an S3 session based on environment variables.

    The connection is made using the credentials in .aws/ or the values specified
    in the profile:
        aws_access_key_id
        aws_secret_access_key
        region_name
    """
    with open(f"{ProjectConfig.hdm_home()}/{ProjectConfig.profile_path()}", 'r') as stream:
        conn_conf = yaml.safe_load(stream)[ProjectConfig.hdm_env()][kwargs.get('connection')]

    # TODO: Validation needed to confirm all the required keys are present
    if 'profile' in conn_conf.keys():
        connection = boto3.session.Session(profile_name=conn_conf.get('profile'))
    else:
        connection = boto3.session.Session(
            aws_access_key_id=conn_conf.get('aws_access_key_id'),
            aws_secret_access_key=conn_conf.get('aws_secret_access_key'),
            region_name=conn_conf.get('region_name'))
    return connection
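# Illustrative sketch only: how a caller might use the returned boto3 session. The profile
# name and the bucket listing call are assumptions, not part of this module.
import boto3

session = boto3.session.Session(profile_name='default')
s3 = session.client('s3')
print([bucket['Name'] for bucket in s3.list_buckets().get('Buckets', [])])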
def test_archive_folder(self):
    self.assertEqual('archive', ProjectConfig.archive_folder())

def test_hdm_home(self):
    os.environ['HDM_HOME'] = os.getcwd()
    self.assertEqual(os.getcwd(), ProjectConfig.hdm_home())

def test_connection_timeout(self):
    self.assertEqual(3, ProjectConfig.connection_timeout())

def test_connection_max_attempts(self):
    self.assertEqual(3, ProjectConfig.connection_max_attempts())

def test_state_manager_table_name(self):
    self.assertEqual('state_manager', ProjectConfig.state_manager_table_name())

def test_file_prefix(self):
    self.assertEqual('hdm', ProjectConfig.file_prefix())

def test_profile_path(self):
    self.assertEqual('.hashmap_data_migrator/hdm_profiles.yml', ProjectConfig.profile_path())

def test_hdm_env(self):
    self.assertEqual('unit-test', ProjectConfig.hdm_env())
    os.environ['HDM_ENV'] = 'prod'
    self.assertEqual('prod', ProjectConfig.hdm_env())