Beispiel #1
0
    def __init__(self, **kwargs):
        """
        Set up the state manager and make sure its backing table exists.

        Keyword Args:
            format_date: stored unchanged on ``self._format_date``.
            ddl_file: path of the DDL script that is executed when the state manager table is missing.
            connection: connection name handed to the DAO configuration; defaults to 'state_manager'.
            dao: key passed to ``pf.create`` to build the DAO.
        """
        self._logger = self._get_logger()
        self._format_date = kwargs.get('format_date')
        self._source: SourceSinkDescriptor = SourceSinkDescriptor()
        self._sink: SourceSinkDescriptor = SourceSinkDescriptor()
        self._job_id = None
        self._run_id = None
        self._manifest_name = None
        self._ddl_file = kwargs.get('ddl_file')

        self._env = kwargs.get('connection', 'state_manager')
        dao_configuration = {'connection': kwargs.get('connection', 'state_manager')}
        self._conn = pf.create(key=kwargs.get('dao'), configuration=dao_configuration)

        # TODO: Pass Table name as args
        reflected = schema.MetaData(bind=self._conn.engine)
        reflected.reflect()

        if ProjectConfig.state_manager_table_name() not in reflected.tables:
            # Table is missing: run the DDL script, then reflect again so the new table becomes visible.
            with open(self._ddl_file, 'r') as ddl_stream:
                create_statement = ddl_stream.read()

            with self._conn.connection as db_connection:
                db_connection.execute(create_statement)

            reflected = schema.MetaData(bind=self._conn.engine)
            reflected.reflect()

        self._table: schema.Table = reflected.tables[ProjectConfig.state_manager_table_name()]
Beispiel #2
0
 def __init__(self, **kwargs):
     """
     Prepare the retry settings, then hand the keyword arguments to the parent constructor.

     The engine starts unset. The attempt limit and the timeout are read from
     ``ProjectConfig``; the timeout value is also stored as ``_timeout_factor``.
     """
     self._engine = None
     # Connection timeout management
     self._max_attempts = ProjectConfig.connection_max_attempts()
     self._timeout = self._timeout_factor = ProjectConfig.connection_timeout()
     super().__init__(**kwargs)
Beispiel #3
0
    def _create_engine(self) -> Any:
        """
        Build the engine for the MySQL connection named by ``self._connection_name``.

        The settings are read from the profile YAML, under the active HDM environment.

        Returns: result of ``create_engine`` for a ``mysql+mysqlconnector`` URL
        """
        profile_file = f"{ProjectConfig.hdm_home()}/{ProjectConfig.profile_path()}"
        with open(profile_file, 'r') as stream:
            settings = yaml.safe_load(stream)[ProjectConfig.hdm_env()][self._connection_name]

        # NOTE(review): user and password are interpolated verbatim; values containing
        # URL-reserved characters (e.g. '@', '/') would presumably need escaping - confirm.
        credentials = f"{settings['user']}:{settings['password']}"
        location = f"{settings['host']}:{settings['port']}/{settings['database']}"
        return create_engine(f"mysql+mysqlconnector://{credentials}@{location}")
Beispiel #4
0
    def _validate_configuration(self) -> bool:
        """
        Check that the profile entry of this connection carries every required setting.

        Returns: True when all top-level keys are present and the 'driver' entry
                 holds both 'name' and 'path'; False otherwise
        """
        profile_file = f"{ProjectConfig.hdm_home()}/{ProjectConfig.profile_path()}"
        with open(profile_file, 'r') as stream:
            settings = yaml.safe_load(stream)[ProjectConfig.hdm_env()][self._connection_name]

        # Top-level keys first: the driver section is only inspected when they are all there.
        available = settings.keys()
        if not all(key in available for key in ('host', 'port', 'database', 'user', 'password', 'driver')):
            return False

        driver_keys = settings['driver'].keys()
        return all(key in driver_keys for key in ('name', 'path'))
Beispiel #5
0
    def _create_stage(self, stage_name, source_directory, connection):
        """
        Create (or replace) the external Azure stage named by ``self._stage_name``.

        Args:
            stage_name: not used by this method; the stage name is taken from ``self._stage_name``.
            source_directory: not used by this method; the stage URL is built from ``self._stage_directory``.
            connection: open database connection that provides ``cursor()``.
        """
        # TODO: Check with John. How to get azure connection info- hardcoded below? Manefist or profile yml
        with open(f"{ProjectConfig.hdm_home()}/{ProjectConfig.profile_path()}",
                  'r') as stream:
            conn_conf = yaml.safe_load(stream)[ProjectConfig.hdm_env(
            )]['azure']  # Fai: Pull from manifest "azure_env" the azure env name here instead of hardcode

        # Swap only the leading scheme; a later "https" inside the URL must stay untouched.
        url = conn_conf['url'].replace("https", "azure", 1) + self._stage_directory
        self.__external_stage_params = f"URL = '{url}' CREDENTIALS = ( AZURE_SAS_TOKEN = '{conn_conf['sas']}')"

        create_stage_sql = f"CREATE OR REPLACE STAGE {self._stage_name}"
        create_stage_sql += f" {self.__external_stage_params}"

        cursor = connection.cursor()
        try:
            # create stage
            self._logger.info("Creating stage: %s", self._stage_name)
            cursor.execute(create_stage_sql)
        finally:
            # Release the cursor even when the statement fails.
            cursor.close()
Beispiel #6
0
    def _get_connection(self):
        """
        Create an azure blob service client for the configured connection.

        Side effect: ``self._container`` is set to the profile's 'container_name'.

        Returns: azure connection (the ``BlobServiceClient`` instance)

        Raises:
            Whatever ``_test_blob_container_existence`` raises for the new client;
            the previous documentation names ConnectionError - confirm against that helper.

        """
        profile_file = f"{ProjectConfig.hdm_home()}/{ProjectConfig.profile_path()}"
        with open(profile_file, 'r') as stream:
            settings = yaml.safe_load(stream)[ProjectConfig.hdm_env()][self._connection_name]

        self._container = settings['container_name']
        blob_service = BlobServiceClient(account_url=settings['url'], credential=settings['sas'])
        # The client is handed out only after the container check has run.
        self._test_blob_container_existence(blob_service)
        return blob_service
Beispiel #7
0
    def _create_engine(self) -> Any:
        """
        Build the engine for the SQLite database of ``self._connection_name``.

        Profile keys used:
            dbpath: location of the database; the current directory is used when it is missing or empty.
            database: database name; '.db' is appended when the name does not contain it.

        Returns: result of ``create_engine`` for a ``sqlite:///`` URL
        """
        with open(f"{ProjectConfig.hdm_home()}/{ProjectConfig.profile_path()}",
                  'r') as stream:
            conn_conf = yaml.safe_load(stream)[ProjectConfig.hdm_env()][
                self._connection_name]

        # A missing and an empty 'dbpath' both fall back to the current directory.
        db_path = os.path.abspath(conn_conf.get('dbpath') or os.curdir)

        database = str(conn_conf['database'])

        if ".db" not in database:
            db_path = os.path.join(db_path, f"{database}.db")
        elif os.path.isdir(db_path):
            # The name already carries '.db' but 'dbpath' is a directory: without this join
            # the URL would point at the directory itself instead of the database file.
            db_path = os.path.join(db_path, database)

        return create_engine(f"sqlite:///{db_path}")
Beispiel #8
0
    def _get_connection(self):
        """
        Obtain a context managed netezza connection

        Generator: yields one tested connection and closes it afterwards (presumably
        wrapped by a context-manager decorator that is outside this view - confirm).
        Up to ``self._max_attempts`` connections are tried. Between two attempts the
        method sleeps; the delay starts at ``self._timeout`` and is multiplied by
        ``self._timeout_factor`` after every wait.

        Yields: Netezza connection

        Raises:
            ConnectionError: Netezza connection could not be established

        """
        connection = None

        with open(f"{ProjectConfig.hdm_home()}/{ProjectConfig.profile_path()}", 'r') as stream:
            conn_conf = yaml.safe_load(stream)[ProjectConfig.hdm_env()][self._connection_name]

        connection_config = self._get_connection_config(config=conn_conf)

        connection_invalid = True
        connection_attempt_count = 0

        timeout = self._timeout

        while connection_attempt_count < self._max_attempts:
            connection = self._connect_by_connector(connection_config)

            # If your connection is valid, then set it so and break from while loop
            if self._test_connection(connection):
                connection_invalid = False
                break

            connection_attempt_count += 1
            # Sleep only when another attempt follows: compare the attempt counter (not the
            # boolean flag) so the last failed attempt raises without a pointless wait.
            if connection_attempt_count < self._max_attempts:
                time.sleep(timeout)
                timeout *= self._timeout_factor

        if connection_invalid:
            raise ConnectionError('Unable to connection to Netezza. Please try again.')

        try:
            yield connection
        finally:
            # Close the connection even when the consumer raises while using it.
            connection.close()
    def __build_query(self) -> None:
        """
        Assemble the INSERT ... SELECT statement and store it on the instance.

        Without a checksum configuration the select adds a ``ck_sum`` column filled
        with random numbers (default method).

        hash method:
            ** Important: Hash function needs IBM Netezza SQL Extensions toolkit installed
            checksum_method : hash
            hash_column  is the column to be hashed
            hash_function supported are :
                hash4 (returns the 32 bit checksum hash of the input data.)
                hash8 (returns the 64 bit hash of the input data)
                hash (returns hashed input data)
            ** Important: hash() function is much slower to calculate than hash4() and hash8()

        A watermark configuration appends a where clause, and a configured query
        limit appends a LIMIT clause.

        Returns: none

        """
        self.__query = (
            f"INSERT INTO {self.__external_table_name} "
            f"SELECT * , CAST(random()* 100000 AS INT) as ck_sum FROM {self.__table}"
        )

        # Checksum: a complete configuration replaces the default statement
        checksum = self.__checksum
        if checksum and checksum['function'] and checksum['column']:
            self.__query = self._generate_checksum_select_query(
                checksum=checksum,
                table_name=self.__table,
                external_table_name=self.__external_table_name,
            )

        # Watermark: the generated where clause is appended after a single space
        watermark = self.__watermark
        if watermark and watermark['column'] and watermark['offset']:
            where_clause = self._generate_watermarked_where_clause(
                watermark=watermark,
                last_data_pulled=self._last_record_pulled,
            )
            self.__query = " ".join((self.__query, where_clause))

        if ProjectConfig.query_limit():
            self.__query += f" LIMIT {ProjectConfig.query_limit()}"
Beispiel #10
0
    def _create_engine(self) -> Any:
        """
        Build the engine for an Azure SQL database from an ODBC connection string.

        The ODBC connection string can be copied from the azure portal, e.g.
        'Driver={ODBC Driver 13 for SQL Server};Server=tcp:yourDBServerName.database.windows.net,1433;
        Database=dbname;Uid=username;Pwd=xxx;Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30;

        Its values have to be populated in hdm_profile.yml.

        Important: {ODBC Driver 17 for SQL Server} works even though the connection string shown
        in the azure portal says ODBC Driver 13 for SQL Server

        Returns: result of ``create_engine`` for a ``mssql+pyodbc`` URL
        """
        profile_file = f"{ProjectConfig.hdm_home()}/{ProjectConfig.profile_path()}"
        with open(profile_file, 'r') as stream:
            settings = yaml.safe_load(stream)[ProjectConfig.hdm_env()][self._connection_name]

        odbc_string = (
            f"Driver={{{settings['driver']}}};Server=tcp:{settings['host']}.database.windows.net,{settings['port']};"
            f"Database={settings['database']};Uid={settings['user']};Pwd={settings['password']};Encrypt=yes;"
            f"TrustServerCertificate=no;Connection Timeout=30;"
        )

        # The whole ODBC string travels as one URL-quoted query parameter.
        quoted = urllib.parse.quote_plus(odbc_string)
        return create_engine(f"mssql+pyodbc:///?odbc_connect={quoted}")
Beispiel #11
0
    def _get_connection(self, **kwargs):
        """
        Create a boto3 session for the profile entry named by the 'connection' keyword argument.

        When the entry has a 'profile' key, the session is created from that named profile.
        Otherwise it is created from the entry's values for:
            aws_access_key_id
            aws_secret_access_key
            region_name

        Returns: boto3 session
        """
        profile_file = f"{ProjectConfig.hdm_home()}/{ProjectConfig.profile_path()}"
        with open(profile_file, 'r') as stream:
            settings = yaml.safe_load(stream)[ProjectConfig.hdm_env()][kwargs.get('connection')]

        # TODO - Validation needed for all the correct keys present or not
        if 'profile' not in settings:
            return boto3.session.Session(
                aws_access_key_id=settings.get('aws_access_key_id'),
                aws_secret_access_key=settings.get('aws_secret_access_key'),
                region_name=settings.get('region_name'))

        return boto3.session.Session(profile_name=settings.get('profile'))
Beispiel #12
0
 def test_archive_folder(self):
     """The archive folder setting is expected to be 'archive'."""
     expected = 'archive'
     self.assertEqual(expected, ProjectConfig.archive_folder())
Beispiel #13
0
 def test_hdm_home(self):
     """hdm_home() is expected to return the value of the HDM_HOME environment variable."""
     from unittest import mock

     home = os.getcwd()
     # patch.dict restores os.environ on exit, so HDM_HOME does not leak into other tests.
     with mock.patch.dict(os.environ, {'HDM_HOME': home}):
         self.assertEqual(home, ProjectConfig.hdm_home())
Beispiel #14
0
 def test_connection_timeout(self):
     """The connection timeout setting is expected to be 3."""
     expected = 3
     self.assertEqual(expected, ProjectConfig.connection_timeout())
Beispiel #15
0
 def test_connection_max_attempts(self):
     """The maximum number of connection attempts is expected to be 3."""
     expected = 3
     self.assertEqual(expected, ProjectConfig.connection_max_attempts())
Beispiel #16
0
 def test_state_manager_table_name(self):
     """The state manager table name is expected to be 'state_manager'."""
     expected = 'state_manager'
     self.assertEqual(expected, ProjectConfig.state_manager_table_name())
Beispiel #17
0
 def test_file_prefix(self):
     """The file prefix setting is expected to be 'hdm'."""
     expected = 'hdm'
     self.assertEqual(expected, ProjectConfig.file_prefix())
Beispiel #18
0
 def test_profile_path(self):
     """The profile path is expected to point at the hdm_profiles.yml file."""
     actual = ProjectConfig.profile_path()
     self.assertEqual(actual, ".hashmap_data_migrator/hdm_profiles.yml")
Beispiel #19
0
 def test_hdm_env(self):
     """hdm_env() is expected to be 'unit-test' initially and to follow a changed HDM_ENV variable."""
     from unittest import mock

     self.assertEqual('unit-test', ProjectConfig.hdm_env())
     # patch.dict restores os.environ on exit; assigning HDM_ENV directly would leave
     # 'prod' in place for every test that runs afterwards.
     with mock.patch.dict(os.environ, {'HDM_ENV': 'prod'}):
         self.assertEqual('prod', ProjectConfig.hdm_env())