Esempio n. 1
0
def main():
    """Ingest new Azure blob files into Snowflake via Snowpipe.

    Scans every pipe registered under the 'data' schema, reads the pipe's
    YAML comment for Azure metadata, lists blobs created since the target
    table's newest load timestamp, and submits them to Snowpipe.
    """
    for pipe in db.get_pipes('data'):
        # Pipe metadata is stored as YAML in the pipe comment.
        # safe_load: the comment is plain data, and yaml.load without a
        # Loader is both unsafe and a TypeError on PyYAML >= 6.
        metadata = yaml.safe_load(pipe['comment'])

        # BUG FIX: the original `if metadata and ...` let a missing/empty
        # comment fall through and crash on metadata['blob'] below.
        if not metadata or metadata.get('type') != 'Azure':
            log.info(f"{pipe['name']} is not an Azure pipe, and will be skipped.")
            continue

        blob_name = metadata['blob']
        account_name = metadata['account']
        pipe_name = pipe['name']
        table = metadata['target']

        # SAS token comes from the environment (preferred) or the metadata.
        sas_token_envar = 'AZURE_SAS_TOKEN_' + metadata.get('suffix', '')
        if sas_token_envar in environ:
            encrypted_sas_token = environ.get(sas_token_envar)
        elif 'encrypted_sas_token' in metadata:
            encrypted_sas_token = metadata['encrypted_sas_token']
        else:
            log.info(f"{pipe['name']} has no azure auth")
            continue

        sas_token = vault.decrypt_if_encrypted(encrypted_sas_token)

        log.info(f"Now working on pipe {pipe_name}")

        endpoint_suffix = metadata.get('endpoint_suffix', 'core.windows.net')

        block_blob_service = BlockBlobService(
            account_name=account_name,
            sas_token=sas_token,
            endpoint_suffix=endpoint_suffix
        )

        files = block_blob_service.list_blobs(blob_name)

        # Only ingest blobs newer than the table's latest load time; when
        # no time is recorded yet, take everything.
        newest_time = get_timestamp(table)
        new_files = [
            StagedFile(file.name, None)
            for file in files
            if not newest_time or file.properties.creation_time > newest_time
        ]

        log.info(new_files)

        # Proxy object that abstracts the Snowpipe REST API
        ingest_manager = SimpleIngestManager(account=environ.get('SNOWFLAKE_ACCOUNT'),
                                             host=f'{environ.get("SNOWFLAKE_ACCOUNT")}.snowflakecomputing.com',
                                             user=environ.get('SA_USER'),
                                             pipe=f'SNOWALERT.DATA.{pipe_name}',
                                             private_key=load_pkb_rsa(PRIVATE_KEY, PRIVATE_KEY_PASSWORD))
        if new_files:
            try:
                response = ingest_manager.ingest_files(new_files)
                log.info(response)
            except Exception as e:
                log.error(e)
                return
Esempio n. 2
0
 def __init__(self, config: Config, pipe: str) -> None:
     """Bind a Snowpipe ingest client to a single pipe.

     Args:
         config: connection settings; account, url, user and
             private_key_pem are read from it.
         pipe: fully qualified name of the Snowflake pipe to manage.
     """
     self.logger = logging.getLogger(self.__class__.__name__)
     self.config = config
     # SimpleIngestManager wraps the Snowpipe REST API for this one pipe.
     self.ingest_manager = SimpleIngestManager(
         account=self.config.account,
         host=self.config.url,
         user=self.config.user,
         private_key=self.config.private_key_pem,
         pipe=pipe,
     )
Esempio n. 3
0
def test_simple_ingest(connection_ctx, test_util):
    """End-to-end check: stage one CSV, ingest it through Snowpipe, then
    poll the load history (2 minute budget) until the file appears."""
    param = connection_ctx['param']

    pipe_name = '{}.{}.TEST_SIMPLE_INGEST_PIPE'.format(param['database'],
                                                       param['schema'])

    private_key = test_util.read_private_key()
    print(private_key)

    cursor = connection_ctx['cnx'].cursor()

    data_file = os.path.join(test_util.get_data_dir(), 'test_file.csv')

    # Set up table, stage, staged file, and the pipe under test.
    for statement in (
            'create or replace table TEST_SIMPLE_INGEST_TABLE(c1 number, c2 string)',
            'create or replace stage TEST_SIMPLE_INGEST_STAGE',
            'put file://{} @TEST_SIMPLE_INGEST_STAGE'.format(data_file),
            'create or replace pipe {0} as copy into TEST_SIMPLE_INGEST_TABLE '
            'from @TEST_SIMPLE_INGEST_STAGE'.format(pipe_name)):
        cursor.execute(statement)

    ingest_manager = SimpleIngestManager(account=param['account'],
                                         user=param['user'],
                                         private_key=private_key,
                                         pipe=pipe_name,
                                         scheme=param['protocol'],
                                         host=param['host'],
                                         port=param['port'])

    resp = ingest_manager.ingest_files([StagedFile('test_file.csv.gz', None)])
    assert resp['responseCode'] == 'SUCCESS'

    # Poll history until the file shows up, or give up after 120 seconds.
    deadline = time.time() + 120
    while time.time() < deadline:
        history = ingest_manager.get_history()

        if len(history['files']) == 1:
            assert history['files'][0]['path'] == 'test_file.csv.gz'
            return

        # wait for 20 seconds before polling again
        time.sleep(20)

    assert False
Esempio n. 4
0
class SnowpipeApi:
    """Thin command-style wrapper over the Snowpipe REST client.

    Every public method issues one REST call tagged with a fresh request
    id and pretty-prints the JSON response to stdout.
    """

    def __init__(self, config: Config, pipe: str) -> None:
        """Create an ingest client bound to one pipe, configured from
        *config* (account, url, user, private_key_pem)."""
        self.logger = logging.getLogger(self.__class__.__name__)
        self.config = config
        self.ingest_manager = SimpleIngestManager(
            account=self.config.account,
            host=self.config.url,
            user=self.config.user,
            private_key=self.config.private_key_pem,
            pipe=pipe,
        )

    def _request_id(self) -> uuid.UUID:
        """Generate and debug-log a correlation id for one REST call."""
        request_id = uuid.uuid4()
        self.logger.debug(f'request_id: {request_id}')
        return request_id

    def report(self, recent_seconds: Optional[int] = None) -> None:
        """Print the pipe's recent load history."""
        body = self.ingest_manager.get_history(recent_seconds=recent_seconds,
                                               request_id=self._request_id())
        print(json.dumps(body, indent=4))

    def history(self,
                start_time: datetime,
                end_time: Optional[datetime] = None) -> None:
        """Print the load history between *start_time* (inclusive) and
        *end_time* (exclusive; open-ended when omitted)."""
        if not start_time:
            raise ValueError('start_time must be defined')
        end = end_time.isoformat() if end_time else None
        body = self.ingest_manager.get_history_range(
            start_time_inclusive=start_time.isoformat(),
            end_time_exclusive=end,
            request_id=self._request_id())
        print(json.dumps(body, indent=4))

    def ingest(self, files: Iterable[str]) -> None:
        """Submit the named staged *files* to Snowpipe and print the
        response."""
        if not files:
            raise ValueError('files must be defined')
        staged = [StagedFile(name, None) for name in files]
        body = self.ingest_manager.ingest_files(
            staged_files=staged,
            request_id=self._request_id(),
        )
        print(json.dumps(body, indent=4))
Esempio n. 5
0
def ingest(table_name, options):
    """Submit newly created Azure blobs to Snowpipe, yielding the number
    of files in each submitted batch.

    table_name: name of the *_CONNECTION landing table.
    options: dict of Azure/Snowflake connection settings.
    """
    base_name = re.sub(r'_CONNECTION$', '', table_name)

    sas_token = vault.decrypt_if_encrypted(options['sas_token'])
    blob_service = BlockBlobService(account_name=options['storage_account'],
                                    sas_token=sas_token,
                                    endpoint_suffix=options['suffix'])

    snowflake_account = options['snowflake_account']
    database = options['database']

    # Make sure the pipe is running before handing it files.
    db.execute(f"select SYSTEM$PIPE_FORCE_RESUME('DATA.{base_name}_PIPE');")

    last_loaded = db.fetch_latest(f'data.{table_name}', 'loaded_on')

    log.info(f"Last loaded time is {last_loaded}")

    # Keep only blobs created after the last load (or everything on the
    # very first run).
    new_files = []
    for blob in blob_service.list_blobs(options['container_name']):
        if last_loaded is None or blob.properties.creation_time > last_loaded:
            new_files.append(StagedFile(blob.name, None))

    log.info(f"Found {len(new_files)} files to ingest")

    # Proxy object that abstracts the Snowpipe REST API
    ingest_manager = SimpleIngestManager(
        account=snowflake_account,
        host=f'{snowflake_account}.snowflakecomputing.com',
        user=options['sa_user'],
        pipe=f'{database}.data.{base_name}_PIPE',
        private_key=load_pkb_rsa(PRIVATE_KEY, PRIVATE_KEY_PASSWORD))

    # A single ingest request is capped, so submit in chunks.
    if new_files:
        for chunk in groups_of(4999, new_files):
            response = ingest_manager.ingest_files(chunk)
            log.info(response)
            yield len(chunk)
Esempio n. 6
0
    pemlines = pem_in.read()
    private_key_obj = load_pem_private_key(
        pemlines, os.environ['PRIVATE_KEY_PASSPHRASE'].encode(),
        default_backend())

# Re-serialize the loaded key as unencrypted PKCS#8 PEM text, which is
# the form SimpleIngestManager expects.
private_key_text = private_key_obj.private_bytes(
    Encoding.PEM, PrivateFormat.PKCS8, NoEncryption()).decode('utf-8')
# Assume the public key has been registered in Snowflake:
# private key in PEM format

# List of files in the stage specified in the pipe definition
file_list = ['departments.csv']

# Client for the Snowpipe REST API, bound to one pipe.
# NOTE(review): account/host/user are placeholders scrubbed from the
# original example — fill in real values before running.
ingest_manager = SimpleIngestManager(
    account='<>',
    host='<>.<>.snowflakecomputing.com',
    user='******',
    pipe='dataload_db.dataload.load_dept_pipe',
    private_key=private_key_text)

# List of files, but wrapped into a class
staged_file_list = []
for file_name in file_list:
    print(file_name)
    staged_file_list.append(StagedFile(file_name, None))

# Echo the wrapped files for debugging.
for staged_file in staged_file_list:
    print(staged_file)

try:
    resp = ingest_manager.ingest_files(staged_file_list)
    print(resp)
     ), 'rb') as pem_in:
    pemlines = pem_in.read()
    private_key_obj = load_pem_private_key(
        pemlines, os.environ['PRIVATE_KEY_PASSPHRASE'].encode(),
        default_backend())

# Re-serialize the loaded key as unencrypted PKCS#8 PEM text, which is
# the form SimpleIngestManager expects.
private_key_text = private_key_obj.private_bytes(
    Encoding.PEM, PrivateFormat.PKCS8, NoEncryption()).decode('utf-8')
# Assume the public key has been registered in Snowflake:
# private key in PEM format

# List of files in the stage specified in the pipe definition
file_list = ['/CSKT_TEXT_2020-01-15-10-17.parquet']
# Client for the Snowpipe REST API, bound to one pipe.
ingest_manager = SimpleIngestManager(
    account='storaenso',
    host='storaenso.west-europe.azure.snowflakecomputing.com',
    user='******',
    pipe='SOURCE_SYSTEM_TEST.SAPRAW.TEST_PIPE_1',
    private_key=private_key_text)
# List of files, but wrapped into a class
staged_file_list = []
for file_name in file_list:
    staged_file_list.append(StagedFile(file_name, None))

# Submit the ingest request; a non-error response only means Snowflake
# accepted the request, not that loading has finished.
try:
    resp = ingest_manager.ingest_files(staged_file_list)
except HTTPError as e:
    # HTTP error, may need to retry
    logger.error(e)
    exit(1)

# This means Snowflake has received file and will start loading
Esempio n. 8
0
    def load_via_snowpipe(self, s3_key, stream):
        """Perform data transfer from the stage to Snowflake using Snowpipe.

        Creates a per-table pipe, submits the staged file to the Snowpipe
        REST API (retrying HTTP failures with growing waits), polls the
        pipe history until the load is reported, then drops the pipe.

        Args:
            s3_key: key of the staged file to load.
            stream: stream name used to resolve the target table.
        """
        def _generate_pipe_name(dbname, schema_table_name):
            # Pipe object names must not carry the identifier quotes.
            stripped_db_name = dbname.replace('"', '')
            stripped_table_name = schema_table_name.replace('"', '')
            return f"{stripped_db_name}.{stripped_table_name}_s3_pipe"

        def _generate_pipe_args(pipe_name, schema_table_name,
                                columns_with_trans):
            # Values interpolated into the CREATE PIPE statement below.
            pipe_args = dict(
                pipe_name=pipe_name,
                db_name=self.connection_config['dbname'],
                obj_name=schema_table_name,
                stage=self.connection_config['stage'],
                file_format=self.connection_config['file_format'],
                cols=', '.join([c['name'] for c in columns_with_trans]),
            )
            return pipe_args

        def _load_private_key():
            # Read the RSA key and re-serialize it as unencrypted PKCS#8
            # PEM text, the form SimpleIngestManager expects.
            # BUG FIX: connection_config is a dict (it is subscripted and
            # .get()-accessed throughout this method), so the original
            # getattr(self.connection_config, ...) calls always returned
            # the default value. Use dict lookups instead.
            key_path = self.connection_config.get("private_key_path",
                                                  "/rsa_key.p8")
            password = self.connection_config.get("private_key_password",
                                                  None)
            with open(key_path, 'rb') as pem_in:
                private_key_obj = load_pem_private_key(
                    pem_in.read(),
                    password=password,
                    backend=default_backend())

            private_key_text = private_key_obj.private_bytes(
                Encoding.PEM, PrivateFormat.PKCS8,
                NoEncryption()).decode('utf-8')
            return private_key_text

        def _increment_value(exponentially=False):
            # Fibonacci-driven wait generator: 2**fib(i) seconds when
            # exponential, fib(i) + 30 seconds otherwise.
            previous = 0
            current = 1
            while True:
                yield 2**(current + previous) if exponentially  \
                    else current + previous + 30
                current = current + previous
                previous = current - previous

        self.logger.info("Loading data using Snowpipe.")
        # Get list of columns with types and transformation
        columns_with_trans = [{
            "name": safe_column_name(name),
            "trans": column_trans(schema)
        } for (name, schema) in self.flatten_schema.items()]
        schema_table_name = self.table_name(stream, False)
        db_name = self.connection_config['dbname']

        pipe_name = _generate_pipe_name(db_name, schema_table_name)
        pipe_args = _generate_pipe_args(pipe_name, schema_table_name,
                                        columns_with_trans)

        create_pipe_sql = """create pipe {pipe_name} as
                            copy into {db_name}.{obj_name} ({cols})
                            from @{db_name}.{stage}
                            file_format = (format_name = {db_name}.{file_format} );""".format(
            **pipe_args)
        drop_pipe_sql = f"drop pipe if exists {pipe_name};"

        # Create snowpipe
        try:
            self.logger.debug("Creating snowpipe - %s.", pipe_name)

            # primary key in records found, raise warning
            if len(self.stream_schema_message['key_properties']) > 0:
                self.logger.warning(
                    "Primary key %s found in the data stream. Snowpipe can not be used to "
                    "consolidate records based upon keys. It can just copy data. "
                    "Please refer the docs for further details",
                    self.stream_schema_message['key_properties'])

            # primary key not present in the records, perform copy
            self.query(create_pipe_sql)
        except ProgrammingError as error:
            self.logger.error(
                "An error was encountered while creating the snowpipe, %s",
                error)

        # Private key encryption required to perform snowpipe data transfer
        private_key_text = _load_private_key()

        ingest_manager = SimpleIngestManager(
            account=self.connection_config['account'].split('.')[0],
            host=self.connection_config['account'] + '.snowflakecomputing.com',
            user=self.connection_config['user'],
            pipe=pipe_name,
            scheme='https',
            port=443,
            private_key=private_key_text)

        # List of files, but wrapped into a class
        staged_file_list = [StagedFile(s3_key, None)]

        # Ingest files using snowpipe, retrying HTTP errors with
        # exponentially growing waits.
        retries = self.connection_config.get('max_retry', 5)
        wait_time = _increment_value(exponentially=True)
        while True:
            try:
                self.logger.debug(
                    "Starting to ingest file via snowpipe, retries left %s",
                    retries)
                resp = ingest_manager.ingest_files(staged_file_list)
                self.logger.info(
                    "Snowpipe has received the files and will now start loading: %s",
                    resp['responseCode'])
                break
            except HTTPError as e:
                # HTTP error, wait and retry, exit if still fails
                self.logger.error(e)
                time.sleep(next(wait_time))
                retries -= 1
                # BUG FIX: self.logger.critcal (typo) raised AttributeError
                # on the exhausted-retries path; also guard retries <= 0 so
                # a configured max_retry of 0 cannot loop forever.
                if retries <= 0:
                    self.logger.critical(
                        "Max retry limit reached, Failed to load data using snowpipe"
                    )
                    sys.exit(1)

        # Needs to wait for a while to perform transfer, delete pipe after transfer
        wait_time = _increment_value()
        while True:
            history_resp = ingest_manager.get_history()

            if len(history_resp['files']) > 0:
                self.logger.info(
                    '''Ingest Report for snowpipe : %s
                                    STATUS: %s
                                    rowsInserted(rowsParsed): %s(%s)''',
                    history_resp['pipe'], history_resp['completeResult'],
                    history_resp['files'][0]['rowsInserted'],
                    history_resp['files'][0]['rowsParsed'])
                self.query(drop_pipe_sql)
                break
            else:
                self.logger.debug('waiting for snowpipe to transfer data...')
                time.sleep(next(wait_time))
Esempio n. 9
0
# load config
# contains user, keypassphrase, account, host and pipe attributes
# BUG FIX: SafeConfigParser was deprecated since Python 3.2 and removed in
# Python 3.12; ConfigParser is the drop-in replacement.
from configparser import ConfigParser

parser = ConfigParser()
parser.read('config.ini')
pipe = dict(parser.items("keyrus_snowflake_pipe"))

# add private key to dictionary (and drop keypassphrase)
with open('/home/dom/.ssh/sys_mlb_snowpipe_key.p8', 'rb') as key:
    pipe['private_key'] = load_pem_private_key(
        key.read(),
        password=pipe.pop('keypassphrase').encode(),
        backend=default_backend())

# ingest manager
ingest_manager = SimpleIngestManager(**pipe)

# list of files to ingest, wrapped in the StagedFile class Snowpipe expects
file_list = ['game_log_2017_04.csv']
staged_file_list = [StagedFile(file_name, None) for file_name in file_list]

# submit ingestion request
try:
    resp = ingest_manager.ingest_files(staged_file_list)
    log.info(f"Ingest Manager Response: {resp['responseCode']}")
except (IngestResponseError, HTTPError) as e:
    log.error(e)
    exit(1)
        backend=default_backend())

# Re-serialize the loaded key as unencrypted PKCS#8 PEM text, which is
# the form SimpleIngestManager expects.
private_key_text = private_key_obj.private_bytes(
    Encoding.PEM, PrivateFormat.PKCS8, NoEncryption()).decode('utf-8')
# Assume the public key has been registered in Snowflake:
# private key in PEM format

# List of files in the stage specified in the pipe definition
file_list = [
    '@%diamonds/diamonds_snowpipe_output0.csv.gz',
    '@%diamonds/diamonds_snowpipe_output1.csv.gz',
    '@%diamonds/diamonds_snowpipe_output2.csv.gz'
]
# Client for the Snowpipe REST API, bound to one pipe.
ingest_manager = SimpleIngestManager(
    account='firn',
    host='firn.ap-southeast-2.snowflakecomputing.com',
    user='******',
    pipe='OLIVER_DB.PUBLIC.DIAMONDS_PIPE',
    private_key=private_key_text)
# List of files, but wrapped into a class
staged_file_list = []
for file_name in file_list:
    staged_file_list.append(StagedFile(file_name, None))

# Submit the ingest request; a non-error response only means Snowflake
# accepted the request, not that loading has finished.
try:
    resp = ingest_manager.ingest_files(staged_file_list)
except HTTPError as e:
    # HTTP error, may need to retry
    logger.error(e)
    exit(1)

# This means Snowflake has received file and will start loading