def ingest(table_name, options):
    landing_table = f'data.{table_name}'
    service_user_creds = options['service_user_creds']

    for subject in options.get('subjects_list') or ['']:
        for event in LOGIN_EVENTS:
            # resume from the latest event already landed for this subject and event type
            items = get_logs(
                service_user_creds,
                with_subject=subject,
                event_name=event,
                start_time=db.fetch_latest(
                    landing_table,
                    where=(
                        f"delegating_subject='{subject}' AND "
                        f"event_name='{event}'"
                    ),
                ),
            ).get('items', [])

            # land one row per activity item; the event parameters list is flattened into a dict
            db.insert(
                landing_table,
                values=[
                    (
                        item['id']['time'],
                        item['etag'].strip('"'),
                        subject,
                        item.get('events', [{}])[0].get('name'),
                        {
                            p['name']: (
                                p.get('value')
                                or p.get('boolValue')
                                or p.get('multiValue')
                            )
                            for p in item.get('events', [{}])[0].get('parameters', [])
                        },
                        item['id']['customerId'],
                        item['actor'].get('email'),
                        item['actor'].get('profileId'),
                        item.get('ipAddress'),
                        item,
                    )
                    for item in items
                ],
                # column1..column10 map to the values tuple above; JSON values are parsed on insert
                select=(
                    'CURRENT_TIMESTAMP()',
                    'column1',
                    'column2',
                    'column3',
                    'column4',
                    'PARSE_JSON(column5)',
                    'column6',
                    'column7',
                    'column8',
                    'column9',
                    'PARSE_JSON(column10)',
                ),
            )
            yield len(items)
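
# get_logs() and LOGIN_EVENTS are defined elsewhere in the connector module. A
# minimal sketch of what get_logs() might look like, assuming the Google Admin SDK
# Reports API via google-api-python-client and google-auth; the scope and parameter
# choices here are illustrative assumptions, not taken from the source.
from google.oauth2 import service_account
from googleapiclient.discovery import build

def get_logs(service_user_creds, with_subject, event_name, start_time=None):
    creds = service_account.Credentials.from_service_account_info(
        service_user_creds,
        scopes=['https://www.googleapis.com/auth/admin.reports.audit.readonly'],
    )
    if with_subject:
        # domain-wide delegation: act on behalf of the given admin subject
        creds = creds.with_subject(with_subject)
    reports = build('admin', 'reports_v1', credentials=creds, cache_discovery=False)
    return reports.activities().list(
        userKey='all',
        applicationName='login',
        eventName=event_name,
        startTime=start_time.strftime('%Y-%m-%dT%H:%M:%SZ') if start_time else None,
    ).execute()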
def ingest(table_name, options):
    base_name = re.sub(r'_CONNECTION$', '', table_name)
    storage_account = options['storage_account']
    sas_token = vault.decrypt_if_encrypted(options['sas_token'])
    suffix = options['suffix']
    container_name = options['container_name']
    snowflake_account = options['snowflake_account']
    sa_user = options['sa_user']
    database = options['database']

    block_blob_service = BlockBlobService(
        account_name=storage_account,
        sas_token=sas_token,
        endpoint_suffix=suffix,
    )

    db.execute(f"select SYSTEM$PIPE_FORCE_RESUME('DATA.{base_name}_PIPE');")

    last_loaded = db.fetch_latest(f'data.{table_name}', 'loaded_on')
    log.info(f"Last loaded time is {last_loaded}")

    blobs = block_blob_service.list_blobs(container_name)
    new_files = [
        StagedFile(b.name, None)
        for b in blobs
        if (last_loaded is None or b.properties.creation_time > last_loaded)
    ]
    log.info(f"Found {len(new_files)} files to ingest")

    # Proxy object that abstracts the Snowpipe REST API
    ingest_manager = SimpleIngestManager(
        account=snowflake_account,
        host=f'{snowflake_account}.snowflakecomputing.com',
        user=sa_user,
        pipe=f'{database}.data.{base_name}_PIPE',
        private_key=load_pkb_rsa(PRIVATE_KEY, PRIVATE_KEY_PASSWORD),
    )

    if len(new_files) > 0:
        for file_group in groups_of(4999, new_files):
            response = ingest_manager.ingest_files(file_group)
            log.info(response)
            yield len(file_group)
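
# groups_of() is a helper that is not shown in this excerpt. A minimal sketch,
# assuming it simply splits an iterable into chunks of at most n items (4999 is
# presumably chosen to stay under the Snowpipe REST API's per-request file limit):
from itertools import islice

def groups_of(n, iterable):
    it = iter(iterable)
    while True:
        chunk = list(islice(it, n))
        if not chunk:
            return
        yield chunk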
def ingest(table_name, options):
    landing_table = f'data.{table_name}'
    api_key = options['api_key']
    subdomain = options['subdomain']

    url = f'https://{subdomain}.okta.com/api/v1/logs'
    headers = {
        'Accept': 'application/json',
        'Content-Type': 'application/json',
        'Authorization': f'SSWS {api_key}',
    }

    ts = db.fetch_latest(landing_table, 'event_time')
    if ts is None:
        log.error("Unable to find a timestamp of most recent Okta log, "
                  "defaulting to one hour ago")
        ts = datetime.datetime.now() - datetime.timedelta(hours=1)

    params = {'since': ts.strftime("%Y-%m-%dT%H:%M:%S.000Z")}

    while 1:
        response = requests.get(url=url, headers=headers, params=params)
        if response.status_code != 200:
            log.error('OKTA REQUEST FAILED: ', response.text)
            return

        result = response.json()
        if result == []:
            break

        db.insert(
            landing_table,
            values=[(row, row['published']) for row in result],
            select='PARSE_JSON(column1), column2',
        )
        log.info(f'Inserted {len(result)} rows.')
        yield len(result)

        # Okta paginates via the Link response header; this takes the second entry
        # (the "next" link) and strips the surrounding angle brackets
        url = response.headers['Link'].split(', ')[1].split(';')[0][1:-1]
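
# The Link-header handling above assumes Okta's "next" link is always the second
# entry in that header. A slightly more defensive sketch using requests' own parser
# (the same approach the users/groups connector below takes); next_page_url is an
# illustrative helper name, not from the source:
def next_page_url(response):
    links = requests.utils.parse_header_links(response.headers.get('Link', ''))
    return next((link['url'] for link in links if link.get('rel') == 'next'), None)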
def ingest(table_name, options):
    ingest_type = (
        'users' if table_name.endswith('_USERS_CONNECTION')
        else 'groups' if table_name.endswith('_GROUPS_CONNECTION')
        else 'logs'
    )
    landing_table = f'data.{table_name}'
    api_key = options['api_key']
    subdomain = options['subdomain']

    ingest_urls = {
        'users': f'https://{subdomain}.okta.com/api/v1/users',
        'deprovisioned_users': f'https://{subdomain}.okta.com/api/v1/users?filter=status+eq+\"DEPROVISIONED\"',
        'groups': f'https://{subdomain}.okta.com/api/v1/groups',
        'logs': f'https://{subdomain}.okta.com/api/v1/logs',
    }

    headers = {
        'Accept': 'application/json',
        'Content-Type': 'application/json',
        'Authorization': f'SSWS {api_key}',
    }

    timestamp = datetime.datetime.utcnow()

    if ingest_type == 'groups':
        response = requests.get(url=ingest_urls[ingest_type], headers=headers)
        result = response.json()

        # enrich each group with its member list before landing it
        for row in result:
            try:
                row['users'] = requests.get(
                    url=row['_links']['users']['href'], headers=headers
                ).json()
            except TypeError:
                log.info(row)
                raise

        db.insert(
            landing_table,
            values=[(row, timestamp) for row in result],
            select='PARSE_JSON(column1), column2',
        )
        log.info(f'Inserted {len(result)} rows.')
        yield len(result)

    elif ingest_type == 'users':
        yield from ingest_users(ingest_urls['users'], headers, landing_table, timestamp)
        yield from ingest_users(ingest_urls['deprovisioned_users'], headers, landing_table, timestamp)

    else:
        ts = db.fetch_latest(landing_table, 'event_time')
        if ts is None:
            log.error("Unable to find a timestamp of most recent Okta log, "
                      "defaulting to one hour ago")
            ts = datetime.datetime.now() - datetime.timedelta(hours=1)

        params = {'since': ts.strftime("%Y-%m-%dT%H:%M:%S.000Z"), 'limit': 500}
        i = 0
        log.info(f"Fetching Okta logs since {params['since']}")

        url = ingest_urls[ingest_type]
        while 1:
            response = requests.get(url=url, headers=headers, params=params)
            if response.status_code != 200:
                log.error('OKTA REQUEST FAILED: ', response.text)
                return

            result = response.json()
            if result == []:
                break

            db.insert(
                landing_table,
                values=[(row, row['published']) for row in result],
                select='PARSE_JSON(column1), column2',
            )
            log.info(f'Inserted {len(result)} rows. {i}')
            i += 1
            yield len(result)

            # follow Okta's Link-header pagination; stop when there is no "next" link
            url = ''
            links = requests.utils.parse_header_links(response.headers['Link'])
            for link in links:
                if link['rel'] == 'next':
                    url = link['url']
            if len(url) == 0:
                break
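
# ingest_users() is referenced above but not defined in this excerpt. A hypothetical
# sketch, assuming it pages through the Okta users endpoint the same way the logs
# branch does and lands each page with the run's timestamp:
def ingest_users(url, headers, landing_table, timestamp):
    while url:
        response = requests.get(url=url, headers=headers)
        if response.status_code != 200:
            log.error('OKTA REQUEST FAILED: ', response.text)
            return
        result = response.json()
        if not result:
            return
        db.insert(
            landing_table,
            values=[(row, timestamp) for row in result],
            select='PARSE_JSON(column1), column2',
        )
        yield len(result)
        # follow pagination via the Link header, as in the logs branch
        links = requests.utils.parse_header_links(response.headers.get('Link', ''))
        url = next((link['url'] for link in links if link.get('rel') == 'next'), None)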
def ingest(table_name, options):
    landing_table = f'data.{table_name}'
    username = options['username']
    password = options['password']
    security_token = options['security_token']
    environment_raw = options['environment']
    environment = 'test' if environment_raw == 'test' else None

    # We will fetch EventLogFiles where the LogDate is greater than the maximum
    # timestamp seen in all previous EventLogFiles
    start_time = db.fetch_latest(landing_table, col='raw:TIMESTAMP_DERIVED')
    if start_time is None:
        start_time = '1900-01-01T00:00:00.000Z'

    # TODO: Support more auth methods, including client certificates.
    sf = Salesforce(
        username=username,
        password=password,
        security_token=security_token,
        client_id='SnowAlert',
        domain=environment,
    )

    event_log_soql_query = (
        f'SELECT id, eventtype, logdate '
        f'FROM eventlogfile '
        f'WHERE interval=\'Hourly\' '
        f'  AND logdate > {start_time}'
    )
    log.info(f'Querying event logs: {event_log_soql_query}')
    log_files = sf.query_all(event_log_soql_query)

    # Create a temp directory only accessible by the current user, which we will
    # delete after the Snowflake upload
    temp_dir = tempfile.mkdtemp('_sfevents')

    # Salesforce provides an hourly extract of each of the different event types in
    # CSV format. There are around 50 different event types and they all have
    # different fields, so rather than a table per event type we convert them to
    # JSON and do schema-on-read. We load from the table stage, which has the
    # 'STRIP_OUTER_ARRAY' option, so there will be one row per event.
    total_files = log_files['totalSize']
    log.info(f'Found {total_files} event files to load.')

    if total_files > 0:
        for record in log_files['records']:
            url = record['attributes']['url']
            id = record['Id']
            log.info(f'Downloading event log file {id} from {url}.')

            # The URL provided is relative but includes part of the base URL, which we
            # have to trim out before combining. E.g. it could look like
            # /services/data/v38.0/sobjects/EventLogFile/0AT0o00000NSIv5GAB
            # where the base URL looks like: https://ap8.salesforce.com/services/data/v38.0/
            url_relative = 'sobjects/' + url.split('sobjects/')[1] + '/LogFile'
            result = sf._call_salesforce('GET', sf.base_url + url_relative, name=url_relative)

            # TODO: Investigate streaming the result and converting to JSON in chunks.
            # The current method has high memory requirements for large files, but they are
            # unlikely to be multi-GB hourly unless it's a really busy Salesforce org.
            reader = csv.DictReader(io.StringIO(result.text))
            file_path = os.path.join(temp_dir, id + '.json')
            with open(file_path, 'w') as f:
                # This creates a single-line JSON file containing an array of objects
                json.dump(list(reader), f)

        # Copy all the staged .json files into the landing table
        log.info(f'Uploading all files to Snowflake stage: {table_name}.')
        db.copy_file_to_table_stage(table_name, os.path.join(temp_dir, '*.json'))
        log.info(f'Upload successful, deleting all local files.')
        shutil.rmtree(temp_dir)

        # The table is configured to purge upon load from its stage, so we don't need to clean up
        log.info(f'Copying events into Snowflake table from staged files.')
        db.load_from_table_stage(table_name)
        log.info(f'Loaded {total_files} event files.')
    else:
        log.info(f'Skipping load as there are no new event files.')

    return total_files
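
# db.copy_file_to_table_stage() and db.load_from_table_stage() come from the shared
# db helper module and are not shown in this excerpt. A rough, hypothetical sketch of
# the SQL they would run against the table's internal stage; the stage reference and
# the explicit file-format/purge options below are assumptions, since the source says
# those are configured on the table and its stage:
def copy_file_to_table_stage(table_name, path_pattern):
    # upload the local JSON files into the table stage
    db.execute(f"PUT file://{path_pattern} @data.%{table_name}")

def load_from_table_stage(table_name):
    # STRIP_OUTER_ARRAY yields one row per event; staged files are purged after load
    db.execute(
        f"COPY INTO data.{table_name} FROM @data.%{table_name} "
        f"FILE_FORMAT=(TYPE=JSON STRIP_OUTER_ARRAY=TRUE) PURGE=TRUE"
    )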