def ingest(table_name, options):
    landing_table = f'data.{table_name}'
    service_user_creds = options['service_user_creds']

    for subject in options.get('subjects_list') or ['']:
        for event in LOGIN_EVENTS:
            # resume from the latest event already landed for this subject and event type
            items = get_logs(
                service_user_creds,
                with_subject=subject,
                event_name=event,
                start_time=db.fetch_latest(
                    landing_table,
                    where=(
                        f"delegating_subject='{subject}' AND "
                        f"event_name='{event}'"
                    ),
                ),
            ).get('items', [])

            # land one row per activity item; the event parameters list is flattened into a dict
            db.insert(
                landing_table,
                values=[
                    (
                        item['id']['time'],
                        item['etag'].strip('"'),
                        subject,
                        item.get('events', [{}])[0].get('name'),
                        {
                            p['name']: (
                                p.get('value')
                                or p.get('boolValue')
                                or p.get('multiValue')
                            )
                            for p in item.get('events', [{}])[0].get('parameters', [])
                        },
                        item['id']['customerId'],
                        item['actor'].get('email'),
                        item['actor'].get('profileId'),
                        item.get('ipAddress'),
                        item,
                    )
                    for item in items
                ],
                # column1..column10 map to the values tuple above; JSON values are parsed on insert
                select=(
                    'CURRENT_TIMESTAMP()',
                    'column1',
                    'column2',
                    'column3',
                    'column4',
                    'PARSE_JSON(column5)',
                    'column6',
                    'column7',
                    'column8',
                    'column9',
                    'PARSE_JSON(column10)',
                ),
            )
            yield len(items)
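
# get_logs() and LOGIN_EVENTS are defined elsewhere in the connector module. A
# minimal sketch of what get_logs() might look like, assuming the Google Admin SDK
# Reports API via google-api-python-client and google-auth; the scope and parameter
# choices here are illustrative assumptions, not taken from the source.
from google.oauth2 import service_account
from googleapiclient.discovery import build

def get_logs(service_user_creds, with_subject, event_name, start_time=None):
    creds = service_account.Credentials.from_service_account_info(
        service_user_creds,
        scopes=['https://www.googleapis.com/auth/admin.reports.audit.readonly'],
    )
    if with_subject:
        # domain-wide delegation: act on behalf of the given admin subject
        creds = creds.with_subject(with_subject)
    reports = build('admin', 'reports_v1', credentials=creds, cache_discovery=False)
    return reports.activities().list(
        userKey='all',
        applicationName='login',
        eventName=event_name,
        startTime=start_time.strftime('%Y-%m-%dT%H:%M:%SZ') if start_time else None,
    ).execute()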
def ingest(table_name, options):
    base_name = re.sub(r'_CONNECTION$', '', table_name)
    storage_account = options['storage_account']
    sas_token = vault.decrypt_if_encrypted(options['sas_token'])
    suffix = options['suffix']
    container_name = options['container_name']
    snowflake_account = options['snowflake_account']
    sa_user = options['sa_user']
    database = options['database']

    block_blob_service = BlockBlobService(
        account_name=storage_account,
        sas_token=sas_token,
        endpoint_suffix=suffix,
    )

    db.execute(f"select SYSTEM$PIPE_FORCE_RESUME('DATA.{base_name}_PIPE');")

    last_loaded = db.fetch_latest(f'data.{table_name}', 'loaded_on')
    log.info(f"Last loaded time is {last_loaded}")

    blobs = block_blob_service.list_blobs(container_name)
    new_files = [
        StagedFile(b.name, None)
        for b in blobs
        if (last_loaded is None or b.properties.creation_time > last_loaded)
    ]
    log.info(f"Found {len(new_files)} files to ingest")

    # Proxy object that abstracts the Snowpipe REST API
    ingest_manager = SimpleIngestManager(
        account=snowflake_account,
        host=f'{snowflake_account}.snowflakecomputing.com',
        user=sa_user,
        pipe=f'{database}.data.{base_name}_PIPE',
        private_key=load_pkb_rsa(PRIVATE_KEY, PRIVATE_KEY_PASSWORD),
    )

    if len(new_files) > 0:
        for file_group in groups_of(4999, new_files):
            response = ingest_manager.ingest_files(file_group)
            log.info(response)
            yield len(file_group)
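
# groups_of() is a helper that is not shown in this excerpt. A minimal sketch,
# assuming it simply splits an iterable into chunks of at most n items (4999 is
# presumably chosen to stay under the Snowpipe REST API's per-request file limit):
from itertools import islice

def groups_of(n, iterable):
    it = iter(iterable)
    while True:
        chunk = list(islice(it, n))
        if not chunk:
            return
        yield chunk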
def ingest(table_name, options):
    landing_table = f'data.{table_name}'
    api_key = options['api_key']
    subdomain = options['subdomain']

    url = f'https://{subdomain}.okta.com/api/v1/logs'
    headers = {
        'Accept': 'application/json',
        'Content-Type': 'application/json',
        'Authorization': f'SSWS {api_key}',
    }

    ts = db.fetch_latest(landing_table, 'event_time')
    if ts is None:
        log.error("Unable to find a timestamp of most recent Okta log, "
                  "defaulting to one hour ago")
        ts = datetime.datetime.now() - datetime.timedelta(hours=1)

    params = {'since': ts.strftime("%Y-%m-%dT%H:%M:%S.000Z")}

    while 1:
        response = requests.get(url=url, headers=headers, params=params)
        if response.status_code != 200:
            log.error('OKTA REQUEST FAILED: ', response.text)
            return

        result = response.json()
        if result == []:
            break

        db.insert(
            landing_table,
            values=[(row, row['published']) for row in result],
            select='PARSE_JSON(column1), column2',
        )
        log.info(f'Inserted {len(result)} rows.')
        yield len(result)

        # Okta paginates via the Link response header; this takes the second entry
        # (the "next" link) and strips the surrounding angle brackets
        url = response.headers['Link'].split(', ')[1].split(';')[0][1:-1]
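
# The Link-header handling above assumes Okta's "next" link is always the second
# entry in that header. A slightly more defensive sketch using requests' own parser
# (the same approach the users/groups connector below takes); next_page_url is an
# illustrative helper name, not from the source:
def next_page_url(response):
    links = requests.utils.parse_header_links(response.headers.get('Link', ''))
    return next((link['url'] for link in links if link.get('rel') == 'next'), None)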
def ingest(table_name, options):
    ingest_type = (
        'users' if table_name.endswith('_USERS_CONNECTION')
        else 'groups' if table_name.endswith('_GROUPS_CONNECTION')
        else 'logs'
    )
    landing_table = f'data.{table_name}'
    api_key = options['api_key']
    subdomain = options['subdomain']

    ingest_urls = {
        'users': f'https://{subdomain}.okta.com/api/v1/users',
        'deprovisioned_users': f'https://{subdomain}.okta.com/api/v1/users?filter=status+eq+\"DEPROVISIONED\"',
        'groups': f'https://{subdomain}.okta.com/api/v1/groups',
        'logs': f'https://{subdomain}.okta.com/api/v1/logs',
    }

    headers = {
        'Accept': 'application/json',
        'Content-Type': 'application/json',
        'Authorization': f'SSWS {api_key}',
    }

    timestamp = datetime.datetime.utcnow()

    if ingest_type == 'groups':
        response = requests.get(url=ingest_urls[ingest_type], headers=headers)
        result = response.json()

        # enrich each group with its member list before landing it
        for row in result:
            try:
                row['users'] = requests.get(
                    url=row['_links']['users']['href'], headers=headers
                ).json()
            except TypeError:
                log.info(row)
                raise

        db.insert(
            landing_table,
            values=[(row, timestamp) for row in result],
            select='PARSE_JSON(column1), column2',
        )
        log.info(f'Inserted {len(result)} rows.')
        yield len(result)

    elif ingest_type == 'users':
        yield from ingest_users(ingest_urls['users'], headers, landing_table, timestamp)
        yield from ingest_users(ingest_urls['deprovisioned_users'], headers, landing_table, timestamp)

    else:
        ts = db.fetch_latest(landing_table, 'event_time')
        if ts is None:
            log.error("Unable to find a timestamp of most recent Okta log, "
                      "defaulting to one hour ago")
            ts = datetime.datetime.now() - datetime.timedelta(hours=1)

        params = {'since': ts.strftime("%Y-%m-%dT%H:%M:%S.000Z"), 'limit': 500}
        i = 0
        log.info(f"Fetching Okta logs since {params['since']}")

        url = ingest_urls[ingest_type]
        while 1:
            response = requests.get(url=url, headers=headers, params=params)
            if response.status_code != 200:
                log.error('OKTA REQUEST FAILED: ', response.text)
                return

            result = response.json()
            if result == []:
                break

            db.insert(
                landing_table,
                values=[(row, row['published']) for row in result],
                select='PARSE_JSON(column1), column2',
            )
            log.info(f'Inserted {len(result)} rows. {i}')
            i += 1
            yield len(result)

            # follow Okta's Link-header pagination; stop when there is no "next" link
            url = ''
            links = requests.utils.parse_header_links(response.headers['Link'])
            for link in links:
                if link['rel'] == 'next':
                    url = link['url']
            if len(url) == 0:
                break
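
# ingest_users() is referenced above but not defined in this excerpt. A hypothetical
# sketch, assuming it pages through the Okta users endpoint the same way the logs
# branch does and lands each page with the run's timestamp:
def ingest_users(url, headers, landing_table, timestamp):
    while url:
        response = requests.get(url=url, headers=headers)
        if response.status_code != 200:
            log.error('OKTA REQUEST FAILED: ', response.text)
            return
        result = response.json()
        if not result:
            return
        db.insert(
            landing_table,
            values=[(row, timestamp) for row in result],
            select='PARSE_JSON(column1), column2',
        )
        yield len(result)
        # follow pagination via the Link header, as in the logs branch
        links = requests.utils.parse_header_links(response.headers.get('Link', ''))
        url = next((link['url'] for link in links if link.get('rel') == 'next'), None)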
def ingest(table_name, options):
    landing_table = f'data.{table_name}'
    username = options['username']
    password = options['password']
    security_token = options['security_token']
    environment_raw = options['environment']
    environment = 'test' if environment_raw == 'test' else None

    # We will fetch EventLogFiles where the LogDate is greater than the maximum
    # timestamp seen in all previous EventLogFiles
    start_time = db.fetch_latest(landing_table, col='raw:TIMESTAMP_DERIVED')
    if start_time is None:
        start_time = '1900-01-01T00:00:00.000Z'

    # TODO: Support more auth methods, including client certificates.
    sf = Salesforce(
        username=username,
        password=password,
        security_token=security_token,
        client_id='SnowAlert',
        domain=environment,
    )

    event_log_soql_query = (
        f'SELECT id, eventtype, logdate '
        f'FROM eventlogfile '
        f'WHERE interval=\'Hourly\' '
        f'  AND logdate > {start_time}'
    )
    log.info(f'Querying event logs: {event_log_soql_query}')
    log_files = sf.query_all(event_log_soql_query)

    # Create a temp directory only accessible by the current user, which we will
    # delete after the Snowflake upload
    temp_dir = tempfile.mkdtemp('_sfevents')

    # Salesforce provides an hourly extract of each of the different event types in
    # CSV format. There are around 50 different event types and they all have
    # different fields, so rather than a table per event type we convert them to
    # JSON and do schema-on-read. We load from the table stage, which has the
    # 'STRIP_OUTER_ARRAY' option, so there will be one row per event.
    total_files = log_files['totalSize']
    log.info(f'Found {total_files} event files to load.')

    if total_files > 0:
        for record in log_files['records']:
            url = record['attributes']['url']
            id = record['Id']
            log.info(f'Downloading event log file {id} from {url}.')

            # The URL provided is relative but includes part of the base URL, which we
            # have to trim out before combining. E.g. it could look like
            # /services/data/v38.0/sobjects/EventLogFile/0AT0o00000NSIv5GAB
            # where the base URL looks like: https://ap8.salesforce.com/services/data/v38.0/
            url_relative = 'sobjects/' + url.split('sobjects/')[1] + '/LogFile'
            result = sf._call_salesforce('GET', sf.base_url + url_relative, name=url_relative)

            # TODO: Investigate streaming the result and converting to JSON in chunks.
            # The current method has high memory requirements for large files, but they are
            # unlikely to be multi-GB hourly unless it's a really busy Salesforce org.
            reader = csv.DictReader(io.StringIO(result.text))
            file_path = os.path.join(temp_dir, id + '.json')
            with open(file_path, 'w') as f:
                # This creates a single-line JSON file containing an array of objects
                json.dump(list(reader), f)

        # Copy all the staged .json files into the landing table
        log.info(f'Uploading all files to Snowflake stage: {table_name}.')
        db.copy_file_to_table_stage(table_name, os.path.join(temp_dir, '*.json'))
        log.info(f'Upload successful, deleting all local files.')
        shutil.rmtree(temp_dir)

        # The table is configured to purge upon load from its stage, so we don't need to clean up
        log.info(f'Copying events into Snowflake table from staged files.')
        db.load_from_table_stage(table_name)
        log.info(f'Loaded {total_files} event files.')
    else:
        log.info(f'Skipping load as there are no new event files.')

    return total_files
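
# db.copy_file_to_table_stage() and db.load_from_table_stage() come from the shared
# db helper module and are not shown in this excerpt. A rough, hypothetical sketch of
# the SQL they would run against the table's internal stage; the stage reference and
# the explicit file-format/purge options below are assumptions, since the source says
# those are configured on the table and its stage:
def copy_file_to_table_stage(table_name, path_pattern):
    # upload the local JSON files into the table stage
    db.execute(f"PUT file://{path_pattern} @data.%{table_name}")

def load_from_table_stage(table_name):
    # STRIP_OUTER_ARRAY yields one row per event; staged files are purged after load
    db.execute(
        f"COPY INTO data.{table_name} FROM @data.%{table_name} "
        f"FILE_FORMAT=(TYPE=JSON STRIP_OUTER_ARRAY=TRUE) PURGE=TRUE"
    )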