def test_skip_rows_env(self):
    """test whether or not index skips rows per SKIP_ROWS_EXTS=LIST"""
    # because of module caching we can't just patch the environment variable,
    # since index.SKIP_ROWS_EXTS will never change after import
    with patch.dict(os.environ, {'SKIP_ROWS_EXTS': '.txt,.csv'}):
        exts = separated_env_to_iter('SKIP_ROWS_EXTS')
        with patch('index.SKIP_ROWS_EXTS', exts):
            assert '.parquet' not in exts
            assert '.csv' in exts
            assert '.txt' in exts
    with patch.dict(os.environ, {'SKIP_ROWS_EXTS': '.parquet,.tsvl'}):
        exts = separated_env_to_iter('SKIP_ROWS_EXTS')
        with patch('index.SKIP_ROWS_EXTS', exts):
            assert '.parquet' in exts
            assert '.csv' not in exts
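
# A minimal sketch (not part of the original suite) that makes the caching
# caveat above explicit: index.SKIP_ROWS_EXTS is computed once at import time,
# so changing the environment variable alone never reaches the module.
def test_skip_rows_env_read_once(self):
    """hypothetical companion test; assumes `index` was imported with
    SKIP_ROWS_EXTS unset or set to something other than '.xyz'
    """
    import index
    with patch.dict(os.environ, {'SKIP_ROWS_EXTS': '.xyz'}):
        # the env var changed, but the import-time module constant did not
        assert '.xyz' not in index.SKIP_ROWS_EXTS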
def test_separated_env_to_iter(self):
    """ensure the function that infers overrides from the env works:
    always returns a valid set(), perhaps empty, lowercases extensions
    """
    with patch.dict(os.environ, {'CONTENT_INDEX_EXTS': '.txt'}):
        assert separated_env_to_iter('CONTENT_INDEX_EXTS') == {'.txt'}
    with patch.dict(os.environ, {'CONTENT_INDEX_EXTS': ' .tXt '}):
        assert separated_env_to_iter('CONTENT_INDEX_EXTS') == {'.txt'}
    with patch.dict(os.environ, {'CONTENT_INDEX_EXTS': ' garbage gar.bage '}):
        assert separated_env_to_iter(
            'CONTENT_INDEX_EXTS',
            predicate=lambda x: x.startswith('.')
        ) == set()
    with patch.dict(os.environ, {'CONTENT_INDEX_EXTS': ' .Parquet, .csv, .tsv'}):
        assert separated_env_to_iter('CONTENT_INDEX_EXTS') == {'.parquet', '.csv', '.tsv'}
    with patch.dict(os.environ, {'CONTENT_INDEX_EXTS': ''}):
        assert separated_env_to_iter('CONTENT_INDEX_EXTS') == set(), \
            "Invalid sets should be empty and falsy"
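
# For reference, a minimal sketch of the behavior the test above pins down
# (the real implementation lives in t4_lambda_shared.utils and may differ):
# split on a separator, strip and lowercase each token, apply an optional
# predicate, and always return a set -- empty when nothing survives.
def _separated_env_to_iter_sketch(env_var, *, predicate=None, separator=','):
    raw = os.getenv(env_var, '')
    tokens = (t.strip().lower() for t in raw.split(separator) if t.strip())
    return {t for t in tokens if predicate is None or predicate(t)}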
from t4_lambda_shared.utils import (
    get_available_memory,
    get_quilt_logger,
    MANIFEST_PREFIX_V1,
    POINTER_PREFIX_V1,
    query_manifest_content,
    separated_env_to_iter,
)
from document_queue import (
    DocTypes,
    DocumentQueue,
    CONTENT_INDEX_EXTS,
    EVENT_PREFIX,
    MAX_RETRY,
)

NB_VERSION = 4  # default notebook version for nbformat
# currently only affects .parquet, TODO: extend to other extensions
SKIP_ROWS_EXTS = separated_env_to_iter('SKIP_ROWS_EXTS')
SELECT_PACKAGE_META = "SELECT * from S3Object o WHERE o.version IS NOT MISSING LIMIT 1"
# No WHERE clause needed for aggregations since S3 Select skips missing fields for aggs
SELECT_PACKAGE_STATS = (
    "SELECT SUM(obj['size']) as total_bytes, COUNT(obj['size']) as total_files "
    "from S3Object obj"
)
TEST_EVENT = "s3:TestEvent"
# we need to filter out GetObject and HeadObject calls generated by the present
# lambda in order to display accurate analytics in the Quilt catalog;
# a custom user agent enables said filtration
USER_AGENT_EXTRA = " quilt3-lambdas-es-indexer"


def now_like_boto3():
    """ensure timezone UTC for consistency with boto3:
    Example of what boto3 returns on head_object:
        'LastModified': datetime.datetime(2019, 11, 6, 3, 1, 16, tzinfo=tzutc()),
    """
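
# A minimal sketch (helper name and parameters are hypothetical) showing how a
# query like SELECT_PACKAGE_STATS can be issued via S3 Select; the indexer
# itself goes through query_manifest_content from t4_lambda_shared.utils.
def _select_package_stats_sketch(s3_client, bucket, manifest_key):
    """run SELECT_PACKAGE_STATS over a JSON-lines manifest, return raw bytes"""
    response = s3_client.select_object_content(
        Bucket=bucket,
        Key=manifest_key,
        Expression=SELECT_PACKAGE_STATS,
        ExpressionType='SQL',
        InputSerialization={'JSON': {'Type': 'LINES'}},
        OutputSerialization={'JSON': {}},
    )
    # the response payload is an event stream; 'Records' events carry the rows
    return b''.join(
        event['Records']['Payload']
        for event in response['Payload']
        if 'Records' in event
    )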
sending to Elasticsearch in memory-limited batches"""
from datetime import datetime
from enum import Enum
from math import floor
from typing import Dict, List
import os

from aws_requests_auth.aws_auth import AWSRequestsAuth
import boto3
from elasticsearch import Elasticsearch, RequestsHttpConnection
from elasticsearch.helpers import bulk

from t4_lambda_shared.utils import separated_env_to_iter
from t4_lambda_shared.preview import ELASTIC_LIMIT_BYTES

CONTENT_INDEX_EXTS = separated_env_to_iter("CONTENT_INDEX_EXTS") or {
    ".csv",
    ".ipynb",
    ".json",
    ".md",
    ".parquet",
    ".rmd",
    ".tsv",
    ".txt",
}

EVENT_PREFIX = {
    "Created": "ObjectCreated:",
    "Removed": "ObjectRemoved:",
}

# See https://amzn.to/2xJpngN for chunk size as a function of container size
CHUNK_LIMIT_BYTES = int(os.getenv('CHUNK_LIMIT_BYTES') or 9_500_000)
ELASTIC_TIMEOUT = 30
MAX_BACKOFF = 360  # seconds
MAX_RETRY = 4  # prevent long-running lambdas due to malformed calls
# signifies that the object is truly deleted, not to be confused with
# s3:ObjectRemoved:DeleteMarkerCreated, which we may see in versioned buckets
# see https://docs.aws.amazon.com/AmazonS3/latest/dev/NotificationHowTo.html
OBJECT_DELETE = "ObjectRemoved:Delete"
QUEUE_LIMIT_BYTES = 100_000_000  # 100MB
RETRY_429 = 5
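
# A minimal sketch (function name is hypothetical, not the DocumentQueue API)
# of how the limits above map onto the elasticsearch bulk helper, which takes
# a per-chunk byte ceiling plus retry/backoff settings.
def _send_all_sketch(es_client: Elasticsearch, actions: List[Dict]):
    """index `actions` in chunks no larger than CHUNK_LIMIT_BYTES"""
    return bulk(
        es_client,
        actions,
        max_chunk_bytes=CHUNK_LIMIT_BYTES,  # stay under the ES payload limit
        max_retries=MAX_RETRY,              # bounded retries on 429s
        max_backoff=MAX_BACKOFF,            # cap exponential backoff (seconds)
    )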