def test_collector(historical_role, buckets, mock_lambda_environment, swag_accounts, current_s3_table):
    from historical.s3.collector import handler
    from historical.s3.models import CurrentS3Model

    now = datetime.utcnow().replace(tzinfo=None, microsecond=0)
    create_event = CloudwatchEventFactory(
        detail=DetailFactory(requestParameters={"bucketName": "testbucket1"},
                             source="aws.s3",
                             eventName="CreateBucket",
                             eventTime=now))
    data = json.dumps(create_event, default=serialize)
    data = KinesisRecordsFactory(
        records=[KinesisRecordFactory(kinesis=KinesisDataFactory(data=data))])
    data = json.dumps(data, default=serialize)
    data = json.loads(data)

    handler(data, None)
    result = list(CurrentS3Model.query("arn:aws:s3:::testbucket1"))
    assert len(result) == 1

    # Verify that the tags are duplicated in the top level and configuration:
    assert len(result[0].Tags.attribute_values) == len(
        result[0].configuration.attribute_values["Tags"]) == 1
    assert result[0].Tags.attribute_values["theBucketName"] == \
        result[0].configuration.attribute_values["Tags"]["theBucketName"] == "testbucket1"  # noqa

    # Polling (make sure the date is included):
    polling_event = CloudwatchEventFactory(
        detail=DetailFactory(requestParameters={
            "bucketName": "testbucket1",
            "creationDate": now
        },
                             source="aws.s3",
                             eventName="DescribeBucket",
                             eventTime=now))
    data = json.dumps(polling_event, default=serialize)
    data = KinesisRecordsFactory(
        records=[KinesisRecordFactory(kinesis=KinesisDataFactory(data=data))])
    data = json.dumps(data, default=serialize)
    data = json.loads(data)

    handler(data, None)
    assert CurrentS3Model.count() == 1

    # Load the config and verify the polling timestamp is in there:
    result = list(CurrentS3Model.query("arn:aws:s3:::testbucket1"))
    assert result[0].configuration["CreationDate"] == now.isoformat() + "Z"

    # And deletion:
    delete_event = CloudwatchEventFactory(
        detail=DetailFactory(requestParameters={"bucketName": "testbucket1"},
                             source="aws.s3",
                             eventName="DeleteBucket",
                             eventTime=now))
    data = json.dumps(delete_event, default=serialize)
    data = KinesisRecordsFactory(
        records=[KinesisRecordFactory(kinesis=KinesisDataFactory(data=data))])
    data = json.dumps(data, default=serialize)
    data = json.loads(data)

    handler(data, None)
    assert CurrentS3Model.count() == 0
def test_collector(historical_role, buckets, mock_lambda_environment, swag_accounts, current_s3_table): """Test the Collector.""" from historical.s3.models import CurrentS3Model from historical.s3.collector import handler now = datetime.utcnow().replace(tzinfo=None, microsecond=0) create_event = CloudwatchEventFactory( detail=DetailFactory(requestParameters={"bucketName": "testbucket1"}, eventSource="aws.s3", eventName="CreateBucket", eventTime=now)) data = json.dumps(create_event, default=serialize) data = RecordsFactory(records=[SQSDataFactory(body=data)]) data = json.dumps(data, default=serialize) data = json.loads(data) handler(data, mock_lambda_environment) result = list(CurrentS3Model.query("arn:aws:s3:::testbucket1")) assert len(result) == 1 assert result[0].Tags.attribute_values["theBucketName"] == "testbucket1" assert result[0].eventSource == "aws.s3" # Polling (make sure the date is included): polling_event = CloudwatchEventFactory( detail=DetailFactory(requestParameters={ "bucketName": "testbucket1", "creationDate": now }, eventSource="historical.s3.poller", eventName="PollS3", eventTime=now)) data = json.dumps(polling_event, default=serialize) data = RecordsFactory(records=[SQSDataFactory(body=data)]) data = json.dumps(data, default=serialize) data = json.loads(data) handler(data, mock_lambda_environment) assert CurrentS3Model.count() == 1 # Load the config and verify the polling timestamp is in there: result = list(CurrentS3Model.query("arn:aws:s3:::testbucket1")) assert result[0].configuration["CreationDate"] == now.isoformat() + "Z" assert result[0].eventSource == "historical.s3.poller" # And deletion: delete_event = CloudwatchEventFactory( detail=DetailFactory(requestParameters={"bucketName": "testbucket1"}, eventSource="aws.s3", eventName="DeleteBucket", eventTime=now)) data = json.dumps(delete_event, default=serialize) data = RecordsFactory(records=[SQSDataFactory(body=data)]) data = json.dumps(data, default=serialize) data = json.loads(data) handler(data, mock_lambda_environment) assert CurrentS3Model.count() == 0
def test_current_table(current_s3_table):
    from historical.s3.models import CurrentS3Model

    CurrentS3Model(**S3_BUCKET).save()

    items = list(CurrentS3Model.query('arn:aws:s3:::testbucket1'))
    assert len(items) == 1
    assert isinstance(items[0].ttl, int)
    assert items[0].ttl > 0
def test_current_table(current_s3_table):  # pylint: disable=W0613
    """Tests for the Current PynamoDB model."""
    from historical.s3.models import CurrentS3Model

    CurrentS3Model(**S3_BUCKET).save()

    items = list(CurrentS3Model.query('arn:aws:s3:::testbucket1'))
    assert len(items) == 1
    assert isinstance(items[0].ttl, int)
    assert items[0].ttl > 0
def test_collector_on_deleted_bucket(historical_role, buckets, mock_lambda_environment, swag_accounts,
                                     current_s3_table):
    from historical.s3.collector import handler
    from historical.s3.models import CurrentS3Model

    # If an event arrives on a bucket that is deleted, then it should skip
    # and wait until the Deletion event arrives.
    create_event = CloudwatchEventFactory(
        detail=DetailFactory(
            requestParameters={
                "bucketName": "not-a-bucket"
            },
            source="aws.s3",
            eventName="PutBucketPolicy",
        )
    )
    create_event_data = json.dumps(create_event, default=serialize)

    data = KinesisRecordsFactory(
        records=[
            KinesisRecordFactory(
                kinesis=KinesisDataFactory(data=create_event_data))
        ]
    )
    data = json.dumps(data, default=serialize)
    data = json.loads(data)

    handler(data, None)
    assert CurrentS3Model.count() == 0
def test_lite_bucket_schema_for_events(historical_table, bucket_event):
    old_fields = CONFIG.exclude_fields
    CONFIG.exclude_fields = "Name,_version,Grants,LifecycleRules,Logging,Policy,Tags,Versioning,Website,Cors," \
                            "Notifications,Acceleration,Replication,CreationDate,AnalyticsConfigurations," \
                            "MetricsConfigurations,InventoryConfigurations".split(",")

    all_buckets = CurrentS3Model.scan()
    generated_report = S3ReportSchema(strict=True).dump({"all_buckets": all_buckets}).data

    generated_report["all_buckets"] = []

    process_dynamodb_record(bucket_event["Records"][0], generated_report)

    lite_report = S3ReportSchema(strict=True).dump(generated_report).data

    assert lite_report["generated_date"]
    assert lite_report["s3_report_version"] == CONFIG.s3_reports_version
    assert not lite_report.get("all_buckets")

    assert lite_report["buckets"]["testbucketNEWBUCKET"]
    assert len(lite_report["buckets"]) == 11

    for bucket in lite_report["buckets"].values():
        keys = bucket.keys()
        for excluded in CONFIG.exclude_fields:
            assert excluded not in keys

        assert bucket["AccountId"] == "123456789012"
        assert bucket["Region"] == "us-east-1"

    # Clean-up:
    CONFIG.exclude_fields = old_fields
def test_light_bucket_schema(historical_table):
    old_fields = CONFIG.exclude_fields
    CONFIG.exclude_fields = "Name,_version,Grants,LifecycleRules,Logging,Policy,Tags,Versioning,Website,Cors," \
                            "Notifications,Acceleration,Replication,CreationDate,AnalyticsConfigurations," \
                            "MetricsConfigurations,InventoryConfigurations".split(",")

    all_buckets = CurrentS3Model.scan()
    generated_file = S3ReportSchema(strict=True).dump({"all_buckets": all_buckets}).data

    assert generated_file["generated_date"]
    assert generated_file["s3_report_version"] == CONFIG.s3_reports_version
    assert len(generated_file["buckets"]) == 10
    assert not generated_file.get("all_buckets")

    for bucket in generated_file["buckets"].values():
        keys = bucket.keys()
        for excluded in CONFIG.exclude_fields:
            assert excluded not in keys

        assert bucket["AccountId"] == "123456789012"
        assert bucket["Region"] == "us-east-1"

    # Clean-up:
    CONFIG.exclude_fields = old_fields
def test_bucket_schema(historical_table):
    all_buckets = CurrentS3Model.scan()
    generated_file = S3ReportSchema(strict=True).dump({"all_buckets": all_buckets}).data

    assert generated_file["generated_date"]
    assert generated_file["s3_report_version"] == CONFIG.s3_reports_version
    assert not generated_file.get("all_buckets")

    for name, value in generated_file["buckets"].items():
        assert value["AccountId"] == "123456789012"
        assert value["Region"] == "us-east-1"
        assert value["Tags"]["theBucketName"] == name

        assert not value.get("_version")
        assert not value.get("Name")
def test_serialization():
    """Tests that the dictionary serialization for PynamoDB objects works properly."""
    from historical.s3.models import CurrentS3Model

    bucket = S3_BUCKET.copy()
    bucket['eventTime'] = datetime(
        year=2017, month=5, day=12, hour=10, minute=30, second=0).isoformat() + 'Z'

    bucket = CurrentS3Model(**bucket)
    dictionary = dict(bucket)

    assert dictionary['version'] == VERSION
    assert dictionary['configuration']['LifecycleRules'][0]['Prefix'] is None
def process_durable_event(record, s3_report):
    """Processes a group of Historical Durable Table events."""
    if record.get(EVENT_TOO_BIG_FLAG):
        result = list(CurrentS3Model.query(record['arn']))

        # Is the record too big and also not found in the Current Table? Then delete it:
        if not result:
            record['item'] = {'configuration': {},
                              'BucketName': record['arn'].split('arn:aws:s3:::')[1]}
        else:
            record['item'] = dict(result[0])

    if not record['item']['configuration']:
        log.debug(f"[ ] Processing deletion for: {record['item']['BucketName']}")
        s3_report["buckets"].pop(record['item']['BucketName'], None)
    else:
        log.debug(f"[ ] Processing: {record['item']['BucketName']}")
        s3_report["all_buckets"].append(record['item'])
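Hypothetical record shapes (not from the source) that process_durable_event() handles, inferred from the fields it reads above; all values are illustrative:

# Illustrative only -- every value below is a guess based on the fields read above.
too_big_record = {
    EVENT_TOO_BIG_FLAG: True,              # stripped body: item is re-fetched from the Current table
    'arn': 'arn:aws:s3:::testbucket1',
}
deletion_record = {
    'arn': 'arn:aws:s3:::testbucket1',
    'item': {'BucketName': 'testbucket1', 'configuration': {}},   # empty config => removed from the report
}
update_record = {
    'arn': 'arn:aws:s3:::testbucket1',
    'item': {'BucketName': 'testbucket1',
             'configuration': {'CreationDate': '2017-05-12T10:30:00Z'}},  # non-empty => appended to all_buckets
}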
def create_delete_model(record):
    """Create an S3 model from a record."""
    arn = "arn:aws:s3:::{}".format(cloudwatch.filter_request_parameters('bucketName', record))
    log.debug('[-] Deleting Dynamodb Records. Hash Key: {arn}'.format(arn=arn))

    data = {
        'arn': arn,
        'principalId': cloudwatch.get_principal(record),
        'userIdentity': cloudwatch.get_user_identity(record),
        'accountId': record['account'],
        'eventTime': record['detail']['eventTime'],
        'BucketName': cloudwatch.filter_request_parameters('bucketName', record),
        'Region': cloudwatch.get_region(record),
        'Tags': {},
        'configuration': {},
        'eventSource': record["detail"]["eventSource"]
    }

    return CurrentS3Model(**data)
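A hypothetical example (not from the source) of the kind of CloudWatch Event record create_delete_model() consumes; the cloudwatch.* helpers also read principal, userIdentity, and region fields whose exact locations are assumed here:

# Illustrative record only -- field names beyond those read directly above are assumptions.
example_record = {
    "account": "123456789012",
    "detail": {
        "eventSource": "aws.s3",
        "eventName": "DeleteBucket",
        "eventTime": "2017-05-12T10:30:00Z",
        "awsRegion": "us-east-1",                        # assumed location of the region field
        "userIdentity": {"principalId": "AIDAEXAMPLE"},  # assumed shape
        "requestParameters": {"bucketName": "testbucket1"},
    },
}
# create_delete_model(example_record) would return an unsaved CurrentS3Model with
# empty Tags and configuration, marking the bucket as deleted.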
def dump_report(commit=True):
    # Get all the data from DynamoDB:
    log.debug("Starting... Beginning scan.")
    all_buckets = CurrentS3Model.scan()

    generated_file = S3ReportSchema(strict=True).dump({"all_buckets": all_buckets}).data

    # Dump to S3:
    if commit:
        log.debug("Saving to S3.")

        # Replace <empty> with "" <-- Due to Pynamo/Dynamo issues...
        dump_to_s3(json.dumps(generated_file, indent=4).replace("\"<empty>\"", "\"\"").encode("utf-8"))
    else:
        log.debug("Commit flag not set, not saving.")

    log.debug("Completed S3 report generation.")
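dump_to_s3 is defined elsewhere in the project; a minimal sketch of what such a helper could look like, assuming boto3 and hypothetical DUMP_BUCKET/DUMP_KEY settings (these names are not from the source and the real implementation may differ):

import boto3

DUMP_BUCKET = "my-report-bucket"          # hypothetical setting
DUMP_KEY = "historical-s3-report.json"    # hypothetical setting


def dump_to_s3(report_bytes):
    """Sketch only: upload the serialized report bytes to S3."""
    client = boto3.client("s3")
    client.put_object(Bucket=DUMP_BUCKET, Key=DUMP_KEY, Body=report_bytes)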
def current_s3_table(dynamodb):
    yield CurrentS3Model.create_table(read_capacity_units=1, write_capacity_units=1, wait=True)
def generated_file(historical_table):
    all_buckets = CurrentS3Model.scan()
    return S3ReportSchema(strict=True).dumps({"all_buckets": all_buckets}).data.encode("utf-8")
def historical_table(current_s3_table):
    for x in range(0, 10):
        bucket = json.loads(S3_BUCKET.replace("{number}", "{}".format(x)))
        CurrentS3Model(**bucket).save()
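In this fixture, S3_BUCKET is evidently a JSON string template containing a "{number}" placeholder (elsewhere it is used as a dict). A hypothetical sketch of what that template might contain, pieced together from the fields the report tests assert on; the real constant lives in the test fixtures and almost certainly differs:

# Hypothetical template -- field names and values are inferred from the assertions above.
S3_BUCKET = """{
    "arn": "arn:aws:s3:::testbucket{number}",
    "BucketName": "testbucket{number}",
    "AccountId": "123456789012",
    "Region": "us-east-1",
    "Tags": {"theBucketName": "testbucket{number}"},
    "eventTime": "2017-05-12T10:30:00Z",
    "configuration": {}
}"""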
def process_update_records(update_records): """Process the requests for S3 bucket update requests""" events = sorted(update_records, key=lambda x: x['account']) # Group records by account for more efficient processing for account_id, events in groupby(events, lambda x: x['account']): events = list(events) # Grab the bucket names (de-dupe events): buckets = {} for event in events: # If the creation date is present, then use it: bucket_event = buckets.get( event['detail']['requestParameters']['bucketName'], { 'creationDate': event['detail']['requestParameters'].get('creationDate') }) bucket_event.update(event['detail']['requestParameters']) buckets[event['detail']['requestParameters'] ['bucketName']] = bucket_event buckets[event['detail']['requestParameters'] ['bucketName']]['eventDetails'] = event # Query AWS for current configuration for b_name, item in buckets.items(): LOG.debug(f'[~] Processing Create/Update for: {b_name}') # If the bucket does not exist, then simply drop the request -- # If this happens, there is likely a Delete event that has occurred and will be processed soon. try: bucket_details = get_bucket( b_name, account_number=account_id, include_created=(item.get('creationDate') is None), assume_role=HISTORICAL_ROLE, region=CURRENT_REGION) if bucket_details.get('Error'): LOG.error( f"[X] Unable to fetch details about bucket: {b_name}. " f"The error details are: {bucket_details['Error']}") continue except ClientError as cerr: if cerr.response['Error']['Code'] == 'NoSuchBucket': LOG.warning( f'[?] Received update request for bucket: {b_name} that does not ' 'currently exist. Skipping.') continue # Catch Access Denied exceptions as well: if cerr.response['Error']['Code'] == 'AccessDenied': LOG.error( f'[X] Unable to fetch details for S3 Bucket: {b_name} in {account_id}. Access is Denied. ' 'Skipping...') continue raise Exception(cerr) # Pull out the fields we want: data = { 'arn': f'arn:aws:s3:::{b_name}', 'principalId': cloudwatch.get_principal(item['eventDetails']), 'userIdentity': cloudwatch.get_user_identity(item['eventDetails']), 'userAgent': item['eventDetails']['detail'].get('userAgent'), 'sourceIpAddress': item['eventDetails']['detail'].get('sourceIPAddress'), 'requestParameters': item['eventDetails']['detail'].get('requestParameters'), 'accountId': account_id, 'eventTime': item['eventDetails']['detail']['eventTime'], 'BucketName': b_name, 'Region': bucket_details.pop('Region'), # Duplicated in top level and configuration for secondary index 'Tags': bucket_details.pop('Tags', {}) or {}, 'eventSource': item['eventDetails']['detail']['eventSource'], 'eventName': item['eventDetails']['detail']['eventName'], 'version': VERSION } # Remove the fields we don't care about: del bucket_details['Arn'] del bucket_details['GrantReferences'] del bucket_details['_version'] del bucket_details['Name'] if not bucket_details.get('CreationDate'): bucket_details['CreationDate'] = item['creationDate'] data['configuration'] = bucket_details current_revision = CurrentS3Model(**data) current_revision.save()
def test_historical_table_fixture(historical_table):
    assert CurrentS3Model.count() == 10
def test_snsproxy_dynamodb_differ(historical_role, current_s3_table, durable_s3_table, mock_lambda_environment,
                                  buckets):
    """
    This mostly checks that the differ is able to properly load the reduced dataset from the SNSProxy.
    """
    # Create the item in the current table:
    from historical.s3.collector import handler as current_handler
    from historical.s3.differ import handler as diff_handler
    from historical.s3.models import CurrentS3Model, DurableS3Model
    from historical.common.sns import shrink_sns_blob

    # Mock out the loggers:
    import historical.common.dynamodb
    old_logger = historical.common.dynamodb.log
    mocked_logger = MagicMock()
    historical.common.dynamodb.log = mocked_logger

    now = datetime.utcnow().replace(tzinfo=None, microsecond=0)
    create_event = CloudwatchEventFactory(
        detail=DetailFactory(requestParameters={"bucketName": "testbucket1"},
                             eventSource="aws.s3",
                             eventName="CreateBucket",
                             eventTime=now))
    data = json.dumps(create_event, default=serialize)
    data = RecordsFactory(records=[SQSDataFactory(body=data)])
    data = json.dumps(data, default=serialize)
    data = json.loads(data)

    current_handler(data, mock_lambda_environment)
    result = list(CurrentS3Model.query("arn:aws:s3:::testbucket1"))
    assert len(result) == 1

    # Mock out the DDB Stream for this creation and for an item that is NOT in the current table:
    ttl = int(time.time() + TTL_EXPIRY)
    new_bucket = S3_BUCKET.copy()
    new_bucket['eventTime'] = datetime(
        year=2017, month=5, day=12, hour=10, minute=30, second=0).isoformat() + 'Z'
    new_bucket['ttl'] = ttl
    ddb_existing_item = DynamoDBRecordFactory(
        dynamodb=DynamoDBDataFactory(
            NewImage=new_bucket,
            Keys={'arn': new_bucket['arn']},
            OldImage=new_bucket),
        eventName='INSERT')

    missing_bucket = S3_BUCKET.copy()
    missing_bucket['eventTime'] = datetime(
        year=2017, month=5, day=12, hour=10, minute=30, second=0).isoformat() + 'Z'
    missing_bucket['ttl'] = ttl
    missing_bucket['BucketName'] = 'notinthecurrenttable'
    missing_bucket['arn'] = 'arn:aws:s3:::notinthecurrenttable'
    missing_bucket['configuration']['Name'] = 'notinthecurrenttable'
    ddb_missing_item = DynamoDBRecordFactory(
        dynamodb=DynamoDBDataFactory(
            NewImage=missing_bucket,
            Keys={'arn': 'arn:aws:s3:::notinthecurrenttable'},
            OldImage=new_bucket),
        eventName='INSERT')

    # Get the shrunken blob:
    shrunken_existing = json.dumps(
        shrink_sns_blob(json.loads(json.dumps(ddb_existing_item, default=serialize))))
    shrunken_missing = json.dumps(
        shrink_sns_blob(json.loads(json.dumps(ddb_missing_item, default=serialize))))

    records = RecordsFactory(records=[
        SQSDataFactory(body=json.dumps(SnsDataFactory(Message=shrunken_existing), default=serialize)),
        SQSDataFactory(body=json.dumps(SnsDataFactory(Message=shrunken_missing), default=serialize))
    ])
    records_event = json.loads(json.dumps(records, default=serialize))

    # Run the differ:
    diff_handler(records_event, mock_lambda_environment)

    # Verify that the existing bucket in the Current table is in the Durable table with the correct configuration:
    result = list(DurableS3Model.query("arn:aws:s3:::testbucket1"))
    assert len(result) == 1
    assert result[0].configuration.attribute_values['Name'] == 'testbucket1'

    # Verify that the missing bucket is ignored -- as it will be processed presumably later:
    result = list(DurableS3Model.query("arn:aws:s3:::notinthecurrenttable"))
    assert not result

    # Verify that the proper log statements were reached:
    assert mocked_logger.debug.called
    assert mocked_logger.error.called
    debug_calls = [
        '[-->] Item with ARN: arn:aws:s3:::notinthecurrenttable was too big for SNS '
        '-- fetching it from the Current table...',
        '[+] Saving new revision to durable table.',
        '[-->] Item with ARN: arn:aws:s3:::testbucket1 was too big for SNS -- fetching it from the Current table...'
    ]
    for dc in debug_calls:
        mocked_logger.debug.assert_any_call(dc)

    mocked_logger.error.assert_called_once_with(
        '[?] Received item too big for SNS, and was not able to '
        'find the original item with ARN: arn:aws:s3:::notinthecurrenttable')

    # Unmock the logger:
    historical.common.dynamodb.log = old_logger
def current_s3_table(dynamodb):
    from historical.s3.models import CurrentS3Model

    yield CurrentS3Model.create_table(read_capacity_units=1, write_capacity_units=1, wait=True)
def process_update_records(update_records): """Process the requests for S3 bucket update requests""" events = sorted(update_records, key=lambda x: x['account']) # Group records by account for more efficient processing for account_id, events in groupby(events, lambda x: x['account']): events = list(events) # Grab the bucket names (de-dupe events): buckets = {} for e in events: # If the creation date is present, then use it: bucket_event = buckets.get(e["detail"]["requestParameters"]["bucketName"], { "creationDate": e["detail"]["requestParameters"].get("creationDate") }) bucket_event.update(e["detail"]["requestParameters"]) buckets[e["detail"]["requestParameters"]["bucketName"]] = bucket_event buckets[e["detail"]["requestParameters"]["bucketName"]]["eventDetails"] = e # Query AWS for current configuration for b, item in buckets.items(): log.debug("[~] Processing Create/Update for: {}".format(b)) # If the bucket does not exist, then simply drop the request -- # If this happens, there is likely a Delete event that has occurred and will be processed soon. try: bucket_details = get_bucket(b, account_number=account_id, include_created=(item.get("creationDate") is None), assume_role=HISTORICAL_ROLE, region=CURRENT_REGION) if bucket_details.get("Error"): log.error("[X] Unable to fetch details about bucket: {}. " "The error details are: {}".format(b, bucket_details["Error"])) continue except ClientError as ce: if ce.response["Error"]["Code"] == "NoSuchBucket": log.warning("[?] Received update request for bucket: {} that does not " "currently exist. Skipping.".format(b)) continue # Catch Access Denied exceptions as well: if ce.response["Error"]["Code"] == "AccessDenied": log.error("[X] Unable to fetch details for S3 Bucket: {} in {}. Access is Denied. Skipping...".format( b, account_id )) continue raise Exception(ce) # Pull out the fields we want: data = { "arn": "arn:aws:s3:::{}".format(b), "principalId": cloudwatch.get_principal(item["eventDetails"]), "userIdentity": cloudwatch.get_user_identity(item["eventDetails"]), "accountId": account_id, "eventTime": item["eventDetails"]["detail"]["eventTime"], "BucketName": b, "Region": bucket_details["Region"], # Duplicated in top level and configuration for secondary index "Tags": bucket_details["Tags"] or {}, "eventSource": item["eventDetails"]["detail"]["eventSource"] } # Remove the fields we don't care about: del bucket_details["Arn"] del bucket_details["GrantReferences"] del bucket_details["Region"] if not bucket_details.get("CreationDate"): bucket_details["CreationDate"] = item["creationDate"] data["configuration"] = bucket_details current_revision = CurrentS3Model(**data) current_revision.save()