def handler(event, context):
    raw_records = event["Records"]
    logger.debug(raw_records)

    log_dict = dict()
    failed_dict = dict()

    xray_recorder.begin_subsegment("parse")
    for payload in kinesis.parse_records(raw_records):
        try:
            payload_parsed = json.loads(payload)
        except json.JSONDecodeError:
            logger.debug(f"Ignoring non-JSON data: {payload}")
            continue

        baikonur_logging.parse_payload_to_log_dict(
            payload_parsed,
            log_dict,
            failed_dict,
            LOG_TYPE_FIELD,
            LOG_TIMESTAMP_FIELD,
            LOG_ID_FIELD,
            LOG_TYPE_UNKNOWN_PREFIX,
            LOG_TYPE_FIELD_WHITELIST,
            timestamp_required=True,
        )
    xray_recorder.end_subsegment()

    baikonur_logging.save_json_logs_to_s3(s3_client, log_dict, "Valid log data")
    baikonur_logging.save_json_logs_to_s3(
        s3_client, failed_dict, "One or more necessary fields are unavailable")
def test_parse_records_json_root_non_object(self):
    # non-object JSON values at the root should be ignored
    data = ["true", "1", "null"]
    event = {"Records": generate_sample_kinesis_records(data)}

    records = [x for x in kinesis.parse_records(event["Records"])]
    self.assertEqual(len(records), 0)
def test_parse_records_json_empty(self):
    data = ["{}"]
    event = {"Records": generate_sample_kinesis_records(data)}

    records = [x for x in kinesis.parse_records(event["Records"])]
    self.assertEqual(len(records), len(data))

    for i, r in enumerate(records):
        self.assertEqual(r, data[i])
def test_parse_records_plaintext_multiple(self):
    data = [f"test-data-{x}" for x in range(10)]
    event = {"Records": generate_sample_kinesis_records(data)}

    records = [x for x in kinesis.parse_records(event["Records"])]
    self.assertEqual(len(records), len(data))

    for i, r in enumerate(records):
        self.assertEqual(r, data[i])
def test_parse_records_json_multiple(self):
    json_data = [{"a": 1}, {"b": 2}, {"c": 3}]
    data = [json.dumps(x) for x in json_data]
    event = {"Records": generate_sample_kinesis_records(data)}

    records = [x for x in kinesis.parse_records(event["Records"])]
    self.assertEqual(len(records), len(data))

    for i, r in enumerate(records):
        self.assertEqual(r, data[i])
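# A minimal sketch of the test helper used in the cases above. The real
# generate_sample_kinesis_records is assumed to build Lambda-style Kinesis event
# records with the payload base64-encoded under kinesis.data (encode=False would
# pass data that is already base64-encoded, such as the gzipped CloudWatch Logs
# samples used below). Field values other than "data" are illustrative
# placeholders, not taken from the actual helper.
import base64


def generate_sample_kinesis_records(data, encode=True):
    records = []
    for i, payload in enumerate(data):
        if encode:
            payload = base64.b64encode(payload.encode("utf-8")).decode("ascii")
        records.append({
            "kinesis": {
                "kinesisSchemaVersion": "1.0",
                "partitionKey": str(i),
                "sequenceNumber": str(i),
                "data": payload,
                "approximateArrivalTimestamp": 0,
            },
            "eventSource": "aws:kinesis",
            "eventName": "aws:kinesis:record",
        })
    return records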
def test_parse_records_cwl_health_check(self):
    # raw sample data from CloudWatch Logs Subscription Filters
    # containing only a health check message:
    # "CWL CONTROL MESSAGE: Checking health of destination Kinesis stream."
    data = [
        "H4sIAAAAAAAAADWOwQqCQBRFf2WYdURGFrkLsRZZQgYtQmLSlz7SGZk3JhH+e6PW8nAv954Pr4BI5HB+18A97kfH8ykKb4cgjje7gE+4ai"
        "XoPilVk7XCpEWocrJBqfKdVk1ts5Fio0FUI1Jzp1RjbVDJLZYGNHHvmgy94AXS9PjhmI11g1bDiMqOOe567i4XznK2ctzJX68XuITsp8d+"
        "eh7zC0ifKHNWgChNwdSDZXYJpeif2R4lEBKjQW3Ku6T7ApsNvwTyAAAA"
    ]
    event = {"Records": generate_sample_kinesis_records(data, encode=False)}

    records = [x for x in kinesis.parse_records(event["Records"])]

    # control messages should be ignored
    self.assertEqual(len(records), 0)
def handler(event, context):
    raw_records = event["Records"]
    logger.debug(raw_records)

    log_dict = dict()
    failed_dict = dict()

    xray_recorder.begin_subsegment("parse")
    for payload in kinesis.parse_records(raw_records):
        try:
            payload_parsed = json.loads(payload)
        except json.JSONDecodeError:
            logger.debug(f"Ignoring non-JSON data: {payload}")
            continue

        baikonur_logging.parse_payload_to_log_dict(
            payload_parsed,
            log_dict,
            failed_dict=failed_dict,
            log_id_key=LOG_ID_FIELD,
            log_timestamp_key=LOG_TIMESTAMP_FIELD,
            log_type_key=LOG_TYPE_FIELD,
            log_type_unknown_prefix=LOG_TYPE_UNKNOWN_PREFIX,
            log_type_whitelist=LOG_TYPE_FIELD_WHITELIST,
            timestamp_required=False,
        )
    xray_recorder.end_subsegment()

    xray_recorder.begin_subsegment("kinesis PutRecords")
    for key in log_dict:
        logger.info(
            f"Processing log type {key}: {len(log_dict[key]['records'])} records"
        )
        records_json = [json.dumps(x) for x in log_dict[key]["records"]]
        kinesis.put_records_batch(
            kinesis_client, TARGET_STREAM_NAME, records_json, KINESIS_MAX_RETRIES
        )
    xray_recorder.end_subsegment()

    xray_recorder.begin_subsegment("s3 upload")
    baikonur_logging.save_json_logs_to_s3(s3_client, failed_dict, reason="Failed logs")
    xray_recorder.end_subsegment()

    logger.info("Finished")
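# Illustrative sketch of the batching that put_records_batch is used for above;
# this is NOT the library implementation. Kinesis PutRecords accepts at most 500
# records per call, so payloads are sent in chunks, and a real implementation
# (like the retry count passed as KINESIS_MAX_RETRIES above) would resend the
# entries reported back via FailedRecordCount instead of raising immediately.
def put_records_in_chunks(client, stream_name, payloads, chunk_size=500):
    for start in range(0, len(payloads), chunk_size):
        chunk = payloads[start:start + chunk_size]
        entries = [
            {"Data": p.encode("utf-8"), "PartitionKey": str(i)}
            for i, p in enumerate(chunk)
        ]
        response = client.put_records(StreamName=stream_name, Records=entries)
        # records rejected due to throttling show up in FailedRecordCount and
        # should be retried with backoff in production code
        if response.get("FailedRecordCount", 0) > 0:
            raise RuntimeError("some records were not accepted by Kinesis")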
def test_parse_records_cwl_payload(self):
    # raw sample data from CloudWatch Logs Subscription Filters
    # containing a single DATA_MESSAGE with a plain text payload:
    # "hello"
    data = [
        "H4sIANWN8F4C/02PzQrCMBCE3yVnD83m31vB6slTexORqosG2qYkUZHiu7taBPf47czszsR6TKm9YPMckS3ZqmzKw7aq63JTsQULjwEj4e"
        "JvCHfhsonhNtImY8ozqXPEtp/FRNLtmE7Rj9mHYe27jDGx5W7W77+G6o5D/sCJ+TP5hFJcCQsSLBgHvACtFRRSG2OhMMJyp6RyVmrNrQEt"
        "nDYg4HMse6qR254+4sqB0pIiJBeLXz2Kv2LXBfbav941C5T39AAAAA=="
    ]
    event = {"Records": generate_sample_kinesis_records(data, encode=False)}

    records = [x for x in kinesis.parse_records(event["Records"])]
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0], "hello")
def test_parse_records_cwl_health_check_payload(self):
    # raw sample data from CloudWatch Logs Subscription Filters
    # containing a single health check and two DATA_MESSAGE payloads
    data = [
        "H4sIAPSW8F4C/6WPzWrDQAyE30VnH7za/9wMdXLKybmVENxWpAu21+xuUkrwu1eJKRRKT9Xxk2ZGc4ORcu7PdPicCTbw1Bya077tumbXQg"
        "XxY6LEuP4xjId43qV4mXlTKJeVdCVRP67HTPLlJb+mMJcQp20YCqUMm+f1/vgQtFeayh3eILyxTmottHSo0KH1KGo0RmOtjLUOayud8Fpp"
        "75Qxwlk00huLEu9hJXCN0o/8kdAetVFsoYSsvuux/TsNQxSwVP+LE3/HqV9xCMtx+QJguUFbZAEAAA==",
        "H4sIAPCW8F4C/zWOwQqCQBRFf2WYdURGFrkLsRZZQgYtQmLSlz7SGZk3JhH+e6PW8nAv954Pr4BI5HB+18A97kfH8ykKb4cgjje7gE+4ai"
        "XoPilVk7XCpEWocrJBqfKdVk1ts5Fio0FUI1Jzp1RjbVDJLZYGNHHvmgy94AXS9PjhmI11g1bDiMqOOe567i4XznK2ctzJX68XuITsp8d+"
        "eh7zC0ifKHNWgChNwdSDZXYJpeif2R4lEBKjQW3Ku6T7ApsNvwTyAAAA",
    ]
    event = {"Records": generate_sample_kinesis_records(data, encode=False)}

    records = [x for x in kinesis.parse_records(event["Records"])]
    self.assertEqual(len(records), 2)
    self.assertEqual(records[0], "hello1")
    self.assertEqual(records[1], "hello2")
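# A standalone sketch (not the library's parse_records implementation) of how
# the CloudWatch Logs payloads in the three tests above decode: each record's
# base64 data gunzips to a JSON document with "messageType" and "logEvents".
# CONTROL_MESSAGE entries are the health checks that parse_records is expected
# to drop; DATA_MESSAGE entries carry the log lines in logEvents[*].message.
import base64
import gzip
import json


def decode_cwl_payload(b64_data):
    doc = json.loads(gzip.decompress(base64.b64decode(b64_data)))
    if doc.get("messageType") == "CONTROL_MESSAGE":
        return []  # health check, nothing to emit
    return [event["message"] for event in doc.get("logEvents", [])]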
def handler(event, context):
    raw_records = event["Records"]
    logger.debug(raw_records)

    log_dict = dict()
    failed_dict = dict()
    actions = []

    es = Elasticsearch(
        hosts=[{
            "host": ELASTICSEARCH_HOST,
            "port": 443
        }],
        http_auth=aws_auth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
        timeout=ES_TIMEOUT,
        ca_certs=certifi.where(),
    )

    logger.info(f"Connected to Elasticsearch at https://{ELASTICSEARCH_HOST}")

    xray_recorder.begin_subsegment("parse")
    for payload in kinesis.parse_records(raw_records):
        try:
            payload_parsed = json.loads(payload)
        except json.JSONDecodeError:
            logger.debug(f"Ignoring non-JSON data: {payload}")
            continue

        baikonur_logging.parse_payload_to_log_dict(
            payload_parsed,
            log_dict,
            failed_dict,
            LOG_TYPE_FIELD,
            LOG_TIMESTAMP_FIELD,
            LOG_ID_FIELD,
            LOG_TYPE_UNKNOWN_PREFIX,
            LOG_TYPE_FIELD_WHITELIST,
            timestamp_required=True,
        )
    xray_recorder.end_subsegment()

    for log_type, v in log_dict.items():
        records = v["records"]
        for record in records:
            timestamp = record[LOG_TIMESTAMP_FIELD]
            date = datetime.datetime.strftime(timestamp, "%Y%m%d")
            index = f"{INDEX_NAME_PREFIX}-{log_type}-{date}"
            actions.append({
                "_index": index,
                "_type": "_doc",
                "_source": record
            })

    subsegment = xray_recorder.begin_subsegment("Elasticsearch push")
    subsegment.put_annotation("total_actions", len(actions))

    # actions rejected by the Bulk API are collected here and saved to S3 below
    failed_data_es = []
    if len(actions) > 0:
        logger.info(
            f"Pushing {len(actions)} actions generated from Kinesis records to Elasticsearch Bulk API"
        )
        for i, chunk in enumerate(misc.split_list(actions, BULK_CHUNK_SIZE)):
            chunk_subsegment = xray_recorder.begin_subsegment(
                "Elasticsearch push chunk")
            chunk_subsegment.put_annotation("chunk_number", i)
            chunk_subsegment.put_annotation("chunk_size", len(chunk))
            logger.info(f"Sending chunk no. {i} with {len(chunk)} actions")

            try:
                # make sure there will be only one internal chunk/batch
                helpers.bulk(es, chunk, chunk_size=len(chunk))
            except BulkIndexError as e:
                logger.info(
                    f"Got {len(e.errors)} failed actions from Elasticsearch Bulk API"
                )
                failed_data_es += e.errors
            xray_recorder.end_subsegment()
    else:
        logger.info("Nothing to flush")
    xray_recorder.end_subsegment()

    baikonur_logging.save_json_logs_to_s3(
        s3_client, failed_dict, "One or more necessary fields are unavailable")

    timestamp = datetime.datetime.now()
    key = (FAILED_LOG_S3_PATH_PREFIX + "/" +
           timestamp.strftime("%Y-%m/%d/%Y-%m-%d-%H:%M:%S") + ".gz")

    data = "\n".join(to_str(f) for f in failed_data_es)

    logger.info(
        f"Saving records rejected by Elasticsearch to S3: s3://{FAILED_LOG_S3_BUCKET}/{key}"
    )
    s3.put_str_data(s3_client, FAILED_LOG_S3_BUCKET, key, data, gzip_compress=True)

    logger.info("Finished")
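# Hypothetical sketch of the list-chunking helper used in the Bulk API loop
# above (misc.split_list); the real helper may differ, but BULK_CHUNK_SIZE-sized
# slices like these are what the Elasticsearch push loop iterates over.
def split_list(items, chunk_size):
    for start in range(0, len(items), chunk_size):
        yield items[start:start + chunk_size]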