Example #1
def handler(event, context):
    raw_records = event["Records"]
    logger.debug(raw_records)

    log_dict = dict()
    failed_dict = dict()

    xray_recorder.begin_subsegment("parse")
    for payload in kinesis.parse_records(raw_records):
        try:
            payload_parsed = json.loads(payload)
        except json.JSONDecodeError:
            logger.debug(f"Ignoring non-JSON data: {payload}")
            continue

        baikonur_logging.parse_payload_to_log_dict(
            payload_parsed,
            log_dict,
            failed_dict,
            LOG_TYPE_FIELD,
            LOG_TIMESTAMP_FIELD,
            LOG_ID_FIELD,
            LOG_TYPE_UNKNOWN_PREFIX,
            LOG_TYPE_FIELD_WHITELIST,
            timestamp_required=True,
        )
    xray_recorder.end_subsegment()

    baikonur_logging.save_json_logs_to_s3(s3_client, log_dict,
                                          "Valid log data")

    baikonur_logging.save_json_logs_to_s3(
        s3_client, failed_dict, "One or more necessary fields are unavailable")
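
The handlers in these examples are excerpts and rely on module-level setup that is not shown. A minimal sketch of what that setup might look like (the import path, client names, and environment variables are assumptions, not taken from the source):

import json
import logging
import os

import boto3
from aws_xray_sdk.core import xray_recorder

# Hypothetical import path: the excerpts only show the module names
# kinesis and baikonur_logging, not the package they come from.
from lambdautils import baikonur_logging, kinesis

logger = logging.getLogger()
logger.setLevel(logging.INFO)

s3_client = boto3.client("s3")

# Configuration is assumed to arrive via environment variables.
LOG_ID_FIELD = os.environ["LOG_ID_FIELD"]
LOG_TYPE_FIELD = os.environ["LOG_TYPE_FIELD"]
LOG_TIMESTAMP_FIELD = os.environ["LOG_TIMESTAMP_FIELD"]
LOG_TYPE_UNKNOWN_PREFIX = os.environ["LOG_TYPE_UNKNOWN_PREFIX"]
LOG_TYPE_FIELD_WHITELIST = set(os.environ["LOG_TYPE_FIELD_WHITELIST"].split(","))
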
Example #2
    def test_parse_records_json_root_non_object(self):
        # non-object types on root should be ignored
        data = ["true", "1", "null"]

        event = {"Records": generate_sample_kinesis_records(data)}

        records = [x for x in kinesis.parse_records(event["Records"])]

        self.assertEqual(len(records), 0)
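
The tests reference a generate_sample_kinesis_records helper that is not included in the excerpts. A plausible sketch, assuming it wraps each payload in the minimal Kinesis event record shape that kinesis.parse_records consumes (with encode=False the payload is taken to be base64 already, e.g. the gzipped CloudWatch Logs blobs used further down):

import base64


def generate_sample_kinesis_records(data, encode=True):
    # Hypothetical helper, not the library's actual test utility.
    records = []
    for payload in data:
        blob = (base64.b64encode(payload.encode("utf-8")).decode("utf-8")
                if encode else payload)
        records.append({"kinesis": {"data": blob}, "eventSource": "aws:kinesis"})
    return records
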
Example #3
    def test_parse_records_json_empty(self):
        data = ["{}"]

        event = {"Records": generate_sample_kinesis_records(data)}

        records = [x for x in kinesis.parse_records(event["Records"])]

        self.assertEqual(len(records), len(data))
        for i, r in enumerate(records):
            self.assertEqual(r, data[i])
Example #4
    def test_parse_records_plaintext_multiple(self):
        data = [f"test-data-{x}" for x in range(10)]

        event = {"Records": generate_sample_kinesis_records(data)}

        records = [x for x in kinesis.parse_records(event["Records"])]

        self.assertEqual(len(records), len(data))
        for i, r in enumerate(records):
            self.assertEqual(r, data[i])
Example #5
    def test_parse_records_json_multiple(self):
        json_data = [{"a": 1}, {"b": 2}, {"c": 3}]

        data = [json.dumps(x) for x in json_data]

        event = {"Records": generate_sample_kinesis_records(data)}

        records = [x for x in kinesis.parse_records(event["Records"])]

        self.assertEqual(len(records), len(data))
        for i, r in enumerate(records):
            self.assertEqual(r, data[i])
Example #6
    def test_parse_records_cwl_health_check(self):
        # raw sample data from CloudWatch Logs Subscription Filters
        # containing only a health check message:
        # "CWL CONTROL MESSAGE: Checking health of destination Kinesis stream."

        data = [
            "H4sIAAAAAAAAADWOwQqCQBRFf2WYdURGFrkLsRZZQgYtQmLSlz7SGZk3JhH+e6PW8nAv954Pr4BI5HB+18A97kfH8ykKb4cgjje7gE+4ai"
            "XoPilVk7XCpEWocrJBqfKdVk1ts5Fio0FUI1Jzp1RjbVDJLZYGNHHvmgy94AXS9PjhmI11g1bDiMqOOe567i4XznK2ctzJX68XuITsp8d+"
            "eh7zC0ifKHNWgChNwdSDZXYJpeif2R4lEBKjQW3Ku6T7ApsNvwTyAAAA"
        ]

        event = {"Records": generate_sample_kinesis_records(data, encode=False)}

        records = [x for x in kinesis.parse_records(event["Records"])]

        # control messages should be ignored
        self.assertEqual(len(records), 0)
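
The base64 blobs in the CloudWatch Logs tests follow the standard subscription-filter envelope: gzipped JSON with a messageType of either CONTROL_MESSAGE (health checks) or DATA_MESSAGE, plus a logEvents list that carries the actual messages. A small helper to inspect one of them (illustrative only, not part of the library):

import base64
import gzip
import json


def inspect_cwl_blob(blob):
    # Decode one base64-encoded, gzipped CloudWatch Logs record,
    # e.g. data[0] from the test above.
    envelope = json.loads(gzip.decompress(base64.b64decode(blob)))
    print(envelope["messageType"])                                # e.g. CONTROL_MESSAGE
    print([e["message"] for e in envelope.get("logEvents", [])])  # payloads, if any
    return envelope
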
Example #7
def handler(event, context):
    raw_records = event["Records"]
    logger.debug(raw_records)

    log_dict = dict()
    failed_dict = dict()

    xray_recorder.begin_subsegment("parse")
    for payload in kinesis.parse_records(raw_records):
        try:
            payload_parsed = json.loads(payload)
        except json.JSONDecodeError:
            logger.debug(f"Ignoring non-JSON data: {payload}")
            continue

        baikonur_logging.parse_payload_to_log_dict(
            payload_parsed,
            log_dict,
            failed_dict=failed_dict,
            log_id_key=LOG_ID_FIELD,
            log_timestamp_key=LOG_TIMESTAMP_FIELD,
            log_type_key=LOG_TYPE_FIELD,
            log_type_unknown_prefix=LOG_TYPE_UNKNOWN_PREFIX,
            log_type_whitelist=LOG_TYPE_FIELD_WHITELIST,
            timestamp_required=False,
        )
    xray_recorder.end_subsegment()

    xray_recorder.begin_subsegment("kinesis PutRecords")
    for key in log_dict:
        logger.info(
            f"Processing log type {key}: {len(log_dict[key]['records'])} records"
        )
        records_json = [json.dumps(x) for x in log_dict[key]["records"]]
        kinesis.put_records_batch(kinesis_client, TARGET_STREAM_NAME,
                                  records_json, KINESIS_MAX_RETRIES)
    xray_recorder.end_subsegment()

    xray_recorder.begin_subsegment("s3 upload")
    baikonur_logging.save_json_logs_to_s3(s3_client,
                                          failed_dict,
                                          reason="Failed logs")
    xray_recorder.end_subsegment()

    logger.info("Finished")
Example #8
    def test_parse_records_cwl_payload(self):
        # raw sample data from CloudWatch Logs Subscription Filters
        # containing a single DATA_MESSAGE with a plain text payload:
        # "hello"

        data = [
            "H4sIANWN8F4C/02PzQrCMBCE3yVnD83m31vB6slTexORqosG2qYkUZHiu7taBPf47czszsR6TKm9YPMckS3ZqmzKw7aq63JTsQULjwEj4e"
            "JvCHfhsonhNtImY8ozqXPEtp/FRNLtmE7Rj9mHYe27jDGx5W7W77+G6o5D/sCJ+TP5hFJcCQsSLBgHvACtFRRSG2OhMMJyp6RyVmrNrQEt"
            "nDYg4HMse6qR254+4sqB0pIiJBeLXz2Kv2LXBfbav941C5T39AAAAA=="
        ]

        event = {"Records": generate_sample_kinesis_records(data, encode=False)}

        records = [x for x in kinesis.parse_records(event["Records"])]

        self.assertEqual(len(records), 1)

        self.assertEqual(records[0], "hello")
Example #9
    def test_parse_records_cwl_health_check_payload(self):
        # raw sample data from CloudWatch Logs Subscription Filters
        # containing a single health check and two DATA_MESSAGE payloads
        data = [
            "H4sIAPSW8F4C/6WPzWrDQAyE30VnH7za/9wMdXLKybmVENxWpAu21+xuUkrwu1eJKRRKT9Xxk2ZGc4ORcu7PdPicCTbw1Bya077tumbXQg"
            "XxY6LEuP4xjId43qV4mXlTKJeVdCVRP67HTPLlJb+mMJcQp20YCqUMm+f1/vgQtFeayh3eILyxTmottHSo0KH1KGo0RmOtjLUOayud8Fpp"
            "75Qxwlk00huLEu9hJXCN0o/8kdAetVFsoYSsvuux/TsNQxSwVP+LE3/HqV9xCMtx+QJguUFbZAEAAA==",
            "H4sIAPCW8F4C/zWOwQqCQBRFf2WYdURGFrkLsRZZQgYtQmLSlz7SGZk3JhH+e6PW8nAv954Pr4BI5HB+18A97kfH8ykKb4cgjje7gE+4ai"
            "XoPilVk7XCpEWocrJBqfKdVk1ts5Fio0FUI1Jzp1RjbVDJLZYGNHHvmgy94AXS9PjhmI11g1bDiMqOOe567i4XznK2ctzJX68XuITsp8d+"
            "eh7zC0ifKHNWgChNwdSDZXYJpeif2R4lEBKjQW3Ku6T7ApsNvwTyAAAA",
        ]

        event = {"Records": generate_sample_kinesis_records(data, encode=False)}

        records = [x for x in kinesis.parse_records(event["Records"])]

        self.assertEqual(len(records), 2)

        self.assertEqual(records[0], "hello1")
        self.assertEqual(records[1], "hello2")
Example #10
def handler(event, context):
    raw_records = event["Records"]
    logger.debug(raw_records)

    log_dict = dict()
    failed_dict = dict()

    actions = []
    es = Elasticsearch(
        hosts=[{
            "host": ELASTICSEARCH_HOST,
            "port": 443
        }],
        http_auth=aws_auth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
        timeout=ES_TIMEOUT,
        ca_certs=certifi.where(),
    )

    logger.info(f"Connected to Elasticsearch at https://{ELASTICSEARCH_HOST}")

    xray_recorder.begin_subsegment("parse")
    for payload in kinesis.parse_records(raw_records):
        try:
            payload_parsed = json.loads(payload)
        except json.JSONDecodeError:
            logger.debug(f"Ignoring non-JSON data: {payload}")
            continue

        baikonur_logging.parse_payload_to_log_dict(
            payload_parsed,
            log_dict,
            failed_dict,
            LOG_TYPE_FIELD,
            LOG_TIMESTAMP_FIELD,
            LOG_ID_FIELD,
            LOG_TYPE_UNKNOWN_PREFIX,
            LOG_TYPE_FIELD_WHITELIST,
            timestamp_required=True,
        )
    xray_recorder.end_subsegment()

    for log_type, v in log_dict.items():
        records = v["records"]
        for record in records:
            timestamp = record[LOG_TIMESTAMP_FIELD]
            date = timestamp.strftime("%Y%m%d")
            index = f"{INDEX_NAME_PREFIX}-{log_type}-{date}"

            actions.append({
                "_index": index,
                "_type": "_doc",
                "_source": record
            })

    baikonur_logging.save_json_logs_to_s3(
        s3_client, failed_dict,
        "Failed validation: missing necessary fields")

    subsegment = xray_recorder.begin_subsegment("Elasticsearch push")
    subsegment.put_annotation("total_actions", len(actions))

    # good logs save
    failed_data_es = []
    if len(actions) > 0:
        logger.info(
            f"Pushing {len(actions)} actions generated from Kinesis records to Elasticsearch Bulk API"
        )

        for i, chunk in enumerate(misc.split_list(actions, BULK_CHUNK_SIZE)):

            chunk_subsegment = xray_recorder.begin_subsegment(
                "Elasticsearch push chunk")
            chunk_subsegment.put_annotation("chunk_number", i)
            chunk_subsegment.put_annotation("chunk_size", len(chunk))
            logger.info(f"Sending chunk no. {i} of {len(chunk)} actions")

            try:
                # make sure there will be only one internal chunk/batch
                helpers.bulk(es, chunk, chunk_size=len(chunk))

            except BulkIndexError as e:
                logger.info(
                    f"Got {len(e.errors)} failed actions from Elasticsearch Bulk API"
                )
                failed_data_es += e.errors

            xray_recorder.end_subsegment()

    else:
        logger.info("Nothing to flush")
    xray_recorder.end_subsegment()

    baikonur_logging.save_json_logs_to_s3(
        s3_client, failed_dict, "One or more necessary fields are unavailable")

    timestamp = datetime.datetime.now()
    key = (FAILED_LOG_S3_PATH_PREFIX + "/" +
           timestamp.strftime("%Y-%m/%d/%Y-%m-%d-%H:%M:%S") + ".gz")
    data = "\n".join(to_str(f) for f in failed_data_es)
    logger.info(
        f"Saving records rejected by Elasticsearch to S3: s3://{FAILED_LOG_S3_BUCKET}/{key}"
    )
    s3.put_str_data(s3_client,
                    FAILED_LOG_S3_BUCKET,
                    key,
                    data,
                    gzip_compress=True)

    logger.info(f"Finished")