Example #1
    def test__types_and_fields_validation_raise_already_aggregated_field_exception(
            self):
        test_input_rule = json.loads(
            """["key: src_ip","max(packet_size)","min(packet_size)"]""")
        test_input_operation = "reduceByKey"
        config = TestConfig({
            "processing": {
                "aggregations": {
                    "operation_type": test_input_operation,
                    "rule": test_input_rule
                }
            }
        })
        src_ip = StructField('src_ip', StringType())
        packet_size = StructField('packet_size', LongType())
        test_aggregation_config = AggregationsParser(
            config, StructType([src_ip, packet_size]))
        test_aggregation_config._expression = test_aggregation_config._parse_expression(
        )

        with self.assertRaisesRegex(
                NotValidAggregationExpression,
                "^Aggregate already aggregated field packet_size$"):
            _ = test_aggregation_config._types_and_field_names_validation()
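For orientation, every example on this page builds the same nested configuration; only the operation type and the rule list change between tests. A minimal sketch of that structure (the field names below are illustrative, not taken from any particular test):

# Hypothetical aggregation config skeleton mirroring the tests on this page.
# "key: <field>" marks the grouping key used with reduceByKey; the remaining
# entries apply an aggregation function to a field. The aliased form
# "func(field):alias" appears in Example #6's _field_validation test.
sample_config = {
    "processing": {
        "aggregations": {
            "operation_type": "reduceByKey",  # or "reduce" for keyless rules
            "rule": [
                "key: src_ip",
                "max(packet_size)",
                "sum(traffic):total_traffic"
            ]
        }
    }
}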
Example #2
    def test__types_and_fields_validation_raise_wrong_field_type_exception(
            self):
        # test wrong type of field
        test_input_rule = json.loads(
            """["key: input_port","min(dst_mac)","sum(ip_size)"]""")
        test_input_operation = "reduceByKey"
        config = TestConfig({
            "processing": {
                "aggregations": {
                    "operation_type": test_input_operation,
                    "rule": test_input_rule
                }
            }
        })
        input_port = StructField('input_port', IntegerType())
        ip_size = StructField('ip_size', IntegerType())
        dst_mac = StructField('dst_mac', StringType())
        test_aggregation_config = AggregationsParser(
            config, StructType([input_port, dst_mac, ip_size]))
        test_aggregation_config._expression = test_aggregation_config._parse_expression(
        )

        with self.assertRaisesRegex(
                NotValidAggregationExpression,
                "^Incorrect type of field dst_mac for function min$"):
            _ = test_aggregation_config._types_and_field_names_validation()
Example #3
    def test__types_and_fields_validation_raise_wrong_function_exception(self):
        # test wrong function name
        test_input_rule = json.loads(
            """["key: input_port","sin(in_vlan)","sum(ip_size)"]""")
        test_input_operation = "reduceByKey"
        config = TestConfig({
            "processing": {
                "aggregations": {
                    "operation_type": test_input_operation,
                    "rule": test_input_rule
                }
            }
        })
        input_port = StructField('input_port', IntegerType())
        in_vlan = StructField('in_vlan', IntegerType())
        ip_size = StructField('ip_size', IntegerType())
        test_aggregation_config = AggregationsParser(
            config, StructType([input_port, in_vlan, ip_size]))
        test_aggregation_config._expression = test_aggregation_config._parse_expression(
        )

        with self.assertRaisesRegex(NotValidAggregationExpression,
                                    r"^Unsupported function\(s\): {'sin'}$"):
            _ = test_aggregation_config._types_and_field_names_validation()
Example #4
    def test__check_unique_key_field(self):
        test_input_rule = json.loads(
            """["min(field_name1)","count(field_name2)","sum(field_nameN)"]""")
        test_input_operation = "reduce"
        config = TestConfig({
            "processing": {
                "aggregations": {
                    "operation_type": test_input_operation,
                    "rule": test_input_rule
                }
            }
        })
        test_aggregation_config = AggregationsParser(
            config, self.data_structure_pyspark)
        self.assertTrue(
            test_aggregation_config._check_unique_key_field([{
                "input_field": "test1",
                "key": False
            }, {
                "input_field": "test2",
                "key": False
            }]),
            "Return value should be true if the input list does not contain "
            "a key field set to true")
        self.assertFalse(
            test_aggregation_config._check_unique_key_field([{
                "input_field": "test1",
                "key": True
            }, {
                "input_field": "test2",
                "key": False
            }]),
            "Return value should be false if the input list contains a key "
            "field set to true")
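Judging only by the two assertions above, _check_unique_key_field returns True while no token in the parsed rule is flagged as a key and False as soon as one is (presumably so _parse_reduce_by_key can reject a second key entry, see Example #10). A standalone sketch of that behaviour, not the project's implementation:

# Sketch of the behaviour implied by the assertions in Example #4.
def check_unique_key_field(tokens):
    """Return True if no token in the parsed rule is marked as a key yet."""
    return not any(token["key"] for token in tokens)

# check_unique_key_field([{"input_field": "a", "key": False}])  -> True
# check_unique_key_field([{"input_field": "a", "key": True}])   -> False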
Example #5
    def test__parse_expression(self):
        test_input_rule = json.loads(
            """["key : field_name1","count(field_name2)","sum(field_nameN)"]"""
        )
        test_input_operation = "reduceByKey"
        config = TestConfig({
            "processing": {
                "aggregations": {
                    "operation_type": test_input_operation,
                    "rule": test_input_rule
                }
            }
        })
        test_aggregation_config = AggregationsParser(
            config, self.data_structure_pyspark)
        test_expression_token = test_aggregation_config._parse_expression()
        self.assertIsInstance(
            test_expression_token, list,
            "Return value of the _parse_expression method should be an "
            "instance of list")

        test_input_rule = json.loads(
            """["sum(field_name1)","count(field_name2)","sum(field_nameN)"]""")
        test_input_operation = "reduce"
        config = TestConfig({
            "processing": {
                "aggregations": {
                    "operation_type": test_input_operation,
                    "rule": test_input_rule
                }
            }
        })
        test_aggregation_config = AggregationsParser(
            config, self.data_structure_pyspark)
        test_expression_token = test_aggregation_config._parse_expression()
        self.assertIsInstance(
            test_expression_token, list,
            "Return value of the _parse_expression method should be an "
            "instance of list")

        test_input_rule = json.loads(
            """["key: field_name1","count(field_name2)","sum(field_nameN)"]""")
        test_input_operation = "groupBy"
        config = TestConfig({
            "processing": {
                "aggregations": {
                    "operation_type": test_input_operation,
                    "rule": test_input_rule
                }
            }
        })
        test_aggregation_config = AggregationsParser(
            config, self.data_structure_pyspark)

        with self.assertRaises(NotValidAggregationExpression) as context:
            test_expression_token = test_aggregation_config._parse_expression()
        self.assertTrue(
            "The operation" in context.exception.args[0],
            "Caught an exception, but it differs from the expected one")
Example #6
    def test__field_validation(self):
        config = TestConfig({
            "processing": {
                "aggregations": {
                    "operation_type": "",
                    "rule": ""
                }
            }
        })
        test_aggregation_config = AggregationsParser(
            config, self.data_structure_pyspark)

        test_parse = test_aggregation_config._field_validation(
            [('count', 'field_name2')], "count(field_name2):new_field_name2")
        self.assertIsInstance(
            test_parse, dict,
            "Return value of the _field_validation method should be an "
            "instance of dict")
        self.assertDictEqual(
            test_parse, {
                'func_name': 'count',
                'input_field': 'field_name2',
                'key': False
            },
            "The dictionary should contain the pairs func_name: value and "
            "input_field: value")

        # test exception when two or more regexp matches are found in a field
        with self.assertRaises(NotValidAggregationExpression) as context:
            test_parse = test_aggregation_config._field_validation(
                [('count', 'field_name2'),
                 ('sum', 'field_name3')], "count(field_name2):new_field_name2")
        self.assertTrue(
            "Error in the rule" in context.exception.args[0],
            "Caught an exception, but it differs from the expected one")

        # test exception when no regexp match is found in the field
        with self.assertRaises(NotValidAggregationExpression) as context:
            test_parse = test_aggregation_config._field_validation([], "")
        self.assertTrue(
            "Error in the field" in context.exception.args[0],
            "Caught an exception, but it differs from the expected one")
Example #7
    def test__types_and_fields_validation_raise_wrong_field_name_exception(
            self):
        # test wrong field name
        test_input_rule = "key = input_port;Min(in_vlan_bad); Sum(ip_size)"
        test_input_operation = "reduceByKey"
        config = TestConfig({
            "processing": {
                "aggregations": {
                    "operation_type": test_input_operation,
                    "rule": test_input_rule
                }
            }
        })
        test_aggregation_config = AggregationsParser(
            config, self.data_structure_pyspark)
        test_aggregation_config._expression = test_aggregation_config._parse_expression(
        )

        with self.assertRaisesRegex(
                NotValidAggregationExpression,
                r"^Unsupported or unused field\(s\): {'in_vlan_bad'}$"):
            _ = test_aggregation_config._types_and_field_names_validation()
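For reference, the dictionaries produced by _field_validation (asserted with assertDictEqual in Example #6) are the tokens that the parsing and validation methods above operate on. Their shape, with illustrative values:

# Token format shown by the assertDictEqual in Example #6; key entries carry
# "key": True (see Example #4), non-key entries name the aggregation function.
aggregation_token = {
    "func_name": "count",          # aggregation function name
    "input_field": "field_name2",  # column the function is applied to
    "key": False                   # True only for "key: ..." rule entries
}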
Example #8
    def test_get_parse_expression(self):
        test_input_rule = json.loads(
            """["key: input_port","sum(packet_size)"]""")
        test_input_operation = "reduceByKey"
        config = TestConfig({
            "processing": {
                "aggregations": {
                    "operation_type": test_input_operation,
                    "rule": test_input_rule
                }
            }
        })
        input_port = StructField('input_port', IntegerType())
        packet_size = StructField('packet_size', LongType())
        test_aggregation_config = AggregationsParser(
            config, StructType([
                input_port,
                packet_size,
            ]))
        test_expression_token = test_aggregation_config.get_parse_expression()
        self.assertIsInstance(
            test_expression_token, dict,
            "Return value of the get_parse_expression method should be an "
            "instance of dict")
        self.assertEqual(
            test_expression_token["operation_type"], "reduceByKey",
            "The dictionary should contain the pair "
            "'operation_type': 'reduceByKey'")
        self.assertIsInstance(
            test_expression_token["rule"], list,
            "The dictionary should contain a non-empty 'rule' list of tokens")
        self.assertGreater(
            len(test_expression_token["rule"]), 0,
            "The dictionary should contain a non-empty 'rule' list of tokens")

        # test exception for an incorrect type, function name or field name
        test_input_rule = json.loads(
            """["key : field_name1","count(field_name2)","sum(field_nameN)"]"""
        )
        test_input_operation = "reduceByKey"
        config = TestConfig({
            "processing": {
                "aggregations": {
                    "operation_type": test_input_operation,
                    "rule": test_input_rule
                }
            }
        })
        test_aggregation_config = AggregationsParser(
            config, self.data_structure_pyspark)

        with self.assertRaisesRegex(NotValidAggregationExpression,
                                    r"^Unsupported function\(s\): {'count'}$"):
            _ = test_aggregation_config.get_parse_expression()
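Putting the assertions above together, get_parse_expression appears to return the operation type alongside the parsed rule tokens. An approximate, illustrative shape (only keys asserted in these tests are shown):

# Approximate return value of get_parse_expression(), inferred from the
# assertions in Example #8 and the token format noted after Example #7.
parsed_expression = {
    "operation_type": "reduceByKey",
    "rule": [
        # one token per rule entry; key tokens also appear here with
        # "key": True, but their exact contents are not asserted in the tests
        {"func_name": "sum", "input_field": "packet_size", "key": False}
    ]
}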
Example #9
    def __init__(self, config_processor, input_data_structure):
        self.config_processor = config_processor
        self._input_data_structure = input_data_structure

        self._support_reduce_operations = SupportedReduceOperations().operation
        self.key_data = []

        aggregation_expression = AggregationsParser(config_processor,
                                                    self._input_data_structure)
        self._aggregation_expression = aggregation_expression.get_parse_expression(
        )

        self._input_field_name = [
            struct_field.name for struct_field in self._input_data_structure
        ]

        aggregation_data = copy.deepcopy(self._aggregation_expression)
        if self._aggregation_expression["operation_type"] == "reduceByKey":
            key_struct_list = [
                token for token in self._aggregation_expression["rule"]
                if token["key"]
            ]
            # remember (key_index, key_struct_field) for every key column
            for key_struct in key_struct_list:
                self.key_data.append(
                    (self._input_field_name.index(key_struct["input_field"]),
                     key_struct))
            # drop the key fields from the aggregation rule and field list
            for key_struct in key_struct_list:
                aggregation_data["rule"].remove(key_struct)
                self._input_field_name.remove(key_struct["input_field"])

        self._field_to_func_name = {
            field["input_field"]: field["func_name"]
            for field in aggregation_data["rule"]
        }
        self._enumerate_output_field = {
            name: index
            for index, name in enumerate(self._input_field_name)
        }
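As a quick illustration of what this constructor derives from a parsed expression, the snippet below replays the same steps on a hand-written expression; all names are illustrative and nothing from the surrounding class is used:

# Replaying the construction steps above on a sample parsed expression.
input_field_names = ["src_ip", "packet_size", "traffic"]
expression = {
    "operation_type": "reduceByKey",
    "rule": [
        # key tokens carry "key": True; the empty func_name is a placeholder,
        # since the tests do not assert what key tokens contain
        {"func_name": "", "input_field": "src_ip", "key": True},
        {"func_name": "max", "input_field": "packet_size", "key": False},
        {"func_name": "sum", "input_field": "traffic", "key": False},
    ],
}

# key_data pairs each key token with its position in the input structure
key_structs = [token for token in expression["rule"] if token["key"]]
key_data = [(input_field_names.index(token["input_field"]), token)
            for token in key_structs]
# -> [(0, {'func_name': '', 'input_field': 'src_ip', 'key': True})]

# key fields are then dropped from the rule and from the field-name list
remaining_fields = [name for name in input_field_names
                    if name not in {t["input_field"] for t in key_structs}]
field_to_func_name = {token["input_field"]: token["func_name"]
                      for token in expression["rule"] if not token["key"]}
# -> {'packet_size': 'max', 'traffic': 'sum'}
enumerate_output_field = {name: index
                          for index, name in enumerate(remaining_fields)}
# -> {'packet_size': 0, 'traffic': 1}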
Example #10
    def test__parse_reduce_by_key(self):
        test_input_rule = json.loads(
            """["key : field_name1","count(field_name2)","sum(field_nameN)"]"""
        )
        test_input_operation = "reduceByKey"
        config = TestConfig({
            "processing": {
                "aggregations": {
                    "operation_type": test_input_operation,
                    "rule": test_input_rule
                }
            }
        })
        test_aggregation_config = AggregationsParser(
            config, self.data_structure_pyspark)
        test_expression_token = test_aggregation_config._parse_reduce_by_key()
        self.assertIsInstance(
            test_expression_token, list,
            "Return value of the _parse_reduce_by_key method should be an "
            "instance of list")
        self.assertGreater(
            len(test_expression_token), 0,
            "Return value of the _parse_reduce_by_key method should not be "
            "empty")

        #
        # Testing complex key
        #

        test_input_rule = json.loads(
            """["key : (field_name1,field_name2)","count(field_name3)","sum(field_nameN)"]"""
        )
        test_input_operation = "reduceByKey"
        config = TestConfig({
            "processing": {
                "aggregations": {
                    "operation_type": test_input_operation,
                    "rule": test_input_rule
                }
            }
        })
        test_aggregation_config = AggregationsParser(
            config, self.data_structure_pyspark)
        test_expression_token = test_aggregation_config._parse_reduce_by_key()
        self.assertIsInstance(
            test_expression_token, list,
            "Return value of the _parse_reduce_by_key method should be an "
            "instance of list")
        self.assertGreater(
            len(test_expression_token), 0,
            "Return value of the _parse_reduce_by_key method should not be "
            "empty")

        #
        # Testing an exception for two or more key fields
        #
        test_input_rule = json.loads(
            """["key : field_name1","key : field_name2","sum(field_nameN)"]""")
        test_input_operation = "reduceByKey"
        config = TestConfig({
            "processing": {
                "aggregations": {
                    "operation_type": test_input_operation,
                    "rule": test_input_rule
                }
            }
        })
        test_aggregation_config = AggregationsParser(
            config, self.data_structure_pyspark)
        with self.assertRaises(NotValidAggregationExpression) as context:
            _ = test_aggregation_config._parse_reduce_by_key()
        self.assertTrue(
            "Key field is not unique in rule" in context.exception.args[0],
            "Caught an exception, but it differs from the expected one")

        #
        # Testing an exception for a missing key field
        #
        test_input_rule = json.loads(
            """["sum(field_name1)","min(key)","sum(field_nameN)"]""")
        test_input_operation = "reduceByKey"
        config = TestConfig({
            "processing": {
                "aggregations": {
                    "operation_type": test_input_operation,
                    "rule": test_input_rule
                }
            }
        })
        test_aggregation_config = AggregationsParser(
            config, self.data_structure_pyspark)
        with self.assertRaises(NotValidAggregationExpression) as context:
            _ = test_aggregation_config._parse_reduce_by_key()
        self.assertTrue(
            "don't contain key field" in context.exception.args[0],
            "Caught an exception, but it differs from the expected one")

        #
        # Testing an exception for unbalanced parentheses
        #
        test_input_rule = json.loads(
            """["key: (key_field1, key_field2","sum(field_name1)","sum(field_nameN)"]"""
        )
        test_input_operation = "reduceByKey"
        config = TestConfig({
            "processing": {
                "aggregations": {
                    "operation_type": test_input_operation,
                    "rule": test_input_rule
                }
            }
        })
        test_aggregation_config = AggregationsParser(
            config, self.data_structure_pyspark)
        with self.assertRaises(NotValidAggregationExpression) as context:
            _ = test_aggregation_config._parse_reduce_by_key()
        self.assertTrue(
            "The number of opening and closing parentheses"
            in context.exception.args[0],
            "Caught an exception, but it differs from the expected "
            "one: {}".format(context.exception.args[0]))

        #
        # Testing an exception for special characters
        #
        test_input_rule = json.loads(
            """["sum(field_name1)#","min(key)","sum(field_nameN)"]""")
        test_input_operation = "reduceByKey"
        config = TestConfig({
            "processing": {
                "aggregations": {
                    "operation_type": test_input_operation,
                    "rule": test_input_rule
                }
            }
        })
        test_aggregation_config = AggregationsParser(
            config, self.data_structure_pyspark)
        with self.assertRaises(NotValidAggregationExpression) as context:
            _ = test_aggregation_config._parse_reduce_by_key()
        self.assertTrue(
            "Invalid characters detected" in context.exception.args[0],
            "Caught an exception, but it differs from the expected one")

        #
        # Testing an exception for other invalid symbols in the rule
        #
        test_input_rule = json.loads(
            """["sum(field_name1) sdfsdf","min(key)","sum(field_nameN)"]""")
        test_input_operation = "reduceByKey"
        config = TestConfig({
            "processing": {
                "aggregations": {
                    "operation_type": test_input_operation,
                    "rule": test_input_rule
                }
            }
        })
        test_aggregation_config = AggregationsParser(
            config, self.data_structure_pyspark)
        with self.assertRaises(NotValidAggregationExpression) as context:
            _ = test_aggregation_config._parse_reduce_by_key()
        self.assertTrue(
            "Error in the rule" in context.exception.args[0],
            "Caught an exception, but it differs from the expected one")
Example #11
    def test__parse_reduce(self):
        test_input_rule = json.loads(
            """["Min(field_name1)","count(field_name2)","sum(field_nameN)"]""")
        test_input_operation = "reduce"
        config = TestConfig({
            "processing": {
                "aggregations": {
                    "operation_type": test_input_operation,
                    "rule": test_input_rule
                }
            }
        })
        test_aggregation_config = AggregationsParser(
            config, self.data_structure_pyspark)
        test_expression_token = test_aggregation_config._parse_reduce()
        self.assertIsInstance(
            test_expression_token, list,
            "Return value of the _parse_reduce method should be an instance "
            "of list")
        self.assertGreater(
            len(test_expression_token), 0,
            "Return value of the _parse_reduce method should not be empty")

        #
        # Testing an exception for special characters
        #
        test_input_rule = json.loads(
            """["sum(field_name1)#","min(key)","sum(field_nameN)"]""")
        test_input_operation = "reduce"
        config = TestConfig({
            "processing": {
                "aggregations": {
                    "operation_type": test_input_operation,
                    "rule": test_input_rule
                }
            }
        })
        test_aggregation_config = AggregationsParser(
            config, self.data_structure_pyspark)
        with self.assertRaises(NotValidAggregationExpression) as context:
            test_expression_token = test_aggregation_config._parse_reduce()
        self.assertTrue(
            "Invalid characters detected" in context.exception.args[0],
            "Caught an exception, but it differs from the expected one")

        #
        # Testing an exception for other invalid symbols in the rule
        #
        test_input_rule = json.loads(
            """["sum(field_name1) sdfsdf","min(key)","sum(field_nameN)"]""")
        test_input_operation = "reduce"
        config = TestConfig({
            "processing": {
                "aggregations": {
                    "operation_type": test_input_operation,
                    "rule": test_input_rule
                }
            }
        })
        test_aggregation_config = AggregationsParser(
            config, self.data_structure_pyspark)
        with self.assertRaises(NotValidAggregationExpression) as context:
            test_expression_token = test_aggregation_config._parse_reduce()
        self.assertTrue(
            "Error in the rule" in context.exception.args[0],
            "Caught an exception, but it differs from the expected one")