def test_build_lambda_processor_config(self):
    parser = TransformationsParser(["a: config('input.options.port')"])
    parser.run()

    operations = TransformationOperations(self.config)

    transformations_validator = TransformationsValidator(
        operations, self.data_structure)
    _ = transformations_validator.validate(parser.expanded_transformation)

    creator = TransformationCreator(self.data_structure,
                                    parser.expanded_transformation,
                                    TransformationOperations(self.config))
    transformation = creator.build_lambda()
    self.assertIsInstance(transformation, types.LambdaType,
                          "Transformation type should be lambda")

    spark = SparkSession.builder.getOrCreate()
    file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)
    result = file.rdd.map(transformation)
    result = result.collect()
    self.assertListEqual(result,
                         [(29092, ), (29092, ), (29092, ), (29092, ),
                          (29092, )],
                         "List of tuples should be equal")
    spark.stop()

def test_build_lambda_numbers(self):
    st = SyntaxTree()
    st.operation = "_"
    st.children = [13]  # as if it parsed

    parsed_transformations = [FieldTransformation("a", st)]

    operations = TransformationOperations(self.config)

    transformations_validator = TransformationsValidator(
        operations, self.data_structure)
    _ = transformations_validator.validate(parsed_transformations)

    creator = TransformationCreator(self.data_structure,
                                    parsed_transformations,
                                    TransformationOperations(self.config))
    transformation = creator.build_lambda()
    self.assertIsInstance(transformation, types.LambdaType,
                          "Transformation type should be lambda")

    spark = SparkSession.builder.getOrCreate()
    file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)
    result = file.rdd.map(transformation)
    result = result.collect()
    self.assertListEqual(result,
                         [(13, ), (13, ), (13, ), (13, ), (13, )],
                         "List of tuples should be equal")
    spark.stop()

def test_build_lambda_add_scientific(self):
    st = SyntaxTree()
    st.operation = "add"
    st.children = [1.2E+5, 1.0]

    parsed_transformations = [FieldTransformation("sum", st)]

    creator = TransformationCreator(self.data_structure,
                                    parsed_transformations,
                                    TransformationOperations(self.config))
    transformation = creator.build_lambda()
    self.assertIsInstance(transformation, types.LambdaType,
                          "Transformation type should be lambda")

    spark = SparkSession.builder.getOrCreate()
    file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)
    result = file.rdd.map(transformation)
    result = result.collect()
    self.assertListEqual(result,
                         [(120001.0, ), (120001.0, ), (120001.0, ),
                          (120001.0, ), (120001.0, )],
                         "List of tuples should be equal")
    spark.stop()

def test_build_lambda_truncate(self):
    st = SyntaxTree()
    st.operation = "truncate"
    st.children = ["'test'", 2]

    parsed_transformations = [
        FieldTransformation("cut_upto_2_symbols", st)
    ]

    creator = TransformationCreator(self.data_structure,
                                    parsed_transformations,
                                    TransformationOperations(self.config))
    transformation = creator.build_lambda()
    self.assertIsInstance(transformation, types.LambdaType,
                          "Transformation type should be lambda")

    spark = SparkSession.builder.getOrCreate()
    file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)
    result = file.rdd.map(transformation)
    result = result.collect()
    self.assertListEqual(result,
                         [('te', ), ('te', ), ('te', ), ('te', ), ('te', )],
                         "List of tuples should be equal")
    spark.stop()

def test_build_lambda_with_nested_literals(self):
    st = SyntaxTree()
    st.operation = "concat"  # should cast int to str and concat
    st.children = ["'6'", "packet_size"]  # packet_size [74, 68]

    st2 = SyntaxTree()
    st2.operation = "concat"
    st2.children = [2E+2, st]

    parsed_transformations = [FieldTransformation("nested", st2)]

    creator = TransformationCreator(self.data_structure,
                                    parsed_transformations,
                                    TransformationOperations(self.config))
    transformation = creator.build_lambda()
    self.assertIsInstance(transformation, types.LambdaType,
                          "Transformation type should be lambda")

    spark = SparkSession.builder.getOrCreate()
    file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)
    result = file.rdd.map(transformation)
    result = result.collect()
    self.assertListEqual(result,
                         [('200.0674', ), ('200.0668', ), ('200.061510', ),
                          ('200.06185', ), ('200.06185', )],
                         "List of tuples should be equal")
    spark.stop()

def test_build_lambda_with_literals(self):
    st = SyntaxTree()
    st.operation = "concat"
    st.children = ["'6 - '", "packet_size"]  # packet_size [74, 68]

    parsed_transformations = [FieldTransformation("ephemer", st)]

    creator = TransformationCreator(self.data_structure,
                                    parsed_transformations,
                                    TransformationOperations(self.config))
    transformation = creator.build_lambda()
    self.assertIsInstance(transformation, types.LambdaType,
                          "Transformation type should be lambda")

    spark = SparkSession.builder.getOrCreate()
    file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)
    result = file.rdd.map(transformation)
    result = result.collect()
    self.assertListEqual(result,
                         [("6 - 74", ), ("6 - 68", ), ("6 - 1510", ),
                          ("6 - 185", ), ("6 - 185", )],
                         "List of tuples should be equal")
    spark.stop()

def test_build_lambda_with_nested_operations(self):
    mult_syntax_tree = SyntaxTree()
    mult_syntax_tree.operation = "mult"
    mult_syntax_tree.children = ["packet_size", "sampling_rate"]

    root_mult_st = SyntaxTree()
    root_mult_st.operation = "mult"
    root_mult_st.children = [mult_syntax_tree, "10"]

    parsed_transformations = [
        "src_ip",
        FieldTransformation("destination_ip", "dst_ip"),
        FieldTransformation("traffic", root_mult_st)
    ]

    creator = TransformationCreator(
        self.data_structure, parsed_transformations,
        TransformationOperations({
            "country": "./GeoLite2-Country.mmdb",
            "city": "./GeoLite2-City.mmdb",
            "asn": "./GeoLite2-ASN.mmdb"
        }))
    transformation = creator.build_lambda()
    self.assertIsInstance(transformation, types.LambdaType,
                          "Transformation type should be lambda")

    spark = SparkSession.builder.getOrCreate()
    file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)
    result = file.rdd.map(transformation)
    result = result.collect()
    self.assertListEqual(result,
                         [("217.69.143.60", "91.221.61.183", 378880),
                          ("91.221.61.168", "90.188.114.141", 348160),
                          ("91.226.13.80", "5.136.78.36", 7731200),
                          ("192.168.30.2", "192.168.30.1", 947200),
                          ("192.168.30.2", "192.168.30.1", 947200)],
                         "List of tuples should be equal")
    spark.stop()

def test_build_lambda_processor_add(self):
    self.maxDiff = None
    parser = TransformationsParser([
        "dst_ip: add(-13.5, 2)", "src_ip:add(-13.5,2)",
        "foobar: 'add(-13.5,2)'", "foobar2: 'add\\'(-13.5,2)'"
    ])
    parser.run()

    operations = TransformationOperations(self.config)

    transformations_validator = TransformationsValidator(
        operations, self.data_structure)
    _ = transformations_validator.validate(parser.expanded_transformation)

    creator = TransformationCreator(self.data_structure,
                                    parser.expanded_transformation,
                                    TransformationOperations(self.config))
    transformation = creator.build_lambda()
    self.assertIsInstance(transformation, types.LambdaType,
                          "Transformation type should be lambda")

    spark = SparkSession.builder.getOrCreate()
    file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)
    result = file.rdd.map(transformation)
    result = result.collect()
    self.assertListEqual(
        result,
        [(-11.5, -11.5, 'add(-13.5,2)', "add'(-13.5,2)"),
         (-11.5, -11.5, 'add(-13.5,2)', "add'(-13.5,2)"),
         (-11.5, -11.5, 'add(-13.5,2)', "add'(-13.5,2)"),
         (-11.5, -11.5, 'add(-13.5,2)', "add'(-13.5,2)"),
         (-11.5, -11.5, 'add(-13.5,2)', "add'(-13.5,2)")],
        "List of tuples should be equal")
    spark.stop()

def test_build_lambda_concat_with_nested_mul(self):
    mult_syntax_tree = SyntaxTree()
    mult_syntax_tree.operation = "mul"
    mult_syntax_tree.children = [6, "packet_size"]

    mult_syntax_tree_root = SyntaxTree()
    mult_syntax_tree_root.operation = "concat"
    mult_syntax_tree_root.children = [
        mult_syntax_tree, "' -- xe \' 2/3 mul(3,3) FooBar'"
    ]

    parsed_transformations = [
        FieldTransformation("traffic", mult_syntax_tree_root)
    ]

    creator = TransformationCreator(self.data_structure,
                                    parsed_transformations,
                                    TransformationOperations(self.config))
    transformation = creator.build_lambda()
    self.assertIsInstance(transformation, types.LambdaType,
                          "Transformation type should be lambda")

    spark = SparkSession.builder.getOrCreate()
    file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)
    result = file.rdd.map(transformation)
    result = result.collect()
    self.assertListEqual(result,
                         [('444 -- xe \' 2/3 mul(3,3) FooBar', ),
                          ('408 -- xe \' 2/3 mul(3,3) FooBar', ),
                          ('9060 -- xe \' 2/3 mul(3,3) FooBar', ),
                          ('1110 -- xe \' 2/3 mul(3,3) FooBar', ),
                          ('1110 -- xe \' 2/3 mul(3,3) FooBar', )],
                         "List of tuples should be equal")
    spark.stop()