Beispiel #1
0
    def __init__(self, metrics_params=[]):
        """
        Init the class, adding metrics_params to the
        stuff to compute if a list of metrics parameters (dictionaries) is passed.
        Metrics parameters are checked for correctness, an assertion error will interrupt
        the program if any incorrectness is met.

        :param metrics_params: List of metrics, each metric is a dictionary mapping params of
            the metric to a value, as in the json config file.
        :type metrics_params: list
        """
        Config._process_metrics(metrics_params)
        self._metrics = metrics_params
Beispiel #2
0
    def test_mutual_info_check(self):
        j17 = {
            "table": "tablePath",
            "inferSchema": True,
            "delimiter": "#",
            "header": False,
            "output": home + "/output.json",
            "verbose": False,
            "metrics": [
                {
                    "metric": "mutual",
                    "when": "c1",
                    "then": 2
                },
            ]
        }

        with self.assertRaises(AssertionError) as cm:
            Config(j17)

        j17["metrics"][0]["metric"] = "mutual_info"
        j17["metrics"][0]["when"] = ["c1"]
        with self.assertRaises(AssertionError) as cm:
            Config(j17)

        j17["metrics"][0]["when"] = 1
        j17["metrics"][0]["then"] = [0]
        with self.assertRaises(AssertionError) as cm:
            Config(j17)

        j17["metrics"][0]["then"] = 0
        j17["metrics"][0]["useless param"] = 1010
        with self.assertRaises(AssertionError) as cm:
            Config(j17)

        j17["metrics"][0]["then"] = 1
        del j17["metrics"][0]["useless param"]
        with self.assertRaises(AssertionError) as cm:
            Config(j17)

        del j17["metrics"][0]["then"]
        with self.assertRaises(AssertionError) as cm:
            Config(j17)

        del j17["metrics"][0]["when"]
        j17["metrics"][0]["then"] = 1
        with self.assertRaises(AssertionError) as cm:
            Config(j17)

        j17["metrics"][0]["then"] = 1
        with self.assertRaises(AssertionError) as cm:
            Config(j17)

        j17["metrics"][0]["when"] = 2
        Config(j17)
Beispiel #3
0
 def add(self, input):
     """
     Adds more metrics to the task, the input can be of:
     - a _Task class or any class extending it
     - a dict containing metric params, or a list of those
     :param input: a _Task class or a class inheriting from it, a list of metrics params or a single metric param
     :type input dict/list/Task
     :return: Returns a reference to this instance of _Task, for chaining.
     """
     if type(input) is list:
         Config._process_metrics(input)
         self._metrics.extend(input)
     elif type(input) is dict:
         Config._process_metrics([input])
         self._metrics.append(input)
     elif isinstance(input, _Task):
         # metrics should already have been processed, no need to check here
         self._metrics.extend(input._metrics)
     else:
         "Input must be either a list of dicts, a dict or another Task instance"
         exit()
     return self
Beispiel #4
0
    def test_optional_arguments2(self):
        # set optional arguments
        j6 = {
            "table": "tablePath",
            "inferSchema": True,
            "delimiter": "#",
            "header": False,
            "output": home + "/output.json",
            "verbose": True,
            "metrics": [
                {
                    "metric": "completeness"
                },
            ]
        }
        # test default args are set as we wanted
        c = Config(j6)
        self.assertEqual(c["delimiter"], "#")
        self.assertEqual(c["header"], False)
        self.assertEqual(c["verbose"], True)

        with self.assertRaises(AssertionError) as cm:
            j6["delimiter"] = True
            Config(j6)

        with self.assertRaises(AssertionError) as cm:
            j6["delimiter"] = "#"
            j6["header"] = "header"
            Config(j6)

        with self.assertRaises(AssertionError) as cm:
            j6["header"] = False
            j6["threads"] = "shouldnt be here"
            Config(j6)

        with self.assertRaises(AssertionError) as cm:
            j6["verbose"] = 1
            Config(j6)
Beispiel #5
0
    def test_deduplication_aproximated_check(self):
        j15 = {
            "table": "tablePath",
            "inferSchema": True,
            "delimiter": "#",
            "header": False,
            "output": home + "/output.json",
            "verbose": False,
            "metrics": [
                {
                    "metric": "deduplication_approximated",
                    "columns": []
                },
            ]
        }

        with self.assertRaises(AssertionError) as cm:
            Config(j15)

        j15["metrics"][0]["metric"] = 10
        del j15["metrics"][0]["columns"]
        with self.assertRaises(AssertionError) as cm:
            Config(j15)

        j15["metrics"][0]["metric"] = "deduplication_approximated"
        j15["metrics"][0]["useless param"] = 1010
        with self.assertRaises(AssertionError) as cm:
            Config(j15)

        j15["metrics"][0]["columns"] = ["c0"]
        # note that at this point "useless param" is still in there
        with self.assertRaises(AssertionError) as cm:
            Config(j15)

        del j15["metrics"][0]["useless param"]
        # should run
        Config(j15)
Beispiel #6
0
    def test_completeness_check(self):
        j8 = {
            "table": "tablePath",
            "inferSchema": True,
            "delimiter": "#",
            "header": False,
            "output": home + "/output.json",
            "verbose": False,
            "metrics": [
                {
                    "metric": "completeness",
                    "columns": []
                },
            ]
        }

        with self.assertRaises(AssertionError) as cm:
            Config(j8)

        j8["metrics"][0]["metric"] = 10
        del j8["metrics"][0]["columns"]
        with self.assertRaises(AssertionError) as cm:
            Config(j8)

        j8["metrics"][0]["metric"] = "completeness"
        j8["metrics"][0]["useless param"] = 1010
        with self.assertRaises(AssertionError) as cm:
            Config(j8)

        j8["metrics"][0]["columns"] = ["c0"]
        # note that at this point "useless param" is still in there
        with self.assertRaises(AssertionError) as cm:
            Config(j8)

        del j8["metrics"][0]["useless param"]
        # should run
        Config(j8)
Beispiel #7
0
 def test_optional_arguments1(self):
     # no optional arguments
     j5 = {
         "table": "tablePath",
         "inferSchema": True,
         "output": home + "/output.json",
         "metrics": [
             {
                 "metric": "completeness"
             },
         ]
     }
     # test default args are set as defaults
     c = Config(j5)
     self.assertEqual(c["delimiter"], ",")
     self.assertEqual(c["header"], True)
     self.assertEqual(c["verbose"], False)
Beispiel #8
0
    def test_getter(self):
        j7 = {
            "table": "tablePath",
            "inferSchema": True,
            "delimiter": "#",
            "header": False,
            "output": home + "/output.json",
            "verbose": True,
            "metrics": [
                {
                    "metric": "completeness"
                },
            ]
        }
        c = Config(j7)

        # test no assignment
        with self.assertRaises(TypeError) as cm:
            c["table"] = "new"

        # test is copy
        metrics = c["metrics"]
        metrics[0]["metric"] = "ayy"
        self.assertEqual(c["metrics"][0]["metric"], "completeness")
Beispiel #9
0
    def test_entropy_check(self):
        j16 = {
            "table": "tablePath",
            "inferSchema": True,
            "delimiter": "#",
            "header": False,
            "output": home + "/output.json",
            "verbose": False,
            "metrics": [
                {
                    "metric": "entropy",
                    "column": []
                },
            ]
        }

        with self.assertRaises(AssertionError) as cm:
            Config(j16)

        j16["metrics"][0]["metric"] = 10
        j16["metrics"][0]["column"] = "c1"
        with self.assertRaises(AssertionError) as cm:
            Config(j16)

        j16["metrics"][0]["metric"] = "entropy"
        j16["metrics"][0]["useless param"] = 1010
        with self.assertRaises(AssertionError) as cm:
            Config(j16)

        del j16["metrics"][0]["column"]
        with self.assertRaises(AssertionError) as cm:
            Config(j16)

        del j16["metrics"][0]["useless param"]
        with self.assertRaises(AssertionError) as cm:
            Config(j16)

        j16["metrics"][0]["column"] = "c1"
        # should run
        Config(j16)
Beispiel #10
0
    def test_constraint_check(self):
        j14 = {
            "table": "tablePath",
            "inferSchema": True,
            "delimiter": "#",
            "header": False,
            "output": home + "/output.json",
            "verbose": False,
            "metrics": [
                {
                    "metric": "constraint",
                },
            ]
        }

        with self.assertRaises(AssertionError) as cm:
            Config(j14)

        j14["metrics"][0]["when"] = ["c2"]
        with self.assertRaises(AssertionError) as cm:
            Config(j14)

        del j14["metrics"][0]["when"]
        j14["metrics"][0]["then"] = ["c2"]
        with self.assertRaises(AssertionError) as cm:
            Config(j14)

        j14["metrics"][0]["when"] = ["c2"]
        j14["metrics"][0]["then"] = ["c2"]
        j14["metrics"][0]["ssss"] = ["c2"]
        with self.assertRaises(AssertionError) as cm:
            Config(j14)

        j14["metrics"][0]["when"] = "s2"
        j14["metrics"][0]["then"] = ["c2"]
        j14["metrics"][0]["ssss"] = ["c2"]
        with self.assertRaises(AssertionError) as cm:
            Config(j14)

        del j14["metrics"][0]["ssss"]
        j14["metrics"][0]["when"] = []
        with self.assertRaises(AssertionError) as cm:
            Config(j14)

        j14["metrics"][0]["when"] = ["c1"]
        j14["metrics"][0]["then"] = True
        with self.assertRaises(AssertionError) as cm:
            Config(j14)

        j14["metrics"][0]["when"] = ["c1"]
        j14["metrics"][0]["then"] = []
        with self.assertRaises(AssertionError) as cm:
            Config(j14)

        j14["metrics"][0]["when"] = ["c1", None]
        j14["metrics"][0]["then"] = []
        with self.assertRaises(AssertionError) as cm:
            Config(j14)

        j14["metrics"][0]["when"] = ["c1", None]
        j14["metrics"][0]["then"] = []
        with self.assertRaises(AssertionError) as cm:
            Config(j14)

        j14["metrics"][0]["when"] = ["c1"]
        j14["metrics"][0]["then"] = ["c2", "c1"]
        with self.assertRaises(AssertionError) as cm:
            Config(j14)

        j14["metrics"][0]["when"] = ["c1"]
        j14["metrics"][0]["then"] = ["c2", "c4"]
        cond1 = {"s": 1, "z": 2, "x": 4}
        cond2 = {"s": 1, "z": 2, "x": 4}
        j14["metrics"][0]["conditions"] = [cond1, cond2]
        with self.assertRaises(AssertionError) as cm:
            Config(j14)

        cond1 = {"column": None, "operator": "gt", "value": 4}
        cond2 = {"column": 1, "operator": "gt", "value": 4}
        j14["metrics"][0]["conditions"] = [cond1, cond2]
        with self.assertRaises(AssertionError) as cm:
            Config(j14)

        cond1 = {"column": "we", "operator": "gt", "value": 4}
        cond2 = {"column": "c1", "operator": "vgt", "value": 4}
        j14["metrics"][0]["conditions"] = [cond1, cond2]
        with self.assertRaises(AssertionError) as cm:
            Config(j14)

        cond1 = {"column": "we", "operator": "gt", "value": "s"}
        cond2 = {"column": "c1", "operator": "gt", "value": 4}
        j14["metrics"][0]["conditions"] = [cond1, cond2]
        with self.assertRaises(AssertionError) as cm:
            Config(j14)

        print("###########")
        cond1 = {"column": "we", "operator": "gt", "value": 4}
        cond2 = {"column": "c1", "operator": "eq", "value": True}
        j14["metrics"][0]["conditions"] = [cond1, cond2]
        with self.assertRaises(AssertionError) as cm:
            Config(j14)

        cond1 = {"column": "we", "operator": "gt", "value": 4}
        cond2 = {"column": "c1", "operator": "eq", "value": "jhon"}
        j14["metrics"][0]["conditions"] = [cond1, cond2]
        # should run
        Config(j14)
Beispiel #11
0
    def test_group_rule_check(self):
        j13 = {
            "table": "tablePath",
            "inferSchema": True,
            "delimiter": "#",
            "header": False,
            "output": home + "/output.json",
            "verbose": False,
            "metrics": [
                {
                    "metric": "groupRule",
                },
            ]
        }
        with self.assertRaises(AssertionError) as cm:
            Config(j13)

        j13["metrics"][0]["metric"] = "groupRule"
        j13["metrics"][0]["having"] = dict()
        with self.assertRaises(AssertionError) as cm:
            Config(j13)

        del j13["metrics"][0]["having"]
        j13["metrics"][0]["columns"] = ["s"]
        with self.assertRaises(AssertionError) as cm:
            Config(j13)

        j13["metrics"][0]["metric"] = "groupRule"
        j13["metrics"][0]["columns"] = ["s"]
        j13["metrics"][0]["having"] = ["s"]
        j13["metrics"][0]["z"] = dict()
        with self.assertRaises(AssertionError) as cm:
            Config(j13)

        del j13["metrics"][0]["z"]
        j13["metrics"][0]["conditions"] = dict()
        with self.assertRaises(AssertionError) as cm:
            Config(j13)

        cond1 = {"column": "we", "operator": "gt", "value": 4}
        cond2 = {"column": "c1", "operator": "eq", "value": "jhon"}

        having = {
            "operator": "zz",
            "value": 5,
            "aggregator": "min",
            "column": "c2"
        }
        j13["metrics"][0]["having"] = [having]
        j13["metrics"][0]["conditions"] = [cond1, cond2]
        with self.assertRaises(AssertionError) as cm:
            Config(j13)

        having = {
            "operator": "gt",
            "value": 2,
            "aggregator": "dmin",
            "column": "c2"
        }
        j13["metrics"][0]["having"] = [having]
        with self.assertRaises(AssertionError) as cm:
            Config(j13)

        having = {
            "operator": "gt",
            "value": 2,
            "aggregator": "dmin",
            "column": "c2"
        }
        j13["metrics"][0]["having"] = [having]
        with self.assertRaises(AssertionError) as cm:
            Config(j13)

        having = {
            "operator": "gt",
            "value": 2,
            "aggregator": "max",
            "column": "c2"
        }
        j13["metrics"][0]["having"] = [having]
        cond2 = {"column": ["c11"], "operator": "eq", "value": "jhon"}
        j13["metrics"][0]["conditions"] = [cond1, cond2]
        with self.assertRaises(AssertionError) as cm:
            Config(j13)

        cond2 = {"column": "c11", "operator": "==", "value": "jhon"}
        j13["metrics"][0]["conditions"] = [cond1, cond2]
        with self.assertRaises(AssertionError) as cm:
            Config(j13)

        cond2 = {"column": "c11", "operator": "lt", "value": "0.01"}
        j13["metrics"][0]["conditions"] = [cond1, cond2]
        with self.assertRaises(AssertionError) as cm:
            Config(j13)

        cond2 = {"column": "c11", "operator": "eq", "value": 10}
        j13["metrics"][0]["conditions"] = [cond1, cond2]
        # should run
        Config(j13)
Beispiel #12
0
    def test_timeliness_check(self):
        j11 = {
            "table": "tablePath",
            "inferSchema": True,
            "delimiter": "#",
            "header": False,
            "output": home + "/output.json",
            "verbose": False,
            "metrics": [
                {
                    "metric": "timeliness",
                },
            ]
        }

        with self.assertRaises(AssertionError) as cm:
            Config(j11)

        j11["metrics"][0]["metric"] = 10
        j11["metrics"][0]["columns"] = ["1"]
        j11["metrics"][0]["timeFormat"] = "ss"
        j11["metrics"][0]["value"] = "44"
        with self.assertRaises(AssertionError) as cm:
            Config(j11)

        j11["metrics"][0]["metric"] = "timeliness"
        j11["metrics"][0]["columns"] = []
        with self.assertRaises(AssertionError) as cm:
            Config(j11)

        j11["metrics"][0]["columns"] = ["c4", "c2"]
        j11["metrics"][0]["timeFormat"] = list()
        with self.assertRaises(AssertionError) as cm:
            Config(j11)

        del j11["metrics"][0]["timeFormat"]
        j11["metrics"][0]["dateFormat"] = list()
        with self.assertRaises(AssertionError) as cm:
            Config(j11)

        j11["metrics"][0]["z<<"] = ["ss"]
        del j11["metrics"][0]["dateFormat"]
        with self.assertRaises(AssertionError) as cm:
            Config(j11)

        del j11["metrics"][0]["z<<"]
        j11["metrics"][0]["value"] = 44
        with self.assertRaises(AssertionError) as cm:
            Config(j11)

        j11["metrics"][0]["timeFormat"] = "ss:hh:mm"
        with self.assertRaises(AssertionError) as cm:
            Config(j11)

        j11["metrics"][0]["timeFormat"] = ""
        j11["metrics"][0]["value"] = ""
        with self.assertRaises(AssertionError) as cm:
            Config(j11)

        j11["metrics"][0]["timeFormat"] = ":::"
        j11["metrics"][0]["value"] = ":::"
        with self.assertRaises(AssertionError) as cm:
            Config(j11)

        # should run
        j11["metrics"][0]["timeFormat"] = "hhh"
        j11["metrics"][0]["value"] = "555"
        Config(j11)
Beispiel #13
0
    def test_freshness_check(self):
        j10 = {
            "table": "tablePath",
            "inferSchema": True,
            "delimiter": "#",
            "header": False,
            "output": home + "/output.json",
            "verbose": False,
            "metrics": [
                {
                    "metric": "freshness",
                },
            ]
        }

        with self.assertRaises(AssertionError) as cm:
            Config(j10)

        j10["metrics"][0]["metric"] = 10
        j10["metrics"][0]["columns"] = ["1"]
        j10["metrics"][0]["timeFormat"] = "ss"
        with self.assertRaises(AssertionError) as cm:
            Config(j10)

        j10["metrics"][0]["metric"] = "freshness"
        j10["metrics"][0]["columns"] = []
        with self.assertRaises(AssertionError) as cm:
            Config(j10)

        j10["metrics"][0]["metric"] = "freshness"
        j10["metrics"][0]["columns"] = [list("true")]
        j10["metrics"][0]["timeFormat"] = "ss"
        with self.assertRaises(AssertionError) as cm:
            Config(j10)

        del j10["metrics"][0]["timeFormat"]
        j10["metrics"][0]["metric"] = "freshness"
        j10["metrics"][0]["columns"] = ["c2"]
        j10["metrics"][0]["we"] = "ss"
        with self.assertRaises(AssertionError) as cm:
            Config(j10)

        del j10["metrics"][0]["we"]
        j10["metrics"][0]["metric"] = "freshness"
        j10["metrics"][0]["columns"] = ["c2"]
        j10["metrics"][0]["timeFormat"] = True
        with self.assertRaises(AssertionError) as cm:
            Config(j10)

        del j10["metrics"][0]["timeFormat"]
        j10["metrics"][0]["metric"] = "freshness"
        j10["metrics"][0]["columns"] = ["c2"]
        j10["metrics"][0]["dateFormat"] = list()
        with self.assertRaises(AssertionError) as cm:
            Config(j10)

        j10["metrics"][0]["metric"] = "freshness"
        j10["metrics"][0]["columns"] = [4.2]
        j10["metrics"][0]["dateFormat"] = "ss:hh:mm"
        with self.assertRaises(AssertionError) as cm:
            Config(j10)

        # should run
        j10["metrics"][0]["columns"] = ["s"]
        Config(j10)
Beispiel #14
0
    def test_required_arguments(self):
        # missing table
        j1 = {
            "inferSchema": True,
            "delimiter": "|",
            "header": True,
            "output": home + "/output.json",
            "verbose": True,
            "metrics": [
                {
                    "metric": "completeness"
                },
            ]
        }

        # missing inferschema
        j2 = {
            "table": "tablePath",
            "delimiter": "|",
            "header": True,
            "output": home + "/output.json",
            "verbose": True,
            "metrics": [
                {
                    "metric": "completeness"
                },
            ]
        }

        # missing output
        j3 = {
            "table": "tablePath",
            "inferSchema": True,
            "delimiter": "|",
            "header": True,
            "verbose": True,
            "metrics": [
                {
                    "metric": "completeness"
                },
            ]
        }

        # missing metrics
        j4 = {
            "table": "tablePath",
            "inferSchema": True,
            "delimiter": "|",
            "header": True,
            "output": home + "/output.json",
            "verbose": True,
        }

        with self.assertRaises(AssertionError) as cm:
            Config(j1)

        with self.assertRaises(AssertionError) as cm:
            j1["table"] = 10
            Config(j1)

        with self.assertRaises(AssertionError) as cm:
            Config(j2)

        with self.assertRaises(AssertionError) as cm:
            j2["inferSchema"] = "yes"
            Config(j2)

        with self.assertRaises(AssertionError) as cm:
            Config(j3)

        with self.assertRaises(AssertionError) as cm:
            j3["output"] = True
            Config(j3)

        with self.assertRaises(AssertionError) as cm:
            Config(j4)

        with self.assertRaises(AssertionError) as cm:
            j4["metrics"] = "ss"
            Config(j4)

        with self.assertRaises(AssertionError) as cm:
            j4["metrics"] = []
            Config(j4)
Beispiel #15
0
 def test_input_type(self):
     with self.assertRaises(SystemExit) as cm:
         Config(4)
Beispiel #16
0
if __name__ == '__main__':
    print("____HAY_CHECKER____")

    # PARSING CONFIG
    args = sys.argv[1:]
    print("Parsing configuration file")
    if len(args) != 2 or args[0] != "--config":
        print("Usage:")
        print(
            "spark-submit your-spark-parameters hay_checker.py --config <path to config.json>"
        )
        print("or")
        print("python3 hay_checker.py --config <path to config.json")
        exit()
    config = Config(args[1])

    # LAUNCH SPARK, detect if running in distributed or centralized version
    print("Launching spark")
    spark = SparkSession.builder
    spark = spark.getOrCreate()

    # IMPORT DATA
    print("Getting data")
    df = import_data(spark, config["table"], config["header"],
                     config["delimiter"], config["inferSchema"])

    # EITHER GO WITH SPARK OR PANDAS
    name = spark.conf.get("spark.app.name")
    filename = __main__.__file__.split("/")[-1]
    if name == filename:
Beispiel #17
0
    def test_rule_check(self):
        j12 = {
            "table": "tablePath",
            "inferSchema": True,
            "delimiter": "#",
            "header": False,
            "output": home + "/output.json",
            "verbose": False,
            "metrics": [
                {
                    "metric": "rule",
                },
            ]
        }
        with self.assertRaises(AssertionError) as cm:
            Config(j12)

        j12["metrics"][0]["metric"] = 10
        j12["metrics"][0]["conditions"] = dict()
        with self.assertRaises(AssertionError) as cm:
            Config(j12)

        j12["metrics"][0]["metric"] = "rule"
        j12["metrics"][0]["conditions"] = "aaa"
        with self.assertRaises(AssertionError) as cm:
            Config(j12)

        j12["metrics"][0]["conditions"] = list()
        with self.assertRaises(AssertionError) as cm:
            Config(j12)

        cond1 = dict()
        cond2 = dict()
        j12["metrics"][0]["conditions"] = [cond1, cond2]
        with self.assertRaises(AssertionError) as cm:
            Config(j12)

        cond1 = {"s": 1, "z": 2, "x": 4}
        cond2 = {"s": 1, "z": 2, "x": 4}
        j12["metrics"][0]["conditions"] = [cond1, cond2]
        with self.assertRaises(AssertionError) as cm:
            Config(j12)

        cond1 = {"column": None, "operator": "gt", "value": 4}
        cond2 = {"column": 1, "operator": "gt", "value": 4}
        j12["metrics"][0]["conditions"] = [cond1, cond2]
        with self.assertRaises(AssertionError) as cm:
            Config(j12)

        cond1 = {"column": "we", "operator": "gt", "value": 4}
        cond2 = {"column": "c1", "operator": "vgt", "value": 4}
        j12["metrics"][0]["conditions"] = [cond1, cond2]
        with self.assertRaises(AssertionError) as cm:
            Config(j12)

        cond1 = {"column": "we", "operator": "gt", "value": "s"}
        cond2 = {"column": "c1", "operator": "gt", "value": 4}
        j12["metrics"][0]["conditions"] = [cond1, cond2]
        with self.assertRaises(AssertionError) as cm:
            Config(j12)

        cond1 = {"column": "we", "operator": "gt", "value": 4}
        cond2 = {"column": "c1", "operator": "eq", "value": True}
        j12["metrics"][0]["conditions"] = [cond1, cond2]
        with self.assertRaises(AssertionError) as cm:
            Config(j12)

        cond1 = {"column": "we", "operator": "gt", "value": 4}
        cond2 = {"column": "c1", "operator": "eq", "value": "jhon"}
        j12["metrics"][0]["conditions"] = [cond1, cond2]
        # should run
        Config(j12)