Example #1
    def test_histogram_others(self, spark_session, meta_conf):
        strings = []
        # build a skewed distribution: "str-i" appears exactly i times, for i in 1..100
        for i in range(1, 101):
            value = "str-{}".format(i)
            strings.extend([value] * i)

        strings = [(i, ) for i in strings]
        df = spark_session.createDataFrame(strings, ["string_column"])

        value_meta = SparkDataFrameValueType().get_value_meta(df, meta_conf)

        histogram = value_meta.histograms["string_column"]
        assert len(histogram[0]) == 50 and len(histogram[1]) == 50
        assert histogram[0][0] == 100 and histogram[1][0] == "str-100"
        assert histogram[0][10] == 90 and histogram[1][10] == "str-90"
        assert histogram[0][-2] == 52 and histogram[1][-2] == "str-52"
        assert histogram[0][-1] == sum(range(1, 52)) and histogram[1][-1] == "_others"

        stats = value_meta.descriptive_stats["string_column"]
        assert stats["count"] == 5050 == sum(histogram[0])
        assert stats["non-null"] == 5050
        assert stats["null-count"] == 0
        assert stats["distinct"] == 100
        assert stats["type"] == "string"
Example #2
    def test_float_column(self, spark_session, meta_conf, numbers):
        numbers = [(float(i), ) if i else (None, ) for i in numbers]
        df = spark_session.createDataFrame(numbers, ["numerical_column"])
        value_meta = SparkDataFrameValueType().get_value_meta(df, meta_conf)
        # pandas_df = df.toPandas()
        # pandas_stats, pandas_histograms = DataFrameValueType().get_histograms(pandas_df, meta_conf)
        self.validate_numerical_histogram_and_stats(value_meta, "numerical_column")
        stats = value_meta.descriptive_stats["numerical_column"]
        # Python floats are inferred as Spark DoubleType, hence "double"
        assert stats["type"] == "double"
Example #3
    def test_null_str_column(self, spark_session, meta_conf):
        # requires: from pyspark.sql.types import StructType, StructField, StringType
        column_name = "null_column"
        nulls = [(None, ) for _ in range(20)]
        schema = StructType([StructField(column_name, StringType(), True)])
        null_df = spark_session.createDataFrame(nulls, schema=schema)
        value_meta = SparkDataFrameValueType().get_value_meta(null_df, meta_conf)
        # all 20 nulls end up in a single None bucket
        assert value_meta.histograms[column_name] == ([20], [None])
        stats = value_meta.descriptive_stats[column_name]
        assert stats["type"] == "string"
Example #4
    def test_complex_column(self, spark_session, meta_conf, numbers):
        complex_values = [(i, [str(i), str(i + 1)]) if i else [None] * 2
                          for i in numbers]
        df = spark_session.createDataFrame(
            complex_values, ["numerical_column", "complex_column"])
        value_meta = SparkDataFrameValueType().get_value_meta(df, meta_conf)

        assert list(value_meta.histograms.keys()) == ["numerical_column"]
        assert list(
            value_meta.descriptive_stats.keys()) == ["numerical_column"]
        self.validate_numerical_histogram_and_stats(value_meta,
                                                    "numerical_column")
Example #5
    def test_multiple_columns(self, spark_session, meta_conf, numbers):
        values = [(i, float(i), str(i), str(i)) if i else [None] * 4
                  for i in numbers]
        df = spark_session.createDataFrame(values,
                                           ["ints", "floats", "str1", "str2"])
        value_meta = SparkDataFrameValueType().get_value_meta(df, meta_conf)

        self.validate_numerical_histogram_and_stats(value_meta, "ints")
        self.validate_numerical_histogram_and_stats(value_meta, "floats")
        str_histogram_1 = value_meta.histograms["str1"]
        str_histogram_2 = value_meta.histograms["str2"]
        assert str_histogram_1[0] == [4, 3, 2, 1]
        assert str_histogram_1[1] == ["1", "5", None, "3"]
        assert str_histogram_1 == str_histogram_2
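
The `numbers` fixture itself does not appear on this page; one hypothetical fixture that is consistent with the string-histogram assertions above (four 1s, three 5s, two nulls, one 3) might look like this:

import pytest

@pytest.fixture
def numbers():
    # hypothetical data, reconstructed only from the asserts in this example
    return [1, 1, 5, None, 1, 3, 5, 5, None, 1]
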
Example #6
    def test_boolean_histogram(self, spark_session, meta_conf):
        booleans = [True] * 10 + [None] * 10 + [False] * 20 + [True] * 20
        booleans = [(i, ) for i in booleans]
        boolean_df = spark_session.createDataFrame(booleans,
                                                   ["boolean_column"])

        value_meta = SparkDataFrameValueType().get_value_meta(
            boolean_df, meta_conf)

        histogram = value_meta.histograms["boolean_column"]
        assert histogram[0] == [30, 20, 10]
        assert histogram[1] == [True, False, None]

        stats = value_meta.descriptive_stats["boolean_column"]
        assert stats["count"] == 60
        assert stats["type"] == "boolean"