Example #1
    def test_scala_function1(self):
        greaterThan10 = ScalaFunction1(self.sc._gateway, lambda x: x > 10)
        self.assertFalse(greaterThan10.apply(9))
        self.assertTrue(greaterThan10.apply(11))

        notNoneTest = ScalaFunction1(self.sc._gateway, lambda x: x is not None)
        self.assertFalse(notNoneTest.apply(None))
        self.assertTrue(notNoneTest.apply("foo"))

        appendTest = ScalaFunction1(self.sc._gateway, "{}test".format)
        self.assertEqual("xtest", appendTest.apply("x"))
Example #2
    def hasSum(self, column, assertion, hint=None):
        """
        Creates a constraint that asserts on the sum of the column

        :param str column: Column in Data Frame to run the assertion on.
        :param lambda assertion: A function with an int or float parameter. The parameter is the sum.
        :param str hint: A hint that states why a constraint could have failed.
        :return: hasSum self: A Check object that asserts on the sum of the column.
        """
        assertion_func = ScalaFunction1(
            self._spark_session.sparkContext._gateway, assertion)
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.hasSum(column, assertion_func, hint)
        return self
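A usage sketch, assuming an active SparkSession spark and a hypothetical sales column:

from pydeequ.checks import Check, CheckLevel

check = Check(spark, CheckLevel.Warning, "sum review")
# Hypothetical: daily sales should total at least 1000.
check = check.hasSum("sales", lambda total: total >= 1000,
                     hint="sales should sum to at least 1000")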
Example #3
    def haveAnyCompleteness(self, columns, assertion, hint=None):
        """Creates a constraint that asserts on any completion in the combined set of columns.

        :param list[str] columns: Columns in Data Frame to run the assertion on.
        :param lambda assertion: A function that accepts an int or float parameter.
        :param str hint: A hint that states why a constraint could have failed.
        :return: haveAnyCompleteness self: A Check object that asserts on the completeness of the columns.
        """
        columns_seq = to_scala_seq(self._jvm, columns)
        assertion_func = ScalaFunction1(
            self._spark_session.sparkContext._gateway, assertion)
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.haveAnyCompleteness(columns_seq,
                                                      assertion_func, hint)
        return self
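A usage sketch with hypothetical identifier columns; the assertion receives the computed completeness fraction:

from pydeequ.checks import Check, CheckLevel

check = Check(spark, CheckLevel.Warning, "completeness review")  # spark: active SparkSession
# Hypothetical: at least one of the two identifier columns must be
# populated in 90% of the rows.
check = check.haveAnyCompleteness(["user_id", "device_id"],
                                  lambda fraction: fraction >= 0.9)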
Example #4
    def hasMaxLength(self, column, assertion, hint=None):
        """
        Creates a constraint that asserts on the maximum length of a string datatype column

        :param str column: Column in Data Frame to run the assertion on. The column is expected to be a string type.
        :param lambda assertion: A function which accepts an int or float parameter
                and asserts on the maximum length of the strings in the column.
        :param str hint: A hint that states why a constraint could have failed.
        :return: hasMaxLength self: A Check object that asserts on the maximum string length in the column.
        """
        assertion_func = ScalaFunction1(
            self._spark_session.sparkContext._gateway, assertion)
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.hasMaxLength(column, assertion_func, hint)
        return self
Example #5
    def isPositive(self, column, assertion=None, hint=None):
        """
        Creates a constraint which asserts that all values in a column are positive (greater than 0).

        :param str column: The Column in DataFrame to run the assertion on.
        :param lambda assertion: A function with an int or float parameter.
        :param str hint: A hint that states why a constraint could have failed.
        :return: isPositive self: A Check object that runs the assertion on the column.
        """
        assertion_func = (ScalaFunction1(
            self._spark_session.sparkContext._gateway, assertion) if assertion
                          else getattr(self._Check, "isPositive$default$2")())
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.isPositive(column, assertion_func, hint)
        return self
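When assertion is None, the code fetches the Scala-side default via the synthetic accessor isPositive$default$2 that the Scala compiler generates for the method's second parameter. Both call forms, with a hypothetical column name and assuming the assertion receives the fraction of compliant rows, as with the other compliance checks:

from pydeequ.checks import Check, CheckLevel

check = Check(spark, CheckLevel.Error, "positivity review")
# Explicit assertion: at least 95% of rows must satisfy the predicate.
check = check.isPositive("amount", lambda fraction: fraction >= 0.95)
# No assertion: the Scala default (every row must be positive) is used.
check = check.isPositive("amount")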
Example #6
    def containsURL(self, column, assertion=None, hint=None):
        """
        Check to run against the compliance of a column with a URL pattern.

        :param str column: The Column in DataFrame to be checked. The column is expected to be a string datatype.
        :param lambda assertion: A function with an int or float parameter.
        :param str hint: A hint that states why a constraint could have failed.
        :return: containsURL self: A Check object that runs the compliance on the column.
        """
        assertion = (ScalaFunction1(self._spark_session.sparkContext._gateway,
                                    assertion) if assertion else
                     getattr(self._Check, "containsURL$default$2")())
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.containsURL(column, assertion, hint)
        return self
Example #7
    def hasEntropy(self, column, assertion, hint=None):
        """
        Creates a constraint that asserts on a column entropy. Entropy is a measure of the level of information
        contained in a message.

        :param str column: Column in Data Frame to run the assertion on.
        :param lambda assertion: A function that accepts an int or float parameter.
        :param str hint: A hint that states why a constraint could have failed.
        :return: hasEntropy self: A Check object that asserts the entropy in the column.
        """
        assertion_func = ScalaFunction1(
            self._spark_session.sparkContext._gateway, assertion)
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.hasEntropy(column, assertion_func, hint)
        return self
Example #8
    def hasSize(self, assertion, hint=None):
        """
        Creates a constraint that calculates the data frame size and runs the assertion on it.

        :param lambda assertion: A function that accepts the data frame size as its parameter.
                The given function can include comparisons and conjunction or disjunction statements.
        :param str hint: A hint that states why a constraint could have failed.
        :return: hasSize self: A Check object that runs the assertion on the data frame size.
        """
        assertion_func = ScalaFunction1(
            self._spark_session.sparkContext._gateway, assertion)
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.hasSize(assertion_func, hint)
        return self
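hasSize is the one constraint here that takes no column. A sketch of running it end to end through a verification suite, assuming an active SparkSession spark and a DataFrame df:

from pydeequ.checks import Check, CheckLevel
from pydeequ.verification import VerificationSuite

check = Check(spark, CheckLevel.Error, "size review") \
    .hasSize(lambda size: size > 0)  # the data frame must not be empty

result = VerificationSuite(spark) \
    .onData(df) \
    .addCheck(check) \
    .run()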
Example #9
    def hasCorrelation(self, columnA, columnB, assertion, hint=None):
        """
        Creates a constraint that asserts on the Pearson correlation between two columns.

        :param str columnA: First column in the DataFrame used to calculate the correlation.
        :param str columnB: Second column in the DataFrame used to calculate the correlation.
        :param lambda assertion: A function that accepts an int or float parameter.
        :param str hint: A hint that states why a constraint could have failed.
        :return: hasCorrelation self: A Check object that asserts the correlation calculation in the columns.
        """
        assertion_func = ScalaFunction1(
            self._spark_session.sparkContext._gateway, assertion)
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.hasCorrelation(columnA, columnB,
                                                 assertion_func, hint)
        return self
Example #10
    def hasUniqueValueRatio(self, columns, assertion, hint=None):
        """
        Creates a constraint on the unique value ratio in a single or combined set of key columns.

        :param list[str] columns: Column(s) in Data Frame to run the assertion on.
        :param lambda assertion: A function that accepts an int or float parameter.
        :param str hint: A hint that states why a constraint could have failed.
        :return: hasUniqueValueRatio self: A Check object that asserts the unique value ratio in the columns.
        """
        assertion_func = ScalaFunction1(
            self._spark_session.sparkContext._gateway, assertion)
        hint = self._jvm.scala.Option.apply(hint)
        columns_seq = to_scala_seq(self._jvm, columns)
        self._Check = self._Check.hasUniqueValueRatio(columns_seq,
                                                      assertion_func, hint)
        return self
Example #11
    def hasApproxQuantile(self, column, quantile, assertion, hint=None):
        """
        Creates a constraint that asserts on an approximated quantile.

        :param str column: Column in Data Frame to run the assertion on.
        :param float quantile: Quantile to run the assertion on.
        :param lambda assertion: A function that accepts the computed quantile as an int or float parameter.
        :param str hint: A hint that states why a constraint could have failed.
        :return: hasApproxQuantile self: A Check object that asserts the approximated quantile in the column.
        """
        assertion_func = ScalaFunction1(
            self._spark_session.sparkContext._gateway, assertion)
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.hasApproxQuantile(column, float(quantile),
                                                    assertion_func, hint)
        return self
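A usage sketch with a hypothetical latency column; quantile is coerced to float before crossing into the JVM:

from pydeequ.checks import Check, CheckLevel

check = Check(spark, CheckLevel.Warning, "latency review")
# Hypothetical: the 90th percentile of latency must stay below 250 ms.
check = check.hasApproxQuantile("latency_ms", 0.9, lambda q: q < 250.0)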
Example #12
    def isLessThan(self, columnA, columnB, assertion=None, hint=None):
        """
        Asserts that, in each row, the value of columnA is less than the value of columnB.

        :param str columnA: Column in DataFrame to run the assertion on.
        :param str columnB: Column in DataFrame to run the assertion on.
        :param lambda assertion: A function that accepts an int or float parameter.
        :param str hint: A hint that states why a constraint could have failed.
        :return: isLessThan self: A Check object that checks the assertion on the columns.
        """
        assertion_func = (ScalaFunction1(
            self._spark_session.sparkContext._gateway, assertion) if assertion
                          else getattr(self._Check, "isLessThan$default$3")())
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.isLessThan(columnA, columnB, assertion_func,
                                             hint)
        return self
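A usage sketch with hypothetical date columns, assuming the assertion receives the fraction of rows in which columnA is less than columnB:

from pydeequ.checks import Check, CheckLevel

check = Check(spark, CheckLevel.Error, "ordering review")
# No assertion: the Scala default (every row must satisfy columnA < columnB).
check = check.isLessThan("ship_date", "delivery_date")
# Explicit assertion: tolerate up to 1% of violating rows.
check = check.isLessThan("ship_date", "delivery_date",
                         lambda fraction: fraction >= 0.99)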
Example #13
    def hasCompleteness(self, column, assertion, hint=None):
        """
        Creates a constraint that asserts on column completeness.
        Uses the given history selection strategy to retrieve historical completeness values on this
        column from the history provider.

        :param str column: Column in Data Frame to run the assertion on.
        :param lambda assertion: A function that accepts an int or float parameter.
        :param str hint: A hint that states why a constraint could have failed.
        :return: hasCompleteness self: A Check object that implements the assertion.
        """
        assertion_func = ScalaFunction1(
            self._spark_session.sparkContext._gateway, assertion)
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.hasCompleteness(column, assertion_func, hint)
        return self
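A usage sketch; the assertion receives the fraction of non-null values in the column (column name and threshold are hypothetical):

from pydeequ.checks import Check, CheckLevel

check = Check(spark, CheckLevel.Warning, "completeness review")
# Hypothetical: at most 5% of "email" values may be null.
check = check.hasCompleteness("email", lambda fraction: fraction >= 0.95,
                              hint="emails are needed for notifications")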
Example #14
    def hasMutualInformation(self, columnA, columnB, assertion, hint=None):
        """
        Creates a constraint that asserts on the mutual information between two columns. Mutual information describes how
        much information about one column can be inferred from another.

        :param str columnA: First column in the DataFrame used to calculate the mutual information.
        :param str columnB: Second column in the DataFrame used to calculate the mutual information.
        :param lambda assertion: A function that accepts an int or float parameter.
        :param str hint: A hint that states why a constraint could have failed.
        :return: hasMutualInformation self: A Check object that asserts the mutual information in the columns.
        """
        assertion_func = ScalaFunction1(
            self._spark_session.sparkContext._gateway, assertion)
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.hasMutualInformation(columnA, columnB,
                                                       assertion_func, hint)
        return self
Example #15
    def hasHistogramValues(self,
                           column,
                           assertion,
                           binningUdf,
                           maxBins,
                           hint=None):
        """Creates a constraint that asserts on column's value distribution.

        :param str column: Column in Data Frame to run the assertion on.
        :param lambda assertion: A function that accepts the column's value distribution as its input parameter.
        :param binningUdf: An optional binning function.
        :param int maxBins: Histogram details are only provided for the N column values with the top counts; maxBins sets the N.
        :param str hint: A hint that states why a constraint could have failed.
        :return: hasHistogramValues self: A Check object that asserts on the column's value distribution.
        """
        assertion_func = ScalaFunction1(
            self._spark_session.sparkContext._gateway, assertion)
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.hasHistogramValues(column, assertion_func,
                                                     binningUdf, maxBins, hint)
        return self
Example #16
    def kllSketchSatisfies(self,
                           column,
                           assertion,
                           kllParameters=None,
                           hint=None):
        """Creates a constraint that asserts on column's sketch size.

        :param str column: Column in Data Frame to run the assertion on.
        :param lambda assertion: A function that accepts a BucketDistribution
                (the computed KLL sketch) as its input parameter.
        :param KLLParameters kllParameters: Parameters of the KLL sketch.
        :param str hint: A hint that states why a constraint could have failed.
        :return: kllSketchSatisfies self: A Check object that runs the assertion on the column's KLL sketch.
        """
        assertion_func = ScalaFunction1(
            self._spark_session.sparkContext._gateway, assertion)
        hint = self._jvm.scala.Option.apply(hint)
        params = self._jvm.scala.Option.apply(
            kllParameters._param if kllParameters else None)
        self._Check = self._Check.kllSketchSatisfies(column, assertion_func,
                                                     params, hint)
        return self
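A usage sketch, assuming KLLParameters is importable from pydeequ.analyzers with the argument order (spark_session, sketch_size, shrinking_factor, number_of_buckets); the assertion receives the JVM-side BucketDistribution, so a placeholder predicate is shown:

from pydeequ.analyzers import KLLParameters
from pydeequ.checks import Check, CheckLevel

check = Check(spark, CheckLevel.Warning, "sketch review")
check = check.kllSketchSatisfies(
    "temperature",
    lambda dist: True,  # placeholder; dist is deequ's BucketDistribution
    kllParameters=KLLParameters(spark, 2048, 0.64, 10))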
Example #17
    def hasDataType(self,
                    column,
                    datatype: ConstrainableDataTypes,
                    assertion=None,
                    hint=None):
        """
        Check to run against the fraction of rows that conform to the given data type.

        :param str column: The Column in DataFrame to be checked.
        :param ConstrainableDataTypes datatype: Data type that the column should be compared against.
        :param lambda assertion: A function with an int or float parameter.
        :param str hint: A hint that states why a constraint could have failed.

        :return: hasDataType self: A Check object that runs the compliance on the column.
        """
        datatype_jvm = datatype._get_java_object(self._jvm)
        assertion = (ScalaFunction1(self._spark_session.sparkContext._gateway,
                                    assertion) if assertion else
                     getattr(self._Check, "hasDataType$default$3")())
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.hasDataType(column, datatype_jvm, assertion,
                                              hint)
        return self
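A usage sketch, assuming the ConstrainableDataTypes enum offers an Integral member and that the assertion receives the fraction of conforming rows:

from pydeequ.checks import Check, CheckLevel, ConstrainableDataTypes

check = Check(spark, CheckLevel.Warning, "type review")
# Hypothetical: at least 99% of "age" values should parse as integral.
check = check.hasDataType("age", ConstrainableDataTypes.Integral,
                          lambda fraction: fraction >= 0.99)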
Example #18
    def satisfies(self,
                  columnCondition,
                  constraintName,
                  assertion=None,
                  hint=None):
        """
        Creates a constraint that runs the given condition on the data frame.

        :param str columnCondition: Data frame column which is a combination of expression and the column name.
                It has to comply with Spark SQL syntax and can be written exactly like a condition inside a
                `WHERE` clause.
        :param str constraintName: A name that summarizes the check being made. This name is being used to name the
                metrics for the analysis being done.
        :param lambda assertion: A function with an int or float parameter.
        :param str hint: A hint that states why a constraint could have failed.
        :return: satisfies self: A Check object that runs the condition on the data frame.
        """
        assertion_func = (ScalaFunction1(
            self._spark_session.sparkContext._gateway, assertion) if assertion
                          else getattr(self._Check, "satisfies$default$2")())
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.satisfies(columnCondition, constraintName,
                                            assertion_func, hint)
        return self
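A usage sketch; the assertion receives the fraction of rows matching the SQL condition, and omitting it falls back to the Scala default (all rows must match):

from pydeequ.checks import Check, CheckLevel

check = Check(spark, CheckLevel.Error, "business rule review")
check = check.satisfies("amount >= 0 AND currency IS NOT NULL",
                        "valid amounts",
                        lambda fraction: fraction == 1.0)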