Code Example #1
    def test_scala_function1(self):
        greaterThan10 = ScalaFunction1(self.sc._gateway, lambda x: x > 10)
        self.assertFalse(greaterThan10.apply(9))
        self.assertTrue(greaterThan10.apply(11))

        notNoneTest = ScalaFunction1(self.sc._gateway, lambda x: x is not None)
        self.assertFalse(notNoneTest.apply(None))
        self.assertTrue(notNoneTest.apply('foo'))

        appendTest = ScalaFunction1(self.sc._gateway, lambda x: "{}test".format(x))
        self.assertEqual("xtest", appendTest.apply('x'))
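For context, ScalaFunction1 bridges a Python lambda into the JVM as a scala.Function1 through py4j's callback server. Below is a minimal sketch of the assumed structure (the real class lives in pydeequ's scala_utils module and may differ in detail):

    # Sketch of a py4j callback class implementing scala.Function1 (assumed structure).
    class ScalaFunction1:
        def __init__(self, gateway, lambda_function):
            self.gateway = gateway                  # py4j JavaGateway with a running callback server
            self.lambda_function = lambda_function  # the Python function to expose to the JVM

        def apply(self, arg):
            # Invoked from the JVM whenever the wrapped Function1 is applied.
            return self.lambda_function(arg)

        class Java:
            # Tells py4j which JVM interface this Python object implements.
            implements = ["scala.Function1"]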
Code Example #2
    def hasStandardDeviation(self, column, assertion, hint=None):
        """
        Creates a constraint that asserts on the standard deviation of the column

        :param str column: Column in Data Frame to run the assertion on.
        :param lambda assertion: A function with an int or float parameter. The parameter is the standard deviation.
        :param str hint: A hint that states why a constraint could have failed.
        :return: hasStandardDeviation self: A Check object that asserts the standard deviation of the column.
        """
        assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.hasStandardDeviation(column, assertion_func, hint)
        return self
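As a usage sketch (assuming an active SparkSession named spark and a DataFrame df; Check, CheckLevel, and VerificationSuite are part of pydeequ's public API), a metric constraint like this is attached to a Check and executed through a verification suite:

    from pydeequ.checks import Check, CheckLevel
    from pydeequ.verification import VerificationSuite

    # "price" is a hypothetical numeric column; the assertion receives the computed stddev.
    check = Check(spark, CheckLevel.Warning, "stddev review")
    result = (VerificationSuite(spark)
              .onData(df)
              .addCheck(check.hasStandardDeviation("price", lambda sd: sd < 10.0))
              .run())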
Code Example #3
    def haveAnyCompleteness(self, columns, assertion, hint=None):
        """Creates a constraint that asserts on any completion in the combined set of columns.

        :param list[str] columns: Columns in Data Frame to run the assertion on.
        :param lambda assertion: A function that accepts an int or float parameter.
        :param str hint: A hint that states why a constraint could have failed.
        :return: haveAnyCompleteness self: A Check object that asserts completeness in the columns.
        """
        columns_seq = to_scala_seq(self._jvm, columns)
        assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.haveAnyCompleteness(columns_seq, assertion_func, hint)
        return self
Code Example #4
    def hasApproxCountDistinct(self, column, assertion, hint=None):
        """
        Creates a constraint that asserts on the approximate count distinct of the given column

        :param str column: Column in DataFrame to run the assertion on.
        :param lambda assertion: A function with an int or float parameter.
        :param str hint: A hint that states why a constraint could have failed.
        :return: hasApproxCountDistinct self: A Check object that asserts the approximate count distinct of the column.
        """
        assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.hasApproxCountDistinct(column, assertion_func, hint)
        return self
Code Example #5
    def hasHistogramValues(self, column, assertion, binningUdf, maxBins, hint=None):
        """Creates a constraint that asserts on column's value distribution.

        :param str column: Column in Data Frame to run the assertion on.
        :param lambda assertion: A function that accepts the column's value distribution as its input parameter.
        :param binningUdf: An optional binning function.
        :param int maxBins: Histogram details are only provided for the N column values with the highest counts; maxBins sets N.
        :param str hint: A hint that states why a constraint could have failed.
        :return: hasHistogramValues self: A Check object that asserts on the column's value distribution.
        """
        assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.hasHistogramValues(column, assertion_func, binningUdf, maxBins, hint)
        return self
Code Example #6
    def hasUniqueValueRatio(self, columns, assertion, hint=None):
        """
        Creates a constraint on the unique value ratio in a single or combined set of key columns.

        :param list[str] columns: Column(s) in Data Frame to run the assertion on.
        :param lambda assertion: A function that accepts an int or float parameter.
        :param str hint: A hint that states why a constraint could have failed.
        :return: hasUniqueValueRatio self: A Check object that asserts the unique value ratio in the columns.
        """
        assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
        hint = self._jvm.scala.Option.apply(hint)
        columns_seq = to_scala_seq(self._jvm, columns)
        self._Check = self._Check.hasUniqueValueRatio(columns_seq, assertion_func, hint)
        return self
Code Example #7
    def hasEntropy(self, column, assertion, hint=None):
        """
        Creates a constraint that asserts on a column's entropy. Entropy is a measure of the level of information
        contained in a message.

        :param str column: Column in Data Frame to run the assertion on.
        :param lambda assertion: A function that accepts an int or float parameter.
        :param str hint: A hint that states why a constraint could have failed.
        :return: hasEntropy self: A Check object that asserts the entropy in the column.
        """
        assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.hasEntropy(column, assertion_func, hint)
        return self
Code Example #8
    def hasMaxLength(self, column, assertion, hint=None):
        """
        Creates a constraint that asserts on the maximum length of a string datatype column

        :param str column: Column in Data Frame to run the assertion on. The column is expected to be a string type.
        :param lambda assertion: A function that accepts an int or float parameter
                representing the maximum length of the strings.
        :param str hint: A hint that states why a constraint could have failed.
        :return: hasMaxLength self: A Check object that asserts the maximum length of the column.
        """
        assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.hasMaxLength(column, assertion_func, hint)
        return self
Code Example #9
    def hasCorrelation(self, columnA, columnB, assertion, hint=None):
        """
        Creates a constraint that asserts on the Pearson correlation between two columns.

        :param str columnA: First column in the DataFrame over which the correlation is calculated.
        :param str columnB: Second column in the DataFrame over which the correlation is calculated.
        :param lambda assertion: A function that accepts an int or float parameter.
        :param str hint: A hint that states why a constraint could have failed.
        :return: hasCorrelation self: A Check object that asserts the correlation calculation in the columns.
        """
        assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.hasCorrelation(columnA, columnB, assertion_func, hint)
        return self
Code Example #10
    def isPositive(self, column, assertion=None, hint=None):
        """
        Creates a constraint which asserts that a column contains only positive values (greater than 0).

        :param str column: The Column in DataFrame to run the assertion on.
        :param lambda assertion: A function with an int or float parameter.
        :param str hint: A hint that states why a constraint could have failed.
        :return: isPositive self: A Check object that runs the assertion on the column.
        """
        assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion) if assertion \
            else getattr(self._Check, "isPositive$default$2")()
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.isPositive(column, assertion_func, hint)
        return self
Code Example #11
    def containsURL(self, column, assertion=None, hint=None):
        """
        Check to run against the compliance of a column against a URL pattern.

        :param str column: The Column in DataFrame to be checked. The column is expected to be a string datatype.
        :param lambda assertion: A function with an int or float parameter.
        :param str hint: A hint that states why a constraint could have failed.
        :return: containsURL self: A Check object that runs the compliance on the column.
        """
        assertion = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion) if assertion \
            else getattr(self._Check, "containsURL$default$2")()
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.containsURL(column, assertion, hint)
        return self
Code Example #12
    def hasSize(self, assertion, hint=None):
        """
        Creates a constraint that calculates the data frame size and runs the assertion on it.

        :param lambda assertion: A function that accepts the data frame size as an int parameter. The
                given function can include comparisons and conjunction or disjunction statements.
        :param str hint: A hint that states why a constraint could have failed.
        :return: hasSize self: A Check object that runs the assertion on the data frame size.
        """
        assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.hasSize(assertion_func, hint)
        return self
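A sketch of hasSize in context (same spark and df assumptions as above); it is often the first constraint in a chain, since the Check methods each return self:

    check = Check(spark, CheckLevel.Error, "size and completeness")
    result = (VerificationSuite(spark)
              .onData(df)
              .addCheck(check
                  .hasSize(lambda sz: sz >= 1000)               # at least 1000 rows
                  .hasCompleteness("id", lambda c: c == 1.0))   # "id" is a hypothetical column
              .run())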
Code Example #13
    def hasApproxQuantile(self, column, quantile, assertion, hint=None):
        """
        Creates a constraint that asserts on an approximated quantile

        :param str column: Column in Data Frame to run the assertion on
        :param float quantile: Quantile to run the assertion on.
        :param lambda assertion: A function that accepts the computed quantile as an int or float parameter.
        :param str hint: A hint that states why a constraint could have failed.
        :return: hasApproxQuantile self: A Check object that asserts the approximated quantile in the column.
        """
        assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.hasApproxQuantile(column, float(quantile), assertion_func, hint)
        return self
Code Example #14
    def isGreaterThanOrEqualTo(self, columnA, columnB, assertion=None, hint=None):
        """
        Asserts that, in each row, the value of columnA is greater than or equal to the value of columnB.

        :param str columnA: Column in DataFrame to run the assertion on.
        :param str columnB: Column in DataFrame to run the assertion on.
        :param lambda assertion: A function that accepts an int or float parameter.
        :param str hint: A hint that states why a constraint could have failed.
        :return: isGreaterThanOrEqualTo self: A Check object that runs the assertion on the columns.
        """
        assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion) if assertion \
            else getattr(self._Check, "isGreaterThanOrEqualTo$default$3")()
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.isGreaterThanOrEqualTo(columnA, columnB, assertion_func, hint)
        return self
Code Example #15
    def kllSketchSatisfies(self, column, assertion, kllParameters=None, hint=None):
        """Creates a constraint that asserts on column's sketch size.

        :param str column: Column in Data Frame to run the assertion on.
        :param lambda assertion: A function that accepts the sketch's BucketDistribution
                as its input parameter.
        :param KLLParameters kllParameters: Parameters of the KLL sketch.
        :param str hint: A hint that states why a constraint could have failed.
        :return: kllSketchSatisfies self: A Check object that asserts on the column's sketch.
        """
        assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
        hint = self._jvm.scala.Option.apply(hint)
        params = self._jvm.scala.Option.apply(kllParameters._param if kllParameters else None)
        self._Check = self._Check.kllSketchSatisfies(column, assertion_func, params, hint)
        return self
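A hedged usage sketch: KLLParameters (imported from pydeequ.analyzers; the constructor is assumed to take the SparkSession, sketch size, shrinking factor, and number of buckets) supplies the optional sketch configuration:

    from pydeequ.analyzers import KLLParameters

    check = Check(spark, CheckLevel.Warning, "kll sketch")
    check = check.kllSketchSatisfies(
        "price",                                   # hypothetical numeric column
        lambda dist: dist.parameters[1] >= 0.0,    # assumed BucketDistribution accessor
        KLLParameters(spark, 2, 0.64, 2),          # assumed parameter order
    )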
Code Example #16
    def hasMutualInformation(self, columnA, columnB, assertion, hint=None):
        """
        Creates a constraint that asserts on the mutual information between two columns. Mutual information
        describes how much information about one column can be inferred from the other.

        :param str columnA: First column in the DataFrame over which the mutual information is calculated.
        :param str columnB: Second column in the DataFrame over which the mutual information is calculated.
        :param lambda assertion: A function that accepts an int or float parameter.
        :param str hint: A hint that states why a constraint could have failed.
        :return: hasMutualInformation self: A Check object that asserts the mutual information in the columns.
        """
        assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.hasMutualInformation(columnA, columnB, assertion_func, hint)
        return self
Code Example #17
    def hasCompleteness(self, column, assertion, hint=None):
        """
        Creates a constraint that asserts on column completeness.
        Uses the given history selection strategy to retrieve historical completeness values on this
        column from the history provider.

        :param str column: Column in Data Frame to run the assertion on.
        :param lambda assertion: A function that accepts an int or float parameter.
        :param str hint: A hint that states why a constraint could have failed.
        :return: hasCompleteness self: A Check object that implements the assertion.
        """
        assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.hasCompleteness(column, assertion_func, hint)
        return self
Code Example #18
    def isContainedIn(self, column, allowed_values, assertion=None, hint=None):
        """
        Asserts that every non-null value in a column is contained in a set of predefined values.

        :param str column: Column in DataFrame to run the assertion on.
        :param list[str] allowed_values: A list of allowed values for the column.
        :param lambda assertion: A function that accepts an int or float parameter.
        :param str hint: A hint that states why a constraint could have failed.
        :return: isContainedIn self: A Check object that runs the assertion on the column.
        """
        arr = self._spark_session.sparkContext._gateway.new_array(self._jvm.java.lang.String, len(allowed_values))
        for i in range(0, len(allowed_values)):
            arr[i] = allowed_values[i]
        assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion) if assertion \
            else getattr(self._Check, "IsOne")()
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.isContainedIn(column, arr, assertion_func, hint)
        return self
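A usage sketch (hypothetical status column); without an assertion, the default requires full compliance:

    check = Check(spark, CheckLevel.Error, "allowed values")
    check = check.isContainedIn("status", ["open", "closed", "pending"])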
Code Example #19
    def hasDataType(self, column, datatype: ConstrainableDataTypes, assertion=None, hint=None):
        """
        Check to run against the fraction of rows that conform to the given data type.

        :param str column: The Column in DataFrame to be checked.
        :param ConstrainableDataTypes datatype: Data type that the column should be compared against.
        :param lambda assertion: A function with an int or float parameter.
        :param str hint: A hint that states why a constraint could have failed.

        :return: hasDataType self: A Check object that runs the compliance on the column.
        """
        datatype_jvm = datatype._get_java_object(self._jvm)
        assertion = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion) if assertion \
            else getattr(self._Check, "hasDataType$default$3")()
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.hasDataType(column, datatype_jvm, assertion, hint)
        return self
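ConstrainableDataTypes is the enum pydeequ uses to name the target type; a usage sketch, assuming it exposes a member such as Integral:

    from pydeequ.checks import ConstrainableDataTypes

    check = Check(spark, CheckLevel.Warning, "type check")
    # Assert that at least 90% of values in the hypothetical "age" column conform to an integral type.
    check = check.hasDataType("age", ConstrainableDataTypes.Integral, lambda frac: frac >= 0.9)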
Code Example #20
    def satisfies(self, columnCondition, constraintName, assertion=None, hint=None):
        """
        Creates a constraint that runs the given condition on the data frame.

        :param str columnCondition: A data frame column condition, combining expressions and column names.
                It has to comply with Spark SQL syntax and can be written exactly like a condition inside a
                `WHERE` clause.
        :param str constraintName: A name that summarizes the check being made. This name is used to name the
                metrics for the analysis being done.
        :param lambda assertion: A function with an int or float parameter.
        :param str hint: A hint that states why a constraint could have failed.
        :return: satisfies self: A Check object that runs the condition on the data frame.
        """
        assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion) if assertion \
            else getattr(self._Check, "satisfies$default$2")()
        hint = self._jvm.scala.Option.apply(hint)
        self._Check = self._Check.satisfies(columnCondition, constraintName, assertion_func, hint)
        return self
Code Example #21
    def hasPattern(self, column, pattern, assertion=None, name=None, hint=None):
        """
        Checks for pattern compliance. Given a column name and a regular expression, defines a
        Check on the average compliance of the column's values to the regular expression.

        :param str column: Column in DataFrame to be checked
        :param str pattern: The regular expression that the column's values are matched against.
        :param lambda assertion: A function with an int or float parameter.
        :param str name: A name for the pattern constraint.
        :param str hint: A hint that states why a constraint could have failed.
        :return: hasPattern self: A Check object that runs the condition on the column.
        """
        assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion) if assertion \
            else getattr(self._Check, "hasPattern$default$2")()
        name = self._jvm.scala.Option.apply(name)
        hint = self._jvm.scala.Option.apply(hint)
        pattern_regex = self._jvm.scala.util.matching.Regex(pattern, None)
        self._Check = self._Check.hasPattern(column, pattern_regex, assertion_func, name, hint)
        return self
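A usage sketch (hypothetical email column; the pattern string is compiled into a scala.util.matching.Regex internally, as shown above):

    check = Check(spark, CheckLevel.Error, "email format")
    check = check.hasPattern("email", r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")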