def test_scala_function1(self):
    greaterThan10 = ScalaFunction1(self.sc._gateway, lambda x: x > 10)
    self.assertFalse(greaterThan10.apply(9))
    self.assertTrue(greaterThan10.apply(11))

    notNoneTest = ScalaFunction1(self.sc._gateway, lambda x: x is not None)
    self.assertFalse(notNoneTest.apply(None))
    self.assertTrue(notNoneTest.apply("foo"))

    appendTest = ScalaFunction1(self.sc._gateway, "{}test".format)
    self.assertEqual("xtest", appendTest.apply("x"))
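# ScalaFunction1 (exercised by the test above) wraps a plain Python callable so the JVM can
# invoke it wherever a scala.Function1 is expected. A minimal sketch of such a wrapper,
# assuming py4j's callback server is running on the gateway; names and error handling are
# illustrative, not the library's actual implementation:
class Function1Sketch(object):
    def __init__(self, gateway, lambda_function):
        self.gateway = gateway                  # py4j JavaGateway with an active callback server
        self.lambda_function = lambda_function  # the Python callable to delegate to

    def apply(self, arg):
        # Called from the JVM side; delegates to the wrapped Python callable.
        return self.lambda_function(arg)

    class Java:
        # Tells py4j which JVM interface this Python object implements.
        implements = ["scala.Function1"]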
def hasSum(self, column, assertion, hint=None):
    """
    Creates a constraint that asserts on the sum of the column.

    :param str column: Column in DataFrame to run the assertion on.
    :param lambda assertion: A function that accepts an int or float parameter. The parameter is the sum.
    :param str hint: A hint that states why a constraint could have failed.
    :return: hasSum self: A Check object that asserts the sum of the column.
    """
    assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
    hint = self._jvm.scala.Option.apply(hint)
    self._Check = self._Check.hasSum(column, assertion_func, hint)
    return self
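# Hedged usage sketch (not part of the library): wiring an assertion-based check such as
# hasSum into a verification run. Assumes PyDeequ-style entry points (Check, CheckLevel,
# VerificationSuite), an existing SparkSession `spark`, and a DataFrame `df` with an
# "amount" column; the column name and threshold are illustrative only.
from pydeequ.checks import Check, CheckLevel
from pydeequ.verification import VerificationSuite

check = (Check(spark, CheckLevel.Warning, "sum check")
         .hasSum("amount", lambda s: s >= 1000.0, hint="daily totals should reach 1000"))
result = VerificationSuite(spark).onData(df).addCheck(check).run()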
def haveAnyCompleteness(self, columns, assertion, hint=None):
    """
    Creates a constraint that asserts on the completeness of the combined set of columns,
    i.e. the fraction of rows in which at least one of the columns is non-null.

    :param list[str] columns: Columns in DataFrame to run the assertion on.
    :param lambda assertion: A function that accepts an int or float parameter.
    :param str hint: A hint that states why a constraint could have failed.
    :return: haveAnyCompleteness self: A Check object that asserts completeness of the columns.
    """
    columns_seq = to_scala_seq(self._jvm, columns)
    assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
    hint = self._jvm.scala.Option.apply(hint)
    self._Check = self._Check.haveAnyCompleteness(columns_seq, assertion_func, hint)
    return self
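# to_scala_seq is a small helper that turns a Python list into a Scala Seq on the JVM side,
# so it can be handed to Scala methods that take Seq[String]. A hedged sketch of one way to
# do this over py4j; the library's actual converter chain may differ:
def to_scala_seq_sketch(jvm, items):
    # Copy the Python items into a java.util.ArrayList on the JVM...
    array_list = jvm.java.util.ArrayList()
    for item in items:
        array_list.add(item)
    # ...then convert the Java list to a Scala Seq via JavaConverters.
    return jvm.scala.collection.JavaConverters.asScalaBufferConverter(array_list).asScala().toSeq()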
def hasMaxLength(self, column, assertion, hint=None):
    """
    Creates a constraint that asserts on the maximum length of a string datatype column.

    :param str column: Column in DataFrame to run the assertion on. The column is expected to be a string type.
    :param lambda assertion: A function that accepts an int or float parameter: the maximum string length.
    :param str hint: A hint that states why a constraint could have failed.
    :return: hasMaxLength self: A Check object that asserts the maximum length of the column.
    """
    assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
    hint = self._jvm.scala.Option.apply(hint)
    self._Check = self._Check.hasMaxLength(column, assertion_func, hint)
    return self
def isPositive(self, column, assertion=None, hint=None):
    """
    Creates a constraint which asserts that a column contains only positive values (greater than 0).

    :param str column: Column in DataFrame to run the assertion on.
    :param lambda assertion: A function that accepts an int or float parameter.
    :param str hint: A hint that states why a constraint could have failed.
    :return: isPositive self: A Check object that runs the assertion on the column.
    """
    assertion_func = (ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
                      if assertion
                      else getattr(self._Check, "isPositive$default$2")())
    hint = self._jvm.scala.Option.apply(hint)
    self._Check = self._Check.isPositive(column, assertion_func, hint)
    return self
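# When no assertion is supplied above, the code falls back to the Scala method's own default
# argument: Scala compiles the default value of parameter N of a method `m` into a synthetic
# JVM method named "m$default$N", which has to be reached with getattr because of the "$"
# characters. Illustrative only; `jvm_check` stands in for the underlying JVM Check object:
#
#     default_assertion = getattr(jvm_check, "isPositive$default$2")()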
def containsURL(self, column, assertion=None, hint=None):
    """
    Check to run against the compliance of a column with a URL pattern.

    :param str column: Column in DataFrame to be checked. The column is expected to be a string datatype.
    :param lambda assertion: A function that accepts an int or float parameter.
    :param str hint: A hint that states why a constraint could have failed.
    :return: containsURL self: A Check object that runs the compliance check on the column.
    """
    assertion = (ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
                 if assertion
                 else getattr(self._Check, "containsURL$default$2")())
    hint = self._jvm.scala.Option.apply(hint)
    self._Check = self._Check.containsURL(column, assertion, hint)
    return self
def hasEntropy(self, column, assertion, hint=None):
    """
    Creates a constraint that asserts on the entropy of a column. Entropy is a measure of the
    level of information contained in a message.

    :param str column: Column in DataFrame to run the assertion on.
    :param lambda assertion: A function that accepts an int or float parameter.
    :param str hint: A hint that states why a constraint could have failed.
    :return: hasEntropy self: A Check object that asserts the entropy of the column.
    """
    assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
    hint = self._jvm.scala.Option.apply(hint)
    self._Check = self._Check.hasEntropy(column, assertion_func, hint)
    return self
def hasSize(self, assertion, hint=None):
    """
    Creates a constraint that calculates the data frame size and runs the assertion on it.

    :param lambda assertion: A function that accepts the data frame size as an int parameter.
        The given function can include comparisons and conjunction or disjunction statements.
    :param str hint: A hint that states why a constraint could have failed.
    :return: hasSize self: A Check object that runs the assertion on the data frame size.
    """
    assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
    hint = self._jvm.scala.Option.apply(hint)
    self._Check = self._Check.hasSize(assertion_func, hint)
    return self
def hasCorrelation(self, columnA, columnB, assertion, hint=None):
    """
    Creates a constraint that asserts on the Pearson correlation between two columns.

    :param str columnA: First column in DataFrame for which the correlation is calculated.
    :param str columnB: Second column in DataFrame for which the correlation is calculated.
    :param lambda assertion: A function that accepts an int or float parameter.
    :param str hint: A hint that states why a constraint could have failed.
    :return: hasCorrelation self: A Check object that asserts the correlation between the columns.
    """
    assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
    hint = self._jvm.scala.Option.apply(hint)
    self._Check = self._Check.hasCorrelation(columnA, columnB, assertion_func, hint)
    return self
def hasUniqueValueRatio(self, columns, assertion, hint=None):
    """
    Creates a constraint on the unique value ratio in a single or combined set of key columns.

    :param list[str] columns: Column(s) in DataFrame to run the assertion on.
    :param lambda assertion: A function that accepts an int or float parameter.
    :param str hint: A hint that states why a constraint could have failed.
    :return: hasUniqueValueRatio self: A Check object that asserts the unique value ratio in the columns.
    """
    assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
    hint = self._jvm.scala.Option.apply(hint)
    columns_seq = to_scala_seq(self._jvm, columns)
    self._Check = self._Check.hasUniqueValueRatio(columns_seq, assertion_func, hint)
    return self
def hasApproxQuantile(self, column, quantile, assertion, hint=None):
    """
    Creates a constraint that asserts on an approximated quantile.

    :param str column: Column in DataFrame to run the assertion on.
    :param float quantile: Quantile to run the assertion on.
    :param lambda assertion: A function that accepts the computed quantile as an int or float parameter.
    :param str hint: A hint that states why a constraint could have failed.
    :return: hasApproxQuantile self: A Check object that asserts the approximated quantile of the column.
    """
    assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
    hint = self._jvm.scala.Option.apply(hint)
    self._Check = self._Check.hasApproxQuantile(column, float(quantile), assertion_func, hint)
    return self
def isLessThan(self, columnA, columnB, assertion=None, hint=None):
    """
    Asserts that, in each row, the value of columnA is less than the value of columnB.

    :param str columnA: Column in DataFrame to run the assertion on.
    :param str columnB: Column in DataFrame to run the assertion on.
    :param lambda assertion: A function that accepts an int or float parameter.
    :param str hint: A hint that states why a constraint could have failed.
    :return: isLessThan self: A Check object that checks the assertion on the columns.
    """
    assertion_func = (ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
                      if assertion
                      else getattr(self._Check, "isLessThan$default$3")())
    hint = self._jvm.scala.Option.apply(hint)
    self._Check = self._Check.isLessThan(columnA, columnB, assertion_func, hint)
    return self
def hasCompleteness(self, column, assertion, hint=None):
    """
    Creates a constraint that asserts on the completeness of a column, i.e. the fraction of
    non-null values.

    :param str column: Column in DataFrame to run the assertion on.
    :param lambda assertion: A function that accepts an int or float parameter.
    :param str hint: A hint that states why a constraint could have failed.
    :return: hasCompleteness self: A Check object that implements the assertion.
    """
    assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
    hint = self._jvm.scala.Option.apply(hint)
    self._Check = self._Check.hasCompleteness(column, assertion_func, hint)
    return self
def hasMutualInformation(self, columnA, columnB, assertion, hint=None):
    """
    Creates a constraint that asserts on the mutual information between two columns. Mutual
    information describes how much information about one column can be inferred from another.

    :param str columnA: First column in DataFrame for which the mutual information is calculated.
    :param str columnB: Second column in DataFrame for which the mutual information is calculated.
    :param lambda assertion: A function that accepts an int or float parameter.
    :param str hint: A hint that states why a constraint could have failed.
    :return: hasMutualInformation self: A Check object that asserts the mutual information between the columns.
    """
    assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
    hint = self._jvm.scala.Option.apply(hint)
    self._Check = self._Check.hasMutualInformation(columnA, columnB, assertion_func, hint)
    return self
def hasHistogramValues(self, column, assertion, binningUdf, maxBins, hint=None):
    """
    Creates a constraint that asserts on a column's value distribution.

    :param str column: Column in DataFrame to run the assertion on.
    :param lambda assertion: A function that accepts the column's value distribution as a parameter.
    :param lambda binningUdf: An optional binning function.
    :param int maxBins: Histogram details are only provided for the N column values with the
        highest counts; maxBins sets that N.
    :param str hint: A hint that states why a constraint could have failed.
    :return: hasHistogramValues self: A Check object that asserts the value distribution of the column.
    """
    assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
    hint = self._jvm.scala.Option.apply(hint)
    self._Check = self._Check.hasHistogramValues(column, assertion_func, binningUdf, maxBins, hint)
    return self
def kllSketchSatisfies(self, column, assertion, kllParameters=None, hint=None):
    """
    Creates a constraint that asserts on a column's KLL sketch.

    :param str column: Column in DataFrame to run the assertion on.
    :param lambda assertion: A function that accepts the sketch's bucket distribution (BucketDistribution) as a parameter.
    :param KLLParameters kllParameters: Parameters of the KLL sketch.
    :param str hint: A hint that states why a constraint could have failed.
    :return: kllSketchSatisfies self: A Check object that asserts on the column's KLL sketch.
    """
    assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
    hint = self._jvm.scala.Option.apply(hint)
    params = self._jvm.scala.Option.apply(kllParameters._param if kllParameters else None)
    self._Check = self._Check.kllSketchSatisfies(column, assertion_func, params, hint)
    return self
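# Hedged usage sketch for kllSketchSatisfies (not part of the library). Assumes a
# PyDeequ-style KLLParameters helper taking (spark_session, sketch_size, shrinking_factor,
# number_of_buckets) -- verify against your version -- plus an existing SparkSession `spark`;
# the column name and parameter values are illustrative. The assertion receives the sketch's
# bucket distribution (a JVM-side object) and here trivially passes.
from pydeequ.analyzers import KLLParameters
from pydeequ.checks import Check, CheckLevel

kll_check = (Check(spark, CheckLevel.Warning, "kll check")
             .kllSketchSatisfies("age",
                                 lambda dist: True,  # dist is the sketch's bucket distribution
                                 kllParameters=KLLParameters(spark, 2048, 0.64, 2)))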
def hasDataType(self, column, datatype: ConstrainableDataTypes, assertion=None, hint=None):
    """
    Check to run against the fraction of rows that conform to the given data type.

    :param str column: Column in DataFrame to be checked.
    :param ConstrainableDataTypes datatype: Data type that the column should be compared against.
    :param lambda assertion: A function that accepts an int or float parameter.
    :param str hint: A hint that states why a constraint could have failed.
    :return: hasDataType self: A Check object that runs the compliance check on the column.
    """
    datatype_jvm = datatype._get_java_object(self._jvm)
    assertion = (ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
                 if assertion
                 else getattr(self._Check, "hasDataType$default$3")())
    hint = self._jvm.scala.Option.apply(hint)
    self._Check = self._Check.hasDataType(column, datatype_jvm, assertion, hint)
    return self
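# Hedged usage sketch for hasDataType (not part of the library). Assumes ConstrainableDataTypes
# exposes a member such as Integral (as in PyDeequ) and an existing SparkSession `spark`; the
# column name and threshold are illustrative.
from pydeequ.checks import Check, CheckLevel, ConstrainableDataTypes

dtype_check = (Check(spark, CheckLevel.Error, "type check")
               .hasDataType("user_id",
                            ConstrainableDataTypes.Integral,
                            assertion=lambda fraction: fraction >= 0.99,
                            hint="user_id should parse as an integral type in at least 99% of rows"))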
def satisfies(self, columnCondition, constraintName, assertion=None, hint=None):
    """
    Creates a constraint that runs the given condition on the data frame.

    :param str columnCondition: Data frame column which is a combination of expression and the
        column name. It has to comply with Spark SQL syntax and can be written exactly like a
        condition inside a `WHERE` clause.
    :param str constraintName: A name that summarizes the check being made. This name is used
        to name the metrics for the analysis being done.
    :param lambda assertion: A function that accepts an int or float parameter.
    :param str hint: A hint that states why a constraint could have failed.
    :return: satisfies self: A Check object that runs the condition on the data frame.
    """
    # The assertion is the third parameter of the Scala method, hence "satisfies$default$3".
    assertion_func = (ScalaFunction1(self._spark_session.sparkContext._gateway, assertion)
                      if assertion
                      else getattr(self._Check, "satisfies$default$3")())
    hint = self._jvm.scala.Option.apply(hint)
    self._Check = self._Check.satisfies(columnCondition, constraintName, assertion_func, hint)
    return self
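# Hedged usage sketch for satisfies (not part of the library): the column condition is plain
# Spark SQL, written exactly as it would appear in a WHERE clause. Assumes an existing
# SparkSession `spark`; column names and the threshold are illustrative.
from pydeequ.checks import Check, CheckLevel

rule_check = (Check(spark, CheckLevel.Warning, "business rule")
              .satisfies("discount >= 0 AND discount <= 0.5",
                         "discount within range",
                         assertion=lambda fraction: fraction == 1.0,
                         hint="discounts must stay between 0 and 50%"))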