Example #1
 def hasHistogramValues(self,
                        column,
                        assertion,
                        binningUdf=None,
                        maxBins=None):
     """
     Creates a constraint that asserts on column's value distribution.
     
     @param column     Column to run the assertion on
     @param assertion  Function that receives a Distribution input parameter and returns a boolean.
                       E.g
                       .hasHistogramValues("att2", _.absolutes("f") == 3)
                       .hasHistogramValues("att2",
                       _.ratios(Histogram.NullFieldReplacement) == 2/6.0)
     @param binningUdf An optional binning function
     @param maxBins    Histogram details is only provided for N column values with top counts.
                       maxBins sets the N
     @param hint A hint to provide additional context why a constraint could have failed
     """
     function = jc.scala_function1(self.spark.sparkContext._gateway,
                                   assertion)
     jvmConstraint = self.jvmCheck.hasHistogramValues(
         column, function,
         getattr(self.jvmCheck, "hasHistogramValues$default$3")(),
         getattr(self.jvmCheck, "hasHistogramValues$default$4")(),
         getattr(self.jvmCheck, "hasHistogramValues$default$5")())
     return Check(self.spark, self.level, self.description, jvmConstraint)
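A minimal usage sketch, assuming an existing Check instance named check (the column name and bin threshold are illustrative, and the assertion receives the JVM Distribution object via Py4J, so its fields are read through accessor calls). Each of these wrapper methods returns a new Check that includes the added constraint, so the result can be reassigned or chained:

    check = check.hasHistogramValues(
        "att2", lambda dist: dist.numberOfBins() <= 10)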
Example #2
 def isNonNegative(self, column, assertion=is_one):
     """
     Creates a constraint that asserts that a column contains no negative values
     
     @param column Column to run the assertion on
     @param assertion Function that receives a double input parameter and returns a boolean
     @param hint A hint to provide additional context why a constraint could have failed
     """
     function = jc.scala_function1(self.spark.sparkContext._gateway,
                                   assertion)
     jvmConstraint = self.jvmCheck.isNonNegative(
         column, function,
         getattr(self.jvmCheck, "isNonNegative$default$3")())
     return Check(self.spark, self.level, self.description, jvmConstraint)
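A minimal usage sketch (check and the column name are assumed/illustrative; the default is_one assertion requires the full fraction of rows to pass):

    check = check.isNonNegative("balance")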
Example #3
 def hasApproxCountDistinct(self, column, assertion):
     """
     Creates a constraint that asserts on the approximate count distinct of the given column
     
     @param column Column to run the assertion on
     @param assertion Function that receives a double input parameter and returns a boolean
     @param hint A hint to provide additional context why a constraint could have failed
     """
     function = jc.scala_function1(self.spark.sparkContext._gateway,
                                   assertion)
     jvmConstraint = self.jvmCheck.hasApproxCountDistinct(
         column, function,
         getattr(self.jvmCheck, "hasApproxCountDistinct$default$3")())
     return Check(self.spark, self.level, self.description, jvmConstraint)
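A minimal usage sketch (check, the column name, and the threshold are assumed/illustrative):

    check = check.hasApproxCountDistinct(
        "user_id", lambda approx: approx >= 10000)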
Example #4
 def hasSize(self, assertion):
     """
     Creates a constraint that calculates the data frame size and runs the
     assertion on it.
     Args:
         assertion (function):
     Returns:
         checks.Check object including this constraint
     """
     function = jc.scala_function1(self.spark.sparkContext._gateway,
                                   assertion)
     jvmConstraint = self.jvmCheck.hasSize(
         function,
         getattr(self.jvmCheck, "hasSize$default$2")())
     return Check(self.spark, self.level, self.description, jvmConstraint)
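A minimal usage sketch (check is an assumed existing Check instance; the size threshold is illustrative):

    check = check.hasSize(lambda size: size >= 1000)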
Example #5
 def isGreaterThan(self, columnA, columnB, assertion=is_one):
     """
     Asserts that, in each row, the value of columnA is greater than the value of columnB
     
     @param columnA Column to run the assertion on
     @param columnB Column to run the assertion on
     @param assertion Function that receives a double input parameter and returns a boolean
     @param hint A hint to provide additional context why a constraint could have failed
     """
     function = jc.scala_function1(self.spark.sparkContext._gateway,
                                   assertion)
     jvmConstraint = self.jvmCheck.isGreaterThan(
         columnA, columnB, function,
         getattr(self.jvmCheck, "isGreaterThan$default$4")())
     return Check(self.spark, self.level, self.description, jvmConstraint)
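A minimal usage sketch (check and the column names are assumed/illustrative; the default is_one assertion requires every row to satisfy the comparison):

    check = check.isGreaterThan("end_time", "start_time")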
Example #6
 def hasCorrelation(self, columnA, columnB, assertion):
     """
      Creates a constraint that asserts on the pearson correlation between two columns.
      
      @param columnA   First column for correlation calculation
      @param columnB   Second column for correlation calculation
      @param assertion Function that receives a double input parameter and returns a boolean
      @param hint A hint to provide additional context why a constraint could have failed
      """
     function = jc.scala_function1(self.spark.sparkContext._gateway,
                                   assertion)
     jvmConstraint = self.jvmCheck.hasCorrelation(
         columnA, columnB, function,
         getattr(self.jvmCheck, "hasCorrelation$default$4")())
     return Check(self.spark, self.level, self.description, jvmConstraint)
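A minimal usage sketch (check, the column names, and the correlation threshold are assumed/illustrative):

    check = check.hasCorrelation(
        "height", "weight", lambda corr: corr > 0.5)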
Example #7
 def hasCompleteness(self, column, assertion):
     """
     Creates a constraint that asserts on a column completion.
     Uses the given history selection strategy to retrieve historical completeness values on this
     column from the history provider.
     
     @param column    Column to run the assertion on
     @param assertion Function that receives a double input parameter and returns a boolean
     @param hint A hint to provide additional context why a constraint could have failed
     """
     function = jc.scala_function1(self.spark.sparkContext._gateway,
                                   assertion)
     jvmConstraint = self.jvmCheck.hasCompleteness(
         column, function,
         getattr(self.jvmCheck, "hasCompleteness$default$3")())
     return Check(self.spark, self.level, self.description, jvmConstraint)
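A minimal usage sketch (check, the column name, and the completeness threshold are assumed/illustrative):

    check = check.hasCompleteness(
        "email", lambda fraction: fraction >= 0.95)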
Example #8
 def hasApproxQuantile(self, column, quantile, assertion):
     """
     Creates a constraint that asserts on an approximated quantile
     
     @param column Column to run the assertion on
     @param quantile Which quantile to assert on
     @param assertion Function that receives a double input parameter (the computed quantile)
                      and returns a boolean
     @param hint A hint to provide additional context why a constraint could have failed
     """
     function = jc.scala_function1(self.spark.sparkContext._gateway,
                                   assertion)
     jvmConstraint = self.jvmCheck.hasApproxQuantile(
         column, quantile, function,
         getattr(self.jvmCheck, "hasApproxQuantile$default$4")())
     return Check(self.spark, self.level, self.description, jvmConstraint)
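A minimal usage sketch (check, the column name, and the latency bound are assumed/illustrative; here the assertion checks the 99th percentile):

    check = check.hasApproxQuantile(
        "latency_ms", 0.99, lambda q: q < 500.0)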
Example #9
 def hasUniqueness(self, columns, assertion):
     """
      Creates a constraint that asserts on uniqueness in a single or combined set of key columns.
     
      @param columns Key columns
      @param assertion Function that receives a double input parameter and returns a boolean.
                       Refers to the fraction of unique values
      @param hint A hint to provide additional context why a constraint could have failed
      """
     if not isinstance(columns, list):
         # A single column name was provided
         columns = [columns]
     function = jc.scala_function1(self.spark.sparkContext._gateway,
                                   assertion)
     jvmConstraint = self.jvmCheck.hasUniqueness(
         jc.iterable_to_scala_seq(self._jvm, columns), function)
     return Check(self.spark, self.level, self.description, jvmConstraint)
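A minimal usage sketch (check and the column names are assumed/illustrative; requiring a fraction of 1.0 means the combined key must be unique in every row):

    check = check.hasUniqueness(
        ["first_name", "last_name"], lambda fraction: fraction == 1.0)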
Example #10
 def isContainedIn(self, column, allowedValues, assertion=is_one):
     """
     Asserts that every non-null value in a column is contained in a set of predefined values
     
     @param column Column to run the assertion on
     @param allowedValues Allowed values for the column
     @param assertion Function that receives a double input parameter and returns a boolean
     @param hint A hint to provide additional context why a constraint could have failed
     """
     if not isinstance(allowedValues, list):
         raise ValueError("'allowedValues' must be a list of strings.")
     function = jc.scala_function1(self.spark.sparkContext._gateway,
                                   assertion)
     scalaArray = jc.iterable_to_scala_array(self._jvm, allowedValues)
     jvmConstraint = self.jvmCheck.isContainedIn(
         column, scalaArray, function,
         getattr(self.jvmCheck, "isContainedIn$default$6")())
     return Check(self.spark, self.level, self.description, jvmConstraint)
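A minimal usage sketch (check, the column name, and the allowed values are assumed/illustrative; the default is_one assertion requires every non-null value to be in the list):

    check = check.isContainedIn(
        "status", ["open", "closed", "pending"])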
Example #11
 def satisfies(self, columnCondition, constraintName, assertion):
     """
     Creates a constraint that runs the given condition on the data frame.
     
     @param columnCondition Data frame column which is a combination of expression and the column
                            name. It has to comply with Spark SQL syntax.
                            Can be written in an exact same way with conditions inside the
                            `WHERE` clause.
     @param constraintName  A name that summarizes the check being made. This name is being used to
                            name the metrics for the analysis being done.
     @param assertion       Function that receives a double input parameter and returns a boolean
     @param hint A hint to provide additional context why a constraint could have failed
     """
     function = jc.scala_function1(self.spark.sparkContext._gateway,
                                   assertion)
     jvmConstraint = self.jvmCheck.satisfies(
         columnCondition, constraintName, function,
         getattr(self.jvmCheck, "satisfies$default$4")())
     return Check(self.spark, self.level, self.description, jvmConstraint)
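A minimal usage sketch (check, the condition, and the constraint name are assumed/illustrative; the assertion receives the fraction of rows satisfying the condition):

    check = check.satisfies(
        "age >= 18", "adults only", lambda fraction: fraction == 1.0)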
Example #12
 def hasNumberOfDistinctValues(self,
                               column,
                               assertion,
                               binningUdf=None,
                               maxBins=None):
     """
     Creates a constraint that asserts on the number of distinct values a column has.
     
     @param column     Column to run the assertion on
     @param assertion  Function that receives a long input parameter and returns a boolean
     @param binningUdf An optional binning function
     @param maxBins    Histogram details is only provided for N column values with top counts.
                       maxBins sets the N
     @param hint A hint to provide additional context why a constraint could have failed
     """
     function = jc.scala_function1(self.spark.sparkContext._gateway,
                                   assertion)
     jvmConstraint = self.jvmCheck.hasNumberOfDistinctValues(
         column, function,
         getattr(self.jvmCheck, "hasNumberOfDistinctValues$default$3")(),
         getattr(self.jvmCheck, "hasNumberOfDistinctValues$default$4")(),
         getattr(self.jvmCheck, "hasNumberOfDistinctValues$default$5")())
     return Check(self.spark, self.level, self.description, jvmConstraint)
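A minimal usage sketch (check, the column name, and the threshold are assumed/illustrative):

    check = check.hasNumberOfDistinctValues(
        "country", lambda n: n <= 200)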
Example #13
 def hasDataType(self, column, dataType, assertion):
     """
     Check to run against the fraction of rows that conform to the given data type.
     
     @param column Name of the columns that should be checked.
     @param dataType Data type that the columns should be compared against.
     @param assertion Function that receives a double input parameter and returns a boolean
     @param hint A hint to provide additional context why a constraint could have failed
     """
     # Map the user-facing type names onto the JVM's ConstrainableDataTypes
     _jconstDataTypes = self._jvm.com.amazon.deequ.constraints.ConstrainableDataTypes
     dataTypes = {
         'null': _jconstDataTypes.Null(),
         'boolean': _jconstDataTypes.Boolean(),
         'string': _jconstDataTypes.String(),
         'numeric': _jconstDataTypes.Numeric(),
         'fractional': _jconstDataTypes.Fractional(),
         'integer': _jconstDataTypes.Integral()
     }
     function = jc.scala_function1(self.spark.sparkContext._gateway,
                                   assertion)
     jvmConstraint = self.jvmCheck.hasDataType(
         column, dataTypes[dataType], function,
         getattr(self.jvmCheck, "hasDataType$default$4")())
     return Check(self.spark, self.level, self.description, jvmConstraint)
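A minimal usage sketch (check and the column name are assumed/illustrative; requiring a fraction of 1.0 means every row must conform to the type):

    check = check.hasDataType(
        "price", "fractional", lambda fraction: fraction == 1.0)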