def main():
    # SparkSession startup
    spark = (SparkSession
             .builder
             .master('local[*]')
             .config('spark.jars.packages', 'com.amazon.deequ:deequ:1.0.5')
             .appName('constrain-example')
             .getOrCreate())
    df = spark.createDataFrame(test_data)

    # Constraint verification
    r = (VerificationSuite(spark)
         .onData(df)
         .addCheck(Check(spark, 'error', 'examples')
                   .hasSize(lambda x: x == 8)
                   .isUnique('_2')
                   .hasCompleteness('_2', lambda x: x >= 0.75)
                   .hasUniqueness('_1', lambda x: x == 3/8)
                   .hasDistinctness('_1', lambda x: x == 5/8)
                   .hasUniqueValueRatio('_2', lambda x: x == 0.8)
                   .hasNumberOfDistinctValues('_2', lambda x: x == 6)
                   # .hasHistogram
                   .hasEntropy('_3', lambda x: x > 1)
                   # .hasMutualInformation('_2', '_3', lambda x: x > 0.5)
                   .hasApproxQuantile('_2', 0.5, lambda x: x == 7)
                   .hasMinLength('_1', lambda x: x == 6)
                   .hasMaxLength('_3', lambda x: x == 10)
                   .hasMin('_2', lambda x: x == 1)
                   .hasMax('_2', lambda x: x == 20)
                   .hasMean('_2', lambda x: x > 10)
                   .hasSum('_2', lambda x: x > 50)
                   .hasStandardDeviation('_2', lambda x: x > 5)
                   .hasApproxCountDistinct('_2', lambda x: x == 5)
                   .hasCorrelation('_2', '_5', lambda x: x == 1)
                   .satisfies("_2 > 15", "MyCondition", lambda x: x == 0.25)
                   # .hasPattern("_1", "thing([A-Z])", lambda x: x == 1)
                   # .hasDataType("_1", "string", lambda x: x == 1)
                   .isPositive('_2')
                   .isNonNegative('_2')
                   .isLessThan('_5', '_2', lambda x: x == 0.375)
                   .isLessThanOrEqualTo('_5', '_2', lambda x: x == 0.375)
                   .isGreaterThan('_5', '_2', lambda x: x == 0.125)
                   .isGreaterThanOrEqualTo('_5', '_2', lambda x: x == 0.125)
                   # .isContainedIn('_3', ['DELAYED', 'INTRANSIT'])
                   .isInInterval('_5', 1.0, 50.0))
         .run())

    # Show every constraint result without truncation
    df = DataFrame(r, spark)
    df.show(df.count(), False)

    # SparkSession and Java Gateway teardown
    spark.sparkContext._gateway.close()
    spark.stop()
def main():
    # SparkSession startup
    spark = (SparkSession
             .builder
             .master('local[*]')
             .config('spark.jars.packages', 'com.amazon.deequ:deequ:1.0.5')
             .appName('suggestions-example')
             .getOrCreate())
    df = spark.createDataFrame(test_data)

    # Analysis run
    a = (AnalysisRunner(spark)
         .onData(df)
         .addAnalyzer(analyzers.Size())
         .run())

    # Save the analysis metrics to a file-based metrics repository
    key = ResultKey(spark, 100000, {'key1': 'value1'})
    myrepo = FileSystemMetricsRepository(spark, '../test.json')
    myrepo.save(key, a)

    # Verification run, appending its metrics to the same repository
    key2 = repo.ResultKey(spark, 100000, {'key1': 'value2', 'key2': 'value3'})
    v = (base.VerificationSuite(spark)
         .onData(df)
         .addCheck(Check(spark, 'error', 'examples')
                   .hasSize(lambda x: x == 8)
                   .isUnique('_2'))
         .useRepository(myrepo)
         .saveOrAppendResult(key2)
         .run())

    # Load stored metrics that match the tag filter and were recorded after the cutoff
    myrepo.load() \
        .withTagValues({'key1': 'value1'}) \
        .after(99000) \
        .getMetricsAsDF() \
        .show()

    # SparkSession and Java Gateway teardown
    spark.sparkContext._gateway.close()
    spark.stop()
def test_hasMaxLength(self):
    chk = Check(self.spark) \
        .hasMaxLength('_3', lambda x: x == 10)
    out = self.suite.onData(self.df).addCheck(chk).run()
    out = DataFrame(out, self.spark).select('constraint_status').collect()
    self.assertEqual(out, [self.success])
def test_hasApproxQuantile(self):
    chk = Check(self.spark) \
        .hasApproxQuantile('_2', 0.5, lambda x: x == 7)
    out = self.suite.onData(self.df).addCheck(chk).run()
    out = DataFrame(out, self.spark).select('constraint_status').collect()
    self.assertEqual(out, [self.success])
def test_hasNumberOfDistinctValues(self):
    chk = Check(self.spark) \
        .hasNumberOfDistinctValues('_2', lambda x: x == 6)
    out = self.suite.onData(self.df).addCheck(chk).run()
    out = DataFrame(out, self.spark).select('constraint_status').collect()
    self.assertEqual(out, [self.success])
def test_hasUniqueValueRatio(self):
    chk = Check(self.spark) \
        .hasUniqueValueRatio('_2', lambda x: x == 0.8)
    out = self.suite.onData(self.df).addCheck(chk).run()
    out = DataFrame(out, self.spark).select('constraint_status').collect()
    self.assertEqual(out, [self.success])
def test_hasCompleteness(self):
    chk = Check(self.spark) \
        .hasCompleteness('_2', lambda x: x >= 0.75)
    out = self.suite.onData(self.df).addCheck(chk).run()
    out = DataFrame(out, self.spark).select('constraint_status').collect()
    self.assertEqual(out, [self.success])
def test_isUnique(self):
    chk = Check(self.spark) \
        .isUnique('_1')
    out = self.suite.onData(self.df).addCheck(chk).run()
    out = DataFrame(out, self.spark).select('constraint_status').collect()
    self.assertEqual(out, [self.failure])
def test_isInInterval(self):
    chk = Check(self.spark) \
        .isInInterval('_5', 1.0, 50.0)
    out = self.suite.onData(self.df).addCheck(chk).run()
    out = DataFrame(out, self.spark).select('constraint_status').collect()
    self.assertEqual(out, [self.success])
def test_isGreaterThanOrEqualTo(self):
    chk = Check(self.spark) \
        .isGreaterThanOrEqualTo('_5', '_2', lambda x: x == 0.125)
    out = self.suite.onData(self.df).addCheck(chk).run()
    out = DataFrame(out, self.spark).select('constraint_status').collect()
    self.assertEqual(out, [self.success])
def test_isNonNegative(self):
    chk = Check(self.spark) \
        .isNonNegative('_2')
    out = self.suite.onData(self.df).addCheck(chk).run()
    out = DataFrame(out, self.spark).select('constraint_status').collect()
    self.assertEqual(out, [self.success])
def test_satisfies(self):
    chk = Check(self.spark) \
        .satisfies("_2 > 15", "MyCondition", lambda x: x == 0.25)
    out = self.suite.onData(self.df).addCheck(chk).run()
    out = DataFrame(out, self.spark).select('constraint_status').collect()
    self.assertEqual(out, [self.success])
def test_hasStandardDeviation(self):
    chk = Check(self.spark) \
        .hasStandardDeviation('_2', lambda x: x > 5)
    out = self.suite.onData(self.df).addCheck(chk).run()
    out = DataFrame(out, self.spark).select('constraint_status').collect()
    self.assertEqual(out, [self.success])
def createCheck(self, level: CheckLevel, description: str, constraints=None):
    # Build a Check at the given level and description using this object's SparkSession
    return Check(self._spark_session, level, description, constraints)
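# --- Illustrative usage sketch (assumptions, not from this codebase) ---
# A minimal, hedged example of how a createCheck-style factory might be used.
# The `checks_factory` instance is hypothetical, and the level is passed in the
# string form the Check(spark, 'error', 'examples') calls above use; the type
# hint on createCheck suggests a CheckLevel value may also be expected.
chk = checks_factory.createCheck('error', 'factory-example')
result = (VerificationSuite(spark)
          .onData(df)
          .addCheck(chk.hasSize(lambda x: x == 8))  # same fluent Check API as the examples above
          .run())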