def test_add_outliers(self):
    """Rows whose distance exceeds the per-prediction threshold are flagged as outliers."""
    with_distances = ShowResults._add_distances(self.dataframe, point_col='point_col')
    result_pdf = ShowResults._add_outliers(with_distances).toPandas()
    # Boundary pre calculated mean for prediction 0: mean+2*stddev
    # Only row index 5 lies beyond that boundary in the fixture data.
    expected_flags = [False] * 5 + [True] + [False] * 4
    self.assertListEqual(list(result_pdf['is_outlier']), expected_flags)
    print('add_outliers \n', result_pdf)
def test_add_distances(self):
    """_add_distances appends a 'distance' double column with the Euclidean
    distance of each point to its cluster center.

    Fix: the original compared Spark-computed doubles to Python ``math.sqrt``
    values with exact equality (``assertEqual``), which is fragile for
    floating point; use ``assertAlmostEqual`` instead.
    """
    from math import sqrt
    computed_dataframe = ShowResults._add_distances(self.dataframe, point_col='point_col')
    # The new column must exist and be a Spark double.
    self.assertIn(('distance', 'double'), computed_dataframe.dtypes)
    p_computed_dataframe = computed_dataframe.toPandas()
    # Expected distances pre-computed by hand from the fixture points.
    actual_distances = [sqrt(1.0), sqrt(1.0), sqrt(1.0), sqrt(1.0), sqrt(4.0),
                        sqrt(9.0 + 16.0), sqrt(1.0), sqrt(100.0), sqrt(4.0), sqrt(25.0)]
    for idx, val in enumerate(actual_distances):
        # Tolerant float comparison (default: 7 decimal places).
        self.assertAlmostEqual(val, p_computed_dataframe['distance'][idx])
    print('add_distance \n', p_computed_dataframe)
def test_compute_summary(self):
    """compute_summary reports per-prediction row count, outlier count, and
    outlier percentage (rounded to a whole number)."""
    with_distances = ShowResults._add_distances(self.dataframe, point_col='point_col')
    flagged = ShowResults._add_outliers(with_distances)
    summary_pdf = ShowResults.compute_summary(flagged).toPandas()
    # counts from predictionCol
    expected_counts = [6, 3, 1]
    # counts from outliers in distance
    expected_outliers = [1, 0, 0]
    # percentage = outliers / count * 100, rounded via '%.f' then cast back to float
    expected_percentages = [
        float('%.f' % (out / pre * 100))
        for out, pre in zip(expected_outliers, expected_counts)
    ]
    self.assertEqual(list(summary_pdf['count']), expected_counts)
    self.assertEqual(list(summary_pdf['outlier_count']), expected_outliers)
    self.assertEqual(list(summary_pdf['outlier percentage']), expected_percentages)
    print('compute_summary \n', summary_pdf)