def test_add_row_index(self):
    """Check that ``_add_row_index`` exposes a bigint index column.

    The first call passes no column name and expects ``rowId``; the second
    passes ``rowId='roow'`` and expects a column with that name instead.
    """
    default_named = ShowResults._add_row_index(self.dataframe)
    self.assertIn(('rowId', 'bigint'), default_named.dtypes)

    custom_named = ShowResults._add_row_index(self.dataframe, rowId='roow')
    self.assertIn(('roow', 'bigint'), custom_named.dtypes)
def test_add_outliers(self):
    """Verify the ``is_outlier`` flags produced by ``_add_outliers``.

    Distances are added first, then outliers are flagged and the resulting
    column is compared against a hand-computed list.
    """
    with_distances = ShowResults._add_distances(
        self.dataframe, point_col='point_col')
    flagged_pdf = ShowResults._add_outliers(with_distances).toPandas()

    # Boundary pre calculated mean for prediction 0: mean+2*stddev
    expected_flags = [False] * 5 + [True] + [False] * 4

    self.assertListEqual(list(flagged_pdf['is_outlier']), expected_flags)
    print('add_outliers \n', flagged_pdf)
def test_cluster_graph(self):
    """Smoke-run ``cluster_graph`` on every prediction group and print it.

    Not tested thoroughly: nothing is asserted about the produced output.
    """
    prepared = ShowResults.prepare_table_data(
        self.dataframe, point_col='point_col')
    table_pdf = prepared.toPandas()
    by_prediction = table_pdf.groupby('prediction')
    cluster_total = len(table_pdf.prediction.unique())

    # Groups are looked up by the keys 1..cluster_total.
    # NOTE(review): this presumes the prediction labels start at 1 - confirm.
    for label in range(1, cluster_total + 1):
        members = by_prediction.get_group(label)
        graph_json = ShowResults.cluster_graph(members)
        print('cluster_graph \n', graph_json)
def run(sc: pyspark.SparkContext, **kwargs):
    """Run the cleaning workflow on a CSV file and return the arranged output.

    Keyword arguments read from ``kwargs``:
        input_data: path handed to the Spark CSV reader.
        features: feature columns passed to the workflow and ``ShowResults``.
        labels: label columns passed to the workflow and ``ShowResults``.
        id: identifier column, either a single name (used as is) or a
            sequence of names (the first one is used). Defaults to ``'id'``.
        algo_params: raw algorithm parameters, parsed with
            ``parse_algorithm_variables``. The parsed key ``standardizer``
            (default ``True``) is forwarded to the workflow as ``standardize``.

    Returns:
        The data frame produced by ``ShowResults.arrange_output``, sorted by
        ``'prediction'``.
    """
    from shared.parse_algorithm_variables import parse_algorithm_variables
    from cleaning.ExecuteCleaningWorkflow import ExecuteWorkflow
    from cleaning.ShowCleaning import ShowResults

    # Initialization phase v.1.0
    import_path = kwargs.get('input_data', None)
    feature_columns = kwargs.get('features', None)
    label_columns = kwargs.get('labels', None)
    id_column = kwargs.get('id', 'id')
    # A plain string has to be used whole: indexing it would reduce the
    # default 'id' to its first character 'i'.
    id_name = id_column if isinstance(id_column, str) else id_column[0]

    algorithm_params = parse_algorithm_variables(
        vars=kwargs.get('algo_params', None))
    standardizer = algorithm_params.get('standardizer', True)
    spark_session = pyspark.sql.SparkSession(sc)

    training_data_frame = spark_session.read.load(
        path=import_path,
        format='csv',
        inferSchema=True,
        header=True,
    ).persist()

    # Release the persisted input even when a later step raises.
    try:
        header_columns = training_data_frame.columns

        cleaning_workflow = ExecuteWorkflow(
            dict_params=algorithm_params,
            cols_features=feature_columns,
            cols_labels=label_columns,
            standardize=standardizer,
        )
        training_model = cleaning_workflow.execute_pipeline(
            data_frame=training_data_frame)
        clustered_data_frame = cleaning_workflow.apply_model(
            sc=sc, model=training_model, data_frame=training_data_frame)

        show_result = ShowResults(
            id=id_name,
            list_features=feature_columns,
            list_labels=label_columns,
            list_headers=header_columns,
            **algorithm_params
        )
        all_info_df = show_result.prepare_table_data(
            dataframe=clustered_data_frame, **algorithm_params)

        d_point = 'data_points'
        output_df = show_result.arrange_output(
            sc=sc,
            dataframe=all_info_df,
            data_point_name=d_point,
            **algorithm_params
        )
    finally:
        training_data_frame.unpersist()

    return output_df.sort('prediction')
def test_arrange_output(self):
    """Check that ``arrange_output`` returns exactly the expected columns."""
    # Preamble: setup data
    feature_cols = ['a', 'b']
    id_col = 'id'
    prediction_col = 'Prediction'

    # Test 1: Are all columns there?
    presenter = ShowResults(
        id=id_col,
        list_features=feature_cols,
        list_labels=['k'],
        predictionCol=prediction_col,
        k=10,
    )
    point_name = 'dp'
    generated = self._generate_data()
    arranged = presenter.arrange_output(
        self.sc, generated, data_point_name=point_name)

    # Column order is irrelevant, so both sides are sorted before comparing.
    wanted = sorted([
        prediction_col,
        point_name,
        'amount',
        'percentage_outlier',
        'amount_outlier',
        'buckets',
    ])
    self.assertListEqual(wanted, sorted(arranged.columns))
def test_compute_summary(self):
    """Compare ``compute_summary`` output with hand-computed per-cluster figures.

    Checks the ``count``, ``outlier_count`` and ``outlier percentage`` columns.
    """
    with_distances = ShowResults._add_distances(
        self.dataframe, point_col='point_col')
    with_outliers = ShowResults._add_outliers(with_distances)
    summary_pdf = ShowResults.compute_summary(with_outliers).toPandas()

    # counts from predictionCol
    expected_counts = [6, 3, 1]
    # counts from outliers in distance
    expected_outliers = [1, 0, 0]
    # percentage from expected_outliers / expected_counts; '%.f' formats the
    # value with no decimals before it is converted back to float
    expected_percentages = [
        float('%.f' % (outliers / total * 100))
        for outliers, total in zip(expected_outliers, expected_counts)
    ]

    self.assertEqual(list(summary_pdf['count']), expected_counts)
    self.assertEqual(list(summary_pdf['outlier_count']), expected_outliers)
    self.assertEqual(
        list(summary_pdf['outlier percentage']), expected_percentages)
    print('compute_summary \n', summary_pdf)
def test_add_distances(self):
    """Check that ``_add_distances`` adds a double ``distance`` column.

    Each row's distance is compared with the square root of a hand-computed
    squared distance.
    """
    from math import sqrt

    with_distances = ShowResults._add_distances(
        self.dataframe, point_col='point_col')
    self.assertIn(('distance', 'double'), with_distances.dtypes)

    distances_pdf = with_distances.toPandas()
    squared = (1.0, 1.0, 1.0, 1.0, 4.0, 9.0 + 16.0, 1.0, 100.0, 4.0, 25.0)
    expected = [sqrt(value) for value in squared]

    # Rows are addressed by index label, one per expected value.
    for row, wanted in enumerate(expected):
        self.assertEqual(wanted, distances_pdf['distance'][row])
    print('add_distance \n', distances_pdf)
def test_create_buckets(self):
    """Work-in-progress check of ``create_buckets``.

    Prints the ``buckets`` field of the first result row and then fails
    unconditionally via ``self.fail()``.
    """
    # Preamble: setup data
    generated = self._generate_data()
    feature_cols = ['a', 'b']
    id_col = 'id'
    prediction_col = 'Prediction'

    # NOTE(review): the receiver is the result of prepare_table_data, not a
    # ShowResults instance - confirm this is intended. Disabled alternative:
    # ShowResults(id=id_col, list_features=feature_cols, list_labels=['k'],
    #             predictionCol=prediction_col, k=10)
    shows = ShowResults.prepare_table_data(
        self.dataframe, point_col='point_col')

    bucketed = shows.create_buckets(
        sc=self.sc,
        dataframe=generated,
        buckets=20,
        prediction_col=prediction_col,
    )
    first_row = bucketed.rdd.take(1)[0]
    print(first_row['buckets'])
    self.fail()
def test_prepare_table_data(self):
    """Smoke-run ``prepare_table_data`` and print the result as pandas.

    Not tested thoroughly: nothing is asserted about the output.
    """
    prepared = ShowResults.prepare_table_data(
        self.dataframe, point_col='point_col')
    table_pdf = prepared.toPandas()
    print('prepare_table_data \n', table_pdf)
def test_create_linspace(self):
    """Placeholder: only prepares the table data and displays it.

    Nothing is asserted and no linspace helper is called here.
    """
    # Original note: "data, min, max, buckets, boundary" - presumably the
    # parameter list of the function this test is meant to cover; confirm.
    prepared = ShowResults.prepare_table_data(
        self.dataframe, point_col='point_col')
    prepared.show()
def test_json_histogram(self):
    """Smoke-run ``json_histogram`` on the prepared table and print the result.

    Not tested thoroughly: nothing is asserted about the output.
    """
    prepared = ShowResults.prepare_table_data(
        self.dataframe, point_col='point_col')
    table_pdf = prepared.toPandas()
    histogram = ShowResults.json_histogram(table_pdf)
    print('json_hist \n', histogram)