import pyspark


def run(sc: pyspark.SparkContext, **kwargs):
    from shared.parse_algorithm_variables import parse_algorithm_variables
    from cleaning.ExecuteCleaningWorkflow import ExecuteWorkflow
    from cleaning.ShowCleaning import ShowResults

    # Initialization phase v.1.0: pull job parameters out of kwargs.
    import_path = kwargs.get('input_data', None)
    feature_columns = kwargs.get('features', None)
    label_columns = kwargs.get('labels', None)
    # Expected to be a list of column names; the first entry is handed to
    # ShowResults below, so the default must be a one-element list rather
    # than a bare string (indexing a string would yield a single character).
    id_column = kwargs.get('id', ['id'])
    # header_columns = kwargs.get('headers', None)
    algorithm_params = parse_algorithm_variables(
        vars=kwargs.get('algo_params', None))
    standardizer = algorithm_params.get('standardizer', True)
    spark_session = pyspark.sql.SparkSession(sc)

    # label_schema = create_sub_schema(label_columns, type='label')
    # id_schema = create_sub_schema(id_column, type='id')
    # feature_schema = create_sub_schema(feature_columns, type='feature')
    # all_structs = list(filter(lambda x: x != None, id_schema + label_schema + feature_schema))
    # training_data_schema = T.StructType(all_structs)

    # Load the training data once and cache it; it is reused by both the
    # pipeline fit and the model application below.
    training_data_frame = spark_session.read.load(
        path=import_path, format='csv', inferSchema=True, header=True).persist()
    header_columns = training_data_frame.columns
    # training_data_frame.show()

    # Fit the cleaning pipeline, then apply the trained model back onto the
    # training data to obtain cluster assignments.
    cleaning_workflow = ExecuteWorkflow(
        dict_params=algorithm_params,
        cols_features=feature_columns,
        cols_labels=label_columns,
        standardize=standardizer)
    training_model = cleaning_workflow.execute_pipeline(
        data_frame=training_data_frame)
    clustered_data_frame = cleaning_workflow.apply_model(
        sc=sc, model=training_model, data_frame=training_data_frame)
    # clustered_data_frame.show()

    # Summarize the clusters and arrange the final output table.
    show_result = ShowResults(
        id=id_column[0],
        list_features=feature_columns,
        list_labels=label_columns,
        list_headers=header_columns,
        **algorithm_params)
    all_info_df = show_result.prepare_table_data(
        dataframe=clustered_data_frame, **algorithm_params)
    # all_info_df.show()

    d_point = 'data_points'
    output_df = show_result.arrange_output(
        sc=sc, dataframe=all_info_df, data_point_name=d_point,
        **algorithm_params)

    training_data_frame.unpersist()
    return output_df.sort('prediction')
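# Illustrative driver sketch, not part of the module: the keyword names
# mirror the kwargs.get(...) lookups in run() above, but the CSV path, the
# column names, and the shape of algo_params are assumptions for
# demonstration only (parse_algorithm_variables may expect a different
# encoding than a plain dict).
if __name__ == '__main__':
    sc = pyspark.SparkContext(appName='cleaning_workflow')
    output = run(
        sc,
        input_data='/data/training.csv',   # assumed input location
        features=['a', 'b'],               # assumed feature columns
        labels=['k'],                      # assumed label columns
        id=['id'],                         # a list; run() reads id_column[0]
        algo_params={'standardizer': True, 'k': 10,
                     'predictionCol': 'Prediction'})
    output.show()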
def test_arrange_output(self):
    # Preamble: set up data
    features = ['a', 'b']
    id = 'id'
    prediction = 'Prediction'

    # Test 1: Are all columns there?
    shows = ShowResults(id=id, list_features=features, list_labels=['k'],
                        predictionCol=prediction, k=10)
    d_point = 'dp'
    df = self._generate_data()
    arranged_df = shows.arrange_output(self.sc, df, data_point_name=d_point)
    expected_cols = [prediction, d_point, 'amount',
                     'percentage_outlier', 'amount_outlier', 'buckets']
    self.assertListEqual(sorted(expected_cols), sorted(arranged_df.columns))
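# Hypothetical fixture sketch -- the real _generate_data helper is not shown
# in this excerpt. It assumes the test case exposes a SparkSession as
# self.spark and that arrange_output consumes a clustered frame carrying the
# feature columns plus the 'Prediction' column; the exact schema the
# production code expects is a guess.
def _generate_data(self):
    rows = [(i, float(i), float(i % 3), i % 3) for i in range(9)]
    return self.spark.createDataFrame(rows, ['id', 'a', 'b', 'Prediction'])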