Code example #1
    def test_add_row_index(self):

        computed_dataframe = ShowResults._add_row_index(self.dataframe)
        self.assertIn(('rowId', 'bigint'), computed_dataframe.dtypes)

        computed_dataframe = ShowResults._add_row_index(self.dataframe, rowId='roow')
        self.assertIn(('roow', 'bigint'), computed_dataframe.dtypes)
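
The test only checks that the helper adds an index column of dtype 'bigint' and that the column name can be overridden. A minimal sketch of such a helper, assuming nothing about the real _add_row_index beyond what the assertions show (this is not the project's implementation):

from pyspark.sql import DataFrame
from pyspark.sql import functions as F

def add_row_index(dataframe: DataFrame, rowId: str = 'rowId') -> DataFrame:
    # monotonically_increasing_id() yields a LongType column, so the result
    # appears as (rowId, 'bigint') in dataframe.dtypes, as the test expects.
    return dataframe.withColumn(rowId, F.monotonically_increasing_id())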
Code example #2
    def test_add_outliers(self):
        computed_dataframe = ShowResults._add_distances(self.dataframe, point_col='point_col')
        computed_pdf = ShowResults._add_outliers(computed_dataframe).toPandas()

        # Pre-calculated outlier boundary for prediction 0: mean + 2*stddev
        actual_values = [False]*5 + [True] + [False]*4
        self.assertListEqual(list(computed_pdf['is_outlier']), actual_values)
        print('add_outliers \n', computed_pdf)
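
The comment suggests the outlier boundary is mean + 2*stddev of the distances inside each prediction group. A sketch of that rule; the column names 'distance' and 'prediction' are assumptions taken from the other tests, not the project's actual implementation:

from pyspark.sql import DataFrame, functions as F
from pyspark.sql.window import Window

def add_outliers(dataframe: DataFrame,
                 distance_col: str = 'distance',
                 prediction_col: str = 'prediction') -> DataFrame:
    # Per-cluster boundary: mean(distance) + 2 * stddev(distance).
    cluster = Window.partitionBy(prediction_col)
    boundary = (F.mean(distance_col).over(cluster)
                + 2 * F.stddev_pop(distance_col).over(cluster))
    # Flag rows whose distance exceeds the boundary of their own cluster.
    return dataframe.withColumn('is_outlier', F.col(distance_col) > boundary)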
Code example #3
    def test_cluster_graph(self):
        # not thoroughly tested yet

        table_df = ShowResults.prepare_table_data(self.dataframe, point_col='point_col').toPandas()
        grouped = table_df.groupby('prediction')
        for i in range(1, len(table_df.prediction.unique())+1):
            group_i = grouped.get_group(i)
            table_json = ShowResults.cluster_graph(group_i)
            print('cluster_graph \n', table_json)
Code example #4
File: run.py  Project: Visma-MLaaS/WorkflowCleaning
def run(sc: pyspark.SparkContext, **kwargs):

    from shared.parse_algorithm_variables import parse_algorithm_variables
    from cleaning.ExecuteCleaningWorkflow import ExecuteWorkflow
    from cleaning.ShowCleaning import ShowResults

    # Initialization phase v.1.0
    import_path = kwargs.get('input_data', None)
    feature_columns = kwargs.get('features', None)
    label_columns = kwargs.get('labels', None)
    id_column = kwargs.get('id', 'id')
    # header_columns = kwargs.get('headers', None)
    algorithm_params = parse_algorithm_variables(
        vars=kwargs.get('algo_params', None))
    standardizer = algorithm_params.get('standardizer', True)
    spark_session = pyspark.sql.SparkSession(sc)

    # label_schema = create_sub_schema(label_columns, type='label')
    # id_schema = create_sub_schema(id_column, type='id')
    # feature_schema = create_sub_schema(feature_columns, type='feature')
    # all_structs = list(filter(lambda x: x != None, id_schema+label_schema+feature_schema))
    # training_data_schema = T.StructType(all_structs)

    training_data_frame = spark_session.read.load(path=import_path,
                                                  format='csv',
                                                  inferSchema=True,
                                                  header=True).persist()
    header_columns = training_data_frame.columns
    # training_data_frame.show()
    cleaning_workflow = ExecuteWorkflow(dict_params=algorithm_params,
                                        cols_features=feature_columns,
                                        cols_labels=label_columns,
                                        standardize=standardizer)
    training_model = cleaning_workflow.execute_pipeline(
        data_frame=training_data_frame)
    clustered_data_frame = cleaning_workflow.apply_model(
        sc=sc, model=training_model, data_frame=training_data_frame)
    # clustered_data_frame.show()
    show_result = ShowResults(id=id_column[0],
                              list_features=feature_columns,
                              list_labels=label_columns,
                              list_headers=header_columns,
                              **algorithm_params)
    all_info_df = show_result.prepare_table_data(
        dataframe=clustered_data_frame, **algorithm_params)
    # all_info_df.show()
    d_point = 'data_points'

    output_df = show_result.arrange_output(sc=sc,
                                           dataframe=all_info_df,
                                           data_point_name=d_point,
                                           **algorithm_params)
    training_data_frame.unpersist()
    return output_df.sort('prediction')
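
A hypothetical way to call run(): the SparkContext name, file path and parameter values below are placeholders, id is passed as a list because run() later indexes it with id_column[0], and a plain dict is assumed for algo_params even though its real format depends on parse_algorithm_variables:

import pyspark

sc = pyspark.SparkContext(appName='WorkflowCleaning')
output_df = run(sc,
                input_data='/path/to/training_data.csv',
                features=['a', 'b'],
                labels=['k'],
                id=['id'],
                algo_params={'k': 10, 'standardizer': True})
output_df.show()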
Code example #5
    def test_arrange_output(self):
        # Preamble: setup data
        features = ['a', 'b']
        id = 'id'
        prediction = 'Prediction'
        # Test 1: Are all columns there?

        shows = ShowResults(id=id, list_features=features, list_labels=['k'], predictionCol=prediction, k=10)
        d_point = 'dp'
        df = self._generate_data()
        arranged_df = shows.arrange_output(self.sc, df, data_point_name=d_point)
        expected_cols = [prediction, d_point, 'amount', 'percentage_outlier', 'amount_outlier', 'buckets']
        self.assertListEqual(sorted(expected_cols), sorted(arranged_df.columns))
Code example #6
    def test_compute_summary(self):
        computed_dataframe = ShowResults._add_distances(self.dataframe, point_col='point_col')
        computed_df = ShowResults._add_outliers(computed_dataframe)
        summary_pdf = ShowResults.compute_summary(computed_df).toPandas()

        # expected counts per predictionCol value
        actual_count_prediction = [6, 3, 1]
        # expected outlier counts based on the distance column
        actual_count_outliers = [1, 0, 0]
        # expected percentage: actual_count_outliers / actual_count_prediction, rounded to a whole percent
        actual_count_percentage = list(map(float, ['%.f' % elem for elem in
                                                   [out/pre*100 for out, pre in
                                                    zip(actual_count_outliers, actual_count_prediction)]]))

        self.assertEqual(list(summary_pdf['count']), actual_count_prediction)
        self.assertEqual(list(summary_pdf['outlier_count']), actual_count_outliers)
        self.assertEqual(list(summary_pdf['outlier percentage']), actual_count_percentage)
        print('compute_summary \n', summary_pdf)
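
Given the columns the test reads back ('count', 'outlier_count', 'outlier percentage'), compute_summary plausibly reduces to a per-prediction aggregation. A sketch under that assumption, not the project's actual code:

from pyspark.sql import DataFrame, functions as F

def compute_summary(dataframe: DataFrame, prediction_col: str = 'prediction') -> DataFrame:
    # Count rows and outliers per cluster, then derive the outlier percentage.
    return (dataframe
            .groupBy(prediction_col)
            .agg(F.count('*').alias('count'),
                 F.sum(F.col('is_outlier').cast('int')).alias('outlier_count'))
            .withColumn('outlier percentage',
                        F.round(F.col('outlier_count') / F.col('count') * 100)))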
Code example #7
    def test_add_distances(self):
        from math import sqrt
        computed_dataframe = ShowResults._add_distances(self.dataframe, point_col='point_col')
        self.assertIn(('distance', 'double'), computed_dataframe.dtypes)

        p_computed_dataframe = computed_dataframe.toPandas()
        actual_distances = [sqrt(1.0), sqrt(1.0), sqrt(1.0), sqrt(1.0), sqrt(4.0),
                            sqrt(9.0+16.0), sqrt(1.0), sqrt(100.0), sqrt(4.0), sqrt(25.0)]
        for idx, val in enumerate(actual_distances):
            self.assertEqual(val, p_computed_dataframe['distance'][idx])
        print('add_distance \n', p_computed_dataframe)
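
The expected values are plain Euclidean distances, so _add_distances presumably measures each point against its cluster centre. A sketch with an assumed 'centers' column holding the centre vector (the real column name is not visible in these examples):

from math import sqrt
from pyspark.sql import DataFrame, functions as F, types as T

def add_distances(dataframe: DataFrame,
                  point_col: str = 'point_col',
                  center_col: str = 'centers') -> DataFrame:
    # Euclidean distance between a data point and its cluster centre;
    # DoubleType matches the ('distance', 'double') dtype asserted above.
    @F.udf(returnType=T.DoubleType())
    def euclidean(point, center):
        return float(sqrt(sum((p - c) ** 2 for p, c in zip(point, center))))

    return dataframe.withColumn('distance',
                                euclidean(F.col(point_col), F.col(center_col)))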
Code example #8
    def test_create_buckets(self):
        # Preamble: setup data
        df = self._generate_data()
        features = ['a', 'b']
        id = 'id'
        prediction = 'Prediction'

        shows = ShowResults(id=id, list_features=features, list_labels=['k'], predictionCol=prediction, k=10)
        buckets = shows.create_buckets(sc=self.sc, dataframe=df, buckets=20, prediction_col=prediction)

        print(buckets.rdd.take(1)[0]['buckets'])
        self.fail()  # no assertions yet; test marked as failing
Code example #9
    def test_prepare_table_data(self):
        # not thoroughly tested yet
        table_df = ShowResults.prepare_table_data(self.dataframe, point_col='point_col').toPandas()
        print('prepare_table_data \n', table_df)
Code example #10
    def test_create_linspace(self):  # create_linspace(data, min, max, buckets, boundary)
        data = ShowResults.prepare_table_data(self.dataframe, point_col='point_col')
        data.show()
Code example #11
    def test_json_histogram(self):
        # not thoroughly tested yet

        table_df = ShowResults.prepare_table_data(self.dataframe, point_col='point_col').toPandas()
        hist_json = ShowResults.json_histogram(table_df)
        print('json_hist \n', hist_json)