Code Example #1
File: run.py  Project: Visma-MLaaS/WorkflowCleaning
import pyspark


def run(sc: pyspark.SparkContext, **kwargs):

    from shared.parse_algorithm_variables import parse_algorithm_variables
    from cleaning.ExecuteCleaningWorkflow import ExecuteWorkflow
    from cleaning.ShowCleaning import ShowResults

    # Initialization phase v.1.0
    import_path = kwargs.get('input_data', None)
    feature_columns = kwargs.get('features', None)
    label_columns = kwargs.get('labels', None)
    id_column = kwargs.get('id', 'id')
    # header_columns = kwargs.get('headers', None)
    algorithm_params = parse_algorithm_variables(
        vars=kwargs.get('algo_params', None))
    standardizer = algorithm_params.get('standardizer', True)
    spark_session = pyspark.sql.SparkSession(sc)

    # label_schema = create_sub_schema(label_columns, type='label')
    # id_schema = create_sub_schema(id_column, type='id')
    # feature_schema = create_sub_schema(feature_columns, type='feature')
    # all_structs = list(filter(lambda x: x != None, id_schema+label_schema+feature_schema))
    # training_data_schema = T.StructType(all_structs)

    # Load the training data from CSV and cache it; column names are taken from the header.
    training_data_frame = spark_session.read.load(path=import_path,
                                                  format='csv',
                                                  inferSchema=True,
                                                  header=True).persist()
    header_columns = training_data_frame.columns
    # training_data_frame.show()
    # Configure the cleaning pipeline, fit it on the training data and apply the fitted model.
    cleaning_workflow = ExecuteWorkflow(dict_params=algorithm_params,
                                        cols_features=feature_columns,
                                        cols_labels=label_columns,
                                        standardize=standardizer)
    training_model = cleaning_workflow.execute_pipeline(
        data_frame=training_data_frame)
    clustered_data_frame = cleaning_workflow.apply_model(
        sc=sc, model=training_model, data_frame=training_data_frame)
    # clustered_data_frame.show()
    # Summarize the clustered data per cluster for display.
    show_result = ShowResults(id=id_column[0],
                              list_features=feature_columns,
                              list_labels=label_columns,
                              list_headers=header_columns,
                              **algorithm_params)
    all_info_df = show_result.prepare_table_data(
        dataframe=clustered_data_frame, **algorithm_params)
    # all_info_df.show()
    d_point = 'data_points'

    # Arrange the final output table, release the cached input and sort by cluster prediction.
    output_df = show_result.arrange_output(sc=sc,
                                           dataframe=all_info_df,
                                           data_point_name=d_point,
                                           **algorithm_params)
    training_data_frame.unpersist()
    return output_df.sort('prediction')
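
A minimal invocation sketch for the function above. The keyword names (input_data, features, labels, id, algo_params) mirror what run() reads from kwargs; the path, column names and parameter values are purely illustrative, and it is assumed here that parse_algorithm_variables accepts a plain dict of options.

# Hypothetical usage of run(); values are illustrative, not taken from the project.
import pyspark

from run import run  # the function defined above (file: run.py)

if __name__ == '__main__':
    sc = pyspark.SparkContext(master='local[*]', appName='workflow_cleaning_demo')
    output = run(
        sc,
        input_data='/tmp/training_data.csv',          # hypothetical CSV path
        features=['a', 'b'],                          # feature column names
        labels=['k'],                                 # label column names
        id=['id'],                                    # run() uses id_column[0], so pass a sequence
        algo_params={'k': 10, 'standardizer': True},  # assumed to be accepted as a dict
    )
    output.show()
    sc.stop()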
Code Example #2
    def test_arrange_output(self):
        # Preamble: setup data
        features = ['a', 'b']
        id = 'id'
        prediction = 'Prediction'
        # Test 1: Are all columns there?

        shows = ShowResults(id=id, list_features=features, list_labels=['k'],
                            predictionCol=prediction, k=10)
        d_point = 'dp'
        df = self._generate_data()
        arranged_df = shows.arrange_output(self.sc, df, data_point_name=d_point)
        expected_cols = [prediction, d_point, 'amount', 'percentage_outlier', 'amount_outlier', 'buckets']
        self.assertListEqual(sorted(expected_cols), sorted(arranged_df.columns))
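
The test method above assumes a fixture providing self.sc and _generate_data(). Below is a minimal sketch of such a fixture, under the assumption that a small DataFrame with the column names used above (id, a, b, k, Prediction) is sufficient; the project's real setUp and data generator may differ.

# Hypothetical test fixture; the project's actual setUp/_generate_data may differ.
import unittest

import pyspark


class TestShowResults(unittest.TestCase):
    def setUp(self):
        # Local SparkContext/SparkSession for running the test.
        self.sc = pyspark.SparkContext(master='local[2]', appName='test_show_results')
        self.spark = pyspark.sql.SparkSession(self.sc)

    def tearDown(self):
        self.sc.stop()

    def _generate_data(self):
        # Small DataFrame with an id, features 'a' and 'b', a label 'k'
        # and a 'Prediction' column, matching the names used in test_arrange_output.
        rows = [(i, float(i), float(2 * i), i % 2, i % 10) for i in range(20)]
        return self.spark.createDataFrame(
            rows, schema=['id', 'a', 'b', 'k', 'Prediction'])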