    def test_cluster_graph(self):
        # Smoke test only: prints the per-cluster JSON, no assertions yet.
        table_df = ShowResults.prepare_table_data(self.dataframe, point_col='point_col').toPandas()
        grouped = table_df.groupby('prediction')
        # Iterate over the actual cluster labels; range(1, n+1) would raise a
        # KeyError for zero-indexed predictions.
        for label in table_df['prediction'].unique():
            group = grouped.get_group(label)
            table_json = ShowResults.cluster_graph(group)
            print('cluster_graph\n', table_json)
Example #2
import pyspark


def run(sc: pyspark.SparkContext, **kwargs):

    from shared.parse_algorithm_variables import parse_algorithm_variables
    from cleaning.ExecuteCleaningWorkflow import ExecuteWorkflow
    from cleaning.ShowCleaning import ShowResults

    # Initialization phase v.1.0
    import_path = kwargs.get('input_data', None)
    feature_columns = kwargs.get('features', None)
    label_columns = kwargs.get('labels', None)
    id_column = kwargs.get('id', ['id'])  # expected to be a list of id column names (indexed below)
    # header_columns = kwargs.get('headers', None)
    algorithm_params = parse_algorithm_variables(
        vars=kwargs.get('algo_params', None))
    standardizer = algorithm_params.get('standardizer', True)
    spark_session = pyspark.sql.SparkSession(sc)

    # label_schema = create_sub_schema(label_columns, type='label')
    # id_schema = create_sub_schema(id_column, type='id')
    # feature_schema = create_sub_schema(feature_columns, type='feature')
    # all_structs = list(filter(lambda x: x != None, id_schema+label_schema+feature_schema))
    # training_data_schema = T.StructType(all_structs)

    training_data_frame = spark_session.read.load(path=import_path,
                                                  format='csv',
                                                  inferSchema=True,
                                                  header=True).persist()
    header_columns = training_data_frame.columns
    # training_data_frame.show()
    cleaning_workflow = ExecuteWorkflow(dict_params=algorithm_params,
                                        cols_features=feature_columns,
                                        cols_labels=label_columns,
                                        standardize=standardizer)
    training_model = cleaning_workflow.execute_pipeline(
        data_frame=training_data_frame)
    clustered_data_frame = cleaning_workflow.apply_model(
        sc=sc, model=training_model, data_frame=training_data_frame)
    # clustered_data_frame.show()
    show_result = ShowResults(id=id_column[0],
                              list_features=feature_columns,
                              list_labels=label_columns,
                              list_headers=header_columns,
                              **algorithm_params)
    all_info_df = show_result.prepare_table_data(
        dataframe=clustered_data_frame, **algorithm_params)
    # all_info_df.show()
    d_point = 'data_points'

    output_df = show_result.arrange_output(sc=sc,
                                           dataframe=all_info_df,
                                           data_point_name=d_point,
                                           **algorithm_params)
    training_data_frame.unpersist()
    return output_df.sort('prediction')
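
A minimal invocation sketch, not taken from the original source: the keyword names mirror the kwargs.get(...) calls inside run, while the CSV path, feature columns, and label column are hypothetical placeholders.

# Hypothetical usage of run(); only the keyword names come from the
# kwargs.get(...) calls above, everything else is placeholder data.
sc = pyspark.SparkContext.getOrCreate()
output = run(sc,
             input_data='/tmp/training_data.csv',  # hypothetical CSV path
             features=['a', 'b'],                  # hypothetical feature columns
             labels=['k'],                         # hypothetical label column
             id=['id'],                            # list of id columns, per the indexing above
             algo_params={'standardizer': True})
output.show()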
    def test_create_buckets(self):
        # Preamble: set up data
        df = self._generate_data()
        features = ['a', 'b']
        id = 'id'
        prediction = 'Prediction'

        # create_buckets is an instance method, so construct a ShowResults
        # first (prepare_table_data returns a DataFrame, which has no
        # create_buckets attribute).
        shows = ShowResults(id=id, list_features=features, list_labels=['k'], predictionCol=prediction, k=10)
        buckets = shows.create_buckets(sc=self.sc, dataframe=df, buckets=20, prediction_col=prediction)

        print(buckets.rdd.take(1)[0]['buckets'])
        self.fail()  # placeholder until real assertions are added
    def test_prepare_table_data(self):
        # Smoke test only: prints the table, no assertions yet.
        table_df = ShowResults.prepare_table_data(self.dataframe, point_col='point_col').toPandas()
        print('prepare_table_data\n', table_df)
    def test_create_linspace(self):  # signature under test: data, min, max, buckets, boundary
        data = ShowResults.prepare_table_data(self.dataframe, point_col='point_col')
        data.show()
    def test_json_histogram(self):
        # Smoke test only: prints the histogram JSON, no assertions yet.
        table_df = ShowResults.prepare_table_data(self.dataframe, point_col='point_col').toPandas()
        hist_json = ShowResults.json_histogram(table_df)
        print('json_histogram\n', hist_json)
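
These tests reference self.sc and self.dataframe without showing the fixture. A hypothetical setUp sketch, assuming the frame only needs the two columns the tests actually touch ('prediction' and 'point_col'):

    # Hypothetical fixture, not from the original source: it provides only the
    # attributes and columns the tests above reference; the real suite may
    # build a richer frame via self._generate_data().
    def setUp(self):
        self.spark = pyspark.sql.SparkSession.builder.master('local[2]').getOrCreate()
        self.sc = self.spark.sparkContext
        self.dataframe = self.spark.createDataFrame(
            [(0, [0.1, 0.2]), (0, [0.3, 0.4]), (1, [5.0, 5.1])],
            schema=['prediction', 'point_col'])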