def setUp(self):
    """Prepare the fixtures used by the post-processing tests.

    Obtains a Spark session from ``spark_init()``, loads the two pickled
    fixture datasets and converts them to DataFrames, runs
    ``market_basket_sql`` on the main dataset, and builds the
    ``CVMPostProcessing`` instance under test.

    Attributes set on ``self``:
        start_date, period, env: fixed scenario parameters.
        df, sales: DataFrames loaded from the pickled fixture files.
        total_basket_count, coupon_views, matrix: the three values
            returned by ``market_basket_sql(self.df)``.
        cvm_post: ``CVMPostProcessing`` built from the values above.
    """
    # spark_init() is called unconditionally.  A "'spark' not in locals()"
    # guard is always true inside a method (locals() holds only `self` at
    # this point), so it can never detect an already-running session and
    # would raise UnboundLocalError on the next line if it ever were false.
    print('Environment: Databricks-Connect')
    # spark_init() yields three values; only the session is needed here.
    spark, _sql_context, _setting = spark_init()
    sc = spark.sparkContext

    # Fixed scenario parameters.
    self.start_date = '20190710'
    self.period = -1
    self.env = 'TST'

    # Fixture data stored as pickled RDD files, converted to DataFrames.
    self.df = sc.pickleFile('post_processing_test_data_df.pkl').toDF()
    self.sales = sc.pickleFile(
        'post_processing_test_data_sales.pkl').toDF()

    # Derive the market-basket inputs from the main fixture dataset.
    self.total_basket_count, self.coupon_views, self.matrix = (
        market_basket_sql(self.df)
    )

    # Object under test.
    self.cvm_post = CVMPostProcessing(
        sales=self.sales,
        matrix=self.matrix,
        data=self.df,
        coupon_views=self.coupon_views,
        division='LSG',
    )
import unittest from unittest import TestCase from utility_functions.stats_functions import permute_columns from utility_functions.databricks_uf import has_column from connect2Databricks.spark_init import spark_init if 'spark' not in locals(): spark, sqlContext, setting = spark_init() sc = spark.sparkContext class TestPermuteColumns(TestCase): def test_permute_columns(self): data = spark.createDataFrame([(1, 'a', 'a'), (2, 'b', 'b'), (3, 'c', 'c'), (4, 'd', 'd'), (5, 'e', 'e')], ['id', 'col1', 'col2']) permuted_data = permute_columns(data, columns_to_permute=['col1', 'col2'], column_to_order='id', ind_permute=False) permuted_data.show() self.assertTrue(has_column(permuted_data, 'rand_id')) self.assertTrue(has_column(permuted_data, 'rand_col1')) self.assertTrue(has_column(permuted_data, 'rand_col2')) self.assertEqual( permuted_data.select('rand_col1').collect(), permuted_data.select('rand_col2').collect()) self.assertNotEqual( permuted_data.select('col1').collect(),
from pyspark.sql.functions import rand from pyspark.sql.functions import split, explode, col, ltrim, rtrim, coalesce, countDistinct, broadcast from pyspark.sql.window import Window import pyspark.sql.functions as func from utility_functions.databricks_uf import timer from connect2Databricks.spark_init import spark_init if 'spark' not in locals(): print('Environment: Databricks-Connect') spark, sqlContext, _ = spark_init() sc = spark.sparkContext @timer def permute_columns(df, column_to_order: str, ind_permute: bool = False, columns_to_permute: list = []): """ Author: Allison Wu Description: This function permutes the columns specified in columns_to_permute :param df: :param column_to_order: :param ind_permute: :param columns_to_permute: :return: permuted_df """ window = Window.partitionBy().orderBy(col(column_to_order)) window_rand = Window.partitionBy().orderBy(rand()) df = df. \