Example #1
0
    def setUp(self):
        """Build the shared fixtures: a Spark session, pickled test
        DataFrames, the market-basket outputs derived from them, and a
        ``CVMPostProcessing`` instance under test.
        """
        # NOTE(review): inside a function body, `spark` is a local variable
        # (it is assigned below), so it can never be in locals() at this
        # point — this branch ALWAYS runs here. The check only makes sense
        # at module level (where it skips re-init in a Databricks notebook);
        # it looks copied from that context. Verify intent.
        if 'spark' not in locals():
            print('Environment: Databricks-Connect')
            spark, sqlContext, setting = spark_init()

        # SparkContext used to deserialize the pickled RDD fixtures below.
        sc = spark.sparkContext

        self.start_date = '20190710'  # fixture run date, 'YYYYMMDD'
        self.period = -1              # presumably a look-back period — TODO confirm semantics
        self.env = 'TST'              # environment tag passed through post-processing
        # Pickled RDD fixtures converted back to DataFrames.
        self.df = sc.pickleFile('post_processing_test_data_df.pkl').toDF()
        self.sales = sc.pickleFile(
            'post_processing_test_data_sales.pkl').toDF()
        # Derive basket count / coupon views / association matrix from the
        # transaction fixture; these feed the object under test.
        self.total_basket_count, self.coupon_views, self.matrix = market_basket_sql(
            self.df)

        self.cvm_post = CVMPostProcessing(sales=self.sales,
                                          matrix=self.matrix,
                                          data=self.df,
                                          coupon_views=self.coupon_views,
                                          division='LSG')
import unittest
from unittest import TestCase
from utility_functions.stats_functions import permute_columns
from utility_functions.databricks_uf import has_column
from connect2Databricks.spark_init import spark_init
# At module level locals() is globals(), so this correctly skips
# re-initialization when the hosting environment (e.g. a Databricks
# notebook) already injects a `spark` session; otherwise build one
# via databricks-connect.
if 'spark' not in locals():
    spark, sqlContext, setting = spark_init()

# Module-level SparkContext shared by the tests below.
sc = spark.sparkContext


class TestPermuteColumns(TestCase):
    """Tests for ``permute_columns`` from utility_functions.stats_functions."""

    def test_permute_columns(self):
        """Joint permutation (ind_permute=False): rand_* columns are added,
        and col1/col2 — permuted with the same random order — stay aligned
        with each other row-for-row.
        """
        data = spark.createDataFrame([(1, 'a', 'a'), (2, 'b', 'b'),
                                      (3, 'c', 'c'), (4, 'd', 'd'),
                                      (5, 'e', 'e')], ['id', 'col1', 'col2'])
        permuted_data = permute_columns(data,
                                        columns_to_permute=['col1', 'col2'],
                                        column_to_order='id',
                                        ind_permute=False)

        permuted_data.show()

        # The function is expected to emit 'rand_'-prefixed copies of the
        # ordering column and of each permuted column.
        self.assertTrue(has_column(permuted_data, 'rand_id'))
        self.assertTrue(has_column(permuted_data, 'rand_col1'))
        self.assertTrue(has_column(permuted_data, 'rand_col2'))
        # Same random order applied to both columns, and the fixture has
        # identical values in col1/col2 per row — so the permuted columns
        # must still match row-for-row.
        self.assertEqual(
            permuted_data.select('rand_col1').collect(),
            permuted_data.select('rand_col2').collect())
        # NOTE(review): this assertion is truncated in this chunk of the
        # file — the second argument and closing paren are missing. Recover
        # the tail from the original source before relying on this test.
        self.assertNotEqual(
            permuted_data.select('col1').collect(),
from pyspark.sql.functions import rand
from pyspark.sql.functions import split, explode, col, ltrim, rtrim, coalesce, countDistinct, broadcast
from pyspark.sql.window import Window
import pyspark.sql.functions as func
from utility_functions.databricks_uf import timer
from connect2Databricks.spark_init import spark_init
# At module level locals() is globals(), so this skips re-initialization
# when the hosting environment (e.g. a Databricks notebook) already
# provides `spark`; otherwise initialize via databricks-connect.
if 'spark' not in locals():
    print('Environment: Databricks-Connect')
    spark, sqlContext, _ = spark_init()

# Module-level SparkContext for the helpers below.
sc = spark.sparkContext


@timer
def permute_columns(df,
                    column_to_order: str,
                    ind_permute: bool = False,
                    columns_to_permute: list = []):
    """
    Author: Allison Wu
    Description: This function permutes the columns specified in columns_to_permute
    :param df: input Spark DataFrame
    :param column_to_order: column whose values define the original row order
    :param ind_permute: presumably True permutes each column independently
        while False applies one shared random order — TODO confirm against
        the full (not visible here) implementation
    :param columns_to_permute: names of the columns to permute.
        NOTE(review): mutable default argument (list) — shared across calls;
        a None sentinel would be safer. Harmless only if never mutated.
    :return: permuted_df
    """
    # Two global (unpartitioned) windows: one in the original key order,
    # one in random order. NOTE(review): Window.partitionBy() with no keys
    # pulls all rows into a single partition — fine for small test data,
    # costly at scale.
    window = Window.partitionBy().orderBy(col(column_to_order))
    window_rand = Window.partitionBy().orderBy(rand())

    # NOTE(review): definition is truncated in this chunk — the chained
    # transformation starting here continues beyond the visible lines.
    df = df. \