def Run(self):
    """Run the chi-square tests and build narratives when results exist.

    ``ChiSquare.test_all`` is invoked with the context's result column as
    the only dimension column. If the returned object's ``get_result()``
    differs from an empty dict, a ``ChiSquareNarratives`` instance is built
    and passed through ``CommonUtils.as_dict``. Nothing is returned.
    """
    tester = ChiSquare(self._data_frame, self._dataframe_helper,
                       self._dataframe_context, self._metaParser)
    target_columns = (self._dataframe_context.get_result_column(), )
    df_chisquare_obj = tester.test_all(dimension_columns=target_columns)

    # Narratives are only generated when the tests produced something.
    if df_chisquare_obj.get_result() != {}:
        narratives = ChiSquareNarratives(
            self._dataframe_helper, df_chisquare_obj, self._spark,
            self._dataframe_context, self._data_frame,
            self._story_narrative, self._result_setter)
        # The converted value is not used further in this method.
        CommonUtils.as_dict(narratives)
Example #2
0
class TestChiSquare(unittest.TestCase):
    """Tests for ``ChiSquare`` and ``ChiSquareAnalysis`` on ``Price_Range``.

    Every test relies on the fixtures built in ``setUp`` (a loaded data
    frame, its helper, the dataframe context and the result of
    ``ChiSquare.test_all``). Expected numbers are read from the names
    ``exp_values`` and ``exp_data_dict``, which are not defined in this
    class and must be available at module level.
    """

    # Keep unittest's own "x != y" text alongside the custom ``msg``.
    longMessage = True

    # (key in ``exp_values``, getter name on a chi-square result), in the
    # order the metrics are verified.
    _METRICS = (
        ('pval', 'get_pvalue'),
        ('effect_size', 'get_effect_size'),
        ('stats', 'get_stat'),
        ('v_value', 'get_v_value'),
    )

    # (column passed to ``get_chisquare_result``, key in ``exp_values[metric]``).
    # The keys are spelled out because the Buyer_Gender key uses a hyphen
    # ('Buyer-Gender') rather than the column's underscore.
    _ALL_PAIRS = (
        ('Deal_Type', 'Price_Range-Deal_Type'),
        ('Discount_Range', 'Price_Range-Discount_Range'),
        ('Source', 'Price_Range-Source'),
        ('Platform', 'Price_Range-Platform'),
        ('Buyer_Age', 'Price_Range-Buyer_Age'),
        ('Buyer_Gender', 'Price_Range-Buyer-Gender'),
        ('Tenure_in_Days', 'Price_Range-Tenure_in_Days'),
        ('Sales', 'Price_Range-Sales'),
        ('Marketing_Cost', 'Price_Range-Marketing_Cost'),
        ('Shipping_Cost', 'Price_Range-Shipping_Cost'),
        ('Last_Transaction', 'Price_Range-Last_Transaction'),
    )

    def setUp(self):
        """Create the Spark session, context and chi-square results.

        Raises:
            KeyError: if a mandatory entry is missing from the test
                configuration. ``error_reporting_url`` and ``app_id`` are
                optional and default to ``None``.
        """
        APP_NAME = "test"
        spark = CommonUtils.get_spark_session(app_name=APP_NAME,
                                              hive_environment=False)
        spark.sparkContext.setLogLevel("ERROR")

        configJson = get_test_configs("testCase", testFor="chisquare")

        config = configJson["config"]
        jobConfig = configJson["job_config"]
        jobType = jobConfig["job_type"]
        jobName = jobConfig["job_name"]
        jobURL = jobConfig["job_url"]
        messageURL = jobConfig["message_url"]
        # Optional settings: absent keys simply become None.
        errorURL = jobConfig.get("error_reporting_url")
        appid = jobConfig.get("app_id")

        configJsonObj = configparser.ParserConfig(config)
        configJsonObj.set_json_params()

        dataframe_context = ContextSetter(configJsonObj)
        # jobType should be set before set_params call of dataframe_context
        dataframe_context.set_job_type(jobType)
        dataframe_context.set_params()
        dataframe_context.set_message_url(messageURL)
        dataframe_context.set_app_id(appid)
        dataframe_context.set_debug_mode(True)
        dataframe_context.set_job_url(jobURL)
        dataframe_context.set_app_name(APP_NAME)
        dataframe_context.set_error_url(errorURL)
        dataframe_context.set_logger({})
        dataframe_context.set_xml_url(jobConfig["xml_url"])
        dataframe_context.set_job_name(jobName)
        dataframe_context.set_environment("debugMode")
        dataframe_context.set_message_ignore(True)
        dataframe_context.set_analysis_name("Descriptive analysis")

        df = MasterHelper.load_dataset(spark, dataframe_context)
        metaParserInstance = MasterHelper.get_metadata(df, spark,
                                                       dataframe_context, None)
        df, df_helper = MasterHelper.set_dataframe_helper(
            df, dataframe_context, metaParserInstance)
        targetVal = dataframe_context.get_result_column()

        self.result_setter = ResultSetter(dataframe_context)
        self.story_narrative = NarrativesTree()
        self.story_narrative.set_name(
            "{} Performance Report".format(targetVal))
        self.data_frame = df
        self.df_helper = df_helper
        self.df_context = dataframe_context
        self.meta_parser = metaParserInstance
        self.base_dir = "/chisquare/"
        self.significant_variables = [
            'Buyer_Gender', 'Sales', 'Discount_Range', 'Shipping_Cost',
            'Last_Transaction', 'Marketing_Cost'
        ]
        self.measure_columns = [
            'Tenure_in_Days', 'Sales', 'Marketing_Cost', 'Shipping_Cost',
            'Last_Transaction'
        ]
        self.df_chisquare_obj = ChiSquare(
            self.data_frame, self.df_helper, self.df_context,
            self.meta_parser).test_all(
                dimension_columns=(self.df_context.get_result_column(), ))
        self.df_chisquare_result = self.df_chisquare_obj.get_result()
        self.num_analysed_variables = 11

    def _assert_metric(self, result, metric, getter, expected_key,
                       places=None):
        """Check one metric of ``result`` against ``exp_values``.

        Args:
            result: object exposing the getter named by ``getter``.
            metric: top-level key in ``exp_values``.
            getter: name of the zero-argument method to call on ``result``.
            expected_key: key inside ``exp_values[metric]``.
            places: decimal places for ``assertAlmostEqual``; ``None``
                keeps unittest's default.
        """
        self.assertAlmostEqual(
            getattr(result, getter)(),
            exp_values[metric][expected_key],
            places=places,
            msg='{} mismatch for {}'.format(metric, expected_key))

    def _assert_all_metrics(self, result, expected_key, places=None):
        """Check every metric in ``_METRICS`` for a single result."""
        for metric, getter in self._METRICS:
            self._assert_metric(result, metric, getter, expected_key,
                                places=places)

    def test_chisquare_dimension(self):
        """``test_dimension`` on Price_Range vs Source matches to 5 places."""
        test_dimension = ChiSquare(
            self.data_frame, self.df_helper, self.df_context,
            self.meta_parser).test_dimension('Price_Range', 'Source')
        self._assert_all_metrics(test_dimension, 'Price_Range-Source',
                                 places=5)

    def test_chisquare_measure(self):
        """``test_measures`` on Price_Range vs Marketing_Cost matches to 5 places."""
        test_measures = ChiSquare(
            self.data_frame, self.df_helper, self.df_context,
            self.meta_parser).test_measures('Price_Range', 'Marketing_Cost')
        self._assert_all_metrics(test_measures, 'Price_Range-Marketing_Cost',
                                 places=5)

    def test_chisquare_all(self):
        """Every metric of every ``test_all`` pair matches ``exp_values``.

        Metrics are checked in ``_METRICS`` order and, within each metric,
        columns in ``_ALL_PAIRS`` order, using unittest's default precision.
        """
        for metric, getter in self._METRICS:
            for column, expected_key in self._ALL_PAIRS:
                result = self.df_chisquare_obj.get_chisquare_result(
                    'Price_Range', column)
                self._assert_metric(result, metric, getter, expected_key)

    def test_chisquare_analysis(self):
        """Narratives for Price_Range vs Buyer_Gender match ``exp_data_dict``."""
        target_chisquare_result = self.df_chisquare_result['Price_Range']
        chisquare_result = self.df_chisquare_obj.get_chisquare_result(
            'Price_Range', 'Buyer_Gender')
        out = ChiSquareAnalysis(
            self.df_context, self.df_helper, chisquare_result, 'Price_Range',
            'Buyer_Gender', self.significant_variables,
            self.num_analysed_variables, self.data_frame, self.measure_columns,
            self.base_dir, None,
            target_chisquare_result)._generate_narratives()

        self.assertEqual(out['data_dict'], exp_data_dict)
        # TODO(review): only the presence of each key is verified here; no
        # expected fixture for target_dict is visible from this class, so
        # compare the values once such a fixture exists.
        for level in ('11 to 50', '101 to 500', '0 to 10'):
            self.assertIn(level, out['target_dict'])