def Run(self):
    """Run the chi-square tests for the result column and build narratives.

    Calls ``ChiSquare(...).test_all`` with the context's result column as the
    only dimension column. When ``get_result()`` of the returned object is
    not equal to an empty dict, a ``ChiSquareNarratives`` object is built
    from it and passed through ``CommonUtils.as_dict``.

    Returns:
        None.
    """
    chisquare_runner = ChiSquare(
        self._data_frame,
        self._dataframe_helper,
        self._dataframe_context,
        self._metaParser)
    df_chisquare_obj = chisquare_runner.test_all(
        dimension_columns=(self._dataframe_context.get_result_column(), ))

    # Compared with {} explicitly (rather than by truthiness) so that any
    # result other than an empty dict still triggers narrative generation.
    if df_chisquare_obj.get_result() != {}:
        # NOTE(review): the converted dict is not used by this method; the
        # call is kept because constructing the narratives presumably
        # populates the story narrative / result setter -- confirm.
        CommonUtils.as_dict(
            ChiSquareNarratives(
                self._dataframe_helper,
                df_chisquare_obj,
                self._spark,
                self._dataframe_context,
                self._data_frame,
                self._story_narrative,
                self._result_setter))
class TestChiSquare(unittest.TestCase):
    """Regression tests for ``ChiSquare`` and ``ChiSquareAnalysis``.

    ``setUp`` loads the dataset described by the ``chisquare`` test
    configuration and runs ``ChiSquare.test_all`` on the configured result
    column. The tests then compare the computed metrics, looked up under the
    ``'Price_Range'`` target, with the module-level expectations
    ``exp_values`` and ``exp_data_dict``.
    """

    # Append the custom ``msg`` to unittest's standard failure text instead of
    # replacing it, so the differing values stay visible on failure.
    longMessage = True

    # (key in ``exp_values``, getter name on a chi-square result), listed in
    # the order in which the metrics are asserted.
    _METRICS = (
        ('pval', 'get_pvalue'),
        ('effect_size', 'get_effect_size'),
        ('stats', 'get_stat'),
        ('v_value', 'get_v_value'),
    )

    # Columns checked by test_chisquare_all, in assertion order.
    _ANALYSED_COLUMNS = (
        'Deal_Type',
        'Discount_Range',
        'Source',
        'Platform',
        'Buyer_Age',
        'Buyer_Gender',
        'Tenure_in_Days',
        'Sales',
        'Marketing_Cost',
        'Shipping_Cost',
        'Last_Transaction',
    )

    # Expected-value keys that do not follow the 'Price_Range-<column>'
    # pattern.
    # NOTE(review): the Buyer_Gender expectations are looked up with a hyphen
    # ('Buyer-Gender'); this looks like a typo in the fixture keys but is kept
    # because the fixture is not visible here -- confirm.
    _EXPECTED_KEY_OVERRIDES = {'Buyer_Gender': 'Price_Range-Buyer-Gender'}

    def setUp(self):
        """Build the dataframe context, load the data and run all tests once."""
        app_name = "test"
        spark = CommonUtils.get_spark_session(app_name=app_name,
                                              hive_environment=False)
        spark.sparkContext.setLogLevel("ERROR")

        config_json = get_test_configs("testCase", testFor="chisquare")
        config = config_json["config"]
        job_config = config_json["job_config"]
        job_type = job_config["job_type"]
        job_name = job_config["job_name"]
        job_url = job_config["job_url"]
        message_url = job_config["message_url"]
        # The error-reporting URL and the app id are optional settings.
        try:
            error_url = job_config["error_reporting_url"]
        except KeyError:
            error_url = None
        app_id = job_config["app_id"] if "app_id" in job_config else None

        parsed_config = configparser.ParserConfig(config)
        parsed_config.set_json_params()

        dataframe_context = ContextSetter(parsed_config)
        # The job type must be set before set_params() is called.
        dataframe_context.set_job_type(job_type)
        dataframe_context.set_params()
        dataframe_context.set_message_url(message_url)
        dataframe_context.set_app_id(app_id)
        dataframe_context.set_debug_mode(True)
        dataframe_context.set_job_url(job_url)
        dataframe_context.set_app_name(app_name)
        dataframe_context.set_error_url(error_url)
        dataframe_context.set_logger({})
        dataframe_context.set_xml_url(job_config["xml_url"])
        dataframe_context.set_job_name(job_name)
        dataframe_context.set_environment("debugMode")
        dataframe_context.set_message_ignore(True)
        dataframe_context.set_analysis_name("Descriptive analysis")

        df = MasterHelper.load_dataset(spark, dataframe_context)
        meta_parser = MasterHelper.get_metadata(df, spark, dataframe_context,
                                                None)
        df, df_helper = MasterHelper.set_dataframe_helper(
            df, dataframe_context, meta_parser)
        target_column = dataframe_context.get_result_column()

        self.result_setter = ResultSetter(dataframe_context)
        self.story_narrative = NarrativesTree()
        self.story_narrative.set_name(
            "{} Performance Report".format(target_column))
        self.data_frame = df
        self.df_helper = df_helper
        self.df_context = dataframe_context
        self.meta_parser = meta_parser
        self.base_dir = "/chisquare/"
        # Fixed inputs handed to ChiSquareAnalysis in test_chisquare_analysis.
        self.significant_variables = [
            'Buyer_Gender', 'Sales', 'Discount_Range', 'Shipping_Cost',
            'Last_Transaction', 'Marketing_Cost'
        ]
        self.measure_columns = [
            'Tenure_in_Days', 'Sales', 'Marketing_Cost', 'Shipping_Cost',
            'Last_Transaction'
        ]
        self.df_chisquare_obj = self._new_chisquare().test_all(
            dimension_columns=(self.df_context.get_result_column(), ))
        self.df_chisquare_result = self.df_chisquare_obj.get_result()
        self.num_analysed_variables = 11

    def _new_chisquare(self):
        """Return a ``ChiSquare`` built from the objects prepared in setUp."""
        return ChiSquare(self.data_frame, self.df_helper, self.df_context,
                         self.meta_parser)

    def _assert_metric(self, result, metric, getter_name, expected_key,
                       places=None):
        """Assert that one metric of *result* matches ``exp_values``.

        Args:
            result: object exposing the getter named *getter_name*.
            metric: first-level key in ``exp_values``.
            getter_name: name of the zero-argument getter to call on *result*.
            expected_key: second-level key in ``exp_values``.
            places: decimal places forwarded to ``assertAlmostEqual``;
                ``None`` keeps unittest's default precision.
        """
        self.assertAlmostEqual(
            getattr(result, getter_name)(),
            exp_values[metric][expected_key],
            places=places,
            msg='{}[{!r}]'.format(metric, expected_key))

    def test_chisquare_dimension(self):
        """Check ``test_dimension`` for Price_Range against Source."""
        result = self._new_chisquare().test_dimension('Price_Range', 'Source')
        for metric, getter_name in self._METRICS:
            self._assert_metric(result, metric, getter_name,
                                'Price_Range-Source', places=5)

    def test_chisquare_measure(self):
        """Check ``test_measures`` for Price_Range against Marketing_Cost."""
        result = self._new_chisquare().test_measures('Price_Range',
                                                     'Marketing_Cost')
        for metric, getter_name in self._METRICS:
            self._assert_metric(result, metric, getter_name,
                                'Price_Range-Marketing_Cost', places=5)

    def test_chisquare_all(self):
        """Check every metric of every analysed column from ``test_all``."""
        # Metric-major order: all p-values first, then effect sizes, then
        # statistics, then v-values.
        for metric, getter_name in self._METRICS:
            for column in self._ANALYSED_COLUMNS:
                result = self.df_chisquare_obj.get_chisquare_result(
                    'Price_Range', column)
                expected_key = self._EXPECTED_KEY_OVERRIDES.get(
                    column, 'Price_Range-' + column)
                self._assert_metric(result, metric, getter_name, expected_key)

    def test_chisquare_analysis(self):
        """Check the narrative data generated for Price_Range / Buyer_Gender."""
        target_chisquare_result = self.df_chisquare_result['Price_Range']
        chisquare_result = self.df_chisquare_obj.get_chisquare_result(
            'Price_Range', 'Buyer_Gender')
        out = ChiSquareAnalysis(
            self.df_context, self.df_helper, chisquare_result, 'Price_Range',
            'Buyer_Gender', self.significant_variables,
            self.num_analysed_variables, self.data_frame,
            self.measure_columns, self.base_dir, None,
            target_chisquare_result)._generate_narratives()
        self.assertEqual(out['data_dict'], exp_data_dict)
        # NOTE(review): these entries were previously compared with
        # themselves, which can only detect a missing key. No expected
        # fixture for 'target_dict' is visible here, so only the presence of
        # each level is asserted; compare against expected values once a
        # fixture exists.
        for level in ('11 to 50', '101 to 500', '0 to 10'):
            self.assertIn(level, out['target_dict'])