class RegressionNarrative(object):
    """Builds the regression "Key Influencers" story section.

    On construction it immediately: reads helper/context state, ranks regression
    coefficients, posts start/end progress messages, runs generate_narratives(),
    and registers the resulting NarrativesTree node ("Influencers") on the
    result setter and story narrative.  There is no further public API — all
    work happens in __init__.
    """

    def __init__(self, df_helper, df_context, result_setter, spark, df_regression_result, correlations, story_narrative, meta_parser):
        self._metaParser = meta_parser
        self._result_setter = result_setter
        self._story_narrative = story_narrative
        self._df_regression_result = df_regression_result
        self._correlations = correlations
        self._dataframe_helper = df_helper
        self._dataframe_context = df_context
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        # self._result_setter.set_trend_section_name("regression")
        self._measure_columns = self._dataframe_helper.get_numeric_columns()
        self._dimension_columns = self._dataframe_helper.get_string_columns()
        self._date_columns = self._dataframe_context.get_date_columns()
        self._uid_col = self._dataframe_context.get_uid_column()
        # Drop the uid column from the dimension list when the meta parser
        # marks it as an ignored suggestion; likewise drop any date columns.
        if self._metaParser.check_column_isin_ignored_suggestion(self._uid_col):
            self._dimension_columns = list(set(self._dimension_columns) - {self._uid_col})
        if len(self._date_columns) > 0:
            self._dimension_columns = list(set(self._dimension_columns)-set(self._date_columns))
        self._spark = spark
        self.measures = []
        self.result_column = self._dataframe_helper.resultcolumn
        # Rank all regression coefficients by absolute magnitude (largest first);
        # measures with p-value <= 0.05 are treated as significant.
        self.all_coefficients = self._df_regression_result.get_all_coeff()
        all_coeff = [(x,self.all_coefficients[x]) for x in list(self.all_coefficients.keys())]
        all_coeff = sorted(all_coeff,key = lambda x:abs(x[1]["coefficient"]),reverse = True)
        self._all_coeffs = all_coeff
        self.significant_measures = [x[0] for x in all_coeff if x[1]['p_value']<=0.05]
        print(self.significant_measures)
        print("regression narratives started")
        # NOTE(review): heading concatenates without a separator, producing e.g.
        # "SalesPerformance Report" — looks like a missing space; confirm intent.
        self.narratives = {"heading": self.result_column + "Performance Report",
                           "main_card":{},
                           "cards":[]
                           }
        self._base_dir = "/regression/"
        # Dimension-level regression is disabled here; generate_narratives and
        # run_regression_for_dimension_levels honour this flag.
        self._run_dimension_level_regression = False
        # self._dim_regression = self.run_regression_for_dimension_levels()
        self._regressionNode = NarrativesTree()
        self._completionStatus = self._dataframe_context.get_completion_status()
        self._analysisName = self._dataframe_context.get_analysis_name()
        self._messageURL = self._dataframe_context.get_message_url()
        self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight()
        self._scriptStages = {
            "regressionNarrativeStart":{
                "summary":"Started The Regression Narratives",
                "weight":1
                },
            "regressionNarrativeEnd":{
                "summary":"Narratives For Regression Finished",
                "weight":0
                },
            }
        # Post the "started" progress message and sync completion status.
        self._completionStatus += old_div(self._scriptWeightDict[self._analysisName]["narratives"]*self._scriptStages["regressionNarrativeStart"]["weight"],10)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "regressionNarrativeStart",\
                                    "info",\
                                    self._scriptStages["regressionNarrativeStart"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,progressMessage)
        self._dataframe_context.update_completion_status(self._completionStatus)
        # Do all the narrative work, then publish the node.
        self.generate_narratives()
        self._regressionNode.set_name("Influencers")
        self._result_setter.set_regression_node(self._regressionNode)
        # Post the "finished" progress message (weight 0, so status is unchanged).
        self._completionStatus += old_div(self._scriptWeightDict[self._analysisName]["narratives"]*self._scriptStages["regressionNarrativeEnd"]["weight"],10)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "regressionNarrativeEnd",\
                                    "info",\
                                    self._scriptStages["regressionNarrativeEnd"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,progressMessage)
        self._dataframe_context.update_completion_status(self._completionStatus)

    def generate_narratives(self):
        """Render the main "Key Measures" card plus one card per significant measure.

        Populates self.narratives (legacy dict form), builds NormalCard/ChartJson
        objects, and attaches them to self._regressionNode; finally adds the node
        to self._story_narrative.  No return value.
        """
        regression_narrative_obj = LinearRegressionNarrative(
                                    self._df_regression_result,
                                    self._correlations,
                                    self._dataframe_helper,
                                    self._dataframe_context,
                                    self._metaParser,
                                    self._spark
                                    )
        # ---- main card: template narrative + coefficient bar chart ----
        main_card_data = regression_narrative_obj.generate_main_card_data()
        main_card_narrative = NarrativesUtils.get_template_output(self._base_dir,\
                                                        'regression_main_card.html',main_card_data)
        self.narratives['main_card'] = {}
        self.narratives["main_card"]['paragraphs'] = NarrativesUtils.paragraph_splitter(main_card_narrative)
        self.narratives["main_card"]['header'] = 'Key Measures that affect ' + self.result_column
        self.narratives["main_card"]['chart'] = {}
        self.narratives["main_card"]['chart']['heading'] = ''
        self.narratives["main_card"]['chart']['data'] = [[i for i,j in self._all_coeffs],
                                                         [j['coefficient'] for i,j in self._all_coeffs]]
        self.narratives["main_card"]['chart']['label'] = {'x':'Measure Name',
                                                          'y': 'Change in ' + self.result_column + ' per unit increase'}
        main_card = NormalCard()
        main_card_header = HtmlData(data = '<h3>Key Measures that affect ' + self.result_column+"</h3>")
        main_card_paragraphs = NarrativesUtils.block_splitter(main_card_narrative,self._blockSplitter)
        # One {key, value} point per coefficient, in coefficient-magnitude order.
        main_card_chart_data = [{"key":val[0],"value":val[1]} for val in zip([i for i,j in self._all_coeffs],[j['coefficient'] for i,j in self._all_coeffs])]
        main_card_chart = NormalChartData(data=main_card_chart_data)
        mainCardChartJson = ChartJson()
        mainCardChartJson.set_data(main_card_chart.get_data())
        mainCardChartJson.set_label_text({'x':'Influencing Factors','y': 'Change in ' + self.result_column + ' per unit increase'})
        mainCardChartJson.set_chart_type("bar")
        mainCardChartJson.set_axes({"x":"key","y":"value"})
        mainCardChartJson.set_yaxis_number_format(".2f")
        # st_info = ["Test : Regression","Threshold for p-value: 0.05", "Effect Size: Regression Coefficient"]
        # Statistical-info strip shown next to the chart.  NOTE(review):
        # chart_data[0]/[-1] assume at least one coefficient exists — an empty
        # coefficient dict would raise IndexError here; confirm upstream guards.
        chart_data = sorted(main_card_chart_data,key=lambda x:x["value"],reverse=True)
        statistical_info_array=[
            ("Test Type","Regression"),
            ("Effect Size","Coefficients"),
            ("Max Effect Size",chart_data[0]["key"]),
            ("Min Effect Size",chart_data[-1]["key"]),
            ]
        # NOTE(review): "statistical_inferenc" (missing final 'e') is never read;
        # every branch below assigns statistical_inference, so this is dead code.
        statistical_inferenc = ""
        if len(chart_data) == 1:
            statistical_inference = "{} is the only variable that have significant influence over {} (Target) having an \
 Effect size of {}".format(chart_data[0]["key"],self._dataframe_context.get_result_column(),round(chart_data[0]["value"],4))
        elif len(chart_data) == 2:
            statistical_inference = "There are two variables ({} and {}) that have significant influence over {} (Target) and the \
 Effect size ranges are {} and {} respectively".format(chart_data[0]["key"],chart_data[1]["key"],self._dataframe_context.get_result_column(),round(chart_data[0]["value"],4),round(chart_data[1]["value"],4))
        else:
            statistical_inference = "There are {} variables that have significant influence over {} (Target) and the \
 Effect size ranges from {} to {}".format(len(chart_data),self._dataframe_context.get_result_column(),round(chart_data[0]["value"],4),round(chart_data[-1]["value"],4))
        if statistical_inference != "":
            statistical_info_array.append(("Inference",statistical_inference))
            statistical_info_array = NarrativesUtils.statistical_info_array_formatter(statistical_info_array)
        main_card.set_card_data(data = [main_card_header]+main_card_paragraphs+[C3ChartData(data=mainCardChartJson,info=statistical_info_array)])
        main_card.set_card_name("Key Influencers")
        self._regressionNode.add_a_card(main_card)
        # ---- one node + card per significant measure ----
        count = 0
        for measure_column in self.significant_measures:
            sigMeasureNode = NarrativesTree()
            sigMeasureNode.set_name(measure_column)
            measureCard1 = NormalCard()
            measureCard1.set_card_name("{}: Impact on {}".format(measure_column,self.result_column))
            measureCard1Data = []
            if self._run_dimension_level_regression:
                measureCard2 = NormalCard()
                measureCard2.set_card_name("Key Areas where it Matters")
                measureCard2Data = []
            measure_column_cards = {}
            card0 = {}
            # card1: template narrative about this measure's impact.
            card1data = regression_narrative_obj.generate_card1_data(measure_column)
            card1heading = "<h3>Impact of "+measure_column+" on "+self.result_column+"</h3>"
            measureCard1Header = HtmlData(data=card1heading)
            card1data.update({"blockSplitter":self._blockSplitter})
            card1narrative = NarrativesUtils.get_template_output(self._base_dir,\
                                                            'regression_card1.html',card1data)
            card1paragraphs = NarrativesUtils.block_splitter(card1narrative,self._blockSplitter)
            card0 = {"paragraphs":card1paragraphs}
            card0["charts"] = {}
            card0['charts']['chart2']={}
            # card0['charts']['chart2']['data']=card1data["chart_data"]
            # card0['charts']['chart2']['heading'] = ''
            # card0['charts']['chart2']['labels'] = {}
            card0['charts']['chart1']={}
            card0["heading"] = card1heading
            measure_column_cards['card0'] = card0
            measureCard1Header = HtmlData(data=card1heading)
            measureCard1Data += [measureCard1Header]
            measureCard1para = card1paragraphs
            measureCard1Data += measureCard1para
            # card2: per-dimension-level regression tables (disabled by default
            # via self._run_dimension_level_regression).
            if self._run_dimension_level_regression:
                print("running narratives for key area dict")
                self._dim_regression = self.run_regression_for_dimension_levels()
                card2table, card2data=regression_narrative_obj.generate_card2_data(measure_column,self._dim_regression)
                card2data.update({"blockSplitter":self._blockSplitter})
                card2narrative = NarrativesUtils.get_template_output(self._base_dir,\
                                                            'regression_card2.html',card2data)
                card2paragraphs = NarrativesUtils.block_splitter(card2narrative,self._blockSplitter)
                card1 = {'tables': card2table, 'paragraphs' : card2paragraphs, 'heading' : 'Key Areas where ' + measure_column + ' matters'}
                measure_column_cards['card1'] = card1
                measureCard2Data += card2paragraphs
                if "table1" in card2table:
                    table1data = regression_narrative_obj.convert_table_data(card2table["table1"])
                    card2Table1 = TableData()
                    card2Table1.set_table_data(table1data)
                    card2Table1.set_table_type("heatMap")
                    card2Table1.set_table_top_header(card2table["table1"]["heading"])
                    card2Table1Json = json.loads(CommonUtils.convert_python_object_to_json(card2Table1))
                    # measureCard2Data.insert(3,card2Table1)
                    measureCard2Data.insert(3,card2Table1Json)
                if "table2" in card2table:
                    table2data = regression_narrative_obj.convert_table_data(card2table["table2"])
                    card2Table2 = TableData()
                    card2Table2.set_table_data(table2data)
                    card2Table2.set_table_type("heatMap")
                    card2Table2.set_table_top_header(card2table["table2"]["heading"])
                    # measureCard2Data.insert(5,card2Table2)
                    card2Table2Json = json.loads(CommonUtils.convert_python_object_to_json(card2Table2))
                    # measureCard2Data.append(card2Table2)
                    measureCard2Data.append(card2Table2Json)
            # self._result_setter.set_trend_section_data({"result_column":self.result_column,
            #                                             "measure_column":measure_column,
            #                                             "base_dir":self._base_dir
            #                                             })
            # trend_narratives_obj = TimeSeriesNarrative(self._dataframe_helper, self._dataframe_context, self._result_setter, self._spark, self._story_narrative)
            # card2 = trend_narratives_obj.get_regression_trend_card_data()
            # if card2:
            #     measure_column_cards['card2'] = card2
            #
            # card3 = {}
            progressMessage = CommonUtils.create_progress_message_object(self._analysisName,"custom","info","Analyzing Key Influencers",self._completionStatus,self._completionStatus,display=True)
            CommonUtils.save_progress_message(self._messageURL,progressMessage,ignore=False)
            # card4: sensitivity analysis narrative + chart for this measure.
            card4data = regression_narrative_obj.generate_card4_data(self.result_column,measure_column)
            card4data.update({"blockSplitter":self._blockSplitter})
            # card4heading = "Sensitivity Analysis: Effect of "+self.result_column+" on Segments of "+measure_column
            card4narrative = NarrativesUtils.get_template_output(self._base_dir,\
                                                                'regression_card4.html',card4data)
            card4paragraphs = NarrativesUtils.block_splitter(card4narrative,self._blockSplitter)
            # card3 = {"paragraphs":card4paragraphs}
            card0['paragraphs'] = card1paragraphs+card4paragraphs
            card4Chart = card4data["charts"]
            # st_info = ["Test : Regression", "Variables : "+ self.result_column +", "+measure_column,"Intercept : "+str(round(self._df_regression_result.get_intercept(),2)), "Regression Coefficient : "+ str(round(self._df_regression_result.get_coeff(measure_column),2))]
            statistical_info_array=[
                ("Test Type","Regression"),
                ("Coefficient",str(round(self._df_regression_result.get_coeff(measure_column),2))),
                ("P-Value","<= 0.05"),
                ("Intercept",str(round(self._df_regression_result.get_intercept(),2))),
                ("R Square ",str(round(self._df_regression_result.get_rsquare(),2))),
                ]
            inferenceTuple = ()
            coeff = self._df_regression_result.get_coeff(measure_column)
            # NOTE(review): in the negative-coefficient branch the message reads
            # "an decrease of {coeff}" with coeff still negative — wording and
            # sign presentation look off; confirm intended copy before changing.
            if coeff > 0:
                inferenceTuple = ("Inference","For every additional unit of increase in {} there will be an increase of {} units in {} (target).".format(measure_column,str(round(coeff,2)),self._dataframe_context.get_result_column()))
            else:
                inferenceTuple = ("Inference","For every additional unit of decrease in {} there will be an decrease of {} units in {} (target).".format(measure_column,str(round(coeff,2)),self._dataframe_context.get_result_column()))
            if len(inferenceTuple) > 0:
                statistical_info_array.append(inferenceTuple)
                statistical_info_array = NarrativesUtils.statistical_info_array_formatter(statistical_info_array)
            # Chart is inserted as the third block of the card-4 paragraphs.
            card4paragraphs.insert(2,C3ChartData(data=card4Chart,info=statistical_info_array))
            measureCard1Data += card4paragraphs
            self.narratives['cards'].append(measure_column_cards)
            # Only the first (highest-|coefficient|) measure feeds the executive
            # summary; its charts are stripped before handing the data over.
            if count == 0:
                card4data.pop("charts")
                self._result_setter.update_executive_summary_data(card4data)
            count += 1
            measureCard1.set_card_data(measureCard1Data)
            # NOTE(review): when dimension-level regression is enabled,
            # measureCard1 is added twice (once with measureCard2, once alone);
            # harmless today since the flag is hard-coded False in __init__.
            if self._run_dimension_level_regression:
                measureCard2.set_card_data(measureCard2Data)
                sigMeasureNode.add_cards([measureCard1,measureCard2])
            sigMeasureNode.add_cards([measureCard1])
            self._regressionNode.add_a_node(sigMeasureNode)
        # self._result_setter.set_trend_section_completion_status(True)
        self._story_narrative.add_a_node(self._regressionNode)

    def run_regression_for_dimension_levels(self):
        """Fit a LinearRegression per level of up to five dimension columns.

        Column choice: the top-5 significant dimensions from the helper when
        available, otherwise the first five known dimension columns; the
        hard-coded "Agent Name" column is always excluded.

        Returns a nested dict: {column: {level: {"intercept", "rmse",
        "rsquare", "coeff"}}}, with zeroed stats when a fit returns None.
        """
        print("Running regression for Dimension Levels")
        significant_dimensions = self._dataframe_helper.get_significant_dimension()
        print("significant_dimensions:",significant_dimensions)
        if significant_dimensions != {}:
            # Rank dimensions by significance score, keep the top five.
            sig_dims = [(x,significant_dimensions[x]) for x in list(significant_dimensions.keys())]
            sig_dims = sorted(sig_dims,key=lambda x:x[1],reverse=True)
            cat_columns = [x[0] for x in sig_dims[:5]]
        else:
            cat_columns = self._dimension_columns[:5]
        # NOTE(review): "Agent Name" exclusion looks dataset-specific — confirm.
        cat_columns= [x for x in cat_columns if x != "Agent Name"]
        print("Running regression for below 5 dimensions")
        print(cat_columns)
        regression_result_dimension_cols = dict(list(zip(cat_columns,[{}]*len(cat_columns))))
        for col in cat_columns:
            print("For Column:",col)
            # column_levels = self._dataframe_helper.get_all_levels(col)
            column_levels = list(self._metaParser.get_unique_level_dict(col).keys())
            level_regression_result = dict(list(zip(column_levels,[{}]*len(column_levels))))
            print("No of levels in this column",len(column_levels))
            for level in column_levels:
                print("Filtering data for level:",level)
                # Fit on the subset of rows having this level for the column.
                filtered_df = self._dataframe_helper.filter_dataframe(col,level)
                result = LinearRegression(filtered_df, self._dataframe_helper, self._dataframe_context,self._metaParser,self._spark).fit(self._dataframe_context.get_result_column())
                if result == None:
                    # Fit failed (e.g. insufficient rows) — record zeroed stats.
                    result = {"intercept" : 0.0,
                              "rmse" : 0.0,
                              "rsquare" : 0.0,
                              "coeff" : 0.0
                              }
                else:
                    result = {"intercept" : result.get_intercept(),
                              "rmse" : result.get_root_mean_square_error(),
                              "rsquare" : result.get_rsquare(),
                              "coeff" : result.get_all_coeff()
                              }
                level_regression_result[level] = result
            regression_result_dimension_cols[col] = level_regression_result
        # print(json.dumps(regression_result_dimension_cols,indent=2))
        return regression_result_dimension_cols
class ChiSquareNarratives:
    """Builds the chi-square "Association" story section.

    All work happens in __init__: measure columns are re-binned into the
    string splits used by the contingency tables, the data frame is rebuilt
    as an all-string Spark DataFrame, progress messages are posted, and
    _generate_narratives() attaches one card per significant dimension to a
    NarrativesTree registered on the result setter / story narrative.

    NOTE(review): this class was written with Python 2 syntax (print
    statements, subscripting dict.keys()); this revision ports it to
    Python 3 without changing any emitted text or card structure.  A
    Python 3 variant of this class also exists later in this module and
    shadows this definition — confirm which one is meant to survive.
    """
    #@accepts(object, int, DFChiSquareResult ,ContextSetter)
    def __init__(self, df_helper, df_chisquare_result, spark, df_context, data_frame, story_narrative, result_setter, scriptWeight=None, analysisName=None):
        self._story_narrative = story_narrative
        self._result_setter = result_setter
        self._data_frame = data_frame
        self._dataframe_context = df_context
        self._dataframe_helper = df_helper
        self._storyOnScoredData = self._dataframe_context.get_story_on_scored_data()
        self._measure_columns = df_helper.get_numeric_columns()
        self._df_chisquare = df_chisquare_result
        self._df_chisquare_result = df_chisquare_result.get_result()
        self.narratives = {}
        self._appid = df_context.get_app_id()
        self._chiSquareNode = NarrativesTree()
        self._chiSquareNode.set_name("Association")
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        self._noOfSigDimsToShow = GLOBALSETTINGS.CHISQUARESIGNIFICANTDIMENSIONTOSHOW
        self._base_dir = "/chisquare/"
        self._spark = spark
        ############################DataFrame Measure to Dimesion Column#####################
        # Replace each numeric measure's values with the "<lo> to <hi>" bin
        # label that the chi-square contingency table used, so measures can be
        # treated as dimensions downstream.
        pandas_df = self._data_frame.toPandas()
        # Py3 fix: dict_keys is not subscriptable — materialize before [0].
        target_dimension = list(self._df_chisquare_result.keys())
        bin_data = {}
        for col in self._measure_columns:
            chisquare_result = self._df_chisquare.get_chisquare_result(target_dimension[0], col)
            bin_data[col] = chisquare_result.get_contingency_table().get_column_two_levels()
        for bin_col in list(bin_data.keys()):
            for split in bin_data[bin_col]:
                # Bin labels look like "<low> to <high>" with comma-grouped numbers.
                val = split.split('to')
                # NOTE(review): chained assignment — under pandas >= 2 copy-on-write
                # this may silently not write back; consider .loc[mask, bin_col].
                pandas_df[bin_col][(pandas_df[bin_col] >= float(val[0].replace(',', ''))) & (pandas_df[bin_col] < float(val[1].replace(',', '')))] = split
        # Rebuild the Spark frame with every column typed as string.
        fields = [StructField(field_name, StringType(), True) for field_name in pandas_df.columns]
        schema = StructType(fields)
        SQLctx = SQLContext(sparkContext=self._spark.sparkContext, sparkSession=self._spark)
        self._data_frame = SQLctx.createDataFrame(pandas_df, schema)
        # print(self._data_frame)
        ############################DataFrame Measure to Dimesion Column#####################
        # App-specific template directories.
        if self._appid != None:
            if self._appid == "1":
                self._base_dir += "appid1/"
            elif self._appid == "2":
                self._base_dir += "appid2/"
        self._completionStatus = self._dataframe_context.get_completion_status()
        if analysisName == None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName
        self._messageURL = self._dataframe_context.get_message_url()
        if scriptWeight == None:
            self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight()
        else:
            self._scriptWeightDict = scriptWeight
        self._analysisDict = self._dataframe_context.get_analysis_dict()
        if self._analysisDict != {}:
            self._nColsToUse = self._analysisDict[self._analysisName]["noOfColumnsToUse"]
        else:
            self._nColsToUse = None
        self._scriptStages = {
            "initialization": {
                "summary": "Initialized the Frequency Narratives",
                "weight": 0
            },
            "summarygeneration": {
                "summary": "summary generation finished",
                "weight": 10
            },
            "completion": {
                "summary": "Frequency Stats Narratives done",
                "weight": 0
            },
        }
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "initialization",
            "info",
            display=False,
            weightKey="narratives")
        self._generate_narratives()
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "summarygeneration",
            "info",
            display=False,
            weightKey="narratives")
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "completion",
            "info",
            display=False,
            weightKey="narratives")

    def _generate_narratives(self):
        """
        generate main card narrative and remaining cards are generated by calling ChiSquareAnalysis
        class for each of analyzed dimensions
        """
        for target_dimension in list(self._df_chisquare_result.keys()):
            target_chisquare_result = self._df_chisquare_result[target_dimension]
            # Py3 fix: materialize keys() so len() / template use see a list.
            analysed_variables = list(target_chisquare_result.keys())  ## List of all analyzed var.
            # List of significant var out of analyzed var.
            significant_variables = [
                dim for dim in target_chisquare_result.keys()
                if target_chisquare_result[dim].get_pvalue() <= 0.05
            ]
            effect_sizes = [
                target_chisquare_result[dim].get_effect_size()
                for dim in significant_variables
            ]
            effect_size_dict = dict(zip(significant_variables, effect_sizes))
            # Order significant variables by descending effect size.
            significant_variables = [
                y for (x, y) in sorted(zip(effect_sizes, significant_variables), reverse=True)
            ]
            #insignificant_variables = [i for i in self._df_chisquare_result[target_dimension] if i['pv']>0.05]
            num_analysed_variables = len(analysed_variables)
            num_significant_variables = len(significant_variables)
            # Legacy dict-style narrative (kept alongside the card objects).
            self.narratives['main_card'] = {}
            self.narratives['main_card']['heading'] = 'Relationship between ' + target_dimension + ' and other factors'
            self.narratives['main_card']['paragraphs'] = {}
            data_dict = {
                'num_variables': num_analysed_variables,
                'num_significant_variables': num_significant_variables,
                'significant_variables': significant_variables,
                'target': target_dimension,
                'analysed_dimensions': analysed_variables,
                'blockSplitter': self._blockSplitter
            }
            # for both para 1 and para 2
            paragraph = {}
            paragraph['header'] = ''
            paragraph['content'] = NarrativesUtils.get_template_output(self._base_dir, 'main_card.html', data_dict)
            self.narratives['main_card']['paragraphs'] = [paragraph]
            self.narratives['cards'] = []
            chart = {
                'header': 'Strength of association between ' + target_dimension + ' and other dimensions'
            }
            chart['data'] = effect_size_dict
            chart['label_text'] = {
                'x': 'Dimensions',
                'y': 'Effect Size (Cramers-V)'
            }
            # Bar-chart points, sorted by effect size descending.
            chart_data = []
            chartDataValues = []
            for k, v in effect_size_dict.items():
                chart_data.append({"key": k, "value": float(v)})
                chartDataValues.append(float(v))
            chart_data = sorted(chart_data, key=lambda x: x["value"], reverse=True)
            chart_json = ChartJson()
            chart_json.set_data(chart_data)
            chart_json.set_chart_type("bar")
            # chart_json.set_label_text({'x':'Dimensions','y':'Effect Size (Cramers-V)'})
            chart_json.set_label_text({
                'x': ' ',
                'y': 'Effect Size (Cramers-V)'
            })
            chart_json.set_axis_rotation(True)
            chart_json.set_axes({"x": "key", "y": "value"})
            # chart_json.set_yaxis_number_format(".4f")
            chart_json.set_yaxis_number_format(NarrativesUtils.select_y_axis_format(chartDataValues))
            self.narratives['main_card']['chart'] = chart
            main_card = NormalCard()
            header = "<h3>Strength of association between " + target_dimension + " and other dimensions</h3>"
            main_card_data = [HtmlData(data=header)]
            main_card_narrative = NarrativesUtils.get_template_output(self._base_dir, 'main_card.html', data_dict)
            main_card_narrative = NarrativesUtils.block_splitter(main_card_narrative, self._blockSplitter)
            main_card_data += main_card_narrative
            # st_info = ["Test : Chi Square", "Threshold for p-value : 0.05", "Effect Size : Cramer's V"]
            # print("chartdata", chart_data)
            if len(chart_data) > 0:
                statistical_info_array = [
                    ("Test Type", "Chi-Square"),
                    ("Effect Size", "Cramer's V"),
                    ("Max Effect Size", chart_data[0]["key"]),
                    ("Min Effect Size", chart_data[-1]["key"]),
                ]
                # Dead assignment kept from original ("statistical_inferenc");
                # all branches below assign statistical_inference.
                statistical_inferenc = ""
                if len(chart_data) == 1:
                    statistical_inference = "{} is the only variable that have significant association with the {} (Target) having an \
 Effect size of {}".format(
                        chart_data[0]["key"],
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["value"], 4))
                elif len(chart_data) == 2:
                    statistical_inference = "There are two variables ({} and {}) that have significant association with the {} (Target) and the \
 Effect size ranges are {} and {} respectively".format(
                        chart_data[0]["key"], chart_data[1]["key"],
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["value"], 4),
                        round(chart_data[1]["value"], 4))
                else:
                    statistical_inference = "There are {} variables that have significant association with the {} (Target) and the \
 Effect size ranges from {} to {}".format(
                        len(chart_data),
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["value"], 4),
                        round(chart_data[-1]["value"], 4))
                if statistical_inference != "":
                    statistical_info_array.append(("Inference", statistical_inference))
                    statistical_info_array = NarrativesUtils.statistical_info_array_formatter(statistical_info_array)
            else:
                statistical_info_array = []
            main_card_data.append(C3ChartData(data=chart_json, info=statistical_info_array))
            main_card.set_card_data(main_card_data)
            main_card.set_card_name("Key Influencers")
            if self._storyOnScoredData != True:
                self._chiSquareNode.add_a_card(main_card)
            self._result_setter.add_a_score_chi_card(main_card)
            print("target_dimension", target_dimension)
            # App 2 caps the drill-down at five variables; otherwise honour the
            # analysis-dict column budget when one is configured.
            if self._appid == '2' and num_significant_variables > 5:
                significant_variables = significant_variables[:5]
            else:
                if self._nColsToUse != None:
                    significant_variables = significant_variables[:self._nColsToUse]
            CommonUtils.create_update_and_save_progress_message(
                self._dataframe_context,
                self._scriptWeightDict,
                self._scriptStages,
                self._analysisName,
                "custom",
                "info",
                display=True,
                customMsg="Analyzing key drivers",
                weightKey="narratives")
            # One drill-down analysis per significant dimension (capped).
            for analysed_dimension in significant_variables[:self._noOfSigDimsToShow]:
                chisquare_result = self._df_chisquare.get_chisquare_result(target_dimension, analysed_dimension)
                if self._appid == '2':
                    print("APPID 2 is used")
                    card = ChiSquareAnalysis(
                        self._dataframe_context, self._dataframe_helper,
                        chisquare_result, target_dimension, analysed_dimension,
                        significant_variables, num_analysed_variables,
                        self._data_frame, self._measure_columns,
                        self._base_dir, None, target_chisquare_result)
                    # self.narratives['cards'].append(card)
                    self._result_setter.add_a_score_chi_card(
                        json.loads(CommonUtils.convert_python_object_to_json(card.get_dimension_card1())))
                elif self._appid == '1':
                    print("APPID 1 is used")
                    card = ChiSquareAnalysis(
                        self._dataframe_context, self._dataframe_helper,
                        chisquare_result, target_dimension, analysed_dimension,
                        significant_variables, num_analysed_variables,
                        self._data_frame, self._measure_columns,
                        self._base_dir, None, target_chisquare_result)
                    # self.narratives['cards'].append(card)
                    self._result_setter.add_a_score_chi_card(
                        json.loads(CommonUtils.convert_python_object_to_json(card.get_dimension_card1())))
                else:
                    target_dimension_card = ChiSquareAnalysis(
                        self._dataframe_context, self._dataframe_helper,
                        chisquare_result, target_dimension, analysed_dimension,
                        significant_variables, num_analysed_variables,
                        self._data_frame, self._measure_columns,
                        self._base_dir, None, target_chisquare_result)
                    self.narratives['cards'].append(target_dimension_card)
                    self._chiSquareNode.add_a_node(target_dimension_card.get_dimension_node())
        self._story_narrative.add_a_node(self._chiSquareNode)
        self._result_setter.set_chisquare_node(self._chiSquareNode)
class ChiSquareNarratives(object): #@accepts(object, int, DFChiSquareResult ,ContextSetter) def __init__(self, df_helper, df_chisquare_result, spark, df_context, data_frame, story_narrative, result_setter, scriptWeight=None, analysisName=None): self._story_narrative = story_narrative self._result_setter = result_setter self._data_frame = data_frame self._dataframe_context = df_context self._pandas_flag = df_context._pandas_flag self._dataframe_helper = df_helper self._storyOnScoredData = self._dataframe_context.get_story_on_scored_data( ) self._measure_columns = df_helper.get_numeric_columns() self._df_chisquare = df_chisquare_result self._df_chisquare_result = df_chisquare_result.get_result() self.narratives = {} self._appid = df_context.get_app_id() self._chiSquareNode = NarrativesTree() self._chiSquareNode.set_name("Key Drivers") self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER self._noOfSigDimsToShow = GLOBALSETTINGS.CHISQUARESIGNIFICANTDIMENSIONTOSHOW self._base_dir = "/chisquare/" self._spark = spark ############################DataFrame Measure to Dimesion Column##################### if self._pandas_flag: pandas_df = self._data_frame.copy(deep=True) else: pandas_df = self._data_frame.toPandas() target_dimension = list(self._df_chisquare_result.keys()) bin_data = {} for col in self._measure_columns: if self._df_chisquare.get_chisquare_result(target_dimension[0], col): chisquare_result = self._df_chisquare.get_chisquare_result( target_dimension[0], col) bin_data[col] = chisquare_result.get_contingency_table( ).get_column_two_levels() for bin_col in list(bin_data.keys()): for split in bin_data[bin_col]: val = split.split('to') # pandas_df[bin_col][(float(pandas_df[bin_col])>=float(val[0].replace(',',''))) & (float(pandas_df[bin_col])<float(val[1].replace(',','')))] = split row_value = list(pandas_df[bin_col]) temp = [] for row_value_ in row_value: if not isinstance(row_value_, str) and \ (float(row_value_) >= float(val[0].replace(',',''))) and \ 
(float(row_value_) < float(val[1].replace(',',''))): temp.append(split) else: temp.append(row_value_) pandas_df[bin_col] = temp if self._pandas_flag: pass # self._data_frame = pandas_df else: fields = [ StructField(field_name, StringType(), True) for field_name in pandas_df.columns ] schema = StructType(fields) SQLctx = SQLContext(sparkContext=self._spark.sparkContext, sparkSession=self._spark) self._data_frame = SQLctx.createDataFrame(pandas_df, schema) # print self._data_frame ############################DataFrame Measure to Dimesion Column##################### if self._appid != None: if self._appid == "1": self._base_dir += "appid1/" elif self._appid == "2": self._base_dir += "appid2/" self._completionStatus = self._dataframe_context.get_completion_status( ) if analysisName == None: self._analysisName = self._dataframe_context.get_analysis_name() else: self._analysisName = analysisName self._messageURL = self._dataframe_context.get_message_url() if scriptWeight == None: self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight( ) else: self._scriptWeightDict = scriptWeight self._analysisDict = self._dataframe_context.get_analysis_dict() if self._analysisDict != {}: self._nColsToUse = self._analysisDict[ self._analysisName]["noOfColumnsToUse"] else: self._nColsToUse = None self._scriptStages = { "initialization": { "summary": "Initialized the Frequency Narratives", "weight": 0 }, "summarygeneration": { "summary": "Summary Generation Finished", "weight": 4 }, "completion": { "summary": "Frequency Stats Narratives Done", "weight": 0 }, } CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._analysisName, "initialization", "info", display=False, weightKey="narratives") self.new_effect_size, self.signi_dict = self.feat_imp_threshold( target_dimension) self._generate_narratives() CommonUtils.create_update_and_save_progress_message( self._dataframe_context, 
self._scriptWeightDict, self._scriptStages, self._analysisName, "summarygeneration", "info", display=False, weightKey="narratives") CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._analysisName, "completion", "info", display=False, weightKey="narratives") def feat_imp_threshold(self, target_dimension, dummy_Cols=True, label_encoding=False): if self._pandas_flag: if is_numeric_dtype(self._data_frame[target_dimension[0]]): self.app_type = 'regression' elif is_string_dtype(self._data_frame[target_dimension[0]]): self.app_type = 'classification' else: if self._data_frame.select( target_dimension[0]).dtypes[0][1] == 'string': self.app_type = 'classification' elif self._data_frame.select( target_dimension[0]).dtypes[0][1] in ['int', 'double']: self.app_type = 'regression' try: DataValidation_obj = DataValidation(self._data_frame, target_dimension[0], self.app_type, self._pandas_flag) DataValidation_obj.data_validation_run() except Exception as e: CommonUtils.print_errors_and_store_traceback( self.LOGGER, "datavalidation", e) CommonUtils.save_error_messages(self.errorURL, self.app_type, e, ignore=self.ignoreMsg) try: DataPreprocessingAutoML_obj = DataPreprocessingAutoML( DataValidation_obj.data_frame, DataValidation_obj.target, DataValidation_obj.data_change_dict, DataValidation_obj.numeric_cols, DataValidation_obj.dimension_cols, DataValidation_obj.datetime_cols, DataValidation_obj.problem_type, self._pandas_flag) DataPreprocessingAutoML_obj.data_preprocessing_run() except Exception as e: CommonUtils.print_errors_and_store_traceback( self.LOGGER, "dataPreprocessing", e) CommonUtils.save_error_messages(self.errorURL, self.app_type, e, ignore=self.ignoreMsg) preprocess_df = DataPreprocessingAutoML_obj.data_frame FeatureEngineeringAutoML_obj = FeatureEngineeringAutoML( DataPreprocessingAutoML_obj.data_frame, DataPreprocessingAutoML_obj.target, DataPreprocessingAutoML_obj.data_change_dict, 
DataPreprocessingAutoML_obj.numeric_cols, DataPreprocessingAutoML_obj.dimension_cols, DataPreprocessingAutoML_obj.datetime_cols, DataPreprocessingAutoML_obj.problem_type, self._pandas_flag) if FeatureEngineeringAutoML_obj.datetime_cols != 0: FeatureEngineeringAutoML_obj.date_column_split( FeatureEngineeringAutoML_obj.datetime_cols) if dummy_Cols: if self._pandas_flag: FeatureEngineeringAutoML_obj.sk_one_hot_encoding( FeatureEngineeringAutoML_obj.dimension_cols) clean_df = FeatureEngineeringAutoML_obj.data_frame else: FeatureEngineeringAutoML_obj.pyspark_one_hot_encoding( FeatureEngineeringAutoML_obj.dimension_cols) clean_df = FeatureEngineeringAutoML_obj.data_frame if label_encoding: if self._pandas_flag: for column_name in FeatureEngineeringAutoML_obj.dimension_cols: preprocess_df[ column_name + '_label_encoded'] = LabelEncoder().fit_transform( preprocess_df[column_name]) preprocess_df = preprocess_df.drop(column_name, 1) clean_df = preprocess_df else: FeatureEngineeringAutoML_obj.pyspark_label_encoding( FeatureEngineeringAutoML_obj.dimension_cols) clean_df = FeatureEngineeringAutoML_obj.data_frame if self._pandas_flag: ind_var = clean_df.drop(target_dimension[0], 1) ind_var = ind_var[ind_var._get_numeric_data().columns] target = clean_df[target_dimension[0]] dtree = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=42) dtree.fit(ind_var, target) feat_imp_dict = {} for feature, importance in zip(list(ind_var.columns), dtree.feature_importances_): feat_imp_dict[feature] = round(importance, 2) else: num_var = [ col[0] for col in clean_df.dtypes if ((col[1] == 'int') | (col[1] == 'double')) & (col[0] != target_dimension[0]) ] num_var = [col for col in num_var if not col.endswith('indexed')] labels_count = [ len(clean_df.select(col).distinct().collect()) for col in num_var ] # labels_count = [len(clean_df.agg((F.collect_set(col).alias(col))).first().asDict()[col]) for col in num_var] labels_count.sort() max_count = labels_count[-1] label_indexes = 
StringIndexer(inputCol=target_dimension[0], outputCol='label', handleInvalid='keep') assembler = VectorAssembler(inputCols=num_var, outputCol="features") model = pysparkDecisionTreeClassifier(labelCol="label", featuresCol="features", seed=8464, impurity='gini', maxDepth=5, maxBins=max_count + 2) pipe = Pipeline(stages=[assembler, label_indexes, model]) mod_fit = pipe.fit(clean_df) df2 = mod_fit.transform(clean_df) list_extract = [] for i in df2.schema["features"].metadata["ml_attr"]["attrs"]: list_extract = list_extract + df2.schema["features"].metadata[ "ml_attr"]["attrs"][i] varlist = pd.DataFrame(list_extract) varlist['score'] = varlist['idx'].apply( lambda x: mod_fit.stages[-1].featureImportances[x]) feat_imp_dict = pd.Series(varlist.score.values, index=varlist.name).to_dict() feat_imp_ori_dict = {} actual_cols = list(self._data_frame.columns) actual_cols.remove(target_dimension[0]) for col in actual_cols: fea_imp_ori_list = [] for col_imp in feat_imp_dict: temp = col_imp.split(col, -1) if len(temp) == 2: fea_imp_ori_list.append(feat_imp_dict[col_imp]) feat_imp_ori_dict.update({col: sum(fea_imp_ori_list)}) sort_dict = dict( sorted(feat_imp_ori_dict.items(), key=lambda x: x[1], reverse=True)) if self._pandas_flag: self._data_frame = self._data_frame.apply( lambda col: pd.to_datetime(col, errors='ignore') if col.dtypes == object else col, axis=0) cat_var = [ key for key in dict(self._data_frame.dtypes) if dict(self._data_frame.dtypes)[key] in ['object'] ] else: cat_var = [ col[0] for col in self._data_frame.dtypes if col[1] == 'string' ] cat_var.remove(target_dimension[0]) si_var_dict = { key: value for key, value in sort_dict.items() if key in cat_var } threshold = 0 si_var_thresh = {} for key, value in si_var_dict.items(): threshold = threshold + value if threshold < 0.8: si_var_thresh[key] = value return feat_imp_dict, si_var_thresh def _generate_narratives(self): """ generate main card narrative and remaining cards are generated by calling ChiSquareAnalysis 
class for each of analyzed dimensions """ for target_dimension in list(self._df_chisquare_result.keys()): target_chisquare_result = self._df_chisquare_result[ target_dimension] analysed_variables = list( target_chisquare_result.keys()) ## List of all analyzed var. # List of significant var out of analyzed var. # significant_variables = [dim for dim in list(target_chisquare_result.keys()) if target_chisquare_result[dim].get_pvalue()<=0.05] effect_size_dict = self.new_effect_size significant_variables = list(self.signi_dict.keys()) effect_sizes = list(self.signi_dict.values()) significant_variables = [ y for (x, y) in sorted(zip(effect_sizes, significant_variables), reverse=True) if round(float(x), 2) > 0 ] #insignificant_variables = [i for i in self._df_chisquare_result[target_dimension] if i['pv']>0.05] num_analysed_variables = len(analysed_variables) num_significant_variables = len(significant_variables) self.narratives['main_card'] = {} self.narratives['main_card'][ 'heading'] = 'Relationship between ' + target_dimension + ' and other factors' self.narratives['main_card']['paragraphs'] = {} data_dict = { 'num_variables': num_analysed_variables, 'num_significant_variables': num_significant_variables, 'significant_variables': significant_variables, 'target': target_dimension, 'analysed_dimensions': analysed_variables, 'blockSplitter': self._blockSplitter } # for both para 1 and para 2 paragraph = {} paragraph['header'] = '' paragraph['content'] = NarrativesUtils.get_template_output( self._base_dir, 'main_card.html', data_dict) self.narratives['main_card']['paragraphs'] = [paragraph] self.narratives['cards'] = [] chart = { 'header': 'Strength of association between ' + target_dimension + ' and other dimensions' } chart['data'] = effect_size_dict chart['label_text'] = { 'x': 'Dimensions', 'y': 'Feature Importance' } chart_data = [] chartDataValues = [] for k, v in list(effect_size_dict.items()): "rounding the chart data for keydrivers tab" if round(float(v), 2) > 0: 
chart_data.append({ "Attribute": k, "Effect_Size": round(float(v), 2) }) chartDataValues.append(round(float(v), 2)) chart_data = sorted(chart_data, key=lambda x: x["Effect_Size"], reverse=True) chart_json = ChartJson() chart_json.set_data(chart_data) chart_json.set_chart_type("bar") # chart_json.set_label_text({'x':'Dimensions','y':'Effect Size (Cramers-V)'}) chart_json.set_label_text({'x': ' ', 'y': 'Feature Importance'}) chart_json.set_axis_rotation(True) chart_json.set_axes({"x": "Attribute", "y": "Feature Importance"}) chart_json.set_yaxis_number_format(".2f") # chart_json.set_yaxis_number_format(NarrativesUtils.select_y_axis_format(chartDataValues)) self.narratives['main_card']['chart'] = chart main_card = NormalCard() header = "<h3>Key Factors that drive " + target_dimension + "</h3>" main_card_data = [HtmlData(data=header)] main_card_narrative = NarrativesUtils.get_template_output( self._base_dir, 'main_card.html', data_dict) main_card_narrative = NarrativesUtils.block_splitter( main_card_narrative, self._blockSplitter) main_card_data += main_card_narrative # st_info = ["Test : Chi Square", "Threshold for p-value : 0.05", "Effect Size : Cramer's V"] # print "chartdata",chart_data if len(chart_data) > 0: statistical_info_array = [ ("Test Type", "Chi-Square"), ("Effect Size", "Cramer's V"), ("Max Effect Size", chart_data[0]["Attribute"]), ("Min Effect Size", chart_data[-1]["Attribute"]), ] statistical_inferenc = "" if len(chart_data) == 1: statistical_inference = "{} is the only variable that have significant association with the {} (Target) having an \ Effect size of {}".format( chart_data[0]["Attribute"], self._dataframe_context.get_result_column(), round(chart_data[0]["Effect_Size"], 4)) elif len(chart_data) == 2: statistical_inference = "There are two variables ({} and {}) that have significant association with the {} (Target) and the \ Effect size ranges are {} and {} respectively".format( chart_data[0]["Attribute"], chart_data[1]["Attribute"], 
self._dataframe_context.get_result_column(), round(chart_data[0]["Effect_Size"], 4), round(chart_data[1]["Effect_Size"], 4)) else: statistical_inference = "There are {} variables that have significant association with the {} (Target) and the \ Effect size ranges from {} to {}".format( len(chart_data), self._dataframe_context.get_result_column(), round(chart_data[0]["Effect_Size"], 4), round(chart_data[-1]["Effect_Size"], 4)) if statistical_inference != "": statistical_info_array.append( ("Inference", statistical_inference)) statistical_info_array = NarrativesUtils.statistical_info_array_formatter( statistical_info_array) else: statistical_info_array = [] main_card_data.append( C3ChartData(data=chart_json, info=statistical_info_array)) main_card.set_card_data(main_card_data) main_card.set_card_name("Key Influencers") if self._storyOnScoredData != True: self._chiSquareNode.add_a_card(main_card) self._result_setter.add_a_score_chi_card(main_card) print("target_dimension", target_dimension) if self._appid == '2' and num_significant_variables > 5: significant_variables = significant_variables[:5] else: if self._nColsToUse != None: significant_variables = significant_variables[:self. 
_nColsToUse] nColsToUse_temp = self._nColsToUse else: nColsToUse_temp = self._noOfSigDimsToShow CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._analysisName, "custom", "info", display=True, customMsg="Analyzing key drivers", weightKey="narratives") for analysed_dimension in significant_variables[:nColsToUse_temp]: chisquare_result = self._df_chisquare.get_chisquare_result( target_dimension, analysed_dimension) if self._appid == '2': print("APPID 2 is used") card = ChiSquareAnalysis( self._dataframe_context, self._dataframe_helper, chisquare_result, target_dimension, analysed_dimension, significant_variables, num_analysed_variables, self._data_frame, self._measure_columns, self._base_dir, None, target_chisquare_result) # self.narratives['cards'].append(card) self._result_setter.add_a_score_chi_card( json.loads( CommonUtils.convert_python_object_to_json( card.get_dimension_card1()))) elif self._appid == '1': print("APPID 1 is used") card = ChiSquareAnalysis( self._dataframe_context, self._dataframe_helper, chisquare_result, target_dimension, analysed_dimension, significant_variables, num_analysed_variables, self._data_frame, self._measure_columns, self._base_dir, None, target_chisquare_result) # self.narratives['cards'].append(card) self._result_setter.add_a_score_chi_card( json.loads( CommonUtils.convert_python_object_to_json( card.get_dimension_card1()))) else: target_dimension_card = ChiSquareAnalysis( self._dataframe_context, self._dataframe_helper, chisquare_result, target_dimension, analysed_dimension, significant_variables, num_analysed_variables, self._data_frame, self._measure_columns, self._base_dir, None, target_chisquare_result) self.narratives['cards'].append(target_dimension_card) self._chiSquareNode.add_a_node( target_dimension_card.get_dimension_node()) self._story_narrative.add_a_node(self._chiSquareNode) self._result_setter.set_chisquare_node(self._chiSquareNode)
data_dict_overall["price_trend"] = stockPriceTrendArrayFormatted data_dict_overall["avg_sentiment_score"] = data_dict_overall["avg_sentiment_score"]/number_stocks data_dict_overall["stock_value_change"] = data_dict_overall["stock_value_change"]/number_stocks data_dict_overall["stock_percent_change"] = data_dict_overall["stock_percent_change"]/number_stocks data_dict_overall["number_articles_by_concept"] = self.get_number_articles_per_concept(data_dict_overall["nArticlesAndSentimentsPerConcept"]) key, value = max(data_dict_overall["max_value_change"].iteritems(), key = lambda p: p[1]) data_dict_overall["max_value_change_overall"] = (self.get_capitalized_name(key),value) key, value = min(data_dict_overall["max_value_change"].iteritems(), key = lambda p: p[1]) data_dict_overall["min_value_change_overall"] = (self.get_capitalized_name(key),value) key,value = max(data_dict_overall["max_sentiment_change"].iteritems(), key = lambda p: p[1]) data_dict_overall["max_sentiment_change_overall"] = (self.get_capitalized_name(key),value) # print data_dict_overall finalResult = NarrativesTree() overviewNode = NarrativesTree() stockNode = NarrativesTree() overviewNode.set_name("Overview") stockNode.set_name("Single Stock Analysis") overviewCard = MLUtils.stock_sense_overview_card(data_dict_overall) overviewNode.add_a_card(overviewCard) finalResult.add_a_node(overviewNode) individualStockNodes = MLUtils.stock_sense_individual_stock_cards(stockDict) stockNode.add_nodes(individualStockNodes) finalResult.add_a_node(stockNode) return finalResult
class AnovaNarratives(object):
    """
    Build the ANOVA narrative cards for every measure column: an overview card
    of significant dimensions (effect-size bar chart + templated text) plus one
    drill-down node per significant dimension via OneWayAnovaNarratives.
    """
    ALPHA = 0.05
    KEY_SUMMARY = 'summary'
    KEY_NARRATIVES = 'narratives'
    KEY_TAKEAWAY = 'key_takeaway'
    DRILL_DOWN = 'drill_down_narrative'
    KEY_CARD = 'card'
    KEY_HEADING = 'heading'
    KEY_SUBHEADING = 'header'
    KEY_CHART = 'charts'
    KEY_PARAGRAPH = 'paragraphs'
    KEY_PARA_HEADER = 'header'
    KEY_PARA_CONTENT = 'content'
    KEY_BUBBLE = 'bubble_data'

    # @accepts(object, DFAnovaResult, DataFrameHelper)
    def __init__(self, df_anova_result, df_helper, df_context, result_setter,
                 story_narrative, scriptWeight=None, analysisName=None):
        """
        Wire up context/result objects, emit start/end progress messages, and
        generate all narratives immediately (side-effectful constructor).
        """
        self._story_narrative = story_narrative
        self._result_setter = result_setter
        self._dataframe_context = df_context
        self._df_anova_result = df_anova_result
        self._df_helper = df_helper
        self.narratives = {}
        self.narratives['variables'] = ''
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        self._base_dir = "/anova/"
        self._analysisName = self._dataframe_context.get_analysis_name()
        self._analysisDict = self._dataframe_context.get_analysis_dict()
        self._completionStatus = self._dataframe_context.get_completion_status()
        self._messageURL = self._dataframe_context.get_message_url()
        # Explicit arguments override the values pulled from the context.
        if analysisName is None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName
        if scriptWeight is None:
            self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight()
        else:
            self._scriptWeightDict = scriptWeight
        self._scriptStages = {
            "anovaNarrativeStart": {
                "summary": "Started The Anova Narratives",
                "weight": 0
            },
            "anovaNarrativeEnd": {
                "summary": "Narratives For Anova Finished",
                "weight": 10
            },
        }
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context, self._scriptWeightDict,
            self._scriptStages, self._analysisName, "anovaNarrativeStart",
            "info", display=False, emptyBin=False, customMsg=None,
            weightKey="narratives")
        self._generate_narratives()
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context, self._scriptWeightDict,
            self._scriptStages, self._analysisName, "anovaNarrativeEnd",
            "info", display=False, emptyBin=False, customMsg=None,
            weightKey="narratives")
        # Publish the node only when at least one card was produced.
        if self._anovaNodes.get_card_count() > 0:
            self._story_narrative.add_a_node(self._anovaNodes)
            #self._generate_take_away()
            self._result_setter.set_anova_node(self._anovaNodes)

    def _generate_narratives(self):
        """Build the overview card (chart + inference) per measure column."""
        try:
            nColsToUse = self._analysisDict[self._analysisName]["noOfColumnsToUse"]
        except (KeyError, TypeError):
            # Narrowed from a bare `except:`; missing config means "use all".
            nColsToUse = None
        self._anovaNodes = NarrativesTree()
        self._anovaNodes.set_name("Performance")
        for measure_column in self._df_anova_result.get_measure_columns():
            measure_anova_result = self._df_anova_result.get_measure_result(measure_column)
            significant_dimensions_dict, insignificant_dimensions = \
                measure_anova_result.get_OneWayAnovaSignificantDimensions()
            num_dimensions = len(list(significant_dimensions_dict.items())) + \
                len(insignificant_dimensions)
            # Dimensions ordered by descending effect size.
            significant_dimensions = [
                k for k, v in sorted(list(significant_dimensions_dict.items()),
                                     key=lambda x: -x[1])
            ]
            if nColsToUse is not None:
                significant_dimensions = significant_dimensions[:nColsToUse]
            num_significant_dimensions = len(significant_dimensions)
            num_insignificant_dimensions = len(insignificant_dimensions)
            print("num_significant_dimensions", num_significant_dimensions)
            if num_significant_dimensions > 0:
                mainCard = NormalCard(name="Overview of Key Factors")
                data_c3 = []
                for sig_dim in significant_dimensions:
                    data_c3.append({
                        'dimension': sig_dim,
                        'effect_size': float(significant_dimensions_dict[sig_dim])
                    })
                self.narratives = {}
                self.narratives[AnovaNarratives.KEY_HEADING] = \
                    "%s Performance Analysis" % (measure_column,)
                self.narratives['main_card'] = {}
                self.narratives['cards'] = []
                self.narratives['main_card'][AnovaNarratives.KEY_SUBHEADING] = \
                    "Relationship between %s and other Dimensions" % (measure_column)
                self.narratives['main_card'][AnovaNarratives.KEY_PARAGRAPH] = []
                data_dict = {
                    'significant_dimensions': significant_dimensions,
                    'insignificant_dimensions': insignificant_dimensions,
                    'num_significant_dimensions': num_significant_dimensions,
                    'num_insignificant_dimensions': num_insignificant_dimensions,
                    'num_dimensions': num_significant_dimensions + num_insignificant_dimensions,
                    'target': measure_column
                }
                output = {'header': ''}
                output['content'] = NarrativesUtils.get_template_output(
                    self._base_dir, 'anova_template_1.html', data_dict)
                self.narratives['main_card'][AnovaNarratives.KEY_PARAGRAPH].append(output)
                output1 = {'header': ''}
                output1['content'] = NarrativesUtils.get_template_output(
                    self._base_dir, 'anova_template_2.html', data_dict)
                lines = []
                lines += NarrativesUtils.block_splitter(output['content'],
                                                        self._blockSplitter)
                data_c3 = NormalChartData(data_c3)
                chart_data = data_c3.get_data()
                # Tiny effect sizes are stringified so the y-axis formatter
                # can pick a suitable representation.
                chartDataValues = []
                effect_size_values = []
                for obj in chart_data:
                    effect_size_values.append(obj["effect_size"])
                chart_data_min = min(effect_size_values)
                if chart_data_min < 0.00001:
                    for obj in chart_data:
                        chartDataValues.append(str(obj["effect_size"]))
                else:
                    for obj in chart_data:
                        chartDataValues.append(obj["effect_size"])
                chart_json = ChartJson(data=chart_data,
                                       axes={'x': 'dimension', 'y': 'effect_size'},
                                       label_text={'x': '', 'y': 'Effect Size (scaled exp values)'},
                                       chart_type='bar')
                chart_json.set_axis_rotation(True)
                chart_json.set_yaxis_number_format(
                    NarrativesUtils.select_y_axis_format(chartDataValues))
                statistical_info_array = [
                    ("Test Type", "ANOVA"),
                    ("Effect Size", "ETA squared"),
                    ("Max Effect Size", chart_data[0]["dimension"]),
                    ("Min Effect Size", chart_data[-1]["dimension"]),
                ]
                # BUG FIX: initializer was misspelled (`statistical_inferenc`),
                # so the variable tested below was only bound inside branches.
                statistical_inference = ""
                if len(chart_data) == 1:
                    statistical_inference = "{} is the only variable that have significant association with the {} (Target) having an Effect size of {}".format(
                        chart_data[0]["dimension"],
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["effect_size"], 4))
                elif len(chart_data) == 2:
                    statistical_inference = "There are two variables ({} and {}) that have significant association with the {} (Target) and the Effect size ranges are {} and {} respectively".format(
                        chart_data[0]["dimension"],
                        chart_data[1]["dimension"],
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["effect_size"], 4),
                        round(chart_data[1]["effect_size"], 4))
                else:
                    statistical_inference = "There are {} variables that have significant association with the {} (Target) and the Effect size ranges from {} to {}".format(
                        len(chart_data),
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["effect_size"], 4),
                        round(chart_data[-1]["effect_size"], 4))
                if statistical_inference != "":
                    statistical_info_array.append(
                        ("Inference", statistical_inference))
                statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                    statistical_info_array)
                lines += [C3ChartData(data=chart_json, info=statistical_info_array)]
                lines += NarrativesUtils.block_splitter(output1['content'],
                                                        self._blockSplitter)
                mainCard.set_card_data(lines)
                self._anovaNodes.add_a_card(mainCard)
                self.narratives['main_card'][AnovaNarratives.KEY_PARAGRAPH].append(output1)
                self.narratives['main_card'][AnovaNarratives.KEY_CHART] = {}
                effect_size_chart = {
                    'heading': '',
                    'labels': {'Dimension': 'Effect Size'},
                    'data': significant_dimensions_dict
                }
                print(significant_dimensions_dict)
                self.narratives['main_card'][AnovaNarratives.KEY_CHART]['effect_size'] = \
                    effect_size_chart
                progressMessage = CommonUtils.create_progress_message_object(
                    self._analysisName, "custom", "info",
                    "Analyzing Key Drivers", self._completionStatus,
                    self._completionStatus, display=True)
                CommonUtils.save_progress_message(self._messageURL,
                                                  progressMessage,
                                                  ignore=False)
                self._generate_dimension_narratives(significant_dimensions,
                                                    measure_anova_result,
                                                    measure_column)
            else:
                # No significant dimensions: emit a plain informational card.
                mainCard = NormalCard(name="Overview of Key Factors")
                cardText = HtmlData(
                    "There are no dimensions in the dataset that have significant influence on {}"
                    .format(measure_column))
                mainCard.set_card_data([cardText])
                self._anovaNodes.add_a_card(mainCard)

    def _generate_dimension_narratives(self, significant_dimensions,
                                       measure_anova_result, measure):
        """One drill-down narrative node per significant dimension."""
        self.narratives['cards'] = []
        anova_trend_result = measure_anova_result.get_trend_data()
        if len(significant_dimensions) == 0:
            self.narratives['cards'].append({
                'card1': '',
                'card2': '',
                'card3': ''
            })
        self.narratives['variables'] = significant_dimensions
        for dimension in significant_dimensions:
            dimensionNode = NarrativesTree(name=dimension)
            narratives = OneWayAnovaNarratives(self._dataframe_context, measure,
                                               dimension, measure_anova_result,
                                               anova_trend_result,
                                               self._result_setter,
                                               dimensionNode, self._base_dir)
            self._anovaNodes.add_a_node(dimensionNode)
            self.narratives['cards'].append(narratives)