class DimensionColumnNarrative:
    """Builds the frequency/distribution narrative for a single dimension column.

    The constructor does all the work: it generates the narrative cards, attaches
    them to the supplied ``story_narrative``/``result_setter`` trees, and posts
    progress messages through ``CommonUtils``.
    """

    # Maximum number of fraction digits used when rounding displayed numbers.
    MAX_FRACTION_DIGITS = 2

    def __init__(self, column_name, df_helper, df_context, freq_dimension_stats,
                 result_setter, story_narrative, scriptWeight=None, analysisName=None):
        """
        :param column_name: dimension column being narrated
        :param df_helper: dataframe helper (column metadata, row counts, binning info)
        :param df_context: dataframe context (app id, message URL, weights, flags)
        :param freq_dimension_stats: object exposing ``get_frequency_dict()`` (JSON string)
        :param result_setter: receives the generated head/distribution nodes
        :param story_narrative: narrative tree the summary node is attached to
        :param scriptWeight: optional override for the analysis weight dict
        :param analysisName: optional override for the analysis name
        """
        self._story_narrative = story_narrative
        self._result_setter = result_setter
        self._column_name = column_name.lower()
        self._colname = column_name
        # Capitalize only the first character; str.title()/capitalize() would
        # also lower-case the rest of the name.
        self._capitalized_column_name = "%s%s" % (column_name[0].upper(), column_name[1:])
        self._dimension_col_freq_dict = freq_dimension_stats.get_frequency_dict()
        self.header = None
        self.subheader = None
        self.count = {}
        self.summary = []
        self.analysis = []
        # get_frequency_dict() returns a JSON string, hence the loads().
        self.frequency_dict = json.loads(self._dimension_col_freq_dict)
        self.appid = df_context.get_app_id()
        # Template lookup directory; app-specific templates live in sub-folders.
        self._base_dir = "/dimensions/"
        if self.appid is not None:
            if self.appid == "1":
                self._base_dir += "appid1/"
            elif self.appid == "2":
                self._base_dir += "appid2/"
        self._dataframe_context = df_context
        self._dataframe_helper = df_helper
        self._storyOnScoredData = self._dataframe_context.get_story_on_scored_data()
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        self._dimensionSummaryNode = NarrativesTree()
        self._dimensionSummaryNode.set_name("Overview")
        self._headNode = NarrativesTree()
        self._headNode.set_name("Overview")
        self._completionStatus = self._dataframe_context.get_completion_status()
        if analysisName is None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName
        self._messageURL = self._dataframe_context.get_message_url()
        if scriptWeight is None:
            self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight()
        else:
            self._scriptWeightDict = scriptWeight
        self._scriptStages = {
            "initialization": {
                "summary": "Initialized the Frequency Narratives",
                "weight": 2
            },
            "summarygeneration": {
                "summary": "summary generation finished",
                "weight": 8
            },
            "completion": {
                "summary": "Frequency Stats Narratives done",
                "weight": 0
            },
        }
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context, self._scriptWeightDict, self._scriptStages,
            self._analysisName, "initialization", "info", weightKey="narratives")
        self._generate_narratives()
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context, self._scriptWeightDict, self._scriptStages,
            self._analysisName, "summarygeneration", "info", weightKey="narratives")
        self._story_narrative.add_a_node(self._dimensionSummaryNode)
        self._result_setter.set_head_node(self._headNode)
        self._result_setter.set_distribution_node(self._dimensionSummaryNode)
        # Fix: the original posted the identical "summarygeneration" progress
        # message a second time here, double-counting its weight. Removed.

    def _generate_narratives(self):
        """Drive narrative generation: title always; summary and analysis
        depending on app id and whether the story runs on scored data.

        Fix: the original had four branches (appid "1", appid "2", other appid,
        no appid) whose bodies collapsed to exactly two distinct behaviors;
        consolidated without changing which methods run in each case.
        """
        self._generate_title()
        # App-specific flows always build the summary; otherwise the summary
        # is skipped for stories generated on scored data.
        if self.appid in ("1", "2") or self._storyOnScoredData != True:
            self._generate_summary()
        self._generate_analysis()

    def _generate_title(self):
        """Set the report header from the column name."""
        self.header = '%s Performance Report' % (self._capitalized_column_name,)

    def _generate_summary(self):
        """Render the dataset-overview summary card and attach it to the story."""
        ignored_columns = self._dataframe_context.get_ignore_column_suggestions()
        if ignored_columns is None:
            ignored_columns = []
        data_dict = {
            "n_c": len(self._dataframe_helper.get_columns()),
            "n_m": len(self._dataframe_helper.get_numeric_columns()),
            "n_d": len(self._dataframe_helper.get_string_columns()),
            "n_td": len(self._dataframe_helper.get_timestamp_columns()),
            "c": self._column_name,
            "d": self._dataframe_helper.get_string_columns(),
            "m": self._dataframe_helper.get_numeric_columns(),
            "td": self._dataframe_helper.get_timestamp_columns(),
            "observations": self._dataframe_helper.get_num_rows(),
            "ignorecolumns": ignored_columns,
            "n_t": len(self._dataframe_helper.get_string_columns()) +
                   len(self._dataframe_helper.get_numeric_columns()) +
                   len(self._dataframe_helper.get_timestamp_columns()),
            "blockSplitter": self._blockSplitter
        }
        output = NarrativesUtils.get_template_output(
            self._base_dir, 'dimension_report_summary.html', data_dict)
        summary = NarrativesUtils.block_splitter(output, self._blockSplitter)
        dimensionSummaryCard = SummaryCard(name=self.header, slug=None, cardData=None)
        dimensionSummaryCard.set_no_of_measures(data_dict["n_m"])
        dimensionSummaryCard.set_no_of_dimensions(data_dict["n_d"])
        dimensionSummaryCard.set_no_of_time_dimensions(data_dict["n_td"])
        dimensionSummaryCard.set_summary_html(summary)
        dimensionSummaryCard.set_card_name("overall summary card")
        self._story_narrative.add_a_card(dimensionSummaryCard)
        self._headNode.add_a_card(dimensionSummaryCard)

    def _generate_analysis(self):
        """Build the distribution-analysis card (narrative blocks + bar chart)
        and register it on the summary node and result setter.

        Returns the list of content blocks for the card.
        """
        lines = []
        freq_dict = self._dimension_col_freq_dict
        freq_dict = json.loads(freq_dict)
        colname = self._colname
        freq_data = []
        # Fix: original used a Python-2 print statement; print() works on 2 and 3.
        print("self._dataframe_helper.get_cols_to_bin()",
              self._dataframe_helper.get_cols_to_bin())
        if colname in self._dataframe_helper.get_cols_to_bin():
            # Binned columns are presented in natural (alphanumeric) bin order.
            # Fix: list() is required — dict views have no .sort() on Python 3.
            keys_to_sort = list(freq_dict[colname][colname].values())
            convert = lambda text: int(text) if text.isdigit() else text
            alphanum_key = lambda key: [
                convert(c) for c in re.split('([0-9]+)', key)
            ]
            keys_to_sort.sort(key=alphanum_key)
            temp_dict = {}
            for k, v in freq_dict[colname][colname].items():
                temp_dict[v] = freq_dict[colname]["count"][k]
            for each in keys_to_sort:
                freq_data.append({"key": each, "Count": temp_dict[each]})
        else:
            # Unbinned columns are presented by descending frequency.
            for k, v in freq_dict[colname][colname].items():
                freq_data.append({
                    "key": v,
                    "Count": freq_dict[colname]["count"][k]
                })
            freq_data = sorted(freq_data, key=lambda x: x["Count"], reverse=True)
        data_dict = {"colname": self._colname}
        data_dict["plural_colname"] = pattern.en.pluralize(data_dict["colname"])
        count = freq_dict[colname]['count']
        max_key = max(count, key=count.get)
        min_key = min(count, key=count.get)
        data_dict["blockSplitter"] = self._blockSplitter
        data_dict["max"] = {
            "key": freq_dict[colname][colname][max_key],
            "val": count[max_key]
        }
        data_dict["min"] = {
            "key": freq_dict[colname][colname][min_key],
            "val": count[min_key]
        }
        # list() keeps this a real sequence on Python 3 (templates may index it).
        data_dict["keys"] = list(freq_dict[colname][colname].values())
        data_dict["avg"] = round(
            sum(count.values()) / float(len(count.values())), 2)
        data_dict["above_avg"] = [
            freq_dict[colname][colname][key] for key in count.keys()
            if count[key] > data_dict["avg"]
        ]
        data_dict["per_bigger_avg"] = round(
            data_dict["max"]["val"] / float(data_dict["avg"]), 4)
        data_dict["per_bigger_low"] = round(
            data_dict["max"]["val"] / float(data_dict["min"]["val"]), 4)
        uniq_val = list(set(count.values()))
        data_dict["n_uniq"] = len(uniq_val)
        if len(uniq_val) == 1:
            data_dict["count"] = uniq_val[0]
        if len(data_dict["keys"]) >= 3:
            # Find the smallest set of top categories covering >= 75% of rows.
            percent_75 = sum(count.values()) * 0.75
            kv = sorted(count.items(), key=operator.itemgetter(1), reverse=True)
            # Fix: removed a dead list-comprehension that was immediately
            # overwritten by the accumulation loop below.
            kv_75 = []
            temp_sum = 0
            for k, v in kv:
                temp_sum = temp_sum + v
                kv_75.append((freq_dict[colname][colname][k], v))
                if temp_sum >= percent_75:
                    break
            data_dict["percent_contr"] = round(
                temp_sum * 100.0 / float(sum(count.values())), 2)
            data_dict["kv_75"] = len(kv_75)
            data_dict["kv_75_cat"] = [k for k, v in kv_75]
        largest_text = " %s is the largest with %s observations" % (
            data_dict["max"]["key"],
            NarrativesUtils.round_number(data_dict["max"]["val"]))
        smallest_text = " %s is the smallest with %s observations" % (
            data_dict["min"]["key"],
            NarrativesUtils.round_number(data_dict["min"]["val"]))
        largest_per = round(
            data_dict["max"]["val"] * 100.0 / float(sum(count.values())), 2)
        data_dict['largest_per'] = largest_per
        smallest_per = round(
            data_dict["min"]["val"] * 100.0 / float(sum(count.values())), 2)
        self.count = {
            "largest": [largest_text, str(round(largest_per, 1)) + '%'],
            "smallest": [smallest_text, str(round(smallest_per, 1)) + '%']
        }
        # Fix: both branches of the original if/else assigned the same value.
        self.subheader = 'Distribution of ' + self._capitalized_column_name
        output1 = NarrativesUtils.get_template_output(
            self._base_dir, 'dimension_distribution1.html', data_dict)
        output1 = NarrativesUtils.block_splitter(output1, self._blockSplitter)
        output2 = NarrativesUtils.get_template_output(
            self._base_dir, 'dimension_distribution2.html', data_dict)
        output2 = NarrativesUtils.block_splitter(output2, self._blockSplitter)
        chart_data = NormalChartData(data=freq_data)
        chart_json = ChartJson()
        chart_json.set_data(chart_data.get_data())
        chart_json.set_chart_type("bar")
        chart_json.set_axes({"x": "key", "y": "Count"})
        chart_json.set_label_text({'x': ' ', 'y': 'No. of Observations'})
        chart_json.set_yaxis_number_format(".2s")
        lines += output1
        lines += [C3ChartData(data=chart_json)]
        lines += output2
        bubble_data = "<div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}%</span><br /><small>{}</small></h2></div><div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}%</span><br /><small>{}</small></h2></div>".format(
            largest_per, largest_text, smallest_per, smallest_text)
        lines.append(HtmlData(data=bubble_data))
        dimensionCard1 = NormalCard(name=self.subheader, slug=None, cardData=lines)
        self._dimensionSummaryNode.add_a_card(dimensionCard1)
        self._result_setter.set_score_freq_card(
            json.loads(
                CommonUtils.convert_python_object_to_json(dimensionCard1)))
        return lines

    def _generate_analysis2(self):
        """Alternate (snapshot) analysis: renders the two distribution templates
        without charts/cards and returns the raw HTML strings.
        """
        lines = []
        freq_dict = self._dimension_col_freq_dict
        freq_dict = json.loads(freq_dict)
        colname = self._colname
        data_dict = {"colname": self._colname}
        data_dict["plural_colname"] = pattern.en.pluralize(data_dict["colname"])
        count = freq_dict[colname]['count']
        max_key = max(count, key=count.get)
        min_key = min(count, key=count.get)
        data_dict["max"] = {
            "key": freq_dict[colname][colname][max_key],
            "val": count[max_key]
        }
        data_dict["min"] = {
            "key": freq_dict[colname][colname][min_key],
            "val": count[min_key]
        }
        data_dict["keys"] = list(freq_dict[colname][colname].values())
        data_dict["avg"] = round(
            sum(count.values()) / float(len(count.values())), 2)
        data_dict["above_avg"] = [
            freq_dict[colname][colname][key] for key in count.keys()
            if count[key] > data_dict["avg"]
        ]
        data_dict["per_bigger_avg"] = round(
            data_dict["max"]["val"] / float(data_dict["avg"]), 2)
        data_dict["per_bigger_low"] = round(
            data_dict["max"]["val"] / float(data_dict["min"]["val"]), 2)
        uniq_val = list(set(count.values()))
        data_dict["n_uniq"] = len(uniq_val)
        if len(uniq_val) == 1:
            data_dict["count"] = uniq_val[0]
        if len(data_dict["keys"]) >= 2:
            percent_75 = sum(count.values()) * 0.75
            kv = sorted(count.items(), key=operator.itemgetter(1), reverse=True)
            # Fix: removed the dead list-comprehension (same defect as in
            # _generate_analysis). Note this variant walks kv[:-1], i.e. it
            # never includes the rarest category.
            kv_75 = []
            temp_sum = 0
            for k, v in kv[:-1]:
                temp_sum = temp_sum + v
                kv_75.append((freq_dict[colname][colname][k], v))
                if temp_sum >= percent_75:
                    break
            data_dict["percent_contr"] = round(
                temp_sum * 100 / float(sum(count.values())), 2)
            data_dict["kv_75"] = len(kv_75)
            data_dict["kv_75_cat"] = [k for k, v in kv_75]
        largest_text = " %s is the largest with %s observations" % (
            data_dict["max"]["key"],
            str(NarrativesUtils.round_number(data_dict["max"]["val"])))
        smallest_text = " %s is the smallest with %s observations" % (
            data_dict["min"]["key"],
            str(NarrativesUtils.round_number(data_dict["min"]["val"])))
        largest_per = NarrativesUtils.round_number(
            data_dict["max"]["val"] / float(sum(count.values())), 2) * 100
        smallest_per = NarrativesUtils.round_number(
            data_dict["min"]["val"] / float(sum(count.values())), 2) * 100
        data_dict['largest_per'] = largest_per
        self.count = {
            "largest": [largest_text, str(round(largest_per, 0)) + '%'],
            "smallest": [smallest_text, str(round(smallest_per, 0)) + '%']
        }
        self.subheader = "Snapshot of " + data_dict["colname"]
        output1 = NarrativesUtils.get_template_output(
            self._base_dir, 'dimension_distribution1.html', data_dict)
        output2 = NarrativesUtils.get_template_output(
            self._base_dir, 'dimension_distribution2.html', data_dict)
        lines.append(output1)
        lines.append(output2)
        return lines
def generate_narratives(self):
    """Build the linear-regression narrative: a "Key Influencers" main card plus
    one node per significant measure, all attached to ``self._regressionNode``.

    NOTE(review): this is a method of a regression-narrative class whose
    definition is outside this chunk; it relies on attributes
    (``self._df_regression_result``, ``self.narratives``, ``self.result_column``,
    ``self.significant_measures`` ...) presumably set in that class's __init__.
    """
    regression_narrative_obj = LinearRegressionNarrative(
        self._df_regression_result,
        self._correlations,
        self._dataframe_helper,
        self._dataframe_context,
        self._metaParser,
        self._spark
    )
    main_card_data = regression_narrative_obj.generate_main_card_data()
    main_card_narrative = NarrativesUtils.get_template_output(self._base_dir,
        'regression_main_card.html', main_card_data)
    # Legacy dict-shaped narrative kept alongside the card objects below.
    self.narratives['main_card'] = {}
    self.narratives["main_card"]['paragraphs'] = NarrativesUtils.paragraph_splitter(main_card_narrative)
    self.narratives["main_card"]['header'] = 'Key Measures that affect ' + self.result_column
    self.narratives["main_card"]['chart'] = {}
    self.narratives["main_card"]['chart']['heading'] = ''
    # Chart data: parallel lists of coefficient names and values.
    self.narratives["main_card"]['chart']['data'] = [[i for i, j in self._all_coeffs],
                                                     [j['coefficient'] for i, j in self._all_coeffs]]
    self.narratives["main_card"]['chart']['label'] = {'x': 'Measure Name',
                                                      'y': 'Change in ' + self.result_column + ' per unit increase'}
    # Card-object representation of the same content.
    main_card = NormalCard()
    main_card_header = HtmlData(data='<h3>Key Measures that affect ' + self.result_column + "</h3>")
    main_card_paragraphs = NarrativesUtils.block_splitter(main_card_narrative, self._blockSplitter)
    main_card_chart_data = [{"key": val[0], "value": val[1]} for val in
                            zip([i for i, j in self._all_coeffs],
                                [j['coefficient'] for i, j in self._all_coeffs])]
    main_card_chart = NormalChartData(data=main_card_chart_data)
    mainCardChartJson = ChartJson()
    mainCardChartJson.set_data(main_card_chart.get_data())
    mainCardChartJson.set_label_text({'x': 'Influencing Factors',
                                      'y': 'Change in ' + self.result_column + ' per unit increase'})
    mainCardChartJson.set_chart_type("bar")
    mainCardChartJson.set_axes({"x": "key", "y": "value"})
    mainCardChartJson.set_yaxis_number_format(".2f")
    # Coefficients sorted descending so [0] is the max effect, [-1] the min.
    chart_data = sorted(main_card_chart_data, key=lambda x: x["value"], reverse=True)
    statistical_info_array = [
        ("Test Type", "Regression"),
        ("Effect Size", "Coefficients"),
        ("Max Effect Size", chart_data[0]["key"]),
        ("Min Effect Size", chart_data[-1]["key"]),
        ]
    # NOTE(review): likely a typo — 'statistical_inferenc' is never read; every
    # branch below assigns 'statistical_inference', so this line is dead.
    statistical_inferenc = ""
    if len(chart_data) == 1:
        statistical_inference = "{} is the only variable that have significant influence over {} (Target) having an \
Effect size of {}".format(chart_data[0]["key"], self._dataframe_context.get_result_column(), round(chart_data[0]["value"], 4))
    elif len(chart_data) == 2:
        statistical_inference = "There are two variables ({} and {}) that have significant influence over {} (Target) and the \
Effect size ranges are {} and {} respectively".format(chart_data[0]["key"], chart_data[1]["key"], self._dataframe_context.get_result_column(), round(chart_data[0]["value"], 4), round(chart_data[1]["value"], 4))
    else:
        statistical_inference = "There are {} variables that have significant influence over {} (Target) and the \
Effect size ranges from {} to {}".format(len(chart_data), self._dataframe_context.get_result_column(), round(chart_data[0]["value"], 4), round(chart_data[-1]["value"], 4))
    if statistical_inference != "":
        statistical_info_array.append(("Inference", statistical_inference))
    statistical_info_array = NarrativesUtils.statistical_info_array_formatter(statistical_info_array)
    main_card.set_card_data(data=[main_card_header] + main_card_paragraphs +
                            [C3ChartData(data=mainCardChartJson, info=statistical_info_array)])
    main_card.set_card_name("Key Influencers")
    self._regressionNode.add_a_card(main_card)
    # Per-measure narrative nodes.
    count = 0
    for measure_column in self.significant_measures:
        sigMeasureNode = NarrativesTree()
        sigMeasureNode.set_name(measure_column)
        measureCard1 = NormalCard()
        measureCard1.set_card_name("{}: Impact on {}".format(measure_column, self.result_column))
        measureCard1Data = []
        if self._run_dimension_level_regression:
            measureCard2 = NormalCard()
            measureCard2.set_card_name("Key Areas where it Matters")
            measureCard2Data = []
        measure_column_cards = {}
        card0 = {}
        # Card 1: impact of this measure on the target column.
        card1data = regression_narrative_obj.generate_card1_data(measure_column)
        card1heading = "<h3>Impact of " + measure_column + " on " + self.result_column + "</h3>"
        measureCard1Header = HtmlData(data=card1heading)
        card1data.update({"blockSplitter": self._blockSplitter})
        card1narrative = NarrativesUtils.get_template_output(self._base_dir,
            'regression_card1.html', card1data)
        card1paragraphs = NarrativesUtils.block_splitter(card1narrative, self._blockSplitter)
        card0 = {"paragraphs": card1paragraphs}
        card0["charts"] = {}
        card0['charts']['chart2'] = {}
        # card0['charts']['chart2']['data']=card1data["chart_data"]
        # card0['charts']['chart2']['heading'] = ''
        # card0['charts']['chart2']['labels'] = {}
        card0['charts']['chart1'] = {}
        card0["heading"] = card1heading
        measure_column_cards['card0'] = card0
        measureCard1Header = HtmlData(data=card1heading)
        measureCard1Data += [measureCard1Header]
        measureCard1para = card1paragraphs
        measureCard1Data += measureCard1para
        if self._run_dimension_level_regression:
            # Card 2: per-dimension-level regression ("key areas").
            print("running narratives for key area dict")
            self._dim_regression = self.run_regression_for_dimension_levels()
            card2table, card2data = regression_narrative_obj.generate_card2_data(measure_column, self._dim_regression)
            card2data.update({"blockSplitter": self._blockSplitter})
            card2narrative = NarrativesUtils.get_template_output(self._base_dir,
                'regression_card2.html', card2data)
            card2paragraphs = NarrativesUtils.block_splitter(card2narrative, self._blockSplitter)
            card1 = {'tables': card2table,
                     'paragraphs': card2paragraphs,
                     'heading': 'Key Areas where ' + measure_column + ' matters'}
            measure_column_cards['card1'] = card1
            measureCard2Data += card2paragraphs
            if "table1" in card2table:
                table1data = regression_narrative_obj.convert_table_data(card2table["table1"])
                card2Table1 = TableData()
                card2Table1.set_table_data(table1data)
                card2Table1.set_table_type("heatMap")
                card2Table1.set_table_top_header(card2table["table1"]["heading"])
                # Cards are stored as plain JSON dicts, not TableData objects.
                card2Table1Json = json.loads(CommonUtils.convert_python_object_to_json(card2Table1))
                # measureCard2Data.insert(3,card2Table1)
                measureCard2Data.insert(3, card2Table1Json)
            if "table2" in card2table:
                table2data = regression_narrative_obj.convert_table_data(card2table["table2"])
                card2Table2 = TableData()
                card2Table2.set_table_data(table2data)
                card2Table2.set_table_type("heatMap")
                card2Table2.set_table_top_header(card2table["table2"]["heading"])
                # measureCard2Data.insert(5,card2Table2)
                card2Table2Json = json.loads(CommonUtils.convert_python_object_to_json(card2Table2))
                # measureCard2Data.append(card2Table2)
                measureCard2Data.append(card2Table2Json)
        # (A commented-out block integrating TimeSeriesNarrative trend cards
        #  was here in the original; removed for brevity — card2/trend data is
        #  not produced by the current code path.)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName, "custom", "info", "Analyzing Key Influencers", self._completionStatus, self._completionStatus, display=True)
        CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=False)
        # Card 4: sensitivity analysis of the target across measure segments.
        card4data = regression_narrative_obj.generate_card4_data(self.result_column, measure_column)
        card4data.update({"blockSplitter": self._blockSplitter})
        # card4heading = "Sensitivity Analysis: Effect of "+self.result_column+" on Segments of "+measure_column
        card4narrative = NarrativesUtils.get_template_output(self._base_dir,
            'regression_card4.html', card4data)
        card4paragraphs = NarrativesUtils.block_splitter(card4narrative, self._blockSplitter)
        # card3 = {"paragraphs":card4paragraphs}
        card0['paragraphs'] = card1paragraphs + card4paragraphs
        card4Chart = card4data["charts"]
        statistical_info_array = [
            ("Test Type", "Regression"),
            ("Coefficient", str(round(self._df_regression_result.get_coeff(measure_column), 2))),
            ("P-Value", "<= 0.05"),
            ("Intercept", str(round(self._df_regression_result.get_intercept(), 2))),
            ("R Square ", str(round(self._df_regression_result.get_rsquare(), 2))),
            ]
        inferenceTuple = ()
        coeff = self._df_regression_result.get_coeff(measure_column)
        # Direction of the relationship decides the inference wording.
        if coeff > 0:
            inferenceTuple = ("Inference", "For every additional unit of increase in {} there will be an increase of {} units in {} (target).".format(measure_column, str(round(coeff, 2)), self._dataframe_context.get_result_column()))
        else:
            inferenceTuple = ("Inference", "For every additional unit of decrease in {} there will be an decrease of {} units in {} (target).".format(measure_column, str(round(coeff, 2)), self._dataframe_context.get_result_column()))
        if len(inferenceTuple) > 0:
            statistical_info_array.append(inferenceTuple)
        statistical_info_array = NarrativesUtils.statistical_info_array_formatter(statistical_info_array)
        card4paragraphs.insert(2, C3ChartData(data=card4Chart, info=statistical_info_array))
        measureCard1Data += card4paragraphs
        self.narratives['cards'].append(measure_column_cards)
        # Only the first measure's card4 data feeds the executive summary;
        # its charts are stripped first.
        if count == 0:
            card4data.pop("charts")
            self._result_setter.update_executive_summary_data(card4data)
        count += 1
        measureCard1.set_card_data(measureCard1Data)
        if self._run_dimension_level_regression:
            measureCard2.set_card_data(measureCard2Data)
            sigMeasureNode.add_cards([measureCard1, measureCard2])
        # NOTE(review): when _run_dimension_level_regression is true,
        # measureCard1 is added twice (above and here) — looks like this line
        # was meant to be the 'else' branch; confirm before changing.
        sigMeasureNode.add_cards([measureCard1])
        self._regressionNode.add_a_node(sigMeasureNode)
    # self._result_setter.set_trend_section_completion_status(True)
    self._story_narrative.add_a_node(self._regressionNode)
class MeasureColumnNarrative(object):
    """Builds the descriptive-statistics narrative for a single measure column.

    The constructor generates all cards (summary, distribution analysis,
    histogram chart, five-point-summary table, take-away) and attaches them to
    the supplied ``story_narrative``/``result_setter`` trees, posting progress
    messages before and after.
    """

    # Maximum number of fraction digits used when rounding displayed numbers.
    MAX_FRACTION_DIGITS = 2

    def __init__(self, data_frame, column_name, measure_descr_stats, df_helper, df_context, result_setter, story_narrative, scriptWeight=None, analysisName=None):
        """
        :param data_frame: pandas or Spark dataframe (both are handled below)
        :param column_name: measure column being narrated
        :param measure_descr_stats: descriptive stats object (histogram, min/max,
            mean, total, five-point summary ...)
        :param df_helper: dataframe helper (column lists, row counts)
        :param df_context: dataframe context (flags, weights, message URL)
        :param result_setter: receives the generated head/distribution nodes
        :param story_narrative: narrative tree the summary node is attached to
        :param scriptWeight: optional override for the analysis weight dict
        :param analysisName: optional override for the analysis name
        """
        self._story_narrative = story_narrative
        self._result_setter = result_setter
        self._column_name = column_name.lower()
        # Capitalize only the first character, preserving the rest of the name.
        self._capitalized_column_name = "%s%s" % (column_name[0].upper(), column_name[1:])
        self._measure_descr_stats = measure_descr_stats
        self._five_point_summary_stats = measure_descr_stats.get_five_point_summary_stats()
        self._data_frame = data_frame
        # pandas frames expose .shape; Spark frames raise and fall back to
        # .count(). NOTE(review): the bare except also hides unrelated errors.
        try:
            self._total_rows = self._data_frame.shape[0]
        except:
            self._total_rows = self._data_frame.count()
        # self._histogram = measure_descr_stats.get_histogram()
        # self._num_columns = context.get_column_count()
        # self._num_rows = context.get_row_count()
        # self._measures = context.get_measures()
        # self._dimensions = context.get_dimensions()
        # self._time_dimensions = context.get_time_dimension()
        self._dataframe_helper = df_helper
        self._dataframe_context = df_context
        self._pandas_flag = self._dataframe_context._pandas_flag
        self._storyOnScoredData = self._dataframe_context.get_story_on_scored_data()
        self.title = None
        self.heading = self._capitalized_column_name + ' Performance Analysis'
        self.sub_heading = "Distribution of " + self._capitalized_column_name
        self.summary = None
        self._analysis1 = None
        self._analysis2 = None
        self.analysis = None
        self.take_away = None
        # card2 stays '' unless _generate_analysis_para2 finds notable skew.
        self.card2 = ''
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        self._highlightFlag = "|~HIGHLIGHT~|"
        self._base_dir = "/descriptive/"
        self.num_measures = len(self._dataframe_helper.get_numeric_columns())
        self.num_dimensions = len(self._dataframe_helper.get_string_columns())
        self.num_time_dimensions = len(self._dataframe_helper.get_timestamp_columns())
        self._completionStatus = self._dataframe_context.get_completion_status()
        self._messageURL = self._dataframe_context.get_message_url()
        if analysisName == None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName
        if scriptWeight == None:
            self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight()
        else:
            self._scriptWeightDict = scriptWeight
        self._scriptStages = {
            "statNarrativeStart": {
                "summary": "Started The Descriptive Stats Narratives",
                "weight": 0
                },
            "statNarrativeEnd": {
                "summary": "Narratives For Descriptive Stats Finished",
                "weight": 10
                },
            }
        CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._analysisName, "statNarrativeStart", "info", display=False, emptyBin=False, customMsg=None, weightKey="narratives")
        self._measureSummaryNode = NarrativesTree()
        self._headNode = NarrativesTree()
        self._headNode.set_name("Overview")
        # All narrative content is generated here as a constructor side effect.
        self._generate_narratives()
        self._story_narrative.add_a_node(self._measureSummaryNode)
        self._result_setter.set_head_node(self._headNode)
        self._result_setter.set_distribution_node(self._measureSummaryNode)
        CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._analysisName, "statNarrativeEnd", "info", display=False, emptyBin=False, customMsg=None, weightKey="narratives")

    def _get_c3_histogram(self):
        """Convert the measure's histogram buckets into a C3 bar-chart object.

        Each bucket is labeled by its (rounded) upper edge, e.g. "< 1,200.5".
        """
        data = self._measure_descr_stats.get_histogram()
        data_c3 = []
        for bin in data:
            data_c3.append({'bin_name': '< ' + humanize.intcomma(round(bin['end_value'], 2)),
                            'Count': bin['num_records']})
        data_c3 = NormalChartData(data_c3)
        chartObj = ChartJson(data=data_c3.get_data(),
                             axes={'x': 'bin_name', 'y': 'Count'},
                             label_text={'x': '', 'y': 'No. of Observations'},
                             chart_type='bar')
        chartObj.set_yaxis_number_format(".2s")
        return chartObj

    def _generate_narratives(self):
        """Assemble the distribution card: title, optional summary (skipped for
        scored-data stories), two analysis paragraphs, histogram chart,
        five-point-summary table, optional skew card, and the take-away.
        """
        lines = []
        self._generate_title()
        if self._storyOnScoredData != True:
            self._generate_summary()
        self._analysis1 = self._generate_analysis_para1()
        self._analysis2 = self._generate_analysis_para2()
        lines += NarrativesUtils.block_splitter(self._analysis1, self._blockSplitter)
        lines += [C3ChartData(self._get_c3_histogram())]
        # Five-point summary table: header row then value row.
        self._tableData = [['Minimum', 'Quartile 1', 'Median', 'Quartile 3', 'Maximum'],
                           [NarrativesUtils.round_number(self._measure_descr_stats.get_min()),
                            NarrativesUtils.round_number(self._five_point_summary_stats.get_q1_split()),
                            NarrativesUtils.round_number(self._five_point_summary_stats.get_q2_split()),
                            NarrativesUtils.round_number(self._five_point_summary_stats.get_q3_split()),
                            NarrativesUtils.round_number(self._measure_descr_stats.get_max())]]
        lines += [TableData({'tableType': 'normal', 'tableData': self._tableData})]
        lines += NarrativesUtils.block_splitter(self._analysis2, self._blockSplitter)
        # card2 is only populated when the distribution is noticeably skewed.
        if self.card2 != '':
            lines += self.card2['data']['content']
        measureCard1 = NormalCard(name=self.sub_heading, slug=None, cardData=lines)
        self._measureSummaryNode.add_a_card(measureCard1)
        self._measureSummaryNode.set_name("Overview")
        self.analysis = [self._analysis1, self._analysis2]
        self.take_away = self._generate_take_away()

    def _generate_title(self):
        """Set the report title from the column name."""
        self.title = '%s Performance Report' % (self._capitalized_column_name,)

    def _generate_summary(self):
        """Render the dataset-overview summary card.

        String columns are re-classified on the fly: those whose sampled values
        parse as datetimes are counted as time dimensions (l1), the rest as
        plain dimensions (l2). Separate code paths handle pandas vs Spark.
        """
        ignored_columns = self._dataframe_context.get_ignore_column_suggestions()
        if ignored_columns == None:
            ignored_columns = []
        metaHelperInstance = MetaDataHelper(self._data_frame, self._total_rows)
        sampleData = metaHelperInstance.get_sample_data()
        # Spark sample needs converting; pandas sample raises and is kept as-is.
        try:
            sampleData = sampleData.toPandas()
        except:
            pass
        l1 = []  # string columns that look like datetimes
        l2 = []  # remaining string columns
        if self._pandas_flag:
            for column in self._dataframe_helper.get_string_columns():
                uniqueVals = sampleData[column].unique().tolist()
                # Probe the max value first; only then classify on all values.
                if len(uniqueVals) > 0 and metaHelperInstance.get_datetime_format_pandas([self._data_frame[column].sort_values(ascending=False)[0]]) != None:
                    dateColumnFormat = metaHelperInstance.get_datetime_format_pandas(uniqueVals)
                    l1.append(column)
                else:
                    dateColumnFormat = None
                    l2.append(column)
            # l1 = self._dataframe_helper.get_timestamp_columns()
            # l2 = self._dataframe_helper.get_string_columns()
        else:
            for column in self._dataframe_helper.get_string_columns():
                uniqueVals = sampleData[column].unique().tolist()
                if len(uniqueVals) > 0 and metaHelperInstance.get_datetime_format([self._data_frame.orderBy([column], ascending=[False]).select(column).first()[0]]) != None:
                    dateColumnFormat = metaHelperInstance.get_datetime_format(uniqueVals)
                    l1.append(column)
                else:
                    dateColumnFormat = None
                    l2.append(column)
        data_dict = {"n_c": self._dataframe_helper.get_num_columns(),
                     "n_m": len(self._dataframe_helper.get_numeric_columns()),
                     "n_d": len(l2),
                     "n_td": len(l1),
                     "c": self._column_name,
                     "d": l2,
                     "m": self._dataframe_helper.get_numeric_columns(),
                     "td": l1,
                     "observations": self._dataframe_helper.get_num_rows(),
                     "ignorecolumns": ignored_columns,
                     "n_t": len(self._dataframe_helper.get_string_columns()) + len(self._dataframe_helper.get_numeric_columns()) + len(self._dataframe_helper.get_timestamp_columns())
                     # "n_t" : self._dataframe_helper.get_num_columns()+len(ignored_columns)
                     }
        self.summary = NarrativesUtils.get_template_output(self._base_dir,
            'descr_stats_summary.html', data_dict)
        MeasureSummaryCard = SummaryCard(name='Summary', slug=None, cardData=None)
        MeasureSummaryCard.set_no_of_measures(data_dict["n_m"])
        MeasureSummaryCard.set_no_of_dimensions(data_dict["n_d"])
        MeasureSummaryCard.set_no_of_time_dimensions(data_dict["n_td"])
        MeasureSummaryCard.set_summary_html(NarrativesUtils.block_splitter(self.summary, self._blockSplitter))
        self._story_narrative.add_a_card(MeasureSummaryCard)
        self._headNode.add_a_card(MeasureSummaryCard)

    def _generate_analysis_para1(self):
        """Render the first analysis paragraph (overall distribution stats:
        min/max, outliers, total, average). Returns the rendered HTML string.
        """
        output = 'Para1 entered'
        data_dict = {"cols": self._dataframe_helper.get_num_columns(),
                     "min": int(round(self._measure_descr_stats.get_min(), 0)),
                     "max": int(round(self._measure_descr_stats.get_max(), 0)),
                     "n": self._five_point_summary_stats.get_num_outliers(),
                     "l": self._five_point_summary_stats.get_left_outliers(),
                     "r": self._five_point_summary_stats.get_right_outliers(),
                     "m": self._dataframe_helper.get_numeric_columns(),
                     "total": NarrativesUtils.round_number(self._measure_descr_stats.get_total(), 0),
                     "avg": NarrativesUtils.round_number(self._measure_descr_stats.get_mean(), 2),
                     "o": self._five_point_summary_stats.get_num_outliers(),
                     "col_name": self._column_name,
                     'rows': self._dataframe_helper.get_num_rows()
                     }
        output = NarrativesUtils.get_template_output(self._base_dir,
            'distribution_narratives.html', data_dict)
        return output

    def _generate_analysis_para2(self):
        """Render the histogram-narrative paragraph and, when the distribution
        is noticeably skewed (|skew| > 0.1), populate ``self.card2`` with the
        "Concentration of High & Low segments" content and cumulative chart.

        The nested loops search for the smallest run of adjacent histogram
        buckets covering at least 75% of rows (same algorithm is repeated in
        _generate_take_away).
        """
        output = 'Para2 entered'
        histogram_buckets = self._measure_descr_stats.get_histogram()
        print(histogram_buckets)
        print("$" * 200)
        threshold = self._dataframe_helper.get_num_rows() * 0.75
        s = 0          # best bucket-run total found so far
        start = 0      # start index of the best run
        end = len(histogram_buckets)  # end index; clamped later before use
        flag = 0
        # Try runs of increasing width; stop at the first width that can
        # reach the 75% threshold.
        for bin_size in range(1, len(histogram_buckets)):
            s_t = 0
            for i in range(len(histogram_buckets) - bin_size + 1):
                s_t = 0
                for j in range(i, i + bin_size):
                    s_t = s_t + histogram_buckets[j]['num_records']
                if (s_t >= threshold) and (s_t > s):
                    s = s_t
                    start = i
                    end = i + bin_size - 1
                    flag = 1
            if (flag == 1):
                break
        # NOTE(review): old_div presumably comes from past.utils (py2 floor
        # division semantics for ints) — confirm against the file's imports.
        bin_size_75 = old_div((end - start + 1) * 100, len(histogram_buckets))
        s = old_div(s * 100, self._dataframe_helper.get_num_rows())
        print(histogram_buckets)
        print("=" * 120)
        start_value = histogram_buckets[start]['start_value']
        print(start, end)
        # Clamp: if no qualifying run was found, 'end' is still len(buckets).
        if end >= len(histogram_buckets):
            end = len(histogram_buckets) - 1
        print(start, end)
        end_value = histogram_buckets[end]['end_value']
        # Lowest/highest counts among the first few buckets.
        # NOTE(review): assumes at least 2 histogram buckets exist.
        if len(histogram_buckets) > 2:
            lowest = min(histogram_buckets[0]['num_records'], histogram_buckets[1]['num_records'], histogram_buckets[2]['num_records'])
            highest = max(histogram_buckets[0]['num_records'], histogram_buckets[1]['num_records'], histogram_buckets[2]['num_records'])
        else:
            lowest = min(histogram_buckets[0]['num_records'], histogram_buckets[1]['num_records'])
            highest = max(histogram_buckets[0]['num_records'], histogram_buckets[1]['num_records'])
        quartile_sums = self._five_point_summary_stats.get_sums()
        quartile_means = self._five_point_summary_stats.get_means()
        print(quartile_means)
        quartile_frequencies = self._five_point_summary_stats.get_frequencies()
        total = self._measure_descr_stats.get_total()
        avg = self._measure_descr_stats.get_mean()
        counts = self._measure_descr_stats.get_num_values()
        data_dict = {"histogram": histogram_buckets,
                     "per_cont_hist1": NarrativesUtils.round_number(old_div(histogram_buckets[0]['num_records'] * 100, self._measure_descr_stats.get_total()), MeasureColumnNarrative.MAX_FRACTION_DIGITS),
                     "per_cont_hist2": NarrativesUtils.round_number(old_div(histogram_buckets[1]['num_records'] * 100, self._measure_descr_stats.get_total()), MeasureColumnNarrative.MAX_FRACTION_DIGITS),
                     "lowest_cont": NarrativesUtils.round_number(old_div(lowest * 100, self._measure_descr_stats.get_total()), MeasureColumnNarrative.MAX_FRACTION_DIGITS),
                     "highest_cont": NarrativesUtils.round_number(old_div(highest * 100, self._measure_descr_stats.get_total()), MeasureColumnNarrative.MAX_FRACTION_DIGITS),
                     "num_bins": len(histogram_buckets),
                     "seventy_five": bin_size_75,
                     "col_name": self._column_name,
                     "skew": self._measure_descr_stats.get_skew(),
                     "three_quarter_percent": round(s, 2),
                     "start_value": start_value,
                     "end_value": end_value,
                     "measure_colname": self._column_name,
                     "q4_cont": NarrativesUtils.round_number(old_div(quartile_frequencies['q4'] * 100.0, counts), 2),
                     "q1_cont": NarrativesUtils.round_number(old_div(quartile_frequencies['q1'] * 100.0, counts), 2),
                     "q4_frac": NarrativesUtils.round_number(old_div(quartile_sums['q4'] * 100.0, total), 2),
                     "q1_frac": NarrativesUtils.round_number(old_div(quartile_sums['q1'] * 100.0, total), 2),
                     "q4_sum": NarrativesUtils.round_number(quartile_sums['q4'], 2),
                     "q4_mean": NarrativesUtils.round_number(quartile_means['q4'], 2),
                     "q1_sum": NarrativesUtils.round_number(quartile_sums['q1'], 2),
                     "q4_overall_mean": round(old_div(quartile_means['q4'] * 1.0, avg), 2),
                     "total": NarrativesUtils.round_number(total, 2),
                     "avg": NarrativesUtils.round_number(avg, 2),
                     "highlightFlag": self._highlightFlag,
                     "blockSplitter": self._blockSplitter
                     }
        # Guard against a zero/empty q1 mean (division error -> None).
        try:
            data_dict["q4_q1_mean"] = round(old_div(quartile_means['q4'] * 1.0, quartile_means['q1']), 1)
        except:
            data_dict["q4_q1_mean"] = None
        self._result_setter.update_executive_summary_data({"skew": data_dict["skew"]})
        if abs(self._measure_descr_stats.get_skew()) > 0.1:
            content = NarrativesUtils.get_template_output(self._base_dir,
                'descriptive_card2.html', data_dict)
            blocks = NarrativesUtils.block_splitter(content, self._blockSplitter, highlightFlag=self._highlightFlag)
            self.card2 = {}
            self.card2['data'] = {
                'heading': 'Concentration of High & Low segments',
                'content': blocks
            }
            quartiles = ['q1', 'q2', 'q3', 'q4']
            # Cumulative "% of observations" vs "% of total value" curve.
            observations = [0.0] + [old_div(quartile_frequencies[i] * 100.0, counts) for i in quartiles]
            totals = [0.0] + [old_div(quartile_sums[i] * 100.0, total) for i in quartiles]
            chart = {'x-label': '% of Observations',
                     'y-label': '% of Total ' + self._column_name + ' (Cumulative)',
                     'x': list(NarrativesUtils.accumu(observations)),
                     'y': list(NarrativesUtils.accumu(totals))}
            self.card2['chart'] = chart
        output = NarrativesUtils.get_template_output(self._base_dir,
            'histogram_narrative.html', data_dict)
        return output

    def _generate_take_away(self):
        """Render the take-away paragraph, reusing the same 75%-coverage bucket
        search as _generate_analysis_para2. Only rendered when there are more
        than 3 histogram buckets; otherwise returns the placeholder string.
        """
        output = 'Takeaway entered'
        histogram_buckets = self._measure_descr_stats.get_histogram()
        threshold = self._dataframe_helper.get_num_rows() * 0.75
        s = 0
        start = 0
        end = len(histogram_buckets)
        flag = 0
        for bin_size in range(1, len(histogram_buckets)):
            s_t = 0
            for i in range(len(histogram_buckets) - bin_size + 1):
                s_t = 0
                for j in range(i, i + bin_size):
                    s_t = s_t + histogram_buckets[j]['num_records']
                if (s_t >= threshold) and (s_t > s):
                    s = s_t
                    start = i
                    end = i + bin_size - 1
                    flag = 1
            if (flag == 1):
                break
        bin_size_75 = old_div((end - start + 1) * 100, len(histogram_buckets))
        s = old_div(s * 100, self._dataframe_helper.get_num_rows())
        start_value = histogram_buckets[start]['start_value']
        # Clamp the unfound-run sentinel before indexing.
        if end >= len(histogram_buckets):
            end = len(histogram_buckets) - 1
        end_value = histogram_buckets[end]['end_value']
        data_dict = {"num_bins": len(histogram_buckets),
                     "seventy_five": bin_size_75,
                     "col_name": self._column_name,
                     "c_col_name": self._capitalized_column_name,
                     "skew": self._measure_descr_stats.get_skew(),
                     "start": start_value,
                     "end": end_value
                     }
        if (len(histogram_buckets) > 3):
            output = NarrativesUtils.get_template_output(self._base_dir,
                'histogram_takeaway.html', data_dict)
        return output
def Train(self): st_global = time.time() CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") algosToRun = self._dataframe_context.get_algorithms_to_run() algoSetting = [ x for x in algosToRun if x.get_algorithm_slug() == self._slug ][0] categorical_columns = self._dataframe_helper.get_string_columns() uid_col = self._dataframe_context.get_uid_column() if self._metaParser.check_column_isin_ignored_suggestion(uid_col): categorical_columns = list(set(categorical_columns) - {uid_col}) allDateCols = self._dataframe_context.get_date_columns() categorical_columns = list(set(categorical_columns) - set(allDateCols)) numerical_columns = self._dataframe_helper.get_numeric_columns() result_column = self._dataframe_context.get_result_column() categorical_columns = [ x for x in categorical_columns if x != result_column ] appType = self._dataframe_context.get_app_type() model_path = self._dataframe_context.get_model_path() if model_path.startswith("file"): model_path = model_path[7:] validationDict = self._dataframe_context.get_validation_dict() print("model_path", model_path) pipeline_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/pipeline/" model_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/model" pmml_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/modelPmml" df = self._data_frame levels = df.select(result_column).distinct().count() appType = self._dataframe_context.get_app_type() model_filepath = model_path + "/" + self._slug + "/model" pmml_filepath = str(model_path) + "/" + str( self._slug) + "/traindeModel.pmml" CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "training", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") st = 
time.time() pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns, categorical_columns, result_column) trainingData, validationData = MLUtils.get_training_and_validation_data( df, result_column, 0.8) # indexed labelIndexer = StringIndexer(inputCol=result_column, outputCol="label") # OriginalTargetconverter = IndexToString(inputCol="label", outputCol="originalTargetColumn") # Label Mapping and Inverse labelIdx = labelIndexer.fit(trainingData) labelMapping = {k: v for k, v in enumerate(labelIdx.labels)} inverseLabelMapping = { v: float(k) for k, v in enumerate(labelIdx.labels) } if self._dataframe_context.get_trainerMode() == "autoML": automl_enable = True else: automl_enable = False clf = NaiveBayes() if not algoSetting.is_hyperparameter_tuning_enabled(): algoParams = algoSetting.get_params_dict() else: algoParams = algoSetting.get_params_dict_hyperparameter() print("=" * 100) print(algoParams) print("=" * 100) clfParams = [prm.name for prm in clf.params] algoParams = { getattr(clf, k): v if isinstance(v, list) else [v] for k, v in algoParams.items() if k in clfParams } #print("="*100) #print("ALGOPARAMS - ",algoParams) #print("="*100) paramGrid = ParamGridBuilder() # if not algoSetting.is_hyperparameter_tuning_enabled(): # for k,v in algoParams.items(): # if v == [None] * len(v): # continue # if k.name == 'thresholds': # paramGrid = paramGrid.addGrid(k,v[0]) # else: # paramGrid = paramGrid.addGrid(k,v) # paramGrid = paramGrid.build() # if not algoSetting.is_hyperparameter_tuning_enabled(): for k, v in algoParams.items(): print(k, v) if v == [None] * len(v): continue paramGrid = paramGrid.addGrid(k, v) paramGrid = paramGrid.build() # else: # for k,v in algoParams.items(): # print k.name, v # if v[0] == [None] * len(v[0]): # continue # paramGrid = paramGrid.addGrid(k,v[0]) # paramGrid = paramGrid.build() #print("="*143) #print("PARAMGRID - ", paramGrid) #print("="*143) if len(paramGrid) > 1: hyperParamInitParam = algoSetting.get_hyperparameter_params() 
evaluationMetricDict = { "name": hyperParamInitParam["evaluationMetric"] } evaluationMetricDict[ "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] else: evaluationMetricDict = { "name": GLOBALSETTINGS.CLASSIFICATION_MODEL_EVALUATION_METRIC } evaluationMetricDict[ "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] self._result_setter.set_hyper_parameter_results(self._slug, None) if validationDict["name"] == "kFold": numFold = int(validationDict["value"]) estimator = Pipeline(stages=[pipeline, labelIndexer, clf]) if algoSetting.is_hyperparameter_tuning_enabled(): modelFilepath = "/".join(model_filepath.split("/")[:-1]) pySparkHyperParameterResultObj = PySparkGridSearchResult( estimator, paramGrid, appType, modelFilepath, levels, evaluationMetricDict, trainingData, validationData, numFold, self._targetLevel, labelMapping, inverseLabelMapping, df) resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models( ) self._result_setter.set_hyper_parameter_results( self._slug, resultArray) self._result_setter.set_metadata_parallel_coordinates( self._slug, { "ignoreList": pySparkHyperParameterResultObj.get_ignore_list(), "hideColumns": pySparkHyperParameterResultObj.get_hide_columns(), "metricColName": pySparkHyperParameterResultObj. 
get_comparison_metric_colname(), "columnOrder": pySparkHyperParameterResultObj.get_keep_columns() }) bestModel = pySparkHyperParameterResultObj.getBestModel() prediction = pySparkHyperParameterResultObj.getBestPrediction() else: if automl_enable: paramGrid = (ParamGridBuilder().addGrid( clf.smoothing, [1.0, 0.2]).build()) crossval = CrossValidator( estimator=estimator, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator() if levels == 2 else MulticlassClassificationEvaluator(), numFolds=3 if numFold is None else numFold) # use 3+ folds in practice cvnb = crossval.fit(trainingData) prediction = cvnb.transform(validationData) bestModel = cvnb.bestModel else: train_test_ratio = float( self._dataframe_context.get_train_test_split()) estimator = Pipeline(stages=[pipeline, labelIndexer, clf]) if algoSetting.is_hyperparameter_tuning_enabled(): modelFilepath = "/".join(model_filepath.split("/")[:-1]) pySparkHyperParameterResultObj = PySparkTrainTestResult( estimator, paramGrid, appType, modelFilepath, levels, evaluationMetricDict, trainingData, validationData, train_test_ratio, self._targetLevel, labelMapping, inverseLabelMapping, df) resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models( ) self._result_setter.set_hyper_parameter_results( self._slug, resultArray) self._result_setter.set_metadata_parallel_coordinates( self._slug, { "ignoreList": pySparkHyperParameterResultObj.get_ignore_list(), "hideColumns": pySparkHyperParameterResultObj.get_hide_columns(), "metricColName": pySparkHyperParameterResultObj. 
get_comparison_metric_colname(), "columnOrder": pySparkHyperParameterResultObj.get_keep_columns() }) bestModel = pySparkHyperParameterResultObj.getBestModel() prediction = pySparkHyperParameterResultObj.getBestPrediction() else: tvs = TrainValidationSplit( estimator=estimator, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator() if levels == 2 else MulticlassClassificationEvaluator(), trainRatio=train_test_ratio) tvspnb = tvs.fit(trainingData) prediction = tvspnb.transform(validationData) bestModel = tvspnb.bestModel modelmanagement_ = { param[0].name: param[1] for param in bestModel.stages[2].extractParamMap().items() } MLUtils.save_pipeline_or_model(bestModel, model_filepath) predsAndLabels = prediction.select(['prediction', 'label']).rdd.map(tuple) # label_classes = prediction.select("label").distinct().collect() # label_classes = prediction.agg((F.collect_set('label').alias('label'))).first().asDict()['label'] #results = transformed.select(["prediction","label"]) # if len(label_classes) > 2: # metrics = MulticlassMetrics(predsAndLabels) # accuracy of the model # else: # metrics = BinaryClassificationMetrics(predsAndLabels) posLabel = inverseLabelMapping[self._targetLevel] metrics = MulticlassMetrics(predsAndLabels) trainingTime = time.time() - st f1_score = metrics.fMeasure(inverseLabelMapping[self._targetLevel], 1.0) precision = metrics.precision(inverseLabelMapping[self._targetLevel]) recall = metrics.recall(inverseLabelMapping[self._targetLevel]) accuracy = metrics.accuracy print(f1_score, precision, recall, accuracy) #gain chart implementation def cal_prob_eval(x): if len(x) == 1: if x == posLabel: return (float(x[1])) else: return (float(1 - x[1])) else: return (float(x[int(posLabel)])) column_name = 'probability' def y_prob_for_eval_udf(): return udf(lambda x: cal_prob_eval(x)) prediction = prediction.withColumn( "y_prob_for_eval", y_prob_for_eval_udf()(col(column_name))) try: pys_df = prediction.select( ['y_prob_for_eval', 
'prediction', 'label']) gain_lift_ks_obj = GainLiftKS(pys_df, 'y_prob_for_eval', 'prediction', 'label', posLabel, self._spark) gain_lift_KS_dataframe = gain_lift_ks_obj.Run().toPandas() except: try: temp_df = pys_df.toPandas() gain_lift_ks_obj = GainLiftKS(temp_df, 'y_prob_for_eval', 'prediction', 'label', posLabel, self._spark) gain_lift_KS_dataframe = gain_lift_ks_obj.Rank_Ordering() except: print("gain chant failed") gain_lift_KS_dataframe = None #feature_importance = MLUtils.calculate_sparkml_feature_importance(df, bestModel.stages[-1], categorical_columns, numerical_columns) act_list = prediction.select('label').collect() actual = [int(row.label) for row in act_list] pred_list = prediction.select('prediction').collect() predicted = [int(row.prediction) for row in pred_list] prob_list = prediction.select('probability').collect() probability = [list(row.probability) for row in prob_list] # objs = {"trained_model":bestModel,"actual":prediction.select('label'),"predicted":prediction.select('prediction'), # "probability":prediction.select('probability'),"feature_importance":None, # "featureList":list(categorical_columns) + list(numerical_columns),"labelMapping":labelMapping} objs = { "trained_model": bestModel, "actual": actual, "predicted": predicted, "probability": probability, "feature_importance": None, "featureList": list(categorical_columns) + list(numerical_columns), "labelMapping": labelMapping } conf_mat_ar = metrics.confusionMatrix().toArray() print(conf_mat_ar) confusion_matrix = {} for i in range(len(conf_mat_ar)): confusion_matrix[labelMapping[i]] = {} for j, val in enumerate(conf_mat_ar[i]): confusion_matrix[labelMapping[i]][labelMapping[j]] = val print(confusion_matrix) # accuracy of the model '''ROC CURVE IMPLEMENTATION''' y_prob = probability y_score = predicted y_test = actual logLoss = log_loss(y_test, y_prob) if levels <= 2: positive_label_probs = [] for val in y_prob: positive_label_probs.append(val[int(posLabel)]) roc_auc = 
roc_auc_score(y_test, y_score) roc_data_dict = { "y_score": y_score, "y_test": y_test, "positive_label_probs": positive_label_probs, "y_prob": y_prob, "positive_label": posLabel } roc_dataframe = pd.DataFrame({ "y_score": y_score, "y_test": y_test, "positive_label_probs": positive_label_probs }) #roc_dataframe.to_csv("binary_roc_data.csv") fpr, tpr, thresholds = roc_curve(y_test, positive_label_probs, pos_label=posLabel) roc_df = pd.DataFrame({ "FPR": fpr, "TPR": tpr, "thresholds": thresholds }) roc_df["tpr-fpr"] = roc_df["TPR"] - roc_df["FPR"] optimal_index = np.argmax(np.array(roc_df["tpr-fpr"])) fpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "FPR"] tpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "TPR"] rounded_roc_df = roc_df.round({'FPR': 2, 'TPR': 4}) unique_fpr = rounded_roc_df["FPR"].unique() final_roc_df = rounded_roc_df.groupby("FPR", as_index=False)[["TPR" ]].mean() endgame_roc_df = final_roc_df.round({'FPR': 2, 'TPR': 3}) elif levels > 2: positive_label_probs = [] for val in y_prob: positive_label_probs.append(val[int(posLabel)]) y_test_roc_multi = [] for val in y_test: if val != posLabel: val = posLabel + 1 y_test_roc_multi.append(val) else: y_test_roc_multi.append(val) y_score_roc_multi = [] for val in y_score: if val != posLabel: val = posLabel + 1 y_score_roc_multi.append(val) else: y_score_roc_multi.append(val) roc_auc = roc_auc_score(y_test_roc_multi, y_score_roc_multi) fpr, tpr, thresholds = roc_curve(y_test_roc_multi, positive_label_probs, pos_label=posLabel) roc_df = pd.DataFrame({ "FPR": fpr, "TPR": tpr, "thresholds": thresholds }) roc_df["tpr-fpr"] = roc_df["TPR"] - roc_df["FPR"] optimal_index = np.argmax(np.array(roc_df["tpr-fpr"])) fpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "FPR"] tpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "TPR"] rounded_roc_df = roc_df.round({'FPR': 2, 'TPR': 4}) unique_fpr = rounded_roc_df["FPR"].unique() final_roc_df = rounded_roc_df.groupby("FPR", 
as_index=False)[["TPR" ]].mean() endgame_roc_df = final_roc_df.round({'FPR': 2, 'TPR': 3}) # Calculating prediction_split val_cnts = prediction.groupBy('label').count() val_cnts = map(lambda row: row.asDict(), val_cnts.collect()) prediction_split = {} total_nos = prediction.select('label').count() for item in val_cnts: print(labelMapping) classname = labelMapping[item['label']] prediction_split[classname] = round( item['count'] * 100 / float(total_nos), 2) if not algoSetting.is_hyperparameter_tuning_enabled(): modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1" modelFilepathArr = model_filepath.split("/")[:-1] modelFilepathArr.append(modelName) bestModel.save("/".join(modelFilepathArr)) runtime = round((time.time() - st_global), 2) try: print(pmml_filepath) pmmlBuilder = PMMLBuilder(self._spark, trainingData, bestModel).putOption( clf, 'compact', True) pmmlBuilder.buildFile(pmml_filepath) pmmlfile = open(pmml_filepath, "r") pmmlText = pmmlfile.read() pmmlfile.close() self._result_setter.update_pmml_object({self._slug: pmmlText}) except Exception as e: print("PMML failed...", str(e)) pass cat_cols = list(set(categorical_columns) - {result_column}) self._model_summary = MLModelSummary() self._model_summary.set_algorithm_name("Naive Bayes") self._model_summary.set_algorithm_display_name("Naive Bayes") self._model_summary.set_slug(self._slug) self._model_summary.set_training_time(runtime) self._model_summary.set_confusion_matrix(confusion_matrix) # self._model_summary.set_feature_importance(objs["feature_importance"]) self._model_summary.set_feature_list(objs["featureList"]) self._model_summary.set_model_accuracy(accuracy) self._model_summary.set_training_time(round((time.time() - st), 2)) self._model_summary.set_precision_recall_stats([precision, recall]) self._model_summary.set_model_precision(precision) self._model_summary.set_model_recall(recall) self._model_summary.set_model_F1_score(f1_score) self._model_summary.set_model_log_loss(logLoss) 
self._model_summary.set_gain_lift_KS_data(gain_lift_KS_dataframe) self._model_summary.set_AUC_score(roc_auc) self._model_summary.set_target_variable(result_column) self._model_summary.set_prediction_split(prediction_split) self._model_summary.set_validation_method("KFold") self._model_summary.set_level_map_dict(objs["labelMapping"]) # self._model_summary.set_model_features(list(set(x_train.columns)-set([result_column]))) self._model_summary.set_model_features(objs["featureList"]) self._model_summary.set_level_counts( self._metaParser.get_unique_level_dict( list(set(categorical_columns)) + [result_column])) #self._model_summary.set_num_trees(objs['trained_model'].getNumTrees) self._model_summary.set_num_rules(300) self._model_summary.set_target_level(self._targetLevel) if not algoSetting.is_hyperparameter_tuning_enabled(): modelDropDownObj = { "name": self._model_summary.get_algorithm_name(), "evaluationMetricValue": accuracy, "evaluationMetricName": "accuracy", "slug": self._model_summary.get_slug(), "Model Id": modelName } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } else: modelDropDownObj = { "name": self._model_summary.get_algorithm_name(), "evaluationMetricValue": accuracy, "evaluationMetricName": "accuracy", "slug": self._model_summary.get_slug(), "Model Id": resultArray[0]["Model Id"] } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } self._model_management = MLModelSummary() print(modelmanagement_) self._model_management.set_job_type( 
self._dataframe_context.get_job_name()) #Project name self._model_management.set_training_status( data="completed") # training status self._model_management.set_target_level( self._targetLevel) # target column value self._model_management.set_training_time(runtime) # run time self._model_management.set_model_accuracy(round(metrics.accuracy, 2)) # self._model_management.set_model_accuracy(round(metrics.accuracy_score(objs["actual"], objs["predicted"]),2))#accuracy self._model_management.set_algorithm_name( "NaiveBayes") #algorithm name self._model_management.set_validation_method( str(validationDict["displayName"]) + "(" + str(validationDict["value"]) + ")") #validation method self._model_management.set_target_variable( result_column) #target column name self._model_management.set_creation_date(data=str( datetime.now().strftime('%b %d ,%Y %H:%M '))) #creation date self._model_management.set_datasetName(self._datasetName) self._model_management.set_model_type(data='classification') self._model_management.set_var_smoothing( data=int(modelmanagement_['smoothing'])) # self._model_management.set_no_of_independent_variables(df) #no of independent varables modelManagementSummaryJson = [ ["Project Name", self._model_management.get_job_type()], ["Algorithm", self._model_management.get_algorithm_name()], ["Training Status", self._model_management.get_training_status()], ["Accuracy", self._model_management.get_model_accuracy()], ["RunTime", self._model_management.get_training_time()], #["Owner",None], ["Created On", self._model_management.get_creation_date()] ] modelManagementModelSettingsJson = [ ["Training Dataset", self._model_management.get_datasetName()], ["Target Column", self._model_management.get_target_variable()], ["Target Column Value", self._model_management.get_target_level()], ["Algorithm", self._model_management.get_algorithm_name()], [ "Model Validation", self._model_management.get_validation_method() ], ["Model Type", self._model_management.get_model_type()], 
["Smoothing", self._model_management.get_var_smoothing()], #,["priors",self._model_management.get_priors()] #,["var_smoothing",self._model_management.get_var_smoothing()] ] nbOverviewCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_card_overview( self._model_management, modelManagementSummaryJson, modelManagementModelSettingsJson) ] nbPerformanceCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_cards( self._model_summary, endgame_roc_df) ] nbDeploymentCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_deploy_empty_card() ] nbCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_summary_cards(self._model_summary) ] NB_Overview_Node = NarrativesTree() NB_Overview_Node.set_name("Overview") NB_Performance_Node = NarrativesTree() NB_Performance_Node.set_name("Performance") NB_Deployment_Node = NarrativesTree() NB_Deployment_Node.set_name("Deployment") for card in nbOverviewCards: NB_Overview_Node.add_a_card(card) for card in nbPerformanceCards: NB_Performance_Node.add_a_card(card) for card in nbDeploymentCards: NB_Deployment_Node.add_a_card(card) for card in nbCards: self._prediction_narrative.add_a_card(card) self._result_setter.set_model_summary({ "naivebayes": json.loads( CommonUtils.convert_python_object_to_json(self._model_summary)) }) self._result_setter.set_naive_bayes_model_summary(modelSummaryJson) self._result_setter.set_nb_cards(nbCards) self._result_setter.set_nb_nodes( [NB_Overview_Node, NB_Performance_Node, NB_Deployment_Node]) self._result_setter.set_nb_fail_card({ "Algorithm_Name": "Naive Bayes", "success": "True" }) CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "completion", "info", display=True, 
emptyBin=False, customMsg=None, weightKey="total") print("\n\n")
class RegressionNarrative(object): def __init__(self, df_helper, df_context, result_setter, spark, df_regression_result, correlations,story_narrative,meta_parser): self._metaParser = meta_parser self._result_setter = result_setter self._story_narrative = story_narrative self._df_regression_result = df_regression_result self._correlations = correlations self._dataframe_helper = df_helper self._dataframe_context = df_context self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER # self._result_setter.set_trend_section_name("regression") self._measure_columns = self._dataframe_helper.get_numeric_columns() self._dimension_columns = self._dataframe_helper.get_string_columns() self._date_columns = self._dataframe_context.get_date_columns() self._uid_col = self._dataframe_context.get_uid_column() if self._metaParser.check_column_isin_ignored_suggestion(self._uid_col): self._dimension_columns = list(set(self._dimension_columns) - {self._uid_col}) if len(self._date_columns) >0 : self._dimension_columns = list(set(self._dimension_columns)-set(self._date_columns)) self._spark = spark self.measures = [] self.result_column = self._dataframe_helper.resultcolumn self.all_coefficients = self._df_regression_result.get_all_coeff() all_coeff = [(x,self.all_coefficients[x]) for x in list(self.all_coefficients.keys())] all_coeff = sorted(all_coeff,key = lambda x:abs(x[1]["coefficient"]),reverse = True) self._all_coeffs = all_coeff self.significant_measures = [x[0] for x in all_coeff if x[1]['p_value']<=0.05] print(self.significant_measures) print("regression narratives started") self.narratives = {"heading": self.result_column + "Performance Report", "main_card":{}, "cards":[] } self._base_dir = "/regression/" self._run_dimension_level_regression = False # self._dim_regression = self.run_regression_for_dimension_levels() self._regressionNode = NarrativesTree() self._completionStatus = self._dataframe_context.get_completion_status() self._analysisName = 
self._dataframe_context.get_analysis_name() self._messageURL = self._dataframe_context.get_message_url() self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight() self._scriptStages = { "regressionNarrativeStart":{ "summary":"Started The Regression Narratives", "weight":1 }, "regressionNarrativeEnd":{ "summary":"Narratives For Regression Finished", "weight":0 }, } self._completionStatus += old_div(self._scriptWeightDict[self._analysisName]["narratives"]*self._scriptStages["regressionNarrativeStart"]["weight"],10) progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\ "regressionNarrativeStart",\ "info",\ self._scriptStages["regressionNarrativeStart"]["summary"],\ self._completionStatus,\ self._completionStatus) CommonUtils.save_progress_message(self._messageURL,progressMessage) self._dataframe_context.update_completion_status(self._completionStatus) self.generate_narratives() self._regressionNode.set_name("Influencers") self._result_setter.set_regression_node(self._regressionNode) self._completionStatus += old_div(self._scriptWeightDict[self._analysisName]["narratives"]*self._scriptStages["regressionNarrativeEnd"]["weight"],10) progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\ "regressionNarrativeEnd",\ "info",\ self._scriptStages["regressionNarrativeEnd"]["summary"],\ self._completionStatus,\ self._completionStatus) CommonUtils.save_progress_message(self._messageURL,progressMessage) self._dataframe_context.update_completion_status(self._completionStatus) def generate_narratives(self): regression_narrative_obj = LinearRegressionNarrative( self._df_regression_result, self._correlations, self._dataframe_helper, self._dataframe_context, self._metaParser, self._spark ) main_card_data = regression_narrative_obj.generate_main_card_data() main_card_narrative = NarrativesUtils.get_template_output(self._base_dir,\ 'regression_main_card.html',main_card_data) self.narratives['main_card'] = {} 
self.narratives["main_card"]['paragraphs'] = NarrativesUtils.paragraph_splitter(main_card_narrative) self.narratives["main_card"]['header'] = 'Key Measures that affect ' + self.result_column self.narratives["main_card"]['chart'] = {} self.narratives["main_card"]['chart']['heading'] = '' self.narratives["main_card"]['chart']['data'] = [[i for i,j in self._all_coeffs], [j['coefficient'] for i,j in self._all_coeffs]] self.narratives["main_card"]['chart']['label'] = {'x':'Measure Name', 'y': 'Change in ' + self.result_column + ' per unit increase'} main_card = NormalCard() main_card_header = HtmlData(data = '<h3>Key Measures that affect ' + self.result_column+"</h3>") main_card_paragraphs = NarrativesUtils.block_splitter(main_card_narrative,self._blockSplitter) main_card_chart_data = [{"key":val[0],"value":val[1]} for val in zip([i for i,j in self._all_coeffs],[j['coefficient'] for i,j in self._all_coeffs])] main_card_chart = NormalChartData(data=main_card_chart_data) mainCardChartJson = ChartJson() mainCardChartJson.set_data(main_card_chart.get_data()) mainCardChartJson.set_label_text({'x':'Influencing Factors','y': 'Change in ' + self.result_column + ' per unit increase'}) mainCardChartJson.set_chart_type("bar") mainCardChartJson.set_axes({"x":"key","y":"value"}) mainCardChartJson.set_yaxis_number_format(".2f") # st_info = ["Test : Regression","Threshold for p-value: 0.05", "Effect Size: Regression Coefficient"] chart_data = sorted(main_card_chart_data,key=lambda x:x["value"],reverse=True) statistical_info_array=[ ("Test Type","Regression"), ("Effect Size","Coefficients"), ("Max Effect Size",chart_data[0]["key"]), ("Min Effect Size",chart_data[-1]["key"]), ] statistical_inferenc = "" if len(chart_data) == 1: statistical_inference = "{} is the only variable that have significant influence over {} (Target) having an \ Effect size of {}".format(chart_data[0]["key"],self._dataframe_context.get_result_column(),round(chart_data[0]["value"],4)) elif len(chart_data) == 2: 
statistical_inference = "There are two variables ({} and {}) that have significant influence over {} (Target) and the \ Effect size ranges are {} and {} respectively".format(chart_data[0]["key"],chart_data[1]["key"],self._dataframe_context.get_result_column(),round(chart_data[0]["value"],4),round(chart_data[1]["value"],4)) else: statistical_inference = "There are {} variables that have significant influence over {} (Target) and the \ Effect size ranges from {} to {}".format(len(chart_data),self._dataframe_context.get_result_column(),round(chart_data[0]["value"],4),round(chart_data[-1]["value"],4)) if statistical_inference != "": statistical_info_array.append(("Inference",statistical_inference)) statistical_info_array = NarrativesUtils.statistical_info_array_formatter(statistical_info_array) main_card.set_card_data(data = [main_card_header]+main_card_paragraphs+[C3ChartData(data=mainCardChartJson,info=statistical_info_array)]) main_card.set_card_name("Key Influencers") self._regressionNode.add_a_card(main_card) count = 0 for measure_column in self.significant_measures: sigMeasureNode = NarrativesTree() sigMeasureNode.set_name(measure_column) measureCard1 = NormalCard() measureCard1.set_card_name("{}: Impact on {}".format(measure_column,self.result_column)) measureCard1Data = [] if self._run_dimension_level_regression: measureCard2 = NormalCard() measureCard2.set_card_name("Key Areas where it Matters") measureCard2Data = [] measure_column_cards = {} card0 = {} card1data = regression_narrative_obj.generate_card1_data(measure_column) card1heading = "<h3>Impact of "+measure_column+" on "+self.result_column+"</h3>" measureCard1Header = HtmlData(data=card1heading) card1data.update({"blockSplitter":self._blockSplitter}) card1narrative = NarrativesUtils.get_template_output(self._base_dir,\ 'regression_card1.html',card1data) card1paragraphs = NarrativesUtils.block_splitter(card1narrative,self._blockSplitter) card0 = {"paragraphs":card1paragraphs} card0["charts"] = {} 
card0['charts']['chart2']={} # card0['charts']['chart2']['data']=card1data["chart_data"] # card0['charts']['chart2']['heading'] = '' # card0['charts']['chart2']['labels'] = {} card0['charts']['chart1']={} card0["heading"] = card1heading measure_column_cards['card0'] = card0 measureCard1Header = HtmlData(data=card1heading) measureCard1Data += [measureCard1Header] measureCard1para = card1paragraphs measureCard1Data += measureCard1para if self._run_dimension_level_regression: print("running narratives for key area dict") self._dim_regression = self.run_regression_for_dimension_levels() card2table, card2data=regression_narrative_obj.generate_card2_data(measure_column,self._dim_regression) card2data.update({"blockSplitter":self._blockSplitter}) card2narrative = NarrativesUtils.get_template_output(self._base_dir,\ 'regression_card2.html',card2data) card2paragraphs = NarrativesUtils.block_splitter(card2narrative,self._blockSplitter) card1 = {'tables': card2table, 'paragraphs' : card2paragraphs, 'heading' : 'Key Areas where ' + measure_column + ' matters'} measure_column_cards['card1'] = card1 measureCard2Data += card2paragraphs if "table1" in card2table: table1data = regression_narrative_obj.convert_table_data(card2table["table1"]) card2Table1 = TableData() card2Table1.set_table_data(table1data) card2Table1.set_table_type("heatMap") card2Table1.set_table_top_header(card2table["table1"]["heading"]) card2Table1Json = json.loads(CommonUtils.convert_python_object_to_json(card2Table1)) # measureCard2Data.insert(3,card2Table1) measureCard2Data.insert(3,card2Table1Json) if "table2" in card2table: table2data = regression_narrative_obj.convert_table_data(card2table["table2"]) card2Table2 = TableData() card2Table2.set_table_data(table2data) card2Table2.set_table_type("heatMap") card2Table2.set_table_top_header(card2table["table2"]["heading"]) # measureCard2Data.insert(5,card2Table2) card2Table2Json = json.loads(CommonUtils.convert_python_object_to_json(card2Table2)) # 
measureCard2Data.append(card2Table2) measureCard2Data.append(card2Table2Json) # self._result_setter.set_trend_section_data({"result_column":self.result_column, # "measure_column":measure_column, # "base_dir":self._base_dir # }) # trend_narratives_obj = TimeSeriesNarrative(self._dataframe_helper, self._dataframe_context, self._result_setter, self._spark, self._story_narrative) # card2 = trend_narratives_obj.get_regression_trend_card_data() # if card2: # measure_column_cards['card2'] = card2 # # # card3 = {} progressMessage = CommonUtils.create_progress_message_object(self._analysisName,"custom","info","Analyzing Key Influencers",self._completionStatus,self._completionStatus,display=True) CommonUtils.save_progress_message(self._messageURL,progressMessage,ignore=False) card4data = regression_narrative_obj.generate_card4_data(self.result_column,measure_column) card4data.update({"blockSplitter":self._blockSplitter}) # card4heading = "Sensitivity Analysis: Effect of "+self.result_column+" on Segments of "+measure_column card4narrative = NarrativesUtils.get_template_output(self._base_dir,\ 'regression_card4.html',card4data) card4paragraphs = NarrativesUtils.block_splitter(card4narrative,self._blockSplitter) # card3 = {"paragraphs":card4paragraphs} card0['paragraphs'] = card1paragraphs+card4paragraphs card4Chart = card4data["charts"] # st_info = ["Test : Regression", "Variables : "+ self.result_column +", "+measure_column,"Intercept : "+str(round(self._df_regression_result.get_intercept(),2)), "Regression Coefficient : "+ str(round(self._df_regression_result.get_coeff(measure_column),2))] statistical_info_array=[ ("Test Type","Regression"), ("Coefficient",str(round(self._df_regression_result.get_coeff(measure_column),2))), ("P-Value","<= 0.05"), ("Intercept",str(round(self._df_regression_result.get_intercept(),2))), ("R Square ",str(round(self._df_regression_result.get_rsquare(),2))), ] inferenceTuple = () coeff = self._df_regression_result.get_coeff(measure_column) if 
coeff > 0: inferenceTuple = ("Inference","For every additional unit of increase in {} there will be an increase of {} units in {} (target).".format(measure_column,str(round(coeff,2)),self._dataframe_context.get_result_column())) else: inferenceTuple = ("Inference","For every additional unit of decrease in {} there will be an decrease of {} units in {} (target).".format(measure_column,str(round(coeff,2)),self._dataframe_context.get_result_column())) if len(inferenceTuple) > 0: statistical_info_array.append(inferenceTuple) statistical_info_array = NarrativesUtils.statistical_info_array_formatter(statistical_info_array) card4paragraphs.insert(2,C3ChartData(data=card4Chart,info=statistical_info_array)) measureCard1Data += card4paragraphs self.narratives['cards'].append(measure_column_cards) if count == 0: card4data.pop("charts") self._result_setter.update_executive_summary_data(card4data) count += 1 measureCard1.set_card_data(measureCard1Data) if self._run_dimension_level_regression: measureCard2.set_card_data(measureCard2Data) sigMeasureNode.add_cards([measureCard1,measureCard2]) sigMeasureNode.add_cards([measureCard1]) self._regressionNode.add_a_node(sigMeasureNode) # self._result_setter.set_trend_section_completion_status(True) self._story_narrative.add_a_node(self._regressionNode) def run_regression_for_dimension_levels(self): print("Running regression for Dimension Levels") significant_dimensions = self._dataframe_helper.get_significant_dimension() print("significant_dimensions:",significant_dimensions) if significant_dimensions != {}: sig_dims = [(x,significant_dimensions[x]) for x in list(significant_dimensions.keys())] sig_dims = sorted(sig_dims,key=lambda x:x[1],reverse=True) cat_columns = [x[0] for x in sig_dims[:5]] else: cat_columns = self._dimension_columns[:5] cat_columns= [x for x in cat_columns if x != "Agent Name"] print("Running regression for below 5 dimensions") print(cat_columns) regression_result_dimension_cols = 
dict(list(zip(cat_columns,[{}]*len(cat_columns)))) for col in cat_columns: print("For Column:",col) # column_levels = self._dataframe_helper.get_all_levels(col) column_levels = list(self._metaParser.get_unique_level_dict(col).keys()) level_regression_result = dict(list(zip(column_levels,[{}]*len(column_levels)))) print("No of levels in this column",len(column_levels)) for level in column_levels: print("Filtering data for level:",level) filtered_df = self._dataframe_helper.filter_dataframe(col,level) result = LinearRegression(filtered_df, self._dataframe_helper, self._dataframe_context,self._metaParser,self._spark).fit(self._dataframe_context.get_result_column()) if result == None: result = {"intercept" : 0.0, "rmse" : 0.0, "rsquare" : 0.0, "coeff" : 0.0 } else: result = {"intercept" : result.get_intercept(), "rmse" : result.get_root_mean_square_error(), "rsquare" : result.get_rsquare(), "coeff" : result.get_all_coeff() } level_regression_result[level] = result regression_result_dimension_cols[col] = level_regression_result # print json.dumps(regression_result_dimension_cols,indent=2) return regression_result_dimension_cols
data_dict_overall["price_trend"] = stockPriceTrendArrayFormatted data_dict_overall["avg_sentiment_score"] = data_dict_overall["avg_sentiment_score"]/number_stocks data_dict_overall["stock_value_change"] = data_dict_overall["stock_value_change"]/number_stocks data_dict_overall["stock_percent_change"] = data_dict_overall["stock_percent_change"]/number_stocks data_dict_overall["number_articles_by_concept"] = self.get_number_articles_per_concept(data_dict_overall["nArticlesAndSentimentsPerConcept"]) key, value = max(data_dict_overall["max_value_change"].iteritems(), key = lambda p: p[1]) data_dict_overall["max_value_change_overall"] = (self.get_capitalized_name(key),value) key, value = min(data_dict_overall["max_value_change"].iteritems(), key = lambda p: p[1]) data_dict_overall["min_value_change_overall"] = (self.get_capitalized_name(key),value) key,value = max(data_dict_overall["max_sentiment_change"].iteritems(), key = lambda p: p[1]) data_dict_overall["max_sentiment_change_overall"] = (self.get_capitalized_name(key),value) # print data_dict_overall finalResult = NarrativesTree() overviewNode = NarrativesTree() stockNode = NarrativesTree() overviewNode.set_name("Overview") stockNode.set_name("Single Stock Analysis") overviewCard = MLUtils.stock_sense_overview_card(data_dict_overall) overviewNode.add_a_card(overviewCard) finalResult.add_a_node(overviewNode) individualStockNodes = MLUtils.stock_sense_individual_stock_cards(stockDict) stockNode.add_nodes(individualStockNodes) finalResult.add_a_node(stockNode) return finalResult
class ChiSquareNarratives: #@accepts(object, int, DFChiSquareResult ,ContextSetter) def __init__(self, df_helper, df_chisquare_result, spark, df_context, data_frame, story_narrative, result_setter, scriptWeight=None, analysisName=None): self._story_narrative = story_narrative self._result_setter = result_setter self._data_frame = data_frame self._dataframe_context = df_context self._dataframe_helper = df_helper self._storyOnScoredData = self._dataframe_context.get_story_on_scored_data( ) self._measure_columns = df_helper.get_numeric_columns() self._df_chisquare = df_chisquare_result self._df_chisquare_result = df_chisquare_result.get_result() self.narratives = {} self._appid = df_context.get_app_id() self._chiSquareNode = NarrativesTree() self._chiSquareNode.set_name("Association") self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER self._noOfSigDimsToShow = GLOBALSETTINGS.CHISQUARESIGNIFICANTDIMENSIONTOSHOW self._base_dir = "/chisquare/" self._spark = spark ############################DataFrame Measure to Dimesion Column##################### pandas_df = self._data_frame.toPandas() target_dimension = self._df_chisquare_result.keys() bin_data = {} for col in self._measure_columns: chisquare_result = self._df_chisquare.get_chisquare_result( target_dimension[0], col) bin_data[col] = chisquare_result.get_contingency_table( ).get_column_two_levels() for bin_col in bin_data.keys(): for split in bin_data[bin_col]: val = split.split('to') pandas_df[bin_col][ (pandas_df[bin_col] >= float(val[0].replace(',', ''))) & (pandas_df[bin_col] < float(val[1].replace(',', '')) )] = split fields = [ StructField(field_name, StringType(), True) for field_name in pandas_df.columns ] schema = StructType(fields) SQLctx = SQLContext(sparkContext=self._spark.sparkContext, sparkSession=self._spark) self._data_frame = SQLctx.createDataFrame(pandas_df, schema) # print self._data_frame ############################DataFrame Measure to Dimesion Column##################### if self._appid != None: if 
self._appid == "1": self._base_dir += "appid1/" elif self._appid == "2": self._base_dir += "appid2/" self._completionStatus = self._dataframe_context.get_completion_status( ) if analysisName == None: self._analysisName = self._dataframe_context.get_analysis_name() else: self._analysisName = analysisName self._messageURL = self._dataframe_context.get_message_url() if scriptWeight == None: self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight( ) else: self._scriptWeightDict = scriptWeight self._analysisDict = self._dataframe_context.get_analysis_dict() if self._analysisDict != {}: self._nColsToUse = self._analysisDict[ self._analysisName]["noOfColumnsToUse"] else: self._nColsToUse = None self._scriptStages = { "initialization": { "summary": "Initialized the Frequency Narratives", "weight": 0 }, "summarygeneration": { "summary": "summary generation finished", "weight": 10 }, "completion": { "summary": "Frequency Stats Narratives done", "weight": 0 }, } CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._analysisName, "initialization", "info", display=False, weightKey="narratives") self._generate_narratives() CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._analysisName, "summarygeneration", "info", display=False, weightKey="narratives") CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._analysisName, "completion", "info", display=False, weightKey="narratives") def _generate_narratives(self): """ generate main card narrative and remaining cards are generated by calling ChiSquareAnalysis class for each of analyzed dimensions """ for target_dimension in self._df_chisquare_result.keys(): target_chisquare_result = self._df_chisquare_result[ target_dimension] analysed_variables = target_chisquare_result.keys( ) ## List of 
all analyzed var. # List of significant var out of analyzed var. significant_variables = [ dim for dim in target_chisquare_result.keys() if target_chisquare_result[dim].get_pvalue() <= 0.05 ] effect_sizes = [ target_chisquare_result[dim].get_effect_size() for dim in significant_variables ] effect_size_dict = dict(zip(significant_variables, effect_sizes)) significant_variables = [ y for (x, y) in sorted(zip(effect_sizes, significant_variables), reverse=True) ] #insignificant_variables = [i for i in self._df_chisquare_result[target_dimension] if i['pv']>0.05] num_analysed_variables = len(analysed_variables) num_significant_variables = len(significant_variables) self.narratives['main_card'] = {} self.narratives['main_card'][ 'heading'] = 'Relationship between ' + target_dimension + ' and other factors' self.narratives['main_card']['paragraphs'] = {} data_dict = { 'num_variables': num_analysed_variables, 'num_significant_variables': num_significant_variables, 'significant_variables': significant_variables, 'target': target_dimension, 'analysed_dimensions': analysed_variables, 'blockSplitter': self._blockSplitter } # for both para 1 and para 2 paragraph = {} paragraph['header'] = '' paragraph['content'] = NarrativesUtils.get_template_output( self._base_dir, 'main_card.html', data_dict) self.narratives['main_card']['paragraphs'] = [paragraph] self.narratives['cards'] = [] chart = { 'header': 'Strength of association between ' + target_dimension + ' and other dimensions' } chart['data'] = effect_size_dict chart['label_text'] = { 'x': 'Dimensions', 'y': 'Effect Size (Cramers-V)' } chart_data = [] chartDataValues = [] for k, v in effect_size_dict.items(): chart_data.append({"key": k, "value": float(v)}) chartDataValues.append(float(v)) chart_data = sorted(chart_data, key=lambda x: x["value"], reverse=True) chart_json = ChartJson() chart_json.set_data(chart_data) chart_json.set_chart_type("bar") # chart_json.set_label_text({'x':'Dimensions','y':'Effect Size (Cramers-V)'}) 
chart_json.set_label_text({ 'x': ' ', 'y': 'Effect Size (Cramers-V)' }) chart_json.set_axis_rotation(True) chart_json.set_axes({"x": "key", "y": "value"}) # chart_json.set_yaxis_number_format(".4f") chart_json.set_yaxis_number_format( NarrativesUtils.select_y_axis_format(chartDataValues)) self.narratives['main_card']['chart'] = chart main_card = NormalCard() header = "<h3>Strength of association between " + target_dimension + " and other dimensions</h3>" main_card_data = [HtmlData(data=header)] main_card_narrative = NarrativesUtils.get_template_output( self._base_dir, 'main_card.html', data_dict) main_card_narrative = NarrativesUtils.block_splitter( main_card_narrative, self._blockSplitter) main_card_data += main_card_narrative # st_info = ["Test : Chi Square", "Threshold for p-value : 0.05", "Effect Size : Cramer's V"] # print "chartdata",chart_data if len(chart_data) > 0: statistical_info_array = [ ("Test Type", "Chi-Square"), ("Effect Size", "Cramer's V"), ("Max Effect Size", chart_data[0]["key"]), ("Min Effect Size", chart_data[-1]["key"]), ] statistical_inferenc = "" if len(chart_data) == 1: statistical_inference = "{} is the only variable that have significant association with the {} (Target) having an \ Effect size of {}".format( chart_data[0]["key"], self._dataframe_context.get_result_column(), round(chart_data[0]["value"], 4)) elif len(chart_data) == 2: statistical_inference = "There are two variables ({} and {}) that have significant association with the {} (Target) and the \ Effect size ranges are {} and {} respectively".format( chart_data[0]["key"], chart_data[1]["key"], self._dataframe_context.get_result_column(), round(chart_data[0]["value"], 4), round(chart_data[1]["value"], 4)) else: statistical_inference = "There are {} variables that have significant association with the {} (Target) and the \ Effect size ranges from {} to {}".format( len(chart_data), self._dataframe_context.get_result_column(), round(chart_data[0]["value"], 4), 
round(chart_data[-1]["value"], 4)) if statistical_inference != "": statistical_info_array.append( ("Inference", statistical_inference)) statistical_info_array = NarrativesUtils.statistical_info_array_formatter( statistical_info_array) else: statistical_info_array = [] main_card_data.append( C3ChartData(data=chart_json, info=statistical_info_array)) main_card.set_card_data(main_card_data) main_card.set_card_name("Key Influencers") if self._storyOnScoredData != True: self._chiSquareNode.add_a_card(main_card) self._result_setter.add_a_score_chi_card(main_card) print "target_dimension", target_dimension if self._appid == '2' and num_significant_variables > 5: significant_variables = significant_variables[:5] else: if self._nColsToUse != None: significant_variables = significant_variables[:self. _nColsToUse] CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._analysisName, "custom", "info", display=True, customMsg="Analyzing key drivers", weightKey="narratives") for analysed_dimension in significant_variables[:self. 
_noOfSigDimsToShow]: chisquare_result = self._df_chisquare.get_chisquare_result( target_dimension, analysed_dimension) if self._appid == '2': print "APPID 2 is used" card = ChiSquareAnalysis( self._dataframe_context, self._dataframe_helper, chisquare_result, target_dimension, analysed_dimension, significant_variables, num_analysed_variables, self._data_frame, self._measure_columns, self._base_dir, None, target_chisquare_result) # self.narratives['cards'].append(card) self._result_setter.add_a_score_chi_card( json.loads( CommonUtils.convert_python_object_to_json( card.get_dimension_card1()))) elif self._appid == '1': print "APPID 1 is used" card = ChiSquareAnalysis( self._dataframe_context, self._dataframe_helper, chisquare_result, target_dimension, analysed_dimension, significant_variables, num_analysed_variables, self._data_frame, self._measure_columns, self._base_dir, None, target_chisquare_result) # self.narratives['cards'].append(card) self._result_setter.add_a_score_chi_card( json.loads( CommonUtils.convert_python_object_to_json( card.get_dimension_card1()))) else: target_dimension_card = ChiSquareAnalysis( self._dataframe_context, self._dataframe_helper, chisquare_result, target_dimension, analysed_dimension, significant_variables, num_analysed_variables, self._data_frame, self._measure_columns, self._base_dir, None, target_chisquare_result) self.narratives['cards'].append(target_dimension_card) self._chiSquareNode.add_a_node( target_dimension_card.get_dimension_node()) self._story_narrative.add_a_node(self._chiSquareNode) self._result_setter.set_chisquare_node(self._chiSquareNode)
class ChiSquareAnalysis: def __init__(self, df_context, df_helper, chisquare_result, target_dimension, analysed_dimension, significant_variables, num_analysed_variables, data_frame, measure_columns, base_dir, appid=None, target_chisquare_result=None): self._blockSplitter = "|~NEWBLOCK~|" self._highlightFlag = "|~HIGHLIGHT~|" self._dimensionNode = NarrativesTree() self._dimensionNode.set_name(target_dimension) self._data_frame = data_frame self._dataframe_context = df_context self._dataframe_helper = df_helper self._chisquare_result = chisquare_result self._target_dimension = target_dimension self._analysed_dimension = analysed_dimension self._significant_variables = significant_variables self._target_chisquare_result = target_chisquare_result self._measure_columns = self._dataframe_helper.get_numeric_columns() self._chiSquareLevelLimit = GLOBALSETTINGS.CHISQUARELEVELLIMIT self._num_analysed_variables = num_analysed_variables self._chiSquareTable = chisquare_result.get_contingency_table() significant_variables = list( set(significant_variables) - {analysed_dimension}) if len(significant_variables) <= 20: if len(significant_variables) <= 3: self._second_level_dimensions = list(significant_variables) else: self._second_level_dimensions = list(significant_variables)[:3] else: self._second_level_dimensions = list(significant_variables)[:5] print self._second_level_dimensions self._appid = appid self._card1 = NormalCard() self._targetCards = [] self._base_dir = base_dir self._binTargetCol = False self._binAnalyzedCol = False print "--------Chi-Square Narratives for ", analysed_dimension, "---------" if self._dataframe_context.get_custom_analysis_details() != None: binnedColObj = [ x["colName"] for x in self._dataframe_context.get_custom_analysis_details() ] print "analysed_dimension : ", self._analysed_dimension if binnedColObj != None and self._target_dimension in binnedColObj: self._binTargetCol = True if binnedColObj != None and ( self._analysed_dimension in 
binnedColObj or self._analysed_dimension in self._measure_columns): self._binAnalyzedCol = True if self._appid == None: self._generate_narratives() self._dimensionNode.add_cards([self._card1] + self._targetCards) self._dimensionNode.set_name("{}".format(analysed_dimension)) elif self._appid == "2": self._generate_narratives() self._dimensionNode.add_cards([self._card1]) self._dimensionNode.set_name("{}".format(analysed_dimension)) elif self._appid == "1": self._generate_narratives() self._dimensionNode.add_cards([self._card1]) self._dimensionNode.set_name("{}".format(analysed_dimension)) def get_dimension_node(self): return json.loads( CommonUtils.convert_python_object_to_json(self._dimensionNode)) def get_dimension_card1(self): return self._card1 def _generate_narratives(self): chisquare_result = self._chisquare_result target_dimension = self._target_dimension analysed_dimension = self._analysed_dimension significant_variables = self._significant_variables num_analysed_variables = self._num_analysed_variables table = self._chiSquareTable total = self._chiSquareTable.get_total() levels = self._chiSquareTable.get_column_two_levels() level_counts = self._chiSquareTable.get_column_total() levels_count_sum = sum(level_counts) levels_percentages = [ i * 100.0 / levels_count_sum for i in level_counts ] sorted_levels = sorted(zip(level_counts, levels), reverse=True) level_differences = [0.0] + [ sorted_levels[i][0] - sorted_levels[i + 1][0] for i in range(len(sorted_levels) - 1) ] top_dims = [ j for i, j in sorted_levels[:level_differences.index(max(level_differences))] ] top_dims_contribution = sum([ i for i, j in sorted_levels[:level_differences.index(max(level_differences))] ]) bottom_dim = sorted_levels[-1][1] bottom_dim_contribution = sorted_levels[-1][0] bottom_dims = [ y for x, y in sorted_levels if x == bottom_dim_contribution ] target_levels = self._chiSquareTable.get_column_one_levels() target_counts = self._chiSquareTable.get_row_total() sorted_target_levels = 
sorted(zip(target_counts, target_levels), reverse=True) top_target_count, top_target = sorted_target_levels[0] second_target_count, second_target = sorted_target_levels[1] top_target_contributions = [ table.get_value(top_target, i) for i in levels ] sum_top_target = sum(top_target_contributions) sorted_levels = sorted(zip(top_target_contributions, levels), reverse=True) level_differences = [0.0] + [ sorted_levels[i][0] - sorted_levels[i + 1][0] for i in range(len(sorted_levels) - 1) ] top_target_top_dims = [ j for i, j in sorted_levels[:level_differences.index(max(level_differences))] ] top_target_top_dims_contribution = sum([ i for i, j in sorted_levels[:level_differences.index(max(level_differences))] ]) top_target_bottom_dim = sorted_levels[-1][1] top_target_bottom_dim_contribution = sorted_levels[-1][0] top_target_percentages = [ i * 100.0 / sum_top_target for i in top_target_contributions ] best_top_target_index = top_target_contributions.index( max(top_target_contributions)) worst_top_target_index = top_target_contributions.index( min(top_target_contributions)) top_target_differences = [ x - y for x, y in zip(levels_percentages, top_target_percentages) ] if len(top_target_differences) > 6: tops = 2 bottoms = -2 elif len(top_target_differences) > 4: tops = 2 bottoms = -1 else: tops = 1 bottoms = -1 sorted_ = sorted(enumerate(top_target_differences), key=lambda x: x[1], reverse=True) best_top_difference_indices = [x for x, y in sorted_[:tops]] worst_top_difference_indices = [x for x, y in sorted_[bottoms:]] top_target_shares = [ x * 100.0 / y for x, y in zip(top_target_contributions, level_counts) ] max_top_target_shares = max(top_target_shares) best_top_target_share_index = [ idx for idx, val in enumerate(top_target_shares) if val == max_top_target_shares ] level_counts_threshold = sum(level_counts) * 0.05 / len(level_counts) min_top_target_shares = min([ x for x, y in zip(top_target_shares, level_counts) if y >= level_counts_threshold ]) 
worst_top_target_share_index = [ idx for idx, val in enumerate(top_target_shares) if val == min_top_target_shares ] overall_top_percentage = sum_top_target * 100.0 / total second_target_contributions = [ table.get_value(second_target, i) for i in levels ] sum_second_target = sum(second_target_contributions) sorted_levels = sorted(zip(second_target_contributions, levels), reverse=True) level_differences = [0.0] + [ sorted_levels[i][0] - sorted_levels[i + 1][0] for i in range(len(sorted_levels) - 1) ] second_target_top_dims = [ j for i, j in sorted_levels[:level_differences.index(max(level_differences))] ] second_target_top_dims_contribution = sum([ i for i, j in sorted_levels[:level_differences.index(max(level_differences))] ]) second_target_bottom_dim = sorted_levels[-1][1] second_target_bottom_dim_contribution = sorted_levels[-1][0] second_target_percentages = [ i * 100.0 / sum_second_target for i in second_target_contributions ] best_second_target_index = second_target_contributions.index( max(second_target_contributions)) worst_second_target_index = second_target_contributions.index( min(second_target_contributions)) second_target_differences = [ x - y for x, y in zip(levels_percentages, second_target_percentages) ] if len(second_target_differences) > 6: tops = 2 bottoms = -2 elif len(second_target_differences) > 4: tops = 2 bottoms = -1 else: tops = 1 bottoms = -1 sorted_ = sorted(enumerate(second_target_differences), key=lambda x: x[1], reverse=True) best_second_difference_indices = [x for x, y in sorted_[:tops]] worst_second_difference_indices = [x for x, y in sorted_[bottoms:]] second_target_shares = [ x * 100.0 / y for x, y in zip(second_target_contributions, level_counts) ] max_second_target_shares = max(second_target_shares) best_second_target_share_index = [ idx for idx, val in enumerate(second_target_shares) if val == max_second_target_shares ] level_counts_threshold = sum(level_counts) * 0.05 / len(level_counts) min_second_target_shares = min([ x for 
x, y in zip(second_target_shares, level_counts) if y >= level_counts_threshold ]) # worst_second_target_share_index = second_target_shares.index(min_second_target_shares) worst_second_target_share_index = [ idx for idx, val in enumerate(second_target_shares) if val == min_second_target_shares ] overall_second_percentage = sum_second_target * 100.0 / total targetCardDataDict = {} targetCardDataDict['target'] = target_dimension targetCardDataDict['colname'] = analysed_dimension targetCardDataDict['num_significant'] = len(significant_variables) targetCardDataDict['plural_colname'] = NarrativesUtils.pluralize( analysed_dimension) targetCardDataDict["blockSplitter"] = self._blockSplitter targetCardDataDict["binTargetCol"] = self._binTargetCol targetCardDataDict["binAnalyzedCol"] = self._binAnalyzedCol targetCardDataDict['highlightFlag'] = self._highlightFlag targetCardDataDict['levels'] = levels data_dict = {} data_dict[ 'best_second_difference'] = best_second_difference_indices ##these changed data_dict['worst_second_difference'] = worst_second_difference_indices data_dict['best_top_difference'] = best_top_difference_indices data_dict['worst_top_difference'] = worst_top_difference_indices data_dict['levels_percentages'] = levels_percentages data_dict['top_target_percentages'] = top_target_percentages data_dict['second_target_percentages'] = second_target_percentages data_dict['levels'] = levels data_dict['best_top_share'] = best_top_target_share_index data_dict['worst_top_share'] = worst_top_target_share_index data_dict['best_second_share'] = best_second_target_share_index data_dict['worst_second_share'] = worst_second_target_share_index data_dict['top_target_shares'] = top_target_shares data_dict['second_target_shares'] = second_target_shares data_dict['overall_second'] = overall_second_percentage data_dict['overall_top'] = overall_top_percentage data_dict['num_significant'] = len(significant_variables) data_dict['colname'] = analysed_dimension 
data_dict['plural_colname'] = NarrativesUtils.pluralize( analysed_dimension) data_dict['target'] = target_dimension data_dict['top_levels'] = top_dims data_dict['top_levels_percent'] = round( top_dims_contribution * 100.0 / total, 1) data_dict['bottom_level'] = bottom_dim data_dict['bottom_levels'] = bottom_dims data_dict['bottom_level_percent'] = round( bottom_dim_contribution * 100 / sum(level_counts), 2) data_dict['second_target'] = second_target data_dict['second_target_top_dims'] = second_target_top_dims data_dict[ 'second_target_top_dims_contribution'] = second_target_top_dims_contribution * 100.0 / sum( second_target_contributions) data_dict['second_target_bottom_dim'] = second_target_bottom_dim data_dict[ 'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution data_dict['best_second_target'] = levels[best_second_target_index] data_dict['best_second_target_count'] = second_target_contributions[ best_second_target_index] data_dict['best_second_target_percent'] = round( second_target_contributions[best_second_target_index] * 100.0 / sum(second_target_contributions), 2) data_dict['worst_second_target'] = levels[worst_second_target_index] data_dict['worst_second_target_percent'] = round( second_target_contributions[worst_second_target_index] * 100.0 / sum(second_target_contributions), 2) data_dict['top_target'] = top_target data_dict['top_target_top_dims'] = top_target_top_dims data_dict[ 'top_target_top_dims_contribution'] = top_target_top_dims_contribution * 100.0 / sum( top_target_contributions) data_dict['top_target_bottom_dim'] = top_target_bottom_dim data_dict[ 'top_target_bottom_dim_contribution'] = top_target_bottom_dim_contribution data_dict['best_top_target'] = levels[best_top_target_index] data_dict['best_top_target_count'] = top_target_contributions[ best_top_target_index] data_dict['best_top_target_percent'] = round( top_target_contributions[best_top_target_index] * 100.0 / sum(top_target_contributions), 2) 
data_dict['worst_top_target'] = levels[worst_top_target_index] data_dict['worst_top_target_percent'] = round( top_target_contributions[worst_top_target_index] * 100.0 / sum(top_target_contributions), 2) data_dict["blockSplitter"] = self._blockSplitter data_dict["binTargetCol"] = self._binTargetCol data_dict["binAnalyzedCol"] = self._binAnalyzedCol data_dict['highlightFlag'] = self._highlightFlag ############### # CARD1 # ############### print "self._binTargetCol & self._binAnalyzedCol : ", self._binTargetCol, self._binAnalyzedCol if (self._binTargetCol == True & self._binAnalyzedCol == False): print "Only Target Column is Binned, : ", self._binTargetCol output = NarrativesUtils.block_splitter( NarrativesUtils.get_template_output( self._base_dir, 'card1_binned_target.html', data_dict), self._blockSplitter, highlightFlag=self._highlightFlag) elif (self._binTargetCol == True & self._binAnalyzedCol == True): print "Target Column and IV is Binned : ", self._binTargetCol, self._binAnalyzedCol output = NarrativesUtils.block_splitter( NarrativesUtils.get_template_output( self._base_dir, 'card1_binned_target_and_IV.html', data_dict), self._blockSplitter, highlightFlag=self._highlightFlag) else: output = NarrativesUtils.block_splitter( NarrativesUtils.get_template_output(self._base_dir, 'card1.html', data_dict), self._blockSplitter, highlightFlag=self._highlightFlag) targetDimCard1Data = [] targetDimcard1Heading = '<h3>Relationship between ' + self._target_dimension + ' and ' + self._analysed_dimension + "</h3>" toggledata = ToggleData() targetDimTable1Data = self.generate_card1_table1() targetDimCard1Table1 = TableData() targetDimCard1Table1.set_table_type("heatMap") targetDimCard1Table1.set_table_data(targetDimTable1Data) toggledata.set_toggleon_data({ "data": { "tableData": targetDimTable1Data, "tableType": "heatMap" }, "dataType": "table" }) targetDimTable2Data = self.generate_card1_table2() targetDimCard1Table2 = TableData() targetDimCard1Table2.set_table_type("normal") 
        # --------------------------------------------------------------------
        # CARD 1 (tail): attach the heat-map toggle table and finish card 1.
        # NOTE(review): this is the tail of a method whose beginning is not in
        # this view; `targetDimTable2Data`, `targetDimCard1Table2`,
        # `toggledata`, `targetDimCard1Data`, `targetDimcard1Heading`,
        # `output`, `targetCardDataDict`, `sorted_target_levels`, `levels`,
        # `level_counts`, `levels_percentages`, `table`, `total` and
        # `data_dict` are all bound earlier in the same method.
        # --------------------------------------------------------------------
        table2Data = targetDimTable2Data["data1"]
        # Drop the leading label cell of each row and skip rows whose label
        # is blank.
        table2Data = [
            innerList[1:] for innerList in table2Data
            if innerList[0].strip() != ""
        ]
        targetDimCard1Table2.set_table_data(table2Data)
        toggledata.set_toggleoff_data({
            "data": {
                "tableData": table2Data,
                "tableType": "heatMap"
            },
            "dataType": "table"
        })
        targetDimCard1Data.append(HtmlData(data=targetDimcard1Heading))
        targetDimCard1Data.append(toggledata)
        targetDimCard1Data += output
        self._card1.set_card_data(targetDimCard1Data)
        self._card1.set_card_name("{}: Relationship with {}".format(
            self._analysed_dimension, self._target_dimension))
        ###############
        #    CARD2    #
        ###############
        if self._appid == None:
            # Render the second-level dimensions as an English list
            # ("a, b, c and d"); only 1-5 factors are expected here because
            # _second_level_dimensions is capped at 3 or 5 in __init__.
            key_factors = ''
            num_key_factors = len(self._second_level_dimensions)
            if len(self._second_level_dimensions) == 5:
                key_factors = ', '.join(
                    self._second_level_dimensions[:4]
                ) + ' and ' + self._second_level_dimensions[4]
            elif len(self._second_level_dimensions) == 4:
                key_factors = ', '.join(
                    self._second_level_dimensions[:3]
                ) + ' and ' + self._second_level_dimensions[3]
            elif len(self._second_level_dimensions) == 3:
                key_factors = ', '.join(
                    self._second_level_dimensions[:2]
                ) + ' and ' + self._second_level_dimensions[2]
            elif len(self._second_level_dimensions) == 2:
                key_factors = ' and '.join(self._second_level_dimensions)
            elif len(self._second_level_dimensions) == 1:
                key_factors = self._second_level_dimensions[0]
            targetCardDataDict['num_key_factors'] = num_key_factors
            targetCardDataDict['key_factors'] = key_factors
        dict_for_test = {}
        # One "Distribution of <target level>" card per target level, capped
        # by the configured chi-square level limit.
        for tupleObj in sorted_target_levels[:self._chiSquareLevelLimit]:
            targetLevel = tupleObj[1]
            # Random numbers feed template variation, not statistics.
            targetCardDataDict['random_card2'] = random.randint(1, 100)
            targetCardDataDict['random_card4'] = random.randint(1, 100)
            # Contingency-table counts of this target level across the
            # analysed dimension's levels.
            second_target_contributions = [
                table.get_value(targetLevel, i) for i in levels
            ]
            sum_second_target = sum(second_target_contributions)
            sorted_levels = sorted(zip(second_target_contributions, levels),
                                   reverse=True)
            # Gaps between consecutive sorted contributions; the largest gap
            # separates the "top" group of levels from the rest.
            level_differences = [0.0] + [
                sorted_levels[i][0] - sorted_levels[i + 1][0]
                for i in range(len(sorted_levels) - 1)
            ]
            second_target_top_dims = [
                j for i, j in sorted_levels[:level_differences.
                                            index(max(level_differences))]
            ]
            second_target_top_dims_contribution = sum([
                i for i, j in sorted_levels[:level_differences.
                                            index(max(level_differences))]
            ])
            second_target_bottom_dim = sorted_levels[-1][1]
            second_target_bottom_dim_contribution = sorted_levels[-1][0]
            second_target_percentages = [
                i * 100.0 / sum_second_target
                for i in second_target_contributions
            ]
            best_second_target_index = second_target_contributions.index(
                max(second_target_contributions))
            worst_second_target_index = second_target_contributions.index(
                min(second_target_contributions))
            # Per-level difference between the overall distribution and this
            # target level's distribution (percentage points).
            second_target_differences = [
                x - y for x, y in
                zip(levels_percentages, second_target_percentages)
            ]
            # How many best/worst levels to highlight, by level count.
            if len(second_target_differences) > 6:
                tops = 2
                bottoms = -2
            elif len(second_target_differences) > 4:
                tops = 2
                bottoms = -1
            else:
                tops = 1
                bottoms = -1
            sorted_ = sorted(enumerate(second_target_differences),
                             key=lambda x: x[1],
                             reverse=True)
            best_second_difference_indices = [x for x, y in sorted_[:tops]]
            worst_second_difference_indices = [
                x for x, y in sorted_[bottoms:]
            ]
            # Share of each analysed-dimension level that falls in this
            # target level.
            second_target_shares = [
                x * 100.0 / y
                for x, y in zip(second_target_contributions, level_counts)
            ]
            max_second_target_shares = max(second_target_shares)
            best_second_target_share_index = [
                idx for idx, val in enumerate(second_target_shares)
                if val == max_second_target_shares
            ]
            # When picking the worst share, ignore levels whose row count is
            # below 5% of the mean level count (too small to be meaningful).
            level_counts_threshold = sum(level_counts) * 0.05 / len(
                level_counts)
            min_second_target_shares = min([
                x for x, y in zip(second_target_shares, level_counts)
                if y >= level_counts_threshold
            ])
            worst_second_target_share_index = [
                idx for idx, val in enumerate(second_target_shares)
                if val == min_second_target_shares
            ]
            overall_second_percentage = sum_second_target * 100.0 / total

            # DataFrame for contribution calculation: rows of the top
            # analysed-dimension level, with and without the target filter,
            # projected onto the second-level dimensions.
            df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                select(self._second_level_dimensions).toPandas()
            df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                select(self._second_level_dimensions).toPandas()

            # if self._chisquare_result.get_splits():
            #     splits = self._chisquare_result.get_splits()
            #     idx = self._chiSquareTable.get_bin_names(splits).index(second_target_top_dims[0])
            #     idx1 = self._chiSquareTable.get_bin_names(splits).index(top_target_top_dims[0])
            #     splits[len(splits)-1] = splits[len(splits)-1]+1
            #     df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
            #                 filter(col(self._analysed_dimension)>=splits[idx]).filter(col(self._analysed_dimension)<splits[idx+1]).\
            #                 select(self._second_level_dimensions).toPandas()
            #     df_second_dim = self._data_frame.filter(col(self._analysed_dimension)>=splits[idx]).\
            #                 filter(col(self._analysed_dimension)<splits[idx+1]).\
            #                 select(self._second_level_dimensions).toPandas()
            # else:
            #     df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
            #                 filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
            #                 select(self._second_level_dimensions).toPandas()
            #     df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
            #                 select(self._second_level_dimensions).toPandas()
            # print self._data_frame.select('Sales').show()

            # Per second-level dimension: value counts within the filtered
            # slice, percentage contributions, and a headline snippet.
            distribution_second = []
            for d in self._second_level_dimensions:
                grouped = df_second_target.groupby(d).agg({
                    d: 'count'
                }).sort_values(d, ascending=False)
                contributions = df_second_dim.groupby(d).agg({d: 'count'})
                contribution_index = list(contributions.index)
                contributions_val = contributions[d].tolist()
                contributions_list = dict(
                    zip(contribution_index, contributions_val))
                index_list = list(grouped.index)
                grouped_list = grouped[d].tolist()
                contributions_percent_list = [
                    round(y * 100.0 / contributions_list[x], 2)
                    for x, y in zip(index_list, grouped_list)
                ]
                sum_ = grouped[d].sum()
                # Largest drop between consecutive counts marks how many
                # leading values to mention in the narrative.
                diffs = [0] + [
                    grouped_list[i] - grouped_list[i + 1]
                    for i in range(len(grouped_list) - 1)
                ]
                max_diff = diffs.index(max(diffs))
                index_txt = ''
                if max_diff == 1:
                    index_txt = index_list[0]
                elif max_diff == 2:
                    index_txt = index_list[0] + '(' + str(
                        round(grouped_list[0] * 100.0 / sum_, 1)
                    ) + '%)' + ' and ' + index_list[1] + '(' + str(
                        round(grouped_list[1] * 100.0 / sum_, 1)) + '%)'
                elif max_diff > 2:
                    index_txt = 'including ' + index_list[0] + '(' + str(
                        round(grouped_list[0] * 100.0 / sum_, 1)
                    ) + '%)' + ' and ' + index_list[1] + '(' + str(
                        round(grouped_list[1] * 100.0 / sum_, 1)) + '%)'
                distribution_second.append({'contributions':[round(i*100.0/sum_,2) for i in grouped_list[:max_diff]],\
                    'levels': index_list[:max_diff],'variation':random.randint(1,100),\
                    'index_txt': index_txt, 'd':d,'contributions_percent':contributions_percent_list})
            # Template payload for the card-2 narrative.
            targetCardDataDict['distribution_second'] = distribution_second
            targetCardDataDict['second_target'] = targetLevel
            targetCardDataDict[
                'second_target_top_dims'] = second_target_top_dims
            targetCardDataDict[
                'second_target_top_dims_contribution'] = second_target_top_dims_contribution * 100.0 / sum(
                    second_target_contributions)
            targetCardDataDict[
                'second_target_bottom_dim'] = second_target_bottom_dim
            targetCardDataDict[
                'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution
            targetCardDataDict['best_second_target'] = levels[
                best_second_target_index]
            targetCardDataDict[
                'best_second_target_count'] = second_target_contributions[
                    best_second_target_index]
            targetCardDataDict['best_second_target_percent'] = round(
                second_target_contributions[best_second_target_index] *
                100.0 / sum(second_target_contributions), 2)
            targetCardDataDict['worst_second_target'] = levels[
                worst_second_target_index]
            targetCardDataDict['worst_second_target_percent'] = round(
                second_target_contributions[worst_second_target_index] *
                100.0 / sum(second_target_contributions), 2)

            # Assemble card 2: heading, combination chart (counts as bars,
            # percentages as line), narrative blocks, and two KPI bubbles.
            card2Data = []
            targetLevelContributions = [
                table.get_value(targetLevel, i) for i in levels
            ]
            card2Heading = '<h3>Distribution of ' + self._target_dimension + ' (' + targetLevel + ') across ' + self._analysed_dimension + "</h3>"
            chart, bubble = self.generate_distribution_card_chart(
                targetLevel, targetLevelContributions, levels, level_counts,
                total)
            card2ChartData = NormalChartData(data=chart["data"])
            card2ChartJson = ChartJson()
            card2ChartJson.set_data(card2ChartData.get_data())
            card2ChartJson.set_chart_type("combination")
            card2ChartJson.set_types({
                "total": "bar",
                "percentage": "line"
            })
            card2ChartJson.set_legend({
                "total": "# of " + targetLevel,
                "percentage": "% of " + targetLevel
            })
            card2ChartJson.set_axes({
                "x": "key",
                "y": "total",
                "y2": "percentage"
            })
            card2ChartJson.set_label_text({
                "x": " ",
                "y": "Count",
                "y2": "Percentage"
            })
            # NOTE(review): Python-2 print statements below; the rest of the
            # file also uses py3-style print() calls — confirm the target
            # runtime before porting.
            print "self._binTargetCol & self._binAnalyzedCol : ", self._binTargetCol, self._binAnalyzedCol
            # NOTE(review): `&` binds tighter than `==`, so the test below
            # parses as the chained comparison
            # `self._binTargetCol == (True & self._binAnalyzedCol) == False`,
            # which is true only when BOTH flags are False — not the intended
            # "target binned and analysed column not binned". Almost certainly
            # `and` was meant (the elif happens to behave as intended only by
            # accident of chaining). TODO: confirm and fix.
            if (self._binTargetCol == True & self._binAnalyzedCol == False):
                print "Only Target Column is Binned"
                output2 = NarrativesUtils.block_splitter(
                    NarrativesUtils.get_template_output(
                        self._base_dir, 'card2_binned_target.html',
                        targetCardDataDict), self._blockSplitter)
            elif (self._binTargetCol == True & self._binAnalyzedCol == True):
                print "Target Column and IV is Binned"
                output2 = NarrativesUtils.block_splitter(
                    NarrativesUtils.get_template_output(
                        self._base_dir, 'card2_binned_target_and_IV.html',
                        targetCardDataDict), self._blockSplitter)
            else:
                print "In Else, self._binTargetCol should be False : ", self._binTargetCol
                output2 = NarrativesUtils.block_splitter(
                    NarrativesUtils.get_template_output(
                        self._base_dir, 'card2.html', targetCardDataDict),
                    self._blockSplitter)
            card2Data.append(HtmlData(data=card2Heading))
            statistical_info_array = [
                ("Test Type", "Chi-Square"),
                ("Chi-Square statistic",
                 str(round(self._chisquare_result.get_stat(), 3))),
                ("P-Value",
                 str(round(self._chisquare_result.get_pvalue(), 3))),
                ("Inference",
                 "Chi-squared analysis shows a significant association between {} (target) and {}."
                 .format(self._target_dimension, self._analysed_dimension))
            ]
            statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                statistical_info_array)
            card2Data.append(
                C3ChartData(data=card2ChartJson, info=statistical_info_array))
            card2Data += output2
            # Two KPI bubbles produced by generate_distribution_card_chart.
            card2BubbleData = "<div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div><div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div>".format(
                bubble[0]["value"], bubble[0]["text"], bubble[1]["value"],
                bubble[1]["text"])
            card2Data.append(HtmlData(data=card2BubbleData))
            targetCard = NormalCard()
            targetCard.set_card_data(card2Data)
            targetCard.set_card_name("{} : Distribution of {}".format(
                self._analysed_dimension, targetLevel))
            self._targetCards.append(targetCard)
            dict_for_test[targetLevel] = targetCardDataDict
        out = {'data_dict': data_dict, 'target_dict': dict_for_test}
        return out

    # def generate_card2_narratives(self):

    def generate_distribution_card_chart(self, __target,
                                         __target_contributions, levels,
                                         levels_count, total):
        """Build the card-2 combination-chart payload and two KPI bubbles.

        Returns a tuple ``(chart_data, bubble_data)`` where ``chart_data`` is
        ``{'label': ..., 'data': [{'key', 'total', 'percentage'}, ...]}`` and
        ``bubble_data`` is a two-element list of ``{'value', 'text'}`` dicts:
        the level contributing most of the target, and the level with the
        highest target rate.
        """
        chart = {}
        label = {'total': '# of ' + __target, 'percentage': '% of ' + __target}
        label_text = {
            'x': self._analysed_dimension,
            'y': '# of ' + __target,
            'y2': '% of ' + __target,
        }
        data = {}
        data['total'] = dict(zip(levels, __target_contributions))
        # Percentage of each level's rows that belong to the target level.
        __target_percentages = [
            x * 100.0 / y for x, y in zip(__target_contributions, levels_count)
        ]
        data['percentage'] = dict(zip(levels, __target_percentages))
        chartData = []
        for val in zip(levels, __target_contributions, __target_percentages):
            chartData.append({
                "key": val[0],
                "total": val[1],
                "percentage": val[2]
            })
        # c3_data = [levels,__target_contributions,__target_percentages]
        chart_data = {'label': label, 'data': chartData}
        bubble_data1 = {}
        bubble_data2 = {}
        bubble_data1['value'] = str(
            round(
                max(__target_contributions) * 100.0 /
                sum(__target_contributions), 1)) + '%'
        m_index = __target_contributions.index(max(__target_contributions))
        bubble_data1[
            'text'] = 'Overall ' + __target + ' comes from ' + levels[m_index]
        bubble_data2['value'] = str(round(max(__target_percentages), 1)) + '%'
        m_index = __target_percentages.index(max(__target_percentages))
        bubble_data2[
            'text'] = levels[m_index] + ' has the highest rate of ' + __target
        bubble_data = [bubble_data1, bubble_data2]
        return chart_data, bubble_data

    def generate_card1_table1(self):
        """Return card-1's first table: header row followed by one row per
        analysed-dimension level, each holding the column-percentage values.
        """
        table_percent_by_column = self._chiSquareTable.table_percent_by_column
        column_two_values = self._chiSquareTable.column_two_values
        header_row = [self._analysed_dimension
                      ] + self._chiSquareTable.get_column_one_levels()
        all_columns = [column_two_values] + table_percent_by_column
        # Transpose columns into rows (works on py2 and py3 since the zip
        # object is only iterated here).
        other_rows = zip(*all_columns)
        other_rows = [list(tup) for tup in other_rows]
        table_data = [header_row] + other_rows
        return table_data

    def generate_card1_table2(self):
        """Return card-1's second table: per analysed-dimension level, four
        rows (counts, % within analysed dim, % within target, % of total),
        both as a list of dicts (``data``) and as a list of lists (``data1``).
        """
        table = self._chiSquareTable.table
        table_percent = self._chiSquareTable.table_percent
        table_percent_by_row = self._chiSquareTable.table_percent_by_row
        table_percent_by_column = self._chiSquareTable.table_percent_by_column
        target_levels = self._chiSquareTable.get_column_one_levels()
        dim_levels = self._chiSquareTable.get_column_two_levels()
        header1 = [self._analysed_dimension] + target_levels + ['Total']
        header = ['State', 'Active', 'Churn', 'Total']  #TODO remove
        data = []
        data1 = [['Tag'] + header1]
        for idx, lvl in enumerate(dim_levels):
            first_row = ['Tag'] + header
            # NOTE(review): subscripting zip() results (``zip(*table)[idx]``)
            # only works on Python 2; on Python 3 this raises TypeError —
            # confirm target runtime (would need ``list(zip(*table))[idx]``).
            col_2_vals = zip(*table)[idx]
            # Row 1: raw counts plus the row total, flagged 'bold'.
            data2 = ['bold'] + [lvl] + list(col_2_vals) + [sum(col_2_vals)]
            dict_ = dict(zip(first_row, data2))
            data.append(dict_)
            data1.append(data2)
            # Row 2: percentages within the analysed dimension (sum to 100).
            col_2_vals = zip(*table_percent_by_column)[idx]
            data2 = [''] + ['As % within ' + self._analysed_dimension
                            ] + list(col_2_vals) + [100.0]
            dict_ = dict(zip(first_row, data2))
            data.append(dict_)
            data1.append(data2)
            # Row 3: percentages within the target dimension; the total cell
            # reuses the %-of-total sum.
            col_2_vals = zip(*table_percent_by_row)[idx]
            col_2_vals1 = zip(*table_percent)[idx]
            data2 = [''] + [
                'As % within ' + self._target_dimension
            ] + list(col_2_vals) + [round(sum(col_2_vals1), 2)]
            dict_ = dict(zip(first_row, data2))
            data.append(dict_)
            data1.append(data2)
            # Row 4: percentages of the grand total.
            # col_2_vals = zip(*table_percent)[idx]
            data2 = [''] + ['As % of Total'] + list(col_2_vals1) + [
                round(sum(col_2_vals1), 2)
            ]
            dict_ = dict(zip(first_row, data2))
            data.append(dict_)
            data1.append(data2)
        out = {
            'header': header,
            'header1': header1,
            'data': data,
            'label': self._analysed_dimension,
            'data1': data1
        }
        return out
class ChiSquareNarratives(object):
    """Builds the "Key Drivers" narrative tree from chi-square results.

    On construction it: (1) rewrites binned measure columns of the data frame
    into their bin-label strings, (2) ranks variables by decision-tree feature
    importance (``feat_imp_threshold``), and (3) emits a main "Key
    Influencers" card plus one ``ChiSquareAnalysis`` card per significant
    dimension, publishing everything through ``result_setter`` and
    ``story_narrative``.
    """

    #@accepts(object, int, DFChiSquareResult ,ContextSetter)
    def __init__(self,
                 df_helper,
                 df_chisquare_result,
                 spark,
                 df_context,
                 data_frame,
                 story_narrative,
                 result_setter,
                 scriptWeight=None,
                 analysisName=None):
        self._story_narrative = story_narrative
        self._result_setter = result_setter
        self._data_frame = data_frame
        self._dataframe_context = df_context
        # True -> pandas code paths, False -> pyspark code paths.
        self._pandas_flag = df_context._pandas_flag
        self._dataframe_helper = df_helper
        self._storyOnScoredData = self._dataframe_context.get_story_on_scored_data(
        )
        self._measure_columns = df_helper.get_numeric_columns()
        self._df_chisquare = df_chisquare_result
        self._df_chisquare_result = df_chisquare_result.get_result()
        self.narratives = {}
        self._appid = df_context.get_app_id()
        self._chiSquareNode = NarrativesTree()
        self._chiSquareNode.set_name("Key Drivers")
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        self._noOfSigDimsToShow = GLOBALSETTINGS.CHISQUARESIGNIFICANTDIMENSIONTOSHOW
        self._base_dir = "/chisquare/"
        self._spark = spark
        ############################DataFrame Measure to Dimesion Column#####################
        # Replace each binned measure column's numeric values with the label
        # of the bin ("<lo> to <hi>") they fall into, using the bin edges
        # recorded in the chi-square contingency tables.
        if self._pandas_flag:
            pandas_df = self._data_frame.copy(deep=True)
        else:
            pandas_df = self._data_frame.toPandas()
        target_dimension = list(self._df_chisquare_result.keys())
        bin_data = {}
        for col in self._measure_columns:
            if self._df_chisquare.get_chisquare_result(target_dimension[0],
                                                       col):
                chisquare_result = self._df_chisquare.get_chisquare_result(
                    target_dimension[0], col)
                bin_data[col] = chisquare_result.get_contingency_table(
                ).get_column_two_levels()
        for bin_col in list(bin_data.keys()):
            for split in bin_data[bin_col]:
                # Bin label format is "<lo> to <hi>" (possibly with commas).
                val = split.split('to')
                # pandas_df[bin_col][(float(pandas_df[bin_col])>=float(val[0].replace(',',''))) & (float(pandas_df[bin_col])<float(val[1].replace(',','')))] = split
                row_value = list(pandas_df[bin_col])
                temp = []
                for row_value_ in row_value:
                    # Values already replaced by an earlier split are strings
                    # and are passed through unchanged.
                    if not isinstance(row_value_, str) and \
                       (float(row_value_) >= float(val[0].replace(',',''))) and \
                       (float(row_value_) < float(val[1].replace(',',''))):
                        temp.append(split)
                    else:
                        temp.append(row_value_)
                pandas_df[bin_col] = temp
        if self._pandas_flag:
            pass
            # self._data_frame = pandas_df
        else:
            # Rebuild the spark DataFrame with every column as StringType,
            # since binned columns now hold label strings.
            fields = [
                StructField(field_name, StringType(), True)
                for field_name in pandas_df.columns
            ]
            schema = StructType(fields)
            SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                                sparkSession=self._spark)
            self._data_frame = SQLctx.createDataFrame(pandas_df, schema)
        # print self._data_frame
        ############################DataFrame Measure to Dimesion Column#####################
        if self._appid != None:
            if self._appid == "1":
                self._base_dir += "appid1/"
            elif self._appid == "2":
                self._base_dir += "appid2/"
        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        if analysisName == None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName
        self._messageURL = self._dataframe_context.get_message_url()
        if scriptWeight == None:
            self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight(
            )
        else:
            self._scriptWeightDict = scriptWeight
        self._analysisDict = self._dataframe_context.get_analysis_dict()
        if self._analysisDict != {}:
            self._nColsToUse = self._analysisDict[
                self._analysisName]["noOfColumnsToUse"]
        else:
            self._nColsToUse = None
        # Progress-message stages reported while the narratives are built.
        self._scriptStages = {
            "initialization": {
                "summary": "Initialized the Frequency Narratives",
                "weight": 0
            },
            "summarygeneration": {
                "summary": "Summary Generation Finished",
                "weight": 4
            },
            "completion": {
                "summary": "Frequency Stats Narratives Done",
                "weight": 0
            },
        }
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "initialization",
            "info",
            display=False,
            weightKey="narratives")
        self.new_effect_size, self.signi_dict = self.feat_imp_threshold(
            target_dimension)
        self._generate_narratives()
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "summarygeneration",
            "info",
            display=False,
            weightKey="narratives")
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "completion",
            "info",
            display=False,
            weightKey="narratives")

    def feat_imp_threshold(self,
                           target_dimension,
                           dummy_Cols=True,
                           label_encoding=False):
        """Rank variables by decision-tree feature importance.

        Runs the AutoML validation/preprocessing/feature-engineering chain,
        fits a depth-5 gini decision tree (sklearn or pyspark depending on
        ``self._pandas_flag``), and aggregates encoded-feature importances
        back onto the original columns.

        Returns a tuple ``(feat_imp_dict, si_var_thresh)``:
        ``feat_imp_dict`` maps encoded feature names to importances;
        ``si_var_thresh`` maps categorical columns to importance, keeping
        only those whose cumulative importance stays below 0.8.
        """
        # Infer problem type from the target column's dtype.
        if self._pandas_flag:
            if is_numeric_dtype(self._data_frame[target_dimension[0]]):
                self.app_type = 'regression'
            elif is_string_dtype(self._data_frame[target_dimension[0]]):
                self.app_type = 'classification'
        else:
            if self._data_frame.select(
                    target_dimension[0]).dtypes[0][1] == 'string':
                self.app_type = 'classification'
            elif self._data_frame.select(
                    target_dimension[0]).dtypes[0][1] in ['int', 'double']:
                self.app_type = 'regression'
        try:
            DataValidation_obj = DataValidation(self._data_frame,
                                                target_dimension[0],
                                                self.app_type,
                                                self._pandas_flag)
            DataValidation_obj.data_validation_run()
        except Exception as e:
            # NOTE(review): self.LOGGER / self.errorURL / self.ignoreMsg are
            # not set anywhere visible in this class, so this error path
            # would itself raise AttributeError — TODO confirm they are
            # injected elsewhere.
            CommonUtils.print_errors_and_store_traceback(
                self.LOGGER, "datavalidation", e)
            CommonUtils.save_error_messages(self.errorURL,
                                            self.app_type,
                                            e,
                                            ignore=self.ignoreMsg)
        try:
            DataPreprocessingAutoML_obj = DataPreprocessingAutoML(
                DataValidation_obj.data_frame, DataValidation_obj.target,
                DataValidation_obj.data_change_dict,
                DataValidation_obj.numeric_cols,
                DataValidation_obj.dimension_cols,
                DataValidation_obj.datetime_cols,
                DataValidation_obj.problem_type, self._pandas_flag)
            DataPreprocessingAutoML_obj.data_preprocessing_run()
        except Exception as e:
            CommonUtils.print_errors_and_store_traceback(
                self.LOGGER, "dataPreprocessing", e)
            CommonUtils.save_error_messages(self.errorURL,
                                            self.app_type,
                                            e,
                                            ignore=self.ignoreMsg)
        preprocess_df = DataPreprocessingAutoML_obj.data_frame
        FeatureEngineeringAutoML_obj = FeatureEngineeringAutoML(
            DataPreprocessingAutoML_obj.data_frame,
            DataPreprocessingAutoML_obj.target,
            DataPreprocessingAutoML_obj.data_change_dict,
            DataPreprocessingAutoML_obj.numeric_cols,
            DataPreprocessingAutoML_obj.dimension_cols,
            DataPreprocessingAutoML_obj.datetime_cols,
            DataPreprocessingAutoML_obj.problem_type, self._pandas_flag)
        # NOTE(review): datetime_cols looks like a list, and a list never
        # equals 0, so this condition is always True; `len(...) != 0` was
        # probably intended — TODO confirm.
        if FeatureEngineeringAutoML_obj.datetime_cols != 0:
            FeatureEngineeringAutoML_obj.date_column_split(
                FeatureEngineeringAutoML_obj.datetime_cols)
        # Encode categoricals: one-hot by default, label encoding optional.
        if dummy_Cols:
            if self._pandas_flag:
                FeatureEngineeringAutoML_obj.sk_one_hot_encoding(
                    FeatureEngineeringAutoML_obj.dimension_cols)
                clean_df = FeatureEngineeringAutoML_obj.data_frame
            else:
                FeatureEngineeringAutoML_obj.pyspark_one_hot_encoding(
                    FeatureEngineeringAutoML_obj.dimension_cols)
                clean_df = FeatureEngineeringAutoML_obj.data_frame
        if label_encoding:
            if self._pandas_flag:
                for column_name in FeatureEngineeringAutoML_obj.dimension_cols:
                    preprocess_df[
                        column_name +
                        '_label_encoded'] = LabelEncoder().fit_transform(
                            preprocess_df[column_name])
                    preprocess_df = preprocess_df.drop(column_name, 1)
                clean_df = preprocess_df
            else:
                FeatureEngineeringAutoML_obj.pyspark_label_encoding(
                    FeatureEngineeringAutoML_obj.dimension_cols)
                clean_df = FeatureEngineeringAutoML_obj.data_frame
        if self._pandas_flag:
            # sklearn: fit on all numeric columns except the target.
            ind_var = clean_df.drop(target_dimension[0], 1)
            ind_var = ind_var[ind_var._get_numeric_data().columns]
            target = clean_df[target_dimension[0]]
            dtree = DecisionTreeClassifier(criterion='gini',
                                           max_depth=5,
                                           random_state=42)
            dtree.fit(ind_var, target)
            feat_imp_dict = {}
            for feature, importance in zip(list(ind_var.columns),
                                           dtree.feature_importances_):
                feat_imp_dict[feature] = round(importance, 2)
        else:
            # pyspark: assemble numeric features, index the label, fit a
            # DecisionTreeClassifier, then read importances back through the
            # assembled vector's ml_attr metadata.
            num_var = [
                col[0] for col in clean_df.dtypes
                if ((col[1] == 'int') | (col[1] == 'double'))
                & (col[0] != target_dimension[0])
            ]
            num_var = [col for col in num_var if not col.endswith('indexed')]
            labels_count = [
                len(clean_df.select(col).distinct().collect())
                for col in num_var
            ]
            # labels_count = [len(clean_df.agg((F.collect_set(col).alias(col))).first().asDict()[col]) for col in num_var]
            labels_count.sort()
            # maxBins must exceed the largest category count.
            max_count = labels_count[-1]
            label_indexes = StringIndexer(inputCol=target_dimension[0],
                                          outputCol='label',
                                          handleInvalid='keep')
            assembler = VectorAssembler(inputCols=num_var,
                                        outputCol="features")
            model = pysparkDecisionTreeClassifier(labelCol="label",
                                                  featuresCol="features",
                                                  seed=8464,
                                                  impurity='gini',
                                                  maxDepth=5,
                                                  maxBins=max_count + 2)
            pipe = Pipeline(stages=[assembler, label_indexes, model])
            mod_fit = pipe.fit(clean_df)
            df2 = mod_fit.transform(clean_df)
            list_extract = []
            for i in df2.schema["features"].metadata["ml_attr"]["attrs"]:
                list_extract = list_extract + df2.schema["features"].metadata[
                    "ml_attr"]["attrs"][i]
            varlist = pd.DataFrame(list_extract)
            varlist['score'] = varlist['idx'].apply(
                lambda x: mod_fit.stages[-1].featureImportances[x])
            feat_imp_dict = pd.Series(varlist.score.values,
                                      index=varlist.name).to_dict()
        # Fold encoded-feature importances back onto original columns.
        # NOTE(review): matching is substring-based (split on the column
        # name); columns whose names contain one another could be
        # mis-attributed — TODO confirm acceptable for this dataset.
        feat_imp_ori_dict = {}
        actual_cols = list(self._data_frame.columns)
        actual_cols.remove(target_dimension[0])
        for col in actual_cols:
            fea_imp_ori_list = []
            for col_imp in feat_imp_dict:
                temp = col_imp.split(col, -1)
                if len(temp) == 2:
                    fea_imp_ori_list.append(feat_imp_dict[col_imp])
            feat_imp_ori_dict.update({col: sum(fea_imp_ori_list)})
        sort_dict = dict(
            sorted(feat_imp_ori_dict.items(),
                   key=lambda x: x[1],
                   reverse=True))
        # Collect categorical columns of the (post-binning) data frame.
        if self._pandas_flag:
            self._data_frame = self._data_frame.apply(
                lambda col: pd.to_datetime(col, errors='ignore')
                if col.dtypes == object else col,
                axis=0)
            cat_var = [
                key for key in dict(self._data_frame.dtypes)
                if dict(self._data_frame.dtypes)[key] in ['object']
            ]
        else:
            cat_var = [
                col[0] for col in self._data_frame.dtypes
                if col[1] == 'string'
            ]
        cat_var.remove(target_dimension[0])
        si_var_dict = {
            key: value
            for key, value in sort_dict.items() if key in cat_var
        }
        # Keep the most important categoricals until cumulative importance
        # reaches 0.8.
        threshold = 0
        si_var_thresh = {}
        for key, value in si_var_dict.items():
            threshold = threshold + value
            if threshold < 0.8:
                si_var_thresh[key] = value
        return feat_imp_dict, si_var_thresh

    def _generate_narratives(self):
        """
        generate main card narrative and remaining cards are generated by calling ChiSquareAnalysis
        class for each of analyzed dimensions
        """
        for target_dimension in list(self._df_chisquare_result.keys()):
            target_chisquare_result = self._df_chisquare_result[
                target_dimension]
            analysed_variables = list(
                target_chisquare_result.keys())  ## List of all analyzed var.
            # List of significant var out of analyzed var.
            # significant_variables = [dim for dim in list(target_chisquare_result.keys()) if target_chisquare_result[dim].get_pvalue()<=0.05]
            effect_size_dict = self.new_effect_size
            significant_variables = list(self.signi_dict.keys())
            effect_sizes = list(self.signi_dict.values())
            # Order significant variables by descending effect size, keeping
            # only those that round above zero.
            significant_variables = [
                y for (x, y) in sorted(zip(effect_sizes,
                                           significant_variables),
                                       reverse=True)
                if round(float(x), 2) > 0
            ]
            #insignificant_variables = [i for i in self._df_chisquare_result[target_dimension] if i['pv']>0.05]
            num_analysed_variables = len(analysed_variables)
            num_significant_variables = len(significant_variables)
            self.narratives['main_card'] = {}
            self.narratives['main_card'][
                'heading'] = 'Relationship between ' + target_dimension + ' and other factors'
            self.narratives['main_card']['paragraphs'] = {}
            data_dict = {
                'num_variables': num_analysed_variables,
                'num_significant_variables': num_significant_variables,
                'significant_variables': significant_variables,
                'target': target_dimension,
                'analysed_dimensions': analysed_variables,
                'blockSplitter': self._blockSplitter
            }
            # for both para 1 and para 2
            paragraph = {}
            paragraph['header'] = ''
            paragraph['content'] = NarrativesUtils.get_template_output(
                self._base_dir, 'main_card.html', data_dict)
            self.narratives['main_card']['paragraphs'] = [paragraph]
            self.narratives['cards'] = []
            chart = {
                'header':
                'Strength of association between ' + target_dimension +
                ' and other dimensions'
            }
            chart['data'] = effect_size_dict
            chart['label_text'] = {
                'x': 'Dimensions',
                'y': 'Feature Importance'
            }
            chart_data = []
            chartDataValues = []
            for k, v in list(effect_size_dict.items()):
                # NOTE(review): the original bare string literal here acted
                # as a comment ("rounding the chart data for keydrivers tab")
                # — it is a no-op expression statement.
                "rounding the chart data for keydrivers tab"
                if round(float(v), 2) > 0:
                    chart_data.append({
                        "Attribute": k,
                        "Effect_Size": round(float(v), 2)
                    })
                    chartDataValues.append(round(float(v), 2))
            chart_data = sorted(chart_data,
                                key=lambda x: x["Effect_Size"],
                                reverse=True)
            chart_json = ChartJson()
            chart_json.set_data(chart_data)
            chart_json.set_chart_type("bar")
            # chart_json.set_label_text({'x':'Dimensions','y':'Effect Size (Cramers-V)'})
            chart_json.set_label_text({'x': ' ', 'y': 'Feature Importance'})
            chart_json.set_axis_rotation(True)
            chart_json.set_axes({"x": "Attribute", "y": "Feature Importance"})
            chart_json.set_yaxis_number_format(".2f")
            # chart_json.set_yaxis_number_format(NarrativesUtils.select_y_axis_format(chartDataValues))
            self.narratives['main_card']['chart'] = chart
            main_card = NormalCard()
            header = "<h3>Key Factors that drive " + target_dimension + "</h3>"
            main_card_data = [HtmlData(data=header)]
            main_card_narrative = NarrativesUtils.get_template_output(
                self._base_dir, 'main_card.html', data_dict)
            main_card_narrative = NarrativesUtils.block_splitter(
                main_card_narrative, self._blockSplitter)
            main_card_data += main_card_narrative
            # st_info = ["Test : Chi Square", "Threshold for p-value : 0.05", "Effect Size : Cramer's V"]
            # print "chartdata",chart_data
            if len(chart_data) > 0:
                statistical_info_array = [
                    ("Test Type", "Chi-Square"),
                    ("Effect Size", "Cramer's V"),
                    ("Max Effect Size", chart_data[0]["Attribute"]),
                    ("Min Effect Size", chart_data[-1]["Attribute"]),
                ]
                # NOTE(review): typo — `statistical_inferenc` (missing final
                # "e"). Harmless only because every branch below assigns the
                # correctly spelled `statistical_inference`; worth fixing.
                statistical_inferenc = ""
                if len(chart_data) == 1:
                    statistical_inference = "{} is the only variable that have significant association with the {} (Target) having an \
Effect size of {}".format(
                        chart_data[0]["Attribute"],
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["Effect_Size"], 4))
                elif len(chart_data) == 2:
                    statistical_inference = "There are two variables ({} and {}) that have significant association with the {} (Target) and the \
Effect size ranges are {} and {} respectively".format(
                        chart_data[0]["Attribute"],
                        chart_data[1]["Attribute"],
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["Effect_Size"], 4),
                        round(chart_data[1]["Effect_Size"], 4))
                else:
                    statistical_inference = "There are {} variables that have significant association with the {} (Target) and the \
Effect size ranges from {} to {}".format(
                        len(chart_data),
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["Effect_Size"], 4),
                        round(chart_data[-1]["Effect_Size"], 4))
                if statistical_inference != "":
                    statistical_info_array.append(
                        ("Inference", statistical_inference))
                statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                    statistical_info_array)
            else:
                statistical_info_array = []
            main_card_data.append(
                C3ChartData(data=chart_json, info=statistical_info_array))
            main_card.set_card_data(main_card_data)
            main_card.set_card_name("Key Influencers")
            if self._storyOnScoredData != True:
                self._chiSquareNode.add_a_card(main_card)
            self._result_setter.add_a_score_chi_card(main_card)
            print("target_dimension", target_dimension)
            # Decide how many significant dimensions get their own card.
            # NOTE(review): when appid == '2' AND more than 5 significant
            # variables, `nColsToUse_temp` is never assigned, so the slice in
            # the loop below would raise NameError — TODO confirm and fix.
            if self._appid == '2' and num_significant_variables > 5:
                significant_variables = significant_variables[:5]
            else:
                if self._nColsToUse != None:
                    significant_variables = significant_variables[:self.
                                                                  _nColsToUse]
                    nColsToUse_temp = self._nColsToUse
                else:
                    nColsToUse_temp = self._noOfSigDimsToShow
            CommonUtils.create_update_and_save_progress_message(
                self._dataframe_context,
                self._scriptWeightDict,
                self._scriptStages,
                self._analysisName,
                "custom",
                "info",
                display=True,
                customMsg="Analyzing key drivers",
                weightKey="narratives")
            for analysed_dimension in significant_variables[:nColsToUse_temp]:
                chisquare_result = self._df_chisquare.get_chisquare_result(
                    target_dimension, analysed_dimension)
                if self._appid == '2':
                    print("APPID 2 is used")
                    card = ChiSquareAnalysis(
                        self._dataframe_context, self._dataframe_helper,
                        chisquare_result, target_dimension,
                        analysed_dimension, significant_variables,
                        num_analysed_variables, self._data_frame,
                        self._measure_columns, self._base_dir, None,
                        target_chisquare_result)
                    # self.narratives['cards'].append(card)
                    self._result_setter.add_a_score_chi_card(
                        json.loads(
                            CommonUtils.convert_python_object_to_json(
                                card.get_dimension_card1())))
                elif self._appid == '1':
                    print("APPID 1 is used")
                    card = ChiSquareAnalysis(
                        self._dataframe_context, self._dataframe_helper,
                        chisquare_result, target_dimension,
                        analysed_dimension, significant_variables,
                        num_analysed_variables, self._data_frame,
                        self._measure_columns, self._base_dir, None,
                        target_chisquare_result)
                    # self.narratives['cards'].append(card)
                    self._result_setter.add_a_score_chi_card(
                        json.loads(
                            CommonUtils.convert_python_object_to_json(
                                card.get_dimension_card1())))
                else:
                    target_dimension_card = ChiSquareAnalysis(
                        self._dataframe_context, self._dataframe_helper,
                        chisquare_result, target_dimension,
                        analysed_dimension, significant_variables,
                        num_analysed_variables, self._data_frame,
                        self._measure_columns, self._base_dir, None,
                        target_chisquare_result)
                    self.narratives['cards'].append(target_dimension_card)
                    self._chiSquareNode.add_a_node(
                        target_dimension_card.get_dimension_node())
        self._story_narrative.add_a_node(self._chiSquareNode)
        self._result_setter.set_chisquare_node(self._chiSquareNode)
class ChiSquareAnalysis(object): def __init__(self, df_context, df_helper, chisquare_result, target_dimension, analysed_dimension, significant_variables, num_analysed_variables, data_frame, measure_columns, base_dir, appid=None, target_chisquare_result=None): self._blockSplitter = "|~NEWBLOCK~|" self._highlightFlag = "|~HIGHLIGHT~|" self._dimensionNode = NarrativesTree() self._dimensionNode.set_name(target_dimension) self._data_frame = data_frame self._dataframe_context = df_context self._pandas_flag = df_context._pandas_flag self._dataframe_helper = df_helper self._chisquare_result = chisquare_result self._target_dimension = target_dimension self._analysed_dimension = analysed_dimension self._significant_variables = significant_variables self._target_chisquare_result = target_chisquare_result self._measure_columns = self._dataframe_helper.get_numeric_columns() self._chiSquareLevelLimit = GLOBALSETTINGS.CHISQUARELEVELLIMIT self._num_analysed_variables = num_analysed_variables self._chiSquareTable = chisquare_result.get_contingency_table() significant_variables = list( set(significant_variables) - {analysed_dimension}) if len(significant_variables) <= 20: if len(significant_variables) <= 3: self._second_level_dimensions = list(significant_variables) else: self._second_level_dimensions = list(significant_variables)[:3] else: self._second_level_dimensions = list(significant_variables)[:5] print(self._second_level_dimensions) self._appid = appid self._card1 = NormalCard() self._targetCards = [] self._base_dir = base_dir self._binTargetCol = False self._binAnalyzedCol = False print("--------Chi-Square Narratives for ", analysed_dimension, "---------") if self._dataframe_context.get_custom_analysis_details() != None: binnedColObj = [ x["colName"] for x in self._dataframe_context.get_custom_analysis_details() ] print("analysed_dimension : ", self._analysed_dimension) if binnedColObj != None and self._target_dimension in binnedColObj: self._binTargetCol = True if 
binnedColObj != None and ( self._analysed_dimension in binnedColObj or self._analysed_dimension in self._measure_columns): self._binAnalyzedCol = True if self._appid == None: self._generate_narratives() self._dimensionNode.add_cards([self._card1] + self._targetCards) self._dimensionNode.set_name("{}".format(analysed_dimension)) elif self._appid == "2": self._generate_narratives() self._dimensionNode.add_cards([self._card1]) self._dimensionNode.set_name("{}".format(analysed_dimension)) elif self._appid == "1": self._generate_narratives() self._dimensionNode.add_cards([self._card1]) self._dimensionNode.set_name("{}".format(analysed_dimension)) def get_dimension_node(self): return json.loads( CommonUtils.convert_python_object_to_json(self._dimensionNode)) def get_dimension_card1(self): return self._card1 def _generate_narratives(self): chisquare_result = self._chisquare_result target_dimension = self._target_dimension analysed_dimension = self._analysed_dimension significant_variables = self._significant_variables num_analysed_variables = self._num_analysed_variables table = self._chiSquareTable total = self._chiSquareTable.get_total() levels = self._chiSquareTable.get_column_two_levels() level_counts = self._chiSquareTable.get_column_total() levels_count_sum = sum(level_counts) levels_percentages = [ old_div(i * 100.0, levels_count_sum) for i in level_counts ] sorted_levels = sorted(zip(level_counts, levels), reverse=True) level_differences = [0.0] + [ sorted_levels[i][0] - sorted_levels[i + 1][0] for i in range(len(sorted_levels) - 1) ] top_dims = [ j for i, j in sorted_levels[:level_differences.index(max(level_differences))] ] top_dims_contribution = sum([ i for i, j in sorted_levels[:level_differences.index(max(level_differences))] ]) bottom_dim = sorted_levels[-1][1] bottom_dim_contribution = sorted_levels[-1][0] bottom_dims = [ y for x, y in sorted_levels if x == bottom_dim_contribution ] target_levels = self._chiSquareTable.get_column_one_levels() target_counts 
= self._chiSquareTable.get_row_total() sorted_target_levels = sorted(zip(target_counts, target_levels), reverse=True) top_target_count, top_target = sorted_target_levels[0] second_target_count, second_target = sorted_target_levels[1] top_target_contributions = [ table.get_value(top_target, i) for i in levels ] sum_top_target = sum(top_target_contributions) sorted_levels = sorted(zip(top_target_contributions, levels), reverse=True) level_differences = [0.0] + [ sorted_levels[i][0] - sorted_levels[i + 1][0] for i in range(len(sorted_levels) - 1) ] top_target_top_dims = [ j for i, j in sorted_levels[:level_differences.index(max(level_differences))] ] top_target_top_dims_contribution = sum([ i for i, j in sorted_levels[:level_differences.index(max(level_differences))] ]) top_target_bottom_dim = sorted_levels[-1][1] top_target_bottom_dim_contribution = sorted_levels[-1][0] top_target_percentages = [ old_div(i * 100.0, sum_top_target) for i in top_target_contributions ] best_top_target_index = top_target_contributions.index( max(top_target_contributions)) worst_top_target_index = top_target_contributions.index( min(top_target_contributions)) top_target_differences = [ x - y for x, y in zip(levels_percentages, top_target_percentages) ] if len(top_target_differences) > 6: tops = 2 bottoms = -2 elif len(top_target_differences) > 4: tops = 2 bottoms = -1 else: tops = 1 bottoms = -1 sorted_ = sorted(enumerate(top_target_differences), key=lambda x: x[1], reverse=True) best_top_difference_indices = [x for x, y in sorted_[:tops]] worst_top_difference_indices = [x for x, y in sorted_[bottoms:]] top_target_shares = [ old_div(x * 100.0, y) for x, y in zip(top_target_contributions, level_counts) ] max_top_target_shares = max(top_target_shares) best_top_target_share_index = [ idx for idx, val in enumerate(top_target_shares) if val == max_top_target_shares ] level_counts_threshold = old_div( sum(level_counts) * 0.05, len(level_counts)) min_top_target_shares = min([ x for x, y in 
zip(top_target_shares, level_counts) if y >= level_counts_threshold ]) if max_top_target_shares == min_top_target_shares: worst_top_target_share_index = [] else: worst_top_target_share_index = [ idx for idx, val in enumerate(top_target_shares) if val == min_top_target_shares ] overall_top_percentage = old_div(sum_top_target * 100.0, total) second_target_contributions = [ table.get_value(second_target, i) for i in levels ] sum_second_target = sum(second_target_contributions) sorted_levels = sorted(zip(second_target_contributions, levels), reverse=True) level_differences = [0.0] + [ sorted_levels[i][0] - sorted_levels[i + 1][0] for i in range(len(sorted_levels) - 1) ] second_target_top_dims = [ j for i, j in sorted_levels[:level_differences.index(max(level_differences))] ] second_target_top_dims_contribution = sum([ i for i, j in sorted_levels[:level_differences.index(max(level_differences))] ]) second_target_bottom_dim = sorted_levels[-1][1] second_target_bottom_dim_contribution = sorted_levels[-1][0] second_target_percentages = [ old_div(i * 100.0, sum_second_target) for i in second_target_contributions ] best_second_target_index = second_target_contributions.index( max(second_target_contributions)) worst_second_target_index = second_target_contributions.index( min(second_target_contributions)) second_target_differences = [ x - y for x, y in zip(levels_percentages, second_target_percentages) ] if len(second_target_differences) > 6: tops = 2 bottoms = -2 elif len(second_target_differences) > 4: tops = 2 bottoms = -1 else: tops = 1 bottoms = -1 sorted_ = sorted(enumerate(second_target_differences), key=lambda x: x[1], reverse=True) best_second_difference_indices = [x for x, y in sorted_[:tops]] worst_second_difference_indices = [x for x, y in sorted_[bottoms:]] second_target_shares = [ old_div(x * 100.0, y) for x, y in zip(second_target_contributions, level_counts) ] max_second_target_shares = max(second_target_shares) best_second_target_share_index = [ idx for idx, 
val in enumerate(second_target_shares) if val == max_second_target_shares ] level_counts_threshold = old_div( sum(level_counts) * 0.05, len(level_counts)) if min(second_target_shares) == 0: min_second_target_shares = min([ x for x, y in zip(second_target_shares, level_counts) if x != 0 ]) else: min_second_target_shares = min([ x for x, y in zip(second_target_shares, level_counts) if y >= level_counts_threshold ]) # worst_second_target_share_index = second_target_shares.index(min_second_target_shares) if max_second_target_shares == min_second_target_shares: worst_second_target_share_index = [] else: worst_second_target_share_index = [ idx for idx, val in enumerate(second_target_shares) if val == min_second_target_shares ] overall_second_percentage = old_div(sum_second_target * 100.0, total) targetCardDataDict = {} targetCardDataDict['target'] = target_dimension targetCardDataDict['colname'] = analysed_dimension targetCardDataDict['num_significant'] = len(significant_variables) targetCardDataDict['plural_colname'] = NarrativesUtils.pluralize( analysed_dimension) targetCardDataDict["blockSplitter"] = self._blockSplitter targetCardDataDict["binTargetCol"] = self._binTargetCol targetCardDataDict["binAnalyzedCol"] = self._binAnalyzedCol targetCardDataDict['highlightFlag'] = self._highlightFlag targetCardDataDict['levels'] = levels data_dict = {} data_dict[ 'best_second_difference'] = best_second_difference_indices ##these changed data_dict['worst_second_difference'] = worst_second_difference_indices data_dict['best_top_difference'] = best_top_difference_indices data_dict['worst_top_difference'] = worst_top_difference_indices data_dict['levels_percentages'] = levels_percentages data_dict['top_target_percentages'] = top_target_percentages data_dict['second_target_percentages'] = second_target_percentages data_dict['levels'] = levels data_dict['best_top_share'] = best_top_target_share_index data_dict['worst_top_share'] = worst_top_target_share_index 
data_dict['best_second_share'] = best_second_target_share_index data_dict['worst_second_share'] = worst_second_target_share_index data_dict['top_target_shares'] = top_target_shares data_dict['second_target_shares'] = second_target_shares data_dict['overall_second'] = overall_second_percentage data_dict['overall_top'] = overall_top_percentage data_dict['num_significant'] = len(significant_variables) data_dict['colname'] = analysed_dimension data_dict['plural_colname'] = NarrativesUtils.pluralize( analysed_dimension) data_dict['target'] = target_dimension data_dict['top_levels'] = top_dims data_dict['top_levels_percent'] = round( old_div(top_dims_contribution * 100.0, total), 1) data_dict['bottom_level'] = bottom_dim data_dict['bottom_levels'] = bottom_dims data_dict['bottom_level_percent'] = round( old_div(bottom_dim_contribution * 100, sum(level_counts)), 2) data_dict['second_target'] = second_target data_dict['second_target_top_dims'] = second_target_top_dims data_dict['second_target_top_dims_contribution'] = old_div( second_target_top_dims_contribution * 100.0, sum(second_target_contributions)) data_dict['second_target_bottom_dim'] = second_target_bottom_dim data_dict[ 'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution data_dict['best_second_target'] = levels[best_second_target_index] data_dict['best_second_target_count'] = second_target_contributions[ best_second_target_index] data_dict['best_second_target_percent'] = round( old_div( second_target_contributions[best_second_target_index] * 100.0, sum(second_target_contributions)), 2) data_dict['worst_second_target'] = levels[worst_second_target_index] data_dict['worst_second_target_percent'] = round( old_div( second_target_contributions[worst_second_target_index] * 100.0, sum(second_target_contributions)), 2) data_dict['top_target'] = top_target data_dict['top_target_top_dims'] = top_target_top_dims data_dict['top_target_top_dims_contribution'] = old_div( 
top_target_top_dims_contribution * 100.0, sum(top_target_contributions)) data_dict['top_target_bottom_dim'] = top_target_bottom_dim data_dict[ 'top_target_bottom_dim_contribution'] = top_target_bottom_dim_contribution data_dict['best_top_target'] = levels[best_top_target_index] data_dict['best_top_target_count'] = top_target_contributions[ best_top_target_index] data_dict['best_top_target_percent'] = round( old_div(top_target_contributions[best_top_target_index] * 100.0, sum(top_target_contributions)), 2) data_dict['worst_top_target'] = levels[worst_top_target_index] data_dict['worst_top_target_percent'] = round( old_div(top_target_contributions[worst_top_target_index] * 100.0, sum(top_target_contributions)), 2) data_dict["blockSplitter"] = self._blockSplitter data_dict["binTargetCol"] = self._binTargetCol data_dict["binAnalyzedCol"] = self._binAnalyzedCol data_dict['highlightFlag'] = self._highlightFlag # print "_"*60 # print "DATA DICT - ", data_dict # print "_"*60 ############### # CARD1 # ############### print("self._binTargetCol & self._binAnalyzedCol : ", self._binTargetCol, self._binAnalyzedCol) if len(data_dict['worst_second_share']) == 0: output = NarrativesUtils.block_splitter( NarrativesUtils.get_template_output( self._base_dir, 'card1_binned_target_worst_second.html', data_dict), self._blockSplitter, highlightFlag=self._highlightFlag) else: if (self._binTargetCol == True & self._binAnalyzedCol == False): print("Only Target Column is Binned, : ", self._binTargetCol) output = NarrativesUtils.block_splitter( NarrativesUtils.get_template_output( self._base_dir, 'card1_binned_target.html', data_dict), self._blockSplitter, highlightFlag=self._highlightFlag) elif (self._binTargetCol == True & self._binAnalyzedCol == True): print("Target Column and IV is Binned : ", self._binTargetCol, self._binAnalyzedCol) output = NarrativesUtils.block_splitter( NarrativesUtils.get_template_output( self._base_dir, 'card1_binned_target_and_IV.html', data_dict), 
self._blockSplitter, highlightFlag=self._highlightFlag) else: output = NarrativesUtils.block_splitter( NarrativesUtils.get_template_output( self._base_dir, 'card1.html', data_dict), self._blockSplitter, highlightFlag=self._highlightFlag) targetDimCard1Data = [] targetDimcard1Heading = '<h3>Impact of ' + self._analysed_dimension + ' on ' + self._target_dimension + "</h3>" toggledata = ToggleData() targetDimTable1Data = self.generate_card1_table1() targetDimCard1Table1 = TableData() targetDimCard1Table1.set_table_type("heatMap") targetDimCard1Table1.set_table_data(targetDimTable1Data) toggledata.set_toggleon_data({ "data": { "tableData": targetDimTable1Data, "tableType": "heatMap" }, "dataType": "table" }) targetDimTable2Data = self.generate_card1_table2() targetDimCard1Table2 = TableData() targetDimCard1Table2.set_table_type("normal") table2Data = targetDimTable2Data["data1"] table2Data = [ innerList[1:] for innerList in table2Data if innerList[0].strip() != "" ] targetDimCard1Table2.set_table_data(table2Data) toggledata.set_toggleoff_data({ "data": { "tableData": table2Data, "tableType": "heatMap" }, "dataType": "table" }) targetDimCard1Data.append(HtmlData(data=targetDimcard1Heading)) targetDimCard1Data.append(toggledata) targetDimCard1Data += output self._card1.set_card_data(targetDimCard1Data) self._card1.set_card_name("{}: Relationship with {}".format( self._analysed_dimension, self._target_dimension)) ############### # CARD2 # ############### if self._appid == None: key_factors = '' num_key_factors = len(self._second_level_dimensions) if len(self._second_level_dimensions) == 5: key_factors = ', '.join( self._second_level_dimensions[:4] ) + ' and ' + self._second_level_dimensions[4] elif len(self._second_level_dimensions) == 4: key_factors = ', '.join( self._second_level_dimensions[:3] ) + ' and ' + self._second_level_dimensions[3] elif len(self._second_level_dimensions) == 3: key_factors = ', '.join( self._second_level_dimensions[:2] ) + ' and ' + 
self._second_level_dimensions[2] elif len(self._second_level_dimensions) == 2: key_factors = ' and '.join(self._second_level_dimensions) elif len(self._second_level_dimensions) == 1: key_factors = self._second_level_dimensions[0] targetCardDataDict['num_key_factors'] = num_key_factors targetCardDataDict['key_factors'] = key_factors dict_for_test = {} for tupleObj in sorted_target_levels[:self._chiSquareLevelLimit]: targetLevel = tupleObj[1] targetCardDataDict['random_card2'] = random.randint(1, 100) targetCardDataDict['random_card4'] = random.randint(1, 100) second_target_contributions = [ table.get_value(targetLevel, i) for i in levels ] sum_second_target = sum(second_target_contributions) sorted_levels = sorted(zip(second_target_contributions, levels), reverse=True) level_differences = [0.0] + [ sorted_levels[i][0] - sorted_levels[i + 1][0] for i in range(len(sorted_levels) - 1) ] level_diff_index = level_differences.index( max(level_differences)) if level_differences.index( max(level_differences)) > 0 else len( level_differences ) ##added for pipeline keyerror issue second_target_top_dims = [ j for i, j in sorted_levels[:level_diff_index] ] second_target_top_dims_contribution = sum([ i for i, j in sorted_levels[:level_differences. 
index(max(level_differences))] ]) second_target_bottom_dim = sorted_levels[-1][1] second_target_bottom_dim_contribution = sorted_levels[-1][0] second_target_percentages = [ old_div(i * 100.0, sum_second_target) for i in second_target_contributions ] best_second_target_index = second_target_contributions.index( max(second_target_contributions)) worst_second_target_index = second_target_contributions.index( min(second_target_contributions)) second_target_differences = [ x - y for x, y in zip(levels_percentages, second_target_percentages) ] if len(second_target_differences) > 6: tops = 2 bottoms = -2 elif len(second_target_differences) > 4: tops = 2 bottoms = -1 else: tops = 1 bottoms = -1 sorted_ = sorted(enumerate(second_target_differences), key=lambda x: x[1], reverse=True) best_second_difference_indices = [x for x, y in sorted_[:tops]] worst_second_difference_indices = [ x for x, y in sorted_[bottoms:] ] second_target_shares = [ old_div(x * 100.0, y) for x, y in zip(second_target_contributions, level_counts) ] max_second_target_shares = max(second_target_shares) best_second_target_share_index = [ idx for idx, val in enumerate(second_target_shares) if val == max_second_target_shares ] level_counts_threshold = old_div( sum(level_counts) * 0.05, len(level_counts)) min_second_target_shares = min([ x for x, y in zip(second_target_shares, level_counts) if y >= level_counts_threshold ]) worst_second_target_share_index = [ idx for idx, val in enumerate(second_target_shares) if val == min_second_target_shares ] overall_second_percentage = old_div(sum_second_target * 100.0, total) # DataFrame for contribution calculation if self._pandas_flag: df_second_target = self._data_frame[( self._data_frame[self._target_dimension] == targetLevel ) & (self._data_frame[self._analysed_dimension] == second_target_top_dims[0])][ self._second_level_dimensions] df_second_dim = self._data_frame[( self._data_frame[self._analysed_dimension] == second_target_top_dims[0] 
)][self._second_level_dimensions] else: df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\ filter(col(self._analysed_dimension)==second_target_top_dims[0]).\ select(self._second_level_dimensions).toPandas() df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\ select(self._second_level_dimensions).toPandas() # if self._chisquare_result.get_splits(): # splits = self._chisquare_result.get_splits() # idx = self._chiSquareTable.get_bin_names(splits).index(second_target_top_dims[0]) # idx1 = self._chiSquareTable.get_bin_names(splits).index(top_target_top_dims[0]) # splits[len(splits)-1] = splits[len(splits)-1]+1 # df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\ # filter(col(self._analysed_dimension)>=splits[idx]).filter(col(self._analysed_dimension)<splits[idx+1]).\ # select(self._second_level_dimensions).toPandas() # df_second_dim = self._data_frame.filter(col(self._analysed_dimension)>=splits[idx]).\ # filter(col(self._analysed_dimension)<splits[idx+1]).\ # select(self._second_level_dimensions).toPandas() # else: # df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\ # filter(col(self._analysed_dimension)==second_target_top_dims[0]).\ # select(self._second_level_dimensions).toPandas() # df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\ # select(self._second_level_dimensions).toPandas() # print self._data_frame.select('Sales').show() distribution_second = [] d_l = [] for d in self._second_level_dimensions: grouped = df_second_target.groupby(d).agg({d: 'count'}) contributions = df_second_dim.groupby(d).agg({d: 'count'}) contribution_index = list(contributions.index) contributions_val = contributions[d].tolist() contributions_list = dict( list(zip(contribution_index, contributions_val))) index_list = list(grouped.index) grouped_list = grouped[d].tolist() 
contributions_percent_list = [ round(old_div(y * 100.0, contributions_list[x]), 2) for x, y in zip(index_list, grouped_list) ] sum_ = grouped[d].sum() diffs = [0] + [ grouped_list[i] - grouped_list[i + 1] for i in range(len(grouped_list) - 1) ] max_diff = diffs.index(max(diffs)) grouped_dict = dict(list(zip(index_list, grouped_list))) for val in contribution_index: if val not in list(grouped_dict.keys()): grouped_dict[val] = 0 else: pass index_list = [] grouped_list = [] contributions_val = [] for key in list(grouped_dict.keys()): index_list.append(str(key)) grouped_list.append(grouped_dict[key]) contributions_val.append(contributions_list[key]) ''' print "="*70 print "GROUPED - ", grouped print "INDEX LIST - ", index_list print "GROUPED LIST - ", grouped_list print "GROUPED DICT - ", grouped_dict print "CONTRIBUTIONS - ", contributions print "CONTRIBUTION INDEX - ", contribution_index print "CONTRIBUTIONS VAL - ", contributions_val print "CONTRIBUTIONS LIST - ", contributions_list print "CONTRIBUTIONS PERCENT LIST - ", contributions_percent_list print "SUM - ", sum_ print "DIFFS - ", diffs print "MAX DIFF - ", max_diff print "="*70 ''' informative_dict = { "levels": index_list, "positive_class_contribution": grouped_list, "positive_plus_others": contributions_val } informative_df = pd.DataFrame(informative_dict) informative_df["percentage_horizontal"] = old_div( informative_df["positive_class_contribution"] * 100, informative_df["positive_plus_others"]) informative_df["percentage_vertical"] = old_div( informative_df["positive_class_contribution"] * 100, sum_) informative_df.sort_values(["percentage_vertical"], inplace=True, ascending=False) informative_df = informative_df.reset_index(drop=True) percentage_vertical_sorted = list( informative_df["percentage_vertical"]) percentage_horizontal_sorted = list( informative_df["percentage_horizontal"]) levels_sorted = list(informative_df["levels"]) differences_list = [] for i in range(1, len(percentage_vertical_sorted)): 
difference = percentage_vertical_sorted[ i - 1] - percentage_vertical_sorted[i] differences_list.append(round(difference, 2)) ''' print "-"*70 print "DIFFERENCES LIST - ", differences_list print "-"*70 ''' index_txt = '' if differences_list: if differences_list[0] >= 30: print("showing 1st case") index_txt = levels_sorted[0] max_diff_equivalent = 1 else: if len(differences_list) >= 2: if differences_list[1] >= 10: print("showing 1st and 2nd case") index_txt = levels_sorted[0] + '(' + str( round(percentage_vertical_sorted[0], 1) ) + '%)' + ' and ' + levels_sorted[ 1] + '(' + str( round( percentage_vertical_sorted[1], 1)) + '%)' max_diff_equivalent = 2 else: print("showing 3rd case") index_txt = 'including ' + levels_sorted[ 0] + '(' + str( round( percentage_vertical_sorted[0], 1) ) + '%)' + ' and ' + levels_sorted[ 1] + '(' + str( round( percentage_vertical_sorted[ 1], 1)) + '%)' max_diff_equivalent = 3 else: print("showing 3rd case") index_txt = 'including ' + levels_sorted[ 0] + '(' + str( round(percentage_vertical_sorted[0], 1) ) + '%)' + ' and ' + levels_sorted[ 1] + '(' + str( round( percentage_vertical_sorted[1], 1)) + '%)' max_diff_equivalent = 3 else: max_diff_equivalent = 0 ''' print "-"*70 print informative_df.head(25) print "-"*70 ''' distribution_second.append({ 'contributions': [ round(i, 2) for i in percentage_vertical_sorted[:max_diff_equivalent] ], 'levels': levels_sorted[:max_diff_equivalent], 'variation': random.randint(1, 100), 'index_txt': index_txt, 'd': d, 'contributions_percent': percentage_horizontal_sorted }) ''' print "DISTRIBUTION SECOND - ", distribution_second print "<>"*50 ''' targetCardDataDict['distribution_second'] = distribution_second targetCardDataDict['second_target'] = targetLevel targetCardDataDict[ 'second_target_top_dims'] = second_target_top_dims targetCardDataDict[ 'second_target_top_dims_contribution'] = old_div( second_target_top_dims_contribution * 100.0, sum(second_target_contributions)) targetCardDataDict[ 
'second_target_bottom_dim'] = second_target_bottom_dim targetCardDataDict[ 'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution targetCardDataDict['best_second_target'] = levels[ best_second_target_index] targetCardDataDict[ 'best_second_target_count'] = second_target_contributions[ best_second_target_index] targetCardDataDict['best_second_target_percent'] = round( old_div( second_target_contributions[best_second_target_index] * 100.0, sum(second_target_contributions)), 2) targetCardDataDict['worst_second_target'] = levels[ worst_second_target_index] targetCardDataDict['worst_second_target_percent'] = round( old_div( second_target_contributions[worst_second_target_index] * 100.0, sum(second_target_contributions)), 2) card2Data = [] targetLevelContributions = [ table.get_value(targetLevel, i) for i in levels ] impact_target_thershold = old_div( sum(targetLevelContributions) * 0.02, len(targetLevelContributions)) card2Heading = '<h3>Key Drivers of ' + self._target_dimension + ' (' + targetLevel + ')' + "</h3>" chart, bubble = self.generate_distribution_card_chart( targetLevel, targetLevelContributions, levels, level_counts, total, impact_target_thershold) card2ChartData = NormalChartData(data=chart["data"]) "rounding the chartdata values for key drivers tab inside table percentage(table data)" for d in card2ChartData.get_data(): d['percentage'] = round(d['percentage'], 2) d_l.append(d) card2ChartJson = ChartJson() card2ChartJson.set_data(d_l) card2ChartJson.set_chart_type("combination") card2ChartJson.set_types({ "total": "bar", "percentage": "line" }) card2ChartJson.set_legend({ "total": "# of " + targetLevel, "percentage": "% of " + targetLevel }) card2ChartJson.set_axes({ "x": "key", "y": "total", "y2": "percentage" }) card2ChartJson.set_label_text({ "x": " ", "y": "Count", "y2": "Percentage" }) print("self._binTargetCol & self._binAnalyzedCol : ", self._binTargetCol, self._binAnalyzedCol) if (self._binTargetCol == True & 
self._binAnalyzedCol == False): print("Only Target Column is Binned") output2 = NarrativesUtils.block_splitter( NarrativesUtils.get_template_output( self._base_dir, 'card2_binned_target.html', targetCardDataDict), self._blockSplitter) elif (self._binTargetCol == True & self._binAnalyzedCol == True): print("Target Column and IV is Binned") output2 = NarrativesUtils.block_splitter( NarrativesUtils.get_template_output( self._base_dir, 'card2_binned_target_and_IV.html', targetCardDataDict), self._blockSplitter) else: print("In Else, self._binTargetCol should be False : ", self._binTargetCol) output2 = NarrativesUtils.block_splitter( NarrativesUtils.get_template_output( self._base_dir, 'card2.html', targetCardDataDict), self._blockSplitter) card2Data.append(HtmlData(data=card2Heading)) statistical_info_array = [ ("Test Type", "Chi-Square"), ("Chi-Square statistic", str(round(self._chisquare_result.get_stat(), 3))), ("P-Value", str(round(self._chisquare_result.get_pvalue(), 3))), ("Inference", "Chi-squared analysis shows a significant association between {} (target) and {}." 
.format(self._target_dimension, self._analysed_dimension)) ] statistical_info_array = NarrativesUtils.statistical_info_array_formatter( statistical_info_array) card2Data.append( C3ChartData(data=card2ChartJson, info=statistical_info_array)) card2Data += output2 card2BubbleData = "<div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div><div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div>".format( bubble[0]["value"], bubble[0]["text"], bubble[1]["value"], bubble[1]["text"]) card2Data.append(HtmlData(data=card2BubbleData)) targetCard = NormalCard() targetCard.set_card_data(card2Data) targetCard.set_card_name("{} : Distribution of {}".format( self._analysed_dimension, targetLevel)) self._targetCards.append(targetCard) dict_for_test[targetLevel] = targetCardDataDict out = {'data_dict': data_dict, 'target_dict': dict_for_test} return out # def generate_card2_narratives(self): def generate_distribution_card_chart(self, __target, __target_contributions, levels, levels_count, total, thershold): chart = {} label = {'total': '# of ' + __target, 'percentage': '% of ' + __target} label_text = { 'x': self._analysed_dimension, 'y': '# of ' + __target, 'y2': '% of ' + __target, } data = {} data['total'] = dict(list(zip(levels, __target_contributions))) __target_percentages = [ old_div(x * 100.0, y) for x, y in zip(__target_contributions, levels_count) ] data['percentage'] = dict(list(zip(levels, __target_percentages))) chartData = [] for val in zip(levels, __target_contributions, __target_percentages): chartData.append({ "key": val[0], "total": val[1], "percentage": val[2] }) # c3_data = [levels,__target_contributions,__target_percentages] chart_data = {'label': label, 'data': chartData} bubble_data1 = {} bubble_data2 = {} bubble_data1['value'] = str( round( old_div( max(__target_contributions) * 100.0, sum(__target_contributions)), 1)) + '%' m_index = 
__target_contributions.index(max(__target_contributions)) bubble_data1[ 'text'] = 'Overall ' + __target + ' comes from ' + levels[m_index] intial = -1 for k, v, i in zip(__target_contributions, __target_percentages, list(range(len(__target_contributions)))): if k > thershold: if intial < v: intial = v bubble_data2['value'] = str(round(intial)) + '%' #m_index = __target_percentages.index(i) bubble_data2['text'] = levels[ i] + ' has the highest rate of ' + __target bubble_data = [bubble_data1, bubble_data2] return chart_data, bubble_data def generate_card1_table1(self): table_percent_by_column = self._chiSquareTable.table_percent_by_column column_two_values = self._chiSquareTable.column_two_values header_row = [self._analysed_dimension ] + self._chiSquareTable.get_column_one_levels() all_columns = [column_two_values] + table_percent_by_column other_rows = list(zip(*all_columns)) other_rows = [list(tup) for tup in other_rows] table_data = [header_row] + other_rows return table_data def generate_card1_table2(self): table = self._chiSquareTable.table table_percent = self._chiSquareTable.table_percent table_percent_by_row = self._chiSquareTable.table_percent_by_row table_percent_by_column = self._chiSquareTable.table_percent_by_column target_levels = self._chiSquareTable.get_column_one_levels() dim_levels = self._chiSquareTable.get_column_two_levels() header1 = [self._analysed_dimension] + target_levels + ['Total'] header = ['State', 'Active', 'Churn', 'Total'] #TODO remove data = [] data1 = [['Tag'] + header1] for idx, lvl in enumerate(dim_levels): first_row = ['Tag'] + header col_2_vals = list(zip(*table))[idx] data2 = ['bold'] + [lvl] + list(col_2_vals) + [sum(col_2_vals)] dict_ = dict(list(zip(first_row, data2))) data.append(dict_) data1.append(data2) col_2_vals = list(zip(*table_percent_by_column))[idx] data2 = [''] + ['As % within ' + self._analysed_dimension ] + list(col_2_vals) + [100.0] dict_ = dict(list(zip(first_row, data2))) data.append(dict_) 
data1.append(data2) col_2_vals = list(zip(*table_percent_by_row))[idx] col_2_vals1 = list(zip(*table_percent))[idx] data2 = [''] + [ 'As % within ' + self._target_dimension ] + list(col_2_vals) + [round(sum(col_2_vals1), 2)] dict_ = dict(list(zip(first_row, data2))) data.append(dict_) data1.append(data2) # col_2_vals = zip(*table_percent)[idx] data2 = [''] + ['As % of Total'] + list(col_2_vals1) + [ round(sum(col_2_vals1), 2) ] dict_ = dict(list(zip(first_row, data2))) data.append(dict_) data1.append(data2) out = { 'header': header, 'header1': header1, 'data': data, 'label': self._analysed_dimension, 'data1': data1 } return out
    def Train(self):
        """Train a "Neural Network (PyTorch)" regression model and publish results.

        End-to-end pipeline: reads configuration and column roles from the
        dataframe context/helper, one-hot encodes categoricals, builds a torch
        network from the stored hyper-parameters, runs the epoch/batch training
        loop, computes regression metrics (r2 / MSE / MAE / RMSE / explained
        variance), builds MAPE and quantile summaries, and pushes
        model-summary / model-management cards and nodes to
        ``self._result_setter``.

        Side effects: saves the trained model to disk, attempts PMML export
        (best-effort), and emits progress messages throughout.
        """
        st_global = time.time()
        # Progress message: start of this script's "initialization" stage.
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "initialization",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")
        appType = self._dataframe_context.get_app_type()
        algosToRun = self._dataframe_context.get_algorithms_to_run()
        # Settings object for this algorithm, matched by slug.
        # NOTE(review): raises IndexError if the slug is not configured.
        algoSetting = [
            x for x in algosToRun if x.get_algorithm_slug() == self._slug
        ][0]
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()
        # Exclude the unique-identifier column from the categorical features.
        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})
        # Date columns are never treated as categorical features.
        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns) - set(allDateCols))
        print("CATEGORICAL COLS - ", categorical_columns)
        result_column = self._dataframe_context.get_result_column()
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        numerical_columns = [
            x for x in numerical_columns if x != result_column
        ]
        model_path = self._dataframe_context.get_model_path()
        # Strip a leading "file://" scheme (7 characters) to get a plain path.
        if model_path.startswith("file"):
            model_path = model_path[7:]
        validationDict = self._dataframe_context.get_validation_dict()
        print("model_path", model_path)
        pipeline_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/pipeline/"
        model_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/model"
        pmml_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/modelPmml"
        df = self._data_frame
        if self._mlEnv == "spark":
            # Spark training is not implemented for this algorithm.
            pass
        elif self._mlEnv == "sklearn":
            model_filepath = model_path + "/" + self._slug + "/model.pkl"
            x_train, x_test, y_train, y_test = self._dataframe_helper.get_train_test_data(
            )
            # One-hot encode categoricals; align test columns with train so
            # both frames share the same dummy-column layout.
            x_train = MLUtils.create_dummy_columns(
                x_train,
                [x for x in categorical_columns if x != result_column])
            x_test = MLUtils.create_dummy_columns(
                x_test,
                [x for x in categorical_columns if x != result_column])
            x_test = MLUtils.fill_missing_columns(x_test, x_train.columns,
                                                  result_column)
            print("=" * 150)
            print("X-Train Shape - ", x_train.shape)
            print("Y-Train Shape - ", y_train.shape)
            print("X-Test Shape - ", x_test.shape)
            print("Y-Test Shape - ", y_test.shape)
            print("~" * 50)
            print("X-Train dtype - ", type(x_train))
            print("Y-Train dtype - ", type(y_train))
            print("X-Test dtype - ", type(x_test))
            print("Y-Test dtype - ", type(y_test))
            print("~" * 50)
            # Progress message: entering the "training" stage.
            CommonUtils.create_update_and_save_progress_message(
                self._dataframe_context,
                self._scriptWeightDict,
                self._scriptStages,
                self._slug,
                "training",
                "info",
                display=True,
                emptyBin=False,
                customMsg=None,
                weightKey="total")
            st = time.time()
            self._result_setter.set_hyper_parameter_results(self._slug, None)
            # NOTE(review): the value fetched from algoSetting is immediately
            # overwritten by the hard-coded dict below — dead assignment.
            evaluationMetricDict = algoSetting.get_evaluvation_metric(
                Type="REGRESSION")
            evaluationMetricDict = {
                "name": GLOBALSETTINGS.REGRESSION_MODEL_EVALUATION_METRIC
            }
            evaluationMetricDict[
                "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[
                    evaluationMetricDict["name"]]
            # Convert train/test frames to torch tensors.
            x_train_tensored, y_train_tensored, x_test_tensored, y_test_tensored = PYTORCHUTILS.get_tensored_data(
                x_train, y_train, x_test, y_test)
            trainset = torch_data_utils.TensorDataset(x_train_tensored,
                                                      y_train_tensored)
            testset = torch_data_utils.TensorDataset(x_test_tensored,
                                                     y_test_tensored)
            nnptr_params = algoSetting.get_nnptr_params_dict()[0]
            layers_for_network = PYTORCHUTILS.get_layers_for_network_module(
                nnptr_params,
                task_type="REGRESSION",
                first_layer_units=x_train.shape[1])

            # Use GPU if available
            device = torch.device(
                "cuda:0" if torch.cuda.is_available() else "cpu")
            network = PyTorchNetwork(layers_for_network).to(device)
            # NOTE(review): eval() disables dropout/batch-norm training
            # behaviour, yet the training loop below still runs — confirm this
            # is intentional (network.train() would be the usual call here).
            network.eval()
            other_params_dict = PYTORCHUTILS.get_other_pytorch_params(
                nnptr_params,
                task_type="REGRESSION",
                network_params=network.parameters())
            print("~" * 50)
            print("NNPTR-PARAMS - ", nnptr_params)
            print("~" * 50)
            print("OTHER-PARAMS-DICT - ", other_params_dict)
            print("~" * 50)
            print("NEURAL-NETWORK - ", network)
            print("~" * 50)
            criterion = other_params_dict["loss_criterion"]
            n_epochs = other_params_dict["number_of_epochs"]
            batch_size = other_params_dict["batch_size"]
            optimizer = other_params_dict["optimizer"]
            dataloader_params = {
                "batch_size": batch_size,
                "shuffle": True
                # "num_workers":
            }
            train_loader = torch_data_utils.DataLoader(trainset,
                                                       **dataloader_params)
            # NOTE(review): test_loader is created but never used below —
            # predictions are made directly on x_test_tensored.
            test_loader = torch_data_utils.DataLoader(testset,
                                                      **dataloader_params)
            '''
            Training the network;
            Batchnormalization(num_features) should be equal to units_op for that layer in training config;
            else --> RuntimeError('running_mean should contain 100 elements not 200',)
            '''
            for epoch in range(n_epochs):
                batchwise_losses = []
                average_loss = 0.0
                for i, (inputs, labels) in enumerate(train_loader):
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    # Zero the parameter gradients
                    optimizer.zero_grad()

                    # Forward + backward + optimize
                    outputs = network(inputs.float())
                    loss = criterion(outputs, labels.float())
                    loss.backward()
                    optimizer.step()

                    average_loss += loss.item()
                    batchwise_losses.append(loss.item())
                # NOTE(review): relies on loop variable `i` after the inner
                # loop; raises NameError if train_loader is empty.
                average_loss_per_epoch = old_div(average_loss, (i + 1))
                print("+" * 80)
                print("EPOCH - ", epoch)
                print("BATCHWISE_LOSSES shape - ", len(batchwise_losses))
                print("AVERAGE LOSS PER EPOCH - ", average_loss_per_epoch)
                print("+" * 80)
            trainingTime = time.time() - st
            bestEstimator = network

            # Score the held-out set; each output row is a 1-element list.
            outputs_x_test_tensored = network(x_test_tensored.float())
            y_score_mid = outputs_x_test_tensored.tolist()
            y_score = [x[0] for x in y_score_mid]
            print("Y-SCORE - ", y_score)
            print("Y-SCORE length - ", len(y_score))
            y_prob = None
            featureImportance = {}

            objs = {
                "trained_model": bestEstimator,
                "actual": y_test,
                "predicted": y_score,
                "probability": y_prob,
                "feature_importance": featureImportance,
                "featureList": list(x_train.columns),
                "labelMapping": {}
            }
            #featureImportance = objs["trained_model"].feature_importances_
            #featuresArray = [(col_name, featureImportance[idx]) for idx, col_name in enumerate(x_train.columns)]
            featuresArray = []
            if not algoSetting.is_hyperparameter_tuning_enabled():
                # Fixed model id, e.g. "M0000001" depending on the configured
                # maximum name length.
                modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH -
                                         1) + "1"
                modelFilepathArr = model_filepath.split("/")[:-1]
                modelFilepathArr.append(modelName + ".pt")
                torch.save(objs["trained_model"], "/".join(modelFilepathArr))
                #joblib.dump(objs["trained_model"],"/".join(modelFilepathArr))
                runtime = round((time.time() - st), 2)
            else:
                # NOTE(review): `hyper_st` is not defined anywhere in this
                # method — this branch would raise NameError. Likewise
                # `modelName` is only bound in the other branch but is used
                # further below. Confirm against the hyper-parameter flow.
                runtime = round((time.time() - hyper_st), 2)

            # Best-effort PMML export; failures are deliberately swallowed.
            try:
                modelPmmlPipeline = PMMLPipeline([("pretrained-estimator",
                                                   objs["trained_model"])])
                modelPmmlPipeline.target_field = result_column
                modelPmmlPipeline.active_fields = np.array(
                    [col for col in x_train.columns if col != result_column])
                sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True)
                pmmlfile = open(pmml_filepath, "r")
                pmmlText = pmmlfile.read()
                pmmlfile.close()
                self._result_setter.update_pmml_object({self._slug: pmmlText})
            except:
                pass

            # Standard regression metrics on the held-out predictions.
            metrics = {}
            metrics["r2"] = r2_score(y_test, y_score)
            metrics["neg_mean_squared_error"] = mean_squared_error(
                y_test, y_score)
            metrics["neg_mean_absolute_error"] = mean_absolute_error(
                y_test, y_score)
            metrics["RMSE"] = sqrt(metrics["neg_mean_squared_error"])
            metrics["explained_variance_score"] = explained_variance_score(
                y_test, y_score)
            transformed = pd.DataFrame({
                "prediction": y_score,
                result_column: y_test
            })
            print("TRANSFORMED PREDICTION TYPE - ",
                  type(transformed["prediction"]))
            print(transformed["prediction"])
            print("TRANSFORMED RESULT COL TYPE - ",
                  type(transformed[result_column]))
            print(transformed[result_column])
            transformed["difference"] = transformed[
                result_column] - transformed["prediction"]
            # Absolute percentage error per row (division by actual value —
            # rows with an actual of 0 yield inf).
            transformed["mape"] = old_div(
                np.abs(transformed["difference"]) * 100,
                transformed[result_column])

            sampleData = None
            nrows = transformed.shape[0]
            # Cap the sample shipped to the UI at 100 rows (fixed seed).
            if nrows > 100:
                sampleData = transformed.sample(n=100, random_state=420)
            else:
                sampleData = transformed
            print(sampleData.head())

            # Histogram of MAPE values over the configured bin edges.
            # NOTE(review): this temporarily mutates the module-global
            # GLOBALSETTINGS.MAPEBINS (append then pop(5)) — not thread-safe
            # and assumes the list has exactly 6 elements afterwards.
            if transformed["mape"].max() > 100:
                GLOBALSETTINGS.MAPEBINS.append(transformed["mape"].max())
                mapeCountArr = list(
                    pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS).
                    value_counts().to_dict().items())
                GLOBALSETTINGS.MAPEBINS.pop(5)
            else:
                mapeCountArr = list(
                    pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS).
                    value_counts().to_dict().items())
            # Indexed bin summaries sorted by the lower edge of each interval.
            mapeStatsArr = [(str(idx), dictObj)
                            for idx, dictObj in enumerate(
                                sorted([{
                                    "count": x[1],
                                    "splitRange": (x[0].left, x[0].right)
                                } for x in mapeCountArr],
                                       key=lambda x: x["splitRange"][0]))]
            print(mapeStatsArr)
            print(mapeCountArr)

            # Quartile-based binning of the predictions.
            predictionColSummary = transformed["prediction"].describe(
            ).to_dict()
            quantileBins = [
                predictionColSummary["min"], predictionColSummary["25%"],
                predictionColSummary["50%"], predictionColSummary["75%"],
                predictionColSummary["max"]
            ]
            print(quantileBins)
            # De-duplicate edges (pd.cut requires strictly increasing bins).
            quantileBins = sorted(list(set(quantileBins)))
            transformed["quantileBinId"] = pd.cut(transformed["prediction"],
                                                  quantileBins)
            quantileDf = transformed.groupby("quantileBinId").agg({
                "prediction": [np.sum, np.mean, np.size]
            }).reset_index()
            quantileDf.columns = ["prediction", "sum", "mean", "count"]
            print(quantileDf)
            quantileArr = list(quantileDf.T.to_dict().items())
            quantileSummaryArr = [(obj[0], {
                "splitRange": (obj[1]["prediction"].left,
                               obj[1]["prediction"].right),
                "count": obj[1]["count"],
                "mean": obj[1]["mean"],
                "sum": obj[1]["sum"]
            }) for obj in quantileArr]
            print(quantileSummaryArr)
            runtime = round((time.time() - st_global), 2)

            # Populate the model-summary object consumed by the UI cards.
            self._model_summary.set_model_type("regression")
            self._model_summary.set_algorithm_name("Neural Network (PyTorch)")
            self._model_summary.set_algorithm_display_name(
                "Neural Network (PyTorch)")
            self._model_summary.set_slug(self._slug)
            # NOTE(review): set_training_time is called twice — the second
            # call (trainingTime) overwrites the first (runtime).
            self._model_summary.set_training_time(runtime)
            self._model_summary.set_training_time(trainingTime)
            self._model_summary.set_target_variable(result_column)
            self._model_summary.set_validation_method(
                validationDict["displayName"])
            self._model_summary.set_model_evaluation_metrics(metrics)
            self._model_summary.set_model_params(nnptr_params)
            self._model_summary.set_quantile_summary(quantileSummaryArr)
            self._model_summary.set_mape_stats(mapeStatsArr)
            self._model_summary.set_sample_data(sampleData.to_dict())
            self._model_summary.set_feature_importance(featuresArray)
            self._model_summary.set_feature_list(list(x_train.columns))
            self._model_summary.set_model_mse(
                metrics["neg_mean_squared_error"])
            self._model_summary.set_model_mae(
                metrics["neg_mean_absolute_error"])
            self._model_summary.set_rmse(metrics["RMSE"])
            self._model_summary.set_model_rsquared(metrics["r2"])
            self._model_summary.set_model_exp_variance_score(
                metrics["explained_variance_score"])

            # Second best-effort PMML export, this time to a plain (non
            # "file://") path. Failures are swallowed, as above.
            try:
                pmml_filepath = str(model_path) + "/" + str(
                    self._slug) + "/traindeModel.pmml"
                modelPmmlPipeline = PMMLPipeline([("pretrained-estimator",
                                                   objs["trained_model"])])
                modelPmmlPipeline.target_field = result_column
                modelPmmlPipeline.active_fields = np.array(
                    [col for col in x_train.columns if col != result_column])
                sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True)
                pmmlfile = open(pmml_filepath, "r")
                pmmlText = pmmlfile.read()
                pmmlfile.close()
                self._result_setter.update_pmml_object({self._slug: pmmlText})
            except:
                pass

            # NOTE(review): both branches below build identical dicts; the
            # if/else adds nothing. `modelName` is undefined when tuning is
            # enabled (see above).
            if algoSetting.is_hyperparameter_tuning_enabled():
                modelDropDownObj = {
                    "name": self._model_summary.get_algorithm_name(),
                    "evaluationMetricValue":
                    metrics[evaluationMetricDict["name"]],
                    "evaluationMetricName": evaluationMetricDict["name"],
                    "slug": self._model_summary.get_slug(),
                    "Model Id": modelName
                }
                modelSummaryJson = {
                    "dropdown": modelDropDownObj,
                    "levelcount": self._model_summary.get_level_counts(),
                    "modelFeatureList": self._model_summary.get_feature_list(),
                    "levelMapping": self._model_summary.get_level_map_dict(),
                    "slug": self._model_summary.get_slug(),
                    "name": self._model_summary.get_algorithm_name()
                }
            else:
                modelDropDownObj = {
                    "name": self._model_summary.get_algorithm_name(),
                    "evaluationMetricValue":
                    metrics[evaluationMetricDict["name"]],
                    "evaluationMetricName": evaluationMetricDict["name"],
                    "slug": self._model_summary.get_slug(),
                    "Model Id": modelName
                }
                modelSummaryJson = {
                    "dropdown": modelDropDownObj,
                    "levelcount": self._model_summary.get_level_counts(),
                    "modelFeatureList": self._model_summary.get_feature_list(),
                    "levelMapping": self._model_summary.get_level_map_dict(),
                    "slug": self._model_summary.get_slug(),
                    "name": self._model_summary.get_algorithm_name()
                }

            # Model-management (ops/audit) metadata.
            modelmanagement_ = nnptr_params
            self._model_management = MLModelSummary()
            if algoSetting.is_hyperparameter_tuning_enabled():
                pass
            else:
                self._model_management.set_layer_info(
                    data=modelmanagement_['hidden_layer_info'])
                self._model_management.set_loss_function(
                    data=modelmanagement_['loss'])
                self._model_management.set_optimizer(
                    data=modelmanagement_['optimizer'])
                self._model_management.set_batch_size(
                    data=modelmanagement_['batch_size'])
                self._model_management.set_no_epochs(
                    data=modelmanagement_['number_of_epochs'])
                # self._model_management.set_model_evaluation_metrics(data=modelmanagement_['metrics'])
            self._model_management.set_job_type(
                self._dataframe_context.get_job_name())  #Project name
            self._model_management.set_training_status(
                data="completed")  # training status
            # NOTE(review): passes the whole x_train frame, not a count —
            # presumably the setter derives the count; verify.
            self._model_management.set_no_of_independent_variables(
                data=x_train)  #no of independent varables
            self._model_management.set_training_time(runtime)  # run time
            self._model_management.set_rmse(metrics["RMSE"])
            # NOTE(review): label says "TensorFlow" in a PyTorch script —
            # looks like a copy-paste slip; confirm before relying on it.
            self._model_management.set_algorithm_name(
                "Neural Network (TensorFlow)")  #algorithm name
            self._model_management.set_validation_method(
                str(validationDict["displayName"]) + "(" +
                str(validationDict["value"]) + ")")  #validation method
            self._model_management.set_target_variable(
                result_column)  #target column name
            self._model_management.set_creation_date(data=str(
                datetime.now().strftime('%b %d ,%Y %H:%M ')))  #creation date
            self._model_management.set_datasetName(self._datasetName)

            # Rows for the "overview" summary table on the UI.
            modelManagementSummaryJson = [
                ["Project Name",
                 self._model_management.get_job_type()],
                ["Algorithm",
                 self._model_management.get_algorithm_name()],
                ["Training Status",
                 self._model_management.get_training_status()],
                ["RMSE", self._model_management.get_rmse()],
                ["RunTime", self._model_management.get_training_time()],
                #["Owner",None],
                ["Created On",
                 self._model_management.get_creation_date()]
            ]
            if algoSetting.is_hyperparameter_tuning_enabled():
                modelManagementModelSettingsJson = []
            else:
                modelManagementModelSettingsJson = [
                    ["Training Dataset",
                     self._model_management.get_datasetName()],
                    [
                        "Target Column",
                        self._model_management.get_target_variable()
                    ],
                    [
                        "Number Of Independent Variables",
                        self._model_management.get_no_of_independent_variables()
                    ],
                    ["Algorithm",
                     self._model_management.get_algorithm_name()],
                    [
                        "Model Validation",
                        self._model_management.get_validation_method()
                    ],
                    ["batch_size",
                     str(self._model_management.get_batch_size())],
                    ["Loss", self._model_management.get_loss_function()],
                    ["Optimizer", self._model_management.get_optimizer()],
                    ["Epochs", self._model_management.get_no_epochs()],
                    [
                        "Metrics",
                        self._model_management.get_model_evaluation_metrics()
                    ]
                ]
            # Append one "<layer type> <index>: key:value, ..." row per
            # configured hidden layer.
            for i in modelmanagement_["hidden_layer_info"]:
                string = ""
                key = str(modelmanagement_["hidden_layer_info"][i]
                          ["layer"]) + " " + str(i) + ":"
                for j in modelmanagement_["hidden_layer_info"][i]:
                    string = string + str(j) + ":" + str(
                        modelmanagement_["hidden_layer_info"][i][j]) + ", "
                modelManagementModelSettingsJson.append([key, string])
            print(modelManagementModelSettingsJson)

            # Serialize all card objects to plain JSON for the result setter.
            nnptrCards = [
                json.loads(CommonUtils.convert_python_object_to_json(cardObj))
                for cardObj in MLUtils.create_model_summary_cards(
                    self._model_summary)
            ]
            nnptrPerformanceCards = [
                json.loads(CommonUtils.convert_python_object_to_json(cardObj))
                for cardObj in MLUtils.create_model_management_cards_regression(
                    self._model_summary)
            ]
            nnptrOverviewCards = [
                json.loads(CommonUtils.convert_python_object_to_json(cardObj))
                for cardObj in MLUtils.create_model_management_card_overview(
                    self._model_management, modelManagementSummaryJson,
                    modelManagementModelSettingsJson)
            ]
            nnptrDeploymentCards = [
                json.loads(CommonUtils.convert_python_object_to_json(cardObj))
                for cardObj in
                MLUtils.create_model_management_deploy_empty_card()
            ]
            nnptr_Overview_Node = NarrativesTree()
            nnptr_Overview_Node.set_name("Overview")
            nnptr_Performance_Node = NarrativesTree()
            nnptr_Performance_Node.set_name("Performance")
            nnptr_Deployment_Node = NarrativesTree()
            nnptr_Deployment_Node.set_name("Deployment")
            for card in nnptrOverviewCards:
                nnptr_Overview_Node.add_a_card(card)
            for card in nnptrPerformanceCards:
                nnptr_Performance_Node.add_a_card(card)
            for card in nnptrDeploymentCards:
                nnptr_Deployment_Node.add_a_card(card)
            for card in nnptrCards:
                self._prediction_narrative.add_a_card(card)

            # Publish everything to the shared result setter.
            self._result_setter.set_model_summary({
                "Neural Network (PyTorch)":
                json.loads(
                    CommonUtils.convert_python_object_to_json(
                        self._model_summary))
            })
            self._result_setter.set_nnptr_regression_model_summary(
                modelSummaryJson)
            self._result_setter.set_nnptr_cards(nnptrCards)
            self._result_setter.set_nnptr_nodes([
                nnptr_Overview_Node, nnptr_Performance_Node,
                nnptr_Deployment_Node
            ])
            self._result_setter.set_nnptr_fail_card({
                "Algorithm_Name": "Neural Network (PyTorch)",
                "Success": "True"
            })
        # Progress message: script finished.
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "completion",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")
class DecisionTreeNarrative:
    """Builds the decision-tree narrative ("Prediction") node for a dimension.

    Consumes a ``decision_tree_rules`` result object, turns its per-target
    rules into human-readable prediction-rule tables, dropdowns and a donut
    chart, and publishes the assembled cards through ``result_setter``.
    Progress messages are emitted at start and end of narrative generation.

    Fixes vs. previous revision: the three remaining Python-2 ``print``
    statements (`print "levelCountTuple" ...`, `print "levelCountDict" ...`,
    `print data_dict`) were syntax errors under Python 3 and are now
    ``print()`` calls, matching the rest of the file.
    """

    # Rounding precision used for rule probabilities.
    MAX_FRACTION_DIGITS = 2

    def _get_new_table(self):
        """Populate ``self._new_table`` and the card-1 table rows.

        For each prediction level in ``self._table`` builds a
        ``{"rules": ..., "probability": [...]}`` entry (probabilities rounded
        to 2 decimals) and appends a ``[level, rules, probabilities]`` row to
        ``self._decisionTreeCard1Table``.
        """
        self._decisionTreeCard1Table = [["PREDICTION", "RULES", "PERCENTAGE"]]
        for keys in self._table.keys():
            self._new_table[keys] = {}
            self._new_table[keys]['rules'] = self._table[keys]
            self._new_table[keys]['probability'] = [
                round(i, 2) for i in self.success_percent[keys]
            ]
            keyTable = [
                keys, self._new_table[keys]['rules'],
                self._new_table[keys]['probability']
            ]
            self._decisionTreeCard1Table.append(keyTable)

    # @accepts(object, (str, basestring), DecisionTreeResult,DataFrameHelper,ContextSetter,ResultSetter,NarrativesTree,basestring,dict)
    def __init__(self,
                 column_name,
                 decision_tree_rules,
                 df_helper,
                 df_context,
                 meta_parser,
                 result_setter,
                 story_narrative=None,
                 analysisName=None,
                 scriptWeight=None):
        """Build the narrative immediately on construction.

        :param column_name: target dimension the tree predicts.
        :param decision_tree_rules: DecisionTreeResult-like object exposing
            rules, success counts/percentages and target contributions.
        :param df_helper: dataframe helper (column metadata).
        :param df_context: context setter (app config, progress plumbing).
        :param meta_parser: metadata parser (level dictionaries).
        :param result_setter: sink for the generated cards/nodes.
        :param story_narrative: optional narrative tree (currently unused).
        :param analysisName: overrides the context's analysis name if given.
        :param scriptWeight: overrides the context's analysis weights if given.
        """
        self._story_narrative = story_narrative
        self._metaParser = meta_parser
        self._dataframe_context = df_context
        self._ignoreMsg = self._dataframe_context.get_message_ignore()
        self._result_setter = result_setter
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        self._column_name = column_name.lower()
        self._colname = column_name
        # Capitalize only the first character, preserving the rest.
        self._capitalized_column_name = "%s%s" % (column_name[0].upper(),
                                                  column_name[1:])
        self._decision_rules_dict = decision_tree_rules.get_decision_rules()
        self._decision_tree_json = CommonUtils.as_dict(decision_tree_rules)
        self._decision_tree_raw = self._decision_rules_dict
        # self._decision_tree_raw = {"tree":{"children":None}}
        # self._decision_tree_raw['tree']["children"] = self._decision_tree_json['tree']["children"]
        self._table = decision_tree_rules.get_table()
        self._new_table = {}
        self.successful_predictions = decision_tree_rules.get_success()
        self.total_predictions = decision_tree_rules.get_total()
        self.success_percent = decision_tree_rules.get_success_percent()
        self._important_vars = decision_tree_rules.get_significant_vars()
        self._target_distribution = decision_tree_rules.get_target_contributions(
        )
        self._get_new_table()
        self._df_helper = df_helper
        self.subheader = None
        #self.table = {}
        self.dropdownComment = None
        self.dropdownValues = None
        self._base_dir = "/decisiontree/"

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        if analysisName == None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName
        self._messageURL = self._dataframe_context.get_message_url()
        if scriptWeight == None:
            self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight(
            )
        else:
            self._scriptWeightDict = scriptWeight
        self._scriptStages = {
            "dtreeNarrativeStart": {
                "summary": "Started the Decision Tree Narratives",
                "weight": 0
            },
            "dtreeNarrativeEnd": {
                "summary": "Narratives for Decision Tree Finished",
                "weight": 10
            },
        }

        # Emit the "started" progress message.
        self._completionStatus += self._scriptWeightDict[
            self._analysisName]["narratives"] * self._scriptStages[
                "dtreeNarrativeStart"]["weight"] / 10
        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName, "dtreeNarrativeStart", "info",
            self._scriptStages["dtreeNarrativeStart"]["summary"],
            self._completionStatus, self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsg)
        self._dataframe_context.update_completion_status(
            self._completionStatus)

        self._decisionTreeNode = NarrativesTree()
        self._decisionTreeNode.set_name("Prediction")
        self._generate_narratives()
        # self._story_narrative.add_a_node(self._decisionTreeNode)
        self._result_setter.set_decision_tree_node(self._decisionTreeNode)
        self._result_setter.set_score_dtree_cards(
            json.loads(
                CommonUtils.convert_python_object_to_json(
                    self._decisionTreeNode.get_all_cards())))

        # Emit the "finished" progress message.
        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        self._completionStatus += self._scriptWeightDict[
            self._analysisName]["narratives"] * self._scriptStages[
                "dtreeNarrativeEnd"]["weight"] / 10
        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName, "dtreeNarrativeEnd", "info",
            self._scriptStages["dtreeNarrativeEnd"]["summary"],
            self._completionStatus, self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsg)
        self._dataframe_context.update_completion_status(
            self._completionStatus)

    def _generate_narratives(self):
        """Entry point for narrative generation (summary card only)."""
        self._generate_summary()

    def _generate_summary(self):
        """Build the main prediction-rules card.

        Assembles the rules table (one row per rule, grouped "strong"/"mixed"
        by a 75% probability cutoff), the per-target dropdown, a donut chart
        of target frequencies (or probability ranges for scored data), and the
        narrative text rendered from the appropriate HTML template.
        """
        data_dict = {}
        rules_dict = self._table
        data_dict["blockSplitter"] = self._blockSplitter
        data_dict["targetcol"] = self._colname
        # Rules at/above this probability are labelled "strong".
        probabilityCutoff = 75
        probabilityGroups = [{
            "probability": probabilityCutoff,
            "count": 0,
            "range": [probabilityCutoff, 100]
        }, {
            "probability": probabilityCutoff - 1,
            "count": 0,
            "range": [0, probabilityCutoff - 1]
        }]
        tableArray = [[
            "Prediction Rule", "Probability", "Prediction", "Freq", "group",
            "richRules"
        ]]
        dropdownData = []
        chartDict = {}
        targetLevel = self._dataframe_context.get_target_level_for_model()
        probabilityArrayAll = []

        # Custom progress message while rules are being generated.
        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName,
            "custom",
            "info",
            "Generating Prediction rules",
            self._completionStatus,
            self._completionStatus,
            display=True)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=False)
        self._dataframe_context.update_completion_status(
            self._completionStatus)

        # Put the modelled target level first so it is selected by default.
        targetValues = [x for x in rules_dict.keys() if x == targetLevel
                        ] + [x for x in rules_dict.keys() if x != targetLevel]
        for idx, target in enumerate(targetValues):
            # First entry is pre-selected; scored-data stories get a
            # "<column> : <level>" display name.
            if idx == 0:
                if self._dataframe_context.get_story_on_scored_data() != True:
                    dropdownData.append({
                        "displayName": target,
                        "name": target,
                        "selected": True,
                        "id": idx + 1
                    })
                else:
                    dropdownData.append({
                        "displayName":
                        "{} : {}".format(self._colname, target),
                        "name": target,
                        "selected": True,
                        "id": idx + 1
                    })
            else:
                if self._dataframe_context.get_story_on_scored_data() != True:
                    dropdownData.append({
                        "displayName": target,
                        "name": target,
                        "selected": False,
                        "id": idx + 1
                    })
                else:
                    dropdownData.append({
                        "displayName":
                        "{} : {}".format(self._colname, target),
                        "name": target,
                        "selected": False,
                        "id": idx + 1
                    })
            rulesArray = rules_dict[target]
            probabilityArray = [
                round(x, 2) for x in self.success_percent[target]
            ]
            probabilityArrayAll += probabilityArray
            groupArray = [
                "strong" if x >= probabilityCutoff else "mixed"
                for x in probabilityArray
            ]
            # Tally how many rules fall into each probability band.
            for idx2, obj in enumerate(probabilityGroups):
                grpCount = len([
                    x for x in probabilityArray
                    if x >= obj["range"][0] and x <= obj["range"][1]
                ])
                obj["count"] += grpCount
                probabilityGroups[idx2] = obj
            predictionArray = [target] * len(rulesArray)
            freqArray = self.total_predictions[target]
            chartDict[target] = sum(freqArray)
            success = self.successful_predictions[target]
            success_percent = self.success_percent[target]
            richRulesArray = []
            crudeRuleArray = []
            analysisType = self._dataframe_context.get_analysis_type()
            targetCol = self._dataframe_context.get_result_column()
            # Flag whether the target column was binned by the user.
            binFlag = False
            if self._dataframe_context.get_custom_analysis_details() != None:
                binnedColObj = [
                    x["colName"] for x in
                    self._dataframe_context.get_custom_analysis_details()
                ]
                if binnedColObj != None and targetCol in binnedColObj:
                    binFlag = True
            for idx2, crudeRule in enumerate(rulesArray):
                richRule, crudeRule = NarrativesUtils.generate_rules(
                    self._colname,
                    target,
                    crudeRule,
                    freqArray[idx2],
                    success[idx2],
                    success_percent[idx2],
                    analysisType,
                    binFlag=binFlag)
                richRulesArray.append(richRule)
                crudeRuleArray.append(crudeRule)
            # Humanize probabilities (word form for >= 10, "<n>%" otherwise).
            probabilityArray = map(
                lambda x: humanize.apnumber(x) + "%"
                if x >= 10 else str(int(x)) + "%", probabilityArray)
            # targetArray = zip(richRulesArray,probabilityArray,predictionArray,freqArray,groupArray)
            targetArray = zip(crudeRuleArray, probabilityArray,
                              predictionArray, freqArray, groupArray,
                              richRulesArray)
            targetArray = [list(x) for x in targetArray]
            tableArray += targetArray

        donutChartMaxLevel = 10
        if self._dataframe_context.get_story_on_scored_data() == True:
            # For scored data the donut shows counts per probability range.
            chartDict = {}
            probabilityRangeForChart = GLOBALSETTINGS.PROBABILITY_RANGE_FOR_DONUT_CHART
            chartDict = dict(
                zip(probabilityRangeForChart.keys(),
                    [0] * len(probabilityRangeForChart)))
            for val in probabilityArrayAll:
                for grps, grpRange in probabilityRangeForChart.items():
                    if val > grpRange[0] and val <= grpRange[1]:
                        chartDict[grps] = chartDict[grps] + 1
            chartDict = {k: v for k, v in chartDict.items() if v != 0}
        else:
            # Otherwise the donut shows total predictions per target level.
            chartDict = dict([(k, sum(v))
                              for k, v in self.total_predictions.items()])
            chartDict = {k: v for k, v in chartDict.items() if v != 0}
        if len(chartDict) > donutChartMaxLevel:
            chartDict = NarrativesUtils.restructure_donut_chart_data(
                chartDict, nLevels=donutChartMaxLevel)
        chartData = NormalChartData([chartDict]).get_data()
        chartJson = ChartJson(data=chartData)
        chartJson.set_title(self._colname)
        chartJson.set_chart_type("donut")
        mainCardChart = C3ChartData(data=chartJson)
        mainCardChart.set_width_percent(45)

        dropdownDict = {
            "dataType": "dropdown",
            "label": "Showing prediction rules for",
            "data": dropdownData
        }
        data_dict["probabilityGroups"] = probabilityGroups
        if self._dataframe_context.get_story_on_scored_data() != True:
            maincardSummary = NarrativesUtils.get_template_output(
                self._base_dir, 'decisiontreesummary.html', data_dict)
        else:
            # Summaries for scored data: aggregate predicted-level counts
            # from the rule table built above.
            predictedLevelcountArray = [(x[2], x[3]) for x in tableArray[1:]]
            predictedLevelCountDict = {}
            # predictedLevelcountDict = defaultdict(predictedLevelcountArray)
            for val in predictedLevelcountArray:
                predictedLevelCountDict.setdefault(val[0], []).append(val[1])
            levelCountDict = {}
            for k, v in predictedLevelCountDict.items():
                levelCountDict[k] = sum(v)
            # levelCountDict = self._metaParser.get_unique_level_dict(self._colname)
            total = float(
                sum([x for x in levelCountDict.values() if x != None]))
            levelCountTuple = [{
                "name": k,
                "count": v,
                "percentage": round(v * 100 / total, 2)
            } for k, v in levelCountDict.items() if v != None]
            percentageArray = [x["percentage"] for x in levelCountTuple]
            percentageArray = NarrativesUtils.ret_smart_round(percentageArray)
            levelCountTuple = [{
                "name": obj["name"],
                "count": obj["count"],
                "percentage": str(percentageArray[idx]) + "%"
            } for idx, obj in enumerate(levelCountTuple)]
            data_dict["nlevel"] = len(levelCountDict)
            print("levelCountTuple", levelCountTuple)
            print("levelCountDict", levelCountDict)
            # Pick the headline level (the modelled target level if present,
            # otherwise the first level) and the runner-up.
            if targetLevel in levelCountDict:
                data_dict["topLevel"] = [
                    x for x in levelCountTuple if x["name"] == targetLevel
                ][0]
                if len(levelCountTuple) > 1:
                    data_dict["secondLevel"] = max([
                        x for x in levelCountTuple if x["name"] != targetLevel
                    ], key=lambda x: x["count"])
                else:
                    data_dict["secondLevel"] = None
            else:
                data_dict["topLevel"] = levelCountTuple[0]
                if len(levelCountTuple) > 1:
                    data_dict["secondLevel"] = levelCountTuple[1]
                else:
                    data_dict["secondLevel"] = None
            print(data_dict)
            maincardSummary = NarrativesUtils.get_template_output(
                self._base_dir, 'decisiontreescore.html', data_dict)

        # Assemble the card: narrative text, chart, dropdown, rules table.
        main_card = NormalCard()
        main_card_data = []
        main_card_narrative = NarrativesUtils.block_splitter(
            maincardSummary, self._blockSplitter)
        main_card_data += main_card_narrative
        main_card_data.append(mainCardChart)
        main_card_data.append(dropdownDict)
        main_card_table = TableData()
        if self._dataframe_context.get_story_on_scored_data() == True:
            main_card_table.set_table_width(75)
        main_card_table.set_table_data(tableArray)
        main_card_table.set_table_type("popupDecisionTreeTable")
        main_card_data.append(main_card_table)
        uidTable = self._result_setter.get_unique_identifier_table()
        if uidTable != None:
            main_card_data.append(uidTable)
        else:
            main_card_table.set_table_width(100)
        main_card.set_card_data(main_card_data)
        main_card.set_card_name("Predicting Key Drivers of {}".format(
            self._colname))
        self._decisionTreeNode.add_a_card(main_card)
    def Train(self):
        """Train a Neural Network (TensorFlow/Keras) regression model and publish results.

        Builds a Sequential model from the user-supplied layer configuration
        (``hidden_layer_info``), fits it on the train split, computes regression
        metrics plus MAPE/quantile summaries on the test split, and publishes
        summary / model-management cards through ``self._result_setter``.

        Returns:
            None. All outputs are emitted as side effects on
            ``self._model_summary``, ``self._model_management``,
            ``self._prediction_narrative`` and ``self._result_setter``.
        """
        st_global = time.time()
        # Announce script start on the job's progress channel.
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context, self._scriptWeightDict,
            self._scriptStages, self._slug, "initialization", "info",
            display=True, emptyBin=False, customMsg=None, weightKey="total")
        appType = self._dataframe_context.get_app_type()
        algosToRun = self._dataframe_context.get_algorithms_to_run()
        # Settings object for this specific algorithm, matched by slug.
        algoSetting = [
            x for x in algosToRun if x.get_algorithm_slug() == self._slug
        ][0]
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()
        # Drop the unique-identifier column from the categorical feature set
        # if the metadata layer flagged it as ignorable.
        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})
        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns) - set(allDateCols))
        print(categorical_columns)
        result_column = self._dataframe_context.get_result_column()
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        numerical_columns = [
            x for x in numerical_columns if x != result_column
        ]
        model_path = self._dataframe_context.get_model_path()
        # Strip a leading "file://" scheme (7 characters) if present.
        if model_path.startswith("file"):
            model_path = model_path[7:]
        validationDict = self._dataframe_context.get_validation_dict()
        print("model_path", model_path)
        pipeline_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/pipeline/"
        model_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/model"
        pmml_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/modelPmml"
        df = self._data_frame
        if self._mlEnv == "spark":
            # Spark training path is intentionally not implemented here.
            pass
        elif self._mlEnv == "sklearn":
            model_filepath = model_path + "/" + self._slug + "/model.pkl"
            x_train, x_test, y_train, y_test = self._dataframe_helper.get_train_test_data(
            )
            # One-hot encode categoricals on both splits, then align the test
            # frame's columns to the train frame's dummy columns.
            x_train = MLUtils.create_dummy_columns(
                x_train,
                [x for x in categorical_columns if x != result_column])
            x_test = MLUtils.create_dummy_columns(
                x_test,
                [x for x in categorical_columns if x != result_column])
            x_test = MLUtils.fill_missing_columns(x_test, x_train.columns,
                                                  result_column)
            st = time.time()
            CommonUtils.create_update_and_save_progress_message(
                self._dataframe_context, self._scriptWeightDict,
                self._scriptStages, self._slug, "training", "info",
                display=True, emptyBin=False, customMsg=None,
                weightKey="total")
            if algoSetting.is_hyperparameter_tuning_enabled():
                # NOTE(review): tuning path is a no-op, yet later code reads
                # `model`/`params_tf`/`metrics`/`modelName` unconditionally —
                # confirm upstream guarantees tuning is disabled for this algo.
                pass
            else:
                self._result_setter.set_hyper_parameter_results(
                    self._slug, None)
                # "evaluvation" is the project API's spelling (do not "fix").
                evaluationMetricDict = algoSetting.get_evaluvation_metric(
                    Type="Regression")
                evaluationMetricDict[
                    "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[
                        evaluationMetricDict["name"]]
                params_tf = algoSetting.get_tf_params_dict()
                algoParams = algoSetting.get_params_dict()
                algoParams = {k: v for k, v in list(algoParams.items())}
                model = tf.keras.models.Sequential()
                first_layer_flag = True
                # hidden_layer_info is keyed by stringified indices "0","1",...
                for i in range(len(list(
                        params_tf['hidden_layer_info'].keys()))):
                    if params_tf['hidden_layer_info'][str(
                            i)]["layer"] == "Dense":
                        if first_layer_flag:
                            # First Dense layer additionally carries the
                            # input_shape (one feature per train column).
                            model.add(
                                tf.keras.layers.Dense(
                                    params_tf['hidden_layer_info'][str(
                                        i)]["units"],
                                    activation=params_tf['hidden_layer_info'][
                                        str(i)]["activation"],
                                    input_shape=(len(x_train.columns), ),
                                    use_bias=params_tf['hidden_layer_info'][
                                        str(i)]["use_bias"],
                                    kernel_initializer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["kernel_initializer"],
                                    bias_initializer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["bias_initializer"],
                                    kernel_regularizer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["kernel_regularizer"],
                                    bias_regularizer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["bias_regularizer"],
                                    activity_regularizer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["activity_regularizer"],
                                    kernel_constraint=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["kernel_constraint"],
                                    bias_constraint=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["bias_constraint"]))
                            try:
                                # "batch_normalization" key may be absent for
                                # a layer; bare except swallows the KeyError.
                                if params_tf['hidden_layer_info'][str(
                                        i)]["batch_normalization"] == "True":
                                    model.add(
                                        tf.keras.layers.BatchNormalization())
                            except:
                                print(
                                    "BATCH_NORM_FAILED ##########################"
                                )
                                pass
                            first_layer_flag = False
                        else:
                            # Subsequent Dense layers: same config, no
                            # input_shape.
                            model.add(
                                tf.keras.layers.Dense(
                                    params_tf['hidden_layer_info'][str(
                                        i)]["units"],
                                    activation=params_tf['hidden_layer_info'][
                                        str(i)]["activation"],
                                    use_bias=params_tf['hidden_layer_info'][
                                        str(i)]["use_bias"],
                                    kernel_initializer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["kernel_initializer"],
                                    bias_initializer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["bias_initializer"],
                                    kernel_regularizer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["kernel_regularizer"],
                                    bias_regularizer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["bias_regularizer"],
                                    activity_regularizer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["activity_regularizer"],
                                    kernel_constraint=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["kernel_constraint"],
                                    bias_constraint=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["bias_constraint"]))
                            try:
                                if params_tf['hidden_layer_info'][str(
                                        i)]["batch_normalization"] == "True":
                                    model.add(
                                        tf.keras.layers.BatchNormalization())
                            except:
                                print(
                                    "BATCH_NORM_FAILED ##########################"
                                )
                                pass
                    elif params_tf['hidden_layer_info'][str(
                            i)]["layer"] == "Dropout":
                        model.add(
                            tf.keras.layers.Dropout(
                                float(params_tf['hidden_layer_info'][str(i)]
                                      ["rate"])))
                    elif params_tf['hidden_layer_info'][str(
                            i)]["layer"] == "Lambda":
                        # Element-wise arithmetic layers; "units" doubles as
                        # the scalar operand here.
                        if params_tf['hidden_layer_info'][str(
                                i)]["lambda"] == "Addition":
                            model.add(
                                tf.keras.layers.Lambda(lambda x: x + int(
                                    params_tf['hidden_layer_info'][str(i)][
                                        "units"])))
                        if params_tf['hidden_layer_info'][str(
                                i)]["lambda"] == "Multiplication":
                            model.add(
                                tf.keras.layers.Lambda(lambda x: x * int(
                                    params_tf['hidden_layer_info'][str(i)][
                                        "units"])))
                        if params_tf['hidden_layer_info'][str(
                                i)]["lambda"] == "Subtraction":
                            model.add(
                                tf.keras.layers.Lambda(lambda x: x - int(
                                    params_tf['hidden_layer_info'][str(i)][
                                        "units"])))
                        if params_tf['hidden_layer_info'][str(
                                i)]["lambda"] == "Division":
                            model.add(
                                tf.keras.layers.Lambda(lambda x: old_div(
                                    x,
                                    int(params_tf['hidden_layer_info'][str(i)][
                                        "units"]))))
                model.compile(optimizer=algoParams["optimizer"],
                              loss=algoParams["loss"],
                              metrics=[algoParams['metrics']])
                model.fit(x_train,
                          y_train,
                          epochs=algoParams["number_of_epochs"],
                          verbose=1,
                          batch_size=algoParams["batch_size"])
                bestEstimator = model
                print(model.summary())
            trainingTime = time.time() - st
            y_score = bestEstimator.predict(x_test)
            y_score = list(y_score.flatten())
            try:
                # Keras models expose no predict_proba; fall back to zeros.
                y_prob = bestEstimator.predict_proba(x_test)
            except:
                y_prob = [0] * len(y_score)
            featureImportance = {}
            objs = {
                "trained_model": bestEstimator,
                "actual": y_test,
                "predicted": y_score,
                "probability": y_prob,
                "feature_importance": featureImportance,
                "featureList": list(x_train.columns),
                "labelMapping": {}
            }
            #featureImportance = objs["trained_model"].feature_importances_
            #featuresArray = [(col_name, featureImportance[idx]) for idx, col_name in enumerate(x_train.columns)]
            featuresArray = []
            if not algoSetting.is_hyperparameter_tuning_enabled():
                # Fixed-width model id, e.g. "M0001".
                modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH -
                                         1) + "1"
                modelFilepathArr = model_filepath.split("/")[:-1]
                modelFilepathArr.append(modelName + ".h5")
                objs["trained_model"].save("/".join(modelFilepathArr))
                #joblib.dump(objs["trained_model"],"/".join(modelFilepathArr))
            metrics = {}
            metrics["r2"] = r2_score(y_test, y_score)
            metrics["neg_mean_squared_error"] = mean_squared_error(
                y_test, y_score)
            metrics["neg_mean_absolute_error"] = mean_absolute_error(
                y_test, y_score)
            metrics["RMSE"] = sqrt(metrics["neg_mean_squared_error"])
            metrics["explained_variance_score"] = explained_variance_score(
                y_test, y_score)
            transformed = pd.DataFrame({
                "prediction": y_score,
                result_column: y_test
            })
            transformed["difference"] = transformed[
                result_column] - transformed["prediction"]
            # Per-row absolute percentage error.
            transformed["mape"] = old_div(
                np.abs(transformed["difference"]) * 100,
                transformed[result_column])
            sampleData = None
            nrows = transformed.shape[0]
            # Cap the sample shipped to the UI at 100 rows (fixed seed).
            if nrows > 100:
                sampleData = transformed.sample(n=100, random_state=420)
            else:
                sampleData = transformed
            print(sampleData.head())
            if transformed["mape"].max() > 100:
                # NOTE(review): temporarily extends the shared global
                # GLOBALSETTINGS.MAPEBINS and pops index 5 afterwards — this
                # mutation is not thread-safe and assumes the list has exactly
                # 6 entries at this point; confirm.
                GLOBALSETTINGS.MAPEBINS.append(transformed["mape"].max())
                mapeCountArr = list(
                    pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS).
                    value_counts().to_dict().items())
                GLOBALSETTINGS.MAPEBINS.pop(5)
            else:
                mapeCountArr = list(
                    pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS).
                    value_counts().to_dict().items())
            # Sort bins by their left edge and re-key as "0","1",...
            mapeStatsArr = [(str(idx), dictObj)
                            for idx, dictObj in enumerate(
                                sorted([{
                                    "count": x[1],
                                    "splitRange": (x[0].left, x[0].right)
                                } for x in mapeCountArr],
                                       key=lambda x: x["splitRange"][0]))]
            print(mapeStatsArr)
            print(mapeCountArr)
            # Quartile edges of the predictions drive the quantile summary.
            predictionColSummary = transformed["prediction"].describe(
            ).to_dict()
            quantileBins = [
                predictionColSummary["min"], predictionColSummary["25%"],
                predictionColSummary["50%"], predictionColSummary["75%"],
                predictionColSummary["max"]
            ]
            print(quantileBins)
            # Deduplicate edges (pd.cut rejects duplicate bin boundaries).
            quantileBins = sorted(list(set(quantileBins)))
            transformed["quantileBinId"] = pd.cut(transformed["prediction"],
                                                  quantileBins)
            quantileDf = transformed.groupby("quantileBinId").agg({
                "prediction": [np.sum, np.mean, np.size]
            }).reset_index()
            quantileDf.columns = ["prediction", "sum", "mean", "count"]
            print(quantileDf)
            quantileArr = list(quantileDf.T.to_dict().items())
            quantileSummaryArr = [(obj[0], {
                "splitRange":
                (obj[1]["prediction"].left, obj[1]["prediction"].right),
                "count": obj[1]["count"],
                "mean": obj[1]["mean"],
                "sum": obj[1]["sum"]
            }) for obj in quantileArr]
            print(quantileSummaryArr)
            runtime = round((time.time() - st_global), 2)
            self._model_summary.set_model_type("regression")
            self._model_summary.set_algorithm_name(
                "Neural Network (TensorFlow)")
            self._model_summary.set_algorithm_display_name(
                "Neural Network (TensorFlow)")
            self._model_summary.set_slug(self._slug)
            self._model_summary.set_training_time(runtime)
            # NOTE(review): this second call overwrites the `runtime` value set
            # just above with `trainingTime` — confirm which one is intended.
            self._model_summary.set_training_time(trainingTime)
            self._model_summary.set_target_variable(result_column)
            self._model_summary.set_validation_method(
                validationDict["displayName"])
            self._model_summary.set_model_evaluation_metrics(metrics)
            self._model_summary.set_model_params(params_tf)
            self._model_summary.set_quantile_summary(quantileSummaryArr)
            self._model_summary.set_mape_stats(mapeStatsArr)
            self._model_summary.set_sample_data(sampleData.to_dict())
            self._model_summary.set_feature_importance(featuresArray)
            self._model_summary.set_feature_list(list(x_train.columns))
            self._model_summary.set_model_mse(
                metrics["neg_mean_squared_error"])
            self._model_summary.set_model_mae(
                metrics["neg_mean_absolute_error"])
            self._model_summary.set_rmse(metrics["RMSE"])
            self._model_summary.set_model_rsquared(metrics["r2"])
            self._model_summary.set_model_exp_variance_score(
                metrics["explained_variance_score"])
            try:
                # Best-effort PMML export; any failure is deliberately
                # swallowed. ("traindeModel" spelling is the existing on-disk
                # contract — do not change silently.)
                pmml_filepath = str(model_path) + "/" + str(
                    self._slug) + "/traindeModel.pmml"
                modelPmmlPipeline = PMMLPipeline([("pretrained-estimator",
                                                   objs["trained_model"])])
                modelPmmlPipeline.target_field = result_column
                modelPmmlPipeline.active_fields = np.array(
                    [col for col in x_train.columns if col != result_column])
                sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True)
                pmmlfile = open(pmml_filepath, "r")
                pmmlText = pmmlfile.read()
                pmmlfile.close()
                self._result_setter.update_pmml_object({self._slug: pmmlText})
            except:
                pass
            # NOTE(review): both branches below build identical dicts; kept
            # as-is to preserve behavior.
            if algoSetting.is_hyperparameter_tuning_enabled():
                modelDropDownObj = {
                    "name": self._model_summary.get_algorithm_name(),
                    "evaluationMetricValue":
                    metrics[evaluationMetricDict["name"]],
                    "evaluationMetricName": evaluationMetricDict["name"],
                    "slug": self._model_summary.get_slug(),
                    "Model Id": modelName
                }
                modelSummaryJson = {
                    "dropdown": modelDropDownObj,
                    "levelcount": self._model_summary.get_level_counts(),
                    "modelFeatureList":
                    self._model_summary.get_feature_list(),
                    "levelMapping": self._model_summary.get_level_map_dict(),
                    "slug": self._model_summary.get_slug(),
                    "name": self._model_summary.get_algorithm_name()
                }
            else:
                modelDropDownObj = {
                    "name": self._model_summary.get_algorithm_name(),
                    "evaluationMetricValue":
                    metrics[evaluationMetricDict["name"]],
                    "evaluationMetricName": evaluationMetricDict["name"],
                    "slug": self._model_summary.get_slug(),
                    "Model Id": modelName
                }
                modelSummaryJson = {
                    "dropdown": modelDropDownObj,
                    "levelcount": self._model_summary.get_level_counts(),
                    "modelFeatureList":
                    self._model_summary.get_feature_list(),
                    "levelMapping": self._model_summary.get_level_map_dict(),
                    "slug": self._model_summary.get_slug(),
                    "name": self._model_summary.get_algorithm_name()
                }
            # NOTE: this aliases and mutates params_tf in place.
            modelmanagement_ = params_tf
            modelmanagement_.update(algoParams)
            self._model_management = MLModelSummary()
            if algoSetting.is_hyperparameter_tuning_enabled():
                pass
            else:
                self._model_management.set_layer_info(
                    data=modelmanagement_['hidden_layer_info'])
                self._model_management.set_loss_function(
                    data=modelmanagement_['loss'])
                self._model_management.set_optimizer(
                    data=modelmanagement_['optimizer'])
                self._model_management.set_batch_size(
                    data=modelmanagement_['batch_size'])
                self._model_management.set_no_epochs(
                    data=modelmanagement_['number_of_epochs'])
                self._model_management.set_model_evaluation_metrics(
                    data=modelmanagement_['metrics'])
                self._model_management.set_job_type(
                    self._dataframe_context.get_job_name())  #Project name
                self._model_management.set_training_status(
                    data="completed")  # training status
                # NOTE(review): passes the whole x_train frame, not a count —
                # presumably the setter derives the count; verify.
                self._model_management.set_no_of_independent_variables(
                    data=x_train)  #no of independent varables
                self._model_management.set_training_time(runtime)  # run time
                self._model_management.set_rmse(metrics["RMSE"])
                self._model_management.set_algorithm_name(
                    "Neural Network (TensorFlow)")  #algorithm name
                self._model_management.set_validation_method(
                    str(validationDict["displayName"]) + "(" +
                    str(validationDict["value"]) + ")")  #validation method
                self._model_management.set_target_variable(
                    result_column)  #target column name
                self._model_management.set_creation_date(data=str(
                    datetime.now().strftime('%b %d ,%Y %H:%M ')))  #creation date
                self._model_management.set_datasetName(self._datasetName)
            # [label, value] rows rendered on the model-management overview.
            modelManagementSummaryJson = [
                ["Project Name", self._model_management.get_job_type()],
                ["Algorithm", self._model_management.get_algorithm_name()],
                ["Training Status",
                 self._model_management.get_training_status()],
                ["RMSE", self._model_management.get_rmse()],
                ["RunTime", self._model_management.get_training_time()],
                #["Owner",None],
                ["Created On", self._model_management.get_creation_date()]
            ]
            if algoSetting.is_hyperparameter_tuning_enabled():
                modelManagementModelSettingsJson = []
            else:
                modelManagementModelSettingsJson = [
                    ["Training Dataset",
                     self._model_management.get_datasetName()],
                    [
                        "Target Column",
                        self._model_management.get_target_variable()
                    ],
                    [
                        "Number Of Independent Variables",
                        self._model_management.get_no_of_independent_variables()
                    ],
                    ["Algorithm",
                     self._model_management.get_algorithm_name()],
                    [
                        "Model Validation",
                        self._model_management.get_validation_method()
                    ],
                    ["batch_size",
                     str(self._model_management.get_batch_size())],
                    ["Loss", self._model_management.get_loss_function()],
                    ["Optimizer", self._model_management.get_optimizer()],
                    ["Epochs", self._model_management.get_no_epochs()],
                    [
                        "Metrics",
                        self._model_management.get_model_evaluation_metrics()
                    ]
                ]
                # Flatten every per-layer setting into labelled rows.
                for i in range(
                        len(list(modelmanagement_['hidden_layer_info'].keys()))):
                    string = ""  # NOTE(review): unused variable, kept as-is
                    key = "layer No-" + str(i) + "-" + str(
                        modelmanagement_["hidden_layer_info"][str(i)]["layer"]
                        + "-")
                    for j in modelmanagement_["hidden_layer_info"][str(i)]:
                        modelManagementModelSettingsJson.append([
                            key + j + ":",
                            modelmanagement_["hidden_layer_info"][str(i)][j]
                        ])
            print(modelManagementModelSettingsJson)
            # Serialize card objects to plain JSON for the result payload.
            tfregCards = [
                json.loads(CommonUtils.convert_python_object_to_json(cardObj))
                for cardObj in MLUtils.create_model_summary_cards(
                    self._model_summary)
            ]
            tfregPerformanceCards = [
                json.loads(CommonUtils.convert_python_object_to_json(cardObj))
                for cardObj in MLUtils.create_model_management_cards_regression(
                    self._model_summary)
            ]
            tfregOverviewCards = [
                json.loads(CommonUtils.convert_python_object_to_json(cardObj))
                for cardObj in MLUtils.create_model_management_card_overview(
                    self._model_management, modelManagementSummaryJson,
                    modelManagementModelSettingsJson)
            ]
            tfregDeploymentCards = [
                json.loads(CommonUtils.convert_python_object_to_json(cardObj))
                for cardObj in MLUtils.create_model_management_deploy_empty_card()
            ]
            TFReg_Overview_Node = NarrativesTree()
            TFReg_Overview_Node.set_name("Overview")
            TFReg_Performance_Node = NarrativesTree()
            TFReg_Performance_Node.set_name("Performance")
            TFReg_Deployment_Node = NarrativesTree()
            TFReg_Deployment_Node.set_name("Deployment")
            for card in tfregOverviewCards:
                TFReg_Overview_Node.add_a_card(card)
            for card in tfregPerformanceCards:
                TFReg_Performance_Node.add_a_card(card)
            for card in tfregDeploymentCards:
                TFReg_Deployment_Node.add_a_card(card)
            for card in tfregCards:
                self._prediction_narrative.add_a_card(card)
            self._result_setter.set_model_summary({
                "Neural Network (TensorFlow)":
                json.loads(
                    CommonUtils.convert_python_object_to_json(
                        self._model_summary))
            })
            # "summart" is the existing setter's spelling in the project API.
            self._result_setter.set_tfreg_regression_model_summart(
                modelSummaryJson)
            self._result_setter.set_tfreg_cards(tfregCards)
            self._result_setter.set_tfreg_nodes([
                TFReg_Overview_Node, TFReg_Performance_Node,
                TFReg_Deployment_Node
            ])
            self._result_setter.set_tfreg_fail_card({
                "Algorithm_Name": "Neural Network (TensorFlow)",
                "Success": "True"
            })
        # Final progress update for this algorithm.
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context, self._scriptWeightDict,
            self._scriptStages, self._slug, "completion", "info",
            display=True, emptyBin=False, customMsg=None, weightKey="total")
class AnovaNarratives(object):
    """Builds narrative cards from one-way ANOVA results.

    For each analysed measure column, renders an "Overview of Key Factors"
    card (templated text + an effect-size bar chart with statistical
    inference) and per-dimension drill-down nodes, attaching everything to the
    story narrative tree via the result setter.
    """

    # Significance threshold used by the ANOVA analysis.
    ALPHA = 0.05

    # Dictionary keys used when assembling the narrative payload.
    KEY_SUMMARY = 'summary'
    KEY_NARRATIVES = 'narratives'
    KEY_TAKEAWAY = 'key_takeaway'
    DRILL_DOWN = 'drill_down_narrative'
    KEY_CARD = 'card'
    KEY_HEADING = 'heading'
    KEY_SUBHEADING = 'header'
    KEY_CHART = 'charts'
    KEY_PARAGRAPH = 'paragraphs'
    KEY_PARA_HEADER = 'header'
    KEY_PARA_CONTENT = 'content'
    KEY_BUBBLE = 'bubble_data'

    # @accepts(object, DFAnovaResult, DataFrameHelper)
    def __init__(self,
                 df_anova_result,
                 df_helper,
                 df_context,
                 result_setter,
                 story_narrative,
                 scriptWeight=None,
                 analysisName=None):
        """Generate all ANOVA narratives immediately on construction.

        Args:
            df_anova_result: ANOVA result object (per-measure results).
            df_helper: dataframe helper (unused beyond storage here).
            df_context: job/dataframe context (progress, settings, urls).
            result_setter: sink for the generated narrative nodes.
            story_narrative: root narrative tree to attach nodes to.
            scriptWeight: optional progress-weight dict; falls back to the
                context's measure-analysis weights when None.
            analysisName: optional override for the analysis name.
        """
        self._story_narrative = story_narrative
        self._result_setter = result_setter
        self._dataframe_context = df_context
        self._df_anova_result = df_anova_result
        self._df_helper = df_helper
        self.narratives = {}
        self.narratives['variables'] = ''
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        # Template directory (relative) for the ANOVA html templates.
        self._base_dir = "/anova/"
        # NOTE(review): this assignment is redundant — it is overwritten by
        # the analysisName check just below.
        self._analysisName = self._dataframe_context.get_analysis_name()
        self._analysisDict = self._dataframe_context.get_analysis_dict()
        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        self._messageURL = self._dataframe_context.get_message_url()
        if analysisName == None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName
        if scriptWeight == None:
            self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight(
            )
        else:
            self._scriptWeightDict = scriptWeight
        # Progress stages reported while narratives are generated.
        self._scriptStages = {
            "anovaNarrativeStart": {
                "summary": "Started The Anova Narratives",
                "weight": 0
            },
            "anovaNarrativeEnd": {
                "summary": "Narratives For Anova Finished",
                "weight": 10
            },
        }
        # (Legacy manual progress-message code removed; the CommonUtils helper
        # below performs the same update-and-save sequence.)
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "anovaNarrativeStart",
            "info",
            display=False,
            emptyBin=False,
            customMsg=None,
            weightKey="narratives")
        self._generate_narratives()
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "anovaNarrativeEnd",
            "info",
            display=False,
            emptyBin=False,
            customMsg=None,
            weightKey="narratives")
        # Only publish the node when at least one card was produced.
        if self._anovaNodes.get_card_count() > 0:
            self._story_narrative.add_a_node(self._anovaNodes)
            #self._generate_take_away()
            self._result_setter.set_anova_node(self._anovaNodes)

    def _generate_narratives(self):
        """Build the "Performance" node: one overview card per measure plus
        drill-down narratives for its significant dimensions."""
        try:
            # Optional cap on how many significant dimensions to narrate.
            nColsToUse = self._analysisDict[
                self._analysisName]["noOfColumnsToUse"]
        except:
            nColsToUse = None
        self._anovaNodes = NarrativesTree()
        self._anovaNodes.set_name("Performance")
        for measure_column in self._df_anova_result.get_measure_columns():
            measure_anova_result = self._df_anova_result.get_measure_result(
                measure_column)
            significant_dimensions_dict, insignificant_dimensions = measure_anova_result.get_OneWayAnovaSignificantDimensions(
            )
            num_dimensions = len(list(significant_dimensions_dict.items())
                                 ) + len(insignificant_dimensions)
            # Significant dimensions ordered by descending effect size.
            significant_dimensions = [
                k for k, v in sorted(list(significant_dimensions_dict.items()),
                                     key=lambda x: -x[1])
            ]
            if nColsToUse != None:
                significant_dimensions = significant_dimensions[:nColsToUse]
            num_significant_dimensions = len(significant_dimensions)
            num_insignificant_dimensions = len(insignificant_dimensions)
            print("num_significant_dimensions", num_significant_dimensions)
            if num_significant_dimensions > 0:
                mainCard = NormalCard(name="Overview of Key Factors")
                # Chart rows: one bar per significant dimension.
                data_c3 = []
                for sig_dim in significant_dimensions:
                    data_c3.append({
                        'dimension': sig_dim,
                        'effect_size':
                        float(significant_dimensions_dict[sig_dim])
                    })
                self.narratives = {}
                self.narratives[AnovaNarratives.
                                KEY_HEADING] = "%s Performance Analysis" % (
                                    measure_column, )
                self.narratives['main_card'] = {}
                self.narratives['cards'] = []
                self.narratives['main_card'][
                    AnovaNarratives.
                    KEY_SUBHEADING] = "Relationship between %s and other Dimensions" % (
                        measure_column)
                self.narratives['main_card'][
                    AnovaNarratives.KEY_PARAGRAPH] = []
                # Context passed into the html narrative templates.
                data_dict = { \
                    'significant_dimensions' : significant_dimensions,
                    'insignificant_dimensions' : insignificant_dimensions,
                    'num_significant_dimensions' : num_significant_dimensions,
                    'num_insignificant_dimensions' : num_insignificant_dimensions,
                    'num_dimensions' : num_significant_dimensions+num_insignificant_dimensions,
                    'target' : measure_column \
                }
                output = {'header': ''}
                output['content'] = NarrativesUtils.get_template_output(
                    self._base_dir, 'anova_template_1.html', data_dict)
                self.narratives['main_card'][
                    AnovaNarratives.KEY_PARAGRAPH].append(output)
                output1 = {'header': ''}
                output1['content'] = NarrativesUtils.get_template_output(
                    self._base_dir, 'anova_template_2.html', data_dict)
                lines = []
                lines += NarrativesUtils.block_splitter(
                    output['content'], self._blockSplitter)
                data_c3 = NormalChartData(data_c3)
                chart_data = data_c3.get_data()
                chartDataValues = []
                effect_size_values = []
                for obj in chart_data:
                    effect_size_values.append(obj["effect_size"])
                chart_data_min = min(effect_size_values)
                # Tiny effect sizes are stringified so the y-axis format
                # picker can detect the precision they need.
                if chart_data_min < 0.00001:
                    for obj in chart_data:
                        chartDataValues.append(str(obj["effect_size"]))
                else:
                    for obj in chart_data:
                        chartDataValues.append(obj["effect_size"])
                chart_json = ChartJson(data=chart_data,
                                       axes={
                                           'x': 'dimension',
                                           'y': 'effect_size'
                                       },
                                       label_text={
                                           'x': '',
                                           'y':
                                           'Effect Size (scaled exp values)'
                                       },
                                       chart_type='bar')
                chart_json.set_axis_rotation(True)
                # chart_json.set_yaxis_number_format(".4f")
                chart_json.set_yaxis_number_format(
                    NarrativesUtils.select_y_axis_format(chartDataValues))
                # st_info = ["Test : ANOVA", "Threshold for p-value : 0.05", "Effect Size : Tukey's HSD"]
                # chart_data is sorted by descending effect size, so the
                # first/last rows give the max/min effect dimensions.
                statistical_info_array = [
                    ("Test Type", "ANOVA"),
                    ("Effect Size", "ETA squared"),
                    ("Max Effect Size", chart_data[0]["dimension"]),
                    ("Min Effect Size", chart_data[-1]["dimension"]),
                ]
                # NOTE(review): variable name is a typo ("statistical_inferenc")
                # and is never read; the `!= ""` guard below therefore always
                # sees a non-empty statistical_inference. Kept as-is.
                statistical_inferenc = ""
                if len(chart_data) == 1:
                    statistical_inference = "{} is the only variable that have significant association with the {} (Target) having an \
Effect size of {}".format(
                        chart_data[0]["dimension"],
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["effect_size"], 4))
                elif len(chart_data) == 2:
                    statistical_inference = "There are two variables ({} and {}) that have significant association with the {} (Target) and the \
Effect size ranges are {} and {} respectively".format(
                        chart_data[0]["dimension"],
                        chart_data[1]["dimension"],
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["effect_size"], 4),
                        round(chart_data[1]["effect_size"], 4))
                else:
                    statistical_inference = "There are {} variables that have significant association with the {} (Target) and the \
Effect size ranges from {} to {}".format(
                        len(chart_data),
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["effect_size"], 4),
                        round(chart_data[-1]["effect_size"], 4))
                if statistical_inference != "":
                    statistical_info_array.append(
                        ("Inference", statistical_inference))
                statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                    statistical_info_array)
                lines += [
                    C3ChartData(data=chart_json, info=statistical_info_array)
                ]
                lines += NarrativesUtils.block_splitter(
                    output1['content'], self._blockSplitter)
                mainCard.set_card_data(lines)
                self._anovaNodes.add_a_card(mainCard)
                self.narratives['main_card'][
                    AnovaNarratives.KEY_PARAGRAPH].append(output1)
                self.narratives['main_card'][AnovaNarratives.KEY_CHART] = {}
                effect_size_chart = {
                    'heading': '',
                    'labels': {
                        'Dimension': 'Effect Size'
                    },
                    'data': significant_dimensions_dict
                }
                print(significant_dimensions_dict)
                self.narratives['main_card'][AnovaNarratives.KEY_CHART][
                    'effect_size'] = effect_size_chart
                # Custom (unweighted) progress ping while drilling down.
                progressMessage = CommonUtils.create_progress_message_object(
                    self._analysisName,
                    "custom",
                    "info",
                    "Analyzing Key Drivers",
                    self._completionStatus,
                    self._completionStatus,
                    display=True)
                CommonUtils.save_progress_message(self._messageURL,
                                                  progressMessage,
                                                  ignore=False)
                self._generate_dimension_narratives(significant_dimensions,
                                                    measure_anova_result,
                                                    measure_column)
            else:
                # No significant dimension for this measure: emit a plain
                # informational card instead of the full overview.
                mainCard = NormalCard(name="Overview of Key Factors")
                cardText = HtmlData(
                    "There are no dimensions in the dataset that have significant influence on {}"
                    .format(measure_column))
                mainCard.set_card_data([cardText])
                self._anovaNodes.add_a_card(mainCard)

    def _generate_dimension_narratives(self, significant_dimensions,
                                       measure_anova_result, measure):
        """Create one drill-down narrative node per significant dimension.

        Args:
            significant_dimensions: dimension names, ordered by effect size.
            measure_anova_result: per-measure ANOVA result object.
            measure: the measure (target) column name.
        """
        self.narratives['cards'] = []
        anova_trend_result = measure_anova_result.get_trend_data()
        # Placeholder entry so downstream consumers always find card keys.
        if len(significant_dimensions) == 0:
            self.narratives['cards'].append({
                'card1': '',
                'card2': '',
                'card3': ''
            })
        self.narratives['variables'] = significant_dimensions
        for dimension in significant_dimensions:
            dimensionNode = NarrativesTree(name=dimension)
            # OneWayAnovaNarratives populates dimensionNode as a side effect.
            narratives = OneWayAnovaNarratives(self._dataframe_context,
                                               measure, dimension,
                                               measure_anova_result,
                                               anova_trend_result,
                                               self._result_setter,
                                               dimensionNode, self._base_dir)
            self._anovaNodes.add_a_node(dimensionNode)
            self.narratives['cards'].append(narratives)
class TestChiSquare(unittest.TestCase): # def __init__(self): # pass def setUp(self): APP_NAME = "test" spark = CommonUtils.get_spark_session(app_name=APP_NAME, hive_environment=False) spark.sparkContext.setLogLevel("ERROR") # spark.conf.set("spark.sql.execution.arrow.enabled", "true") configJson = get_test_configs("testCase", testFor="chisquare") config = configJson["config"] jobConfig = configJson["job_config"] jobType = jobConfig["job_type"] jobName = jobConfig["job_name"] jobURL = jobConfig["job_url"] messageURL = jobConfig["message_url"] try: errorURL = jobConfig["error_reporting_url"] except: errorURL = None if "app_id" in jobConfig: appid = jobConfig["app_id"] else: appid = None debugMode = True LOGGER = {} configJsonObj = configparser.ParserConfig(config) configJsonObj.set_json_params() configJsonObj = configparser.ParserConfig(config) configJsonObj.set_json_params() dataframe_context = ContextSetter(configJsonObj) dataframe_context.set_job_type( jobType ) #jobType should be set before set_params call of dataframe_context dataframe_context.set_params() dataframe_context.set_message_url(messageURL) dataframe_context.set_app_id(appid) dataframe_context.set_debug_mode(debugMode) dataframe_context.set_job_url(jobURL) dataframe_context.set_app_name(APP_NAME) dataframe_context.set_error_url(errorURL) dataframe_context.set_logger(LOGGER) dataframe_context.set_xml_url(jobConfig["xml_url"]) dataframe_context.set_job_name(jobName) dataframe_context.set_environment("debugMode") dataframe_context.set_message_ignore(True) dataframe_context.set_analysis_name("Descriptive analysis") df = MasterHelper.load_dataset(spark, dataframe_context) metaParserInstance = MasterHelper.get_metadata(df, spark, dataframe_context, None) df, df_helper = MasterHelper.set_dataframe_helper( df, dataframe_context, metaParserInstance) targetVal = dataframe_context.get_result_column() self.result_setter = ResultSetter(dataframe_context) self.story_narrative = NarrativesTree() 
self.story_narrative.set_name( "{} Performance Report".format(targetVal)) self.data_frame = df self.df_helper = df_helper self.df_context = dataframe_context self.meta_parser = metaParserInstance self.measure_columns = df_helper.get_numeric_columns() self.base_dir = "/chisquare/" self.significant_variables = [ 'Buyer_Gender', 'Sales', 'Discount_Range', 'Shipping_Cost', 'Last_Transaction', 'Marketing_Cost' ] self.measure_columns = [ 'Tenure_in_Days', 'Sales', 'Marketing_Cost', 'Shipping_Cost', 'Last_Transaction' ] self.df_chisquare_obj = ChiSquare( self.data_frame, self.df_helper, self.df_context, self.meta_parser).test_all( dimension_columns=(self.df_context.get_result_column(), )) self.df_chisquare_result = self.df_chisquare_obj.get_result() self.num_analysed_variables = 11 def test_chisquare_dimension(self): test_dimension = ChiSquare(self.data_frame, self.df_helper, self.df_context, self.meta_parser).test_dimension( 'Price_Range', 'Source') self.assertAlmostEqual(test_dimension.get_pvalue(), exp_values['pval']['Price_Range-Source'], places=5) self.assertAlmostEqual(test_dimension.get_effect_size(), exp_values['effect_size']['Price_Range-Source'], places=5) self.assertAlmostEqual(test_dimension.get_stat(), exp_values['stats']['Price_Range-Source'], places=5) self.assertAlmostEqual(test_dimension.get_v_value(), exp_values['v_value']['Price_Range-Source'], places=5) def test_chisquare_measure(self): test_measures = ChiSquare(self.data_frame, self.df_helper, self.df_context, self.meta_parser).test_measures( 'Price_Range', 'Marketing_Cost') self.assertAlmostEqual( test_measures.get_pvalue(), exp_values['pval']['Price_Range-Marketing_Cost'], places=5) self.assertAlmostEqual( test_measures.get_effect_size(), exp_values['effect_size']['Price_Range-Marketing_Cost'], places=5) self.assertAlmostEqual( test_measures.get_stat(), exp_values['stats']['Price_Range-Marketing_Cost'], places=5) self.assertAlmostEqual( test_measures.get_v_value(), 
exp_values['v_value']['Price_Range-Marketing_Cost'], places=5) def test_chisquare_all(self): #PVal-Test self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Deal_Type').get_pvalue(), exp_values['pval']['Price_Range-Deal_Type']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Discount_Range').get_pvalue(), exp_values['pval']['Price_Range-Discount_Range']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result('Price_Range', 'Source').get_pvalue(), exp_values['pval']['Price_Range-Source']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Platform').get_pvalue(), exp_values['pval']['Price_Range-Platform']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Buyer_Age').get_pvalue(), exp_values['pval']['Price_Range-Buyer_Age']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Buyer_Gender').get_pvalue(), exp_values['pval']['Price_Range-Buyer-Gender']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Tenure_in_Days').get_pvalue(), exp_values['pval']['Price_Range-Tenure_in_Days']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result('Price_Range', 'Sales').get_pvalue(), exp_values['pval']['Price_Range-Sales']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Marketing_Cost').get_pvalue(), exp_values['pval']['Price_Range-Marketing_Cost']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Shipping_Cost').get_pvalue(), exp_values['pval']['Price_Range-Shipping_Cost']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Last_Transaction').get_pvalue(), exp_values['pval']['Price_Range-Last_Transaction']) #EffectSize_Test self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Deal_Type').get_effect_size(), 
exp_values['effect_size']['Price_Range-Deal_Type']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Discount_Range').get_effect_size(), exp_values['effect_size']['Price_Range-Discount_Range']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Source').get_effect_size(), exp_values['effect_size']['Price_Range-Source']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Platform').get_effect_size(), exp_values['effect_size']['Price_Range-Platform']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Buyer_Age').get_effect_size(), exp_values['effect_size']['Price_Range-Buyer_Age']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Buyer_Gender').get_effect_size(), exp_values['effect_size']['Price_Range-Buyer-Gender']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Tenure_in_Days').get_effect_size(), exp_values['effect_size']['Price_Range-Tenure_in_Days']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Sales').get_effect_size(), exp_values['effect_size']['Price_Range-Sales']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Marketing_Cost').get_effect_size(), exp_values['effect_size']['Price_Range-Marketing_Cost']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Shipping_Cost').get_effect_size(), exp_values['effect_size']['Price_Range-Shipping_Cost']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Last_Transaction').get_effect_size(), exp_values['effect_size']['Price_Range-Last_Transaction']) #Stats_Test self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result('Price_Range', 'Deal_Type').get_stat(), exp_values['stats']['Price_Range-Deal_Type']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 
'Price_Range', 'Discount_Range').get_stat(), exp_values['stats']['Price_Range-Discount_Range']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result('Price_Range', 'Source').get_stat(), exp_values['stats']['Price_Range-Source']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result('Price_Range', 'Platform').get_stat(), exp_values['stats']['Price_Range-Platform']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result('Price_Range', 'Buyer_Age').get_stat(), exp_values['stats']['Price_Range-Buyer_Age']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Buyer_Gender').get_stat(), exp_values['stats']['Price_Range-Buyer-Gender']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Tenure_in_Days').get_stat(), exp_values['stats']['Price_Range-Tenure_in_Days']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result('Price_Range', 'Sales').get_stat(), exp_values['stats']['Price_Range-Sales']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Marketing_Cost').get_stat(), exp_values['stats']['Price_Range-Marketing_Cost']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Shipping_Cost').get_stat(), exp_values['stats']['Price_Range-Shipping_Cost']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Last_Transaction').get_stat(), exp_values['stats']['Price_Range-Last_Transaction']) # #VVal-Test self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Deal_Type').get_v_value(), exp_values['v_value']['Price_Range-Deal_Type']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Discount_Range').get_v_value(), exp_values['v_value']['Price_Range-Discount_Range']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result('Price_Range', 'Source').get_v_value(), 
exp_values['v_value']['Price_Range-Source']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Platform').get_v_value(), exp_values['v_value']['Price_Range-Platform']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Buyer_Age').get_v_value(), exp_values['v_value']['Price_Range-Buyer_Age']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Buyer_Gender').get_v_value(), exp_values['v_value']['Price_Range-Buyer-Gender']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Tenure_in_Days').get_v_value(), exp_values['v_value']['Price_Range-Tenure_in_Days']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result('Price_Range', 'Sales').get_v_value(), exp_values['v_value']['Price_Range-Sales']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Marketing_Cost').get_v_value(), exp_values['v_value']['Price_Range-Marketing_Cost']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Shipping_Cost').get_v_value(), exp_values['v_value']['Price_Range-Shipping_Cost']) self.assertAlmostEqual( self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Last_Transaction').get_v_value(), exp_values['v_value']['Price_Range-Last_Transaction']) def test_chisquare_analysis(self): target_chisquare_result = self.df_chisquare_result['Price_Range'] chisquare_result = self.df_chisquare_obj.get_chisquare_result( 'Price_Range', 'Buyer_Gender') out = ChiSquareAnalysis( self.df_context, self.df_helper, chisquare_result, 'Price_Range', 'Buyer_Gender', self.significant_variables, self.num_analysed_variables, self.data_frame, self.measure_columns, self.base_dir, None, target_chisquare_result)._generate_narratives() self.assertEqual(out['data_dict'], exp_data_dict) self.assertEqual(out['target_dict']['11 to 50'], out['target_dict']['11 to 50']) self.assertEqual(out['target_dict']['101 to 
500'], out['target_dict']['101 to 500']) self.assertEqual(out['target_dict']['0 to 10'], out['target_dict']['0 to 10'])
class BusinessCard(object):
    """Builds the "Impact" story node: summary metrics comparing mAdvisor's
    actual run time and query volume against an estimate of the equivalent
    manual-analyst effort.

    Call order matters: ``Run()`` drives everything; ``set_params()`` must
    run before the getters that read ``self.number_analysis_dict`` /
    ``self.analysis_list``.
    """

    def __init__(self, story_result, meta_parser, result_setter,
                 dataframe_context, dataframe_helper, start_time,
                 analysis_type):
        self._story_result = story_result
        self._meta_parser = meta_parser
        self._result_setter = result_setter
        self._dataframe_context = dataframe_context
        self._dataframe_helper = dataframe_helper
        self.subheader = "Impact"
        self.business_card1 = NormalCard()
        self.business_card1.set_card_name("Overview")
        self.businessCardData = []
        self.start_time = start_time          # epoch seconds when the job began
        self.analysis_type = analysis_type    # 'dimension' or 'measure'

    def set_params(self):
        """Compute every metric shown on the card.

        Side effects: populates analysis_list, number_analysis_dict (via
        get_number_analysis) and all the scalar metrics read later by
        get_summary_data / get_summary_para.
        """
        self.target_levels = self._dataframe_helper.get_num_unique_values(
            self._dataframe_context.get_result_column())
        self.number_variables = self.get_number_variables()
        self.number_measures = self.get_number_measures()
        self.number_dimensions = self.get_number_dimensions()
        if self.analysis_type == 'dimension':
            self.analysis_list = [
                "overview_rules", "association_summary", "association_rules",
                "prediction_rules"
            ]
        elif self.analysis_type == 'measure':
            self.analysis_list = [
                "overview_rules", "performance_summary", "performance_rules",
                "influencers_summary", "influencers_rules", "prediction_rules"
            ]
        self.data_points = self.get_number_data_points()
        self.number_charts = self.get_number_charts()
        self.number_prediction_rules = self.get_number_prediction_rules()
        self.number_pages = self.get_number_pages()
        self.number_analysis = self.get_number_analysis()
        self.number_queries = self.get_number_queries()
        self.time_mAdvisor = time.time() - self.start_time
        self.time_analyst = self.get_time_analyst()
        self.time_saved = self.get_time_saved()
        self.impact_on_productivity = self.get_impact_on_productivity()

    def get_number_charts(self):
        """Count rendered charts by occurrences of the 'c3Chart' marker in
        the JSON-serialized story result."""
        return json.dumps(self._story_result, indent=2).count("c3Chart")

    def _significant_variable_levels(self, node_name, unique_counter):
        """Level count for one significant variable: distinct values for
        string columns, a flat 5 otherwise (original heuristic).

        unique_counter is the get_num_unique_values callable to use (the
        meta parser for dimension stories, the dataframe helper for measure
        stories — an inconsistency preserved from the original).
        """
        if node_name in self._dataframe_helper.get_string_columns():
            return unique_counter(node_name)
        return 5

    def get_number_analysis(self):
        """Estimate the number of analyses performed, weighted per type.

        Also populates self.number_analysis_dict, which the query/time
        getters consume.
        """
        if self.analysis_type == 'dimension':
            significant_variables_levels = {"None": 0}
            for each in self._story_result['listOfNodes']:
                try:
                    if each['name'] == 'Key Drivers':
                        for node in each['listOfNodes']:
                            significant_variables_levels[node['name']] = \
                                self._significant_variable_levels(
                                    node['name'],
                                    self._meta_parser.get_num_unique_values)
                except Exception:
                    # Depth-limited stories nest nodes under 'maxdepthN'
                    # keys, so each['name'] may not exist. NOTE(review):
                    # preserved from the original — this body re-runs once
                    # per non-maxdepth key; dict assignment makes the
                    # repetition idempotent.
                    for key in each.keys():
                        if not key.startswith('maxdepth'):
                            if each['name'] == 'Key Drivers':
                                for node in each['listOfNodes']:
                                    significant_variables_levels[node['name']] = \
                                        self._significant_variable_levels(
                                            node['name'],
                                            self._meta_parser.get_num_unique_values)
            self.number_analysis_dict = {
                "overview_rules": self.target_levels * 2,
                "association_summary":
                    (self.number_dimensions + self.number_measures) * 2,
                "association_rules":
                    sum(significant_variables_levels.values()) * 6,
                "prediction_rules": self.number_prediction_rules * 5,
            }
            return sum(self.number_analysis_dict.values())
        elif self.analysis_type == 'measure':
            significant_variables_levels = {"None": 0}
            for each in self._story_result['listOfNodes']:
                if each['name'] == 'Performance':
                    for node in each['listOfNodes']:
                        significant_variables_levels[node['name']] = \
                            self._significant_variable_levels(
                                node['name'],
                                self._dataframe_helper.get_num_unique_values)
            self.number_analysis_dict = {
                "overview_rules": self.target_levels * 2,
                "performance_summary":
                    (self.number_dimensions + self.number_measures) * 2,
                "performance_rules":
                    sum(significant_variables_levels.values()) * 6,
                "prediction_rules": self.number_prediction_rules * 5,
                "influencers_summary": self.number_measures * 2,
                "influencers_rules": 8,
            }
            return sum(self.number_analysis_dict.values())

    def get_number_queries(self):
        """Estimated analyst queries: per-analysis counts x per-type weights.

        Requires number_analysis_dict (i.e. get_number_analysis ran first).
        """
        if self.analysis_type == 'dimension':
            queries_per_analysis_dict = {
                "overview_rules": 15,
                "association_summary": 120,
                "association_rules": 600,
                "prediction_rules": 200
            }
        elif self.analysis_type == 'measure':
            queries_per_analysis_dict = {
                "overview_rules": 15,
                "performance_summary": 120,
                "performance_rules": 600,
                "influencers_summary": 100,
                "influencers_rules": 80,
                "prediction_rules": 200
            }
        # The original accumulated into a local named 'sum', shadowing the
        # builtin; a generator expression avoids that.
        return sum(self.number_analysis_dict[analysis] *
                   queries_per_analysis_dict[analysis]
                   for analysis in self.analysis_list)

    def get_number_prediction_rules(self):
        """Count rows of the prediction-rules table(s) in the story result."""
        num_prediction_rules = 0
        for each_node in self._story_result['listOfNodes']:
            try:
                if each_node['name'] == 'Prediction':
                    for card in each_node['listOfCards'][0]['cardData']:
                        if card['dataType'] == 'table':
                            num_prediction_rules = len(
                                card['data']['tableData'])
            except Exception:
                # Depth-limited stories nest under 'maxdepth3'..'maxdepth5'.
                # NOTE(review): kept from the original — the depth loop runs
                # once per 'maxdepth*' key, which can count the same rules
                # several times; confirm whether that is intended.
                for key in each_node.keys():
                    if key.startswith('maxdepth'):
                        if each_node['maxdepth3']['name'] == 'Prediction' \
                                or each_node['maxdepth4']['name'] == 'Prediction' \
                                or each_node['maxdepth5']['name'] == 'Prediction':
                            for depth in range(3, 6):
                                cards = each_node[
                                    'maxdepth' + str(depth)]['listOfCards'][0][
                                        'cardData']
                                for card in cards:
                                    if card['dataType'] == 'table':
                                        num_prediction_rules += len(
                                            card['data']['tableData'])
        return num_prediction_rules

    def get_number_pages(self):
        """Total number of cards (pages) across all story nodes."""
        total = 0  # renamed from 'sum' to stop shadowing the builtin
        for each in self._story_result['listOfNodes']:
            try:
                if each['listOfNodes']:
                    for items in each['listOfNodes']:
                        total += len(items['listOfCards'])
                    total += len(each['listOfCards'])
                else:
                    total += len(each['listOfCards'])
            except Exception:
                # NOTE(review): kept from the original — the accumulation
                # below runs once per 'maxdepth*' key, which can count cards
                # repeatedly; confirm whether that is intended.
                for key in each.keys():
                    if key.startswith('maxdepth'):
                        if each['maxdepth3']['listOfNodes'] or each[
                                'maxdepth4']['listOfNodes'] or each[
                                    'maxdepth5']['listOfNodes']:
                            for depth in range(3, 6):
                                depth_key = 'maxdepth' + str(depth)
                                for items in each[depth_key]['listOfNodes']:
                                    total += len(
                                        items[depth_key]['listOfCards'])
                                total += len(each[depth_key]['listOfCards'])
                        else:
                            for depth in range(3, 6):
                                total += len(each['maxdepth' +
                                                  str(depth)]['listOfCards'])
        return total

    def get_number_data_points(self):
        """Rows x columns of the analysed dataset."""
        return self._meta_parser.get_num_rows(
        ) * self._meta_parser.get_num_columns()

    def get_number_variables(self):
        """Total column count of the dataset."""
        return self._meta_parser.get_num_columns()

    def get_number_dimensions(self):
        """Count (and cache) the string/dimension columns."""
        self.number_dimensions = len(
            self._dataframe_helper.get_string_columns())
        return self.number_dimensions

    def get_number_measures(self):
        """Count (and cache) the numeric/measure columns."""
        self.number_measures = len(
            self._dataframe_helper.get_numeric_columns())
        return self.number_measures

    def get_time_analyst(self):
        """Estimated analyst seconds: per-analysis counts x per-type durations.

        Requires number_analysis_dict (i.e. get_number_analysis ran first).
        """
        if self.analysis_type == 'dimension':
            time_per_analysis_dict = {
                "overview_rules": 10,
                "association_summary": 120,
                "association_rules": 180,
                "prediction_rules": 300
            }
        elif self.analysis_type == 'measure':
            time_per_analysis_dict = {
                "overview_rules": 10,
                "performance_summary": 120,
                "performance_rules": 180,
                "influencers_summary": 120,
                "influencers_rules": 180,
                "prediction_rules": 300
            }
        return sum(self.number_analysis_dict[analysis] *
                   time_per_analysis_dict[analysis]
                   for analysis in self.analysis_list)

    def get_time_saved(self):
        '''
        Total Time Saved
        ( Productitvity Gain = Time taken by data scientist - time taken by mAdvisor)
        '''
        return self.time_analyst - self.time_mAdvisor

    def get_impact_on_productivity(self):
        '''
        Impact on Productivity, e.g. "3.5X"
        ( Impact on Productivity = Time taken by data scientist / time taken by mAdvisor)
        '''
        productivity = str(
            round(old_div(self.time_analyst, self.time_mAdvisor), 1)) + "X"
        return productivity

    def get_summary_data(self):
        """Append the name/value summary box to the card's data list."""
        summaryData = [{
            "name": "Total Data Points",
            "value": str(self.data_points)
        }, {
            "name": "Number of Queries",
            "value": str(self.number_queries)
        }, {
            "name": "Number of Analysis",
            "value": str(self.number_analysis)
        }, {
            "name": "Total Pages",
            "value": str(self.number_pages)
        }, {
            "name": "Total Time Saved",
            "value": CommonUtils.humanize_time(self.time_saved)
        }, {
            "name": "Impact on Productivity",
            "value": str(self.impact_on_productivity)
        }]
        summaryDataClass = DataBox(data=summaryData)
        self.businessCardData.append(summaryDataClass)

    def get_summary_para(self):
        """Append the HTML narrative paragraph comparing analyst vs mAdvisor
        time to the card's data list."""
        para_normal = """<blockquote><p> <b>Great Job !!!</b> You have analysed the dataset that contains {} variables after executing about <b>{}</b> analytics queries and <b>{}</b> Statistical and ML analysis in parallel. Using mAdvisor, you have completed the analysis within <b>{}</b> which would have required around <b>{}</b>. </p></blockquote> """.format(self.number_variables, self.number_queries,
                                     self.number_analysis,
                                     CommonUtils.humanize_time(self.time_mAdvisor),
                                     CommonUtils.humanize_time(self.time_analyst))
        para_images = """<div class="col-md-6"> <div class="d_analyst_block"> <span class="d_analyst_img"></span> <h1 class="pull-left xs-mt-40 xs-ml-10"> <small>Data Analyst <span class="bImpact_time_icon xs-ml-10"></span></small> <br> <small>{}</small> </h1> </div> </div> <div class="col-md-6"> <div class="d_m_block"> <span class="d_m_img"></span> <h1 class="pull-left xs-mt-40 xs-ml-10"><span class="bImpact_time_icon"></span><br> <small>{}</small> </h1> </div> </div> <div class="clearfix xs-m-50"></div> """.format(CommonUtils.humanize_time(self.time_analyst),
                                      CommonUtils.humanize_time(self.time_mAdvisor))
        para_concatenated = """ <div class="row"> <div class="col-md-8 col-md-offset-2 xs-mt-20"> {}{} </div> </div> """.format(para_images, para_normal)
        paraDataClass = HtmlData(data=para_concatenated)
        self.businessCardData.append(paraDataClass)

    def Run(self):
        """Entry point: compute metrics, build the card, and attach the
        resulting Impact node via the result setter."""
        print("In Run of BusinessCard")
        self._businessImpactNode = NarrativesTree()
        self._businessImpactNode.set_name("Impact")
        self.set_params()
        summary = self.get_summary_data()
        summary_para = self.get_summary_para()
        self.business_card1.set_card_data(self.businessCardData)
        self._businessImpactNode.add_a_card(self.business_card1)
        self._result_setter.set_business_impact_node(self._businessImpactNode)