Code example #1
0
class DimensionColumnNarrative:
    """Builds the narrative story for a single dimension (categorical) column.

    Instantiating the class runs the whole pipeline: progress messages are
    emitted, the title / summary / distribution-analysis sections are
    generated, and the resulting narrative nodes are registered on the
    supplied ``story_narrative`` and ``result_setter`` objects.
    """

    # Number of fraction digits used when rounding values for display.
    MAX_FRACTION_DIGITS = 2

    def __init__(self,
                 column_name,
                 df_helper,
                 df_context,
                 freq_dimension_stats,
                 result_setter,
                 story_narrative,
                 scriptWeight=None,
                 analysisName=None):
        self._story_narrative = story_narrative
        self._result_setter = result_setter
        self._column_name = column_name.lower()
        self._colname = column_name
        self._capitalized_column_name = "%s%s" % (column_name[0].upper(),
                                                  column_name[1:])
        # The frequency stats arrive as a JSON string; the raw string is kept
        # because _generate_analysis() re-parses it independently.
        self._dimension_col_freq_dict = freq_dimension_stats.get_frequency_dict()
        self.header = None
        self.subheader = None
        self.count = {}
        self.summary = []
        self.analysis = []
        self.frequency_dict = json.loads(self._dimension_col_freq_dict)
        self.appid = df_context.get_app_id()
        # Template lookup directory; apps "1" and "2" ship their own templates.
        self._base_dir = "/dimensions/"
        if self.appid is not None:
            if self.appid == "1":
                self._base_dir += "appid1/"
            elif self.appid == "2":
                self._base_dir += "appid2/"
        self._dataframe_context = df_context
        self._dataframe_helper = df_helper
        self._storyOnScoredData = self._dataframe_context.get_story_on_scored_data()
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        self._dimensionSummaryNode = NarrativesTree()
        self._dimensionSummaryNode.set_name("Overview")
        self._headNode = NarrativesTree()
        self._headNode.set_name("Overview")

        self._completionStatus = self._dataframe_context.get_completion_status()
        if analysisName is None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName
        self._messageURL = self._dataframe_context.get_message_url()
        if scriptWeight is None:
            self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight()
        else:
            self._scriptWeightDict = scriptWeight
        # Stage definitions consumed by the CommonUtils progress reporting.
        self._scriptStages = {
            "initialization": {
                "summary": "Initialized the Frequency Narratives",
                "weight": 2
            },
            "summarygeneration": {
                "summary": "summary generation finished",
                "weight": 8
            },
            "completion": {
                "summary": "Frequency Stats Narratives done",
                "weight": 0
            },
        }

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "initialization",
            "info",
            weightKey="narratives")
        self._generate_narratives()
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "summarygeneration",
            "info",
            weightKey="narratives")

        self._story_narrative.add_a_node(self._dimensionSummaryNode)

        self._result_setter.set_head_node(self._headNode)
        self._result_setter.set_distribution_node(self._dimensionSummaryNode)
        # BUGFIX: the final message used to re-send "summarygeneration",
        # double counting its weight of 8, while the declared "completion"
        # stage (weight 0) was never reported.
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "completion",
            "info",
            weightKey="narratives")

    def _generate_narratives(self):
        """Generate title, (optionally) summary and the distribution analysis.

        For app ids "1" and "2" the summary is always generated; for any other
        app id (or none) it is skipped when the story runs on scored data.
        The original appid "1"/"2"/other branches were byte-identical except
        for the scored-data check, so they are folded into one condition.
        """
        self._generate_title()
        # `!= True` (rather than a plain truthiness test) is kept on purpose:
        # the flag may hold non-boolean values upstream.
        if self.appid in ("1", "2") or self._storyOnScoredData != True:
            self._generate_summary()
        self._generate_analysis()

    def _generate_title(self):
        """Set the report header shown at the top of the story."""
        self.header = '%s Performance Report' % (
            self._capitalized_column_name, )

    def _generate_summary(self):
        """Build the dataset-overview summary card and attach it to the story."""
        ignored_columns = self._dataframe_context.get_ignore_column_suggestions()
        if ignored_columns is None:
            ignored_columns = []

        # Values consumed by the dimension_report_summary.html template.
        data_dict = {
            "n_c": len(self._dataframe_helper.get_columns()),
            "n_m": len(self._dataframe_helper.get_numeric_columns()),
            "n_d": len(self._dataframe_helper.get_string_columns()),
            "n_td": len(self._dataframe_helper.get_timestamp_columns()),
            "c": self._column_name,
            "d": self._dataframe_helper.get_string_columns(),
            "m": self._dataframe_helper.get_numeric_columns(),
            "td": self._dataframe_helper.get_timestamp_columns(),
            "observations": self._dataframe_helper.get_num_rows(),
            "ignorecolumns": ignored_columns,
            "n_t": len(self._dataframe_helper.get_string_columns()) +
                   len(self._dataframe_helper.get_numeric_columns()) +
                   len(self._dataframe_helper.get_timestamp_columns()),
            "blockSplitter": self._blockSplitter
        }
        output = NarrativesUtils.get_template_output(
            self._base_dir, 'dimension_report_summary.html', data_dict)
        summary = NarrativesUtils.block_splitter(output, self._blockSplitter)
        dimensionSummaryCard = SummaryCard(name=self.header,
                                           slug=None,
                                           cardData=None)
        dimensionSummaryCard.set_no_of_measures(data_dict["n_m"])
        dimensionSummaryCard.set_no_of_dimensions(data_dict["n_d"])
        dimensionSummaryCard.set_no_of_time_dimensions(data_dict["n_td"])

        dimensionSummaryCard.set_summary_html(summary)
        dimensionSummaryCard.set_card_name("overall summary card")
        self._story_narrative.add_a_card(dimensionSummaryCard)
        self._headNode.add_a_card(dimensionSummaryCard)

    def _generate_analysis(self):
        """Build the distribution-analysis card (narratives + bar chart).

        Attaches the card to the summary node and the result setter, and
        returns the list of content blocks that make up the card.
        """
        lines = []
        freq_dict = json.loads(self._dimension_col_freq_dict)
        colname = self._colname
        freq_data = []
        # Debug output; converted from a Python 2 print statement so the
        # module also parses under Python 3.
        print("self._dataframe_helper.get_cols_to_bin()",
              self._dataframe_helper.get_cols_to_bin())
        if colname in self._dataframe_helper.get_cols_to_bin():
            # Binned columns: order bin labels alphanumerically so that e.g.
            # "10-20" sorts after "5-10". (sorted() replaces the old
            # values().sort(), which fails on Python 3 dict views.)
            convert = lambda text: int(text) if text.isdigit() else text
            alphanum_key = lambda key: [
                convert(c) for c in re.split('([0-9]+)', key)
            ]
            keys_to_sort = sorted(freq_dict[colname][colname].values(),
                                  key=alphanum_key)
            temp_dict = {}
            for k, v in freq_dict[colname][colname].items():
                temp_dict[v] = freq_dict[colname]["count"][k]
            for each in keys_to_sort:
                freq_data.append({"key": each, "Count": temp_dict[each]})
        else:
            # Unbinned columns: order categories by descending count.
            for k, v in freq_dict[colname][colname].items():
                freq_data.append({
                    "key": v,
                    "Count": freq_dict[colname]["count"][k]
                })
            freq_data = sorted(freq_data,
                               key=lambda x: x["Count"],
                               reverse=True)
        data_dict = {"colname": self._colname}
        data_dict["plural_colname"] = pattern.en.pluralize(
            data_dict["colname"])
        count = freq_dict[colname]['count']
        max_key = max(count, key=count.get)
        min_key = min(count, key=count.get)
        data_dict["blockSplitter"] = self._blockSplitter
        data_dict["max"] = {
            "key": freq_dict[colname][colname][max_key],
            "val": count[max_key]
        }
        data_dict["min"] = {
            "key": freq_dict[colname][colname][min_key],
            "val": count[min_key]
        }
        data_dict["keys"] = list(freq_dict[colname][colname].values())
        data_dict["avg"] = round(
            sum(count.values()) / float(len(count.values())), 2)
        data_dict["above_avg"] = [
            freq_dict[colname][colname][key] for key in count.keys()
            if count[key] > data_dict["avg"]
        ]
        data_dict["per_bigger_avg"] = round(
            data_dict["max"]["val"] / float(data_dict["avg"]), 4)
        data_dict["per_bigger_low"] = round(
            data_dict["max"]["val"] / float(data_dict["min"]["val"]), 4)
        uniq_val = list(set(count.values()))
        data_dict["n_uniq"] = len(uniq_val)
        if len(uniq_val) == 1:
            data_dict["count"] = uniq_val[0]
        if len(data_dict["keys"]) >= 3:
            # How many of the biggest categories cover at least 75% of all
            # observations. (A dead list-comprehension that was immediately
            # overwritten has been removed here.)
            percent_75 = sum(count.values()) * 0.75
            kv = sorted(count.items(),
                        key=operator.itemgetter(1),
                        reverse=True)
            kv_75 = []
            temp_sum = 0
            for k, v in kv:
                temp_sum = temp_sum + v
                kv_75.append((freq_dict[colname][colname][k], v))
                if temp_sum >= percent_75:
                    break
            data_dict["percent_contr"] = round(
                temp_sum * 100.0 / float(sum(count.values())), 2)
            data_dict["kv_75"] = len(kv_75)
            data_dict["kv_75_cat"] = [k for k, v in kv_75]

        largest_text = " %s is the largest with %s observations" % (
            data_dict["max"]["key"],
            NarrativesUtils.round_number(data_dict["max"]["val"]))
        smallest_text = " %s is the smallest with %s observations" % (
            data_dict["min"]["key"],
            NarrativesUtils.round_number(data_dict["min"]["val"]))
        largest_per = round(
            data_dict["max"]["val"] * 100.0 / float(sum(count.values())), 2)
        data_dict['largest_per'] = largest_per
        smallest_per = round(
            data_dict["min"]["val"] * 100.0 / float(sum(count.values())), 2)
        self.count = {
            "largest": [largest_text,
                        str(round(largest_per, 1)) + '%'],
            "smallest": [smallest_text,
                         str(round(smallest_per, 1)) + '%']
        }
        # Both branches of the original >=3 check set the same subheader.
        self.subheader = 'Distribution of ' + self._capitalized_column_name
        output1 = NarrativesUtils.get_template_output(
            self._base_dir, 'dimension_distribution1.html', data_dict)
        output1 = NarrativesUtils.block_splitter(output1, self._blockSplitter)
        output2 = NarrativesUtils.get_template_output(
            self._base_dir, 'dimension_distribution2.html', data_dict)
        output2 = NarrativesUtils.block_splitter(output2, self._blockSplitter)
        chart_data = NormalChartData(data=freq_data)
        chart_json = ChartJson()
        chart_json.set_data(chart_data.get_data())
        chart_json.set_chart_type("bar")
        chart_json.set_axes({"x": "key", "y": "Count"})
        chart_json.set_label_text({'x': ' ', 'y': 'No. of Observations'})
        chart_json.set_yaxis_number_format(".2s")
        lines += output1
        lines += [C3ChartData(data=chart_json)]
        lines += output2
        bubble_data = "<div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}%</span><br /><small>{}</small></h2></div><div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}%</span><br /><small>{}</small></h2></div>".format(
            largest_per, largest_text, smallest_per, smallest_text)
        lines.append(HtmlData(data=bubble_data))
        dimensionCard1 = NormalCard(name=self.subheader,
                                    slug=None,
                                    cardData=lines)
        self._dimensionSummaryNode.add_a_card(dimensionCard1)
        self._result_setter.set_score_freq_card(
            json.loads(
                CommonUtils.convert_python_object_to_json(dimensionCard1)))
        return lines

    def _generate_analysis2(self):
        """Alternative "snapshot" analysis used by app-specific flows.

        Returns the two rendered template outputs without attaching a card.
        """
        lines = []
        freq_dict = json.loads(self._dimension_col_freq_dict)
        colname = self._colname
        data_dict = {"colname": self._colname}
        data_dict["plural_colname"] = pattern.en.pluralize(
            data_dict["colname"])
        count = freq_dict[colname]['count']
        max_key = max(count, key=count.get)
        min_key = min(count, key=count.get)
        data_dict["max"] = {
            "key": freq_dict[colname][colname][max_key],
            "val": count[max_key]
        }
        data_dict["min"] = {
            "key": freq_dict[colname][colname][min_key],
            "val": count[min_key]
        }
        data_dict["keys"] = list(freq_dict[colname][colname].values())
        data_dict["avg"] = round(
            sum(count.values()) / float(len(count.values())), 2)
        data_dict["above_avg"] = [
            freq_dict[colname][colname][key] for key in count.keys()
            if count[key] > data_dict["avg"]
        ]
        data_dict["per_bigger_avg"] = round(
            data_dict["max"]["val"] / float(data_dict["avg"]), 2)
        data_dict["per_bigger_low"] = round(
            data_dict["max"]["val"] / float(data_dict["min"]["val"]), 2)
        uniq_val = list(set(count.values()))
        data_dict["n_uniq"] = len(uniq_val)
        if len(uniq_val) == 1:
            data_dict["count"] = uniq_val[0]
        if len(data_dict["keys"]) >= 2:
            # 75%-contribution figures; unlike _generate_analysis, the
            # smallest category is excluded from the scan (kv[:-1]).
            percent_75 = sum(count.values()) * 0.75
            kv = sorted(count.items(),
                        key=operator.itemgetter(1),
                        reverse=True)
            kv_75 = []
            temp_sum = 0
            for k, v in kv[:-1]:
                temp_sum = temp_sum + v
                kv_75.append((freq_dict[colname][colname][k], v))
                if temp_sum >= percent_75:
                    break
            data_dict["percent_contr"] = round(
                temp_sum * 100 / float(sum(count.values())), 2)
            data_dict["kv_75"] = len(kv_75)
            data_dict["kv_75_cat"] = [k for k, v in kv_75]

        largest_text = " %s is the largest with %s observations" % (
            data_dict["max"]["key"],
            str(NarrativesUtils.round_number(data_dict["max"]["val"])))
        smallest_text = " %s is the smallest with %s observations" % (
            data_dict["min"]["key"],
            str(NarrativesUtils.round_number(data_dict["min"]["val"])))
        largest_per = NarrativesUtils.round_number(
            data_dict["max"]["val"] / float(sum(count.values())), 2) * 100
        smallest_per = NarrativesUtils.round_number(
            data_dict["min"]["val"] / float(sum(count.values())), 2) * 100
        data_dict['largest_per'] = largest_per
        self.count = {
            "largest": [largest_text,
                        str(round(largest_per, 0)) + '%'],
            "smallest": [smallest_text,
                         str(round(smallest_per, 0)) + '%']
        }
        self.subheader = "Snapshot of " + data_dict["colname"]
        output1 = NarrativesUtils.get_template_output(
            self._base_dir, 'dimension_distribution1.html', data_dict)
        output2 = NarrativesUtils.get_template_output(
            self._base_dir, 'dimension_distribution2.html', data_dict)
        lines.append(output1)
        lines.append(output2)
        return lines
Code example #2
0
    def generate_narratives(self):
        """Build the regression narrative tree.

        Creates the "Key Influencers" main card (narrative + coefficient bar
        chart), then one child node per significant measure with its impact
        and sensitivity cards, and registers everything on
        ``self._regressionNode`` / ``self._story_narrative``.
        """
        regression_narrative_obj = LinearRegressionNarrative(
                                    self._df_regression_result,
                                    self._correlations,
                                    self._dataframe_helper,
                                    self._dataframe_context,
                                    self._metaParser,
                                    self._spark
                                    )
        main_card_data = regression_narrative_obj.generate_main_card_data()
        main_card_narrative = NarrativesUtils.get_template_output(self._base_dir,\
                                                        'regression_main_card.html',main_card_data)
        # Legacy dict-based narrative structure, kept alongside the card API.
        self.narratives['main_card'] = {}
        self.narratives["main_card"]['paragraphs'] = NarrativesUtils.paragraph_splitter(main_card_narrative)
        self.narratives["main_card"]['header'] = 'Key Measures that affect ' + self.result_column
        self.narratives["main_card"]['chart'] = {}
        self.narratives["main_card"]['chart']['heading'] = ''
        self.narratives["main_card"]['chart']['data'] = [[i for i,j in self._all_coeffs],
                                                         [j['coefficient'] for i,j in self._all_coeffs]]
        self.narratives["main_card"]['chart']['label'] = {'x':'Measure Name',
                                                            'y': 'Change in ' + self.result_column + ' per unit increase'}

        main_card = NormalCard()
        main_card_header = HtmlData(data = '<h3>Key Measures that affect ' + self.result_column+"</h3>")
        main_card_paragraphs = NarrativesUtils.block_splitter(main_card_narrative,self._blockSplitter)
        main_card_chart_data = [{"key":val[0],"value":val[1]} for val in zip([i for i,j in self._all_coeffs],[j['coefficient'] for i,j in self._all_coeffs])]
        main_card_chart = NormalChartData(data=main_card_chart_data)
        mainCardChartJson = ChartJson()
        mainCardChartJson.set_data(main_card_chart.get_data())
        mainCardChartJson.set_label_text({'x':'Influencing Factors','y': 'Change in ' + self.result_column + ' per unit increase'})
        mainCardChartJson.set_chart_type("bar")
        mainCardChartJson.set_axes({"x":"key","y":"value"})
        mainCardChartJson.set_yaxis_number_format(".2f")
        # Statistical-info sidebar for the main chart, sorted by effect size.
        chart_data = sorted(main_card_chart_data,key=lambda x:x["value"],reverse=True)
        statistical_info_array=[
            ("Test Type","Regression"),
            ("Effect Size","Coefficients"),
            ("Max Effect Size",chart_data[0]["key"]),
            ("Min Effect Size",chart_data[-1]["key"]),
            ]
        # BUGFIX: was misspelled `statistical_inferenc`, so the guard below
        # depended on every branch assigning the correctly spelled name.
        statistical_inference = ""
        if len(chart_data) == 1:
            statistical_inference = "{} is the only variable that have significant influence over {} (Target) having an \
             Effect size of {}".format(chart_data[0]["key"],self._dataframe_context.get_result_column(),round(chart_data[0]["value"],4))
        elif len(chart_data) == 2:
            statistical_inference = "There are two variables ({} and {}) that have significant influence over {} (Target) and the \
             Effect size ranges are {} and {} respectively".format(chart_data[0]["key"],chart_data[1]["key"],self._dataframe_context.get_result_column(),round(chart_data[0]["value"],4),round(chart_data[1]["value"],4))
        else:
            statistical_inference = "There are {} variables that have significant influence over {} (Target) and the \
             Effect size ranges from {} to {}".format(len(chart_data),self._dataframe_context.get_result_column(),round(chart_data[0]["value"],4),round(chart_data[-1]["value"],4))
        if statistical_inference != "":
            statistical_info_array.append(("Inference",statistical_inference))
        statistical_info_array = NarrativesUtils.statistical_info_array_formatter(statistical_info_array)
        main_card.set_card_data(data = [main_card_header]+main_card_paragraphs+[C3ChartData(data=mainCardChartJson,info=statistical_info_array)])
        main_card.set_card_name("Key Influencers")
        self._regressionNode.add_a_card(main_card)


        # One narrative node per significant measure.
        count = 0
        for measure_column in self.significant_measures:
            sigMeasureNode = NarrativesTree()
            sigMeasureNode.set_name(measure_column)
            measureCard1 = NormalCard()
            measureCard1.set_card_name("{}: Impact on {}".format(measure_column,self.result_column))
            measureCard1Data = []
            if self._run_dimension_level_regression:
                measureCard2 = NormalCard()
                measureCard2.set_card_name("Key Areas where it Matters")
                measureCard2Data = []

            measure_column_cards = {}
            card0 = {}
            card1data = regression_narrative_obj.generate_card1_data(measure_column)
            card1heading = "<h3>Impact of "+measure_column+" on "+self.result_column+"</h3>"
            measureCard1Header = HtmlData(data=card1heading)
            card1data.update({"blockSplitter":self._blockSplitter})
            card1narrative = NarrativesUtils.get_template_output(self._base_dir,\
                                                            'regression_card1.html',card1data)

            card1paragraphs = NarrativesUtils.block_splitter(card1narrative,self._blockSplitter)
            card0 = {"paragraphs":card1paragraphs}
            card0["charts"] = {}
            card0['charts']['chart2']={}
            card0['charts']['chart1']={}
            card0["heading"] = card1heading
            measure_column_cards['card0'] = card0

            measureCard1Header = HtmlData(data=card1heading)
            measureCard1Data += [measureCard1Header]
            measureCard1para = card1paragraphs
            measureCard1Data += measureCard1para

            if self._run_dimension_level_regression:
                print("running narratives for key area dict")
                self._dim_regression = self.run_regression_for_dimension_levels()
                card2table, card2data=regression_narrative_obj.generate_card2_data(measure_column,self._dim_regression)
                card2data.update({"blockSplitter":self._blockSplitter})
                card2narrative = NarrativesUtils.get_template_output(self._base_dir,\
                                                            'regression_card2.html',card2data)
                card2paragraphs = NarrativesUtils.block_splitter(card2narrative,self._blockSplitter)

                card1 = {'tables': card2table, 'paragraphs' : card2paragraphs,
                        'heading' : 'Key Areas where ' + measure_column + ' matters'}
                measure_column_cards['card1'] = card1

                measureCard2Data += card2paragraphs
                if "table1" in card2table:
                    table1data = regression_narrative_obj.convert_table_data(card2table["table1"])
                    card2Table1 = TableData()
                    card2Table1.set_table_data(table1data)
                    card2Table1.set_table_type("heatMap")
                    card2Table1.set_table_top_header(card2table["table1"]["heading"])
                    card2Table1Json = json.loads(CommonUtils.convert_python_object_to_json(card2Table1))
                    measureCard2Data.insert(3,card2Table1Json)

                if "table2" in card2table:
                    table2data = regression_narrative_obj.convert_table_data(card2table["table2"])
                    card2Table2 = TableData()
                    card2Table2.set_table_data(table2data)
                    card2Table2.set_table_type("heatMap")
                    card2Table2.set_table_top_header(card2table["table2"]["heading"])
                    card2Table2Json = json.loads(CommonUtils.convert_python_object_to_json(card2Table2))
                    measureCard2Data.append(card2Table2Json)

            progressMessage = CommonUtils.create_progress_message_object(self._analysisName,"custom","info","Analyzing Key Influencers",self._completionStatus,self._completionStatus,display=True)
            CommonUtils.save_progress_message(self._messageURL,progressMessage,ignore=False)
            card4data = regression_narrative_obj.generate_card4_data(self.result_column,measure_column)
            card4data.update({"blockSplitter":self._blockSplitter})
            card4narrative = NarrativesUtils.get_template_output(self._base_dir,\
                                                                'regression_card4.html',card4data)
            card4paragraphs = NarrativesUtils.block_splitter(card4narrative,self._blockSplitter)
            card0['paragraphs'] = card1paragraphs+card4paragraphs
            card4Chart = card4data["charts"]
            statistical_info_array=[
                ("Test Type","Regression"),
                ("Coefficient",str(round(self._df_regression_result.get_coeff(measure_column),2))),
                ("P-Value","<= 0.05"),
                ("Intercept",str(round(self._df_regression_result.get_intercept(),2))),
                ("R Square ",str(round(self._df_regression_result.get_rsquare(),2))),
                ]
            inferenceTuple = ()
            coeff = self._df_regression_result.get_coeff(measure_column)
            if coeff > 0:
                inferenceTuple = ("Inference","For every additional unit of increase in {} there will be an increase of {} units in {} (target).".format(measure_column,str(round(coeff,2)),self._dataframe_context.get_result_column()))
            else:
                inferenceTuple = ("Inference","For every additional unit of decrease in {} there will be an decrease of {} units in {} (target).".format(measure_column,str(round(coeff,2)),self._dataframe_context.get_result_column()))
            if len(inferenceTuple) > 0:
                statistical_info_array.append(inferenceTuple)
            statistical_info_array = NarrativesUtils.statistical_info_array_formatter(statistical_info_array)

            card4paragraphs.insert(2,C3ChartData(data=card4Chart,info=statistical_info_array))
            measureCard1Data += card4paragraphs

            self.narratives['cards'].append(measure_column_cards)

            # Only the first measure's card4 data feeds the executive summary.
            if count == 0:
                card4data.pop("charts")
                self._result_setter.update_executive_summary_data(card4data)
            count += 1
            measureCard1.set_card_data(measureCard1Data)
            # BUGFIX: measureCard1 was previously added a second time after
            # add_cards([measureCard1, measureCard2]), duplicating the card
            # whenever dimension-level regression ran.
            if self._run_dimension_level_regression:
                measureCard2.set_card_data(measureCard2Data)
                sigMeasureNode.add_cards([measureCard1,measureCard2])
            else:
                sigMeasureNode.add_cards([measureCard1])
            self._regressionNode.add_a_node(sigMeasureNode)
        self._story_narrative.add_a_node(self._regressionNode)
Code example #3
0
class MeasureColumnNarrative(object):
    """Narrative builder for a single numeric ("measure") column.

    Renders an "<Column> Performance Analysis" story: an overview summary
    card, a histogram chart, a five-point-summary table and templated prose
    (HTML templates under ``/descriptive/``).

    Constructing an instance has side effects: it immediately runs the whole
    narrative generation, attaches the resulting cards/nodes to
    ``story_narrative`` and ``result_setter``, and posts start/end progress
    messages through ``CommonUtils``.
    """

    # Decimal places used when rounding percentages/figures for display.
    MAX_FRACTION_DIGITS = 2

    def __init__(self, data_frame,column_name, measure_descr_stats, df_helper, df_context, result_setter, story_narrative,scriptWeight=None, analysisName=None):
        """Wire up context/helpers, then generate and publish the narrative.

        :param data_frame: pandas or Spark dataframe holding the data.
        :param column_name: name of the measure column being analysed.
        :param measure_descr_stats: descriptive-stats object for the column
            (exposes histogram, min/max/mean/total and five-point stats).
        :param df_helper: dataframe helper (column lists, row counts).
        :param df_context: dataframe context (settings, progress messaging).
        :param result_setter: sink for result nodes / executive-summary data.
        :param story_narrative: story tree the generated nodes attach to.
        :param scriptWeight: optional progress-weight dict; defaults to the
            context's measure-analysis weights.
        :param analysisName: optional analysis name; defaults to the
            context's current analysis name.
        """
        self._story_narrative = story_narrative
        self._result_setter = result_setter
        self._column_name = column_name.lower()
        # Capitalize only the first character; the rest of the name is kept
        # verbatim (not title-cased).
        self._capitalized_column_name = "%s%s" % (column_name[0].upper(), column_name[1:])
        self._measure_descr_stats = measure_descr_stats
        self._five_point_summary_stats = measure_descr_stats.get_five_point_summary_stats()
        self._data_frame = data_frame
        # pandas frames expose .shape; Spark frames raise here and fall back
        # to .count().  NOTE(review): the bare except also swallows unrelated
        # errors.
        try:
            self._total_rows = self._data_frame.shape[0]
        except:
            self._total_rows = self._data_frame.count()
        # self._histogram = measure_descr_stats.get_histogram()
        # self._num_columns = context.get_column_count()
        # self._num_rows = context.get_row_count()
        # self._measures = context.get_measures()
        # self._dimensions = context.get_dimensions()
        # self._time_dimensions = context.get_time_dimension()
        self._dataframe_helper = df_helper
        self._dataframe_context = df_context
        # True when the pipeline runs on pandas instead of Spark; selects the
        # branch taken in _generate_summary().
        self._pandas_flag = self._dataframe_context._pandas_flag
        self._storyOnScoredData = self._dataframe_context.get_story_on_scored_data()
        # Presentation fields filled in by _generate_narratives().
        self.title = None
        self.heading = self._capitalized_column_name + ' Performance Analysis'
        self.sub_heading = "Distribution of " + self._capitalized_column_name
        self.summary = None
        self._analysis1 = None
        self._analysis2 = None
        self.analysis = None
        self.take_away = None
        # card2 stays '' unless the skew threshold in
        # _generate_analysis_para2() promotes it to a dict.
        self.card2 = ''
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        # Marker token used by block_splitter to highlight text spans.
        self._highlightFlag = "|~HIGHLIGHT~|"
        # Template directory (relative) for all get_template_output() calls.
        self._base_dir = "/descriptive/"
        self.num_measures = len(self._dataframe_helper.get_numeric_columns())
        self.num_dimensions = len(self._dataframe_helper.get_string_columns())
        self.num_time_dimensions = len(self._dataframe_helper.get_timestamp_columns())

        self._completionStatus = self._dataframe_context.get_completion_status()
        self._messageURL = self._dataframe_context.get_message_url()
        if analysisName == None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName
        if scriptWeight == None:
            self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight()
        else:
            self._scriptWeightDict = scriptWeight
        # Progress checkpoints reported around narrative generation.
        self._scriptStages = {
            "statNarrativeStart":{
                "summary":"Started The Descriptive Stats Narratives",
                "weight":0
                },
            "statNarrativeEnd":{
                "summary":"Narratives For Descriptive Stats Finished",
                "weight":10
                },
            }

        CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._analysisName,"statNarrativeStart","info",display=False,emptyBin=False,customMsg=None,weightKey="narratives")


        self._measureSummaryNode = NarrativesTree()
        self._headNode = NarrativesTree()
        self._headNode.set_name("Overview")
        # Generation runs eagerly in the constructor; results are then
        # published to the story tree and the result setter.
        self._generate_narratives()
        self._story_narrative.add_a_node(self._measureSummaryNode)
        self._result_setter.set_head_node(self._headNode)
        self._result_setter.set_distribution_node(self._measureSummaryNode)

        CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._analysisName,"statNarrativeEnd","info",display=False,emptyBin=False,customMsg=None,weightKey="narratives")


    def _get_c3_histogram(self):
        """Build a C3 bar-chart object from the column's histogram buckets.

        Each bucket becomes a bar labelled '< <end_value>' with the bucket's
        record count as its height.
        """
        data = self._measure_descr_stats.get_histogram()
        data_c3 = []
        # NOTE(review): loop variable shadows the builtin `bin`.
        for bin in data:
            data_c3.append({'bin_name':'< '+ humanize.intcomma(round(bin['end_value'],2)),
                            'Count':bin['num_records']})

        data_c3 = NormalChartData(data_c3)
        chartObj = ChartJson(data=data_c3.get_data(), axes={'x':'bin_name','y':'Count'},label_text={'x':'','y':'No. of Observations'},chart_type='bar')
        chartObj.set_yaxis_number_format(".2s")
        return chartObj
    def _generate_narratives(self):
        """Assemble the full overview card: paragraph 1, histogram chart,
        five-point-summary table, paragraph 2 and (optionally) card2 content,
        then attach everything to the measure summary node."""
        lines = []
        self._generate_title()
        # On scored-data stories the dataset summary card is skipped.
        if self._storyOnScoredData != True:
            self._generate_summary()
        self._analysis1 = self._generate_analysis_para1()
        self._analysis2 = self._generate_analysis_para2()
        lines += NarrativesUtils.block_splitter(self._analysis1,self._blockSplitter)
        lines += [C3ChartData(self._get_c3_histogram())]
        # Five-point summary table: min, Q1, median, Q3, max.
        self._tableData = [['Minimum','Quartile 1','Median','Quartile 3','Maximum'],
                            [NarrativesUtils.round_number(self._measure_descr_stats.get_min()),
                             NarrativesUtils.round_number(self._five_point_summary_stats.get_q1_split()),
                             NarrativesUtils.round_number(self._five_point_summary_stats.get_q2_split()),
                             NarrativesUtils.round_number(self._five_point_summary_stats.get_q3_split()),
                             NarrativesUtils.round_number(self._measure_descr_stats.get_max())]]
        lines += [TableData({'tableType':'normal','tableData':self._tableData})]
        lines += NarrativesUtils.block_splitter(self._analysis2,self._blockSplitter)
        # card2 is only populated when |skew| > 0.1 (see
        # _generate_analysis_para2).
        if self.card2 != '':
            lines += self.card2['data']['content']
        measureCard1 = NormalCard(name=self.sub_heading,slug=None,cardData = lines)
        self._measureSummaryNode.add_a_card(measureCard1)
        self._measureSummaryNode.set_name("Overview")
        self.analysis = [self._analysis1, self._analysis2]
        self.take_away = self._generate_take_away()

    def _generate_title(self):
        """Set the report title from the capitalized column name."""
        self.title = '%s Performance Report' % (self._capitalized_column_name,)

    def _generate_summary(self):
        """Build the dataset-level summary card (column/row counts, column
        type breakdown) and attach it to the story and the head node."""

        ignored_columns = self._dataframe_context.get_ignore_column_suggestions()
        if ignored_columns == None:
            ignored_columns = []

        metaHelperInstance = MetaDataHelper(self._data_frame, self._total_rows)
        sampleData = metaHelperInstance.get_sample_data()
        # Spark samples need conversion to pandas; pandas samples pass
        # through unchanged (bare except swallows the AttributeError).
        try:
            sampleData = sampleData.toPandas()
        except:
            pass
        # l1: string columns whose values parse as datetimes (treated as
        # time dimensions); l2: the remaining true dimensions.
        l1=[]
        l2=[]
        if self._pandas_flag:
            for column in self._dataframe_helper.get_string_columns():
                uniqueVals = sampleData[column].unique().tolist()
                # Probe the max value (sorted descending, first element) for
                # a datetime format.  NOTE(review): dateColumnFormat is
                # assigned but never used afterwards.
                if len(uniqueVals) > 0 and metaHelperInstance.get_datetime_format_pandas([self._data_frame[column].sort_values(ascending=False)[0]])!=None:
                    dateColumnFormat = metaHelperInstance.get_datetime_format_pandas(uniqueVals)
                    l1.append(column)
                else:
                    dateColumnFormat = None
                    l2.append(column)
            # l1 = self._dataframe_helper.get_timestamp_columns()
            # l2 = self._dataframe_helper.get_string_columns()
        else:
            # Spark path: same classification via orderBy/first instead of
            # pandas sort_values.
            for column in self._dataframe_helper.get_string_columns():
                uniqueVals = sampleData[column].unique().tolist()
                if len(uniqueVals) > 0 and metaHelperInstance.get_datetime_format([self._data_frame.orderBy([column],ascending=[False]).select(column).first()[0]])!=None:
                    dateColumnFormat = metaHelperInstance.get_datetime_format(uniqueVals)
                    l1.append(column)
                else:
                    dateColumnFormat = None
                    l2.append(column)

        # Template variables for descr_stats_summary.html.
        data_dict = {"n_c" : self._dataframe_helper.get_num_columns(),
                    "n_m" : len(self._dataframe_helper.get_numeric_columns()),
                    "n_d" : len(l2),
                    "n_td" : len(l1),
                    "c" : self._column_name,
                    "d" : l2,
                    "m" : self._dataframe_helper.get_numeric_columns(),
                    "td" : l1,
                    "observations" : self._dataframe_helper.get_num_rows(),
                    "ignorecolumns" : ignored_columns,
                    "n_t" : len(self._dataframe_helper.get_string_columns())+len(self._dataframe_helper.get_numeric_columns())+len(self._dataframe_helper.get_timestamp_columns())
                    # "n_t" : self._dataframe_helper.get_num_columns()+len(ignored_columns)
        }
        self.summary = NarrativesUtils.get_template_output(self._base_dir,\
                                        'descr_stats_summary.html',data_dict)
        MeasureSummaryCard = SummaryCard(name='Summary',slug=None,cardData = None)
        MeasureSummaryCard.set_no_of_measures(data_dict["n_m"])
        MeasureSummaryCard.set_no_of_dimensions(data_dict["n_d"])
        MeasureSummaryCard.set_no_of_time_dimensions(data_dict["n_td"])
        MeasureSummaryCard.set_summary_html(NarrativesUtils.block_splitter(self.summary,self._blockSplitter))
        self._story_narrative.add_a_card(MeasureSummaryCard)
        self._headNode.add_a_card(MeasureSummaryCard)

    def _generate_analysis_para1(self):
        """Render the first narrative paragraph (distribution overview:
        min/max, outliers, totals) from distribution_narratives.html."""
        # Placeholder; overwritten by the template render below.
        output = 'Para1 entered'
        data_dict = {"cols" : self._dataframe_helper.get_num_columns(),
                    "min" : int(round(self._measure_descr_stats.get_min(), 0)),
                    "max" : int(round(self._measure_descr_stats.get_max(), 0)),
                    "n" : self._five_point_summary_stats.get_num_outliers(),
                    "l" : self._five_point_summary_stats.get_left_outliers(),
                    "r" : self._five_point_summary_stats.get_right_outliers(),
                    "m" : self._dataframe_helper.get_numeric_columns(),
                    "total" : NarrativesUtils.round_number(self._measure_descr_stats.get_total(), 0),
                    "avg" : NarrativesUtils.round_number(self._measure_descr_stats.get_mean(), 2),
                    "o": self._five_point_summary_stats.get_num_outliers(),
                    "col_name": self._column_name,
                    'rows': self._dataframe_helper.get_num_rows()
        }
        output = NarrativesUtils.get_template_output(self._base_dir,\
                                        'distribution_narratives.html',data_dict)
        return output

    def _generate_analysis_para2(self):
        """Render the histogram narrative paragraph and, when the
        distribution is skewed (|skew| > 0.1), populate ``self.card2`` with a
        "Concentration of High & Low segments" card plus a cumulative
        observations-vs-total chart.

        Also finds the smallest run of consecutive histogram buckets that
        covers at least 75% of all rows (same search as
        _generate_take_away — NOTE(review): duplicated logic).
        """
        output = 'Para2 entered'
        histogram_buckets = self._measure_descr_stats.get_histogram()
        # NOTE(review): debug prints left in production code.
        print(histogram_buckets)
        print("$"*200)
        threshold = self._dataframe_helper.get_num_rows() * 0.75
        # Sliding-window search: for each window width (bin_size, ascending),
        # find the window with the largest record count >= threshold; stop at
        # the first width that yields one, so [start, end] is the narrowest
        # such bucket range and s its record count.
        s = 0
        start = 0
        end = len(histogram_buckets)
        flag = 0
        for bin_size in range(1,len(histogram_buckets)):
            s_t = 0
            for i in range(len(histogram_buckets)-bin_size+1):
                s_t = 0
                for j in range(i,i+bin_size):
                    s_t = s_t + histogram_buckets[j]['num_records']
                if(s_t >= threshold) and (s_t > s):
                    s = s_t
                    start = i
                    end = i + bin_size - 1
                    flag = 1
            if (flag == 1):
                break
        # Window width and covered rows expressed as percentages
        # (old_div keeps Python-2 floor-division semantics on ints).
        bin_size_75 = old_div((end - start + 1)*100,len(histogram_buckets))
        s = old_div(s*100,self._dataframe_helper.get_num_rows())
        print(histogram_buckets)
        print("="*120)
        start_value = histogram_buckets[start]['start_value']
        print(start,end)
        # If no window met the threshold, end still holds its initial
        # out-of-range value len(histogram_buckets); clamp it.
        if end >= len(histogram_buckets):
            end = len(histogram_buckets)-1
        print(start,end)
        end_value = histogram_buckets[end]['end_value']
        # Lowest/highest record counts among the first three (or two)
        # buckets.  NOTE(review): a single-bucket histogram would raise
        # IndexError on histogram_buckets[1] here.
        if len(histogram_buckets) > 2:
            lowest = min(histogram_buckets[0]['num_records'],histogram_buckets[1]['num_records'],histogram_buckets[2]['num_records'])
            highest = max(histogram_buckets[0]['num_records'],histogram_buckets[1]['num_records'],histogram_buckets[2]['num_records'])
        else:
            lowest = min(histogram_buckets[0]['num_records'],histogram_buckets[1]['num_records'])
            highest = max(histogram_buckets[0]['num_records'],histogram_buckets[1]['num_records'])

        quartile_sums = self._five_point_summary_stats.get_sums()
        quartile_means = self._five_point_summary_stats.get_means()
        print(quartile_means)
        quartile_frequencies = self._five_point_summary_stats.get_frequencies()
        total = self._measure_descr_stats.get_total()
        avg = self._measure_descr_stats.get_mean()
        counts = self._measure_descr_stats.get_num_values()

        # Template variables shared by histogram_narrative.html and
        # descriptive_card2.html.  NOTE(review): per_cont_hist*/lowest_cont/
        # highest_cont divide record counts by the column *total*, not by the
        # row count — confirm this is intentional.
        data_dict = {"histogram" : histogram_buckets,
                    "per_cont_hist1" : NarrativesUtils.round_number(old_div(histogram_buckets[0]['num_records']*100,self._measure_descr_stats.get_total()), MeasureColumnNarrative.MAX_FRACTION_DIGITS),
                    "per_cont_hist2" : NarrativesUtils.round_number(old_div(histogram_buckets[1]['num_records']*100,self._measure_descr_stats.get_total()), MeasureColumnNarrative.MAX_FRACTION_DIGITS),
                    "lowest_cont" : NarrativesUtils.round_number(old_div(lowest*100,self._measure_descr_stats.get_total()), MeasureColumnNarrative.MAX_FRACTION_DIGITS),
                    "highest_cont" : NarrativesUtils.round_number(old_div(highest*100,self._measure_descr_stats.get_total()), MeasureColumnNarrative.MAX_FRACTION_DIGITS),
                    "num_bins" : len(histogram_buckets),
                    "seventy_five" : bin_size_75,
                    "col_name" : self._column_name,
                    "skew" : self._measure_descr_stats.get_skew(),
                    "three_quarter_percent" : round(s,2),
                    "start_value" : start_value,
                    "end_value" : end_value,
                    "measure_colname":self._column_name,
                    "q4_cont" : NarrativesUtils.round_number(old_div(quartile_frequencies['q4']*100.0,counts), 2),
                    "q1_cont" : NarrativesUtils.round_number(old_div(quartile_frequencies['q1']*100.0,counts), 2),
                    "q4_frac" : NarrativesUtils.round_number(old_div(quartile_sums['q4']*100.0,total), 2),
                    "q1_frac" : NarrativesUtils.round_number(old_div(quartile_sums['q1']*100.0,total), 2),
                    "q4_sum" : NarrativesUtils.round_number(quartile_sums['q4'], 2),
                    "q4_mean" : NarrativesUtils.round_number(quartile_means['q4'], 2),
                    "q1_sum" : NarrativesUtils.round_number(quartile_sums['q1'], 2),
                    "q4_overall_mean" : round(old_div(quartile_means['q4']*1.0,avg), 2),
                    "total" : NarrativesUtils.round_number(total,2),
                    "avg" : NarrativesUtils.round_number(avg,2),
                    "highlightFlag":self._highlightFlag,
                    "blockSplitter":self._blockSplitter
        }
        # Q4-to-Q1 mean ratio; None when Q1 mean is zero (ZeroDivisionError)
        # — the bare except also masks other failures.
        try:
            data_dict["q4_q1_mean"] = round(old_div(quartile_means['q4']*1.0,quartile_means['q1']), 1)
        except:
            data_dict["q4_q1_mean"] = None

        self._result_setter.update_executive_summary_data({"skew":data_dict["skew"]})
        # Skewed distributions get the extra "concentration" card with a
        # cumulative quartile chart.
        if abs(self._measure_descr_stats.get_skew())>0.1:
            content = NarrativesUtils.get_template_output(self._base_dir,\
                                            'descriptive_card2.html',data_dict)
            blocks = NarrativesUtils.block_splitter(content,self._blockSplitter,highlightFlag=self._highlightFlag)
            self.card2 = {}
            self.card2['data'] = {
                                    'heading': 'Concentration of High & Low segments',
                                    'content': blocks
                                }
            quartiles = ['q1','q2','q3','q4']
            # Per-quartile share of observations and of the column total,
            # each prefixed with 0.0 so the cumulative curves start at zero.
            observations = [0.0] + [old_div(quartile_frequencies[i]*100.0,counts) for i in quartiles]
            totals = [0.0] + [old_div(quartile_sums[i]*100.0,total) for i in quartiles]
            chart = {'x-label': '% of Observations',
                    'y-label': '% of Total '+self._column_name+' (Cumulative)',
                    'x': list(NarrativesUtils.accumu(observations)),
                    'y': list(NarrativesUtils.accumu(totals))}
            self.card2['chart'] = chart
        output = NarrativesUtils.get_template_output(self._base_dir,\
                                        'histogram_narrative.html',data_dict)
        return output

    def _generate_take_away(self):
        """Render the take-away text (histogram_takeaway.html) when the
        histogram has more than 3 buckets; otherwise return the placeholder.

        Repeats the 75%-coverage window search from
        _generate_analysis_para2 (NOTE(review): duplicated logic — a shared
        helper would keep the two in sync).
        """
        output = 'Takeaway entered'
        histogram_buckets = self._measure_descr_stats.get_histogram()
        threshold = self._dataframe_helper.get_num_rows() * 0.75
        # Same narrowest-window-covering-75%-of-rows search as in
        # _generate_analysis_para2; see comments there.
        s = 0
        start = 0
        end = len(histogram_buckets)
        flag = 0
        for bin_size in range(1,len(histogram_buckets)):
            s_t = 0
            for i in range(len(histogram_buckets)-bin_size+1):
                s_t = 0
                for j in range(i,i+bin_size):
                    s_t = s_t + histogram_buckets[j]['num_records']
                if(s_t >= threshold) and (s_t > s):
                    s = s_t
                    start = i
                    end = i + bin_size - 1
                    flag = 1
            if (flag == 1):
                break
        bin_size_75 = old_div((end - start + 1)*100,len(histogram_buckets))
        s = old_div(s*100,self._dataframe_helper.get_num_rows())
        start_value = histogram_buckets[start]['start_value']
        # Clamp end (initialised out-of-range) when no window met the
        # threshold.
        if end >= len(histogram_buckets):
            end = len(histogram_buckets)-1
        end_value = histogram_buckets[end]['end_value']
        data_dict = {"num_bins" : len(histogram_buckets),
                    "seventy_five" : bin_size_75,
                    "col_name" : self._column_name,
                    "c_col_name" : self._capitalized_column_name,
                    "skew" : self._measure_descr_stats.get_skew(),
                    "start": start_value,
                    "end": end_value
                    }
        if (len(histogram_buckets)>3):
            output = NarrativesUtils.get_template_output(self._base_dir,\
                                            'histogram_takeaway.html',data_dict)
        return output
Code example #4
0
    def Train(self):
        st_global = time.time()

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "initialization",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")

        algosToRun = self._dataframe_context.get_algorithms_to_run()
        algoSetting = [
            x for x in algosToRun if x.get_algorithm_slug() == self._slug
        ][0]
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()

        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})

        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns) - set(allDateCols))
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        result_column = self._dataframe_context.get_result_column()
        categorical_columns = [
            x for x in categorical_columns if x != result_column
        ]

        appType = self._dataframe_context.get_app_type()

        model_path = self._dataframe_context.get_model_path()
        if model_path.startswith("file"):
            model_path = model_path[7:]
        validationDict = self._dataframe_context.get_validation_dict()
        print("model_path", model_path)
        pipeline_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/pipeline/"
        model_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/model"
        pmml_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/modelPmml"

        df = self._data_frame
        levels = df.select(result_column).distinct().count()

        appType = self._dataframe_context.get_app_type()

        model_filepath = model_path + "/" + self._slug + "/model"
        pmml_filepath = str(model_path) + "/" + str(
            self._slug) + "/traindeModel.pmml"

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "training",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")

        st = time.time()
        pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns,
                                                      categorical_columns,
                                                      result_column)

        trainingData, validationData = MLUtils.get_training_and_validation_data(
            df, result_column, 0.8)  # indexed

        labelIndexer = StringIndexer(inputCol=result_column, outputCol="label")
        # OriginalTargetconverter = IndexToString(inputCol="label", outputCol="originalTargetColumn")

        # Label Mapping and Inverse
        labelIdx = labelIndexer.fit(trainingData)
        labelMapping = {k: v for k, v in enumerate(labelIdx.labels)}
        inverseLabelMapping = {
            v: float(k)
            for k, v in enumerate(labelIdx.labels)
        }
        if self._dataframe_context.get_trainerMode() == "autoML":
            automl_enable = True
        else:
            automl_enable = False
        clf = NaiveBayes()
        if not algoSetting.is_hyperparameter_tuning_enabled():
            algoParams = algoSetting.get_params_dict()
        else:
            algoParams = algoSetting.get_params_dict_hyperparameter()
        print("=" * 100)
        print(algoParams)
        print("=" * 100)
        clfParams = [prm.name for prm in clf.params]
        algoParams = {
            getattr(clf, k): v if isinstance(v, list) else [v]
            for k, v in algoParams.items() if k in clfParams
        }
        #print("="*100)
        #print("ALGOPARAMS - ",algoParams)
        #print("="*100)

        paramGrid = ParamGridBuilder()
        # if not algoSetting.is_hyperparameter_tuning_enabled():
        #     for k,v in algoParams.items():
        #         if v == [None] * len(v):
        #             continue
        #         if k.name == 'thresholds':
        #             paramGrid = paramGrid.addGrid(k,v[0])
        #         else:
        #             paramGrid = paramGrid.addGrid(k,v)
        #     paramGrid = paramGrid.build()

        # if not algoSetting.is_hyperparameter_tuning_enabled():
        for k, v in algoParams.items():
            print(k, v)
            if v == [None] * len(v):
                continue
            paramGrid = paramGrid.addGrid(k, v)
        paramGrid = paramGrid.build()
        # else:
        #     for k,v in algoParams.items():
        #         print k.name, v
        #         if v[0] == [None] * len(v[0]):
        #             continue
        #         paramGrid = paramGrid.addGrid(k,v[0])
        #     paramGrid = paramGrid.build()

        #print("="*143)
        #print("PARAMGRID - ", paramGrid)
        #print("="*143)

        if len(paramGrid) > 1:
            hyperParamInitParam = algoSetting.get_hyperparameter_params()
            evaluationMetricDict = {
                "name": hyperParamInitParam["evaluationMetric"]
            }
            evaluationMetricDict[
                "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[
                    evaluationMetricDict["name"]]
        else:
            evaluationMetricDict = {
                "name": GLOBALSETTINGS.CLASSIFICATION_MODEL_EVALUATION_METRIC
            }
            evaluationMetricDict[
                "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[
                    evaluationMetricDict["name"]]

        self._result_setter.set_hyper_parameter_results(self._slug, None)

        if validationDict["name"] == "kFold":
            numFold = int(validationDict["value"])
            estimator = Pipeline(stages=[pipeline, labelIndexer, clf])
            if algoSetting.is_hyperparameter_tuning_enabled():
                modelFilepath = "/".join(model_filepath.split("/")[:-1])
                pySparkHyperParameterResultObj = PySparkGridSearchResult(
                    estimator, paramGrid, appType, modelFilepath, levels,
                    evaluationMetricDict, trainingData, validationData,
                    numFold, self._targetLevel, labelMapping,
                    inverseLabelMapping, df)
                resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models(
                )
                self._result_setter.set_hyper_parameter_results(
                    self._slug, resultArray)
                self._result_setter.set_metadata_parallel_coordinates(
                    self._slug, {
                        "ignoreList":
                        pySparkHyperParameterResultObj.get_ignore_list(),
                        "hideColumns":
                        pySparkHyperParameterResultObj.get_hide_columns(),
                        "metricColName":
                        pySparkHyperParameterResultObj.
                        get_comparison_metric_colname(),
                        "columnOrder":
                        pySparkHyperParameterResultObj.get_keep_columns()
                    })

                bestModel = pySparkHyperParameterResultObj.getBestModel()
                prediction = pySparkHyperParameterResultObj.getBestPrediction()

            else:
                if automl_enable:
                    paramGrid = (ParamGridBuilder().addGrid(
                        clf.smoothing, [1.0, 0.2]).build())
                crossval = CrossValidator(
                    estimator=estimator,
                    estimatorParamMaps=paramGrid,
                    evaluator=BinaryClassificationEvaluator()
                    if levels == 2 else MulticlassClassificationEvaluator(),
                    numFolds=3 if numFold is None else
                    numFold)  # use 3+ folds in practice
                cvnb = crossval.fit(trainingData)
                prediction = cvnb.transform(validationData)
                bestModel = cvnb.bestModel

        else:
            train_test_ratio = float(
                self._dataframe_context.get_train_test_split())
            estimator = Pipeline(stages=[pipeline, labelIndexer, clf])
            if algoSetting.is_hyperparameter_tuning_enabled():
                modelFilepath = "/".join(model_filepath.split("/")[:-1])
                pySparkHyperParameterResultObj = PySparkTrainTestResult(
                    estimator, paramGrid, appType, modelFilepath, levels,
                    evaluationMetricDict, trainingData, validationData,
                    train_test_ratio, self._targetLevel, labelMapping,
                    inverseLabelMapping, df)
                resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models(
                )
                self._result_setter.set_hyper_parameter_results(
                    self._slug, resultArray)
                self._result_setter.set_metadata_parallel_coordinates(
                    self._slug, {
                        "ignoreList":
                        pySparkHyperParameterResultObj.get_ignore_list(),
                        "hideColumns":
                        pySparkHyperParameterResultObj.get_hide_columns(),
                        "metricColName":
                        pySparkHyperParameterResultObj.
                        get_comparison_metric_colname(),
                        "columnOrder":
                        pySparkHyperParameterResultObj.get_keep_columns()
                    })

                bestModel = pySparkHyperParameterResultObj.getBestModel()
                prediction = pySparkHyperParameterResultObj.getBestPrediction()

            else:
                tvs = TrainValidationSplit(
                    estimator=estimator,
                    estimatorParamMaps=paramGrid,
                    evaluator=BinaryClassificationEvaluator()
                    if levels == 2 else MulticlassClassificationEvaluator(),
                    trainRatio=train_test_ratio)

                tvspnb = tvs.fit(trainingData)
                prediction = tvspnb.transform(validationData)
                bestModel = tvspnb.bestModel

        modelmanagement_ = {
            param[0].name: param[1]
            for param in bestModel.stages[2].extractParamMap().items()
        }

        MLUtils.save_pipeline_or_model(bestModel, model_filepath)
        predsAndLabels = prediction.select(['prediction',
                                            'label']).rdd.map(tuple)
        # label_classes = prediction.select("label").distinct().collect()
        # label_classes = prediction.agg((F.collect_set('label').alias('label'))).first().asDict()['label']
        #results = transformed.select(["prediction","label"])
        # if len(label_classes) > 2:
        #     metrics = MulticlassMetrics(predsAndLabels) # accuracy of the model
        # else:
        #     metrics = BinaryClassificationMetrics(predsAndLabels)
        posLabel = inverseLabelMapping[self._targetLevel]
        metrics = MulticlassMetrics(predsAndLabels)

        trainingTime = time.time() - st

        f1_score = metrics.fMeasure(inverseLabelMapping[self._targetLevel],
                                    1.0)
        precision = metrics.precision(inverseLabelMapping[self._targetLevel])
        recall = metrics.recall(inverseLabelMapping[self._targetLevel])
        accuracy = metrics.accuracy

        print(f1_score, precision, recall, accuracy)

        #gain chart implementation
        def cal_prob_eval(x):
            if len(x) == 1:
                if x == posLabel:
                    return (float(x[1]))
                else:
                    return (float(1 - x[1]))
            else:
                return (float(x[int(posLabel)]))

        column_name = 'probability'

        def y_prob_for_eval_udf():
            return udf(lambda x: cal_prob_eval(x))

        prediction = prediction.withColumn(
            "y_prob_for_eval",
            y_prob_for_eval_udf()(col(column_name)))

        try:
            pys_df = prediction.select(
                ['y_prob_for_eval', 'prediction', 'label'])
            gain_lift_ks_obj = GainLiftKS(pys_df, 'y_prob_for_eval',
                                          'prediction', 'label', posLabel,
                                          self._spark)
            gain_lift_KS_dataframe = gain_lift_ks_obj.Run().toPandas()
        except:
            try:
                temp_df = pys_df.toPandas()
                gain_lift_ks_obj = GainLiftKS(temp_df, 'y_prob_for_eval',
                                              'prediction', 'label', posLabel,
                                              self._spark)
                gain_lift_KS_dataframe = gain_lift_ks_obj.Rank_Ordering()
            except:
                print("gain chant failed")
                gain_lift_KS_dataframe = None

        #feature_importance = MLUtils.calculate_sparkml_feature_importance(df, bestModel.stages[-1], categorical_columns, numerical_columns)
        act_list = prediction.select('label').collect()
        actual = [int(row.label) for row in act_list]

        pred_list = prediction.select('prediction').collect()
        predicted = [int(row.prediction) for row in pred_list]
        prob_list = prediction.select('probability').collect()
        probability = [list(row.probability) for row in prob_list]
        # objs = {"trained_model":bestModel,"actual":prediction.select('label'),"predicted":prediction.select('prediction'),
        # "probability":prediction.select('probability'),"feature_importance":None,
        # "featureList":list(categorical_columns) + list(numerical_columns),"labelMapping":labelMapping}
        objs = {
            "trained_model": bestModel,
            "actual": actual,
            "predicted": predicted,
            "probability": probability,
            "feature_importance": None,
            "featureList": list(categorical_columns) + list(numerical_columns),
            "labelMapping": labelMapping
        }

        conf_mat_ar = metrics.confusionMatrix().toArray()
        print(conf_mat_ar)
        confusion_matrix = {}
        for i in range(len(conf_mat_ar)):
            confusion_matrix[labelMapping[i]] = {}
            for j, val in enumerate(conf_mat_ar[i]):
                confusion_matrix[labelMapping[i]][labelMapping[j]] = val
        print(confusion_matrix)  # accuracy of the model
        '''ROC CURVE IMPLEMENTATION'''
        y_prob = probability
        y_score = predicted
        y_test = actual
        logLoss = log_loss(y_test, y_prob)
        if levels <= 2:
            positive_label_probs = []
            for val in y_prob:
                positive_label_probs.append(val[int(posLabel)])
            roc_auc = roc_auc_score(y_test, y_score)

            roc_data_dict = {
                "y_score": y_score,
                "y_test": y_test,
                "positive_label_probs": positive_label_probs,
                "y_prob": y_prob,
                "positive_label": posLabel
            }
            roc_dataframe = pd.DataFrame({
                "y_score":
                y_score,
                "y_test":
                y_test,
                "positive_label_probs":
                positive_label_probs
            })
            #roc_dataframe.to_csv("binary_roc_data.csv")
            fpr, tpr, thresholds = roc_curve(y_test,
                                             positive_label_probs,
                                             pos_label=posLabel)
            roc_df = pd.DataFrame({
                "FPR": fpr,
                "TPR": tpr,
                "thresholds": thresholds
            })
            roc_df["tpr-fpr"] = roc_df["TPR"] - roc_df["FPR"]

            optimal_index = np.argmax(np.array(roc_df["tpr-fpr"]))
            fpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "FPR"]
            tpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "TPR"]

            rounded_roc_df = roc_df.round({'FPR': 2, 'TPR': 4})

            unique_fpr = rounded_roc_df["FPR"].unique()

            final_roc_df = rounded_roc_df.groupby("FPR",
                                                  as_index=False)[["TPR"
                                                                   ]].mean()
            endgame_roc_df = final_roc_df.round({'FPR': 2, 'TPR': 3})
        elif levels > 2:
            positive_label_probs = []
            for val in y_prob:
                positive_label_probs.append(val[int(posLabel)])

            y_test_roc_multi = []
            for val in y_test:
                if val != posLabel:
                    val = posLabel + 1
                    y_test_roc_multi.append(val)
                else:
                    y_test_roc_multi.append(val)

            y_score_roc_multi = []
            for val in y_score:
                if val != posLabel:
                    val = posLabel + 1
                    y_score_roc_multi.append(val)
                else:
                    y_score_roc_multi.append(val)

            roc_auc = roc_auc_score(y_test_roc_multi, y_score_roc_multi)

            fpr, tpr, thresholds = roc_curve(y_test_roc_multi,
                                             positive_label_probs,
                                             pos_label=posLabel)
            roc_df = pd.DataFrame({
                "FPR": fpr,
                "TPR": tpr,
                "thresholds": thresholds
            })
            roc_df["tpr-fpr"] = roc_df["TPR"] - roc_df["FPR"]

            optimal_index = np.argmax(np.array(roc_df["tpr-fpr"]))
            fpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "FPR"]
            tpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "TPR"]

            rounded_roc_df = roc_df.round({'FPR': 2, 'TPR': 4})
            unique_fpr = rounded_roc_df["FPR"].unique()
            final_roc_df = rounded_roc_df.groupby("FPR",
                                                  as_index=False)[["TPR"
                                                                   ]].mean()
            endgame_roc_df = final_roc_df.round({'FPR': 2, 'TPR': 3})
        # Calculating prediction_split
        val_cnts = prediction.groupBy('label').count()
        val_cnts = map(lambda row: row.asDict(), val_cnts.collect())
        prediction_split = {}
        total_nos = prediction.select('label').count()
        for item in val_cnts:
            print(labelMapping)
            classname = labelMapping[item['label']]
            prediction_split[classname] = round(
                item['count'] * 100 / float(total_nos), 2)

        if not algoSetting.is_hyperparameter_tuning_enabled():
            modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH -
                                     1) + "1"
            modelFilepathArr = model_filepath.split("/")[:-1]
            modelFilepathArr.append(modelName)
            bestModel.save("/".join(modelFilepathArr))
        runtime = round((time.time() - st_global), 2)

        try:
            print(pmml_filepath)
            pmmlBuilder = PMMLBuilder(self._spark, trainingData,
                                      bestModel).putOption(
                                          clf, 'compact', True)
            pmmlBuilder.buildFile(pmml_filepath)
            pmmlfile = open(pmml_filepath, "r")
            pmmlText = pmmlfile.read()
            pmmlfile.close()
            self._result_setter.update_pmml_object({self._slug: pmmlText})
        except Exception as e:
            print("PMML failed...", str(e))
            pass

        cat_cols = list(set(categorical_columns) - {result_column})
        self._model_summary = MLModelSummary()
        self._model_summary.set_algorithm_name("Naive Bayes")
        self._model_summary.set_algorithm_display_name("Naive Bayes")
        self._model_summary.set_slug(self._slug)
        self._model_summary.set_training_time(runtime)
        self._model_summary.set_confusion_matrix(confusion_matrix)
        # self._model_summary.set_feature_importance(objs["feature_importance"])
        self._model_summary.set_feature_list(objs["featureList"])
        self._model_summary.set_model_accuracy(accuracy)
        self._model_summary.set_training_time(round((time.time() - st), 2))
        self._model_summary.set_precision_recall_stats([precision, recall])
        self._model_summary.set_model_precision(precision)
        self._model_summary.set_model_recall(recall)
        self._model_summary.set_model_F1_score(f1_score)
        self._model_summary.set_model_log_loss(logLoss)
        self._model_summary.set_gain_lift_KS_data(gain_lift_KS_dataframe)
        self._model_summary.set_AUC_score(roc_auc)
        self._model_summary.set_target_variable(result_column)
        self._model_summary.set_prediction_split(prediction_split)
        self._model_summary.set_validation_method("KFold")
        self._model_summary.set_level_map_dict(objs["labelMapping"])
        # self._model_summary.set_model_features(list(set(x_train.columns)-set([result_column])))
        self._model_summary.set_model_features(objs["featureList"])
        self._model_summary.set_level_counts(
            self._metaParser.get_unique_level_dict(
                list(set(categorical_columns)) + [result_column]))
        #self._model_summary.set_num_trees(objs['trained_model'].getNumTrees)
        self._model_summary.set_num_rules(300)
        self._model_summary.set_target_level(self._targetLevel)

        if not algoSetting.is_hyperparameter_tuning_enabled():
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": accuracy,
                "evaluationMetricName": "accuracy",
                "slug": self._model_summary.get_slug(),
                "Model Id": modelName
            }
            modelSummaryJson = {
                "dropdown": modelDropDownObj,
                "levelcount": self._model_summary.get_level_counts(),
                "modelFeatureList": self._model_summary.get_feature_list(),
                "levelMapping": self._model_summary.get_level_map_dict(),
                "slug": self._model_summary.get_slug(),
                "name": self._model_summary.get_algorithm_name()
            }
        else:
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": accuracy,
                "evaluationMetricName": "accuracy",
                "slug": self._model_summary.get_slug(),
                "Model Id": resultArray[0]["Model Id"]
            }
            modelSummaryJson = {
                "dropdown": modelDropDownObj,
                "levelcount": self._model_summary.get_level_counts(),
                "modelFeatureList": self._model_summary.get_feature_list(),
                "levelMapping": self._model_summary.get_level_map_dict(),
                "slug": self._model_summary.get_slug(),
                "name": self._model_summary.get_algorithm_name()
            }
        self._model_management = MLModelSummary()
        print(modelmanagement_)
        self._model_management.set_job_type(
            self._dataframe_context.get_job_name())  #Project name
        self._model_management.set_training_status(
            data="completed")  # training status
        self._model_management.set_target_level(
            self._targetLevel)  # target column value
        self._model_management.set_training_time(runtime)  # run time
        self._model_management.set_model_accuracy(round(metrics.accuracy, 2))
        # self._model_management.set_model_accuracy(round(metrics.accuracy_score(objs["actual"], objs["predicted"]),2))#accuracy
        self._model_management.set_algorithm_name(
            "NaiveBayes")  #algorithm name
        self._model_management.set_validation_method(
            str(validationDict["displayName"]) + "(" +
            str(validationDict["value"]) + ")")  #validation method
        self._model_management.set_target_variable(
            result_column)  #target column name
        self._model_management.set_creation_date(data=str(
            datetime.now().strftime('%b %d ,%Y  %H:%M ')))  #creation date
        self._model_management.set_datasetName(self._datasetName)
        self._model_management.set_model_type(data='classification')
        self._model_management.set_var_smoothing(
            data=int(modelmanagement_['smoothing']))

        # self._model_management.set_no_of_independent_variables(df) #no of independent varables

        modelManagementSummaryJson = [
            ["Project Name",
             self._model_management.get_job_type()],
            ["Algorithm",
             self._model_management.get_algorithm_name()],
            ["Training Status",
             self._model_management.get_training_status()],
            ["Accuracy",
             self._model_management.get_model_accuracy()],
            ["RunTime", self._model_management.get_training_time()],
            #["Owner",None],
            ["Created On",
             self._model_management.get_creation_date()]
        ]

        modelManagementModelSettingsJson = [
            ["Training Dataset",
             self._model_management.get_datasetName()],
            ["Target Column",
             self._model_management.get_target_variable()],
            ["Target Column Value",
             self._model_management.get_target_level()],
            ["Algorithm",
             self._model_management.get_algorithm_name()],
            [
                "Model Validation",
                self._model_management.get_validation_method()
            ],
            ["Model Type",
             self._model_management.get_model_type()],
            ["Smoothing",
             self._model_management.get_var_smoothing()],

            #,["priors",self._model_management.get_priors()]
            #,["var_smoothing",self._model_management.get_var_smoothing()]
        ]

        nbOverviewCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_card_overview(
                self._model_management, modelManagementSummaryJson,
                modelManagementModelSettingsJson)
        ]
        nbPerformanceCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_cards(
                self._model_summary, endgame_roc_df)
        ]
        nbDeploymentCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_deploy_empty_card()
        ]
        nbCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for
            cardObj in MLUtils.create_model_summary_cards(self._model_summary)
        ]
        NB_Overview_Node = NarrativesTree()
        NB_Overview_Node.set_name("Overview")
        NB_Performance_Node = NarrativesTree()
        NB_Performance_Node.set_name("Performance")
        NB_Deployment_Node = NarrativesTree()
        NB_Deployment_Node.set_name("Deployment")
        for card in nbOverviewCards:
            NB_Overview_Node.add_a_card(card)
        for card in nbPerformanceCards:
            NB_Performance_Node.add_a_card(card)
        for card in nbDeploymentCards:
            NB_Deployment_Node.add_a_card(card)
        for card in nbCards:
            self._prediction_narrative.add_a_card(card)

        self._result_setter.set_model_summary({
            "naivebayes":
            json.loads(
                CommonUtils.convert_python_object_to_json(self._model_summary))
        })
        self._result_setter.set_naive_bayes_model_summary(modelSummaryJson)
        self._result_setter.set_nb_cards(nbCards)
        self._result_setter.set_nb_nodes(
            [NB_Overview_Node, NB_Performance_Node, NB_Deployment_Node])
        self._result_setter.set_nb_fail_card({
            "Algorithm_Name": "Naive Bayes",
            "success": "True"
        })

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "completion",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")

        print("\n\n")
Code example #5
class RegressionNarrative(object):
    def __init__(self, df_helper, df_context, result_setter, spark, df_regression_result, correlations,story_narrative,meta_parser):
        """Set up the regression narrative builder and run it immediately.

        Wires in the shared helpers/context, selects the measures whose
        regression coefficient is statistically significant (p-value <= 0.05),
        then calls :meth:`generate_narratives` and publishes the resulting
        "Influencers" node. Progress messages are posted before and after the
        narrative generation, so statement order here is load-bearing.

        :param df_helper: dataframe helper exposing column metadata/filtering.
        :param df_context: dataframe context (analysis name, message URL,
            script weights, completion status, ...).
        :param result_setter: sink that receives the generated node/cards.
        :param spark: active Spark session, passed through to helpers.
        :param df_regression_result: fitted regression result object
            (coefficients, p-values, intercept, r-square).
        :param correlations: correlation data forwarded to the narrative object.
        :param story_narrative: story tree the regression node is attached to.
        :param meta_parser: metadata parser for column-level lookups.
        """
        self._metaParser = meta_parser
        self._result_setter = result_setter
        self._story_narrative = story_narrative
        self._df_regression_result = df_regression_result
        self._correlations = correlations
        self._dataframe_helper = df_helper
        self._dataframe_context = df_context
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER

        # self._result_setter.set_trend_section_name("regression")
        self._measure_columns = self._dataframe_helper.get_numeric_columns()
        self._dimension_columns = self._dataframe_helper.get_string_columns()
        self._date_columns = self._dataframe_context.get_date_columns()
        self._uid_col = self._dataframe_context.get_uid_column()
        # Drop the unique-id column and any date columns from the dimension
        # list: neither is a meaningful grouping variable for these narratives.
        if self._metaParser.check_column_isin_ignored_suggestion(self._uid_col):
            self._dimension_columns = list(set(self._dimension_columns) - {self._uid_col})
        if len(self._date_columns) >0 :
            self._dimension_columns = list(set(self._dimension_columns)-set(self._date_columns))
        self._spark = spark
        self.measures = []
        self.result_column = self._dataframe_helper.resultcolumn

        # Rank all coefficients by absolute effect size (largest first) and
        # keep only the measures whose p-value clears the 0.05 threshold.
        self.all_coefficients = self._df_regression_result.get_all_coeff()
        all_coeff = [(x,self.all_coefficients[x]) for x in list(self.all_coefficients.keys())]
        all_coeff = sorted(all_coeff,key = lambda x:abs(x[1]["coefficient"]),reverse = True)
        self._all_coeffs = all_coeff
        self.significant_measures = [x[0] for x in all_coeff if x[1]['p_value']<=0.05]
        print(self.significant_measures)
        print("regression narratives started")
        # NOTE(review): the heading concatenates without a space, producing
        # e.g. "SalesPerformance Report" — presumably a missing " " is
        # intended; confirm with the UI before changing.
        self.narratives = {"heading": self.result_column + "Performance Report",
                           "main_card":{},
                           "cards":[]
                        }
        self._base_dir = "/regression/"
        # Dimension-level regression is disabled by default; when enabled,
        # generate_narratives() builds an extra card per significant measure.
        self._run_dimension_level_regression = False

        # self._dim_regression = self.run_regression_for_dimension_levels()
        self._regressionNode = NarrativesTree()

        self._completionStatus = self._dataframe_context.get_completion_status()
        self._analysisName = self._dataframe_context.get_analysis_name()
        self._messageURL = self._dataframe_context.get_message_url()
        self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight()
        self._scriptStages = {
            "regressionNarrativeStart":{
                "summary":"Started The Regression Narratives",
                "weight":1
                },
            "regressionNarrativeEnd":{
                "summary":"Narratives For Regression Finished",
                "weight":0
                },
            }
        # Post the "started" progress tick before doing the heavy work.
        self._completionStatus += old_div(self._scriptWeightDict[self._analysisName]["narratives"]*self._scriptStages["regressionNarrativeStart"]["weight"],10)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "regressionNarrativeStart",\
                                    "info",\
                                    self._scriptStages["regressionNarrativeStart"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,progressMessage)
        self._dataframe_context.update_completion_status(self._completionStatus)

        # Build all narrative cards now, then publish the node.
        self.generate_narratives()
        self._regressionNode.set_name("Influencers")
        self._result_setter.set_regression_node(self._regressionNode)

        # Post the "finished" progress tick.
        self._completionStatus += old_div(self._scriptWeightDict[self._analysisName]["narratives"]*self._scriptStages["regressionNarrativeEnd"]["weight"],10)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "regressionNarrativeEnd",\
                                    "info",\
                                    self._scriptStages["regressionNarrativeEnd"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,progressMessage)
        self._dataframe_context.update_completion_status(self._completionStatus)


    def generate_narratives(self):
        """Generate the regression narrative cards and attach them to the node.

        Builds the "Key Influencers" overview card (template narrative plus a
        bar chart of regression coefficients), then one node per measure in
        ``self.significant_measures`` containing an impact card. When
        ``self._run_dimension_level_regression`` is enabled, a second
        "Key Areas where it Matters" card is built per measure as well.

        Side effects: mutates ``self.narratives``, adds cards/nodes to
        ``self._regressionNode`` and ``self._story_narrative``, posts one
        progress message per significant measure, and pushes the first
        measure's card data into the executive summary.
        """
        regression_narrative_obj = LinearRegressionNarrative(
                                    self._df_regression_result,
                                    self._correlations,
                                    self._dataframe_helper,
                                    self._dataframe_context,
                                    self._metaParser,
                                    self._spark
                                    )
        main_card_data = regression_narrative_obj.generate_main_card_data()
        main_card_narrative = NarrativesUtils.get_template_output(self._base_dir,\
                                                        'regression_main_card.html',main_card_data)
        # Legacy dict payload, kept in sync with the card objects built below.
        self.narratives['main_card'] = {}
        self.narratives["main_card"]['paragraphs'] = NarrativesUtils.paragraph_splitter(main_card_narrative)
        self.narratives["main_card"]['header'] = 'Key Measures that affect ' + self.result_column
        self.narratives["main_card"]['chart'] = {}
        self.narratives["main_card"]['chart']['heading'] = ''
        self.narratives["main_card"]['chart']['data'] = [[i for i,j in self._all_coeffs],
                                                         [j['coefficient'] for i,j in self._all_coeffs]]
        self.narratives["main_card"]['chart']['label'] = {'x':'Measure Name',
                                                            'y': 'Change in ' + self.result_column + ' per unit increase'}

        # Overview card: header + narrative paragraphs + coefficient bar chart.
        main_card = NormalCard()
        main_card_header = HtmlData(data = '<h3>Key Measures that affect ' + self.result_column+"</h3>")
        main_card_paragraphs = NarrativesUtils.block_splitter(main_card_narrative,self._blockSplitter)
        main_card_chart_data = [{"key":val[0],"value":val[1]} for val in zip([i for i,j in self._all_coeffs],[j['coefficient'] for i,j in self._all_coeffs])]
        main_card_chart = NormalChartData(data=main_card_chart_data)
        mainCardChartJson = ChartJson()
        mainCardChartJson.set_data(main_card_chart.get_data())
        mainCardChartJson.set_label_text({'x':'Influencing Factors','y': 'Change in ' + self.result_column + ' per unit increase'})
        mainCardChartJson.set_chart_type("bar")
        mainCardChartJson.set_axes({"x":"key","y":"value"})
        mainCardChartJson.set_yaxis_number_format(".2f")
        # Statistical info panel next to the chart; max/min effect sizes come
        # from the coefficient-sorted chart data (descending by value).
        chart_data = sorted(main_card_chart_data,key=lambda x:x["value"],reverse=True)
        statistical_info_array=[
            ("Test Type","Regression"),
            ("Effect Size","Coefficients"),
            ("Max Effect Size",chart_data[0]["key"]),
            ("Min Effect Size",chart_data[-1]["key"]),
            ]
        # BUGFIX: was misspelled "statistical_inferenc", leaving a dead
        # variable; the if/elif/else below always assigns the real one.
        statistical_inference = ""
        if len(chart_data) == 1:
            statistical_inference = "{} is the only variable that have significant influence over {} (Target) having an \
             Effect size of {}".format(chart_data[0]["key"],self._dataframe_context.get_result_column(),round(chart_data[0]["value"],4))
        elif len(chart_data) == 2:
            statistical_inference = "There are two variables ({} and {}) that have significant influence over {} (Target) and the \
             Effect size ranges are {} and {} respectively".format(chart_data[0]["key"],chart_data[1]["key"],self._dataframe_context.get_result_column(),round(chart_data[0]["value"],4),round(chart_data[1]["value"],4))
        else:
            statistical_inference = "There are {} variables that have significant influence over {} (Target) and the \
             Effect size ranges from {} to {}".format(len(chart_data),self._dataframe_context.get_result_column(),round(chart_data[0]["value"],4),round(chart_data[-1]["value"],4))
        if statistical_inference != "":
            statistical_info_array.append(("Inference",statistical_inference))
        statistical_info_array = NarrativesUtils.statistical_info_array_formatter(statistical_info_array)
        main_card.set_card_data(data = [main_card_header]+main_card_paragraphs+[C3ChartData(data=mainCardChartJson,info=statistical_info_array)])
        main_card.set_card_name("Key Influencers")
        self._regressionNode.add_a_card(main_card)

        # One narrative node (with one or two cards) per significant measure.
        count = 0
        for measure_column in self.significant_measures:
            sigMeasureNode = NarrativesTree()
            sigMeasureNode.set_name(measure_column)
            measureCard1 = NormalCard()
            measureCard1.set_card_name("{}: Impact on {}".format(measure_column,self.result_column))
            measureCard1Data = []
            if self._run_dimension_level_regression:
                measureCard2 = NormalCard()
                measureCard2.set_card_name("Key Areas where it Matters")
                measureCard2Data = []

            measure_column_cards = {}
            # Card 1: narrative on this measure's impact on the target column.
            card1data = regression_narrative_obj.generate_card1_data(measure_column)
            card1heading = "<h3>Impact of "+measure_column+" on "+self.result_column+"</h3>"
            card1data.update({"blockSplitter":self._blockSplitter})
            card1narrative = NarrativesUtils.get_template_output(self._base_dir,\
                                                            'regression_card1.html',card1data)

            card1paragraphs = NarrativesUtils.block_splitter(card1narrative,self._blockSplitter)
            card0 = {"paragraphs":card1paragraphs}
            card0["charts"] = {}
            card0['charts']['chart2']={}
            # card0['charts']['chart2']['data']=card1data["chart_data"]
            # card0['charts']['chart2']['heading'] = ''
            # card0['charts']['chart2']['labels'] = {}
            card0['charts']['chart1']={}
            card0["heading"] = card1heading
            measure_column_cards['card0'] = card0

            measureCard1Header = HtmlData(data=card1heading)
            measureCard1Data += [measureCard1Header]
            measureCard1para = card1paragraphs
            measureCard1Data += measureCard1para

            if self._run_dimension_level_regression:
                # Card 2: per-dimension-level regression results (tables are
                # serialized to plain JSON before being placed on the card).
                print("running narratives for key area dict")
                self._dim_regression = self.run_regression_for_dimension_levels()
                card2table, card2data=regression_narrative_obj.generate_card2_data(measure_column,self._dim_regression)
                card2data.update({"blockSplitter":self._blockSplitter})
                card2narrative = NarrativesUtils.get_template_output(self._base_dir,\
                                                            'regression_card2.html',card2data)
                card2paragraphs = NarrativesUtils.block_splitter(card2narrative,self._blockSplitter)

                card1 = {'tables': card2table, 'paragraphs' : card2paragraphs,
                        'heading' : 'Key Areas where ' + measure_column + ' matters'}
                measure_column_cards['card1'] = card1

                measureCard2Data += card2paragraphs
                if "table1" in card2table:
                    table1data = regression_narrative_obj.convert_table_data(card2table["table1"])
                    card2Table1 = TableData()
                    card2Table1.set_table_data(table1data)
                    card2Table1.set_table_type("heatMap")
                    card2Table1.set_table_top_header(card2table["table1"]["heading"])
                    card2Table1Json = json.loads(CommonUtils.convert_python_object_to_json(card2Table1))
                    measureCard2Data.insert(3,card2Table1Json)

                if "table2" in card2table:
                    table2data = regression_narrative_obj.convert_table_data(card2table["table2"])
                    card2Table2 = TableData()
                    card2Table2.set_table_data(table2data)
                    card2Table2.set_table_type("heatMap")
                    card2Table2.set_table_top_header(card2table["table2"]["heading"])
                    card2Table2Json = json.loads(CommonUtils.convert_python_object_to_json(card2Table2))
                    measureCard2Data.append(card2Table2Json)

            # Card 4: sensitivity analysis narrative + chart appended to card 1.
            progressMessage = CommonUtils.create_progress_message_object(self._analysisName,"custom","info","Analyzing Key Influencers",self._completionStatus,self._completionStatus,display=True)
            CommonUtils.save_progress_message(self._messageURL,progressMessage,ignore=False)
            card4data = regression_narrative_obj.generate_card4_data(self.result_column,measure_column)
            card4data.update({"blockSplitter":self._blockSplitter})
            card4narrative = NarrativesUtils.get_template_output(self._base_dir,\
                                                                'regression_card4.html',card4data)
            card4paragraphs = NarrativesUtils.block_splitter(card4narrative,self._blockSplitter)
            card0['paragraphs'] = card1paragraphs+card4paragraphs
            card4Chart = card4data["charts"]
            statistical_info_array=[
                ("Test Type","Regression"),
                ("Coefficient",str(round(self._df_regression_result.get_coeff(measure_column),2))),
                ("P-Value","<= 0.05"),
                ("Intercept",str(round(self._df_regression_result.get_intercept(),2))),
                ("R Square ",str(round(self._df_regression_result.get_rsquare(),2))),
                ]
            inferenceTuple = ()
            coeff = self._df_regression_result.get_coeff(measure_column)
            if coeff > 0:
                inferenceTuple = ("Inference","For every additional unit of increase in {} there will be an increase of {} units in {} (target).".format(measure_column,str(round(coeff,2)),self._dataframe_context.get_result_column()))
            else:
                inferenceTuple = ("Inference","For every additional unit of decrease in {} there will be an decrease of {} units in {} (target).".format(measure_column,str(round(coeff,2)),self._dataframe_context.get_result_column()))
            if len(inferenceTuple) > 0:
                statistical_info_array.append(inferenceTuple)
            statistical_info_array = NarrativesUtils.statistical_info_array_formatter(statistical_info_array)

            card4paragraphs.insert(2,C3ChartData(data=card4Chart,info=statistical_info_array))
            measureCard1Data += card4paragraphs

            self.narratives['cards'].append(measure_column_cards)

            if count == 0:
                # Only the first significant measure feeds the executive
                # summary; the chart payload is not needed there.
                card4data.pop("charts")
                self._result_setter.update_executive_summary_data(card4data)
            count += 1
            measureCard1.set_card_data(measureCard1Data)
            # BUGFIX: measureCard1 was previously added unconditionally after
            # already being added together with measureCard2, duplicating it
            # whenever dimension-level regression was enabled.
            if self._run_dimension_level_regression:
                measureCard2.set_card_data(measureCard2Data)
                sigMeasureNode.add_cards([measureCard1,measureCard2])
            else:
                sigMeasureNode.add_cards([measureCard1])
            self._regressionNode.add_a_node(sigMeasureNode)
        # self._result_setter.set_trend_section_completion_status(True)
        self._story_narrative.add_a_node(self._regressionNode)


    def run_regression_for_dimension_levels(self):
        """Fit a linear regression per level of the top significant dimension columns.

        Picks up to 5 dimension columns -- ranked by significance score when
        ``get_significant_dimension`` returns one, otherwise the first 5
        configured dimension columns -- filters the dataframe to each level of
        each column, and fits a LinearRegression against the result (target)
        column. "Agent Name" is always excluded.

        Returns:
            dict: ``{column: {level: {"intercept", "rmse", "rsquare", "coeff"}}}``
                  with zeroed stats for levels where the fit returned ``None``.
        """
        print("Running regression for Dimension Levels")
        significant_dimensions = self._dataframe_helper.get_significant_dimension()
        print("significant_dimensions:", significant_dimensions)
        if significant_dimensions != {}:
            # Rank columns by significance score, descending, and keep the top 5.
            ranked = sorted(significant_dimensions.items(),
                            key=lambda kv: kv[1], reverse=True)
            cat_columns = [name for name, _score in ranked[:5]]
        else:
            cat_columns = self._dimension_columns[:5]
        cat_columns = [x for x in cat_columns if x != "Agent Name"]
        print("Running regression for below 5 dimensions")
        print(cat_columns)
        regression_result_dimension_cols = {}
        for col in cat_columns:
            print("For Column:", col)
            column_levels = list(self._metaParser.get_unique_level_dict(col).keys())
            print("No of levels in this column", len(column_levels))
            level_regression_result = {}
            for level in column_levels:
                print("Filtering data for level:", level)
                filtered_df = self._dataframe_helper.filter_dataframe(col, level)
                result = LinearRegression(
                    filtered_df, self._dataframe_helper, self._dataframe_context,
                    self._metaParser, self._spark
                ).fit(self._dataframe_context.get_result_column())
                if result is None:
                    # Fit failed for this level; record zeroed statistics.
                    level_regression_result[level] = {"intercept": 0.0,
                                                      "rmse": 0.0,
                                                      "rsquare": 0.0,
                                                      "coeff": 0.0}
                else:
                    level_regression_result[level] = {
                        "intercept": result.get_intercept(),
                        "rmse": result.get_root_mean_square_error(),
                        "rsquare": result.get_rsquare(),
                        "coeff": result.get_all_coeff(),
                    }
            regression_result_dimension_cols[col] = level_regression_result
        return regression_result_dimension_cols
# ---- Code example #6 ----
        data_dict_overall["price_trend"] = stockPriceTrendArrayFormatted

        data_dict_overall["avg_sentiment_score"] = data_dict_overall["avg_sentiment_score"]/number_stocks
        data_dict_overall["stock_value_change"] = data_dict_overall["stock_value_change"]/number_stocks
        data_dict_overall["stock_percent_change"] = data_dict_overall["stock_percent_change"]/number_stocks

        data_dict_overall["number_articles_by_concept"] = self.get_number_articles_per_concept(data_dict_overall["nArticlesAndSentimentsPerConcept"])

        key, value = max(data_dict_overall["max_value_change"].iteritems(), key = lambda p: p[1])
        data_dict_overall["max_value_change_overall"] = (self.get_capitalized_name(key),value)
        key, value = min(data_dict_overall["max_value_change"].iteritems(), key = lambda p: p[1])
        data_dict_overall["min_value_change_overall"] = (self.get_capitalized_name(key),value)

        key,value = max(data_dict_overall["max_sentiment_change"].iteritems(), key = lambda p: p[1])
        data_dict_overall["max_sentiment_change_overall"] = (self.get_capitalized_name(key),value)

        # print data_dict_overall
        finalResult = NarrativesTree()
        overviewNode = NarrativesTree()
        stockNode = NarrativesTree()
        overviewNode.set_name("Overview")
        stockNode.set_name("Single Stock Analysis")
        overviewCard = MLUtils.stock_sense_overview_card(data_dict_overall)
        overviewNode.add_a_card(overviewCard)
        finalResult.add_a_node(overviewNode)
        individualStockNodes = MLUtils.stock_sense_individual_stock_cards(stockDict)
        stockNode.add_nodes(individualStockNodes)
        finalResult.add_a_node(stockNode)

        return finalResult
# ---- Code example #7 ----
class ChiSquareNarratives:
    """Builds chi-square "Association" narratives for every target dimension.

    Measure (numeric) columns are first re-binned using the contingency-table
    bin labels so they can participate as dimensions; then a "Key Influencers"
    overview card of effect sizes (Cramer's V) is produced per target
    dimension, and per-dimension drill-down cards are delegated to
    ChiSquareAnalysis.
    """
    #@accepts(object, int, DFChiSquareResult ,ContextSetter)
    def __init__(self,
                 df_helper,
                 df_chisquare_result,
                 spark,
                 df_context,
                 data_frame,
                 story_narrative,
                 result_setter,
                 scriptWeight=None,
                 analysisName=None):
        """Wire up context/helpers, re-bin measure columns, and generate narratives.

        scriptWeight and analysisName fall back to values from the dataframe
        context when not supplied.
        """
        self._story_narrative = story_narrative
        self._result_setter = result_setter
        self._data_frame = data_frame
        self._dataframe_context = df_context
        self._dataframe_helper = df_helper
        self._storyOnScoredData = self._dataframe_context.get_story_on_scored_data()
        self._measure_columns = df_helper.get_numeric_columns()
        self._df_chisquare = df_chisquare_result
        self._df_chisquare_result = df_chisquare_result.get_result()
        self.narratives = {}
        self._appid = df_context.get_app_id()
        self._chiSquareNode = NarrativesTree()
        self._chiSquareNode.set_name("Association")
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        self._noOfSigDimsToShow = GLOBALSETTINGS.CHISQUARESIGNIFICANTDIMENSIONTOSHOW
        self._base_dir = "/chisquare/"
        self._spark = spark

        ############################DataFrame Measure to Dimesion Column#####################
        # Replace each measure column's numeric values with the
        # "<low> to <high>" bin label from its contingency table so the
        # column can be analysed like a dimension.
        pandas_df = self._data_frame.toPandas()
        # dict_keys is not subscriptable in Python 3 -- materialize a list
        # before indexing target_dimension[0] below.
        target_dimension = list(self._df_chisquare_result.keys())

        bin_data = {}
        for col in self._measure_columns:
            chisquare_result = self._df_chisquare.get_chisquare_result(
                target_dimension[0], col)
            bin_data[col] = chisquare_result.get_contingency_table(
            ).get_column_two_levels()

        for bin_col in bin_data:
            # Evaluate every bin mask against a numeric snapshot of the
            # column: once rows are replaced by string labels, further
            # numeric comparisons on the column would raise TypeError.
            numeric_values = pandas_df[bin_col].copy()
            for split in bin_data[bin_col]:
                # Bin labels look like "<low> to <high>", possibly with
                # thousands separators; rows in [low, high) get the label.
                val = split.split('to')
                lower = float(val[0].replace(',', ''))
                upper = float(val[1].replace(',', ''))
                mask = (numeric_values >= lower) & (numeric_values < upper)
                # .loc instead of chained indexing, which pandas does not
                # guarantee to write back to the original frame.
                pandas_df.loc[mask, bin_col] = split

        fields = [
            StructField(field_name, StringType(), True)
            for field_name in pandas_df.columns
        ]
        schema = StructType(fields)

        SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                            sparkSession=self._spark)
        self._data_frame = SQLctx.createDataFrame(pandas_df, schema)
        ############################DataFrame Measure to Dimesion Column#####################

        if self._appid is not None:
            if self._appid == "1":
                self._base_dir += "appid1/"
            elif self._appid == "2":
                self._base_dir += "appid2/"

        self._completionStatus = self._dataframe_context.get_completion_status()
        if analysisName is None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName

        self._messageURL = self._dataframe_context.get_message_url()
        if scriptWeight is None:
            self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight()
        else:
            self._scriptWeightDict = scriptWeight
        self._analysisDict = self._dataframe_context.get_analysis_dict()
        if self._analysisDict != {}:
            self._nColsToUse = self._analysisDict[
                self._analysisName]["noOfColumnsToUse"]
        else:
            self._nColsToUse = None

        self._scriptStages = {
            "initialization": {
                "summary": "Initialized the Frequency Narratives",
                "weight": 0
            },
            "summarygeneration": {
                "summary": "summary generation finished",
                "weight": 10
            },
            "completion": {
                "summary": "Frequency Stats Narratives done",
                "weight": 0
            },
        }
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "initialization",
            "info",
            display=False,
            weightKey="narratives")

        self._generate_narratives()

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "summarygeneration",
            "info",
            display=False,
            weightKey="narratives")

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "completion",
            "info",
            display=False,
            weightKey="narratives")

    def _generate_narratives(self):
        """
        Generate the main "Key Influencers" card for every target dimension and
        delegate per-dimension drill-down cards to ChiSquareAnalysis.
        """
        for target_dimension in self._df_chisquare_result:
            target_chisquare_result = self._df_chisquare_result[target_dimension]
            # All analysed variables for this target.
            analysed_variables = list(target_chisquare_result.keys())
            # Variables whose chi-square p-value passes the 0.05 threshold.
            significant_variables = [
                dim for dim in target_chisquare_result
                if target_chisquare_result[dim].get_pvalue() <= 0.05
            ]
            effect_sizes = [
                target_chisquare_result[dim].get_effect_size()
                for dim in significant_variables
            ]

            effect_size_dict = dict(zip(significant_variables, effect_sizes))
            # Re-order the significant variables by descending effect size.
            significant_variables = [
                y
                for (x, y) in sorted(zip(effect_sizes, significant_variables),
                                     reverse=True)
            ]

            num_analysed_variables = len(analysed_variables)
            num_significant_variables = len(significant_variables)
            self.narratives['main_card'] = {}
            self.narratives['main_card'][
                'heading'] = 'Relationship between ' + target_dimension + ' and other factors'
            self.narratives['main_card']['paragraphs'] = {}
            data_dict = {
                'num_variables': num_analysed_variables,
                'num_significant_variables': num_significant_variables,
                'significant_variables': significant_variables,
                'target': target_dimension,
                'analysed_dimensions': analysed_variables,
                'blockSplitter': self._blockSplitter
            }  # for both para 1 and para 2
            paragraph = {}
            paragraph['header'] = ''

            paragraph['content'] = NarrativesUtils.get_template_output(
                self._base_dir, 'main_card.html', data_dict)
            self.narratives['main_card']['paragraphs'] = [paragraph]
            self.narratives['cards'] = []
            chart = {
                'header':
                'Strength of association between ' + target_dimension +
                ' and other dimensions'
            }
            chart['data'] = effect_size_dict
            chart['label_text'] = {
                'x': 'Dimensions',
                'y': 'Effect Size (Cramers-V)'
            }

            chart_data = []
            chartDataValues = []
            for k, v in effect_size_dict.items():
                chart_data.append({"key": k, "value": float(v)})
                chartDataValues.append(float(v))
            chart_data = sorted(chart_data,
                                key=lambda x: x["value"],
                                reverse=True)
            chart_json = ChartJson()
            chart_json.set_data(chart_data)
            chart_json.set_chart_type("bar")
            chart_json.set_label_text({
                'x': '  ',
                'y': 'Effect Size (Cramers-V)'
            })
            chart_json.set_axis_rotation(True)
            chart_json.set_axes({"x": "key", "y": "value"})
            chart_json.set_yaxis_number_format(
                NarrativesUtils.select_y_axis_format(chartDataValues))
            self.narratives['main_card']['chart'] = chart

            main_card = NormalCard()
            header = "<h3>Strength of association between " + target_dimension + " and other dimensions</h3>"
            main_card_data = [HtmlData(data=header)]
            main_card_narrative = NarrativesUtils.get_template_output(
                self._base_dir, 'main_card.html', data_dict)
            main_card_narrative = NarrativesUtils.block_splitter(
                main_card_narrative, self._blockSplitter)
            main_card_data += main_card_narrative
            if len(chart_data) > 0:
                statistical_info_array = [
                    ("Test Type", "Chi-Square"),
                    ("Effect Size", "Cramer's V"),
                    ("Max Effect Size", chart_data[0]["key"]),
                    ("Min Effect Size", chart_data[-1]["key"]),
                ]
                # (fixed typo: was "statistical_inferenc", leaving the guard
                # below to read an unintended name)
                statistical_inference = ""
                if len(chart_data) == 1:
                    statistical_inference = "{} is the only variable that have significant association with the {} (Target) having an \
                     Effect size of {}".format(
                        chart_data[0]["key"],
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["value"], 4))
                elif len(chart_data) == 2:
                    statistical_inference = "There are two variables ({} and {}) that have significant association with the {} (Target) and the \
                     Effect size ranges are {} and {} respectively".format(
                        chart_data[0]["key"], chart_data[1]["key"],
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["value"], 4),
                        round(chart_data[1]["value"], 4))
                else:
                    statistical_inference = "There are {} variables that have significant association with the {} (Target) and the \
                     Effect size ranges from {} to {}".format(
                        len(chart_data),
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["value"], 4),
                        round(chart_data[-1]["value"], 4))
                if statistical_inference != "":
                    statistical_info_array.append(
                        ("Inference", statistical_inference))
                statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                    statistical_info_array)
            else:
                statistical_info_array = []
            main_card_data.append(
                C3ChartData(data=chart_json, info=statistical_info_array))
            main_card.set_card_data(main_card_data)
            main_card.set_card_name("Key Influencers")

            if self._storyOnScoredData != True:
                self._chiSquareNode.add_a_card(main_card)
                self._result_setter.add_a_score_chi_card(main_card)

            # (was a Python 2 print statement -- SyntaxError under Python 3)
            print("target_dimension", target_dimension)
            # Cap how many significant variables get drill-down cards.
            if self._appid == '2' and num_significant_variables > 5:
                significant_variables = significant_variables[:5]
            elif self._nColsToUse is not None:
                significant_variables = significant_variables[:self._nColsToUse]

            CommonUtils.create_update_and_save_progress_message(
                self._dataframe_context,
                self._scriptWeightDict,
                self._scriptStages,
                self._analysisName,
                "custom",
                "info",
                display=True,
                customMsg="Analyzing key drivers",
                weightKey="narratives")
            for analysed_dimension in significant_variables[:self._noOfSigDimsToShow]:
                chisquare_result = self._df_chisquare.get_chisquare_result(
                    target_dimension, analysed_dimension)
                if self._appid in ('1', '2'):
                    # appid-specific layout: only the first card is published,
                    # serialized to a plain dict for the result setter.
                    print("APPID {} is used".format(self._appid))
                    card = ChiSquareAnalysis(
                        self._dataframe_context, self._dataframe_helper,
                        chisquare_result, target_dimension, analysed_dimension,
                        significant_variables, num_analysed_variables,
                        self._data_frame, self._measure_columns,
                        self._base_dir, None, target_chisquare_result)
                    self._result_setter.add_a_score_chi_card(
                        json.loads(
                            CommonUtils.convert_python_object_to_json(
                                card.get_dimension_card1())))
                else:
                    target_dimension_card = ChiSquareAnalysis(
                        self._dataframe_context, self._dataframe_helper,
                        chisquare_result, target_dimension, analysed_dimension,
                        significant_variables, num_analysed_variables,
                        self._data_frame, self._measure_columns,
                        self._base_dir, None, target_chisquare_result)
                    self.narratives['cards'].append(target_dimension_card)
                    self._chiSquareNode.add_a_node(
                        target_dimension_card.get_dimension_node())
        self._story_narrative.add_a_node(self._chiSquareNode)
        self._result_setter.set_chisquare_node(self._chiSquareNode)
# ---- Code example #8 ----
class ChiSquareAnalysis:
    def __init__(self,
                 df_context,
                 df_helper,
                 chisquare_result,
                 target_dimension,
                 analysed_dimension,
                 significant_variables,
                 num_analysed_variables,
                 data_frame,
                 measure_columns,
                 base_dir,
                 appid=None,
                 target_chisquare_result=None):
        """Build narrative cards for the association between target_dimension
        and analysed_dimension.

        appid selects an app-specific card layout (None, "1" or "2"); any
        other value skips narrative generation entirely (original behavior,
        preserved).
        """
        self._blockSplitter = "|~NEWBLOCK~|"
        self._highlightFlag = "|~HIGHLIGHT~|"
        self._dimensionNode = NarrativesTree()
        self._dimensionNode.set_name(target_dimension)
        self._data_frame = data_frame
        self._dataframe_context = df_context
        self._dataframe_helper = df_helper
        self._chisquare_result = chisquare_result
        self._target_dimension = target_dimension
        self._analysed_dimension = analysed_dimension
        self._significant_variables = significant_variables
        self._target_chisquare_result = target_chisquare_result
        self._measure_columns = self._dataframe_helper.get_numeric_columns()
        self._chiSquareLevelLimit = GLOBALSETTINGS.CHISQUARELEVELLIMIT

        self._num_analysed_variables = num_analysed_variables
        self._chiSquareTable = chisquare_result.get_contingency_table()

        # Other significant variables feed the second-level drill down:
        # at most 3 of them when there are <= 20, otherwise at most 5.
        # NOTE(review): slicing a set-derived list is order-nondeterministic
        # across runs -- confirm whether a stable ordering is expected.
        other_significant = list(
            set(significant_variables) - {analysed_dimension})
        if len(other_significant) <= 20:
            self._second_level_dimensions = other_significant[:3]
        else:
            self._second_level_dimensions = other_significant[:5]

        # (these were Python 2 print statements -- SyntaxError under Python 3)
        print(self._second_level_dimensions)

        self._appid = appid
        self._card1 = NormalCard()
        self._targetCards = []
        self._base_dir = base_dir

        self._binTargetCol = False
        self._binAnalyzedCol = False
        print("--------Chi-Square Narratives for ", analysed_dimension, "---------")
        if self._dataframe_context.get_custom_analysis_details() is not None:
            binnedColObj = [
                x["colName"]
                for x in self._dataframe_context.get_custom_analysis_details()
            ]
            print("analysed_dimension : ", self._analysed_dimension)
            if self._target_dimension in binnedColObj:
                self._binTargetCol = True
            if (self._analysed_dimension in binnedColObj
                    or self._analysed_dimension in self._measure_columns):
                self._binAnalyzedCol = True

        # Generate narratives for the recognized app ids only; the full card
        # set (card1 + target cards) is attached when no appid is given.
        if self._appid in (None, "1", "2"):
            self._generate_narratives()
            if self._appid is None:
                self._dimensionNode.add_cards([self._card1] + self._targetCards)
            else:
                self._dimensionNode.add_cards([self._card1])
            self._dimensionNode.set_name("{}".format(analysed_dimension))

    def get_dimension_node(self):
        """Serialize this dimension's narratives tree into a plain JSON dict."""
        serialized_node = CommonUtils.convert_python_object_to_json(
            self._dimensionNode)
        return json.loads(serialized_node)

    def get_dimension_card1(self):
        """Return the primary narrative card built for this dimension."""
        return self._card1

    def _generate_narratives(self):
        chisquare_result = self._chisquare_result
        target_dimension = self._target_dimension
        analysed_dimension = self._analysed_dimension
        significant_variables = self._significant_variables
        num_analysed_variables = self._num_analysed_variables
        table = self._chiSquareTable
        total = self._chiSquareTable.get_total()

        levels = self._chiSquareTable.get_column_two_levels()
        level_counts = self._chiSquareTable.get_column_total()
        levels_count_sum = sum(level_counts)
        levels_percentages = [
            i * 100.0 / levels_count_sum for i in level_counts
        ]
        sorted_levels = sorted(zip(level_counts, levels), reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]

        top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        bottom_dim = sorted_levels[-1][1]
        bottom_dim_contribution = sorted_levels[-1][0]
        bottom_dims = [
            y for x, y in sorted_levels if x == bottom_dim_contribution
        ]

        target_levels = self._chiSquareTable.get_column_one_levels()

        target_counts = self._chiSquareTable.get_row_total()
        sorted_target_levels = sorted(zip(target_counts, target_levels),
                                      reverse=True)

        top_target_count, top_target = sorted_target_levels[0]
        second_target_count, second_target = sorted_target_levels[1]

        top_target_contributions = [
            table.get_value(top_target, i) for i in levels
        ]
        sum_top_target = sum(top_target_contributions)

        sorted_levels = sorted(zip(top_target_contributions, levels),
                               reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]
        top_target_top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        top_target_top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        top_target_bottom_dim = sorted_levels[-1][1]
        top_target_bottom_dim_contribution = sorted_levels[-1][0]

        top_target_percentages = [
            i * 100.0 / sum_top_target for i in top_target_contributions
        ]
        best_top_target_index = top_target_contributions.index(
            max(top_target_contributions))
        worst_top_target_index = top_target_contributions.index(
            min(top_target_contributions))
        top_target_differences = [
            x - y for x, y in zip(levels_percentages, top_target_percentages)
        ]
        if len(top_target_differences) > 6:
            tops = 2
            bottoms = -2
        elif len(top_target_differences) > 4:
            tops = 2
            bottoms = -1
        else:
            tops = 1
            bottoms = -1
        sorted_ = sorted(enumerate(top_target_differences),
                         key=lambda x: x[1],
                         reverse=True)
        best_top_difference_indices = [x for x, y in sorted_[:tops]]
        worst_top_difference_indices = [x for x, y in sorted_[bottoms:]]

        top_target_shares = [
            x * 100.0 / y
            for x, y in zip(top_target_contributions, level_counts)
        ]
        max_top_target_shares = max(top_target_shares)
        best_top_target_share_index = [
            idx for idx, val in enumerate(top_target_shares)
            if val == max_top_target_shares
        ]
        level_counts_threshold = sum(level_counts) * 0.05 / len(level_counts)
        min_top_target_shares = min([
            x for x, y in zip(top_target_shares, level_counts)
            if y >= level_counts_threshold
        ])
        worst_top_target_share_index = [
            idx for idx, val in enumerate(top_target_shares)
            if val == min_top_target_shares
        ]
        overall_top_percentage = sum_top_target * 100.0 / total

        second_target_contributions = [
            table.get_value(second_target, i) for i in levels
        ]
        sum_second_target = sum(second_target_contributions)

        sorted_levels = sorted(zip(second_target_contributions, levels),
                               reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]
        second_target_top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        second_target_top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        second_target_bottom_dim = sorted_levels[-1][1]
        second_target_bottom_dim_contribution = sorted_levels[-1][0]

        second_target_percentages = [
            i * 100.0 / sum_second_target for i in second_target_contributions
        ]
        best_second_target_index = second_target_contributions.index(
            max(second_target_contributions))
        worst_second_target_index = second_target_contributions.index(
            min(second_target_contributions))
        second_target_differences = [
            x - y
            for x, y in zip(levels_percentages, second_target_percentages)
        ]
        if len(second_target_differences) > 6:
            tops = 2
            bottoms = -2
        elif len(second_target_differences) > 4:
            tops = 2
            bottoms = -1
        else:
            tops = 1
            bottoms = -1
        sorted_ = sorted(enumerate(second_target_differences),
                         key=lambda x: x[1],
                         reverse=True)
        best_second_difference_indices = [x for x, y in sorted_[:tops]]
        worst_second_difference_indices = [x for x, y in sorted_[bottoms:]]

        second_target_shares = [
            x * 100.0 / y
            for x, y in zip(second_target_contributions, level_counts)
        ]
        max_second_target_shares = max(second_target_shares)
        best_second_target_share_index = [
            idx for idx, val in enumerate(second_target_shares)
            if val == max_second_target_shares
        ]
        level_counts_threshold = sum(level_counts) * 0.05 / len(level_counts)
        min_second_target_shares = min([
            x for x, y in zip(second_target_shares, level_counts)
            if y >= level_counts_threshold
        ])
        # worst_second_target_share_index = second_target_shares.index(min_second_target_shares)
        worst_second_target_share_index = [
            idx for idx, val in enumerate(second_target_shares)
            if val == min_second_target_shares
        ]
        overall_second_percentage = sum_second_target * 100.0 / total

        targetCardDataDict = {}
        targetCardDataDict['target'] = target_dimension
        targetCardDataDict['colname'] = analysed_dimension
        targetCardDataDict['num_significant'] = len(significant_variables)
        targetCardDataDict['plural_colname'] = NarrativesUtils.pluralize(
            analysed_dimension)

        targetCardDataDict["blockSplitter"] = self._blockSplitter
        targetCardDataDict["binTargetCol"] = self._binTargetCol
        targetCardDataDict["binAnalyzedCol"] = self._binAnalyzedCol
        targetCardDataDict['highlightFlag'] = self._highlightFlag
        targetCardDataDict['levels'] = levels

        data_dict = {}
        data_dict[
            'best_second_difference'] = best_second_difference_indices  ##these changed
        data_dict['worst_second_difference'] = worst_second_difference_indices
        data_dict['best_top_difference'] = best_top_difference_indices
        data_dict['worst_top_difference'] = worst_top_difference_indices
        data_dict['levels_percentages'] = levels_percentages
        data_dict['top_target_percentages'] = top_target_percentages
        data_dict['second_target_percentages'] = second_target_percentages
        data_dict['levels'] = levels
        data_dict['best_top_share'] = best_top_target_share_index
        data_dict['worst_top_share'] = worst_top_target_share_index
        data_dict['best_second_share'] = best_second_target_share_index
        data_dict['worst_second_share'] = worst_second_target_share_index
        data_dict['top_target_shares'] = top_target_shares
        data_dict['second_target_shares'] = second_target_shares
        data_dict['overall_second'] = overall_second_percentage
        data_dict['overall_top'] = overall_top_percentage

        data_dict['num_significant'] = len(significant_variables)
        data_dict['colname'] = analysed_dimension
        data_dict['plural_colname'] = NarrativesUtils.pluralize(
            analysed_dimension)
        data_dict['target'] = target_dimension
        data_dict['top_levels'] = top_dims
        data_dict['top_levels_percent'] = round(
            top_dims_contribution * 100.0 / total, 1)
        data_dict['bottom_level'] = bottom_dim
        data_dict['bottom_levels'] = bottom_dims
        data_dict['bottom_level_percent'] = round(
            bottom_dim_contribution * 100 / sum(level_counts), 2)
        data_dict['second_target'] = second_target
        data_dict['second_target_top_dims'] = second_target_top_dims
        data_dict[
            'second_target_top_dims_contribution'] = second_target_top_dims_contribution * 100.0 / sum(
                second_target_contributions)
        data_dict['second_target_bottom_dim'] = second_target_bottom_dim
        data_dict[
            'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution
        data_dict['best_second_target'] = levels[best_second_target_index]
        data_dict['best_second_target_count'] = second_target_contributions[
            best_second_target_index]
        data_dict['best_second_target_percent'] = round(
            second_target_contributions[best_second_target_index] * 100.0 /
            sum(second_target_contributions), 2)
        data_dict['worst_second_target'] = levels[worst_second_target_index]
        data_dict['worst_second_target_percent'] = round(
            second_target_contributions[worst_second_target_index] * 100.0 /
            sum(second_target_contributions), 2)

        data_dict['top_target'] = top_target
        data_dict['top_target_top_dims'] = top_target_top_dims
        data_dict[
            'top_target_top_dims_contribution'] = top_target_top_dims_contribution * 100.0 / sum(
                top_target_contributions)
        data_dict['top_target_bottom_dim'] = top_target_bottom_dim
        data_dict[
            'top_target_bottom_dim_contribution'] = top_target_bottom_dim_contribution
        data_dict['best_top_target'] = levels[best_top_target_index]
        data_dict['best_top_target_count'] = top_target_contributions[
            best_top_target_index]
        data_dict['best_top_target_percent'] = round(
            top_target_contributions[best_top_target_index] * 100.0 /
            sum(top_target_contributions), 2)
        data_dict['worst_top_target'] = levels[worst_top_target_index]
        data_dict['worst_top_target_percent'] = round(
            top_target_contributions[worst_top_target_index] * 100.0 /
            sum(top_target_contributions), 2)

        data_dict["blockSplitter"] = self._blockSplitter
        data_dict["binTargetCol"] = self._binTargetCol
        data_dict["binAnalyzedCol"] = self._binAnalyzedCol
        data_dict['highlightFlag'] = self._highlightFlag

        ###############
        #     CARD1   #
        ###############

        print "self._binTargetCol & self._binAnalyzedCol : ", self._binTargetCol, self._binAnalyzedCol
        if (self._binTargetCol == True & self._binAnalyzedCol == False):
            print "Only Target Column is Binned, : ", self._binTargetCol
            output = NarrativesUtils.block_splitter(
                NarrativesUtils.get_template_output(
                    self._base_dir, 'card1_binned_target.html', data_dict),
                self._blockSplitter,
                highlightFlag=self._highlightFlag)
        elif (self._binTargetCol == True & self._binAnalyzedCol == True):
            print "Target Column and IV is Binned : ", self._binTargetCol, self._binAnalyzedCol
            output = NarrativesUtils.block_splitter(
                NarrativesUtils.get_template_output(
                    self._base_dir, 'card1_binned_target_and_IV.html',
                    data_dict),
                self._blockSplitter,
                highlightFlag=self._highlightFlag)
        else:
            output = NarrativesUtils.block_splitter(
                NarrativesUtils.get_template_output(self._base_dir,
                                                    'card1.html', data_dict),
                self._blockSplitter,
                highlightFlag=self._highlightFlag)

        targetDimCard1Data = []
        targetDimcard1Heading = '<h3>Relationship between ' + self._target_dimension + '  and ' + self._analysed_dimension + "</h3>"

        toggledata = ToggleData()

        targetDimTable1Data = self.generate_card1_table1()
        targetDimCard1Table1 = TableData()
        targetDimCard1Table1.set_table_type("heatMap")
        targetDimCard1Table1.set_table_data(targetDimTable1Data)
        toggledata.set_toggleon_data({
            "data": {
                "tableData": targetDimTable1Data,
                "tableType": "heatMap"
            },
            "dataType": "table"
        })

        targetDimTable2Data = self.generate_card1_table2()
        targetDimCard1Table2 = TableData()
        targetDimCard1Table2.set_table_type("normal")
        table2Data = targetDimTable2Data["data1"]
        table2Data = [
            innerList[1:] for innerList in table2Data
            if innerList[0].strip() != ""
        ]
        targetDimCard1Table2.set_table_data(table2Data)

        toggledata.set_toggleoff_data({
            "data": {
                "tableData": table2Data,
                "tableType": "heatMap"
            },
            "dataType": "table"
        })

        targetDimCard1Data.append(HtmlData(data=targetDimcard1Heading))
        targetDimCard1Data.append(toggledata)
        targetDimCard1Data += output

        self._card1.set_card_data(targetDimCard1Data)
        self._card1.set_card_name("{}: Relationship with {}".format(
            self._analysed_dimension, self._target_dimension))

        ###############
        #     CARD2   #
        ###############

        if self._appid == None:

            key_factors = ''
            num_key_factors = len(self._second_level_dimensions)

            if len(self._second_level_dimensions) == 5:
                key_factors = ', '.join(
                    self._second_level_dimensions[:4]
                ) + ' and ' + self._second_level_dimensions[4]
            elif len(self._second_level_dimensions) == 4:
                key_factors = ', '.join(
                    self._second_level_dimensions[:3]
                ) + ' and ' + self._second_level_dimensions[3]
            elif len(self._second_level_dimensions) == 3:
                key_factors = ', '.join(
                    self._second_level_dimensions[:2]
                ) + ' and ' + self._second_level_dimensions[2]
            elif len(self._second_level_dimensions) == 2:
                key_factors = ' and '.join(self._second_level_dimensions)
            elif len(self._second_level_dimensions) == 1:
                key_factors = self._second_level_dimensions[0]

            targetCardDataDict['num_key_factors'] = num_key_factors
            targetCardDataDict['key_factors'] = key_factors
            dict_for_test = {}
            for tupleObj in sorted_target_levels[:self._chiSquareLevelLimit]:
                targetLevel = tupleObj[1]

                targetCardDataDict['random_card2'] = random.randint(1, 100)
                targetCardDataDict['random_card4'] = random.randint(1, 100)

                second_target_contributions = [
                    table.get_value(targetLevel, i) for i in levels
                ]
                sum_second_target = sum(second_target_contributions)

                sorted_levels = sorted(zip(second_target_contributions,
                                           levels),
                                       reverse=True)

                level_differences = [0.0] + [
                    sorted_levels[i][0] - sorted_levels[i + 1][0]
                    for i in range(len(sorted_levels) - 1)
                ]
                second_target_top_dims = [
                    j for i, j in sorted_levels[:level_differences.
                                                index(max(level_differences))]
                ]
                second_target_top_dims_contribution = sum([
                    i for i, j in sorted_levels[:level_differences.
                                                index(max(level_differences))]
                ])
                second_target_bottom_dim = sorted_levels[-1][1]
                second_target_bottom_dim_contribution = sorted_levels[-1][0]

                second_target_percentages = [
                    i * 100.0 / sum_second_target
                    for i in second_target_contributions
                ]
                best_second_target_index = second_target_contributions.index(
                    max(second_target_contributions))
                worst_second_target_index = second_target_contributions.index(
                    min(second_target_contributions))
                second_target_differences = [
                    x - y for x, y in zip(levels_percentages,
                                          second_target_percentages)
                ]
                if len(second_target_differences) > 6:
                    tops = 2
                    bottoms = -2
                elif len(second_target_differences) > 4:
                    tops = 2
                    bottoms = -1
                else:
                    tops = 1
                    bottoms = -1
                sorted_ = sorted(enumerate(second_target_differences),
                                 key=lambda x: x[1],
                                 reverse=True)
                best_second_difference_indices = [x for x, y in sorted_[:tops]]
                worst_second_difference_indices = [
                    x for x, y in sorted_[bottoms:]
                ]

                second_target_shares = [
                    x * 100.0 / y
                    for x, y in zip(second_target_contributions, level_counts)
                ]
                max_second_target_shares = max(second_target_shares)
                best_second_target_share_index = [
                    idx for idx, val in enumerate(second_target_shares)
                    if val == max_second_target_shares
                ]
                level_counts_threshold = sum(level_counts) * 0.05 / len(
                    level_counts)
                min_second_target_shares = min([
                    x for x, y in zip(second_target_shares, level_counts)
                    if y >= level_counts_threshold
                ])
                worst_second_target_share_index = [
                    idx for idx, val in enumerate(second_target_shares)
                    if val == min_second_target_shares
                ]
                overall_second_percentage = sum_second_target * 100.0 / total

                # DataFrame for contribution calculation

                df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                                        filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                                        select(self._second_level_dimensions).toPandas()
                df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                                    select(self._second_level_dimensions).toPandas()

                # if self._chisquare_result.get_splits():
                #     splits = self._chisquare_result.get_splits()
                #     idx = self._chiSquareTable.get_bin_names(splits).index(second_target_top_dims[0])
                #     idx1 = self._chiSquareTable.get_bin_names(splits).index(top_target_top_dims[0])
                #     splits[len(splits)-1] = splits[len(splits)-1]+1
                #     df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                #                         filter(col(self._analysed_dimension)>=splits[idx]).filter(col(self._analysed_dimension)<splits[idx+1]).\
                #                         select(self._second_level_dimensions).toPandas()
                #     df_second_dim = self._data_frame.filter(col(self._analysed_dimension)>=splits[idx]).\
                #                     filter(col(self._analysed_dimension)<splits[idx+1]).\
                #                     select(self._second_level_dimensions).toPandas()
                # else:
                #     df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                #                         filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                #                         select(self._second_level_dimensions).toPandas()
                #     df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                #                     select(self._second_level_dimensions).toPandas()

                # print self._data_frame.select('Sales').show()

                distribution_second = []
                for d in self._second_level_dimensions:

                    grouped = df_second_target.groupby(d).agg({
                        d: 'count'
                    }).sort_values(d, ascending=False)
                    contributions = df_second_dim.groupby(d).agg({d: 'count'})
                    contribution_index = list(contributions.index)
                    contributions_val = contributions[d].tolist()
                    contributions_list = dict(
                        zip(contribution_index, contributions_val))
                    index_list = list(grouped.index)
                    grouped_list = grouped[d].tolist()
                    contributions_percent_list = [
                        round(y * 100.0 / contributions_list[x], 2)
                        for x, y in zip(index_list, grouped_list)
                    ]
                    sum_ = grouped[d].sum()
                    diffs = [0] + [
                        grouped_list[i] - grouped_list[i + 1]
                        for i in range(len(grouped_list) - 1)
                    ]
                    max_diff = diffs.index(max(diffs))

                    index_txt = ''
                    if max_diff == 1:
                        index_txt = index_list[0]
                    elif max_diff == 2:
                        index_txt = index_list[0] + '(' + str(
                            round(grouped_list[0] * 100.0 / sum_, 1)
                        ) + '%)' + ' and ' + index_list[1] + '(' + str(
                            round(grouped_list[1] * 100.0 / sum_, 1)) + '%)'
                    elif max_diff > 2:
                        index_txt = 'including ' + index_list[0] + '(' + str(
                            round(grouped_list[0] * 100.0 / sum_, 1)
                        ) + '%)' + ' and ' + index_list[1] + '(' + str(
                            round(grouped_list[1] * 100.0 / sum_, 1)) + '%)'
                    distribution_second.append({'contributions':[round(i*100.0/sum_,2) for i in grouped_list[:max_diff]],\
                                            'levels': index_list[:max_diff],'variation':random.randint(1,100),\
                                            'index_txt': index_txt, 'd':d,'contributions_percent':contributions_percent_list})

                targetCardDataDict['distribution_second'] = distribution_second
                targetCardDataDict['second_target'] = targetLevel
                targetCardDataDict[
                    'second_target_top_dims'] = second_target_top_dims
                targetCardDataDict[
                    'second_target_top_dims_contribution'] = second_target_top_dims_contribution * 100.0 / sum(
                        second_target_contributions)
                targetCardDataDict[
                    'second_target_bottom_dim'] = second_target_bottom_dim
                targetCardDataDict[
                    'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution
                targetCardDataDict['best_second_target'] = levels[
                    best_second_target_index]
                targetCardDataDict[
                    'best_second_target_count'] = second_target_contributions[
                        best_second_target_index]
                targetCardDataDict['best_second_target_percent'] = round(
                    second_target_contributions[best_second_target_index] *
                    100.0 / sum(second_target_contributions), 2)
                targetCardDataDict['worst_second_target'] = levels[
                    worst_second_target_index]
                targetCardDataDict['worst_second_target_percent'] = round(
                    second_target_contributions[worst_second_target_index] *
                    100.0 / sum(second_target_contributions), 2)

                card2Data = []
                targetLevelContributions = [
                    table.get_value(targetLevel, i) for i in levels
                ]
                card2Heading = '<h3>Distribution of ' + self._target_dimension + ' (' + targetLevel + ') across ' + self._analysed_dimension + "</h3>"
                chart, bubble = self.generate_distribution_card_chart(
                    targetLevel, targetLevelContributions, levels,
                    level_counts, total)
                card2ChartData = NormalChartData(data=chart["data"])
                card2ChartJson = ChartJson()
                card2ChartJson.set_data(card2ChartData.get_data())
                card2ChartJson.set_chart_type("combination")
                card2ChartJson.set_types({
                    "total": "bar",
                    "percentage": "line"
                })
                card2ChartJson.set_legend({
                    "total": "# of " + targetLevel,
                    "percentage": "% of " + targetLevel
                })
                card2ChartJson.set_axes({
                    "x": "key",
                    "y": "total",
                    "y2": "percentage"
                })
                card2ChartJson.set_label_text({
                    "x": " ",
                    "y": "Count",
                    "y2": "Percentage"
                })
                print "self._binTargetCol & self._binAnalyzedCol : ", self._binTargetCol, self._binAnalyzedCol
                if (self._binTargetCol == True & self._binAnalyzedCol ==
                        False):
                    print "Only Target Column is Binned"
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2_binned_target.html',
                            targetCardDataDict), self._blockSplitter)
                elif (self._binTargetCol == True & self._binAnalyzedCol ==
                      True):
                    print "Target Column and IV is Binned"
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2_binned_target_and_IV.html',
                            targetCardDataDict), self._blockSplitter)
                else:
                    print "In Else, self._binTargetCol should be False : ", self._binTargetCol
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2.html', targetCardDataDict),
                        self._blockSplitter)

                card2Data.append(HtmlData(data=card2Heading))
                statistical_info_array = [
                    ("Test Type", "Chi-Square"),
                    ("Chi-Square statistic",
                     str(round(self._chisquare_result.get_stat(), 3))),
                    ("P-Value",
                     str(round(self._chisquare_result.get_pvalue(), 3))),
                    ("Inference",
                     "Chi-squared analysis shows a significant association between {} (target) and {}."
                     .format(self._target_dimension, self._analysed_dimension))
                ]
                statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                    statistical_info_array)

                card2Data.append(
                    C3ChartData(data=card2ChartJson,
                                info=statistical_info_array))
                card2Data += output2
                card2BubbleData = "<div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div><div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div>".format(
                    bubble[0]["value"], bubble[0]["text"], bubble[1]["value"],
                    bubble[1]["text"])
                card2Data.append(HtmlData(data=card2BubbleData))
                targetCard = NormalCard()
                targetCard.set_card_data(card2Data)
                targetCard.set_card_name("{} : Distribution of {}".format(
                    self._analysed_dimension, targetLevel))
                self._targetCards.append(targetCard)
                dict_for_test[targetLevel] = targetCardDataDict
        out = {'data_dict': data_dict, 'target_dict': dict_for_test}

        return out

    # def generate_card2_narratives(self):

    def generate_distribution_card_chart(self, __target,
                                         __target_contributions, levels,
                                         levels_count, total):
        """Build the combination-chart payload and two headline bubbles for a
        target level's distribution across the analysed dimension.

        Returns a ``(chart_data, bubble_data)`` tuple: ``chart_data`` holds the
        per-level count/percentage series plus axis labels; ``bubble_data`` is a
        two-element list of summary callouts (largest contributor and highest
        within-level rate).
        """
        chart = {}
        count_label = '# of ' + __target
        rate_label = '% of ' + __target
        label = {'total': count_label, 'percentage': rate_label}
        label_text = {
            'x': self._analysed_dimension,
            'y': count_label,
            'y2': rate_label,
        }
        # Within-level rate: contribution as a percentage of that level's count.
        __target_percentages = [
            contribution * 100.0 / count
            for contribution, count in zip(__target_contributions, levels_count)
        ]
        data = {
            'total': dict(zip(levels, __target_contributions)),
            'percentage': dict(zip(levels, __target_percentages)),
        }
        chartData = [
            {"key": level, "total": contribution, "percentage": rate}
            for level, contribution, rate in zip(levels, __target_contributions,
                                                 __target_percentages)
        ]
        chart_data = {'label': label, 'data': chartData}

        # Bubble 1: which level contributes the largest share of the target.
        top_contribution = max(__target_contributions)
        m_index = __target_contributions.index(top_contribution)
        bubble_data1 = {
            'value': str(
                round(top_contribution * 100.0 / sum(__target_contributions),
                      1)) + '%',
            'text': 'Overall ' + __target + ' comes from ' + levels[m_index],
        }

        # Bubble 2: which level has the highest within-level rate.
        top_rate = max(__target_percentages)
        m_index = __target_percentages.index(top_rate)
        bubble_data2 = {
            'value': str(round(top_rate, 1)) + '%',
            'text': levels[m_index] + ' has the highest rate of ' + __target,
        }

        return chart_data, [bubble_data1, bubble_data2]

    def generate_card1_table1(self):
        """Assemble the heat-map table for card 1.

        The first row is a header (analysed dimension followed by the target
        levels); each following row pairs one dimension level with its
        column-percentage figures from the contingency table.
        """
        contingency = self._chiSquareTable
        header_row = [self._analysed_dimension
                      ] + contingency.get_column_one_levels()
        # Transpose [level names | percentage columns] into per-level rows.
        all_columns = [contingency.column_two_values
                       ] + contingency.table_percent_by_column
        body_rows = [list(row) for row in zip(*all_columns)]
        return [header_row] + body_rows

    def generate_card1_table2(self):
        """Build the toggled (normal) table for card 1.

        For every level of the analysed dimension, emits four rows: raw
        counts, percentage within the analysed dimension, percentage within
        the target dimension, and percentage of the grand total.

        Returns a dict with the headers, the rows both keyed by header
        ('data') and as plain lists ('data1'), and the analysed-dimension
        label.
        """
        contingency = self._chiSquareTable
        target_levels = contingency.get_column_one_levels()
        dim_levels = contingency.get_column_two_levels()

        # Transpose each matrix once up front. Wrapping zip in list() is
        # required on Python 3, where a bare zip(...) object is not
        # subscriptable (the previous zip(*table)[idx] was Python-2-only).
        count_cols = list(zip(*contingency.table))
        pct_total_cols = list(zip(*contingency.table_percent))
        pct_by_row_cols = list(zip(*contingency.table_percent_by_row))
        pct_by_col_cols = list(zip(*contingency.table_percent_by_column))

        header1 = [self._analysed_dimension] + target_levels + ['Total']
        header = ['State', 'Active', 'Churn', 'Total']  #TODO remove
        first_row = ['Tag'] + header
        data = []
        data1 = [['Tag'] + header1]

        def _append(row):
            # Record each row both as an ordered list and keyed by first_row.
            data.append(dict(zip(first_row, row)))
            data1.append(row)

        for idx, lvl in enumerate(dim_levels):
            counts = list(count_cols[idx])
            _append(['bold'] + [lvl] + counts + [sum(counts)])

            by_col = list(pct_by_col_cols[idx])
            _append([''] + ['As % within ' + self._analysed_dimension
                            ] + by_col + [100.0])

            by_row = list(pct_by_row_cols[idx])
            of_total = list(pct_total_cols[idx])
            # Both "% within target" and "% of Total" rows end with the same
            # grand-total percentage for this dimension level.
            total_pct = round(sum(of_total), 2)
            _append([''] + ['As % within ' + self._target_dimension
                            ] + by_row + [total_pct])

            _append([''] + ['As % of Total'] + of_total + [total_pct])

        return {
            'header': header,
            'header1': header1,
            'data': data,
            'label': self._analysed_dimension,
            'data1': data1
        }
# コード例 #9 / "Code example #9" — separator from the scraped listing page
# (the stray "0" was the example's vote count); kept as a comment so the file parses.
class ChiSquareNarratives(object):
    #@accepts(object, int, DFChiSquareResult ,ContextSetter)
    def __init__(self,
                 df_helper,
                 df_chisquare_result,
                 spark,
                 df_context,
                 data_frame,
                 story_narrative,
                 result_setter,
                 scriptWeight=None,
                 analysisName=None):
        """Build chi-square ("Key Drivers") narratives for the given results.

        Converts binned measure columns of the data frame into string bin
        labels (so they behave like dimensions), computes feature-importance
        thresholds, generates the narratives, and reports progress along the
        way.

        Parameters
        ----------
        df_helper : project DataFrameHelper; supplies the numeric columns.
        df_chisquare_result : chi-square result container; ``get_result()``
            maps target dimensions to per-column results.
        spark : SparkSession, used to rebuild the Spark DataFrame after
            binning (non-pandas path only).
        df_context : ContextSetter carrying app id, pandas flag, analysis
            name, weights and the progress-message URL.
        data_frame : pandas or Spark DataFrame, per ``df_context._pandas_flag``.
        story_narrative, result_setter : output collectors for the narratives.
        scriptWeight : optional weight dict; defaults to the context's
            dimension-analysis weights.
        analysisName : optional analysis name; defaults to the context's.
        """
        self._story_narrative = story_narrative
        self._result_setter = result_setter
        self._data_frame = data_frame
        self._dataframe_context = df_context
        self._pandas_flag = df_context._pandas_flag
        self._dataframe_helper = df_helper
        self._storyOnScoredData = self._dataframe_context.get_story_on_scored_data(
        )
        self._measure_columns = df_helper.get_numeric_columns()
        self._df_chisquare = df_chisquare_result
        self._df_chisquare_result = df_chisquare_result.get_result()
        self.narratives = {}
        self._appid = df_context.get_app_id()
        self._chiSquareNode = NarrativesTree()
        self._chiSquareNode.set_name("Key Drivers")
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        self._noOfSigDimsToShow = GLOBALSETTINGS.CHISQUARESIGNIFICANTDIMENSIONTOSHOW
        self._base_dir = "/chisquare/"
        self._spark = spark

        ############################DataFrame Measure to Dimesion Column#####################

        # Work on a pandas copy regardless of backend so bin-label replacement
        # can be done with plain Python loops.
        if self._pandas_flag:
            pandas_df = self._data_frame.copy(deep=True)
        else:
            pandas_df = self._data_frame.toPandas()
        target_dimension = list(self._df_chisquare_result.keys())

        # For every numeric column that has a chi-square result against the
        # first target dimension, collect the bin labels used in its
        # contingency table.
        bin_data = {}
        for col in self._measure_columns:
            if self._df_chisquare.get_chisquare_result(target_dimension[0],
                                                       col):
                chisquare_result = self._df_chisquare.get_chisquare_result(
                    target_dimension[0], col)
                bin_data[col] = chisquare_result.get_contingency_table(
                ).get_column_two_levels()

        # Replace raw numeric values with the bin label they fall into.
        # Assumes labels look like "<low>to<high>", possibly with thousands
        # separators -- TODO confirm against the contingency-table builder.
        for bin_col in list(bin_data.keys()):
            for split in bin_data[bin_col]:
                val = split.split('to')
                # pandas_df[bin_col][(float(pandas_df[bin_col])>=float(val[0].replace(',',''))) & (float(pandas_df[bin_col])<float(val[1].replace(',','')))] =  split
                row_value = list(pandas_df[bin_col])
                temp = []
                for row_value_ in row_value:
                    # Half-open interval [low, high); already-replaced string
                    # values are passed through untouched.
                    if not isinstance(row_value_, str)  and  \
                      (float(row_value_) >= float(val[0].replace(',','')))   and   \
                      (float(row_value_) <  float(val[1].replace(',',''))):
                        temp.append(split)
                    else:
                        temp.append(row_value_)
                pandas_df[bin_col] = temp
        if self._pandas_flag:
            # NOTE(review): the pandas path currently discards the binned
            # copy (assignment below is commented out) -- confirm intended.
            pass
            # self._data_frame = pandas_df
        else:
            # Rebuild the Spark DataFrame with all-string columns so binned
            # measures behave as dimensions downstream.
            fields = [
                StructField(field_name, StringType(), True)
                for field_name in pandas_df.columns
            ]
            schema = StructType(fields)

            SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                                sparkSession=self._spark)
            self._data_frame = SQLctx.createDataFrame(pandas_df, schema)

        # print self._data_frame
        ############################DataFrame Measure to Dimesion Column#####################

        # App-specific template directories override the default chisquare set.
        if self._appid != None:
            if self._appid == "1":
                self._base_dir += "appid1/"
            elif self._appid == "2":
                self._base_dir += "appid2/"

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        if analysisName == None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName

        self._messageURL = self._dataframe_context.get_message_url()
        if scriptWeight == None:
            self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight(
            )
        else:
            self._scriptWeightDict = scriptWeight
        self._analysisDict = self._dataframe_context.get_analysis_dict()
        if self._analysisDict != {}:
            self._nColsToUse = self._analysisDict[
                self._analysisName]["noOfColumnsToUse"]
        else:
            self._nColsToUse = None

        # Progress stages reported via the message URL.
        # NOTE(review): the summaries say "Frequency" although this is the
        # chi-square script -- looks like copy-paste; confirm before changing
        # the user-facing text.
        self._scriptStages = {
            "initialization": {
                "summary": "Initialized the Frequency Narratives",
                "weight": 0
            },
            "summarygeneration": {
                "summary": "Summary Generation Finished",
                "weight": 4
            },
            "completion": {
                "summary": "Frequency Stats Narratives Done",
                "weight": 0
            },
        }
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "initialization",
            "info",
            display=False,
            weightKey="narratives")
        # Main work: feature-importance thresholding, then narrative output.
        self.new_effect_size, self.signi_dict = self.feat_imp_threshold(
            target_dimension)
        self._generate_narratives()

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "summarygeneration",
            "info",
            display=False,
            weightKey="narratives")

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "completion",
            "info",
            display=False,
            weightKey="narratives")

    def feat_imp_threshold(self,
                           target_dimension,
                           dummy_Cols=True,
                           label_encoding=False):
        """Rank input variables by decision-tree feature importance.

        Validates, preprocesses and feature-engineers ``self._data_frame``,
        fits a depth-5 gini decision tree (sklearn when ``self._pandas_flag``
        is set, otherwise Spark ML), folds the importances of the encoded
        columns back onto the original column names, and keeps the
        categorical variables whose cumulative importance stays below 0.8.

        Parameters
        ----------
        target_dimension : list
            Single-element list holding the target column name.
        dummy_Cols : bool
            When True, one-hot encode the categorical columns before fitting.
        label_encoding : bool
            When True, label-encode the categorical columns (overrides the
            one-hot frame when both flags are set).

        Returns
        -------
        tuple(dict, dict)
            ``(feat_imp_dict, si_var_thresh)``: importance per encoded
            column, and the selected original categorical columns with their
            summed importances.
        """
        # Infer the problem type from the target column's dtype.
        # NOTE(review): app_type stays unset if the dtype matches neither
        # branch (e.g. boolean/datetime target) — same as the original code.
        if self._pandas_flag:
            if is_numeric_dtype(self._data_frame[target_dimension[0]]):
                self.app_type = 'regression'
            elif is_string_dtype(self._data_frame[target_dimension[0]]):
                self.app_type = 'classification'
        else:
            target_dtype = self._data_frame.select(
                target_dimension[0]).dtypes[0][1]
            if target_dtype == 'string':
                self.app_type = 'classification'
            elif target_dtype in ['int', 'double']:
                self.app_type = 'regression'
        try:
            DataValidation_obj = DataValidation(self._data_frame,
                                                target_dimension[0],
                                                self.app_type,
                                                self._pandas_flag)
            DataValidation_obj.data_validation_run()
        except Exception as e:
            # NOTE(review): on failure DataValidation_obj stays unbound and
            # the code below raises NameError; errors are only logged here.
            CommonUtils.print_errors_and_store_traceback(
                self.LOGGER, "datavalidation", e)
            CommonUtils.save_error_messages(self.errorURL,
                                            self.app_type,
                                            e,
                                            ignore=self.ignoreMsg)
        try:
            DataPreprocessingAutoML_obj = DataPreprocessingAutoML(
                DataValidation_obj.data_frame, DataValidation_obj.target,
                DataValidation_obj.data_change_dict,
                DataValidation_obj.numeric_cols,
                DataValidation_obj.dimension_cols,
                DataValidation_obj.datetime_cols,
                DataValidation_obj.problem_type, self._pandas_flag)
            DataPreprocessingAutoML_obj.data_preprocessing_run()
        except Exception as e:
            CommonUtils.print_errors_and_store_traceback(
                self.LOGGER, "dataPreprocessing", e)
            CommonUtils.save_error_messages(self.errorURL,
                                            self.app_type,
                                            e,
                                            ignore=self.ignoreMsg)
        preprocess_df = DataPreprocessingAutoML_obj.data_frame
        FeatureEngineeringAutoML_obj = FeatureEngineeringAutoML(
            DataPreprocessingAutoML_obj.data_frame,
            DataPreprocessingAutoML_obj.target,
            DataPreprocessingAutoML_obj.data_change_dict,
            DataPreprocessingAutoML_obj.numeric_cols,
            DataPreprocessingAutoML_obj.dimension_cols,
            DataPreprocessingAutoML_obj.datetime_cols,
            DataPreprocessingAutoML_obj.problem_type, self._pandas_flag)
        # BUG FIX: datetime_cols is a list, so the original `!= 0` test was
        # always true; only split date columns when there actually are any.
        if FeatureEngineeringAutoML_obj.datetime_cols:
            FeatureEngineeringAutoML_obj.date_column_split(
                FeatureEngineeringAutoML_obj.datetime_cols)
        # BUG FIX: fall back to the preprocessed frame so clean_df is always
        # bound even when both encoding options are disabled (previously a
        # NameError in that case).
        clean_df = preprocess_df
        if dummy_Cols:
            if self._pandas_flag:
                FeatureEngineeringAutoML_obj.sk_one_hot_encoding(
                    FeatureEngineeringAutoML_obj.dimension_cols)
                clean_df = FeatureEngineeringAutoML_obj.data_frame
            else:
                FeatureEngineeringAutoML_obj.pyspark_one_hot_encoding(
                    FeatureEngineeringAutoML_obj.dimension_cols)
                clean_df = FeatureEngineeringAutoML_obj.data_frame
        if label_encoding:
            if self._pandas_flag:
                for column_name in FeatureEngineeringAutoML_obj.dimension_cols:
                    preprocess_df[
                        column_name +
                        '_label_encoded'] = LabelEncoder().fit_transform(
                            preprocess_df[column_name])
                    # FIX: pass axis by keyword — the positional axis
                    # argument of DataFrame.drop was removed in pandas 2.0.
                    preprocess_df = preprocess_df.drop(column_name, axis=1)
                clean_df = preprocess_df
            else:
                FeatureEngineeringAutoML_obj.pyspark_label_encoding(
                    FeatureEngineeringAutoML_obj.dimension_cols)
                clean_df = FeatureEngineeringAutoML_obj.data_frame
        if self._pandas_flag:
            # FIX: axis keyword (see above).
            ind_var = clean_df.drop(target_dimension[0], axis=1)
            ind_var = ind_var[ind_var._get_numeric_data().columns]
            target = clean_df[target_dimension[0]]
            dtree = DecisionTreeClassifier(criterion='gini',
                                           max_depth=5,
                                           random_state=42)
            dtree.fit(ind_var, target)
            feat_imp_dict = {}
            for feature, importance in zip(list(ind_var.columns),
                                           dtree.feature_importances_):
                feat_imp_dict[feature] = round(importance, 2)
        else:
            # Numeric, non-target, non-StringIndexer columns feed the tree.
            num_var = [
                col[0] for col in clean_df.dtypes
                if ((col[1] == 'int') | (col[1] == 'double'))
                & (col[0] != target_dimension[0])
            ]
            num_var = [col for col in num_var if not col.endswith('indexed')]
            labels_count = [
                len(clean_df.select(col).distinct().collect())
                for col in num_var
            ]
            labels_count.sort()
            # maxBins must exceed the largest category cardinality.
            max_count = labels_count[-1]
            label_indexes = StringIndexer(inputCol=target_dimension[0],
                                          outputCol='label',
                                          handleInvalid='keep')
            assembler = VectorAssembler(inputCols=num_var,
                                        outputCol="features")
            model = pysparkDecisionTreeClassifier(labelCol="label",
                                                  featuresCol="features",
                                                  seed=8464,
                                                  impurity='gini',
                                                  maxDepth=5,
                                                  maxBins=max_count + 2)
            pipe = Pipeline(stages=[assembler, label_indexes, model])
            mod_fit = pipe.fit(clean_df)
            df2 = mod_fit.transform(clean_df)
            # Map vector slot indices back to feature names via ml_attr
            # metadata, then look up each slot's importance.
            list_extract = []
            for i in df2.schema["features"].metadata["ml_attr"]["attrs"]:
                list_extract = list_extract + df2.schema["features"].metadata[
                    "ml_attr"]["attrs"][i]
            varlist = pd.DataFrame(list_extract)
            varlist['score'] = varlist['idx'].apply(
                lambda x: mod_fit.stages[-1].featureImportances[x])
            feat_imp_dict = pd.Series(varlist.score.values,
                                      index=varlist.name).to_dict()
        # Fold encoded-column importances back onto the original columns:
        # an encoded name that splits exactly once on the original column
        # name is treated as derived from it.
        feat_imp_ori_dict = {}
        actual_cols = list(self._data_frame.columns)
        actual_cols.remove(target_dimension[0])
        for col in actual_cols:
            fea_imp_ori_list = []
            for col_imp in feat_imp_dict:
                temp = col_imp.split(col, -1)
                if len(temp) == 2:
                    fea_imp_ori_list.append(feat_imp_dict[col_imp])
            feat_imp_ori_dict.update({col: sum(fea_imp_ori_list)})
        sort_dict = dict(
            sorted(feat_imp_ori_dict.items(), key=lambda x: x[1],
                   reverse=True))
        if self._pandas_flag:
            # Coerce date-like object columns so they are not counted as
            # categoricals. NOTE(review): errors='ignore' is deprecated in
            # pandas 2.x — confirm the pinned pandas version before upgrading.
            self._data_frame = self._data_frame.apply(
                lambda col: pd.to_datetime(col, errors='ignore')
                if col.dtypes == object else col,
                axis=0)
            cat_var = [
                key for key in dict(self._data_frame.dtypes)
                if dict(self._data_frame.dtypes)[key] in ['object']
            ]
        else:
            cat_var = [
                col[0] for col in self._data_frame.dtypes if col[1] == 'string'
            ]
        # NOTE(review): raises ValueError for a numeric (regression) target
        # that is not in cat_var — same as the original code.
        cat_var.remove(target_dimension[0])
        si_var_dict = {
            key: value
            for key, value in sort_dict.items() if key in cat_var
        }
        # Keep categoricals while the *running* cumulative importance
        # (including the current value) stays below 0.8.
        threshold = 0
        si_var_thresh = {}
        for key, value in si_var_dict.items():
            threshold = threshold + value
            if threshold < 0.8:
                si_var_thresh[key] = value
        return feat_imp_dict, si_var_thresh

    def _generate_narratives(self):
        """Build the "Key Influencers" main card for every target dimension.

        For each target in the chi-square results this renders the main
        narrative card (template text, feature-importance bar chart and a
        statistical-inference line), then delegates one drill-down card per
        significant analysed dimension to ChiSquareAnalysis.
        """
        for target_dimension in list(self._df_chisquare_result.keys()):
            target_chisquare_result = self._df_chisquare_result[
                target_dimension]
            analysed_variables = list(
                target_chisquare_result.keys())  ## List of all analyzed var.
            # Significant variables sorted by effect size (descending),
            # keeping only those whose rounded effect size is positive.
            effect_size_dict = self.new_effect_size
            significant_variables = list(self.signi_dict.keys())
            effect_sizes = list(self.signi_dict.values())
            significant_variables = [
                y
                for (x, y) in sorted(zip(effect_sizes, significant_variables),
                                     reverse=True) if round(float(x), 2) > 0
            ]

            num_analysed_variables = len(analysed_variables)
            num_significant_variables = len(significant_variables)
            self.narratives['main_card'] = {}
            self.narratives['main_card'][
                'heading'] = 'Relationship between ' + target_dimension + ' and other factors'
            self.narratives['main_card']['paragraphs'] = {}
            data_dict = {
                'num_variables': num_analysed_variables,
                'num_significant_variables': num_significant_variables,
                'significant_variables': significant_variables,
                'target': target_dimension,
                'analysed_dimensions': analysed_variables,
                'blockSplitter': self._blockSplitter
            }  # shared by both template renders below
            paragraph = {}
            paragraph['header'] = ''

            paragraph['content'] = NarrativesUtils.get_template_output(
                self._base_dir, 'main_card.html', data_dict)
            self.narratives['main_card']['paragraphs'] = [paragraph]
            self.narratives['cards'] = []
            chart = {
                'header':
                'Strength of association between ' + target_dimension +
                ' and other dimensions'
            }
            chart['data'] = effect_size_dict
            chart['label_text'] = {
                'x': 'Dimensions',
                'y': 'Feature Importance'
            }

            # Round the chart data for the key-drivers tab.
            chart_data = []
            chartDataValues = []
            for k, v in list(effect_size_dict.items()):
                if round(float(v), 2) > 0:
                    chart_data.append({
                        "Attribute": k,
                        "Effect_Size": round(float(v), 2)
                    })
                    chartDataValues.append(round(float(v), 2))
            chart_data = sorted(chart_data,
                                key=lambda x: x["Effect_Size"],
                                reverse=True)
            chart_json = ChartJson()
            chart_json.set_data(chart_data)
            chart_json.set_chart_type("bar")
            chart_json.set_label_text({'x': '  ', 'y': 'Feature Importance'})
            chart_json.set_axis_rotation(True)
            chart_json.set_axes({"x": "Attribute", "y": "Feature Importance"})
            chart_json.set_yaxis_number_format(".2f")
            self.narratives['main_card']['chart'] = chart

            main_card = NormalCard()
            header = "<h3>Key Factors that drive " + target_dimension + "</h3>"
            main_card_data = [HtmlData(data=header)]
            main_card_narrative = NarrativesUtils.get_template_output(
                self._base_dir, 'main_card.html', data_dict)
            main_card_narrative = NarrativesUtils.block_splitter(
                main_card_narrative, self._blockSplitter)
            main_card_data += main_card_narrative
            if len(chart_data) > 0:
                statistical_info_array = [
                    ("Test Type", "Chi-Square"),
                    ("Effect Size", "Cramer's V"),
                    ("Max Effect Size", chart_data[0]["Attribute"]),
                    ("Min Effect Size", chart_data[-1]["Attribute"]),
                ]
                # BUG FIX: was misspelt "statistical_inferenc", so the
                # variable tested below was never explicitly initialized.
                statistical_inference = ""
                if len(chart_data) == 1:
                    statistical_inference = "{} is the only variable that have significant association with the {} (Target) having an \
                     Effect size of {}".format(
                        chart_data[0]["Attribute"],
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["Effect_Size"], 4))
                elif len(chart_data) == 2:
                    statistical_inference = "There are two variables ({} and {}) that have significant association with the {} (Target) and the \
                     Effect size ranges are {} and {} respectively".format(
                        chart_data[0]["Attribute"], chart_data[1]["Attribute"],
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["Effect_Size"], 4),
                        round(chart_data[1]["Effect_Size"], 4))
                else:
                    statistical_inference = "There are {} variables that have significant association with the {} (Target) and the \
                     Effect size ranges from {} to {}".format(
                        len(chart_data),
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["Effect_Size"], 4),
                        round(chart_data[-1]["Effect_Size"], 4))
                if statistical_inference != "":
                    statistical_info_array.append(
                        ("Inference", statistical_inference))
                statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                    statistical_info_array)
            else:
                statistical_info_array = []
            main_card_data.append(
                C3ChartData(data=chart_json, info=statistical_info_array))
            main_card.set_card_data(main_card_data)
            main_card.set_card_name("Key Influencers")

            if self._storyOnScoredData != True:
                self._chiSquareNode.add_a_card(main_card)
                self._result_setter.add_a_score_chi_card(main_card)

            print("target_dimension", target_dimension)
            # BUG FIX: nColsToUse_temp was unbound when the appid-2 trimming
            # branch ran, raising NameError at the slice below. None slices
            # to the full (already trimmed) list.
            nColsToUse_temp = None
            if self._appid == '2' and num_significant_variables > 5:
                significant_variables = significant_variables[:5]
            else:
                if self._nColsToUse != None:
                    significant_variables = significant_variables[:self.
                                                                  _nColsToUse]
                    nColsToUse_temp = self._nColsToUse
                else:
                    nColsToUse_temp = self._noOfSigDimsToShow

            CommonUtils.create_update_and_save_progress_message(
                self._dataframe_context,
                self._scriptWeightDict,
                self._scriptStages,
                self._analysisName,
                "custom",
                "info",
                display=True,
                customMsg="Analyzing key drivers",
                weightKey="narratives")
            # One drill-down card per significant dimension; app ids "1"/"2"
            # publish only the first card via the result setter, the default
            # path attaches the full dimension node to the story tree.
            for analysed_dimension in significant_variables[:nColsToUse_temp]:
                chisquare_result = self._df_chisquare.get_chisquare_result(
                    target_dimension, analysed_dimension)
                if self._appid == '2':
                    print("APPID 2 is used")
                    card = ChiSquareAnalysis(
                        self._dataframe_context, self._dataframe_helper,
                        chisquare_result, target_dimension, analysed_dimension,
                        significant_variables, num_analysed_variables,
                        self._data_frame, self._measure_columns,
                        self._base_dir, None, target_chisquare_result)
                    self._result_setter.add_a_score_chi_card(
                        json.loads(
                            CommonUtils.convert_python_object_to_json(
                                card.get_dimension_card1())))

                elif self._appid == '1':
                    print("APPID 1 is used")
                    card = ChiSquareAnalysis(
                        self._dataframe_context, self._dataframe_helper,
                        chisquare_result, target_dimension, analysed_dimension,
                        significant_variables, num_analysed_variables,
                        self._data_frame, self._measure_columns,
                        self._base_dir, None, target_chisquare_result)
                    self._result_setter.add_a_score_chi_card(
                        json.loads(
                            CommonUtils.convert_python_object_to_json(
                                card.get_dimension_card1())))
                else:
                    target_dimension_card = ChiSquareAnalysis(
                        self._dataframe_context, self._dataframe_helper,
                        chisquare_result, target_dimension, analysed_dimension,
                        significant_variables, num_analysed_variables,
                        self._data_frame, self._measure_columns,
                        self._base_dir, None, target_chisquare_result)
                    self.narratives['cards'].append(target_dimension_card)
                    self._chiSquareNode.add_a_node(
                        target_dimension_card.get_dimension_node())
        self._story_narrative.add_a_node(self._chiSquareNode)
        self._result_setter.set_chisquare_node(self._chiSquareNode)
# コード例 #10 (code example #10 — scraped page marker)
# 0
class ChiSquareAnalysis(object):
    def __init__(self,
                 df_context,
                 df_helper,
                 chisquare_result,
                 target_dimension,
                 analysed_dimension,
                 significant_variables,
                 num_analysed_variables,
                 data_frame,
                 measure_columns,
                 base_dir,
                 appid=None,
                 target_chisquare_result=None):
        """Build chi-square narrative cards for one target/analysed pair.

        Stores the analysis context, derives the second-level drill-down
        dimensions and the binned-column flags, then generates the cards:
        app ids "1"/"2" keep only the primary card, the default path also
        attaches the per-target cards.
        """
        self._blockSplitter = "|~NEWBLOCK~|"
        self._highlightFlag = "|~HIGHLIGHT~|"
        self._dimensionNode = NarrativesTree()
        self._dimensionNode.set_name(target_dimension)
        self._data_frame = data_frame
        self._dataframe_context = df_context
        self._pandas_flag = df_context._pandas_flag
        self._dataframe_helper = df_helper
        self._chisquare_result = chisquare_result
        self._target_dimension = target_dimension
        self._analysed_dimension = analysed_dimension
        self._significant_variables = significant_variables
        self._target_chisquare_result = target_chisquare_result
        # NOTE(review): the measure_columns argument is ignored; numeric
        # columns are re-read from the helper — confirm this is intended.
        self._measure_columns = self._dataframe_helper.get_numeric_columns()
        self._chiSquareLevelLimit = GLOBALSETTINGS.CHISQUARELEVELLIMIT

        self._num_analysed_variables = num_analysed_variables
        self._chiSquareTable = chisquare_result.get_contingency_table()

        # Second-level drill-down uses the *other* significant variables:
        # at most 3 of them when there are <= 20, otherwise at most 5.
        other_significant = list(
            set(significant_variables) - {analysed_dimension})
        cap = 3 if len(other_significant) <= 20 else 5
        self._second_level_dimensions = list(other_significant)[:cap]

        print(self._second_level_dimensions)

        self._appid = appid
        self._card1 = NormalCard()
        self._targetCards = []
        self._base_dir = base_dir

        self._binTargetCol = False
        self._binAnalyzedCol = False
        print("--------Chi-Square Narratives for ", analysed_dimension,
              "---------")
        if self._dataframe_context.get_custom_analysis_details() is not None:
            binnedColObj = [
                x["colName"]
                for x in self._dataframe_context.get_custom_analysis_details()
            ]
            print("analysed_dimension : ", self._analysed_dimension)
            if self._target_dimension in binnedColObj:
                self._binTargetCol = True
            if (self._analysed_dimension in binnedColObj
                    or self._analysed_dimension in self._measure_columns):
                self._binAnalyzedCol = True

        if self._appid is None:
            self._generate_narratives()
            self._dimensionNode.add_cards([self._card1] + self._targetCards)
            self._dimensionNode.set_name("{}".format(analysed_dimension))
        elif self._appid in ("1", "2"):
            # App flavours keep only the primary card.
            self._generate_narratives()
            self._dimensionNode.add_cards([self._card1])
            self._dimensionNode.set_name("{}".format(analysed_dimension))

    def get_dimension_node(self):
        """Return this dimension's narrative tree as a plain JSON dict."""
        serialized = CommonUtils.convert_python_object_to_json(
            self._dimensionNode)
        return json.loads(serialized)

    def get_dimension_card1(self):
        return self._card1

    def _generate_narratives(self):
        chisquare_result = self._chisquare_result
        target_dimension = self._target_dimension
        analysed_dimension = self._analysed_dimension
        significant_variables = self._significant_variables
        num_analysed_variables = self._num_analysed_variables
        table = self._chiSquareTable
        total = self._chiSquareTable.get_total()

        levels = self._chiSquareTable.get_column_two_levels()
        level_counts = self._chiSquareTable.get_column_total()
        levels_count_sum = sum(level_counts)
        levels_percentages = [
            old_div(i * 100.0, levels_count_sum) for i in level_counts
        ]
        sorted_levels = sorted(zip(level_counts, levels), reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]

        top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        bottom_dim = sorted_levels[-1][1]
        bottom_dim_contribution = sorted_levels[-1][0]
        bottom_dims = [
            y for x, y in sorted_levels if x == bottom_dim_contribution
        ]

        target_levels = self._chiSquareTable.get_column_one_levels()

        target_counts = self._chiSquareTable.get_row_total()
        sorted_target_levels = sorted(zip(target_counts, target_levels),
                                      reverse=True)

        top_target_count, top_target = sorted_target_levels[0]
        second_target_count, second_target = sorted_target_levels[1]

        top_target_contributions = [
            table.get_value(top_target, i) for i in levels
        ]
        sum_top_target = sum(top_target_contributions)

        sorted_levels = sorted(zip(top_target_contributions, levels),
                               reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]
        top_target_top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        top_target_top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        top_target_bottom_dim = sorted_levels[-1][1]
        top_target_bottom_dim_contribution = sorted_levels[-1][0]

        top_target_percentages = [
            old_div(i * 100.0, sum_top_target)
            for i in top_target_contributions
        ]
        best_top_target_index = top_target_contributions.index(
            max(top_target_contributions))
        worst_top_target_index = top_target_contributions.index(
            min(top_target_contributions))
        top_target_differences = [
            x - y for x, y in zip(levels_percentages, top_target_percentages)
        ]
        if len(top_target_differences) > 6:
            tops = 2
            bottoms = -2
        elif len(top_target_differences) > 4:
            tops = 2
            bottoms = -1
        else:
            tops = 1
            bottoms = -1
        sorted_ = sorted(enumerate(top_target_differences),
                         key=lambda x: x[1],
                         reverse=True)
        best_top_difference_indices = [x for x, y in sorted_[:tops]]
        worst_top_difference_indices = [x for x, y in sorted_[bottoms:]]

        top_target_shares = [
            old_div(x * 100.0, y)
            for x, y in zip(top_target_contributions, level_counts)
        ]
        max_top_target_shares = max(top_target_shares)
        best_top_target_share_index = [
            idx for idx, val in enumerate(top_target_shares)
            if val == max_top_target_shares
        ]
        level_counts_threshold = old_div(
            sum(level_counts) * 0.05, len(level_counts))
        min_top_target_shares = min([
            x for x, y in zip(top_target_shares, level_counts)
            if y >= level_counts_threshold
        ])
        if max_top_target_shares == min_top_target_shares:
            worst_top_target_share_index = []
        else:
            worst_top_target_share_index = [
                idx for idx, val in enumerate(top_target_shares)
                if val == min_top_target_shares
            ]
        overall_top_percentage = old_div(sum_top_target * 100.0, total)

        second_target_contributions = [
            table.get_value(second_target, i) for i in levels
        ]
        sum_second_target = sum(second_target_contributions)

        sorted_levels = sorted(zip(second_target_contributions, levels),
                               reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]
        second_target_top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        second_target_top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        second_target_bottom_dim = sorted_levels[-1][1]
        second_target_bottom_dim_contribution = sorted_levels[-1][0]

        second_target_percentages = [
            old_div(i * 100.0, sum_second_target)
            for i in second_target_contributions
        ]
        best_second_target_index = second_target_contributions.index(
            max(second_target_contributions))
        worst_second_target_index = second_target_contributions.index(
            min(second_target_contributions))
        second_target_differences = [
            x - y
            for x, y in zip(levels_percentages, second_target_percentages)
        ]
        if len(second_target_differences) > 6:
            tops = 2
            bottoms = -2
        elif len(second_target_differences) > 4:
            tops = 2
            bottoms = -1
        else:
            tops = 1
            bottoms = -1
        sorted_ = sorted(enumerate(second_target_differences),
                         key=lambda x: x[1],
                         reverse=True)
        best_second_difference_indices = [x for x, y in sorted_[:tops]]
        worst_second_difference_indices = [x for x, y in sorted_[bottoms:]]

        second_target_shares = [
            old_div(x * 100.0, y)
            for x, y in zip(second_target_contributions, level_counts)
        ]
        max_second_target_shares = max(second_target_shares)
        best_second_target_share_index = [
            idx for idx, val in enumerate(second_target_shares)
            if val == max_second_target_shares
        ]
        level_counts_threshold = old_div(
            sum(level_counts) * 0.05, len(level_counts))
        if min(second_target_shares) == 0:
            min_second_target_shares = min([
                x for x, y in zip(second_target_shares, level_counts) if x != 0
            ])
        else:
            min_second_target_shares = min([
                x for x, y in zip(second_target_shares, level_counts)
                if y >= level_counts_threshold
            ])
        # worst_second_target_share_index = second_target_shares.index(min_second_target_shares)
        if max_second_target_shares == min_second_target_shares:
            worst_second_target_share_index = []
        else:
            worst_second_target_share_index = [
                idx for idx, val in enumerate(second_target_shares)
                if val == min_second_target_shares
            ]
        overall_second_percentage = old_div(sum_second_target * 100.0, total)

        targetCardDataDict = {}
        targetCardDataDict['target'] = target_dimension
        targetCardDataDict['colname'] = analysed_dimension
        targetCardDataDict['num_significant'] = len(significant_variables)
        targetCardDataDict['plural_colname'] = NarrativesUtils.pluralize(
            analysed_dimension)

        targetCardDataDict["blockSplitter"] = self._blockSplitter
        targetCardDataDict["binTargetCol"] = self._binTargetCol
        targetCardDataDict["binAnalyzedCol"] = self._binAnalyzedCol
        targetCardDataDict['highlightFlag'] = self._highlightFlag
        targetCardDataDict['levels'] = levels

        data_dict = {}
        data_dict[
            'best_second_difference'] = best_second_difference_indices  ##these changed
        data_dict['worst_second_difference'] = worst_second_difference_indices
        data_dict['best_top_difference'] = best_top_difference_indices
        data_dict['worst_top_difference'] = worst_top_difference_indices
        data_dict['levels_percentages'] = levels_percentages
        data_dict['top_target_percentages'] = top_target_percentages
        data_dict['second_target_percentages'] = second_target_percentages
        data_dict['levels'] = levels
        data_dict['best_top_share'] = best_top_target_share_index
        data_dict['worst_top_share'] = worst_top_target_share_index
        data_dict['best_second_share'] = best_second_target_share_index
        data_dict['worst_second_share'] = worst_second_target_share_index
        data_dict['top_target_shares'] = top_target_shares
        data_dict['second_target_shares'] = second_target_shares
        data_dict['overall_second'] = overall_second_percentage
        data_dict['overall_top'] = overall_top_percentage

        data_dict['num_significant'] = len(significant_variables)
        data_dict['colname'] = analysed_dimension
        data_dict['plural_colname'] = NarrativesUtils.pluralize(
            analysed_dimension)
        data_dict['target'] = target_dimension
        data_dict['top_levels'] = top_dims
        data_dict['top_levels_percent'] = round(
            old_div(top_dims_contribution * 100.0, total), 1)
        data_dict['bottom_level'] = bottom_dim
        data_dict['bottom_levels'] = bottom_dims
        data_dict['bottom_level_percent'] = round(
            old_div(bottom_dim_contribution * 100, sum(level_counts)), 2)
        data_dict['second_target'] = second_target
        data_dict['second_target_top_dims'] = second_target_top_dims
        data_dict['second_target_top_dims_contribution'] = old_div(
            second_target_top_dims_contribution * 100.0,
            sum(second_target_contributions))
        data_dict['second_target_bottom_dim'] = second_target_bottom_dim
        data_dict[
            'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution
        data_dict['best_second_target'] = levels[best_second_target_index]
        data_dict['best_second_target_count'] = second_target_contributions[
            best_second_target_index]
        data_dict['best_second_target_percent'] = round(
            old_div(
                second_target_contributions[best_second_target_index] * 100.0,
                sum(second_target_contributions)), 2)
        data_dict['worst_second_target'] = levels[worst_second_target_index]
        data_dict['worst_second_target_percent'] = round(
            old_div(
                second_target_contributions[worst_second_target_index] * 100.0,
                sum(second_target_contributions)), 2)

        data_dict['top_target'] = top_target
        data_dict['top_target_top_dims'] = top_target_top_dims
        data_dict['top_target_top_dims_contribution'] = old_div(
            top_target_top_dims_contribution * 100.0,
            sum(top_target_contributions))
        data_dict['top_target_bottom_dim'] = top_target_bottom_dim
        data_dict[
            'top_target_bottom_dim_contribution'] = top_target_bottom_dim_contribution
        data_dict['best_top_target'] = levels[best_top_target_index]
        data_dict['best_top_target_count'] = top_target_contributions[
            best_top_target_index]
        data_dict['best_top_target_percent'] = round(
            old_div(top_target_contributions[best_top_target_index] * 100.0,
                    sum(top_target_contributions)), 2)
        data_dict['worst_top_target'] = levels[worst_top_target_index]
        data_dict['worst_top_target_percent'] = round(
            old_div(top_target_contributions[worst_top_target_index] * 100.0,
                    sum(top_target_contributions)), 2)

        data_dict["blockSplitter"] = self._blockSplitter
        data_dict["binTargetCol"] = self._binTargetCol
        data_dict["binAnalyzedCol"] = self._binAnalyzedCol
        data_dict['highlightFlag'] = self._highlightFlag

        # print "_"*60
        # print "DATA DICT - ", data_dict
        # print "_"*60

        ###############
        #     CARD1   #
        ###############

        print("self._binTargetCol & self._binAnalyzedCol : ",
              self._binTargetCol, self._binAnalyzedCol)
        if len(data_dict['worst_second_share']) == 0:
            output = NarrativesUtils.block_splitter(
                NarrativesUtils.get_template_output(
                    self._base_dir, 'card1_binned_target_worst_second.html',
                    data_dict),
                self._blockSplitter,
                highlightFlag=self._highlightFlag)
        else:
            if (self._binTargetCol == True & self._binAnalyzedCol == False):
                print("Only Target Column is Binned, : ", self._binTargetCol)
                output = NarrativesUtils.block_splitter(
                    NarrativesUtils.get_template_output(
                        self._base_dir, 'card1_binned_target.html', data_dict),
                    self._blockSplitter,
                    highlightFlag=self._highlightFlag)
            elif (self._binTargetCol == True & self._binAnalyzedCol == True):
                print("Target Column and IV is Binned : ", self._binTargetCol,
                      self._binAnalyzedCol)
                output = NarrativesUtils.block_splitter(
                    NarrativesUtils.get_template_output(
                        self._base_dir, 'card1_binned_target_and_IV.html',
                        data_dict),
                    self._blockSplitter,
                    highlightFlag=self._highlightFlag)
            else:
                output = NarrativesUtils.block_splitter(
                    NarrativesUtils.get_template_output(
                        self._base_dir, 'card1.html', data_dict),
                    self._blockSplitter,
                    highlightFlag=self._highlightFlag)

        targetDimCard1Data = []
        targetDimcard1Heading = '<h3>Impact of ' + self._analysed_dimension + '  on ' + self._target_dimension + "</h3>"

        toggledata = ToggleData()

        targetDimTable1Data = self.generate_card1_table1()
        targetDimCard1Table1 = TableData()
        targetDimCard1Table1.set_table_type("heatMap")
        targetDimCard1Table1.set_table_data(targetDimTable1Data)
        toggledata.set_toggleon_data({
            "data": {
                "tableData": targetDimTable1Data,
                "tableType": "heatMap"
            },
            "dataType": "table"
        })

        targetDimTable2Data = self.generate_card1_table2()
        targetDimCard1Table2 = TableData()
        targetDimCard1Table2.set_table_type("normal")
        table2Data = targetDimTable2Data["data1"]
        table2Data = [
            innerList[1:] for innerList in table2Data
            if innerList[0].strip() != ""
        ]
        targetDimCard1Table2.set_table_data(table2Data)

        toggledata.set_toggleoff_data({
            "data": {
                "tableData": table2Data,
                "tableType": "heatMap"
            },
            "dataType": "table"
        })

        targetDimCard1Data.append(HtmlData(data=targetDimcard1Heading))
        targetDimCard1Data.append(toggledata)
        targetDimCard1Data += output

        self._card1.set_card_data(targetDimCard1Data)
        self._card1.set_card_name("{}: Relationship with {}".format(
            self._analysed_dimension, self._target_dimension))

        ###############
        #     CARD2   #
        ###############

        if self._appid == None:

            key_factors = ''
            num_key_factors = len(self._second_level_dimensions)

            if len(self._second_level_dimensions) == 5:
                key_factors = ', '.join(
                    self._second_level_dimensions[:4]
                ) + ' and ' + self._second_level_dimensions[4]
            elif len(self._second_level_dimensions) == 4:
                key_factors = ', '.join(
                    self._second_level_dimensions[:3]
                ) + ' and ' + self._second_level_dimensions[3]
            elif len(self._second_level_dimensions) == 3:
                key_factors = ', '.join(
                    self._second_level_dimensions[:2]
                ) + ' and ' + self._second_level_dimensions[2]
            elif len(self._second_level_dimensions) == 2:
                key_factors = ' and '.join(self._second_level_dimensions)
            elif len(self._second_level_dimensions) == 1:
                key_factors = self._second_level_dimensions[0]

            targetCardDataDict['num_key_factors'] = num_key_factors
            targetCardDataDict['key_factors'] = key_factors
            dict_for_test = {}
            for tupleObj in sorted_target_levels[:self._chiSquareLevelLimit]:
                targetLevel = tupleObj[1]

                targetCardDataDict['random_card2'] = random.randint(1, 100)
                targetCardDataDict['random_card4'] = random.randint(1, 100)

                second_target_contributions = [
                    table.get_value(targetLevel, i) for i in levels
                ]
                sum_second_target = sum(second_target_contributions)

                sorted_levels = sorted(zip(second_target_contributions,
                                           levels),
                                       reverse=True)

                level_differences = [0.0] + [
                    sorted_levels[i][0] - sorted_levels[i + 1][0]
                    for i in range(len(sorted_levels) - 1)
                ]
                level_diff_index = level_differences.index(
                    max(level_differences)) if level_differences.index(
                        max(level_differences)) > 0 else len(
                            level_differences
                        )  ##added for pipeline keyerror issue
                second_target_top_dims = [
                    j for i, j in sorted_levels[:level_diff_index]
                ]
                second_target_top_dims_contribution = sum([
                    i for i, j in sorted_levels[:level_differences.
                                                index(max(level_differences))]
                ])
                second_target_bottom_dim = sorted_levels[-1][1]
                second_target_bottom_dim_contribution = sorted_levels[-1][0]

                second_target_percentages = [
                    old_div(i * 100.0, sum_second_target)
                    for i in second_target_contributions
                ]
                best_second_target_index = second_target_contributions.index(
                    max(second_target_contributions))
                worst_second_target_index = second_target_contributions.index(
                    min(second_target_contributions))
                second_target_differences = [
                    x - y for x, y in zip(levels_percentages,
                                          second_target_percentages)
                ]
                if len(second_target_differences) > 6:
                    tops = 2
                    bottoms = -2
                elif len(second_target_differences) > 4:
                    tops = 2
                    bottoms = -1
                else:
                    tops = 1
                    bottoms = -1
                sorted_ = sorted(enumerate(second_target_differences),
                                 key=lambda x: x[1],
                                 reverse=True)
                best_second_difference_indices = [x for x, y in sorted_[:tops]]
                worst_second_difference_indices = [
                    x for x, y in sorted_[bottoms:]
                ]

                second_target_shares = [
                    old_div(x * 100.0, y)
                    for x, y in zip(second_target_contributions, level_counts)
                ]
                max_second_target_shares = max(second_target_shares)
                best_second_target_share_index = [
                    idx for idx, val in enumerate(second_target_shares)
                    if val == max_second_target_shares
                ]
                level_counts_threshold = old_div(
                    sum(level_counts) * 0.05, len(level_counts))
                min_second_target_shares = min([
                    x for x, y in zip(second_target_shares, level_counts)
                    if y >= level_counts_threshold
                ])
                worst_second_target_share_index = [
                    idx for idx, val in enumerate(second_target_shares)
                    if val == min_second_target_shares
                ]
                overall_second_percentage = old_div(sum_second_target * 100.0,
                                                    total)

                # DataFrame for contribution calculation
                if self._pandas_flag:
                    df_second_target = self._data_frame[(
                        self._data_frame[self._target_dimension] == targetLevel
                    ) & (self._data_frame[self._analysed_dimension] ==
                         second_target_top_dims[0])][
                             self._second_level_dimensions]
                    df_second_dim = self._data_frame[(
                        self._data_frame[self._analysed_dimension] ==
                        second_target_top_dims[0]
                    )][self._second_level_dimensions]
                else:
                    df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                                            filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                                            select(self._second_level_dimensions).toPandas()
                    df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                                        select(self._second_level_dimensions).toPandas()

                # if self._chisquare_result.get_splits():
                #     splits = self._chisquare_result.get_splits()
                #     idx = self._chiSquareTable.get_bin_names(splits).index(second_target_top_dims[0])
                #     idx1 = self._chiSquareTable.get_bin_names(splits).index(top_target_top_dims[0])
                #     splits[len(splits)-1] = splits[len(splits)-1]+1
                #     df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                #                         filter(col(self._analysed_dimension)>=splits[idx]).filter(col(self._analysed_dimension)<splits[idx+1]).\
                #                         select(self._second_level_dimensions).toPandas()
                #     df_second_dim = self._data_frame.filter(col(self._analysed_dimension)>=splits[idx]).\
                #                     filter(col(self._analysed_dimension)<splits[idx+1]).\
                #                     select(self._second_level_dimensions).toPandas()
                # else:
                #     df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                #                         filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                #                         select(self._second_level_dimensions).toPandas()
                #     df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                #                     select(self._second_level_dimensions).toPandas()

                # print self._data_frame.select('Sales').show()

                distribution_second = []
                d_l = []
                for d in self._second_level_dimensions:
                    grouped = df_second_target.groupby(d).agg({d: 'count'})
                    contributions = df_second_dim.groupby(d).agg({d: 'count'})
                    contribution_index = list(contributions.index)
                    contributions_val = contributions[d].tolist()
                    contributions_list = dict(
                        list(zip(contribution_index, contributions_val)))
                    index_list = list(grouped.index)
                    grouped_list = grouped[d].tolist()
                    contributions_percent_list = [
                        round(old_div(y * 100.0, contributions_list[x]), 2)
                        for x, y in zip(index_list, grouped_list)
                    ]
                    sum_ = grouped[d].sum()
                    diffs = [0] + [
                        grouped_list[i] - grouped_list[i + 1]
                        for i in range(len(grouped_list) - 1)
                    ]
                    max_diff = diffs.index(max(diffs))
                    grouped_dict = dict(list(zip(index_list, grouped_list)))

                    for val in contribution_index:
                        if val not in list(grouped_dict.keys()):
                            grouped_dict[val] = 0
                        else:
                            pass

                    index_list = []
                    grouped_list = []
                    contributions_val = []

                    for key in list(grouped_dict.keys()):
                        index_list.append(str(key))
                        grouped_list.append(grouped_dict[key])
                        contributions_val.append(contributions_list[key])
                    '''
                    print "="*70
                    print "GROUPED - ", grouped
                    print "INDEX LIST - ", index_list
                    print "GROUPED LIST - ", grouped_list
                    print "GROUPED DICT - ", grouped_dict
                    print "CONTRIBUTIONS - ", contributions
                    print "CONTRIBUTION INDEX - ", contribution_index
                    print "CONTRIBUTIONS VAL - ", contributions_val
                    print "CONTRIBUTIONS LIST - ", contributions_list
                    print "CONTRIBUTIONS PERCENT LIST - ", contributions_percent_list
                    print "SUM - ", sum_
                    print "DIFFS - ", diffs
                    print "MAX DIFF - ", max_diff
                    print "="*70
                    '''

                    informative_dict = {
                        "levels": index_list,
                        "positive_class_contribution": grouped_list,
                        "positive_plus_others": contributions_val
                    }

                    informative_df = pd.DataFrame(informative_dict)
                    informative_df["percentage_horizontal"] = old_div(
                        informative_df["positive_class_contribution"] * 100,
                        informative_df["positive_plus_others"])
                    informative_df["percentage_vertical"] = old_div(
                        informative_df["positive_class_contribution"] * 100,
                        sum_)
                    informative_df.sort_values(["percentage_vertical"],
                                               inplace=True,
                                               ascending=False)
                    informative_df = informative_df.reset_index(drop=True)

                    percentage_vertical_sorted = list(
                        informative_df["percentage_vertical"])
                    percentage_horizontal_sorted = list(
                        informative_df["percentage_horizontal"])
                    levels_sorted = list(informative_df["levels"])

                    differences_list = []
                    for i in range(1, len(percentage_vertical_sorted)):
                        difference = percentage_vertical_sorted[
                            i - 1] - percentage_vertical_sorted[i]
                        differences_list.append(round(difference, 2))
                    '''
                    print "-"*70
                    print "DIFFERENCES LIST - ", differences_list
                    print "-"*70
                    '''

                    index_txt = ''
                    if differences_list:
                        if differences_list[0] >= 30:
                            print("showing 1st case")
                            index_txt = levels_sorted[0]
                            max_diff_equivalent = 1
                        else:
                            if len(differences_list) >= 2:
                                if differences_list[1] >= 10:
                                    print("showing 1st and 2nd case")
                                    index_txt = levels_sorted[0] + '(' + str(
                                        round(percentage_vertical_sorted[0], 1)
                                    ) + '%)' + ' and ' + levels_sorted[
                                        1] + '(' + str(
                                            round(
                                                percentage_vertical_sorted[1],
                                                1)) + '%)'
                                    max_diff_equivalent = 2
                                else:
                                    print("showing 3rd case")
                                    index_txt = 'including ' + levels_sorted[
                                        0] + '(' + str(
                                            round(
                                                percentage_vertical_sorted[0],
                                                1)
                                        ) + '%)' + ' and ' + levels_sorted[
                                            1] + '(' + str(
                                                round(
                                                    percentage_vertical_sorted[
                                                        1], 1)) + '%)'
                                    max_diff_equivalent = 3
                            else:
                                print("showing 3rd case")
                                index_txt = 'including ' + levels_sorted[
                                    0] + '(' + str(
                                        round(percentage_vertical_sorted[0], 1)
                                    ) + '%)' + ' and ' + levels_sorted[
                                        1] + '(' + str(
                                            round(
                                                percentage_vertical_sorted[1],
                                                1)) + '%)'
                                max_diff_equivalent = 3

                    else:
                        max_diff_equivalent = 0
                    '''
                    print "-"*70
                    print informative_df.head(25)
                    print "-"*70
                    '''

                    distribution_second.append({
                        'contributions': [
                            round(i, 2) for i in
                            percentage_vertical_sorted[:max_diff_equivalent]
                        ],
                        'levels':
                        levels_sorted[:max_diff_equivalent],
                        'variation':
                        random.randint(1, 100),
                        'index_txt':
                        index_txt,
                        'd':
                        d,
                        'contributions_percent':
                        percentage_horizontal_sorted
                    })
                '''
                  print "DISTRIBUTION SECOND - ", distribution_second
                  print "<>"*50
                  '''
                targetCardDataDict['distribution_second'] = distribution_second
                targetCardDataDict['second_target'] = targetLevel
                targetCardDataDict[
                    'second_target_top_dims'] = second_target_top_dims
                targetCardDataDict[
                    'second_target_top_dims_contribution'] = old_div(
                        second_target_top_dims_contribution * 100.0,
                        sum(second_target_contributions))
                targetCardDataDict[
                    'second_target_bottom_dim'] = second_target_bottom_dim
                targetCardDataDict[
                    'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution
                targetCardDataDict['best_second_target'] = levels[
                    best_second_target_index]
                targetCardDataDict[
                    'best_second_target_count'] = second_target_contributions[
                        best_second_target_index]
                targetCardDataDict['best_second_target_percent'] = round(
                    old_div(
                        second_target_contributions[best_second_target_index] *
                        100.0, sum(second_target_contributions)), 2)
                targetCardDataDict['worst_second_target'] = levels[
                    worst_second_target_index]
                targetCardDataDict['worst_second_target_percent'] = round(
                    old_div(
                        second_target_contributions[worst_second_target_index]
                        * 100.0, sum(second_target_contributions)), 2)

                card2Data = []
                targetLevelContributions = [
                    table.get_value(targetLevel, i) for i in levels
                ]
                impact_target_thershold = old_div(
                    sum(targetLevelContributions) * 0.02,
                    len(targetLevelContributions))
                card2Heading = '<h3>Key Drivers of ' + self._target_dimension + ' (' + targetLevel + ')' + "</h3>"
                chart, bubble = self.generate_distribution_card_chart(
                    targetLevel, targetLevelContributions, levels,
                    level_counts, total, impact_target_thershold)
                card2ChartData = NormalChartData(data=chart["data"])
                "rounding the chartdata values for key drivers tab inside table percentage(table data)"
                for d in card2ChartData.get_data():
                    d['percentage'] = round(d['percentage'], 2)
                    d_l.append(d)
                card2ChartJson = ChartJson()
                card2ChartJson.set_data(d_l)
                card2ChartJson.set_chart_type("combination")
                card2ChartJson.set_types({
                    "total": "bar",
                    "percentage": "line"
                })
                card2ChartJson.set_legend({
                    "total": "# of " + targetLevel,
                    "percentage": "% of " + targetLevel
                })
                card2ChartJson.set_axes({
                    "x": "key",
                    "y": "total",
                    "y2": "percentage"
                })
                card2ChartJson.set_label_text({
                    "x": " ",
                    "y": "Count",
                    "y2": "Percentage"
                })
                print("self._binTargetCol & self._binAnalyzedCol : ",
                      self._binTargetCol, self._binAnalyzedCol)
                if (self._binTargetCol == True & self._binAnalyzedCol ==
                        False):
                    print("Only Target Column is Binned")
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2_binned_target.html',
                            targetCardDataDict), self._blockSplitter)
                elif (self._binTargetCol == True & self._binAnalyzedCol ==
                      True):
                    print("Target Column and IV is Binned")
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2_binned_target_and_IV.html',
                            targetCardDataDict), self._blockSplitter)
                else:
                    print("In Else, self._binTargetCol should be False : ",
                          self._binTargetCol)
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2.html', targetCardDataDict),
                        self._blockSplitter)

                card2Data.append(HtmlData(data=card2Heading))
                statistical_info_array = [
                    ("Test Type", "Chi-Square"),
                    ("Chi-Square statistic",
                     str(round(self._chisquare_result.get_stat(), 3))),
                    ("P-Value",
                     str(round(self._chisquare_result.get_pvalue(), 3))),
                    ("Inference",
                     "Chi-squared analysis shows a significant association between {} (target) and {}."
                     .format(self._target_dimension, self._analysed_dimension))
                ]
                statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                    statistical_info_array)

                card2Data.append(
                    C3ChartData(data=card2ChartJson,
                                info=statistical_info_array))
                card2Data += output2
                card2BubbleData = "<div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div><div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div>".format(
                    bubble[0]["value"], bubble[0]["text"], bubble[1]["value"],
                    bubble[1]["text"])
                card2Data.append(HtmlData(data=card2BubbleData))
                targetCard = NormalCard()
                targetCard.set_card_data(card2Data)
                targetCard.set_card_name("{} : Distribution of {}".format(
                    self._analysed_dimension, targetLevel))
                self._targetCards.append(targetCard)
                dict_for_test[targetLevel] = targetCardDataDict
        out = {'data_dict': data_dict, 'target_dict': dict_for_test}

        return out

    # def generate_card2_narratives(self):

    def generate_distribution_card_chart(self, __target,
                                         __target_contributions, levels,
                                         levels_count, total, thershold):
        """Build the combination-chart payload and the two headline "bubble"
        stats describing how a target level is distributed across the levels
        of the analysed dimension.

        :param __target: target level name (e.g. "Churn"), used in labels.
        :param __target_contributions: per-level counts for the target level.
        :param levels: dimension level names, aligned with the counts.
        :param levels_count: total row count per level (percentage denominator).
        :param total: unused here; kept for interface compatibility.
        :param thershold: minimum contribution for a level to qualify for the
            "highest rate" bubble (parameter name typo kept for callers).
        :returns: ``(chart_data, [bubble1, bubble2])`` where ``chart_data`` is
            ``{'label': ..., 'data': [{'key','total','percentage'}, ...]}``.
        """
        label = {'total': '# of ' + __target, 'percentage': '% of ' + __target}
        # Share of each level's rows that belong to the target level. The
        # numerator is a float, so plain division is identical to old_div.
        __target_percentages = [
            x * 100.0 / y
            for x, y in zip(__target_contributions, levels_count)
        ]
        chartData = [{
            "key": lvl,
            "total": cnt,
            "percentage": pct
        } for lvl, cnt, pct in zip(levels, __target_contributions,
                                   __target_percentages)]
        chart_data = {'label': label, 'data': chartData}

        # Bubble 1: which level contributes the largest share of the target.
        bubble_data1 = {}
        bubble_data1['value'] = str(
            round(
                max(__target_contributions) * 100.0 /
                sum(__target_contributions), 1)) + '%'
        m_index = __target_contributions.index(max(__target_contributions))
        bubble_data1[
            'text'] = 'Overall ' + __target + ' comes from ' + levels[m_index]

        # Bubble 2: the level with the highest target rate, considering only
        # levels whose raw contribution exceeds the threshold. Defaults guard
        # against a downstream KeyError (caller reads bubble[1]["value"])
        # when no level qualifies.
        bubble_data2 = {'value': '0%', 'text': ''}
        intial = -1
        for k, v, i in zip(__target_contributions, __target_percentages,
                           list(range(len(__target_contributions)))):
            # Strict '<' keeps the first occurrence on ties, matching the
            # original update-in-place behavior.
            if k > thershold and intial < v:
                intial = v
                bubble_data2['value'] = str(round(intial)) + '%'
                bubble_data2['text'] = levels[
                    i] + ' has the highest rate of ' + __target
        return chart_data, [bubble_data1, bubble_data2]

    def generate_card1_table1(self):
        """Assemble the heat-map table for card 1: a header row (analysed
        dimension name plus the target levels) followed by one row per
        dimension value carrying its column-percentage figures."""
        chi_table = self._chiSquareTable
        header = [self._analysed_dimension] + chi_table.get_column_one_levels()
        # First column is the dimension values; the remaining columns are the
        # per-target-level percentages. Transpose columns into rows.
        columns = [chi_table.column_two_values
                   ] + chi_table.table_percent_by_column
        body_rows = [list(row) for row in zip(*columns)]
        return [header] + body_rows

    def generate_card1_table2(self):
        """Build the detailed contingency table for card 1.

        For every level of the analysed dimension, four rows are emitted:
        raw counts (with a row total), "% within <analysed dimension>"
        (column percentages, totalling 100), "% within <target dimension>"
        (row percentages), and "% of Total" (overall percentages).

        Returns:
            dict with keys:
                header  -- hard-coded column labels (see NOTE below)
                header1 -- dynamic labels: analysed dimension, target levels, 'Total'
                data    -- list of dicts, one per emitted row, keyed by `header`
                data1   -- list-of-lists form of the same rows, led by header1
                label   -- the analysed dimension name
        """
        table = self._chiSquareTable.table
        table_percent = self._chiSquareTable.table_percent
        table_percent_by_row = self._chiSquareTable.table_percent_by_row
        table_percent_by_column = self._chiSquareTable.table_percent_by_column
        target_levels = self._chiSquareTable.get_column_one_levels()
        dim_levels = self._chiSquareTable.get_column_two_levels()

        header1 = [self._analysed_dimension] + target_levels + ['Total']
        # NOTE(review): `header` is hard-coded to a specific dataset's columns
        # and only works when there are exactly two target levels; `first_row`
        # below zips against it, so rows longer than `header` silently lose
        # trailing cells in the dict form. The existing TODO suggests replacing
        # it with `header1` -- confirm with downstream consumers of 'header'.
        header = ['State', 'Active', 'Churn', 'Total']  #TODO remove
        data = []
        data1 = [['Tag'] + header1]

        for idx, lvl in enumerate(dim_levels):
            first_row = ['Tag'] + header
            # Raw counts for this dimension level across all target levels.
            col_2_vals = list(zip(*table))[idx]
            data2 = ['bold'] + [lvl] + list(col_2_vals) + [sum(col_2_vals)]

            dict_ = dict(list(zip(first_row, data2)))
            data.append(dict_)
            data1.append(data2)

            # Column percentages: each level's share within the analysed
            # dimension; the row total is 100% by construction.
            col_2_vals = list(zip(*table_percent_by_column))[idx]
            data2 = [''] + ['As % within ' + self._analysed_dimension
                            ] + list(col_2_vals) + [100.0]
            dict_ = dict(list(zip(first_row, data2)))
            data.append(dict_)
            data1.append(data2)

            # Row percentages within the target dimension; the trailing total
            # is taken from the overall-percentage row (col_2_vals1), not from
            # the row percentages themselves.
            col_2_vals = list(zip(*table_percent_by_row))[idx]
            col_2_vals1 = list(zip(*table_percent))[idx]
            data2 = [''] + [
                'As % within ' + self._target_dimension
            ] + list(col_2_vals) + [round(sum(col_2_vals1), 2)]
            dict_ = dict(list(zip(first_row, data2)))
            data.append(dict_)
            data1.append(data2)
            # col_2_vals = zip(*table_percent)[idx]
            # Overall percentages ("% of Total") for this dimension level.
            data2 = [''] + ['As % of Total'] + list(col_2_vals1) + [
                round(sum(col_2_vals1), 2)
            ]
            dict_ = dict(list(zip(first_row, data2)))
            data.append(dict_)
            data1.append(data2)

        out = {
            'header': header,
            'header1': header1,
            'data': data,
            'label': self._analysed_dimension,
            'data1': data1
        }
        return out
Code example #11
0
    def Train(self):
        """Train a PyTorch feed-forward network for regression and publish
        the resulting summaries.

        End-to-end flow: report initialization progress, resolve the algorithm
        setting and column roles from the dataframe context, one-hot-encode the
        train/test splits, train the network for the configured number of
        epochs, compute regression metrics / MAPE bins / quantile summaries on
        the test predictions, persist the model (and best-effort PMML), then
        populate the model summary, model management views, and narrative
        nodes via the result setter, finishing with a completion progress
        message.  Returns None; all output is delivered through side effects
        on ``self._result_setter`` / ``self._model_summary`` /
        ``self._model_management``.
        """
        st_global = time.time()

        # Progress message: initialization stage.
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "initialization",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")
        appType = self._dataframe_context.get_app_type()
        algosToRun = self._dataframe_context.get_algorithms_to_run()
        # Pick the setting object matching this trainer's slug; assumes it is
        # present (IndexError otherwise).
        algoSetting = [
            x for x in algosToRun if x.get_algorithm_slug() == self._slug
        ][0]
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()
        # Drop the UID column and any date columns from the categorical set.
        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})
        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns) - set(allDateCols))
        print("CATEGORICAL COLS - ", categorical_columns)
        result_column = self._dataframe_context.get_result_column()
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        numerical_columns = [
            x for x in numerical_columns if x != result_column
        ]

        model_path = self._dataframe_context.get_model_path()
        # Strip a leading "file://" scheme to get a plain filesystem path.
        if model_path.startswith("file"):
            model_path = model_path[7:]
        validationDict = self._dataframe_context.get_validation_dict()
        print("model_path", model_path)
        pipeline_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/pipeline/"
        model_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/model"
        pmml_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/modelPmml"

        df = self._data_frame
        if self._mlEnv == "spark":
            # Spark training path not implemented for this algorithm.
            pass
        elif self._mlEnv == "sklearn":
            model_filepath = model_path + "/" + self._slug + "/model.pkl"

            x_train, x_test, y_train, y_test = self._dataframe_helper.get_train_test_data(
            )
            # One-hot-encode categoricals; fill_missing_columns aligns x_test
            # to the dummy columns produced from x_train.
            x_train = MLUtils.create_dummy_columns(
                x_train,
                [x for x in categorical_columns if x != result_column])
            x_test = MLUtils.create_dummy_columns(
                x_test, [x for x in categorical_columns if x != result_column])
            x_test = MLUtils.fill_missing_columns(x_test, x_train.columns,
                                                  result_column)

            print("=" * 150)
            print("X-Train Shape - ", x_train.shape)
            print("Y-Train Shape - ", y_train.shape)
            print("X-Test Shape - ", x_test.shape)
            print("Y-Test Shape - ", y_test.shape)
            print("~" * 50)
            print("X-Train dtype - ", type(x_train))
            print("Y-Train dtype - ", type(y_train))
            print("X-Test dtype - ", type(x_test))
            print("Y-Test dtype - ", type(y_test))
            print("~" * 50)

            # Progress message: training stage.
            CommonUtils.create_update_and_save_progress_message(
                self._dataframe_context,
                self._scriptWeightDict,
                self._scriptStages,
                self._slug,
                "training",
                "info",
                display=True,
                emptyBin=False,
                customMsg=None,
                weightKey="total")

            st = time.time()

            self._result_setter.set_hyper_parameter_results(self._slug, None)
            # NOTE(review): the metric dict fetched from algoSetting is
            # immediately overwritten by the hard-coded default on the next
            # statement -- the configured evaluation metric is ignored.
            evaluationMetricDict = algoSetting.get_evaluvation_metric(
                Type="REGRESSION")
            evaluationMetricDict = {
                "name": GLOBALSETTINGS.REGRESSION_MODEL_EVALUATION_METRIC
            }
            evaluationMetricDict[
                "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[
                    evaluationMetricDict["name"]]

            # Wrap the train/test splits as tensor datasets for DataLoaders.
            x_train_tensored, y_train_tensored, x_test_tensored, y_test_tensored = PYTORCHUTILS.get_tensored_data(
                x_train, y_train, x_test, y_test)
            trainset = torch_data_utils.TensorDataset(x_train_tensored,
                                                      y_train_tensored)
            testset = torch_data_utils.TensorDataset(x_test_tensored,
                                                     y_test_tensored)

            nnptr_params = algoSetting.get_nnptr_params_dict()[0]
            layers_for_network = PYTORCHUTILS.get_layers_for_network_module(
                nnptr_params,
                task_type="REGRESSION",
                first_layer_units=x_train.shape[1])

            # Use GPU if available
            device = torch.device(
                "cuda:0" if torch.cuda.is_available() else "cpu")
            network = PyTorchNetwork(layers_for_network).to(device)
            # NOTE(review): eval() is called before the training loop; layers
            # such as dropout/batchnorm would stay in inference mode during
            # training.  network.train() is the conventional call here --
            # confirm intent before changing.
            network.eval()

            other_params_dict = PYTORCHUTILS.get_other_pytorch_params(
                nnptr_params,
                task_type="REGRESSION",
                network_params=network.parameters())

            print("~" * 50)
            print("NNPTR-PARAMS - ", nnptr_params)
            print("~" * 50)
            print("OTHER-PARAMS-DICT - ", other_params_dict)
            print("~" * 50)
            print("NEURAL-NETWORK - ", network)
            print("~" * 50)

            criterion = other_params_dict["loss_criterion"]
            n_epochs = other_params_dict["number_of_epochs"]
            batch_size = other_params_dict["batch_size"]
            optimizer = other_params_dict["optimizer"]

            dataloader_params = {
                "batch_size": batch_size,
                "shuffle": True
                # "num_workers":
            }

            train_loader = torch_data_utils.DataLoader(trainset,
                                                       **dataloader_params)
            test_loader = torch_data_utils.DataLoader(testset,
                                                      **dataloader_params)
            '''
            Training the network;
            Batchnormalization(num_features) should be equal to units_op for that layer in training config;
            else --> RuntimeError('running_mean should contain 100 elements not 200',)
            '''

            # Standard minibatch training loop: forward, loss, backward, step.
            for epoch in range(n_epochs):
                batchwise_losses = []
                average_loss = 0.0

                for i, (inputs, labels) in enumerate(train_loader):
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    # Zero the parameter gradients
                    optimizer.zero_grad()

                    # Forward + backward + optimize
                    outputs = network(inputs.float())
                    loss = criterion(outputs, labels.float())
                    loss.backward()
                    optimizer.step()

                    average_loss += loss.item()
                    batchwise_losses.append(loss.item())

                # `i` is the last batch index from the inner loop, so i+1 is
                # the batch count for this epoch.
                average_loss_per_epoch = old_div(average_loss, (i + 1))
                print("+" * 80)
                print("EPOCH - ", epoch)
                print("BATCHWISE_LOSSES shape - ", len(batchwise_losses))
                print("AVERAGE LOSS PER EPOCH - ", average_loss_per_epoch)
                print("+" * 80)

            trainingTime = time.time() - st
            bestEstimator = network

            # Score the full test set in one forward pass; predictions are a
            # column vector, so take the first element of each row.
            outputs_x_test_tensored = network(x_test_tensored.float())
            y_score_mid = outputs_x_test_tensored.tolist()
            y_score = [x[0] for x in y_score_mid]
            print("Y-SCORE - ", y_score)
            print("Y-SCORE length - ", len(y_score))
            y_prob = None

            featureImportance = {}
            objs = {
                "trained_model": bestEstimator,
                "actual": y_test,
                "predicted": y_score,
                "probability": y_prob,
                "feature_importance": featureImportance,
                "featureList": list(x_train.columns),
                "labelMapping": {}
            }
            #featureImportance = objs["trained_model"].feature_importances_
            #featuresArray = [(col_name, featureImportance[idx]) for idx, col_name in enumerate(x_train.columns)]
            featuresArray = []
            if not algoSetting.is_hyperparameter_tuning_enabled():
                modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH -
                                         1) + "1"
                modelFilepathArr = model_filepath.split("/")[:-1]
                modelFilepathArr.append(modelName + ".pt")
                torch.save(objs["trained_model"], "/".join(modelFilepathArr))
                #joblib.dump(objs["trained_model"],"/".join(modelFilepathArr))
                runtime = round((time.time() - st), 2)
            else:
                # NOTE(review): `hyper_st` is never assigned in this method,
                # so this branch would raise NameError if hyperparameter
                # tuning were enabled.
                runtime = round((time.time() - hyper_st), 2)

            # Best-effort PMML export; a torch module is not a sklearn
            # estimator, so sklearn2pmml is expected to fail here and the
            # bare except silently skips the export.
            try:
                modelPmmlPipeline = PMMLPipeline([("pretrained-estimator",
                                                   objs["trained_model"])])
                modelPmmlPipeline.target_field = result_column
                modelPmmlPipeline.active_fields = np.array(
                    [col for col in x_train.columns if col != result_column])
                sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True)
                pmmlfile = open(pmml_filepath, "r")
                pmmlText = pmmlfile.read()
                pmmlfile.close()
                self._result_setter.update_pmml_object({self._slug: pmmlText})
            except:
                pass

            # Regression metrics on the held-out test set.
            metrics = {}
            metrics["r2"] = r2_score(y_test, y_score)
            metrics["neg_mean_squared_error"] = mean_squared_error(
                y_test, y_score)
            metrics["neg_mean_absolute_error"] = mean_absolute_error(
                y_test, y_score)
            metrics["RMSE"] = sqrt(metrics["neg_mean_squared_error"])
            metrics["explained_variance_score"] = explained_variance_score(
                y_test, y_score)
            transformed = pd.DataFrame({
                "prediction": y_score,
                result_column: y_test
            })
            print("TRANSFORMED PREDICTION TYPE - ",
                  type(transformed["prediction"]))
            print(transformed["prediction"])
            print("TRANSFORMED RESULT COL TYPE - ",
                  type(transformed[result_column]))
            print(transformed[result_column])
            transformed["difference"] = transformed[
                result_column] - transformed["prediction"]
            # MAPE per row; division by zero when the actual value is 0 yields
            # inf under pandas semantics.
            transformed["mape"] = old_div(
                np.abs(transformed["difference"]) * 100,
                transformed[result_column])

            # Cap the sample shipped to the UI at 100 rows.
            sampleData = None
            nrows = transformed.shape[0]
            if nrows > 100:
                sampleData = transformed.sample(n=100, random_state=420)
            else:
                sampleData = transformed
            print(sampleData.head())
            # NOTE(review): GLOBALSETTINGS.MAPEBINS is a shared module-level
            # list; the append/pop pair mutates it in place and is not
            # exception-safe (a failure in pd.cut leaves the extra bin behind).
            if transformed["mape"].max() > 100:
                GLOBALSETTINGS.MAPEBINS.append(transformed["mape"].max())
                mapeCountArr = list(
                    pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS).
                    value_counts().to_dict().items())
                GLOBALSETTINGS.MAPEBINS.pop(5)
            else:
                mapeCountArr = list(
                    pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS).
                    value_counts().to_dict().items())
            # Index the MAPE bins by their ordinal after sorting on the bin's
            # left edge.
            mapeStatsArr = [(str(idx), dictObj) for idx, dictObj in enumerate(
                sorted([{
                    "count": x[1],
                    "splitRange": (x[0].left, x[0].right)
                } for x in mapeCountArr],
                       key=lambda x: x["splitRange"][0]))]
            print(mapeStatsArr)
            print(mapeCountArr)
            # Quartile bins over the predictions (deduplicated in case of
            # repeated quantile values).
            predictionColSummary = transformed["prediction"].describe(
            ).to_dict()
            quantileBins = [
                predictionColSummary["min"], predictionColSummary["25%"],
                predictionColSummary["50%"], predictionColSummary["75%"],
                predictionColSummary["max"]
            ]
            print(quantileBins)
            quantileBins = sorted(list(set(quantileBins)))
            transformed["quantileBinId"] = pd.cut(transformed["prediction"],
                                                  quantileBins)
            quantileDf = transformed.groupby("quantileBinId").agg({
                "prediction": [np.sum, np.mean, np.size]
            }).reset_index()
            quantileDf.columns = ["prediction", "sum", "mean", "count"]
            print(quantileDf)
            quantileArr = list(quantileDf.T.to_dict().items())
            quantileSummaryArr = [(obj[0], {
                "splitRange":
                (obj[1]["prediction"].left, obj[1]["prediction"].right),
                "count":
                obj[1]["count"],
                "mean":
                obj[1]["mean"],
                "sum":
                obj[1]["sum"]
            }) for obj in quantileArr]
            print(quantileSummaryArr)
            runtime = round((time.time() - st_global), 2)

            # Populate the model summary object consumed by the UI cards.
            self._model_summary.set_model_type("regression")
            self._model_summary.set_algorithm_name("Neural Network (PyTorch)")
            self._model_summary.set_algorithm_display_name(
                "Neural Network (PyTorch)")
            self._model_summary.set_slug(self._slug)
            # NOTE(review): set_training_time is called twice; the second call
            # overwrites `runtime` with `trainingTime` -- likely only one of
            # the two was intended.
            self._model_summary.set_training_time(runtime)
            self._model_summary.set_training_time(trainingTime)
            self._model_summary.set_target_variable(result_column)
            self._model_summary.set_validation_method(
                validationDict["displayName"])
            self._model_summary.set_model_evaluation_metrics(metrics)
            self._model_summary.set_model_params(nnptr_params)
            self._model_summary.set_quantile_summary(quantileSummaryArr)
            self._model_summary.set_mape_stats(mapeStatsArr)
            self._model_summary.set_sample_data(sampleData.to_dict())
            self._model_summary.set_feature_importance(featuresArray)
            self._model_summary.set_feature_list(list(x_train.columns))
            self._model_summary.set_model_mse(
                metrics["neg_mean_squared_error"])
            self._model_summary.set_model_mae(
                metrics["neg_mean_absolute_error"])
            self._model_summary.set_rmse(metrics["RMSE"])
            self._model_summary.set_model_rsquared(metrics["r2"])
            self._model_summary.set_model_exp_variance_score(
                metrics["explained_variance_score"])

            # Second best-effort PMML export to a plain (non-file://) path;
            # same caveat as above -- failures are silently swallowed.
            try:
                pmml_filepath = str(model_path) + "/" + str(
                    self._slug) + "/traindeModel.pmml"
                modelPmmlPipeline = PMMLPipeline([("pretrained-estimator",
                                                   objs["trained_model"])])
                modelPmmlPipeline.target_field = result_column
                modelPmmlPipeline.active_fields = np.array(
                    [col for col in x_train.columns if col != result_column])
                sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True)
                pmmlfile = open(pmml_filepath, "r")
                pmmlText = pmmlfile.read()
                pmmlfile.close()
                self._result_setter.update_pmml_object({self._slug: pmmlText})
            except:
                pass

        # Build the dropdown/summary JSON.  NOTE(review): both branches are
        # currently identical, and the tuning branch references `modelName`,
        # which is only assigned in the non-tuning path above (NameError if
        # tuning is enabled).
        if algoSetting.is_hyperparameter_tuning_enabled():
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": metrics[evaluationMetricDict["name"]],
                "evaluationMetricName": evaluationMetricDict["name"],
                "slug": self._model_summary.get_slug(),
                "Model Id": modelName
            }

            modelSummaryJson = {
                "dropdown": modelDropDownObj,
                "levelcount": self._model_summary.get_level_counts(),
                "modelFeatureList": self._model_summary.get_feature_list(),
                "levelMapping": self._model_summary.get_level_map_dict(),
                "slug": self._model_summary.get_slug(),
                "name": self._model_summary.get_algorithm_name()
            }
        else:
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": metrics[evaluationMetricDict["name"]],
                "evaluationMetricName": evaluationMetricDict["name"],
                "slug": self._model_summary.get_slug(),
                "Model Id": modelName
            }
            modelSummaryJson = {
                "dropdown": modelDropDownObj,
                "levelcount": self._model_summary.get_level_counts(),
                "modelFeatureList": self._model_summary.get_feature_list(),
                "levelMapping": self._model_summary.get_level_map_dict(),
                "slug": self._model_summary.get_slug(),
                "name": self._model_summary.get_algorithm_name()
            }
        modelmanagement_ = nnptr_params

        # Model-management view: training configuration and run metadata.
        self._model_management = MLModelSummary()
        if algoSetting.is_hyperparameter_tuning_enabled():
            pass
        else:
            self._model_management.set_layer_info(
                data=modelmanagement_['hidden_layer_info'])
            self._model_management.set_loss_function(
                data=modelmanagement_['loss'])
            self._model_management.set_optimizer(
                data=modelmanagement_['optimizer'])
            self._model_management.set_batch_size(
                data=modelmanagement_['batch_size'])
            self._model_management.set_no_epochs(
                data=modelmanagement_['number_of_epochs'])
            # self._model_management.set_model_evaluation_metrics(data=modelmanagement_['metrics'])
            self._model_management.set_job_type(
                self._dataframe_context.get_job_name())  #Project name
            self._model_management.set_training_status(
                data="completed")  # training status
            self._model_management.set_no_of_independent_variables(
                data=x_train)  #no of independent varables
            self._model_management.set_training_time(runtime)  # run time
            self._model_management.set_rmse(metrics["RMSE"])
            # NOTE(review): this is the PyTorch trainer but the management
            # view is labelled "(TensorFlow)" -- likely a copy/paste error;
            # confirm against the summary label "Neural Network (PyTorch)".
            self._model_management.set_algorithm_name(
                "Neural Network (TensorFlow)")  #algorithm name
            self._model_management.set_validation_method(
                str(validationDict["displayName"]) + "(" +
                str(validationDict["value"]) + ")")  #validation method
            self._model_management.set_target_variable(
                result_column)  #target column name
            self._model_management.set_creation_date(data=str(
                datetime.now().strftime('%b %d ,%Y  %H:%M ')))  #creation date
            self._model_management.set_datasetName(self._datasetName)
        # Summary rows shown on the model-management overview card.
        modelManagementSummaryJson = [
            ["Project Name",
             self._model_management.get_job_type()],
            ["Algorithm",
             self._model_management.get_algorithm_name()],
            ["Training Status",
             self._model_management.get_training_status()],
            ["RMSE", self._model_management.get_rmse()],
            ["RunTime", self._model_management.get_training_time()],
            #["Owner",None],
            ["Created On",
             self._model_management.get_creation_date()]
        ]
        if algoSetting.is_hyperparameter_tuning_enabled():
            modelManagementModelSettingsJson = []
        else:
            modelManagementModelSettingsJson = [
                ["Training Dataset",
                 self._model_management.get_datasetName()],
                [
                    "Target Column",
                    self._model_management.get_target_variable()
                ],
                [
                    "Number Of Independent Variables",
                    self._model_management.get_no_of_independent_variables()
                ], ["Algorithm",
                    self._model_management.get_algorithm_name()],
                [
                    "Model Validation",
                    self._model_management.get_validation_method()
                ],
                ["batch_size",
                 str(self._model_management.get_batch_size())],
                ["Loss", self._model_management.get_loss_function()],
                ["Optimizer",
                 self._model_management.get_optimizer()],
                ["Epochs", self._model_management.get_no_epochs()],
                [
                    "Metrics",
                    self._model_management.get_model_evaluation_metrics()
                ]
            ]
            # Append one settings row per hidden layer, flattening each
            # layer's config dict into a "key:value," string.
            for i in modelmanagement_["hidden_layer_info"]:
                string = ""
                key = str(modelmanagement_["hidden_layer_info"][i]
                          ["layer"]) + " " + str(i) + ":"
                for j in modelmanagement_["hidden_layer_info"][i]:
                    string = string + str(j) + ":" + str(
                        modelmanagement_["hidden_layer_info"][i][j]) + ",   "
                modelManagementModelSettingsJson.append([key, string])
        print(modelManagementModelSettingsJson)

        # Render all card groups to plain JSON for the narrative tree.
        nnptrCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for
            cardObj in MLUtils.create_model_summary_cards(self._model_summary)
        ]
        nnptrPerformanceCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_cards_regression(
                self._model_summary)
        ]
        nnptrOverviewCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_card_overview(
                self._model_management, modelManagementSummaryJson,
                modelManagementModelSettingsJson)
        ]
        nnptrDeploymentCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_deploy_empty_card()
        ]
        nnptr_Overview_Node = NarrativesTree()
        nnptr_Overview_Node.set_name("Overview")
        nnptr_Performance_Node = NarrativesTree()
        nnptr_Performance_Node.set_name("Performance")
        nnptr_Deployment_Node = NarrativesTree()
        nnptr_Deployment_Node.set_name("Deployment")
        for card in nnptrOverviewCards:
            nnptr_Overview_Node.add_a_card(card)
        for card in nnptrPerformanceCards:
            nnptr_Performance_Node.add_a_card(card)
        for card in nnptrDeploymentCards:
            nnptr_Deployment_Node.add_a_card(card)
        for card in nnptrCards:
            self._prediction_narrative.add_a_card(card)
        # Publish everything through the result setter.
        self._result_setter.set_model_summary({
            "Neural Network (PyTorch)":
            json.loads(
                CommonUtils.convert_python_object_to_json(self._model_summary))
        })
        self._result_setter.set_nnptr_regression_model_summary(
            modelSummaryJson)
        self._result_setter.set_nnptr_cards(nnptrCards)
        self._result_setter.set_nnptr_nodes([
            nnptr_Overview_Node, nnptr_Performance_Node, nnptr_Deployment_Node
        ])
        self._result_setter.set_nnptr_fail_card({
            "Algorithm_Name": "Neural Network (PyTorch)",
            "Success": "True"
        })
        # Progress message: completion stage.
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "completion",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")
Code example #12
0
class DecisionTreeNarrative:
    """Build the decision-tree narrative (summary card, rules table, donut
    chart, level dropdown) for one target column and register the resulting
    node/cards on the shared result setter.

    NOTE(review): this code targets Python 2 (``print`` statements and
    ``map`` returning a list below) -- confirm before porting to Python 3.
    """

    # Rounding precision used when displaying fractional values.
    MAX_FRACTION_DIGITS = 2

    def _get_new_table(self):
        """Populate ``self._new_table`` (rules and rounded probabilities per
        target level) and ``self._decisionTreeCard1Table`` (a header row plus
        one row per target level).

        Called once from ``__init__`` after the rule dictionaries are set.
        """
        self._decisionTreeCard1Table = [["PREDICTION", "RULES", "PERCENTAGE"]]
        for keys in self._table.keys():
            self._new_table[keys] = {}
            self._new_table[keys]['rules'] = self._table[keys]
            # success_percent values are treated as percentages elsewhere in
            # this class (compared against a 0-100 cutoff in
            # _generate_summary); rounded to 2 decimals for display.
            self._new_table[keys]['probability'] = [
                round(i, 2) for i in self.success_percent[keys]
            ]
            keyTable = [
                keys, self._new_table[keys]['rules'],
                self._new_table[keys]['probability']
            ]
            self._decisionTreeCard1Table.append(keyTable)

    # @accepts(object, (str, basestring), DecisionTreeResult,DataFrameHelper,ContextSetter,ResultSetter,NarrativesTree,basestring,dict)
    def __init__(self,
                 column_name,
                 decision_tree_rules,
                 df_helper,
                 df_context,
                 meta_parser,
                 result_setter,
                 story_narrative=None,
                 analysisName=None,
                 scriptWeight=None):
        """Build all narratives for ``column_name`` from ``decision_tree_rules``.

        Side effects: posts "start"/"end" progress messages to the message
        URL, advances the completion status on ``df_context``, generates the
        narrative node, and registers it on ``result_setter``.

        :param column_name: target column the tree predicts.
        :param decision_tree_rules: object exposing the rule/statistics
            accessors used below (``get_decision_rules``, ``get_table``, ...).
        :param df_helper: dataframe helper (stored, used by other methods).
        :param df_context: context object providing analysis configuration,
            progress bookkeeping and the message URL.
        :param meta_parser: metadata parser (stored for later use).
        :param result_setter: sink for the generated node and score cards.
        :param story_narrative: optional narrative tree (currently unused
            here; the add_a_node call below is commented out).
        :param analysisName: overrides the analysis name from the context
            when given.
        :param scriptWeight: overrides the per-analysis weight dict from the
            context when given.
        """
        self._story_narrative = story_narrative
        self._metaParser = meta_parser
        self._dataframe_context = df_context
        self._ignoreMsg = self._dataframe_context.get_message_ignore()
        self._result_setter = result_setter
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        self._column_name = column_name.lower()
        self._colname = column_name

        # Capitalize only the first character, leaving the rest untouched.
        self._capitalized_column_name = "%s%s" % (column_name[0].upper(),
                                                  column_name[1:])
        self._decision_rules_dict = decision_tree_rules.get_decision_rules()
        self._decision_tree_json = CommonUtils.as_dict(decision_tree_rules)
        self._decision_tree_raw = self._decision_rules_dict
        # self._decision_tree_raw = {"tree":{"children":None}}
        # self._decision_tree_raw['tree']["children"] = self._decision_tree_json['tree']["children"]
        # Per-target-level rule strings and prediction statistics; all of
        # these dicts are keyed by target level and indexed in parallel.
        self._table = decision_tree_rules.get_table()
        self._new_table = {}
        self.successful_predictions = decision_tree_rules.get_success()
        self.total_predictions = decision_tree_rules.get_total()
        self.success_percent = decision_tree_rules.get_success_percent()
        self._important_vars = decision_tree_rules.get_significant_vars()
        self._target_distribution = decision_tree_rules.get_target_contributions(
        )
        self._get_new_table()
        self._df_helper = df_helper
        self.subheader = None
        #self.table = {}
        self.dropdownComment = None
        self.dropdownValues = None
        # Template directory (relative) for narrative HTML templates.
        self._base_dir = "/decisiontree/"

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        if analysisName == None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName
        self._messageURL = self._dataframe_context.get_message_url()
        if scriptWeight == None:
            self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight(
            )
        else:
            self._scriptWeightDict = scriptWeight
        # Progress stages: "start" carries no weight, "end" carries the full
        # 10-point weight for this script.
        self._scriptStages = {
            "dtreeNarrativeStart": {
                "summary": "Started the Decision Tree Narratives",
                "weight": 0
            },
            "dtreeNarrativeEnd": {
                "summary": "Narratives for Decision Tree Finished",
                "weight": 10
            },
        }
        # Advance completion status by this stage's share of the narrative
        # weight (weight/10 fraction of the analysis' narrative allocation).
        self._completionStatus += self._scriptWeightDict[
            self._analysisName]["narratives"] * self._scriptStages[
                "dtreeNarrativeStart"]["weight"] / 10
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "dtreeNarrativeStart",\
                                    "info",\
                                    self._scriptStages["dtreeNarrativeStart"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsg)
        self._dataframe_context.update_completion_status(
            self._completionStatus)

        self._decisionTreeNode = NarrativesTree()
        self._decisionTreeNode.set_name("Prediction")
        # Generates the main narrative card(s) onto self._decisionTreeNode.
        self._generate_narratives()
        # self._story_narrative.add_a_node(self._decisionTreeNode)
        self._result_setter.set_decision_tree_node(self._decisionTreeNode)
        # Round-trip through JSON to hand over plain serializable card data.
        self._result_setter.set_score_dtree_cards(
            json.loads(
                CommonUtils.convert_python_object_to_json(
                    self._decisionTreeNode.get_all_cards())))

        # Re-read completion status: _generate_narratives may have posted
        # its own progress updates in the meantime.
        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        self._completionStatus += self._scriptWeightDict[
            self._analysisName]["narratives"] * self._scriptStages[
                "dtreeNarrativeEnd"]["weight"] / 10
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "dtreeNarrativeEnd",\
                                    "info",\
                                    self._scriptStages["dtreeNarrativeEnd"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsg)
        self._dataframe_context.update_completion_status(
            self._completionStatus)

    def _generate_narratives(self):
        """Entry point for narrative generation; currently only the summary."""
        self._generate_summary()

    # NOTE: earlier implementation of _generate_summary kept below for
    # reference; superseded by the version that follows it.
    # def _generate_summary(self):
    #     rules = self._decision_rules_dict
    #     colname = self._colname
    #     data_dict = {"dimension_name":self._colname}
    #     data_dict["plural_colname"] = NarrativesUtils.pluralize(data_dict["dimension_name"])
    #     data_dict["significant_vars"] = []
    #     rules_dict = self._table
    #     self.condensedTable={}
    #     for target in rules_dict.keys():
    #         self.condensedTable[target]=[]
    #         total = self.total_predictions[target]
    #         success = self.successful_predictions[target]
    #         success_percent = self.success_percent[target]
    #         for idx,rule in enumerate(rules_dict[target]):
    #             rules1 = NarrativeUtils.generate_rules(target,rule, total[idx], success[idx], success_percent[idx])
    #             self.condensedTable[target].append(rules1)
    #     self.dropdownValues = rules_dict.keys()
    #     data_dict["blockSplitter"] = self._blockSplitter
    #     data_dict['rules'] = self.condensedTable
    #     data_dict['success'] = self.success_percent
    #     data_dict['significant_vars'] = list(set(itertools.chain.from_iterable(self._important_vars.values())))
    #     data_dict['significant_vars'] = self._important_vars
    #     # print '*'*16
    #     # print data_dict['rules']
    #     # print self._new_table
    #     self.card2_data = NarrativesUtils.paragraph_splitter(NarrativesUtils.get_template_output(self._base_dir,\
    #                                                 'decision_tree_card2.html',data_dict))
    #     self.card2_chart = self._target_distribution
    #
    #     self.dropdownComment = NarrativesUtils.get_template_output(self._base_dir,\
    #                                                 'decision_rule_summary.html',data_dict)
    #     main_card = NormalCard()
    #     main_card_data = []
    #     main_card_narrative = NarrativesUtils.block_splitter(self.dropdownComment,self._blockSplitter)
    #     main_card_data += main_card_narrative
    #     main_card_data.append(TreeData(data=self._decision_tree_raw))
    #     main_card_table = TableData()
    #     main_card_table.set_table_data(self._decisionTreeCard1Table)
    #     main_card_table.set_table_type("decisionTreeTable")
    #     main_card_data.append(main_card_table)
    #     main_card.set_card_data(main_card_data)
    #     main_card.set_card_name("Predicting Key Drivers of {}".format(self._colname))
    #     card2 = NormalCard()
    #     card2Data = NarrativesUtils.block_splitter(NarrativesUtils.get_template_output(self._base_dir,\
    #                                                 'decision_tree_card2.html',data_dict),self._blockSplitter)
    #     card2ChartData = []
    #     for k,v in self._target_distribution.items():
    #         card2ChartData.append({"key":k,"value":v})
    #     card2ChartData = NormalChartData(data=card2ChartData)
    #     card2ChartJson = ChartJson()
    #     card2ChartJson.set_data(card2ChartData.get_data())
    #     card2ChartJson.set_chart_type("bar")
    #     card2ChartJson.set_axes({"x":"key","y":"value"})
    #     card2Data.insert(1,C3ChartData(data=card2ChartJson))
    #     card2.set_card_data(card2Data)
    #     card2.set_card_name("Decision Rules for {}".format(self._colname))
    #     self._decisionTreeNode.add_a_card(main_card)
    #     self._decisionTreeNode.add_a_card(card2)
    #     self.subheader = NarrativesUtils.get_template_output(self._base_dir,\
    #                                     'decision_tree_summary.html',data_dict)

    def _generate_summary(self):
        """Build the main prediction card: narrative text, a donut chart of
        prediction counts (or probability buckets on scored data), a
        per-level dropdown, and the full prediction-rules table; then attach
        the card to ``self._decisionTreeNode``.
        """
        data_dict = {}
        rules_dict = self._table
        data_dict["blockSplitter"] = self._blockSplitter
        data_dict["targetcol"] = self._colname
        groups = rules_dict.keys()
        # Rules at or above this probability (percent) are labelled
        # "strong"; the rest are "mixed".
        probabilityCutoff = 75
        probabilityGroups = [{
            "probability": probabilityCutoff,
            "count": 0,
            "range": [probabilityCutoff, 100]
        }, {
            "probability": probabilityCutoff - 1,
            "count": 0,
            "range": [0, probabilityCutoff - 1]
        }]
        tableArray = [[
            "Prediction Rule", "Probability", "Prediction", "Freq", "group",
            "richRules"
        ]]
        dropdownData = []
        chartDict = {}
        targetLevel = self._dataframe_context.get_target_level_for_model()
        probabilityArrayAll = []

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName,
            "custom",
            "info",
            "Generating Prediction rules",
            self._completionStatus,
            self._completionStatus,
            display=True)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=False)
        self._dataframe_context.update_completion_status(
            self._completionStatus)
        # Order levels so the configured target level comes first; it gets
        # the pre-selected dropdown entry below (idx == 0).
        targetValues = [x for x in rules_dict.keys() if x == targetLevel
                        ] + [x for x in rules_dict.keys() if x != targetLevel]
        for idx, target in enumerate(targetValues):
            if idx == 0:
                if self._dataframe_context.get_story_on_scored_data() != True:
                    dropdownData.append({
                        "displayName": target,
                        "name": target,
                        "selected": True,
                        "id": idx + 1
                    })
                else:
                    # On scored data, prefix the display name with the
                    # target column for clarity.
                    dropdownData.append({
                        "displayName":
                        "{} : {}".format(self._colname, target),
                        "name":
                        target,
                        "selected":
                        True,
                        "id":
                        idx + 1
                    })
            else:
                if self._dataframe_context.get_story_on_scored_data() != True:
                    dropdownData.append({
                        "displayName": target,
                        "name": target,
                        "selected": False,
                        "id": idx + 1
                    })
                else:
                    dropdownData.append({
                        "displayName":
                        "{} : {}".format(self._colname, target),
                        "name":
                        target,
                        "selected":
                        False,
                        "id":
                        idx + 1
                    })
            rulesArray = rules_dict[target]
            probabilityArray = [
                round(x, 2) for x in self.success_percent[target]
            ]
            probabilityArrayAll += probabilityArray
            groupArray = [
                "strong" if x >= probabilityCutoff else "mixed"
                for x in probabilityArray
            ]
            # Tally how many rules fall into each probability bucket.
            for idx2, obj in enumerate(probabilityGroups):
                grpCount = len([
                    x for x in probabilityArray
                    if x >= obj["range"][0] and x <= obj["range"][1]
                ])
                obj["count"] += grpCount
                probabilityGroups[idx2] = obj
            predictionArray = [target] * len(rulesArray)
            freqArray = self.total_predictions[target]
            chartDict[target] = sum(freqArray)
            success = self.successful_predictions[target]
            success_percent = self.success_percent[target]
            richRulesArray = []
            crudeRuleArray = []
            analysisType = self._dataframe_context.get_analysis_type()
            targetCol = self._dataframe_context.get_result_column()
            binFlag = False
            # binFlag marks a binned target column so rule text can be
            # rendered accordingly.
            if self._dataframe_context.get_custom_analysis_details() != None:
                binnedColObj = [
                    x["colName"] for x in
                    self._dataframe_context.get_custom_analysis_details()
                ]
                if binnedColObj != None and targetCol in binnedColObj:
                    binFlag = True
            for idx2, crudeRule in enumerate(rulesArray):
                richRule, crudeRule = NarrativesUtils.generate_rules(
                    self._colname,
                    target,
                    crudeRule,
                    freqArray[idx2],
                    success[idx2],
                    success_percent[idx2],
                    analysisType,
                    binFlag=binFlag)
                richRulesArray.append(richRule)
                crudeRuleArray.append(crudeRule)
            # Format probabilities for display; Python 2 map returns a list.
            # NOTE(review): humanize.apnumber is applied for x >= 10 (where
            # it returns the number as a string) -- the branch looks
            # inverted vs. apnumber's spell-out purpose; confirm intent.
            probabilityArray = map(
                lambda x: humanize.apnumber(x) + "%"
                if x >= 10 else str(int(x)) + "%", probabilityArray)
            # targetArray = zip(richRulesArray,probabilityArray,predictionArray,freqArray,groupArray)
            # Column order must match the tableArray header row above.
            targetArray = zip(crudeRuleArray, probabilityArray,
                              predictionArray, freqArray, groupArray,
                              richRulesArray)
            targetArray = [list(x) for x in targetArray]
            tableArray += targetArray

        # Donut chart: on scored data bucket all probabilities into the
        # configured ranges; otherwise show total predictions per level.
        donutChartMaxLevel = 10
        if self._dataframe_context.get_story_on_scored_data() == True:
            chartDict = {}
            probabilityRangeForChart = GLOBALSETTINGS.PROBABILITY_RANGE_FOR_DONUT_CHART
            chartDict = dict(
                zip(probabilityRangeForChart.keys(),
                    [0] * len(probabilityRangeForChart)))
            for val in probabilityArrayAll:
                for grps, grpRange in probabilityRangeForChart.items():
                    if val > grpRange[0] and val <= grpRange[1]:
                        chartDict[grps] = chartDict[grps] + 1
            chartDict = {k: v for k, v in chartDict.items() if v != 0}
        else:
            chartDict = dict([(k, sum(v))
                              for k, v in self.total_predictions.items()])
            chartDict = {k: v for k, v in chartDict.items() if v != 0}
        # Collapse small slices when there are too many levels to plot.
        if len(chartDict) > donutChartMaxLevel:
            chartDict = NarrativesUtils.restructure_donut_chart_data(
                chartDict, nLevels=donutChartMaxLevel)
        chartData = NormalChartData([chartDict]).get_data()
        chartJson = ChartJson(data=chartData)
        chartJson.set_title(self._colname)
        chartJson.set_chart_type("donut")
        mainCardChart = C3ChartData(data=chartJson)
        mainCardChart.set_width_percent(45)
        # mainCardChart = {"dataType": "c3Chart","widthPercent":33 ,"data": {"data": [chartDict],"title":self._colname,"axes":{},"label_text":{},"legend":{},"yAxisNumberFormat": ".2s","types":None,"axisRotation":False, "chart_type": "donut"}}

        dropdownDict = {
            "dataType": "dropdown",
            "label": "Showing prediction rules for",
            "data": dropdownData
        }

        data_dict["probabilityGroups"] = probabilityGroups
        if self._dataframe_context.get_story_on_scored_data() != True:
            maincardSummary = NarrativesUtils.get_template_output(self._base_dir,\
                                                        'decisiontreesummary.html',data_dict)
        else:
            # Scored-data summary: aggregate (prediction, freq) pairs from
            # the table rows to get per-level counts and percentages.
            predictedLevelcountArray = [(x[2], x[3]) for x in tableArray[1:]]
            predictedLevelCountDict = {}
            # predictedLevelcountDict = defaultdict(predictedLevelcountArray)
            for val in predictedLevelcountArray:
                predictedLevelCountDict.setdefault(val[0], []).append(val[1])

            levelCountDict = {}
            for k, v in predictedLevelCountDict.items():
                levelCountDict[k] = sum(v)
            # levelCountDict = self._metaParser.get_unique_level_dict(self._colname)
            total = float(
                sum([x for x in levelCountDict.values() if x != None]))
            levelCountTuple = [{
                "name": k,
                "count": v,
                "percentage": round(v * 100 / total, 2)
            } for k, v in levelCountDict.items() if v != None]
            percentageArray = [x["percentage"] for x in levelCountTuple]
            percentageArray = NarrativesUtils.ret_smart_round(percentageArray)
            levelCountTuple = [{
                "name": obj["name"],
                "count": obj["count"],
                "percentage": str(percentageArray[idx]) + "%"
            } for idx, obj in enumerate(levelCountTuple)]
            data_dict["nlevel"] = len(levelCountDict)
            print "levelCountTuple", levelCountTuple
            print "levelCountDict", levelCountDict
            # topLevel is the configured target level when present,
            # otherwise the first level; secondLevel is the most frequent
            # of the remaining levels (None when only one level exists).
            if targetLevel in levelCountDict:
                data_dict["topLevel"] = [
                    x for x in levelCountTuple if x["name"] == targetLevel
                ][0]
                if len(levelCountTuple) > 1:
                    data_dict["secondLevel"] = max([
                        x for x in levelCountTuple if x["name"] != targetLevel
                    ],
                                                   key=lambda x: x["count"])
                else:
                    data_dict["secondLevel"] = None
            else:
                data_dict["topLevel"] = levelCountTuple[0]
                if len(levelCountTuple) > 1:
                    data_dict["secondLevel"] = levelCountTuple[1]
                else:
                    data_dict["secondLevel"] = None
            print data_dict
            maincardSummary = NarrativesUtils.get_template_output(
                self._base_dir, 'decisiontreescore.html', data_dict)
        main_card = NormalCard()
        main_card_data = []
        main_card_narrative = NarrativesUtils.block_splitter(
            maincardSummary, self._blockSplitter)
        main_card_data += main_card_narrative

        main_card_data.append(mainCardChart)
        main_card_data.append(dropdownDict)

        main_card_table = TableData()
        if self._dataframe_context.get_story_on_scored_data() == True:
            main_card_table.set_table_width(75)
        main_card_table.set_table_data(tableArray)
        main_card_table.set_table_type("popupDecisionTreeTable")
        main_card_data.append(main_card_table)
        # Append the unique-identifier table when available; otherwise let
        # the rules table use the full card width.
        uidTable = self._result_setter.get_unique_identifier_table()
        if uidTable != None:
            main_card_data.append(uidTable)
        else:
            main_card_table.set_table_width(100)
        main_card.set_card_data(main_card_data)
        main_card.set_card_name("Predicting Key Drivers of {}".format(
            self._colname))
        self._decisionTreeNode.add_a_card(main_card)
コード例 #13
0
    def Train(self):
        st_global = time.time()

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "initialization",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")

        appType = self._dataframe_context.get_app_type()
        algosToRun = self._dataframe_context.get_algorithms_to_run()
        algoSetting = [
            x for x in algosToRun if x.get_algorithm_slug() == self._slug
        ][0]
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()
        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})
        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns) - set(allDateCols))
        print(categorical_columns)
        result_column = self._dataframe_context.get_result_column()
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        numerical_columns = [
            x for x in numerical_columns if x != result_column
        ]

        model_path = self._dataframe_context.get_model_path()
        if model_path.startswith("file"):
            model_path = model_path[7:]
        validationDict = self._dataframe_context.get_validation_dict()
        print("model_path", model_path)
        pipeline_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/pipeline/"
        model_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/model"
        pmml_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/modelPmml"

        df = self._data_frame
        if self._mlEnv == "spark":
            pass
        elif self._mlEnv == "sklearn":
            model_filepath = model_path + "/" + self._slug + "/model.pkl"
            x_train, x_test, y_train, y_test = self._dataframe_helper.get_train_test_data(
            )
            x_train = MLUtils.create_dummy_columns(
                x_train,
                [x for x in categorical_columns if x != result_column])
            x_test = MLUtils.create_dummy_columns(
                x_test, [x for x in categorical_columns if x != result_column])
            x_test = MLUtils.fill_missing_columns(x_test, x_train.columns,
                                                  result_column)

            st = time.time()

            CommonUtils.create_update_and_save_progress_message(
                self._dataframe_context,
                self._scriptWeightDict,
                self._scriptStages,
                self._slug,
                "training",
                "info",
                display=True,
                emptyBin=False,
                customMsg=None,
                weightKey="total")

            if algoSetting.is_hyperparameter_tuning_enabled():
                pass
            else:
                self._result_setter.set_hyper_parameter_results(
                    self._slug, None)
                evaluationMetricDict = algoSetting.get_evaluvation_metric(
                    Type="Regression")
                evaluationMetricDict[
                    "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[
                        evaluationMetricDict["name"]]
                params_tf = algoSetting.get_tf_params_dict()
                algoParams = algoSetting.get_params_dict()
                algoParams = {k: v for k, v in list(algoParams.items())}

                model = tf.keras.models.Sequential()
                first_layer_flag = True

                for i in range(len(list(
                        params_tf['hidden_layer_info'].keys()))):
                    if params_tf['hidden_layer_info'][str(
                            i)]["layer"] == "Dense":

                        if first_layer_flag:
                            model.add(
                                tf.keras.layers.Dense(
                                    params_tf['hidden_layer_info'][str(
                                        i)]["units"],
                                    activation=params_tf['hidden_layer_info'][
                                        str(i)]["activation"],
                                    input_shape=(len(x_train.columns), ),
                                    use_bias=params_tf['hidden_layer_info'][
                                        str(i)]["use_bias"],
                                    kernel_initializer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["kernel_initializer"],
                                    bias_initializer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["bias_initializer"],
                                    kernel_regularizer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["kernel_regularizer"],
                                    bias_regularizer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["bias_regularizer"],
                                    activity_regularizer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["activity_regularizer"],
                                    kernel_constraint=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["kernel_constraint"],
                                    bias_constraint=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["bias_constraint"]))
                            try:
                                if params_tf['hidden_layer_info'][str(
                                        i)]["batch_normalization"] == "True":
                                    model.add(
                                        tf.keras.layers.BatchNormalization())
                            except:
                                print(
                                    "BATCH_NORM_FAILED ##########################"
                                )
                                pass
                            first_layer_flag = False
                        else:
                            model.add(
                                tf.keras.layers.Dense(
                                    params_tf['hidden_layer_info'][str(
                                        i)]["units"],
                                    activation=params_tf['hidden_layer_info'][
                                        str(i)]["activation"],
                                    use_bias=params_tf['hidden_layer_info'][
                                        str(i)]["use_bias"],
                                    kernel_initializer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["kernel_initializer"],
                                    bias_initializer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["bias_initializer"],
                                    kernel_regularizer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["kernel_regularizer"],
                                    bias_regularizer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["bias_regularizer"],
                                    activity_regularizer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["activity_regularizer"],
                                    kernel_constraint=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["kernel_constraint"],
                                    bias_constraint=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["bias_constraint"]))
                            try:
                                if params_tf['hidden_layer_info'][str(
                                        i)]["batch_normalization"] == "True":
                                    model.add(
                                        tf.keras.layers.BatchNormalization())
                            except:
                                print(
                                    "BATCH_NORM_FAILED ##########################"
                                )
                                pass

                    elif params_tf['hidden_layer_info'][str(
                            i)]["layer"] == "Dropout":
                        model.add(
                            tf.keras.layers.Dropout(
                                float(params_tf['hidden_layer_info'][str(i)]
                                      ["rate"])))

                    elif params_tf['hidden_layer_info'][str(
                            i)]["layer"] == "Lambda":
                        if params_tf['hidden_layer_info'][str(
                                i)]["lambda"] == "Addition":
                            model.add(
                                tf.keras.layers.Lambda(lambda x: x + int(
                                    params_tf['hidden_layer_info'][str(i)][
                                        "units"])))
                        if params_tf['hidden_layer_info'][str(
                                i)]["lambda"] == "Multiplication":
                            model.add(
                                tf.keras.layers.Lambda(lambda x: x * int(
                                    params_tf['hidden_layer_info'][str(i)][
                                        "units"])))
                        if params_tf['hidden_layer_info'][str(
                                i)]["lambda"] == "Subtraction":
                            model.add(
                                tf.keras.layers.Lambda(lambda x: x - int(
                                    params_tf['hidden_layer_info'][str(i)][
                                        "units"])))
                        if params_tf['hidden_layer_info'][str(
                                i)]["lambda"] == "Division":
                            model.add(
                                tf.keras.layers.Lambda(lambda x: old_div(
                                    x,
                                    int(params_tf['hidden_layer_info'][str(i)][
                                        "units"]))))

                model.compile(optimizer=algoParams["optimizer"],
                              loss=algoParams["loss"],
                              metrics=[algoParams['metrics']])

                model.fit(x_train,
                          y_train,
                          epochs=algoParams["number_of_epochs"],
                          verbose=1,
                          batch_size=algoParams["batch_size"])

                bestEstimator = model
            print(model.summary())
            trainingTime = time.time() - st
            y_score = bestEstimator.predict(x_test)
            y_score = list(y_score.flatten())
            try:
                y_prob = bestEstimator.predict_proba(x_test)
            except:
                y_prob = [0] * len(y_score)
            featureImportance = {}

            objs = {
                "trained_model": bestEstimator,
                "actual": y_test,
                "predicted": y_score,
                "probability": y_prob,
                "feature_importance": featureImportance,
                "featureList": list(x_train.columns),
                "labelMapping": {}
            }
            #featureImportance = objs["trained_model"].feature_importances_
            #featuresArray = [(col_name, featureImportance[idx]) for idx, col_name in enumerate(x_train.columns)]
            featuresArray = []
            if not algoSetting.is_hyperparameter_tuning_enabled():
                modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH -
                                         1) + "1"
                modelFilepathArr = model_filepath.split("/")[:-1]
                modelFilepathArr.append(modelName + ".h5")
                objs["trained_model"].save("/".join(modelFilepathArr))
                #joblib.dump(objs["trained_model"],"/".join(modelFilepathArr))
            metrics = {}
            metrics["r2"] = r2_score(y_test, y_score)
            metrics["neg_mean_squared_error"] = mean_squared_error(
                y_test, y_score)
            metrics["neg_mean_absolute_error"] = mean_absolute_error(
                y_test, y_score)
            metrics["RMSE"] = sqrt(metrics["neg_mean_squared_error"])
            metrics["explained_variance_score"] = explained_variance_score(
                y_test, y_score)
            transformed = pd.DataFrame({
                "prediction": y_score,
                result_column: y_test
            })
            transformed["difference"] = transformed[
                result_column] - transformed["prediction"]
            transformed["mape"] = old_div(
                np.abs(transformed["difference"]) * 100,
                transformed[result_column])

            sampleData = None
            nrows = transformed.shape[0]
            if nrows > 100:
                sampleData = transformed.sample(n=100, random_state=420)
            else:
                sampleData = transformed
            print(sampleData.head())
            if transformed["mape"].max() > 100:
                GLOBALSETTINGS.MAPEBINS.append(transformed["mape"].max())
                mapeCountArr = list(
                    pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS).
                    value_counts().to_dict().items())
                GLOBALSETTINGS.MAPEBINS.pop(5)
            else:
                mapeCountArr = list(
                    pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS).
                    value_counts().to_dict().items())
            mapeStatsArr = [(str(idx), dictObj) for idx, dictObj in enumerate(
                sorted([{
                    "count": x[1],
                    "splitRange": (x[0].left, x[0].right)
                } for x in mapeCountArr],
                       key=lambda x: x["splitRange"][0]))]
            print(mapeStatsArr)
            print(mapeCountArr)
            predictionColSummary = transformed["prediction"].describe(
            ).to_dict()
            quantileBins = [
                predictionColSummary["min"], predictionColSummary["25%"],
                predictionColSummary["50%"], predictionColSummary["75%"],
                predictionColSummary["max"]
            ]
            print(quantileBins)
            quantileBins = sorted(list(set(quantileBins)))
            transformed["quantileBinId"] = pd.cut(transformed["prediction"],
                                                  quantileBins)
            quantileDf = transformed.groupby("quantileBinId").agg({
                "prediction": [np.sum, np.mean, np.size]
            }).reset_index()
            quantileDf.columns = ["prediction", "sum", "mean", "count"]
            print(quantileDf)
            quantileArr = list(quantileDf.T.to_dict().items())
            quantileSummaryArr = [(obj[0], {
                "splitRange":
                (obj[1]["prediction"].left, obj[1]["prediction"].right),
                "count":
                obj[1]["count"],
                "mean":
                obj[1]["mean"],
                "sum":
                obj[1]["sum"]
            }) for obj in quantileArr]
            print(quantileSummaryArr)
            runtime = round((time.time() - st_global), 2)

            self._model_summary.set_model_type("regression")
            self._model_summary.set_algorithm_name(
                "Neural Network (TensorFlow)")
            self._model_summary.set_algorithm_display_name(
                "Neural Network (TensorFlow)")
            self._model_summary.set_slug(self._slug)
            self._model_summary.set_training_time(runtime)
            self._model_summary.set_training_time(trainingTime)
            self._model_summary.set_target_variable(result_column)
            self._model_summary.set_validation_method(
                validationDict["displayName"])
            self._model_summary.set_model_evaluation_metrics(metrics)
            self._model_summary.set_model_params(params_tf)
            self._model_summary.set_quantile_summary(quantileSummaryArr)
            self._model_summary.set_mape_stats(mapeStatsArr)
            self._model_summary.set_sample_data(sampleData.to_dict())
            self._model_summary.set_feature_importance(featuresArray)
            self._model_summary.set_feature_list(list(x_train.columns))
            self._model_summary.set_model_mse(
                metrics["neg_mean_squared_error"])
            self._model_summary.set_model_mae(
                metrics["neg_mean_absolute_error"])
            self._model_summary.set_rmse(metrics["RMSE"])
            self._model_summary.set_model_rsquared(metrics["r2"])
            self._model_summary.set_model_exp_variance_score(
                metrics["explained_variance_score"])

            try:
                pmml_filepath = str(model_path) + "/" + str(
                    self._slug) + "/traindeModel.pmml"
                modelPmmlPipeline = PMMLPipeline([("pretrained-estimator",
                                                   objs["trained_model"])])
                modelPmmlPipeline.target_field = result_column
                modelPmmlPipeline.active_fields = np.array(
                    [col for col in x_train.columns if col != result_column])
                sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True)
                pmmlfile = open(pmml_filepath, "r")
                pmmlText = pmmlfile.read()
                pmmlfile.close()
                self._result_setter.update_pmml_object({self._slug: pmmlText})
            except:
                pass

        if algoSetting.is_hyperparameter_tuning_enabled():
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": metrics[evaluationMetricDict["name"]],
                "evaluationMetricName": evaluationMetricDict["name"],
                "slug": self._model_summary.get_slug(),
                "Model Id": modelName
            }

            modelSummaryJson = {
                "dropdown": modelDropDownObj,
                "levelcount": self._model_summary.get_level_counts(),
                "modelFeatureList": self._model_summary.get_feature_list(),
                "levelMapping": self._model_summary.get_level_map_dict(),
                "slug": self._model_summary.get_slug(),
                "name": self._model_summary.get_algorithm_name()
            }
        else:
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": metrics[evaluationMetricDict["name"]],
                "evaluationMetricName": evaluationMetricDict["name"],
                "slug": self._model_summary.get_slug(),
                "Model Id": modelName
            }
            modelSummaryJson = {
                "dropdown": modelDropDownObj,
                "levelcount": self._model_summary.get_level_counts(),
                "modelFeatureList": self._model_summary.get_feature_list(),
                "levelMapping": self._model_summary.get_level_map_dict(),
                "slug": self._model_summary.get_slug(),
                "name": self._model_summary.get_algorithm_name()
            }
        modelmanagement_ = params_tf
        modelmanagement_.update(algoParams)

        self._model_management = MLModelSummary()
        if algoSetting.is_hyperparameter_tuning_enabled():
            pass
        else:
            self._model_management.set_layer_info(
                data=modelmanagement_['hidden_layer_info'])
            self._model_management.set_loss_function(
                data=modelmanagement_['loss'])
            self._model_management.set_optimizer(
                data=modelmanagement_['optimizer'])
            self._model_management.set_batch_size(
                data=modelmanagement_['batch_size'])
            self._model_management.set_no_epochs(
                data=modelmanagement_['number_of_epochs'])
            self._model_management.set_model_evaluation_metrics(
                data=modelmanagement_['metrics'])
            self._model_management.set_job_type(
                self._dataframe_context.get_job_name())  #Project name
            self._model_management.set_training_status(
                data="completed")  # training status
            self._model_management.set_no_of_independent_variables(
                data=x_train)  #no of independent varables
            self._model_management.set_training_time(runtime)  # run time
            self._model_management.set_rmse(metrics["RMSE"])
            self._model_management.set_algorithm_name(
                "Neural Network (TensorFlow)")  #algorithm name
            self._model_management.set_validation_method(
                str(validationDict["displayName"]) + "(" +
                str(validationDict["value"]) + ")")  #validation method
            self._model_management.set_target_variable(
                result_column)  #target column name
            self._model_management.set_creation_date(data=str(
                datetime.now().strftime('%b %d ,%Y  %H:%M ')))  #creation date
            self._model_management.set_datasetName(self._datasetName)
        modelManagementSummaryJson = [
            ["Project Name",
             self._model_management.get_job_type()],
            ["Algorithm",
             self._model_management.get_algorithm_name()],
            ["Training Status",
             self._model_management.get_training_status()],
            ["RMSE", self._model_management.get_rmse()],
            ["RunTime", self._model_management.get_training_time()],
            #["Owner",None],
            ["Created On",
             self._model_management.get_creation_date()]
        ]
        if algoSetting.is_hyperparameter_tuning_enabled():
            modelManagementModelSettingsJson = []
        else:
            modelManagementModelSettingsJson = [
                ["Training Dataset",
                 self._model_management.get_datasetName()],
                [
                    "Target Column",
                    self._model_management.get_target_variable()
                ],
                [
                    "Number Of Independent Variables",
                    self._model_management.get_no_of_independent_variables()
                ], ["Algorithm",
                    self._model_management.get_algorithm_name()],
                [
                    "Model Validation",
                    self._model_management.get_validation_method()
                ],
                ["batch_size",
                 str(self._model_management.get_batch_size())],
                ["Loss", self._model_management.get_loss_function()],
                ["Optimizer",
                 self._model_management.get_optimizer()],
                ["Epochs", self._model_management.get_no_epochs()],
                [
                    "Metrics",
                    self._model_management.get_model_evaluation_metrics()
                ]
            ]
            for i in range(
                    len(list(modelmanagement_['hidden_layer_info'].keys()))):
                string = ""
                key = "layer No-" + str(i) + "-" + str(
                    modelmanagement_["hidden_layer_info"][str(i)]["layer"] +
                    "-")
                for j in modelmanagement_["hidden_layer_info"][str(i)]:
                    modelManagementModelSettingsJson.append([
                        key + j + ":",
                        modelmanagement_["hidden_layer_info"][str(i)][j]
                    ])
        print(modelManagementModelSettingsJson)

        tfregCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for
            cardObj in MLUtils.create_model_summary_cards(self._model_summary)
        ]

        tfregPerformanceCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_cards_regression(
                self._model_summary)
        ]
        tfregOverviewCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_card_overview(
                self._model_management, modelManagementSummaryJson,
                modelManagementModelSettingsJson)
        ]
        tfregDeploymentCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_deploy_empty_card()
        ]
        TFReg_Overview_Node = NarrativesTree()
        TFReg_Overview_Node.set_name("Overview")
        TFReg_Performance_Node = NarrativesTree()
        TFReg_Performance_Node.set_name("Performance")
        TFReg_Deployment_Node = NarrativesTree()
        TFReg_Deployment_Node.set_name("Deployment")
        for card in tfregOverviewCards:
            TFReg_Overview_Node.add_a_card(card)
        for card in tfregPerformanceCards:
            TFReg_Performance_Node.add_a_card(card)
        for card in tfregDeploymentCards:
            TFReg_Deployment_Node.add_a_card(card)
        for card in tfregCards:
            self._prediction_narrative.add_a_card(card)
        self._result_setter.set_model_summary({
            "Neural Network (TensorFlow)":
            json.loads(
                CommonUtils.convert_python_object_to_json(self._model_summary))
        })
        self._result_setter.set_tfreg_regression_model_summart(
            modelSummaryJson)
        self._result_setter.set_tfreg_cards(tfregCards)
        self._result_setter.set_tfreg_nodes([
            TFReg_Overview_Node, TFReg_Performance_Node, TFReg_Deployment_Node
        ])
        self._result_setter.set_tfreg_fail_card({
            "Algorithm_Name": "Neural Network (TensorFlow)",
            "Success": "True"
        })
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "completion",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")
コード例 #14
0
class AnovaNarratives(object):
    """Build narrative cards for one-way ANOVA results.

    For every measure column in the ANOVA result this class builds an
    "Overview of Key Factors" card (an effect-size bar chart plus templated
    prose) and a drill-down node per significant dimension, then attaches
    everything to the result setter and the story-narrative tree.  Progress
    messages are emitted at the start and end of narrative generation.
    """

    # Significance threshold applied to ANOVA p-values.
    ALPHA = 0.05

    # Keys used when assembling the narrative payload dictionaries.
    KEY_SUMMARY = 'summary'
    KEY_NARRATIVES = 'narratives'
    KEY_TAKEAWAY = 'key_takeaway'
    DRILL_DOWN = 'drill_down_narrative'
    KEY_CARD = 'card'
    KEY_HEADING = 'heading'
    KEY_SUBHEADING = 'header'
    KEY_CHART = 'charts'
    KEY_PARAGRAPH = 'paragraphs'
    KEY_PARA_HEADER = 'header'
    KEY_PARA_CONTENT = 'content'
    KEY_BUBBLE = 'bubble_data'

    # @accepts(object, DFAnovaResult, DataFrameHelper)
    def __init__(self,
                 df_anova_result,
                 df_helper,
                 df_context,
                 result_setter,
                 story_narrative,
                 scriptWeight=None,
                 analysisName=None):
        """Generate all ANOVA narratives immediately on construction.

        Parameters
        ----------
        df_anova_result : project ANOVA result object; queried per measure.
        df_helper : dataframe helper (kept for downstream use).
        df_context : context object supplying analysis settings and URLs.
        result_setter : sink for the finished narrative nodes.
        story_narrative : story tree the ANOVA node is appended to.
        scriptWeight : optional weight dict; falls back to the context's
            measure-analysis weights when ``None``.
        analysisName : optional override for the context's analysis name.
        """
        self._story_narrative = story_narrative
        self._result_setter = result_setter
        self._dataframe_context = df_context
        self._df_anova_result = df_anova_result
        self._df_helper = df_helper
        self.narratives = {}
        self.narratives['variables'] = ''
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        self._base_dir = "/anova/"

        self._analysisDict = self._dataframe_context.get_analysis_dict()

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        self._messageURL = self._dataframe_context.get_message_url()
        # Caller-supplied overrides take precedence over context values.
        if analysisName is None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName
        if scriptWeight is None:
            self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight(
            )
        else:
            self._scriptWeightDict = scriptWeight
        # Stage weights consumed by the shared progress-message helper.
        self._scriptStages = {
            "anovaNarrativeStart": {
                "summary": "Started The Anova Narratives",
                "weight": 0
            },
            "anovaNarrativeEnd": {
                "summary": "Narratives For Anova Finished",
                "weight": 10
            },
        }
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "anovaNarrativeStart",
            "info",
            display=False,
            emptyBin=False,
            customMsg=None,
            weightKey="narratives")

        self._generate_narratives()

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "anovaNarrativeEnd",
            "info",
            display=False,
            emptyBin=False,
            customMsg=None,
            weightKey="narratives")

        # Only publish the node when at least one card was produced.
        if self._anovaNodes.get_card_count() > 0:
            self._story_narrative.add_a_node(self._anovaNodes)
            self._result_setter.set_anova_node(self._anovaNodes)

    def _generate_narratives(self):
        """Build the "Performance" node with one overview card per measure.

        Populates ``self._anovaNodes`` and ``self.narratives``; delegates
        the per-dimension drill-downs to
        :meth:`_generate_dimension_narratives`.
        """
        try:
            nColsToUse = self._analysisDict[
                self._analysisName]["noOfColumnsToUse"]
        except Exception:
            # Column cap not configured for this analysis; use all columns.
            nColsToUse = None
        self._anovaNodes = NarrativesTree()
        self._anovaNodes.set_name("Performance")
        for measure_column in self._df_anova_result.get_measure_columns():
            measure_anova_result = self._df_anova_result.get_measure_result(
                measure_column)
            significant_dimensions_dict, insignificant_dimensions = measure_anova_result.get_OneWayAnovaSignificantDimensions(
            )
            # Rank significant dimensions by effect size, descending.
            significant_dimensions = [
                k for k, v in sorted(list(significant_dimensions_dict.items()),
                                     key=lambda x: -x[1])
            ]
            if nColsToUse is not None:
                significant_dimensions = significant_dimensions[:nColsToUse]
            num_significant_dimensions = len(significant_dimensions)
            num_insignificant_dimensions = len(insignificant_dimensions)
            print("num_significant_dimensions", num_significant_dimensions)
            if num_significant_dimensions > 0:
                mainCard = NormalCard(name="Overview of Key Factors")
                # Chart rows, ordered strongest effect first.
                data_c3 = []
                for sig_dim in significant_dimensions:
                    data_c3.append({
                        'dimension':
                        sig_dim,
                        'effect_size':
                        float(significant_dimensions_dict[sig_dim])
                    })
                self.narratives = {}
                self.narratives[AnovaNarratives.
                                KEY_HEADING] = "%s Performance Analysis" % (
                                    measure_column, )
                self.narratives['main_card'] = {}
                self.narratives['cards'] = []
                self.narratives['main_card'][
                    AnovaNarratives.
                    KEY_SUBHEADING] = "Relationship between %s and other Dimensions" % (
                        measure_column)
                self.narratives['main_card'][
                    AnovaNarratives.KEY_PARAGRAPH] = []
                # Counts below reflect the (possibly nColsToUse-truncated)
                # significant list, so num_dimensions is recomputed here.
                data_dict = {
                    'significant_dimensions': significant_dimensions,
                    'insignificant_dimensions': insignificant_dimensions,
                    'num_significant_dimensions': num_significant_dimensions,
                    'num_insignificant_dimensions':
                    num_insignificant_dimensions,
                    'num_dimensions':
                    num_significant_dimensions + num_insignificant_dimensions,
                    'target': measure_column
                }
                output = {'header': ''}
                output['content'] = NarrativesUtils.get_template_output(
                    self._base_dir, 'anova_template_1.html', data_dict)
                self.narratives['main_card'][
                    AnovaNarratives.KEY_PARAGRAPH].append(output)
                output1 = {'header': ''}
                output1['content'] = NarrativesUtils.get_template_output(
                    self._base_dir, 'anova_template_2.html', data_dict)
                lines = []
                lines += NarrativesUtils.block_splitter(
                    output['content'], self._blockSplitter)
                data_c3 = NormalChartData(data_c3)
                chart_data = data_c3.get_data()
                chartDataValues = []
                effect_size_values = []
                for obj in chart_data:
                    effect_size_values.append(obj["effect_size"])
                chart_data_min = min(effect_size_values)
                # Tiny effect sizes are passed as strings so the y-axis
                # number-format selection can pick a suitable precision.
                if chart_data_min < 0.00001:
                    for obj in chart_data:
                        chartDataValues.append(str(obj["effect_size"]))
                else:
                    for obj in chart_data:
                        chartDataValues.append(obj["effect_size"])
                chart_json = ChartJson(data=chart_data,
                                       axes={
                                           'x': 'dimension',
                                           'y': 'effect_size'
                                       },
                                       label_text={
                                           'x': '',
                                           'y':
                                           'Effect Size (scaled exp values)'
                                       },
                                       chart_type='bar')
                chart_json.set_axis_rotation(True)
                chart_json.set_yaxis_number_format(
                    NarrativesUtils.select_y_axis_format(chartDataValues))
                # chart_data is sorted by effect size descending, so the
                # first/last entries are the max/min effect dimensions.
                statistical_info_array = [
                    ("Test Type", "ANOVA"),
                    ("Effect Size", "ETA squared"),
                    ("Max Effect Size", chart_data[0]["dimension"]),
                    ("Min Effect Size", chart_data[-1]["dimension"]),
                ]
                # Initializer for the != "" guard below (was a dead
                # misspelled variable, `statistical_inferenc`).
                statistical_inference = ""
                if len(chart_data) == 1:
                    statistical_inference = "{} is the only variable that have significant association with the {} (Target) having an \
                     Effect size of {}".format(
                        chart_data[0]["dimension"],
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["effect_size"], 4))
                elif len(chart_data) == 2:
                    statistical_inference = "There are two variables ({} and {}) that have significant association with the {} (Target) and the \
                     Effect size ranges are {} and {} respectively".format(
                        chart_data[0]["dimension"], chart_data[1]["dimension"],
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["effect_size"], 4),
                        round(chart_data[1]["effect_size"], 4))
                else:
                    statistical_inference = "There are {} variables that have significant association with the {} (Target) and the \
                     Effect size ranges from {} to {}".format(
                        len(chart_data),
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["effect_size"], 4),
                        round(chart_data[-1]["effect_size"], 4))
                if statistical_inference != "":
                    statistical_info_array.append(
                        ("Inference", statistical_inference))
                statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                    statistical_info_array)
                lines += [
                    C3ChartData(data=chart_json, info=statistical_info_array)
                ]
                lines += NarrativesUtils.block_splitter(
                    output1['content'], self._blockSplitter)
                mainCard.set_card_data(lines)
                self._anovaNodes.add_a_card(mainCard)
                self.narratives['main_card'][
                    AnovaNarratives.KEY_PARAGRAPH].append(output1)
                self.narratives['main_card'][AnovaNarratives.KEY_CHART] = {}
                effect_size_chart = {
                    'heading': '',
                    'labels': {
                        'Dimension': 'Effect Size'
                    },
                    'data': significant_dimensions_dict
                }
                print(significant_dimensions_dict)
                self.narratives['main_card'][AnovaNarratives.KEY_CHART][
                    'effect_size'] = effect_size_chart
                progressMessage = CommonUtils.create_progress_message_object(
                    self._analysisName,
                    "custom",
                    "info",
                    "Analyzing Key Drivers",
                    self._completionStatus,
                    self._completionStatus,
                    display=True)
                CommonUtils.save_progress_message(self._messageURL,
                                                  progressMessage,
                                                  ignore=False)
                self._generate_dimension_narratives(significant_dimensions,
                                                    measure_anova_result,
                                                    measure_column)
            else:
                # No significant dimension: emit a single informational card.
                mainCard = NormalCard(name="Overview of Key Factors")
                cardText = HtmlData(
                    "There are no dimensions in the dataset that have significant influence on {}"
                    .format(measure_column))
                mainCard.set_card_data([cardText])
                self._anovaNodes.add_a_card(mainCard)

    def _generate_dimension_narratives(self, significant_dimensions,
                                       measure_anova_result, measure):
        """Create one drill-down narrative node per significant dimension.

        Parameters
        ----------
        significant_dimensions : list of dimension names, strongest first.
        measure_anova_result : per-measure ANOVA result object.
        measure : name of the measure column being analyzed.
        """
        self.narratives['cards'] = []
        anova_trend_result = measure_anova_result.get_trend_data()
        # Placeholder entry keeps downstream consumers' structure intact
        # when nothing was significant.
        if len(significant_dimensions) == 0:
            self.narratives['cards'].append({
                'card1': '',
                'card2': '',
                'card3': ''
            })
        self.narratives['variables'] = significant_dimensions
        for dimension in significant_dimensions:
            dimensionNode = NarrativesTree(name=dimension)
            # OneWayAnovaNarratives populates dimensionNode as a side effect
            # of construction.
            narratives = OneWayAnovaNarratives(self._dataframe_context,
                                               measure, dimension,
                                               measure_anova_result,
                                               anova_trend_result,
                                               self._result_setter,
                                               dimensionNode, self._base_dir)
            self._anovaNodes.add_a_node(dimensionNode)
            self.narratives['cards'].append(narratives)
コード例 #15
0
class TestChiSquare(unittest.TestCase):
    """End-to-end tests for the ChiSquare analysis.

    Expected statistics come from the module-level ``exp_values`` fixture,
    keyed by ``'<target>-<variable>'``. NOTE: the Buyer_Gender entry is
    keyed ``'Price_Range-Buyer-Gender'`` (hyphen) in the fixture.
    """

    # Metric name in exp_values -> accessor on a chi-square result object.
    GETTERS = {
        'pval': 'get_pvalue',
        'effect_size': 'get_effect_size',
        'stats': 'get_stat',
        'v_value': 'get_v_value',
    }

    # Variables crossed against 'Price_Range' by test_chisquare_all.
    ALL_VARIABLES = [
        'Deal_Type', 'Discount_Range', 'Source', 'Platform', 'Buyer_Age',
        'Buyer_Gender', 'Tenure_in_Days', 'Sales', 'Marketing_Cost',
        'Shipping_Cost', 'Last_Transaction'
    ]

    @staticmethod
    def _fixture_key(variable):
        """Map a variable name to its exp_values key (handles the
        historically misspelled Buyer_Gender key)."""
        if variable == 'Buyer_Gender':
            return 'Price_Range-Buyer-Gender'
        return 'Price_Range-' + variable

    def _assert_matches_fixture(self, result, key, places=None):
        """Assert all four statistics of *result* match exp_values[...][key].

        ``places=None`` keeps assertAlmostEqual's default precision (7),
        matching the behaviour of the original per-metric assertions.
        """
        kwargs = {} if places is None else {'places': places}
        for metric, getter in self.GETTERS.items():
            with self.subTest(metric=metric, key=key):
                self.assertAlmostEqual(
                    getattr(result, getter)(), exp_values[metric][key],
                    **kwargs)

    def setUp(self):
        """Create a Spark session, load the chi-square test dataset and run
        the full chi-square suite once for reuse by the individual tests."""
        APP_NAME = "test"
        spark = CommonUtils.get_spark_session(app_name=APP_NAME,
                                              hive_environment=False)
        spark.sparkContext.setLogLevel("ERROR")

        configJson = get_test_configs("testCase", testFor="chisquare")

        config = configJson["config"]
        jobConfig = configJson["job_config"]
        jobType = jobConfig["job_type"]
        jobName = jobConfig["job_name"]
        jobURL = jobConfig["job_url"]
        messageURL = jobConfig["message_url"]
        # Optional config keys: fall back to None when absent
        # (previously fetched under a bare except).
        errorURL = jobConfig.get("error_reporting_url")
        appid = jobConfig.get("app_id")
        debugMode = True
        LOGGER = {}

        # Was constructed and parsed twice; once is sufficient.
        configJsonObj = configparser.ParserConfig(config)
        configJsonObj.set_json_params()

        dataframe_context = ContextSetter(configJsonObj)
        # job_type must be set before set_params() on the context.
        dataframe_context.set_job_type(jobType)
        dataframe_context.set_params()
        dataframe_context.set_message_url(messageURL)
        dataframe_context.set_app_id(appid)
        dataframe_context.set_debug_mode(debugMode)
        dataframe_context.set_job_url(jobURL)
        dataframe_context.set_app_name(APP_NAME)
        dataframe_context.set_error_url(errorURL)
        dataframe_context.set_logger(LOGGER)
        dataframe_context.set_xml_url(jobConfig["xml_url"])
        dataframe_context.set_job_name(jobName)
        dataframe_context.set_environment("debugMode")
        dataframe_context.set_message_ignore(True)
        dataframe_context.set_analysis_name("Descriptive analysis")

        df = MasterHelper.load_dataset(spark, dataframe_context)
        metaParserInstance = MasterHelper.get_metadata(df, spark,
                                                       dataframe_context, None)
        df, df_helper = MasterHelper.set_dataframe_helper(
            df, dataframe_context, metaParserInstance)
        targetVal = dataframe_context.get_result_column()

        self.result_setter = ResultSetter(dataframe_context)
        self.story_narrative = NarrativesTree()
        self.story_narrative.set_name(
            "{} Performance Report".format(targetVal))
        self.data_frame = df
        self.df_helper = df_helper
        self.df_context = dataframe_context
        self.meta_parser = metaParserInstance
        self.base_dir = "/chisquare/"
        self.significant_variables = [
            'Buyer_Gender', 'Sales', 'Discount_Range', 'Shipping_Cost',
            'Last_Transaction', 'Marketing_Cost'
        ]
        # NOTE: was first assigned from df_helper.get_numeric_columns() and
        # immediately overwritten; the hard-coded list is what the tests use.
        self.measure_columns = [
            'Tenure_in_Days', 'Sales', 'Marketing_Cost', 'Shipping_Cost',
            'Last_Transaction'
        ]
        self.df_chisquare_obj = ChiSquare(
            self.data_frame, self.df_helper, self.df_context,
            self.meta_parser).test_all(
                dimension_columns=(self.df_context.get_result_column(), ))
        self.df_chisquare_result = self.df_chisquare_obj.get_result()
        self.num_analysed_variables = 11

    def test_chisquare_dimension(self):
        """Single dimension-vs-dimension test: Price_Range x Source."""
        result = ChiSquare(self.data_frame, self.df_helper, self.df_context,
                           self.meta_parser).test_dimension(
                               'Price_Range', 'Source')
        self._assert_matches_fixture(result, 'Price_Range-Source', places=5)

    def test_chisquare_measure(self):
        """Single dimension-vs-measure test: Price_Range x Marketing_Cost."""
        result = ChiSquare(self.data_frame, self.df_helper, self.df_context,
                           self.meta_parser).test_measures(
                               'Price_Range', 'Marketing_Cost')
        self._assert_matches_fixture(result, 'Price_Range-Marketing_Cost',
                                     places=5)

    def test_chisquare_all(self):
        """All four statistics for every Price_Range pairing.

        Same checks as before, collapsed from 44 copy-pasted assertions
        (default assertAlmostEqual precision).
        """
        for variable in self.ALL_VARIABLES:
            result = self.df_chisquare_obj.get_chisquare_result(
                'Price_Range', variable)
            self._assert_matches_fixture(result, self._fixture_key(variable))

    def test_chisquare_analysis(self):
        """Narrative generation for Price_Range x Buyer_Gender."""
        target_chisquare_result = self.df_chisquare_result['Price_Range']
        chisquare_result = self.df_chisquare_obj.get_chisquare_result(
            'Price_Range', 'Buyer_Gender')
        out = ChiSquareAnalysis(
            self.df_context, self.df_helper, chisquare_result, 'Price_Range',
            'Buyer_Gender', self.significant_variables,
            self.num_analysed_variables, self.data_frame, self.measure_columns,
            self.base_dir, None,
            target_chisquare_result)._generate_narratives()

        self.assertEqual(out['data_dict'], exp_data_dict)
        # FIXME: the three assertions below compare values to themselves and
        # can never fail; they should compare against an expected fixture
        # (kept as-is because the intended fixture is unknown here).
        self.assertEqual(out['target_dict']['11 to 50'],
                         out['target_dict']['11 to 50'])
        self.assertEqual(out['target_dict']['101 to 500'],
                         out['target_dict']['101 to 500'])
        self.assertEqual(out['target_dict']['0 to 10'],
                         out['target_dict']['0 to 10'])
Code example #16
0
class BusinessCard(object):
    """
    Builds the business-impact ("Impact") overview card: headline metrics
    about the completed analysis (data points, queries, analyses, pages,
    time saved, productivity multiplier) derived from the story result.
    """
    def __init__(self, story_result, meta_parser, result_setter,
                 dataframe_context, dataframe_helper, start_time,
                 analysis_type):
        # Serialized narrative tree produced by the analysis run.
        self._story_result = story_result
        self._meta_parser = meta_parser
        self._result_setter = result_setter
        self._dataframe_context = dataframe_context
        self._dataframe_helper = dataframe_helper
        self.subheader = "Impact"
        # Card populated in Run() with all business-impact content.
        self.business_card1 = NormalCard()
        self.business_card1.set_card_name("Overview")
        self.businessCardData = []
        # Wall-clock start of the analysis; used to compute time_mAdvisor.
        self.start_time = start_time
        # Either 'dimension' or 'measure' (see set_params branches).
        self.analysis_type = analysis_type

    def set_params(self):
        """Compute all derived metrics; called from Run() before the card is
        assembled.

        Ordering matters: get_number_prediction_rules() and
        get_number_analysis() populate state (``number_prediction_rules``,
        ``number_analysis_dict``) that get_number_queries(),
        get_time_analyst() and the time/impact getters then consume.
        """
        self.target_levels = self._dataframe_helper.get_num_unique_values(
            self._dataframe_context.get_result_column())
        self.number_variables = self.get_number_variables()
        self.number_measures = self.get_number_measures()
        self.number_dimensions = self.get_number_dimensions()
        # Analyses included depend on the target type:
        # categorical ('dimension') vs numeric ('measure').
        if self.analysis_type == 'dimension':
            self.analysis_list = [
                "overview_rules", "association_summary", "association_rules",
                "prediction_rules"
            ]
        elif self.analysis_type == 'measure':
            self.analysis_list = [
                "overview_rules", "performance_summary", "performance_rules",
                "influencers_summary", "influencers_rules", "prediction_rules"
            ]
        self.data_points = self.get_number_data_points()
        self.number_charts = self.get_number_charts()
        self.number_prediction_rules = self.get_number_prediction_rules()
        self.number_pages = self.get_number_pages()
        self.number_analysis = self.get_number_analysis()
        self.number_queries = self.get_number_queries()
        # Elapsed seconds of the actual run vs the analyst-time estimate.
        self.time_mAdvisor = time.time() - self.start_time
        self.time_analyst = self.get_time_analyst()
        self.time_saved = self.get_time_saved()
        self.impact_on_productivity = self.get_impact_on_productivity()

    def get_number_charts(self):
        return json.dumps(self._story_result, indent=2).count("c3Chart")

    def get_number_analysis(self):
        """Estimate the number of statistical analyses performed and build
        ``self.number_analysis_dict`` (consumed by get_number_queries and
        get_time_analyst).

        Walks the story-result nodes for 'Key Drivers' (dimension target) or
        'Performance' (measure target), estimating levels per significant
        variable: the actual unique-value count for string columns, a flat 5
        otherwise (presumably 5 bins for binned measures — TODO confirm).
        """
        if self.analysis_type == 'dimension':
            # Sentinel entry so sum() works even when no node matches.
            significant_variables_levels = {"None": 0}
            for each in self._story_result['listOfNodes']:
                try:
                    if each['name'] == 'Key Drivers':
                        for node in each['listOfNodes']:
                            significant_variables_levels[node['name']] = [
                                self._meta_parser.get_num_unique_values(
                                    node['name']) if node['name']
                                in self._dataframe_helper.get_string_columns()
                                else 5
                            ][0]
                except:  # noqa: E722 -- fallback for 'maxdepth*'-keyed nodes
                    for key in each.keys():
                        if not key.startswith('maxdepth'):
                            if each['name'] == 'Key Drivers':
                                for node in each['listOfNodes']:
                                    significant_variables_levels[
                                        node['name']] = [
                                            self._meta_parser.
                                            get_num_unique_values(node['name'])
                                            if node['name']
                                            in self._dataframe_helper.
                                            get_string_columns() else 5
                                        ][0]
            self.number_analysis_dict = {}
            # Hard-coded multipliers: presumably analyses per level/variable;
            # TODO confirm the source of these weights.
            self.number_analysis_dict[
                "overview_rules"] = self.target_levels * 2
            self.number_analysis_dict['association_summary'] = (
                self.number_dimensions + self.number_measures) * 2
            self.number_analysis_dict["association_rules"] = sum(
                significant_variables_levels.values()) * 6
            self.number_analysis_dict[
                "prediction_rules"] = self.number_prediction_rules * 5
            return sum(self.number_analysis_dict.values())
        elif self.analysis_type == 'measure':
            significant_variables_levels = {"None": 0}
            for each in self._story_result['listOfNodes']:
                if each['name'] == 'Performance':
                    for node in each['listOfNodes']:
                        significant_variables_levels[node['name']] = [
                            self._dataframe_helper.get_num_unique_values(
                                node['name']) if node['name']
                            in self._dataframe_helper.get_string_columns() else
                            5
                        ][0]
            self.number_analysis_dict = {}
            self.number_analysis_dict[
                "overview_rules"] = self.target_levels * 2
            self.number_analysis_dict["performance_summary"] = (
                self.number_dimensions + self.number_measures) * 2
            self.number_analysis_dict["performance_rules"] = sum(
                significant_variables_levels.values()) * 6
            self.number_analysis_dict[
                "prediction_rules"] = self.number_prediction_rules * 5
            self.number_analysis_dict[
                "influencers_summary"] = self.number_measures * 2
            self.number_analysis_dict["influencers_rules"] = 8
            return sum(self.number_analysis_dict.values())
        # NOTE(review): implicitly returns None for any other analysis_type.

    def get_number_queries(self):
        if self.analysis_type == 'dimension':
            queries_per_analysis_dict = {
                "overview_rules": 15,
                "association_summary": 120,
                "association_rules": 600,
                "prediction_rules": 200
            }
        elif self.analysis_type == 'measure':
            queries_per_analysis_dict = {
                "overview_rules": 15,
                "performance_summary": 120,
                "performance_rules": 600,
                "influencers_summary": 100,
                "influencers_rules": 80,
                "prediction_rules": 200
            }
        sum = 0
        for analysis in self.analysis_list:
            sum += self.number_analysis_dict[
                analysis] * queries_per_analysis_dict[analysis]
        return sum

    def get_number_prediction_rules(self):
        """Count rows across 'Prediction' rule tables in the story result.

        Two node shapes are handled: flat nodes with 'name'/'listOfCards',
        and depth-keyed nodes ('maxdepth3'..'maxdepth5'); the latter path is
        reached via the broad except when the flat lookup raises.
        """
        num_prediction_rules = 0
        for each_node in self._story_result['listOfNodes']:
            try:
                if each_node['name'] == 'Prediction':
                    for card in each_node['listOfCards'][0]['cardData']:
                        if card['dataType'] == 'table':
                            # NOTE(review): '=' (not '+='), so on this path
                            # only the last table's row count is kept.
                            num_prediction_rules = len(
                                card['data']['tableData'])
            except:  # noqa: E722 -- node keyed by 'maxdepth*' instead of 'name'
                for key in each_node.keys():
                    if key.startswith('maxdepth'):
                        if each_node['maxdepth3'][
                                'name'] == 'Prediction' or each_node[
                                    'maxdepth4'][
                                        'name'] == 'Prediction' or each_node[
                                            'maxdepth5'][
                                                'name'] == 'Prediction':
                            for Depth in range(3, 6):
                                for card in each_node['maxdepth' + str(
                                        Depth)]['listOfCards'][0]['cardData']:
                                    if card['dataType'] == 'table':
                                        num_prediction_rules += len(
                                            card['data']['tableData'])
        return num_prediction_rules

    def get_number_pages(self):
        """Total number of cards ("pages") across the story-result tree.

        Handles flat nodes ('listOfNodes'/'listOfCards') and depth-keyed
        nodes ('maxdepth3'..'maxdepth5', via the broad except). Note: the
        local ``sum`` shadows the builtin within this method.
        """
        sum = 0
        for each in self._story_result['listOfNodes']:
            try:
                if each['listOfNodes']:
                    # Cards on child nodes plus the node's own cards.
                    for items in each['listOfNodes']:
                        sum += len(items['listOfCards'])
                    sum += len(each['listOfCards'])
                else:
                    sum += len(each['listOfCards'])
            except:  # noqa: E722 -- node keyed by 'maxdepth*'
                for key in each.keys():
                    if key.startswith('maxdepth'):
                        if each['maxdepth3']['listOfNodes'] or each[
                                'maxdepth4']['listOfNodes'] or each[
                                    'maxdepth5']['listOfNodes']:
                            for Depth in range(3, 6):
                                for items in each['maxdepth' +
                                                  str(Depth)]['listOfNodes']:
                                    sum += len(
                                        items['maxdepth' +
                                              str(Depth)]['listOfCards'])
                                sum += len(each['maxdepth' +
                                                str(Depth)]['listOfCards'])
                        else:
                            for Depth in range(3, 6):
                                sum += len(each['maxdepth' +
                                                str(Depth)]['listOfCards'])
        return sum

    def get_number_data_points(self):
        return self._meta_parser.get_num_rows(
        ) * self._meta_parser.get_num_columns()

    def get_number_variables(self):
        return self._meta_parser.get_num_columns()

    def get_number_dimensions(self):
        self.number_dimensions = len(
            self._dataframe_helper.get_string_columns())
        return self.number_dimensions

    def get_number_measures(self):
        self.number_measures = len(
            self._dataframe_helper.get_numeric_columns())
        return self.number_measures

    def get_time_analyst(self):
        if self.analysis_type == 'dimension':
            time_per_analysis_dict = {
                "overview_rules": 10,
                "association_summary": 120,
                "association_rules": 180,
                "prediction_rules": 300
            }
        elif self.analysis_type == 'measure':
            time_per_analysis_dict = {
                "overview_rules": 10,
                "performance_summary": 120,
                "performance_rules": 180,
                "influencers_summary": 120,
                "influencers_rules": 180,
                "prediction_rules": 300
            }
        sum = 0
        for analysis in self.analysis_list:
            sum += self.number_analysis_dict[
                analysis] * time_per_analysis_dict[analysis]
        return sum

    def get_time_saved(self):
        '''
        Total Time Saved - 21 Hrs ( Productitvity Gain = Time taken by data scientist - time taken by mAdvisor)
        '''
        return self.time_analyst - self.time_mAdvisor

    def get_impact_on_productivity(self):
        '''
        Impact on Productivity - 3.5 X  ( Impact on Productivity = Time taken by data scientist / time taken by mAdvisor)
        '''
        productivity = str(
            round(old_div(self.time_analyst, self.time_mAdvisor), 1)) + "X"
        return productivity

    def get_summary_data(self):
        """Assemble the headline impact metrics into a DataBox and append it
        to ``self.businessCardData``. Returns None."""
        metric_pairs = [
            ("Total Data Points", str(self.data_points)),
            ("Number of Queries", str(self.number_queries)),
            ("Number of Analysis", str(self.number_analysis)),
            ("Total Pages", str(self.number_pages)),
            ("Total Time Saved", CommonUtils.humanize_time(self.time_saved)),
            ("Impact on Productivity", str(self.impact_on_productivity)),
        ]
        summaryData = [{"name": name, "value": value}
                       for name, value in metric_pairs]
        summaryDataClass = DataBox(data=summaryData)
        self.businessCardData.append(summaryDataClass)

    def get_summary_para(self):
        """Append an HTML block comparing analyst vs mAdvisor time to
        ``self.businessCardData``. Returns None."""
        # Congratulatory paragraph with the headline numbers.
        para_normal = """<blockquote><p>
        <b>Great Job !!!</b> You have analysed the dataset that contains {} variables after executing about <b>{}</b> analytics queries and <b>{}</b> Statistical and ML analysis in parallel. Using mAdvisor, you have completed the analysis within <b>{}</b> which would have required around <b>{}</b>.
        </p></blockquote>
        """.format(self.number_variables, self.number_queries,
                   self.number_analysis,
                   CommonUtils.humanize_time(self.time_mAdvisor),
                   CommonUtils.humanize_time(self.time_analyst))

        # Side-by-side "Data Analyst" vs "mAdvisor" time blocks.
        para_images = """<div class="col-md-6">
            <div class="d_analyst_block">
                <span class="d_analyst_img"></span>
                <h1 class="pull-left xs-mt-40 xs-ml-10">
                    <small>Data Analyst <span class="bImpact_time_icon xs-ml-10"></span></small>
                    <br>
                    <small>{}</small>
                </h1>
            </div>
        </div>
        <div class="col-md-6">
            <div class="d_m_block">
                <span class="d_m_img"></span>
                <h1 class="pull-left xs-mt-40 xs-ml-10"><span class="bImpact_time_icon"></span><br>
                    <small>{}</small>
                </h1>
            </div>
        </div>
        <div class="clearfix xs-m-50"></div>

           """.format(CommonUtils.humanize_time(self.time_analyst),
                      CommonUtils.humanize_time(self.time_mAdvisor))

        # Images first, then the paragraph, inside one centered row.
        para_concatinated = """
        <div class="row">
            <div class="col-md-8 col-md-offset-2 xs-mt-20">
                {}{}
            </div>
        </div>
        """.format(para_images, para_normal)

        paraDataClass = HtmlData(data=para_concatinated)
        self.businessCardData.append(paraDataClass)

    def Run(self):
        """Build the Impact node: compute metrics, assemble the overview card
        and register the node with the result setter."""
        print("In Run of BusinessCard")
        self._businessImpactNode = NarrativesTree()
        self._businessImpactNode.set_name("Impact")

        # Must run first: populates every metric the card readers consume.
        self.set_params()

        # Both calls append to self.businessCardData as a side effect and
        # return None; the assignments are unused.
        summary = self.get_summary_data()
        summary_para = self.get_summary_para()

        self.business_card1.set_card_data(self.businessCardData)
        self._businessImpactNode.add_a_card(self.business_card1)
        self._result_setter.set_business_impact_node(self._businessImpactNode)