Ejemplo n.º 1
0
    def Predict(self):
        """Score the input data with the trained sklearn model and publish results.

        The method runs these steps in order:

        1. Reports the "initialization" stage.
        2. Loads the pickled model, builds the feature frame through
           ``MLUtils.create_dummy_columns`` / ``MLUtils.fill_missing_columns``
           and predicts a class and a probability for every row.
        3. Writes the scored data to ``<score_path>/data.csv`` and, when a uid
           column is available, stores a table with the five most probable
           rows of each predicted class on the result setter.
        4. Reports the "prediction" stage.
        5. Drops the columns that are not part of the scored story, updates the
           result column metadata and converts the frame to a Spark DataFrame.
        6. With two or more predicted classes, runs the decision-tree analysis
           and its narrative; otherwise builds a single summary card with a
           donut chart of the class counts.

        Raises:
            NotImplementedError: if ``self._mlEnv`` is not ``"sklearn"``. The
                original code fell through and failed later on unbound locals.
        """
        self._scriptWeightDict = (
            self._dataframe_context.get_ml_model_prediction_weight())
        self._scriptStages = {
            "initialization": {
                "summary": "Initialized the Random Forest Scripts",
                "weight": 2
            },
            "prediction": {
                "summary": "Random Forest Model Prediction Finished",
                "weight": 2
            },
            "frequency": {
                "summary": "descriptive analysis finished",
                "weight": 2
            },
            "chisquare": {
                "summary": "chi Square analysis finished",
                "weight": 4
            },
            "completion": {
                "summary": "all analysis finished",
                "weight": 4
            },
        }
        self._report_prediction_stage("initialization")

        # Categorical features exclude the uid column (when it is flagged as
        # ignored) and every date column.
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()
        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})
        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns) - set(allDateCols))
        result_column = self._dataframe_context.get_result_column()

        if self._mlEnv != "sklearn":
            # Only the sklearn path produces `df` and the predictions that
            # everything below depends on.
            raise NotImplementedError(
                "Predict only supports the 'sklearn' environment, got {!r}".
                format(self._mlEnv))

        # Paths starting with "file" lose their first 7 characters (the length
        # of "file://") so that local file APIs can open them.
        score_data_path = self._dataframe_context.get_score_path(
        ) + "/data.csv"
        if score_data_path.startswith("file"):
            score_data_path = score_data_path[7:]
        trained_model_path = self._dataframe_context.get_model_path()
        trained_model_path += (
            "/" + self._dataframe_context.get_model_for_scoring() + ".pkl")
        if trained_model_path.startswith("file"):
            trained_model_path = trained_model_path[7:]

        # SECURITY: joblib.load unpickles the file, which can execute arbitrary
        # code. The model path must come from a trusted location.
        trained_model = joblib.load(trained_model_path)
        df = self._data_frame.toPandas()
        model_columns = self._dataframe_context.get_model_features()
        pandas_df = MLUtils.create_dummy_columns(
            df, [x for x in categorical_columns if x != result_column])
        pandas_df = MLUtils.fill_missing_columns(pandas_df, model_columns,
                                                 result_column)
        if uid_col:
            pandas_df = pandas_df[[
                x for x in pandas_df.columns if x != uid_col
            ]]
        y_score = trained_model.predict(pandas_df)
        y_prob = trained_model.predict_proba(pandas_df)
        y_prob = MLUtils.calculate_predicted_probability(y_prob)
        y_prob = [round(x, 2) for x in y_prob]

        # Attach predictions, translating model labels through the label map.
        df["predicted_class"] = y_score
        labelMappingDict = self._dataframe_context.get_label_map()
        df["predicted_class"] = df["predicted_class"].apply(
            lambda x: labelMappingDict[x] if x is not None else "NA")
        df["predicted_probability"] = y_prob
        self._score_summary[
            "prediction_split"] = MLUtils.calculate_scored_probability_stats(
                df)
        self._score_summary["result_column"] = result_column
        # The predicted class replaces any actual result column in the output.
        if result_column in df.columns:
            df.drop(result_column, axis=1, inplace=True)
        df = df.rename(index=str, columns={"predicted_class": result_column})
        df.to_csv(score_data_path, header=True, index=False)

        # Fall back to the first suggested uid column when none is configured.
        uidCol = uid_col
        if uidCol is None:
            uidCols = self._metaParser.get_suggested_uid_columns()
            if len(uidCols) > 0:
                uidCol = uidCols[0]
        predictedClasses = list(df[result_column].unique())
        if uidCol and uidCol in df.columns:
            topRows = []
            for level in predictedClasses:
                # Build an independent top-5 frame per class; working on a copy
                # avoids mutating a slice of `df`.
                levelDf = df.loc[df[result_column] == level,
                                 [uidCol, "predicted_probability",
                                  result_column]]
                levelDf = levelDf.sort_values(by="predicted_probability",
                                              ascending=False).head(5).copy()
                levelDf["predicted_probability"] = levelDf[
                    "predicted_probability"].apply(
                        lambda x: self._format_percentage(x * 100))
                topRows.append(levelDf)
            # pd.concat raises on an empty list, so skip the table when there
            # are no predicted classes at all.
            if topRows:
                uidTableData = pd.concat(topRows)
                uidTableData = [list(arr) for arr in uidTableData.values]
                uidTableData = [[uidCol, "Probability", result_column]
                                ] + uidTableData
                uidTable = TableData()
                uidTable.set_table_width(25)
                uidTable.set_table_data(uidTableData)
                uidTable.set_table_type("normalHideColumn")
                self._result_setter.set_unique_identifier_table(
                    json.loads(
                        CommonUtils.convert_python_object_to_json(uidTable)))

        self._report_prediction_stage("prediction")

        print("STARTING DIMENSION ANALYSIS ...")
        # Keep only the configured columns; with no configuration, drop just
        # the probability column. The result column is never dropped.
        columns_to_keep = self._dataframe_context.get_score_consider_columns()
        if len(columns_to_keep) > 0:
            columns_to_drop = list(set(df.columns) - set(columns_to_keep))
        else:
            columns_to_drop = ["predicted_probability"]
        columns_to_drop = [
            x for x in columns_to_drop
            if x in df.columns and x != result_column
        ]
        print("columns_to_drop", columns_to_drop)
        df.drop(columns_to_drop, axis=1, inplace=True)

        resultColLevelCount = dict(df[result_column].value_counts())
        self._metaParser.update_column_dict(
            result_column, {
                "LevelCount": resultColLevelCount,
                "numberOfUniqueValues": len(list(resultColLevelCount.keys()))
            })
        self._dataframe_context.set_story_on_scored_data(True)
        SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                            sparkSession=self._spark)
        spark_scored_df = SQLctx.createDataFrame(df)
        # TODO update metadata for the newly created dataframe
        self._dataframe_context.update_consider_columns(columns_to_keep)
        df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context,
                                    self._metaParser)
        df_helper.set_params()
        spark_scored_df = df_helper.get_data_frame()

        if len(predictedClasses) >= 2:
            # Best effort: a failing decision-tree analysis must not abort the
            # scoring run, but the reason is reported instead of being hidden.
            try:
                df_decision_tree_obj = DecisionTrees(
                    spark_scored_df,
                    df_helper,
                    self._dataframe_context,
                    self._spark,
                    self._metaParser,
                    scriptWeight=self._scriptWeightDict,
                    analysisName=self._analysisName).test_all(
                        dimension_columns=[result_column])
                narratives_obj = CommonUtils.as_dict(
                    DecisionTreeNarrative(result_column,
                                          df_decision_tree_obj,
                                          self._dataframe_helper,
                                          self._dataframe_context,
                                          self._metaParser,
                                          self._result_setter,
                                          story_narrative=None,
                                          analysisName=self._analysisName,
                                          scriptWeight=self._scriptWeightDict))
                print(narratives_obj)
            except Exception as err:
                print("DecisionTree Analysis Failed ", err)
        else:
            data_dict = {
                "npred": len(predictedClasses),
                "nactual": len(list(labelMappingDict.values()))
            }
            if data_dict["nactual"] > 2:
                # Collapse everything except the predicted class into "Others".
                # The dict must be created here: it was previously written to
                # before being bound.
                levelCountDict = {
                    predictedClasses[0]:
                    resultColLevelCount[predictedClasses[0]],
                    "Others":
                    sum([
                        v for k, v in list(resultColLevelCount.items())
                        if k != predictedClasses[0]
                    ]),
                }
            else:
                # Add one class that was never predicted with a zero count so
                # the summary has two levels to compare.
                levelCountDict = resultColLevelCount
                otherClass = list(
                    set(labelMappingDict.values()) - set(predictedClasses))[0]
                levelCountDict[otherClass] = 0

                print(levelCountDict)

            total = float(
                sum([
                    x for x in list(levelCountDict.values()) if x is not None
                ]))
            levelCountTuple = [{
                "name": k,
                "count": v,
                "percentage": self._format_percentage(old_div(v * 100, total))
            } for k, v in list(levelCountDict.items()) if v is not None]
            levelCountTuple = sorted(levelCountTuple,
                                     key=lambda x: x["count"],
                                     reverse=True)
            data_dict["blockSplitter"] = "|~NEWBLOCK~|"
            data_dict["targetcol"] = result_column
            data_dict["nlevel"] = len(list(levelCountDict.keys()))
            data_dict["topLevel"] = levelCountTuple[0]
            data_dict["secondLevel"] = levelCountTuple[1]
            maincardSummary = NarrativesUtils.get_template_output(
                "/apps/", 'scorewithoutdtree.html', data_dict)

            main_card = NormalCard()
            main_card_data = []
            main_card_narrative = NarrativesUtils.block_splitter(
                maincardSummary, "|~NEWBLOCK~|")
            main_card_data += main_card_narrative

            chartData = NormalChartData([levelCountDict]).get_data()
            chartJson = ChartJson(data=chartData)
            chartJson.set_title(result_column)
            chartJson.set_chart_type("donut")
            mainCardChart = C3ChartData(data=chartJson)
            mainCardChart.set_width_percent(33)
            main_card_data.append(mainCardChart)

            uidTable = self._result_setter.get_unique_identifier_table()
            if uidTable is not None:
                main_card_data.append(uidTable)
            main_card.set_card_data(main_card_data)
            main_card.set_card_name(
                "Predicting Key Drivers of {}".format(result_column))
            self._result_setter.set_score_dtree_cards([main_card], {})

    def _report_prediction_stage(self, stage):
        """Add the weight of `stage` to the completion status and publish it.

        `stage` must be a key of ``self._scriptStages``. The progress message
        is saved to ``self._messageURL`` and the new completion status is
        stored on the dataframe context.
        """
        self._completionStatus += old_div(
            self._scriptWeightDict[self._analysisName]["total"] *
            self._scriptStages[stage]["weight"], 10)
        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName, stage, "info",
            self._scriptStages[stage]["summary"], self._completionStatus,
            self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsg)
        self._dataframe_context.update_completion_status(
            self._completionStatus)

    @staticmethod
    def _format_percentage(value):
        """Return `value` (already on a 0-100 scale) as text ending in "%".

        Values of 10 or more are passed through ``humanize.apnumber``; smaller
        values are truncated to an integer.
        """
        if value >= 10:
            return humanize.apnumber(value) + "%"
        return str(int(value)) + "%"
Ejemplo n.º 2
0
    def generate_narratives(self):
        """Build the regression narratives and attach them to the story tree.

        A "Key Influencers" main card (template text plus a bar chart of the
        coefficients in ``self._all_coeffs``) is added to
        ``self._regressionNode``. For every column in
        ``self.significant_measures`` a child node is then created holding an
        impact card and, when ``self._run_dimension_level_regression`` is set,
        a "Key Areas where it Matters" card. The card4 data of the first
        measure is pushed to the executive summary. Finally the regression node
        is added to ``self._story_narrative``.

        The legacy dictionaries ``self.narratives['main_card']`` and
        ``self.narratives['cards']`` are filled alongside the card objects.
        """
        regression_narrative_obj = LinearRegressionNarrative(
            self._df_regression_result,
            self._correlations,
            self._dataframe_helper,
            self._dataframe_context,
            self._metaParser,
            self._spark,
        )
        main_card_data = regression_narrative_obj.generate_main_card_data()
        main_card_narrative = NarrativesUtils.get_template_output(
            self._base_dir, 'regression_main_card.html', main_card_data)

        measure_names = [name for name, _ in self._all_coeffs]
        coefficients = [stats['coefficient'] for _, stats in self._all_coeffs]
        y_axis_label = 'Change in ' + self.result_column + ' per unit increase'

        self.narratives['main_card'] = {}
        self.narratives["main_card"]['paragraphs'] = (
            NarrativesUtils.paragraph_splitter(main_card_narrative))
        self.narratives["main_card"]['header'] = (
            'Key Measures that affect ' + self.result_column)
        self.narratives["main_card"]['chart'] = {
            'heading': '',
            'data': [measure_names, coefficients],
            'label': {'x': 'Measure Name', 'y': y_axis_label},
        }

        main_card = NormalCard()
        main_card_header = HtmlData(
            data='<h3>Key Measures that affect ' + self.result_column + "</h3>")
        main_card_paragraphs = NarrativesUtils.block_splitter(
            main_card_narrative, self._blockSplitter)
        main_card_chart_data = [
            {"key": name, "value": coefficient}
            for name, coefficient in zip(measure_names, coefficients)
        ]
        main_card_chart = NormalChartData(data=main_card_chart_data)
        mainCardChartJson = ChartJson()
        mainCardChartJson.set_data(main_card_chart.get_data())
        mainCardChartJson.set_label_text(
            {'x': 'Influencing Factors', 'y': y_axis_label})
        mainCardChartJson.set_chart_type("bar")
        mainCardChartJson.set_axes({"x": "key", "y": "value"})
        mainCardChartJson.set_yaxis_number_format(".2f")

        # Largest coefficient first, so [0] is the max and [-1] the min effect.
        chart_data = sorted(main_card_chart_data,
                            key=lambda x: x["value"],
                            reverse=True)
        statistical_info_array = [
            ("Test Type", "Regression"),
            ("Effect Size", "Coefficients"),
        ]
        # With no coefficients there is no max/min effect and no inference to
        # report; indexing chart_data would raise IndexError.
        if chart_data:
            target = self._dataframe_context.get_result_column()
            statistical_info_array += [
                ("Max Effect Size", chart_data[0]["key"]),
                ("Min Effect Size", chart_data[-1]["key"]),
            ]
            # The sentences are built with implicit concatenation; the former
            # backslash continuations embedded the source indentation in the
            # emitted text.
            if len(chart_data) == 1:
                statistical_inference = (
                    "{} is the only variable that have significant influence "
                    "over {} (Target) having an Effect size of {}").format(
                        chart_data[0]["key"], target,
                        round(chart_data[0]["value"], 4))
            elif len(chart_data) == 2:
                statistical_inference = (
                    "There are two variables ({} and {}) that have significant "
                    "influence over {} (Target) and the Effect size ranges are "
                    "{} and {} respectively").format(
                        chart_data[0]["key"], chart_data[1]["key"], target,
                        round(chart_data[0]["value"], 4),
                        round(chart_data[1]["value"], 4))
            else:
                statistical_inference = (
                    "There are {} variables that have significant influence "
                    "over {} (Target) and the Effect size ranges from {} to {}"
                ).format(len(chart_data), target,
                         round(chart_data[0]["value"], 4),
                         round(chart_data[-1]["value"], 4))
            statistical_info_array.append(("Inference", statistical_inference))
        statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
            statistical_info_array)
        main_card.set_card_data(
            data=[main_card_header] + main_card_paragraphs +
            [C3ChartData(data=mainCardChartJson, info=statistical_info_array)])
        main_card.set_card_name("Key Influencers")
        self._regressionNode.add_a_card(main_card)

        for count, measure_column in enumerate(self.significant_measures):
            sigMeasureNode = NarrativesTree()
            sigMeasureNode.set_name(measure_column)
            measureCard1 = NormalCard()
            measureCard1.set_card_name("{}: Impact on {}".format(
                measure_column, self.result_column))
            measureCard1Data = []
            if self._run_dimension_level_regression:
                measureCard2 = NormalCard()
                measureCard2.set_card_name("Key Areas where it Matters")
                measureCard2Data = []

            # --- Card 1: impact of the measure on the result column ---
            measure_column_cards = {}
            card1data = regression_narrative_obj.generate_card1_data(
                measure_column)
            card1heading = ("<h3>Impact of " + measure_column + " on " +
                            self.result_column + "</h3>")
            card1data.update({"blockSplitter": self._blockSplitter})
            card1narrative = NarrativesUtils.get_template_output(
                self._base_dir, 'regression_card1.html', card1data)
            card1paragraphs = NarrativesUtils.block_splitter(
                card1narrative, self._blockSplitter)
            card0 = {
                "paragraphs": card1paragraphs,
                "charts": {"chart2": {}, "chart1": {}},
                "heading": card1heading,
            }
            measure_column_cards['card0'] = card0

            measureCard1Data += [HtmlData(data=card1heading)]
            measureCard1Data += card1paragraphs

            # --- Card 2 (optional): dimension levels where the measure matters ---
            if self._run_dimension_level_regression:
                print("running narratives for key area dict")
                # NOTE(review): this call takes no per-measure argument yet runs
                # once per measure; presumably it could be hoisted out of the
                # loop - confirm it keeps no per-iteration state first.
                self._dim_regression = self.run_regression_for_dimension_levels()
                card2table, card2data = regression_narrative_obj.generate_card2_data(
                    measure_column, self._dim_regression)
                card2data.update({"blockSplitter": self._blockSplitter})
                card2narrative = NarrativesUtils.get_template_output(
                    self._base_dir, 'regression_card2.html', card2data)
                card2paragraphs = NarrativesUtils.block_splitter(
                    card2narrative, self._blockSplitter)

                measure_column_cards['card1'] = {
                    'tables': card2table,
                    'paragraphs': card2paragraphs,
                    'heading': 'Key Areas where ' + measure_column + ' matters',
                }

                measureCard2Data += card2paragraphs
                if "table1" in card2table:
                    table1data = regression_narrative_obj.convert_table_data(
                        card2table["table1"])
                    card2Table1 = TableData()
                    card2Table1.set_table_data(table1data)
                    card2Table1.set_table_type("heatMap")
                    card2Table1.set_table_top_header(
                        card2table["table1"]["heading"])
                    card2Table1Json = json.loads(
                        CommonUtils.convert_python_object_to_json(card2Table1))
                    # The first heat map goes at index 3 of the card data.
                    measureCard2Data.insert(3, card2Table1Json)

                if "table2" in card2table:
                    table2data = regression_narrative_obj.convert_table_data(
                        card2table["table2"])
                    card2Table2 = TableData()
                    card2Table2.set_table_data(table2data)
                    card2Table2.set_table_type("heatMap")
                    card2Table2.set_table_top_header(
                        card2table["table2"]["heading"])
                    card2Table2Json = json.loads(
                        CommonUtils.convert_python_object_to_json(card2Table2))
                    measureCard2Data.append(card2Table2Json)

            # --- Card 4: sensitivity text and chart, appended to card 1 ---
            progressMessage = CommonUtils.create_progress_message_object(
                self._analysisName, "custom", "info",
                "Analyzing Key Influencers", self._completionStatus,
                self._completionStatus, display=True)
            CommonUtils.save_progress_message(self._messageURL,
                                              progressMessage,
                                              ignore=False)
            card4data = regression_narrative_obj.generate_card4_data(
                self.result_column, measure_column)
            card4data.update({"blockSplitter": self._blockSplitter})
            card4narrative = NarrativesUtils.get_template_output(
                self._base_dir, 'regression_card4.html', card4data)
            card4paragraphs = NarrativesUtils.block_splitter(
                card4narrative, self._blockSplitter)
            # The legacy dict keeps the text only; the chart is inserted into
            # card4paragraphs afterwards.
            card0['paragraphs'] = card1paragraphs + card4paragraphs
            card4Chart = card4data["charts"]

            coeff = self._df_regression_result.get_coeff(measure_column)
            statistical_info_array = [
                ("Test Type", "Regression"),
                ("Coefficient", str(round(coeff, 2))),
                ("P-Value", "<= 0.05"),
                ("Intercept",
                 str(round(self._df_regression_result.get_intercept(), 2))),
                ("R Square ",
                 str(round(self._df_regression_result.get_rsquare(), 2))),
            ]
            target = self._dataframe_context.get_result_column()
            if coeff > 0:
                inference = (
                    "For every additional unit of increase in {} there will be "
                    "an increase of {} units in {} (target).").format(
                        measure_column, str(round(coeff, 2)), target)
            else:
                # A non-positive coefficient means the target falls as the
                # measure rises; report the magnitude so the sentence does not
                # read "decrease of -x units".
                inference = (
                    "For every additional unit of increase in {} there will be "
                    "a decrease of {} units in {} (target).").format(
                        measure_column, str(round(abs(coeff), 2)), target)
            statistical_info_array.append(("Inference", inference))
            statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                statistical_info_array)

            card4paragraphs.insert(
                2, C3ChartData(data=card4Chart, info=statistical_info_array))
            measureCard1Data += card4paragraphs

            self.narratives['cards'].append(measure_column_cards)

            # Only the first significant measure feeds the executive summary.
            if count == 0:
                card4data.pop("charts")
                self._result_setter.update_executive_summary_data(card4data)

            measureCard1.set_card_data(measureCard1Data)
            if self._run_dimension_level_regression:
                measureCard2.set_card_data(measureCard2Data)
                sigMeasureNode.add_cards([measureCard1, measureCard2])
            else:
                # Previously this ran unconditionally and added measureCard1 a
                # second time when the dimension-level card was present.
                sigMeasureNode.add_cards([measureCard1])
            self._regressionNode.add_a_node(sigMeasureNode)
        self._story_narrative.add_a_node(self._regressionNode)
Ejemplo n.º 3
0
    def __init__(self, df_helper, df_context, result_setter, spark,
                 story_narrative, meta_parser):
        self._story_narrative = story_narrative
        self._result_setter = result_setter
        self._spark = spark
        self._dataframe_helper = df_helper
        self._dataframe_context = df_context
        self._pandas_flag = df_context._pandas_flag
        self._data_frame = df_helper.get_data_frame()
        self._num_significant_digits = NarrativesUtils.get_significant_digit_settings(
            "trend")
        self._metaParser = meta_parser

        self._result_column = self._dataframe_context.get_result_column()
        self._string_columns = self._dataframe_helper.get_string_columns()
        self._timestamp_columns = self._dataframe_helper.get_timestamp_columns(
        )

        # self._selected_date_columns = None
        self._selected_date_columns = self._dataframe_context.get_selected_date_columns(
        )
        self._all_date_columns = self._dataframe_context.get_date_columns()
        self._string_columns = list(
            set(self._string_columns) - set(self._all_date_columns))

        self._dateFormatDetected = False
        self._existingDateFormat = None
        self._dateFormatConversionDict = NarrativesUtils.date_formats_mapping_dict(
        )
        self._dateColumnFormatDict = df_context.get_date_format_dict()
        if self._dataframe_context.get_requested_date_format() != None:
            self._requestedDateFormat = df_context.get_requested_date_format()
        else:
            self._requestedDateFormat = None

        self._analysistype = self._dataframe_context.get_analysis_type()
        self._trendSettings = self._dataframe_context.get_trend_settings()
        self._trendSpecificMeasure = False
        if self._trendSettings != None:
            if self._analysistype == "dimension" and self._trendSettings[
                    "name"] != "Count":
                self._trendSpecificMeasure = True
                self._analysistype = "measure"
                self._result_column = self._trendSettings["selectedMeasure"]
            elif self._analysistype == "measure" and self._trendSettings[
                    "name"] != "Count":
                self._result_column = self._trendSettings["selectedMeasure"]

        self._trend_subsection = self._result_setter.get_trend_section_name()
        self._regression_trend_card = None
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        self._highlightFlag = "|~HIGHLIGHT~|"
        self._trend_on_td_column = False
        self._number_of_dimensions_to_consider = 10

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        self._analysisName = self._dataframe_context.get_analysis_name()
        self._messageURL = self._dataframe_context.get_message_url()
        if self._analysistype == "dimension":
            self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight(
            )
            self._scriptStages = {
                "initialization": {
                    "summary": "Initialized The Frequency Narratives",
                    "weight": 0
                },
                "summarygeneration": {
                    "summary": "Summary Generation Finished",
                    "weight": 4
                },
                "completion": {
                    "summary": "Frequency Stats Narratives Done",
                    "weight": 0
                },
            }
        elif self._analysistype == "measure":
            if self._trendSpecificMeasure:
                self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight(
                )
            else:
                self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight(
                )
            self._scriptStages = {
                "trendNarrativeStart": {
                    "summary": "Started The Descriptive Stats Narratives",
                    "weight": 1
                },
                "trendNarrativeEnd": {
                    "summary": "Narratives For Descriptive Stats Finished",
                    "weight": 0
                },
            }

        self._base_dir = "/trend/"
        if self._pandas_flag and self._selected_date_columns and not self._dateColumnFormatDict and not self._timestamp_columns:
            for column in self._selected_date_columns:
                uniqueVals = self._data_frame[column].astype(
                    str).unique().tolist()
                metaHelperInstance = MetaDataHelper(self._data_frame,
                                                    self._data_frame.shape[0])
                if len(uniqueVals
                       ) > 0 and metaHelperInstance.get_datetime_format_pandas(
                           [
                               self._data_frame.sort_values(
                                   by=column, ascending=False)[column][0]
                           ]) != None:
                    dateColumnFormat = metaHelperInstance.get_datetime_format_pandas(
                        uniqueVals)
                    self._dateColumnFormatDict.update(
                        {column: dateColumnFormat})
        dateColCheck = NarrativesUtils.check_date_column_formats(self._selected_date_columns,\
                                                    self._timestamp_columns,\
                                                    self._dateColumnFormatDict,\
                                                    self._dateFormatConversionDict,
                                                    self._requestedDateFormat)
        print(dateColCheck)

        self._dateFormatDetected = dateColCheck["dateFormatDetected"]
        self._trend_on_td_column = dateColCheck["trendOnTdCol"]
        if self._dateFormatDetected:
            self._requestedDateFormat = dateColCheck["requestedDateFormat"]
            self._existingDateFormat = dateColCheck["existingDateFormat"]
            # self._date_column_suggested is the column used for trend
            self._date_column_suggested = dateColCheck["suggestedDateColumn"]
        if self._existingDateFormat:
            self._data_frame, dataRangeStats = NarrativesUtils.calculate_data_range_stats(
                self._data_frame, self._existingDateFormat,
                self._date_column_suggested, self._trend_on_td_column,
                self._pandas_flag)
            print(dataRangeStats)
            self._durationString = dataRangeStats["durationString"]
            self._duration = dataRangeStats["duration"]
            self._dataLevel = dataRangeStats["dataLevel"]
            first_date = dataRangeStats["firstDate"]
            last_date = dataRangeStats["lastDate"]

            if self._timestamp_columns != None:
                if self._selected_date_columns == None:
                    self._selected_date_columns = self._timestamp_columns
                else:
                    self._selected_date_columns += self._timestamp_columns
        if self._pandas_flag:
            pass
        else:
            if self._trend_subsection == "regression":
                if self._selected_date_columns != None:
                    if self._dateFormatDetected:
                        trend_subsection_data = self._result_setter.get_trend_section_data(
                        )
                        measure_column = trend_subsection_data[
                            "measure_column"]
                        result_column = trend_subsection_data["result_column"]
                        base_dir = trend_subsection_data["base_dir"]

                        card3heading = 'How ' + result_column + ' and ' + measure_column + ' changed over time'
                        if self._dataLevel == "day":
                            grouped_data = self._data_frame.groupBy(
                                "suggestedDate").agg({
                                    measure_column: 'sum',
                                    result_column: 'sum'
                                })
                            grouped_data = grouped_data.withColumnRenamed(
                                grouped_data.columns[-1], result_column)
                            grouped_data = grouped_data.withColumnRenamed(
                                grouped_data.columns[-2], measure_column)
                            grouped_data = grouped_data.withColumn(
                                "year_month",
                                udf(lambda x: x.strftime("%b-%y"))(
                                    "suggestedDate"))
                            grouped_data = grouped_data.orderBy(
                                "suggestedDate", ascending=True)
                            grouped_data = grouped_data.withColumnRenamed(
                                grouped_data.columns[0], "key")
                            grouped_data = grouped_data.toPandas()
                        elif self._dataLevel == "month":
                            grouped_data = self._data_frame.groupBy(
                                "year_month").agg({
                                    measure_column: 'sum',
                                    result_column: 'sum'
                                })
                            grouped_data = grouped_data.withColumnRenamed(
                                grouped_data.columns[-1], result_column)
                            grouped_data = grouped_data.withColumnRenamed(
                                grouped_data.columns[-2], measure_column)
                            grouped_data = grouped_data.withColumn(
                                "suggestedDate",
                                udf(lambda x: datetime.strptime(x, "%b-%y"))(
                                    "year_month"))
                            grouped_data = grouped_data.orderBy(
                                "suggestedDate", ascending=True)
                            grouped_data = grouped_data.withColumnRenamed(
                                "suggestedDate", "key")
                            grouped_data = grouped_data.select([
                                "key", measure_column, result_column,
                                "year_month"
                            ]).toPandas()
                            grouped_data["key"] = grouped_data[
                                "year_month"].apply(
                                    lambda x: datetime.strptime(x, "%b-%y"
                                                                ).date())

                        trend_narrative_obj = TrendNarrative(
                            self._result_column, self._date_column_suggested,
                            grouped_data, self._existingDateFormat,
                            self._requestedDateFormat, self._base_dir,
                            self._metaParser)

                        card3data = trend_narrative_obj.generate_regression_trend_data(
                            grouped_data, measure_column, result_column,
                            self._dataLevel, self._durationString)

                        card3narrative = NarrativesUtils.get_template_output(base_dir,\
                                                                        'regression_card3.html',card3data)

                        card3chart = trend_narrative_obj.generate_regression_trend_chart(
                            grouped_data, self._dataLevel)
                        card3paragraphs = NarrativesUtils.paragraph_splitter(
                            card3narrative)
                        card2 = {
                            'charts': card3chart,
                            'paragraphs': card3paragraphs,
                            'heading': card3heading
                        }
                        self.set_regression_trend_card_data(card2)
                    else:
                        print("NO DATE FORMAT DETECTED")
                else:
                    print("NO DATE COLUMNS PRESENT")

        if self._analysistype == "measure":
            self._completionStatus += old_div(
                self._scriptWeightDict[self._analysisName]["total"] *
                self._scriptStages["trendNarrativeStart"]["weight"], 10)
            progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                        "trendNarrativeStart",\
                                        "info",\
                                        self._scriptStages["trendNarrativeStart"]["summary"],\
                                        self._completionStatus,\
                                        self._completionStatus)
            CommonUtils.save_progress_message(self._messageURL,
                                              progressMessage)
            self._dataframe_context.update_completion_status(
                self._completionStatus)
            # self._startMeasureTrend = self._result_setter.get_trend_section_completion_status()
            self._startMeasureTrend = True

            if self._startMeasureTrend == True:
                self.narratives = {
                    "SectionHeading": "",
                    "card1": {},
                    "card2": {},
                    "card3": {}
                }
                if self._selected_date_columns != None:
                    if self._dateFormatDetected:
                        grouped_data = NarrativesUtils.get_grouped_data_for_trend(
                            self._data_frame, self._dataLevel,
                            self._result_column, self._analysistype,
                            self._pandas_flag)
                        if self._pandas_flag:
                            self._data_frame = self._data_frame.drop(
                                self._date_column_suggested, axis=1)
                        else:
                            self._data_frame = self._data_frame.drop(
                                self._date_column_suggested)
                        # self._data_frame = self._data_frame.withColumnRenamed("year_month", self._date_column_suggested)

                        significant_dimensions = []
                        significant_dimension_dict = df_helper.get_significant_dimension(
                        )
                        if significant_dimension_dict != {} and significant_dimension_dict != None:
                            significant_dimension_tuple = tuple(
                                significant_dimension_dict.items())
                            significant_dimension_tuple = sorted(
                                significant_dimension_tuple,
                                key=lambda x: x[1],
                                reverse=True)
                            significant_dimensions = [
                                x[0] for x in
                                significant_dimension_tuple[:self.
                                                            _number_of_dimensions_to_consider]
                            ]
                        else:
                            significant_dimensions = self._string_columns[:self
                                                                          .
                                                                          _number_of_dimensions_to_consider]
                        print("significant_dimensions", significant_dimensions)
                        trend_narrative_obj = TrendNarrative(
                            self._result_column, self._date_column_suggested,
                            grouped_data, self._existingDateFormat,
                            self._requestedDateFormat, self._base_dir,
                            self._metaParser)
                        # grouped_data.to_csv("/home/gulshan/marlabs/datasets/trend_grouped_pandas.csv",index=False)
                        dataDict = trend_narrative_obj.generateDataDict(
                            grouped_data, self._dataLevel,
                            self._durationString)
                        # # update reference time with max value
                        reference_time = dataDict["reference_time"]
                        dataDict["duration"] = self._duration
                        dataDict["dataLevel"] = self._dataLevel
                        dataDict["durationString"] = self._durationString
                        dataDict[
                            "significant_dimensions"] = significant_dimensions
                        if len(significant_dimensions) > 0:
                            if self._dataLevel == "day":
                                datetimeformat = self._existingDateFormat
                            elif self._dataLevel == "month":
                                datetimeformat = "%b-%y"
                            # xtraData = trend_narrative_obj.get_xtra_calculations(self._data_frame,grouped_data,significant_dimensions,self._date_column_suggested,self._result_column,self._existingDateFormat,reference_time,self._dataLevel, self._pandas_flag)
                            xtraData = trend_narrative_obj.get_xtra_calculations(
                                self._data_frame, grouped_data,
                                significant_dimensions,
                                self._date_column_suggested,
                                self._result_column, datetimeformat,
                                reference_time, self._dataLevel,
                                self._pandas_flag)
                            if xtraData != None:
                                dataDict.update(xtraData)
                        # print 'Trend dataDict:  %s' %(json.dumps(dataDict, indent=2))
                        self._result_setter.update_executive_summary_data(
                            dataDict)
                        dataDict.update({
                            "blockSplitter": self._blockSplitter,
                            "highlightFlag": self._highlightFlag
                        })

                        summary1 = NarrativesUtils.get_template_output(self._base_dir,\
                                                                        'measure_trend_card1.html',dataDict)
                        summary2 = NarrativesUtils.get_template_output(self._base_dir,\
                                                                        'measure_trend_card2.html',dataDict)
                        measureTrendCard = NormalCard()
                        measureTrendcard1Data = NarrativesUtils.block_splitter(
                            summary1,
                            self._blockSplitter,
                            highlightFlag=self._highlightFlag)
                        measureTrendcard2Data = NarrativesUtils.block_splitter(
                            summary2, self._blockSplitter)
                        # print measureTrendcard1Data

                        bubbledata = dataDict["bubbleData"]
                        # print bubbledata
                        card1BubbleData = "<div class='col-md-6 col-xs-12 xs-p-20'><h2 class='text-center'><span>{}</span><br/><small>{}</small></h2></div><div class='col-md-6 col-xs-12 xs-p-20'><h2 class='text-center'><span>{}</span><br/><small>{}</small></h2></div>".format(
                            bubbledata[0]["value"], bubbledata[0]["text"],
                            bubbledata[1]["value"], bubbledata[1]["text"])
                        # print card1BubbleData

                        trend_chart_data = list(
                            grouped_data[["key",
                                          "value"]].T.to_dict().values())
                        trend_chart_data = sorted(trend_chart_data,
                                                  key=lambda x: x["key"])
                        card1chartdata = {"actual": [], "predicted": []}

                        if self._dataLevel == "day":
                            card1chartdata["actual"] = [{
                                "key": str(val["key"]),
                                "value": val["value"]
                            } for val in trend_chart_data]
                        elif self._dataLevel == "month":
                            card1chartdata["actual"] = [{
                                "key":
                                val["key"].strftime("%b-%y"),
                                "value":
                                val["value"]
                            } for val in trend_chart_data]

                        if self._duration < 365:
                            prediction_window = 3
                        else:
                            prediction_window = 6
                        predicted_values = trend_narrative_obj.get_forecast_values(
                            grouped_data["value"],
                            prediction_window)[len(grouped_data["value"]):]
                        predicted_values = [
                            round(x, self._num_significant_digits)
                            for x in predicted_values
                        ]

                        forecasted_data = []
                        forecasted_data.append(card1chartdata["actual"][-1])
                        forecasted_dates = []
                        # forecast_start_time = datetime.strptime(card1chartdata["actual"][-1]["key"],"%b-%y")
                        if self._dataLevel == "month":
                            forecast_start_time = datetime.strptime(
                                card1chartdata["actual"][-1]["key"], "%b-%y")
                        elif self._dataLevel == "day":
                            try:
                                forecast_start_time = datetime.strptime(
                                    card1chartdata["actual"][-1]["key"],
                                    "%Y-%m-%d")
                            except:
                                forecast_start_time = datetime.strptime(
                                    card1chartdata["actual"][-1]["key"],
                                    '%Y-%m-%d %H:%M:%S')
                        for val in range(prediction_window):
                            if self._dataLevel == "month":
                                key = forecast_start_time + relativedelta(
                                    months=1 + val)
                                forecasted_dates.append(key)
                            elif self._dataLevel == "day":
                                key = forecast_start_time + relativedelta(
                                    days=1 + val)
                                forecasted_dates.append(key)
                        forecasted_list = list(
                            zip(forecasted_dates, predicted_values))
                        if self._dataLevel == "month":
                            forecasted_list = [{
                                "key": val[0].strftime("%b-%y"),
                                "value": val[1]
                            } for val in forecasted_list]
                        elif self._dataLevel == "day":
                            forecasted_list = [{
                                "key":
                                val[0].strftime("%Y-%m-%d"),
                                "value":
                                val[1]
                            } for val in forecasted_list]
                        forecasted_data += forecasted_list
                        card1chartdata["predicted"] = forecasted_data
                        # print json.dumps(card1chartdata,indent=2)
                        card1chartdata = ScatterChartData(data=card1chartdata)
                        chartJson = ChartJson()
                        chartJson.set_data(card1chartdata.get_data())
                        chartJson.set_label_text({
                            'x': ' ',
                            'y': 'No. of Observations'
                        })
                        chartJson.set_legend({
                            "actual": "Observed",
                            "predicted": "Forecast"
                        })
                        chartJson.set_chart_type("scatter_line")
                        chartJson.set_axes({"x": "key", "y": "value"})
                        chartJson.set_yaxis_number_format(".2f")
                        st_info = [
                            "Trend Analysis",
                            "Forecast Method : Holt Winters Method"
                        ]
                        measureTrendcard1Data.insert(
                            1, C3ChartData(data=chartJson, info=st_info))
                        measureTrendcard1Data.append(
                            HtmlData(data=card1BubbleData))
                        cardData = measureTrendcard1Data + measureTrendcard2Data
                        measureTrendCard.set_card_data(cardData)
                        measureTrendCard.set_card_name("Trend Analysis")
                        trendStoryNode = NarrativesTree(
                            "Trend", None, [], [measureTrendCard])
                        self._story_narrative.add_a_node(trendStoryNode)
                        self._result_setter.set_trend_node(trendStoryNode)

                        # prediction_data = [{"key":x["key"],"value":x["value"]} for x in trend_chart_data]
                        # last_val = prediction_data[-1]
                        # last_val.update({"predicted_value":last_val["value"]})
                        # prediction_data[-1] = last_val
                        #
                        # for val in range(prediction_window):
                        #     dataLevel = dataDict["dataLevel"]
                        #     if self._dataLevel == "month":
                        #         last_key = prediction_data[-1]["key"]
                        #         key = last_key+relativedelta(months=1)
                        #         prediction_data.append({"key":key,"predicted_value":predicted_values[val]})
                        #         forecasted_data.append({"key":key,"value":predicted_values[val]})
                        #     elif self._dataLevel == "day":
                        #         last_key = prediction_data[-1]["key"]
                        #         key = last_key+relativedelta(days=1)
                        #         prediction_data.append({"key":key,"predicted_value":predicted_values[val]})
                        # prediction_data_copy = prediction_data
                        # prediction_data = []
                        # for val in prediction_data_copy:
                        #     val["key"] = val["key"].strftime("%b-%y")
                        #     prediction_data.append(val)

                        # forecastDataDict = {"startForecast":predicted_values[0],
                        #                     "endForecast":predicted_values[prediction_window-1],
                        #                     "measure":dataDict["measure"],
                        #                     "forecast":True,
                        #                     "forecast_percentage": round((predicted_values[prediction_window-1]-predicted_values[0])/predicted_values[0],self._num_significant_digits),
                        #                     "prediction_window_text": str(prediction_window) + " months"
                        #                     }
                        #
                        # self._result_setter.update_executive_summary_data(forecastDataDict)
                        # summary3 = NarrativesUtils.get_template_output(self._base_dir,\
                        # 'trend_narrative_card3.html',forecastDataDict)
                        self._completionStatus += old_div(
                            self._scriptWeightDict[self._analysisName]["total"]
                            *
                            self._scriptStages["trendNarrativeEnd"]["weight"],
                            10)
                        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                                    "trendNarrativeEnd",\
                                                    "info",\
                                                    self._scriptStages["trendNarrativeEnd"]["summary"],\
                                                    self._completionStatus,\
                                                    self._completionStatus)
                        CommonUtils.save_progress_message(
                            self._messageURL, progressMessage)
                        self._dataframe_context.update_completion_status(
                            self._completionStatus)
                    else:
                        # self._result_setter.update_executive_summary_data({"trend_present":False})
                        print("Trend Analysis for Measure Failed")
                        print("#" * 20 + "Trend Analysis Error" + "#" * 20)
                        print(
                            "No date format for the date column %s was detected."
                            % (self._date_column_suggested))
                        print("#" * 60)
                        self._completionStatus += self._scriptWeightDict[
                            self._analysisName]["total"]
                        self._dataframe_context.update_completion_status(
                            completionStatus)
                        progressMessage = CommonUtils.create_progress_message_object("Trend","failedState","error",\
                                        "Trend Failed As "+"No Date Format For The Date Column %s Was Detected !!!" %(self._date_column_suggested),\
                                        completionStatus,completionStatus)
                        CommonUtils.save_progress_message(
                            messageURL, progressMessage)
                        self._dataframe_context.update_completion_status(
                            self._completionStatus)
                else:
                    # self._result_setter.update_executive_summary_data({"trend_present":False})
                    print("Trend Analysis for Measure Failed")
                    print("#" * 20 + "Trend Analysis Error" + "#" * 20)
                    print("No date column present for Trend Analysis.")
                    print("#" * 60)
                    self._completionStatus += self._scriptWeightDict[
                        self._analysisName]["total"]
                    self._dataframe_context.update_completion_status(
                        completionStatus)
                    progressMessage = CommonUtils.create_progress_message_object("Trend","failedState","error",\
                                    "No Date Column Present",\
                                    completionStatus,completionStatus)
                    CommonUtils.save_progress_message(messageURL,
                                                      progressMessage)
                    self._dataframe_context.update_completion_status(
                        self._completionStatus)
            else:
                print("overall Trend not Started YET")

        elif self._analysistype == "dimension":
            print("Dimension Trend Started")
            self._completionStatus += old_div(
                self._scriptWeightDict[self._analysisName]["total"] *
                self._scriptStages["initialization"]["weight"], 10)
            progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                        "initialization",\
                                        "info",\
                                        self._scriptStages["initialization"]["summary"],\
                                        self._completionStatus,\
                                        self._completionStatus)
            CommonUtils.save_progress_message(self._messageURL,
                                              progressMessage)
            self._dataframe_context.update_completion_status(
                self._completionStatus)

            self.narratives = {"card0": {}}
            if self._selected_date_columns != None:
                if self._dateFormatDetected:
                    # result_column_levels = [x[0] for x in self._data_frame.select(self._result_column).distinct().collect()]
                    try:
                        result_column_levels = self._metaParser.get_unique_level_names(
                            self._result_column)
                    except:
                        if self._pandas_flag:
                            result_column_levels = list(
                                self._data_frame[self._result_column].unique())
                        else:
                            result_column_levels = [
                                x[0] for x in self._data_frame.select(
                                    self._result_column).distinct().collect()
                            ]
                            # result_column_levels = self._data_frame.agg((F.collect_set(self._result_column).alias(self._result_column))).first().asDict()[self._result_column]

                    print("-" * 100)
                    # TODO Implement meta parser getter here
                    print(result_column_levels)
                    if self._pandas_flag:
                        level_count_df = self._data_frame[
                            self._result_column].value_counts()[0:2]
                        top2levels = list(level_count_df.index)
                    else:
                        level_count_df = self._data_frame.groupBy(
                            self._result_column).count().orderBy(
                                "count", ascending=False)
                        level_count_df_rows = level_count_df.collect()
                        top2levels = [
                            level_count_df_rows[0][0],
                            level_count_df_rows[1][0]
                        ]
                    cardData = []
                    chart_data = {}
                    cardData1 = []
                    c3_chart = {"dataType": "c3Chart", "data": {}}
                    print("#" * 40)
                    overall_count = NarrativesUtils.get_grouped_count_data_for_dimension_trend(
                        self._data_frame, self._dataLevel, self._result_column,
                        self._pandas_flag)
                    print("#" * 40)
                    for idx, level in enumerate(top2levels):
                        print("calculations in progress for the level :- ",
                              level)
                        if self._pandas_flag:
                            leveldf = self._data_frame[self._data_frame[
                                self._result_column] == level]
                        else:
                            leveldf = self._data_frame.filter(
                                col(self._result_column) == level)
                        grouped_data = NarrativesUtils.get_grouped_data_for_trend(
                            leveldf, self._dataLevel, self._result_column,
                            self._analysistype, self._pandas_flag)
                        grouped_data.rename(columns={"value": "value_count"},
                                            inplace=True)
                        grouped_data = pd.merge(grouped_data,
                                                overall_count,
                                                on='key',
                                                how='left')
                        # grouped_data["value"] = grouped_data["value_count"].apply(lambda x:round(x*100/float(self._data_frame.count()),self._num_significant_digits))
                        grouped_data["value"] = old_div(
                            grouped_data["value_count"],
                            grouped_data["totalCount"])
                        grouped_data["value"] = grouped_data["value"].apply(
                            lambda x: round(x * 100, self.
                                            _num_significant_digits))
                        if self._pandas_flag:
                            leveldf = leveldf.drop(self._date_column_suggested,
                                                   axis=1)
                            leveldf = leveldf.rename(
                                columns={
                                    "year_month": self._date_column_suggested
                                })
                            if "year_month" not in leveldf.columns:
                                leveldf["year_month"] = leveldf[
                                    self._date_column_suggested]
                            leveldf["value_col"] = 1
                        else:
                            leveldf = leveldf.drop(self._date_column_suggested)
                            leveldf = leveldf.withColumnRenamed(
                                "year_month", self._date_column_suggested)
                            if "year_month" not in leveldf.columns:
                                leveldf = leveldf.withColumn(
                                    "year_month",
                                    col(self._date_column_suggested))
                            leveldf = leveldf.withColumn('value_col', lit(1))

                        trend_narrative_obj = TrendNarrative(
                            self._result_column, self._date_column_suggested,
                            grouped_data, self._existingDateFormat,
                            self._requestedDateFormat, self._base_dir,
                            self._metaParser)
                        dataDict = trend_narrative_obj.generateDataDict(
                            grouped_data, self._dataLevel,
                            self._durationString)
                        dataDict["target_column"] = dataDict["measure"]
                        dataDict["measure"] = level
                        dataDict["duration"] = self._duration
                        dataDict["dataLevel"] = self._dataLevel
                        dataDict["durationString"] = self._durationString
                        # grouped_data.to_csv("/home/gulshan/marlabs/datasets/grouped_data"+str(idx))
                        # print json.dumps(dataDict,indent=2)
                        significant_dimensions = []
                        significant_dimension_dict = df_helper.get_chisquare_significant_dimension(
                        )
                        if significant_dimension_dict != {} and significant_dimension_dict != None:
                            significant_dimension_tuple = tuple(
                                significant_dimension_dict.items())
                            significant_dimension_tuple = sorted(
                                significant_dimension_tuple,
                                key=lambda x: x[1],
                                reverse=True)
                            significant_dimensions = [
                                x[0] for x in
                                significant_dimension_tuple[:self.
                                                            _number_of_dimensions_to_consider]
                            ]
                        else:
                            significant_dimensions = self._string_columns[:self
                                                                          .
                                                                          _number_of_dimensions_to_consider]
                        print("significant_dimensions", significant_dimensions)
                        reference_time = dataDict["reference_time"]
                        dataDict[
                            "significant_dimensions"] = significant_dimensions
                        if len(significant_dimensions) > 0:
                            st = time.time()
                            xtraData = trend_narrative_obj.get_xtra_calculations(
                                leveldf, grouped_data, significant_dimensions,
                                self._date_column_suggested, "value_col",
                                self._existingDateFormat, reference_time,
                                self._dataLevel, self._pandas_flag)
                            print("time for get_xtra_calculations",
                                  time.time() - st)
                            if xtraData != None:
                                dataDict.update(xtraData)
                        dimensionCount = trend_narrative_obj.generate_dimension_extra_narrative(
                            grouped_data, dataDict, self._dataLevel)
                        if dimensionCount != None:
                            dataDict.update(dimensionCount)

                        dataDict.update({
                            "level_index": idx,
                            "blockSplitter": self._blockSplitter,
                            "highlightFlag": self._highlightFlag
                        })

                        self._result_setter.update_executive_summary_data(
                            dataDict)
                        trendStory = NarrativesUtils.get_template_output(self._base_dir,\
                                                                        'dimension_trend.html',dataDict)
                        blocks = NarrativesUtils.block_splitter(
                            trendStory, self._blockSplitter)

                        if idx != 0:
                            cardData1 += blocks[2:]
                        else:
                            cardData1 += blocks

                        trend_chart_data = [
                            x for x in list(grouped_data[
                                ["key", "value"]].T.to_dict().values())
                            if x['key'] != None
                        ]
                        trend_chart_data = sorted(trend_chart_data,
                                                  key=lambda x: x["key"])
                        card1chartdata = trend_chart_data
                        if self._dataLevel == "day":
                            card1chartdata = [{
                                "key": str(val["key"]),
                                "value": val["value"]
                            } for val in card1chartdata]
                        elif self._dataLevel == "month":
                            card1chartdata = [{
                                "key":
                                val["key"].strftime("%b-%y"),
                                "value":
                                val["value"]
                            } for val in card1chartdata]
                        chart_data[level] = card1chartdata

                    labels = {
                        "x": "key",
                        "y": list(chart_data.keys())[0],
                        "y2": list(chart_data.keys())[1]
                    }
                    c3Chart = {
                        "data": chart_data,
                        "format": "%b-%y",
                        "label": labels,
                        "label_text": {
                            "x": "Time",
                            "y": "Percentage of " + labels["y"],
                            "y2": "Percentage of " + labels["y2"]
                        }
                    }

                    c3_chart["data"] = c3Chart
                    multiLineData = []
                    for idx in range(len(chart_data[top2levels[0]])):
                        key = chart_data[top2levels[0]][idx]["key"]
                        value = chart_data[top2levels[0]][idx]["value"]
                        try:
                            value1 = chart_data[top2levels[1]][idx]["value"]
                        except:
                            value1 = 0
                        multiLineData.append({
                            "key": key,
                            top2levels[0]: value,
                            top2levels[1]: value1
                        })
                    chartData = NormalChartData(multiLineData)
                    chartJson = ChartJson()
                    chartJson.set_data(chartData.get_data())
                    chartJson.set_label_text(c3Chart["label_text"])
                    chartJson.set_legend(c3Chart["label"])
                    chartJson.set_chart_type("line")
                    chartJson.set_yaxis_number_format(".2f")
                    chartJson.set_axes(labels)
                    st_info = [
                        "Trend Analysis",
                        "Forecast Method : Holt Winters Method"
                    ]
                    cardData1.insert(1,
                                     C3ChartData(data=chartJson, info=st_info))
                    trendCard = NormalCard(name="Trend Analysis",
                                           slug=None,
                                           cardData=cardData1)
                    trendStoryNode = NarrativesTree("Trend", None, [],
                                                    [trendCard])
                    self._story_narrative.add_a_node(trendStoryNode)
                    self._result_setter.set_trend_node(trendStoryNode)
                    self._completionStatus += old_div(
                        self._scriptWeightDict[self._analysisName]["total"] *
                        self._scriptStages["summarygeneration"]["weight"], 10)
                    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                                "summarygeneration",\
                                                "info",\
                                                self._scriptStages["summarygeneration"]["summary"],\
                                                self._completionStatus,\
                                                self._completionStatus)
                    CommonUtils.save_progress_message(self._messageURL,
                                                      progressMessage)
                    self._dataframe_context.update_completion_status(
                        self._completionStatus)

                    self._completionStatus += old_div(
                        self._scriptWeightDict[self._analysisName]["total"] *
                        self._scriptStages["completion"]["weight"], 10)
                    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                                "completion",\
                                                "info",\
                                                self._scriptStages["completion"]["summary"],\
                                                self._completionStatus,\
                                                self._completionStatus)
                    CommonUtils.save_progress_message(self._messageURL,
                                                      progressMessage)
                    self._dataframe_context.update_completion_status(
                        self._completionStatus)
                else:
                    self._result_setter.update_executive_summary_data(
                        {"trend_present": False})
                    print("Trend Analysis for Dimension Failed")
                    print("#" * 20 + "Trend Analysis Error" + "#" * 20)
                    if self._date_column_suggested:
                        print(
                            "No date format for the date column %s was detected."
                            % (self._date_column_suggested))
                    print("#" * 60)
                    self._completionStatus += self._scriptWeightDict[
                        self._analysisName]["total"]
                    self._dataframe_context.update_completion_status(
                        self._completionStatus)
                    progressMessage = CommonUtils.create_progress_message_object("Trend","failedState","error",\
                                    "Trend Failed As "+"No Date Format For The Date Column %s Was Detected !!!" %(self._date_column_suggested),\
                                    self._completionStatus,self._completionStatus)
                    CommonUtils.save_progress_message(messageURL,
                                                      progressMessage)
                    self._dataframe_context.update_completion_status(
                        self._completionStatus)

            else:
                self._result_setter.update_executive_summary_data(
                    {"trend_present": False})
                print("Trend Analysis for Dimension Failed")
                print("#" * 20 + "Trend Analysis Error" + "#" * 20)
                print("No date column present for Trend Analysis.")
                print("#" * 60)
                self._completionStatus += self._scriptWeightDict[
                    self._analysisName]["total"]
                self._dataframe_context.update_completion_status(
                    self._completionStatus)
                progressMessage = CommonUtils.create_progress_message_object("Trend","failedState","error",\
                                "No Date Column Present",\
                                self._completionStatus,self._completionStatus)
                CommonUtils.save_progress_message(messageURL, progressMessage)
                self._dataframe_context.update_completion_status(
                    self._completionStatus)
Ejemplo n.º 4
0
    def _generate_narratives(self):
        """
        Generate the chi-square main card and the per-dimension cards.

        For every target dimension in ``self._df_chisquare_result``:

        * dimensions whose chi-square p-value is <= 0.05 are treated as
          significant and ranked by effect size, largest first;
        * ``self.narratives['main_card']`` (heading, paragraphs, chart) is
          rebuilt from the ``main_card.html`` template and
          ``self.narratives['cards']`` is reset to an empty list;
        * a "Key Influencers" card holding the effect-size bar chart is
          built and, unless ``self._storyOnScoredData`` equals ``True``,
          added to ``self._chiSquareNode`` and to the result setter;
        * one ``ChiSquareAnalysis`` is created for each of the leading
          significant dimensions. For app ids '1' and '2' its first
          dimension card is pushed to the result setter as JSON data;
          otherwise the analysis object is appended to
          ``self.narratives['cards']`` and its node is added to
          ``self._chiSquareNode``.

        After all target dimensions are processed, ``self._chiSquareNode``
        is attached to the story narrative and registered on the result
        setter.

        Returns:
            None. Results are published through ``self.narratives``,
            ``self._chiSquareNode``, ``self._story_narrative`` and
            ``self._result_setter``.
        """
        for target_dimension in self._df_chisquare_result.keys():
            target_chisquare_result = self._df_chisquare_result[
                target_dimension]
            # Materialise the keys so the template receives a real list on
            # both Python 2 and Python 3 (a dict view on 3 is not indexable).
            analysed_variables = list(target_chisquare_result.keys())
            # Significant variables out of the analysed ones.
            significant_variables = [
                dim for dim in analysed_variables
                if target_chisquare_result[dim].get_pvalue() <= 0.05
            ]
            effect_sizes = [
                target_chisquare_result[dim].get_effect_size()
                for dim in significant_variables
            ]

            effect_size_dict = dict(zip(significant_variables, effect_sizes))
            # Rank by effect size, descending (ties fall back to the name).
            significant_variables = [
                name for (_, name) in sorted(
                    zip(effect_sizes, significant_variables), reverse=True)
            ]

            num_analysed_variables = len(analysed_variables)
            num_significant_variables = len(significant_variables)
            data_dict = {
                'num_variables': num_analysed_variables,
                'num_significant_variables': num_significant_variables,
                'significant_variables': significant_variables,
                'target': target_dimension,
                'analysed_dimensions': analysed_variables,
                'blockSplitter': self._blockSplitter
            }  # for both para 1 and para 2

            # Rendered once; reused for the legacy paragraph and the card.
            main_card_narrative = NarrativesUtils.get_template_output(
                self._base_dir, 'main_card.html', data_dict)

            self.narratives['main_card'] = {
                'heading':
                'Relationship between ' + target_dimension +
                ' and other factors',
                'paragraphs': [{
                    'header': '',
                    'content': main_card_narrative
                }],
            }
            self.narratives['cards'] = []
            self.narratives['main_card']['chart'] = {
                'header':
                'Strength of association between ' + target_dimension +
                ' and other dimensions',
                'data': effect_size_dict,
                'label_text': {
                    'x': 'Dimensions',
                    'y': 'Effect Size (Cramers-V)'
                },
            }

            # Bar-chart rows, strongest association first.
            chart_data = sorted(
                ({"key": k, "value": float(v)}
                 for k, v in effect_size_dict.items()),
                key=lambda row: row["value"],
                reverse=True)
            chartDataValues = [float(v) for v in effect_size_dict.values()]

            chart_json = ChartJson()
            chart_json.set_data(chart_data)
            chart_json.set_chart_type("bar")
            chart_json.set_label_text({
                'x': '  ',
                'y': 'Effect Size (Cramers-V)'
            })
            chart_json.set_axis_rotation(True)
            chart_json.set_axes({"x": "key", "y": "value"})
            chart_json.set_yaxis_number_format(
                NarrativesUtils.select_y_axis_format(chartDataValues))

            main_card = NormalCard()
            header = "<h3>Strength of association between " + target_dimension + " and other dimensions</h3>"
            main_card_data = [HtmlData(data=header)]
            main_card_data += NarrativesUtils.block_splitter(
                main_card_narrative, self._blockSplitter)

            if len(chart_data) > 0:
                statistical_info_array = [
                    ("Test Type", "Chi-Square"),
                    ("Effect Size", "Cramer's V"),
                    ("Max Effect Size", chart_data[0]["key"]),
                    ("Min Effect Size", chart_data[-1]["key"]),
                ]
                # Every branch below assigns a non-empty inference, so it is
                # appended unconditionally afterwards.
                if len(chart_data) == 1:
                    statistical_inference = "{} is the only variable that have significant association with the {} (Target) having an \
                     Effect size of {}".format(
                        chart_data[0]["key"],
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["value"], 4))
                elif len(chart_data) == 2:
                    statistical_inference = "There are two variables ({} and {}) that have significant association with the {} (Target) and the \
                     Effect size ranges are {} and {} respectively".format(
                        chart_data[0]["key"], chart_data[1]["key"],
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["value"], 4),
                        round(chart_data[1]["value"], 4))
                else:
                    statistical_inference = "There are {} variables that have significant association with the {} (Target) and the \
                     Effect size ranges from {} to {}".format(
                        len(chart_data),
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["value"], 4),
                        round(chart_data[-1]["value"], 4))
                statistical_info_array.append(
                    ("Inference", statistical_inference))
                statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                    statistical_info_array)
            else:
                statistical_info_array = []
            main_card_data.append(
                C3ChartData(data=chart_json, info=statistical_info_array))
            main_card.set_card_data(main_card_data)
            main_card.set_card_name("Key Influencers")

            # Equality (not identity) is kept on purpose so that any value
            # comparing equal to True still suppresses the main card.
            if self._storyOnScoredData != True:
                self._chiSquareNode.add_a_card(main_card)
                self._result_setter.add_a_score_chi_card(main_card)

            # Single pre-formatted argument: prints the same text under both
            # the Python 2 print statement and the Python 3 print function.
            print("target_dimension %s" % (target_dimension, ))

            # Cap the number of dimensions that get their own card.
            if self._appid == '2' and num_significant_variables > 5:
                significant_variables = significant_variables[:5]
            elif self._nColsToUse is not None:
                significant_variables = significant_variables[:self.
                                                              _nColsToUse]

            CommonUtils.create_update_and_save_progress_message(
                self._dataframe_context,
                self._scriptWeightDict,
                self._scriptStages,
                self._analysisName,
                "custom",
                "info",
                display=True,
                customMsg="Analyzing key drivers",
                weightKey="narratives")

            # App ids '1' and '2' publish the card as JSON data through the
            # result setter instead of extending the chi-square node.
            publish_as_score_card = self._appid in ('1', '2')
            for analysed_dimension in significant_variables[:self.
                                                            _noOfSigDimsToShow]:
                chisquare_result = self._df_chisquare.get_chisquare_result(
                    target_dimension, analysed_dimension)
                if publish_as_score_card:
                    print("APPID %s is used" % (self._appid, ))
                dimension_analysis = ChiSquareAnalysis(
                    self._dataframe_context, self._dataframe_helper,
                    chisquare_result, target_dimension, analysed_dimension,
                    significant_variables, num_analysed_variables,
                    self._data_frame, self._measure_columns,
                    self._base_dir, None, target_chisquare_result)
                if publish_as_score_card:
                    self._result_setter.add_a_score_chi_card(
                        json.loads(
                            CommonUtils.convert_python_object_to_json(
                                dimension_analysis.get_dimension_card1())))
                else:
                    self.narratives['cards'].append(dimension_analysis)
                    self._chiSquareNode.add_a_node(
                        dimension_analysis.get_dimension_node())
        self._story_narrative.add_a_node(self._chiSquareNode)
        self._result_setter.set_chisquare_node(self._chiSquareNode)
    def Predict(self):
        self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight(
        )
        self._scriptStages = {
            "initialization": {
                "summary": "Initialized the Naive Bayes Scripts",
                "weight": 2
            },
            "prediction": {
                "summary": "Spark ML Naive Bayes Model Prediction Finished",
                "weight": 2
            },
            "frequency": {
                "summary": "descriptive analysis finished",
                "weight": 2
            },
            "chisquare": {
                "summary": "chi Square analysis finished",
                "weight": 4
            },
            "completion": {
                "summary": "all analysis finished",
                "weight": 4
            },
        }

        self._completionStatus += self._scriptWeightDict[self._analysisName][
            "total"] * self._scriptStages["initialization"]["weight"] / 10
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "initialization",\
                                    "info",\
                                    self._scriptStages["initialization"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL, progressMessage)
        self._dataframe_context.update_completion_status(
            self._completionStatus)

        SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                            sparkSession=self._spark)
        dataSanity = True
        level_counts_train = self._dataframe_context.get_level_count_dict()
        categorical_columns = self._dataframe_helper.get_string_columns()
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        time_dimension_columns = self._dataframe_helper.get_timestamp_columns()
        result_column = self._dataframe_context.get_result_column()
        categorical_columns = [
            x for x in categorical_columns if x != result_column
        ]

        level_counts_score = CommonUtils.get_level_count_dict(
            self._data_frame,
            categorical_columns,
            self._dataframe_context.get_column_separator(),
            output_type="dict",
            dataType="spark")
        for key in level_counts_train:
            if key in level_counts_score:
                if level_counts_train[key] != level_counts_score[key]:
                    dataSanity = False
            else:
                dataSanity = False

        test_data_path = self._dataframe_context.get_input_file()
        score_data_path = self._dataframe_context.get_score_path(
        ) + "/data.csv"
        trained_model_path = self._dataframe_context.get_model_path()
        trained_model_path = "/".join(
            trained_model_path.split("/")[:-1]
        ) + "/" + self._slug + "/" + self._dataframe_context.get_model_for_scoring(
        )
        # score_summary_path = self._dataframe_context.get_score_path()+"/Summary/summary.json"

        pipelineModel = MLUtils.load_pipeline(trained_model_path)

        df = self._data_frame
        transformed = pipelineModel.transform(df)
        label_indexer_dict = MLUtils.read_string_indexer_mapping(
            trained_model_path, SQLctx)
        prediction_to_levels = udf(lambda x: label_indexer_dict[x],
                                   StringType())
        transformed = transformed.withColumn(
            result_column, prediction_to_levels(transformed.prediction))

        if "probability" in transformed.columns:
            probability_dataframe = transformed.select(
                [result_column, "probability"]).toPandas()
            probability_dataframe = probability_dataframe.rename(
                index=str, columns={result_column: "predicted_class"})
            probability_dataframe[
                "predicted_probability"] = probability_dataframe[
                    "probability"].apply(lambda x: max(x))
            self._score_summary[
                "prediction_split"] = MLUtils.calculate_scored_probability_stats(
                    probability_dataframe)
            self._score_summary["result_column"] = result_column
            scored_dataframe = transformed.select(
                categorical_columns + time_dimension_columns +
                numerical_columns + [result_column, "probability"]).toPandas()
            scored_dataframe['predicted_probability'] = probability_dataframe[
                "predicted_probability"].values
            # scored_dataframe = scored_dataframe.rename(index=str, columns={"predicted_probability": "probability"})
        else:
            self._score_summary["prediction_split"] = []
            self._score_summary["result_column"] = result_column
            scored_dataframe = transformed.select(categorical_columns +
                                                  time_dimension_columns +
                                                  numerical_columns +
                                                  [result_column]).toPandas()

        labelMappingDict = self._dataframe_context.get_label_map()
        if score_data_path.startswith("file"):
            score_data_path = score_data_path[7:]
        scored_dataframe.to_csv(score_data_path, header=True, index=False)

        uidCol = self._dataframe_context.get_uid_column()
        if uidCol == None:
            uidCols = self._metaParser.get_suggested_uid_columns()
            if len(uidCols) > 0:
                uidCol = uidCols[0]
        uidTableData = []
        predictedClasses = list(scored_dataframe[result_column].unique())
        if uidCol:
            if uidCol in df.columns:
                for level in predictedClasses:
                    levelDf = scored_dataframe[scored_dataframe[result_column]
                                               == level]
                    levelDf = levelDf[[
                        uidCol, "predicted_probability", result_column
                    ]]
                    levelDf.sort_values(by="predicted_probability",
                                        ascending=False,
                                        inplace=True)
                    levelDf["predicted_probability"] = levelDf[
                        "predicted_probability"].apply(
                            lambda x: humanize.apnumber(x * 100) + "%"
                            if x * 100 >= 10 else str(int(x * 100)) + "%")
                    uidTableData.append(levelDf[:5])
                uidTableData = pd.concat(uidTableData)
                uidTableData = [list(arr) for arr in list(uidTableData.values)]
                uidTableData = [[uidCol, "Probability", result_column]
                                ] + uidTableData
                uidTable = TableData()
                uidTable.set_table_width(25)
                uidTable.set_table_data(uidTableData)
                uidTable.set_table_type("normalHideColumn")
                self._result_setter.set_unique_identifier_table(
                    json.loads(
                        CommonUtils.convert_python_object_to_json(uidTable)))

        self._completionStatus += self._scriptWeightDict[self._analysisName][
            "total"] * self._scriptStages["prediction"]["weight"] / 10
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "prediction",\
                                    "info",\
                                    self._scriptStages["prediction"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL, progressMessage)
        self._dataframe_context.update_completion_status(
            self._completionStatus)

        print("STARTING DIMENSION ANALYSIS ...")
        columns_to_keep = []
        columns_to_drop = []

        columns_to_keep = self._dataframe_context.get_score_consider_columns()

        if len(columns_to_keep) > 0:
            columns_to_drop = list(set(df.columns) - set(columns_to_keep))
        else:
            columns_to_drop += ["predicted_probability"]

        scored_df = transformed.select(categorical_columns +
                                       time_dimension_columns +
                                       numerical_columns + [result_column])
        columns_to_drop = [
            x for x in columns_to_drop if x in scored_df.columns
        ]
        modified_df = scored_df.select(
            [x for x in scored_df.columns if x not in columns_to_drop])
        resultColLevelCount = dict(
            modified_df.groupby(result_column).count().collect())
        self._metaParser.update_column_dict(
            result_column, {
                "LevelCount": resultColLevelCount,
                "numberOfUniqueValues": len(resultColLevelCount.keys())
            })
        self._dataframe_context.set_story_on_scored_data(True)

        self._dataframe_context.update_consider_columns(columns_to_keep)
        df_helper = DataFrameHelper(modified_df, self._dataframe_context,
                                    self._metaParser)
        df_helper.set_params()
        spark_scored_df = df_helper.get_data_frame()

        if len(predictedClasses) >= 2:
            try:
                fs = time.time()
                df_decision_tree_obj = DecisionTrees(
                    spark_scored_df,
                    df_helper,
                    self._dataframe_context,
                    self._spark,
                    self._metaParser,
                    scriptWeight=self._scriptWeightDict,
                    analysisName=self._analysisName).test_all(
                        dimension_columns=[result_column])
                narratives_obj = CommonUtils.as_dict(
                    DecisionTreeNarrative(result_column,
                                          df_decision_tree_obj,
                                          self._dataframe_helper,
                                          self._dataframe_context,
                                          self._metaParser,
                                          self._result_setter,
                                          story_narrative=None,
                                          analysisName=self._analysisName,
                                          scriptWeight=self._scriptWeightDict))
                print(narratives_obj)
            except Exception as e:
                print("DecisionTree Analysis Failed ", str(e))
        else:
            data_dict = {
                "npred": len(predictedClasses),
                "nactual": len(labelMappingDict.values())
            }

            if data_dict["nactual"] > 2:
                levelCountDict[predictedClasses[0]] = resultColLevelCount[
                    predictedClasses[0]]
                levelCountDict["Others"] = sum([
                    v for k, v in resultColLevelCount.items()
                    if k != predictedClasses[0]
                ])
            else:
                levelCountDict = resultColLevelCount
                otherClass = list(
                    set(labelMappingDict.values()) - set(predictedClasses))[0]
                levelCountDict[otherClass] = 0

                print(levelCountDict)

            total = float(
                sum([x for x in levelCountDict.values() if x != None]))
            levelCountTuple = [({
                "name":
                k,
                "count":
                v,
                "percentage":
                humanize.apnumber(v * 100 / total) + "%"
            }) for k, v in levelCountDict.items() if v != None]
            levelCountTuple = sorted(levelCountTuple,
                                     key=lambda x: x["count"],
                                     reverse=True)
            data_dict["blockSplitter"] = "|~NEWBLOCK~|"
            data_dict["targetcol"] = result_column
            data_dict["nlevel"] = len(levelCountDict.keys())
            data_dict["topLevel"] = levelCountTuple[0]
            data_dict["secondLevel"] = levelCountTuple[1]
            maincardSummary = NarrativesUtils.get_template_output(
                "/apps/", 'scorewithoutdtree.html', data_dict)

            main_card = NormalCard()
            main_card_data = []
            main_card_narrative = NarrativesUtils.block_splitter(
                maincardSummary, "|~NEWBLOCK~|")
            main_card_data += main_card_narrative

            chartData = NormalChartData([levelCountDict]).get_data()
            chartJson = ChartJson(data=chartData)
            chartJson.set_title(result_column)
            chartJson.set_chart_type("donut")
            mainCardChart = C3ChartData(data=chartJson)
            mainCardChart.set_width_percent(33)
            main_card_data.append(mainCardChart)

            uidTable = self._result_setter.get_unique_identifier_table()
            if uidTable != None:
                main_card_data.append(uidTable)
            main_card.set_card_data(main_card_data)
            main_card.set_card_name(
                "Predicting Key Drivers of {}".format(result_column))
            self._result_setter.set_score_dtree_cards([main_card], {})
Ejemplo n.º 6
0
    def _generate_narratives(self):
        chisquare_result = self._chisquare_result
        target_dimension = self._target_dimension
        analysed_dimension = self._analysed_dimension
        significant_variables = self._significant_variables
        num_analysed_variables = self._num_analysed_variables
        table = self._chiSquareTable
        total = self._chiSquareTable.get_total()

        levels = self._chiSquareTable.get_column_two_levels()
        level_counts = self._chiSquareTable.get_column_total()
        levels_count_sum = sum(level_counts)
        levels_percentages = [
            i * 100.0 / levels_count_sum for i in level_counts
        ]
        sorted_levels = sorted(zip(level_counts, levels), reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]

        top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        bottom_dim = sorted_levels[-1][1]
        bottom_dim_contribution = sorted_levels[-1][0]
        bottom_dims = [
            y for x, y in sorted_levels if x == bottom_dim_contribution
        ]

        target_levels = self._chiSquareTable.get_column_one_levels()

        target_counts = self._chiSquareTable.get_row_total()
        sorted_target_levels = sorted(zip(target_counts, target_levels),
                                      reverse=True)

        top_target_count, top_target = sorted_target_levels[0]
        second_target_count, second_target = sorted_target_levels[1]

        top_target_contributions = [
            table.get_value(top_target, i) for i in levels
        ]
        sum_top_target = sum(top_target_contributions)

        sorted_levels = sorted(zip(top_target_contributions, levels),
                               reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]
        top_target_top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        top_target_top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        top_target_bottom_dim = sorted_levels[-1][1]
        top_target_bottom_dim_contribution = sorted_levels[-1][0]

        top_target_percentages = [
            i * 100.0 / sum_top_target for i in top_target_contributions
        ]
        best_top_target_index = top_target_contributions.index(
            max(top_target_contributions))
        worst_top_target_index = top_target_contributions.index(
            min(top_target_contributions))
        top_target_differences = [
            x - y for x, y in zip(levels_percentages, top_target_percentages)
        ]
        if len(top_target_differences) > 6:
            tops = 2
            bottoms = -2
        elif len(top_target_differences) > 4:
            tops = 2
            bottoms = -1
        else:
            tops = 1
            bottoms = -1
        sorted_ = sorted(enumerate(top_target_differences),
                         key=lambda x: x[1],
                         reverse=True)
        best_top_difference_indices = [x for x, y in sorted_[:tops]]
        worst_top_difference_indices = [x for x, y in sorted_[bottoms:]]

        top_target_shares = [
            x * 100.0 / y
            for x, y in zip(top_target_contributions, level_counts)
        ]
        max_top_target_shares = max(top_target_shares)
        best_top_target_share_index = [
            idx for idx, val in enumerate(top_target_shares)
            if val == max_top_target_shares
        ]
        level_counts_threshold = sum(level_counts) * 0.05 / len(level_counts)
        min_top_target_shares = min([
            x for x, y in zip(top_target_shares, level_counts)
            if y >= level_counts_threshold
        ])
        worst_top_target_share_index = [
            idx for idx, val in enumerate(top_target_shares)
            if val == min_top_target_shares
        ]
        overall_top_percentage = sum_top_target * 100.0 / total

        second_target_contributions = [
            table.get_value(second_target, i) for i in levels
        ]
        sum_second_target = sum(second_target_contributions)

        sorted_levels = sorted(zip(second_target_contributions, levels),
                               reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]
        second_target_top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        second_target_top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        second_target_bottom_dim = sorted_levels[-1][1]
        second_target_bottom_dim_contribution = sorted_levels[-1][0]

        second_target_percentages = [
            i * 100.0 / sum_second_target for i in second_target_contributions
        ]
        best_second_target_index = second_target_contributions.index(
            max(second_target_contributions))
        worst_second_target_index = second_target_contributions.index(
            min(second_target_contributions))
        second_target_differences = [
            x - y
            for x, y in zip(levels_percentages, second_target_percentages)
        ]
        if len(second_target_differences) > 6:
            tops = 2
            bottoms = -2
        elif len(second_target_differences) > 4:
            tops = 2
            bottoms = -1
        else:
            tops = 1
            bottoms = -1
        sorted_ = sorted(enumerate(second_target_differences),
                         key=lambda x: x[1],
                         reverse=True)
        best_second_difference_indices = [x for x, y in sorted_[:tops]]
        worst_second_difference_indices = [x for x, y in sorted_[bottoms:]]

        second_target_shares = [
            x * 100.0 / y
            for x, y in zip(second_target_contributions, level_counts)
        ]
        max_second_target_shares = max(second_target_shares)
        best_second_target_share_index = [
            idx for idx, val in enumerate(second_target_shares)
            if val == max_second_target_shares
        ]
        level_counts_threshold = sum(level_counts) * 0.05 / len(level_counts)
        min_second_target_shares = min([
            x for x, y in zip(second_target_shares, level_counts)
            if y >= level_counts_threshold
        ])
        # worst_second_target_share_index = second_target_shares.index(min_second_target_shares)
        worst_second_target_share_index = [
            idx for idx, val in enumerate(second_target_shares)
            if val == min_second_target_shares
        ]
        overall_second_percentage = sum_second_target * 100.0 / total

        targetCardDataDict = {}
        targetCardDataDict['target'] = target_dimension
        targetCardDataDict['colname'] = analysed_dimension
        targetCardDataDict['num_significant'] = len(significant_variables)
        targetCardDataDict['plural_colname'] = NarrativesUtils.pluralize(
            analysed_dimension)

        targetCardDataDict["blockSplitter"] = self._blockSplitter
        targetCardDataDict["binTargetCol"] = self._binTargetCol
        targetCardDataDict["binAnalyzedCol"] = self._binAnalyzedCol
        targetCardDataDict['highlightFlag'] = self._highlightFlag
        targetCardDataDict['levels'] = levels

        data_dict = {}
        data_dict[
            'best_second_difference'] = best_second_difference_indices  ##these changed
        data_dict['worst_second_difference'] = worst_second_difference_indices
        data_dict['best_top_difference'] = best_top_difference_indices
        data_dict['worst_top_difference'] = worst_top_difference_indices
        data_dict['levels_percentages'] = levels_percentages
        data_dict['top_target_percentages'] = top_target_percentages
        data_dict['second_target_percentages'] = second_target_percentages
        data_dict['levels'] = levels
        data_dict['best_top_share'] = best_top_target_share_index
        data_dict['worst_top_share'] = worst_top_target_share_index
        data_dict['best_second_share'] = best_second_target_share_index
        data_dict['worst_second_share'] = worst_second_target_share_index
        data_dict['top_target_shares'] = top_target_shares
        data_dict['second_target_shares'] = second_target_shares
        data_dict['overall_second'] = overall_second_percentage
        data_dict['overall_top'] = overall_top_percentage

        data_dict['num_significant'] = len(significant_variables)
        data_dict['colname'] = analysed_dimension
        data_dict['plural_colname'] = NarrativesUtils.pluralize(
            analysed_dimension)
        data_dict['target'] = target_dimension
        data_dict['top_levels'] = top_dims
        data_dict['top_levels_percent'] = round(
            top_dims_contribution * 100.0 / total, 1)
        data_dict['bottom_level'] = bottom_dim
        data_dict['bottom_levels'] = bottom_dims
        data_dict['bottom_level_percent'] = round(
            bottom_dim_contribution * 100 / sum(level_counts), 2)
        data_dict['second_target'] = second_target
        data_dict['second_target_top_dims'] = second_target_top_dims
        data_dict[
            'second_target_top_dims_contribution'] = second_target_top_dims_contribution * 100.0 / sum(
                second_target_contributions)
        data_dict['second_target_bottom_dim'] = second_target_bottom_dim
        data_dict[
            'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution
        data_dict['best_second_target'] = levels[best_second_target_index]
        data_dict['best_second_target_count'] = second_target_contributions[
            best_second_target_index]
        data_dict['best_second_target_percent'] = round(
            second_target_contributions[best_second_target_index] * 100.0 /
            sum(second_target_contributions), 2)
        data_dict['worst_second_target'] = levels[worst_second_target_index]
        data_dict['worst_second_target_percent'] = round(
            second_target_contributions[worst_second_target_index] * 100.0 /
            sum(second_target_contributions), 2)

        data_dict['top_target'] = top_target
        data_dict['top_target_top_dims'] = top_target_top_dims
        data_dict[
            'top_target_top_dims_contribution'] = top_target_top_dims_contribution * 100.0 / sum(
                top_target_contributions)
        data_dict['top_target_bottom_dim'] = top_target_bottom_dim
        data_dict[
            'top_target_bottom_dim_contribution'] = top_target_bottom_dim_contribution
        data_dict['best_top_target'] = levels[best_top_target_index]
        data_dict['best_top_target_count'] = top_target_contributions[
            best_top_target_index]
        data_dict['best_top_target_percent'] = round(
            top_target_contributions[best_top_target_index] * 100.0 /
            sum(top_target_contributions), 2)
        data_dict['worst_top_target'] = levels[worst_top_target_index]
        data_dict['worst_top_target_percent'] = round(
            top_target_contributions[worst_top_target_index] * 100.0 /
            sum(top_target_contributions), 2)

        data_dict["blockSplitter"] = self._blockSplitter
        data_dict["binTargetCol"] = self._binTargetCol
        data_dict["binAnalyzedCol"] = self._binAnalyzedCol
        data_dict['highlightFlag'] = self._highlightFlag

        ###############
        #     CARD1   #
        ###############

        print "self._binTargetCol & self._binAnalyzedCol : ", self._binTargetCol, self._binAnalyzedCol
        if (self._binTargetCol == True & self._binAnalyzedCol == False):
            print "Only Target Column is Binned, : ", self._binTargetCol
            output = NarrativesUtils.block_splitter(
                NarrativesUtils.get_template_output(
                    self._base_dir, 'card1_binned_target.html', data_dict),
                self._blockSplitter,
                highlightFlag=self._highlightFlag)
        elif (self._binTargetCol == True & self._binAnalyzedCol == True):
            print "Target Column and IV is Binned : ", self._binTargetCol, self._binAnalyzedCol
            output = NarrativesUtils.block_splitter(
                NarrativesUtils.get_template_output(
                    self._base_dir, 'card1_binned_target_and_IV.html',
                    data_dict),
                self._blockSplitter,
                highlightFlag=self._highlightFlag)
        else:
            output = NarrativesUtils.block_splitter(
                NarrativesUtils.get_template_output(self._base_dir,
                                                    'card1.html', data_dict),
                self._blockSplitter,
                highlightFlag=self._highlightFlag)

        targetDimCard1Data = []
        targetDimcard1Heading = '<h3>Relationship between ' + self._target_dimension + '  and ' + self._analysed_dimension + "</h3>"

        toggledata = ToggleData()

        targetDimTable1Data = self.generate_card1_table1()
        targetDimCard1Table1 = TableData()
        targetDimCard1Table1.set_table_type("heatMap")
        targetDimCard1Table1.set_table_data(targetDimTable1Data)
        toggledata.set_toggleon_data({
            "data": {
                "tableData": targetDimTable1Data,
                "tableType": "heatMap"
            },
            "dataType": "table"
        })

        targetDimTable2Data = self.generate_card1_table2()
        targetDimCard1Table2 = TableData()
        targetDimCard1Table2.set_table_type("normal")
        table2Data = targetDimTable2Data["data1"]
        table2Data = [
            innerList[1:] for innerList in table2Data
            if innerList[0].strip() != ""
        ]
        targetDimCard1Table2.set_table_data(table2Data)

        toggledata.set_toggleoff_data({
            "data": {
                "tableData": table2Data,
                "tableType": "heatMap"
            },
            "dataType": "table"
        })

        targetDimCard1Data.append(HtmlData(data=targetDimcard1Heading))
        targetDimCard1Data.append(toggledata)
        targetDimCard1Data += output

        self._card1.set_card_data(targetDimCard1Data)
        self._card1.set_card_name("{}: Relationship with {}".format(
            self._analysed_dimension, self._target_dimension))

        ###############
        #     CARD2   #
        ###############

        if self._appid == None:

            key_factors = ''
            num_key_factors = len(self._second_level_dimensions)

            if len(self._second_level_dimensions) == 5:
                key_factors = ', '.join(
                    self._second_level_dimensions[:4]
                ) + ' and ' + self._second_level_dimensions[4]
            elif len(self._second_level_dimensions) == 4:
                key_factors = ', '.join(
                    self._second_level_dimensions[:3]
                ) + ' and ' + self._second_level_dimensions[3]
            elif len(self._second_level_dimensions) == 3:
                key_factors = ', '.join(
                    self._second_level_dimensions[:2]
                ) + ' and ' + self._second_level_dimensions[2]
            elif len(self._second_level_dimensions) == 2:
                key_factors = ' and '.join(self._second_level_dimensions)
            elif len(self._second_level_dimensions) == 1:
                key_factors = self._second_level_dimensions[0]

            targetCardDataDict['num_key_factors'] = num_key_factors
            targetCardDataDict['key_factors'] = key_factors
            dict_for_test = {}
            for tupleObj in sorted_target_levels[:self._chiSquareLevelLimit]:
                targetLevel = tupleObj[1]

                targetCardDataDict['random_card2'] = random.randint(1, 100)
                targetCardDataDict['random_card4'] = random.randint(1, 100)

                second_target_contributions = [
                    table.get_value(targetLevel, i) for i in levels
                ]
                sum_second_target = sum(second_target_contributions)

                sorted_levels = sorted(zip(second_target_contributions,
                                           levels),
                                       reverse=True)

                level_differences = [0.0] + [
                    sorted_levels[i][0] - sorted_levels[i + 1][0]
                    for i in range(len(sorted_levels) - 1)
                ]
                second_target_top_dims = [
                    j for i, j in sorted_levels[:level_differences.
                                                index(max(level_differences))]
                ]
                second_target_top_dims_contribution = sum([
                    i for i, j in sorted_levels[:level_differences.
                                                index(max(level_differences))]
                ])
                second_target_bottom_dim = sorted_levels[-1][1]
                second_target_bottom_dim_contribution = sorted_levels[-1][0]

                second_target_percentages = [
                    i * 100.0 / sum_second_target
                    for i in second_target_contributions
                ]
                best_second_target_index = second_target_contributions.index(
                    max(second_target_contributions))
                worst_second_target_index = second_target_contributions.index(
                    min(second_target_contributions))
                second_target_differences = [
                    x - y for x, y in zip(levels_percentages,
                                          second_target_percentages)
                ]
                if len(second_target_differences) > 6:
                    tops = 2
                    bottoms = -2
                elif len(second_target_differences) > 4:
                    tops = 2
                    bottoms = -1
                else:
                    tops = 1
                    bottoms = -1
                sorted_ = sorted(enumerate(second_target_differences),
                                 key=lambda x: x[1],
                                 reverse=True)
                best_second_difference_indices = [x for x, y in sorted_[:tops]]
                worst_second_difference_indices = [
                    x for x, y in sorted_[bottoms:]
                ]

                second_target_shares = [
                    x * 100.0 / y
                    for x, y in zip(second_target_contributions, level_counts)
                ]
                max_second_target_shares = max(second_target_shares)
                best_second_target_share_index = [
                    idx for idx, val in enumerate(second_target_shares)
                    if val == max_second_target_shares
                ]
                level_counts_threshold = sum(level_counts) * 0.05 / len(
                    level_counts)
                min_second_target_shares = min([
                    x for x, y in zip(second_target_shares, level_counts)
                    if y >= level_counts_threshold
                ])
                worst_second_target_share_index = [
                    idx for idx, val in enumerate(second_target_shares)
                    if val == min_second_target_shares
                ]
                overall_second_percentage = sum_second_target * 100.0 / total

                # DataFrame for contribution calculation

                df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                                        filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                                        select(self._second_level_dimensions).toPandas()
                df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                                    select(self._second_level_dimensions).toPandas()

                # if self._chisquare_result.get_splits():
                #     splits = self._chisquare_result.get_splits()
                #     idx = self._chiSquareTable.get_bin_names(splits).index(second_target_top_dims[0])
                #     idx1 = self._chiSquareTable.get_bin_names(splits).index(top_target_top_dims[0])
                #     splits[len(splits)-1] = splits[len(splits)-1]+1
                #     df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                #                         filter(col(self._analysed_dimension)>=splits[idx]).filter(col(self._analysed_dimension)<splits[idx+1]).\
                #                         select(self._second_level_dimensions).toPandas()
                #     df_second_dim = self._data_frame.filter(col(self._analysed_dimension)>=splits[idx]).\
                #                     filter(col(self._analysed_dimension)<splits[idx+1]).\
                #                     select(self._second_level_dimensions).toPandas()
                # else:
                #     df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                #                         filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                #                         select(self._second_level_dimensions).toPandas()
                #     df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                #                     select(self._second_level_dimensions).toPandas()

                # print self._data_frame.select('Sales').show()

                distribution_second = []
                for d in self._second_level_dimensions:

                    grouped = df_second_target.groupby(d).agg({
                        d: 'count'
                    }).sort_values(d, ascending=False)
                    contributions = df_second_dim.groupby(d).agg({d: 'count'})
                    contribution_index = list(contributions.index)
                    contributions_val = contributions[d].tolist()
                    contributions_list = dict(
                        zip(contribution_index, contributions_val))
                    index_list = list(grouped.index)
                    grouped_list = grouped[d].tolist()
                    contributions_percent_list = [
                        round(y * 100.0 / contributions_list[x], 2)
                        for x, y in zip(index_list, grouped_list)
                    ]
                    sum_ = grouped[d].sum()
                    diffs = [0] + [
                        grouped_list[i] - grouped_list[i + 1]
                        for i in range(len(grouped_list) - 1)
                    ]
                    max_diff = diffs.index(max(diffs))

                    index_txt = ''
                    if max_diff == 1:
                        index_txt = index_list[0]
                    elif max_diff == 2:
                        index_txt = index_list[0] + '(' + str(
                            round(grouped_list[0] * 100.0 / sum_, 1)
                        ) + '%)' + ' and ' + index_list[1] + '(' + str(
                            round(grouped_list[1] * 100.0 / sum_, 1)) + '%)'
                    elif max_diff > 2:
                        index_txt = 'including ' + index_list[0] + '(' + str(
                            round(grouped_list[0] * 100.0 / sum_, 1)
                        ) + '%)' + ' and ' + index_list[1] + '(' + str(
                            round(grouped_list[1] * 100.0 / sum_, 1)) + '%)'
                    distribution_second.append({'contributions':[round(i*100.0/sum_,2) for i in grouped_list[:max_diff]],\
                                            'levels': index_list[:max_diff],'variation':random.randint(1,100),\
                                            'index_txt': index_txt, 'd':d,'contributions_percent':contributions_percent_list})

                targetCardDataDict['distribution_second'] = distribution_second
                targetCardDataDict['second_target'] = targetLevel
                targetCardDataDict[
                    'second_target_top_dims'] = second_target_top_dims
                targetCardDataDict[
                    'second_target_top_dims_contribution'] = second_target_top_dims_contribution * 100.0 / sum(
                        second_target_contributions)
                targetCardDataDict[
                    'second_target_bottom_dim'] = second_target_bottom_dim
                targetCardDataDict[
                    'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution
                targetCardDataDict['best_second_target'] = levels[
                    best_second_target_index]
                targetCardDataDict[
                    'best_second_target_count'] = second_target_contributions[
                        best_second_target_index]
                targetCardDataDict['best_second_target_percent'] = round(
                    second_target_contributions[best_second_target_index] *
                    100.0 / sum(second_target_contributions), 2)
                targetCardDataDict['worst_second_target'] = levels[
                    worst_second_target_index]
                targetCardDataDict['worst_second_target_percent'] = round(
                    second_target_contributions[worst_second_target_index] *
                    100.0 / sum(second_target_contributions), 2)

                card2Data = []
                targetLevelContributions = [
                    table.get_value(targetLevel, i) for i in levels
                ]
                card2Heading = '<h3>Distribution of ' + self._target_dimension + ' (' + targetLevel + ') across ' + self._analysed_dimension + "</h3>"
                chart, bubble = self.generate_distribution_card_chart(
                    targetLevel, targetLevelContributions, levels,
                    level_counts, total)
                card2ChartData = NormalChartData(data=chart["data"])
                card2ChartJson = ChartJson()
                card2ChartJson.set_data(card2ChartData.get_data())
                card2ChartJson.set_chart_type("combination")
                card2ChartJson.set_types({
                    "total": "bar",
                    "percentage": "line"
                })
                card2ChartJson.set_legend({
                    "total": "# of " + targetLevel,
                    "percentage": "% of " + targetLevel
                })
                card2ChartJson.set_axes({
                    "x": "key",
                    "y": "total",
                    "y2": "percentage"
                })
                card2ChartJson.set_label_text({
                    "x": " ",
                    "y": "Count",
                    "y2": "Percentage"
                })
                print "self._binTargetCol & self._binAnalyzedCol : ", self._binTargetCol, self._binAnalyzedCol
                if (self._binTargetCol == True & self._binAnalyzedCol ==
                        False):
                    print "Only Target Column is Binned"
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2_binned_target.html',
                            targetCardDataDict), self._blockSplitter)
                elif (self._binTargetCol == True & self._binAnalyzedCol ==
                      True):
                    print "Target Column and IV is Binned"
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2_binned_target_and_IV.html',
                            targetCardDataDict), self._blockSplitter)
                else:
                    print "In Else, self._binTargetCol should be False : ", self._binTargetCol
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2.html', targetCardDataDict),
                        self._blockSplitter)

                card2Data.append(HtmlData(data=card2Heading))
                statistical_info_array = [
                    ("Test Type", "Chi-Square"),
                    ("Chi-Square statistic",
                     str(round(self._chisquare_result.get_stat(), 3))),
                    ("P-Value",
                     str(round(self._chisquare_result.get_pvalue(), 3))),
                    ("Inference",
                     "Chi-squared analysis shows a significant association between {} (target) and {}."
                     .format(self._target_dimension, self._analysed_dimension))
                ]
                statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                    statistical_info_array)

                card2Data.append(
                    C3ChartData(data=card2ChartJson,
                                info=statistical_info_array))
                card2Data += output2
                card2BubbleData = "<div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div><div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div>".format(
                    bubble[0]["value"], bubble[0]["text"], bubble[1]["value"],
                    bubble[1]["text"])
                card2Data.append(HtmlData(data=card2BubbleData))
                targetCard = NormalCard()
                targetCard.set_card_data(card2Data)
                targetCard.set_card_name("{} : Distribution of {}".format(
                    self._analysed_dimension, targetLevel))
                self._targetCards.append(targetCard)
                dict_for_test[targetLevel] = targetCardDataDict
        out = {'data_dict': data_dict, 'target_dict': dict_for_test}

        return out
Ejemplo n.º 7
0
class ChiSquareAnalysis:
    def __init__(self,
                 df_context,
                 df_helper,
                 chisquare_result,
                 target_dimension,
                 analysed_dimension,
                 significant_variables,
                 num_analysed_variables,
                 data_frame,
                 measure_columns,
                 base_dir,
                 appid=None,
                 target_chisquare_result=None):
        self._blockSplitter = "|~NEWBLOCK~|"
        self._highlightFlag = "|~HIGHLIGHT~|"
        self._dimensionNode = NarrativesTree()
        self._dimensionNode.set_name(target_dimension)
        self._data_frame = data_frame
        self._dataframe_context = df_context
        self._dataframe_helper = df_helper
        self._chisquare_result = chisquare_result
        self._target_dimension = target_dimension
        self._analysed_dimension = analysed_dimension
        self._significant_variables = significant_variables
        self._target_chisquare_result = target_chisquare_result
        self._measure_columns = self._dataframe_helper.get_numeric_columns()
        self._chiSquareLevelLimit = GLOBALSETTINGS.CHISQUARELEVELLIMIT

        self._num_analysed_variables = num_analysed_variables
        self._chiSquareTable = chisquare_result.get_contingency_table()

        significant_variables = list(
            set(significant_variables) - {analysed_dimension})
        if len(significant_variables) <= 20:
            if len(significant_variables) <= 3:
                self._second_level_dimensions = list(significant_variables)
            else:
                self._second_level_dimensions = list(significant_variables)[:3]
        else:
            self._second_level_dimensions = list(significant_variables)[:5]

        print self._second_level_dimensions

        self._appid = appid
        self._card1 = NormalCard()
        self._targetCards = []
        self._base_dir = base_dir

        self._binTargetCol = False
        self._binAnalyzedCol = False
        print "--------Chi-Square Narratives for ", analysed_dimension, "---------"
        if self._dataframe_context.get_custom_analysis_details() != None:
            binnedColObj = [
                x["colName"]
                for x in self._dataframe_context.get_custom_analysis_details()
            ]
            print "analysed_dimension : ", self._analysed_dimension
            if binnedColObj != None and self._target_dimension in binnedColObj:
                self._binTargetCol = True
            if binnedColObj != None and (
                    self._analysed_dimension in binnedColObj
                    or self._analysed_dimension in self._measure_columns):
                self._binAnalyzedCol = True

        if self._appid == None:
            self._generate_narratives()
            self._dimensionNode.add_cards([self._card1] + self._targetCards)
            self._dimensionNode.set_name("{}".format(analysed_dimension))
        elif self._appid == "2":
            self._generate_narratives()
            self._dimensionNode.add_cards([self._card1])
            self._dimensionNode.set_name("{}".format(analysed_dimension))
        elif self._appid == "1":
            self._generate_narratives()
            self._dimensionNode.add_cards([self._card1])
            self._dimensionNode.set_name("{}".format(analysed_dimension))

    def get_dimension_node(self):
        """Return the dimension node as JSON-decoded Python data.

        The node is serialised with
        ``CommonUtils.convert_python_object_to_json`` and the resulting
        string is parsed back with ``json.loads``.
        """
        serialized_node = CommonUtils.convert_python_object_to_json(
            self._dimensionNode)
        return json.loads(serialized_node)

    def get_dimension_card1(self):
        return self._card1

    def _generate_narratives(self):
        chisquare_result = self._chisquare_result
        target_dimension = self._target_dimension
        analysed_dimension = self._analysed_dimension
        significant_variables = self._significant_variables
        num_analysed_variables = self._num_analysed_variables
        table = self._chiSquareTable
        total = self._chiSquareTable.get_total()

        levels = self._chiSquareTable.get_column_two_levels()
        level_counts = self._chiSquareTable.get_column_total()
        levels_count_sum = sum(level_counts)
        levels_percentages = [
            i * 100.0 / levels_count_sum for i in level_counts
        ]
        sorted_levels = sorted(zip(level_counts, levels), reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]

        top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        bottom_dim = sorted_levels[-1][1]
        bottom_dim_contribution = sorted_levels[-1][0]
        bottom_dims = [
            y for x, y in sorted_levels if x == bottom_dim_contribution
        ]

        target_levels = self._chiSquareTable.get_column_one_levels()

        target_counts = self._chiSquareTable.get_row_total()
        sorted_target_levels = sorted(zip(target_counts, target_levels),
                                      reverse=True)

        top_target_count, top_target = sorted_target_levels[0]
        second_target_count, second_target = sorted_target_levels[1]

        top_target_contributions = [
            table.get_value(top_target, i) for i in levels
        ]
        sum_top_target = sum(top_target_contributions)

        sorted_levels = sorted(zip(top_target_contributions, levels),
                               reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]
        top_target_top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        top_target_top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        top_target_bottom_dim = sorted_levels[-1][1]
        top_target_bottom_dim_contribution = sorted_levels[-1][0]

        top_target_percentages = [
            i * 100.0 / sum_top_target for i in top_target_contributions
        ]
        best_top_target_index = top_target_contributions.index(
            max(top_target_contributions))
        worst_top_target_index = top_target_contributions.index(
            min(top_target_contributions))
        top_target_differences = [
            x - y for x, y in zip(levels_percentages, top_target_percentages)
        ]
        if len(top_target_differences) > 6:
            tops = 2
            bottoms = -2
        elif len(top_target_differences) > 4:
            tops = 2
            bottoms = -1
        else:
            tops = 1
            bottoms = -1
        sorted_ = sorted(enumerate(top_target_differences),
                         key=lambda x: x[1],
                         reverse=True)
        best_top_difference_indices = [x for x, y in sorted_[:tops]]
        worst_top_difference_indices = [x for x, y in sorted_[bottoms:]]

        top_target_shares = [
            x * 100.0 / y
            for x, y in zip(top_target_contributions, level_counts)
        ]
        max_top_target_shares = max(top_target_shares)
        best_top_target_share_index = [
            idx for idx, val in enumerate(top_target_shares)
            if val == max_top_target_shares
        ]
        level_counts_threshold = sum(level_counts) * 0.05 / len(level_counts)
        min_top_target_shares = min([
            x for x, y in zip(top_target_shares, level_counts)
            if y >= level_counts_threshold
        ])
        worst_top_target_share_index = [
            idx for idx, val in enumerate(top_target_shares)
            if val == min_top_target_shares
        ]
        overall_top_percentage = sum_top_target * 100.0 / total

        second_target_contributions = [
            table.get_value(second_target, i) for i in levels
        ]
        sum_second_target = sum(second_target_contributions)

        sorted_levels = sorted(zip(second_target_contributions, levels),
                               reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]
        second_target_top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        second_target_top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        second_target_bottom_dim = sorted_levels[-1][1]
        second_target_bottom_dim_contribution = sorted_levels[-1][0]

        second_target_percentages = [
            i * 100.0 / sum_second_target for i in second_target_contributions
        ]
        best_second_target_index = second_target_contributions.index(
            max(second_target_contributions))
        worst_second_target_index = second_target_contributions.index(
            min(second_target_contributions))
        second_target_differences = [
            x - y
            for x, y in zip(levels_percentages, second_target_percentages)
        ]
        if len(second_target_differences) > 6:
            tops = 2
            bottoms = -2
        elif len(second_target_differences) > 4:
            tops = 2
            bottoms = -1
        else:
            tops = 1
            bottoms = -1
        sorted_ = sorted(enumerate(second_target_differences),
                         key=lambda x: x[1],
                         reverse=True)
        best_second_difference_indices = [x for x, y in sorted_[:tops]]
        worst_second_difference_indices = [x for x, y in sorted_[bottoms:]]

        second_target_shares = [
            x * 100.0 / y
            for x, y in zip(second_target_contributions, level_counts)
        ]
        max_second_target_shares = max(second_target_shares)
        best_second_target_share_index = [
            idx for idx, val in enumerate(second_target_shares)
            if val == max_second_target_shares
        ]
        level_counts_threshold = sum(level_counts) * 0.05 / len(level_counts)
        min_second_target_shares = min([
            x for x, y in zip(second_target_shares, level_counts)
            if y >= level_counts_threshold
        ])
        # worst_second_target_share_index = second_target_shares.index(min_second_target_shares)
        worst_second_target_share_index = [
            idx for idx, val in enumerate(second_target_shares)
            if val == min_second_target_shares
        ]
        overall_second_percentage = sum_second_target * 100.0 / total

        targetCardDataDict = {}
        targetCardDataDict['target'] = target_dimension
        targetCardDataDict['colname'] = analysed_dimension
        targetCardDataDict['num_significant'] = len(significant_variables)
        targetCardDataDict['plural_colname'] = NarrativesUtils.pluralize(
            analysed_dimension)

        targetCardDataDict["blockSplitter"] = self._blockSplitter
        targetCardDataDict["binTargetCol"] = self._binTargetCol
        targetCardDataDict["binAnalyzedCol"] = self._binAnalyzedCol
        targetCardDataDict['highlightFlag'] = self._highlightFlag
        targetCardDataDict['levels'] = levels

        data_dict = {}
        data_dict[
            'best_second_difference'] = best_second_difference_indices  ##these changed
        data_dict['worst_second_difference'] = worst_second_difference_indices
        data_dict['best_top_difference'] = best_top_difference_indices
        data_dict['worst_top_difference'] = worst_top_difference_indices
        data_dict['levels_percentages'] = levels_percentages
        data_dict['top_target_percentages'] = top_target_percentages
        data_dict['second_target_percentages'] = second_target_percentages
        data_dict['levels'] = levels
        data_dict['best_top_share'] = best_top_target_share_index
        data_dict['worst_top_share'] = worst_top_target_share_index
        data_dict['best_second_share'] = best_second_target_share_index
        data_dict['worst_second_share'] = worst_second_target_share_index
        data_dict['top_target_shares'] = top_target_shares
        data_dict['second_target_shares'] = second_target_shares
        data_dict['overall_second'] = overall_second_percentage
        data_dict['overall_top'] = overall_top_percentage

        data_dict['num_significant'] = len(significant_variables)
        data_dict['colname'] = analysed_dimension
        data_dict['plural_colname'] = NarrativesUtils.pluralize(
            analysed_dimension)
        data_dict['target'] = target_dimension
        data_dict['top_levels'] = top_dims
        data_dict['top_levels_percent'] = round(
            top_dims_contribution * 100.0 / total, 1)
        data_dict['bottom_level'] = bottom_dim
        data_dict['bottom_levels'] = bottom_dims
        data_dict['bottom_level_percent'] = round(
            bottom_dim_contribution * 100 / sum(level_counts), 2)
        data_dict['second_target'] = second_target
        data_dict['second_target_top_dims'] = second_target_top_dims
        data_dict[
            'second_target_top_dims_contribution'] = second_target_top_dims_contribution * 100.0 / sum(
                second_target_contributions)
        data_dict['second_target_bottom_dim'] = second_target_bottom_dim
        data_dict[
            'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution
        data_dict['best_second_target'] = levels[best_second_target_index]
        data_dict['best_second_target_count'] = second_target_contributions[
            best_second_target_index]
        data_dict['best_second_target_percent'] = round(
            second_target_contributions[best_second_target_index] * 100.0 /
            sum(second_target_contributions), 2)
        data_dict['worst_second_target'] = levels[worst_second_target_index]
        data_dict['worst_second_target_percent'] = round(
            second_target_contributions[worst_second_target_index] * 100.0 /
            sum(second_target_contributions), 2)

        data_dict['top_target'] = top_target
        data_dict['top_target_top_dims'] = top_target_top_dims
        data_dict[
            'top_target_top_dims_contribution'] = top_target_top_dims_contribution * 100.0 / sum(
                top_target_contributions)
        data_dict['top_target_bottom_dim'] = top_target_bottom_dim
        data_dict[
            'top_target_bottom_dim_contribution'] = top_target_bottom_dim_contribution
        data_dict['best_top_target'] = levels[best_top_target_index]
        data_dict['best_top_target_count'] = top_target_contributions[
            best_top_target_index]
        data_dict['best_top_target_percent'] = round(
            top_target_contributions[best_top_target_index] * 100.0 /
            sum(top_target_contributions), 2)
        data_dict['worst_top_target'] = levels[worst_top_target_index]
        data_dict['worst_top_target_percent'] = round(
            top_target_contributions[worst_top_target_index] * 100.0 /
            sum(top_target_contributions), 2)

        data_dict["blockSplitter"] = self._blockSplitter
        data_dict["binTargetCol"] = self._binTargetCol
        data_dict["binAnalyzedCol"] = self._binAnalyzedCol
        data_dict['highlightFlag'] = self._highlightFlag

        ###############
        #     CARD1   #
        ###############

        print "self._binTargetCol & self._binAnalyzedCol : ", self._binTargetCol, self._binAnalyzedCol
        if (self._binTargetCol == True & self._binAnalyzedCol == False):
            print "Only Target Column is Binned, : ", self._binTargetCol
            output = NarrativesUtils.block_splitter(
                NarrativesUtils.get_template_output(
                    self._base_dir, 'card1_binned_target.html', data_dict),
                self._blockSplitter,
                highlightFlag=self._highlightFlag)
        elif (self._binTargetCol == True & self._binAnalyzedCol == True):
            print "Target Column and IV is Binned : ", self._binTargetCol, self._binAnalyzedCol
            output = NarrativesUtils.block_splitter(
                NarrativesUtils.get_template_output(
                    self._base_dir, 'card1_binned_target_and_IV.html',
                    data_dict),
                self._blockSplitter,
                highlightFlag=self._highlightFlag)
        else:
            output = NarrativesUtils.block_splitter(
                NarrativesUtils.get_template_output(self._base_dir,
                                                    'card1.html', data_dict),
                self._blockSplitter,
                highlightFlag=self._highlightFlag)

        targetDimCard1Data = []
        targetDimcard1Heading = '<h3>Relationship between ' + self._target_dimension + '  and ' + self._analysed_dimension + "</h3>"

        toggledata = ToggleData()

        targetDimTable1Data = self.generate_card1_table1()
        targetDimCard1Table1 = TableData()
        targetDimCard1Table1.set_table_type("heatMap")
        targetDimCard1Table1.set_table_data(targetDimTable1Data)
        toggledata.set_toggleon_data({
            "data": {
                "tableData": targetDimTable1Data,
                "tableType": "heatMap"
            },
            "dataType": "table"
        })

        targetDimTable2Data = self.generate_card1_table2()
        targetDimCard1Table2 = TableData()
        targetDimCard1Table2.set_table_type("normal")
        table2Data = targetDimTable2Data["data1"]
        table2Data = [
            innerList[1:] for innerList in table2Data
            if innerList[0].strip() != ""
        ]
        targetDimCard1Table2.set_table_data(table2Data)

        toggledata.set_toggleoff_data({
            "data": {
                "tableData": table2Data,
                "tableType": "heatMap"
            },
            "dataType": "table"
        })

        targetDimCard1Data.append(HtmlData(data=targetDimcard1Heading))
        targetDimCard1Data.append(toggledata)
        targetDimCard1Data += output

        self._card1.set_card_data(targetDimCard1Data)
        self._card1.set_card_name("{}: Relationship with {}".format(
            self._analysed_dimension, self._target_dimension))

        ###############
        #     CARD2   #
        ###############

        if self._appid == None:

            key_factors = ''
            num_key_factors = len(self._second_level_dimensions)

            if len(self._second_level_dimensions) == 5:
                key_factors = ', '.join(
                    self._second_level_dimensions[:4]
                ) + ' and ' + self._second_level_dimensions[4]
            elif len(self._second_level_dimensions) == 4:
                key_factors = ', '.join(
                    self._second_level_dimensions[:3]
                ) + ' and ' + self._second_level_dimensions[3]
            elif len(self._second_level_dimensions) == 3:
                key_factors = ', '.join(
                    self._second_level_dimensions[:2]
                ) + ' and ' + self._second_level_dimensions[2]
            elif len(self._second_level_dimensions) == 2:
                key_factors = ' and '.join(self._second_level_dimensions)
            elif len(self._second_level_dimensions) == 1:
                key_factors = self._second_level_dimensions[0]

            targetCardDataDict['num_key_factors'] = num_key_factors
            targetCardDataDict['key_factors'] = key_factors
            dict_for_test = {}
            for tupleObj in sorted_target_levels[:self._chiSquareLevelLimit]:
                targetLevel = tupleObj[1]

                targetCardDataDict['random_card2'] = random.randint(1, 100)
                targetCardDataDict['random_card4'] = random.randint(1, 100)

                second_target_contributions = [
                    table.get_value(targetLevel, i) for i in levels
                ]
                sum_second_target = sum(second_target_contributions)

                sorted_levels = sorted(zip(second_target_contributions,
                                           levels),
                                       reverse=True)

                level_differences = [0.0] + [
                    sorted_levels[i][0] - sorted_levels[i + 1][0]
                    for i in range(len(sorted_levels) - 1)
                ]
                second_target_top_dims = [
                    j for i, j in sorted_levels[:level_differences.
                                                index(max(level_differences))]
                ]
                second_target_top_dims_contribution = sum([
                    i for i, j in sorted_levels[:level_differences.
                                                index(max(level_differences))]
                ])
                second_target_bottom_dim = sorted_levels[-1][1]
                second_target_bottom_dim_contribution = sorted_levels[-1][0]

                second_target_percentages = [
                    i * 100.0 / sum_second_target
                    for i in second_target_contributions
                ]
                best_second_target_index = second_target_contributions.index(
                    max(second_target_contributions))
                worst_second_target_index = second_target_contributions.index(
                    min(second_target_contributions))
                second_target_differences = [
                    x - y for x, y in zip(levels_percentages,
                                          second_target_percentages)
                ]
                if len(second_target_differences) > 6:
                    tops = 2
                    bottoms = -2
                elif len(second_target_differences) > 4:
                    tops = 2
                    bottoms = -1
                else:
                    tops = 1
                    bottoms = -1
                sorted_ = sorted(enumerate(second_target_differences),
                                 key=lambda x: x[1],
                                 reverse=True)
                best_second_difference_indices = [x for x, y in sorted_[:tops]]
                worst_second_difference_indices = [
                    x for x, y in sorted_[bottoms:]
                ]

                second_target_shares = [
                    x * 100.0 / y
                    for x, y in zip(second_target_contributions, level_counts)
                ]
                max_second_target_shares = max(second_target_shares)
                best_second_target_share_index = [
                    idx for idx, val in enumerate(second_target_shares)
                    if val == max_second_target_shares
                ]
                level_counts_threshold = sum(level_counts) * 0.05 / len(
                    level_counts)
                min_second_target_shares = min([
                    x for x, y in zip(second_target_shares, level_counts)
                    if y >= level_counts_threshold
                ])
                worst_second_target_share_index = [
                    idx for idx, val in enumerate(second_target_shares)
                    if val == min_second_target_shares
                ]
                overall_second_percentage = sum_second_target * 100.0 / total

                # DataFrame for contribution calculation

                df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                                        filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                                        select(self._second_level_dimensions).toPandas()
                df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                                    select(self._second_level_dimensions).toPandas()

                # if self._chisquare_result.get_splits():
                #     splits = self._chisquare_result.get_splits()
                #     idx = self._chiSquareTable.get_bin_names(splits).index(second_target_top_dims[0])
                #     idx1 = self._chiSquareTable.get_bin_names(splits).index(top_target_top_dims[0])
                #     splits[len(splits)-1] = splits[len(splits)-1]+1
                #     df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                #                         filter(col(self._analysed_dimension)>=splits[idx]).filter(col(self._analysed_dimension)<splits[idx+1]).\
                #                         select(self._second_level_dimensions).toPandas()
                #     df_second_dim = self._data_frame.filter(col(self._analysed_dimension)>=splits[idx]).\
                #                     filter(col(self._analysed_dimension)<splits[idx+1]).\
                #                     select(self._second_level_dimensions).toPandas()
                # else:
                #     df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                #                         filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                #                         select(self._second_level_dimensions).toPandas()
                #     df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                #                     select(self._second_level_dimensions).toPandas()

                # print self._data_frame.select('Sales').show()

                distribution_second = []
                for d in self._second_level_dimensions:

                    grouped = df_second_target.groupby(d).agg({
                        d: 'count'
                    }).sort_values(d, ascending=False)
                    contributions = df_second_dim.groupby(d).agg({d: 'count'})
                    contribution_index = list(contributions.index)
                    contributions_val = contributions[d].tolist()
                    contributions_list = dict(
                        zip(contribution_index, contributions_val))
                    index_list = list(grouped.index)
                    grouped_list = grouped[d].tolist()
                    contributions_percent_list = [
                        round(y * 100.0 / contributions_list[x], 2)
                        for x, y in zip(index_list, grouped_list)
                    ]
                    sum_ = grouped[d].sum()
                    diffs = [0] + [
                        grouped_list[i] - grouped_list[i + 1]
                        for i in range(len(grouped_list) - 1)
                    ]
                    max_diff = diffs.index(max(diffs))

                    index_txt = ''
                    if max_diff == 1:
                        index_txt = index_list[0]
                    elif max_diff == 2:
                        index_txt = index_list[0] + '(' + str(
                            round(grouped_list[0] * 100.0 / sum_, 1)
                        ) + '%)' + ' and ' + index_list[1] + '(' + str(
                            round(grouped_list[1] * 100.0 / sum_, 1)) + '%)'
                    elif max_diff > 2:
                        index_txt = 'including ' + index_list[0] + '(' + str(
                            round(grouped_list[0] * 100.0 / sum_, 1)
                        ) + '%)' + ' and ' + index_list[1] + '(' + str(
                            round(grouped_list[1] * 100.0 / sum_, 1)) + '%)'
                    distribution_second.append({'contributions':[round(i*100.0/sum_,2) for i in grouped_list[:max_diff]],\
                                            'levels': index_list[:max_diff],'variation':random.randint(1,100),\
                                            'index_txt': index_txt, 'd':d,'contributions_percent':contributions_percent_list})

                targetCardDataDict['distribution_second'] = distribution_second
                targetCardDataDict['second_target'] = targetLevel
                targetCardDataDict[
                    'second_target_top_dims'] = second_target_top_dims
                targetCardDataDict[
                    'second_target_top_dims_contribution'] = second_target_top_dims_contribution * 100.0 / sum(
                        second_target_contributions)
                targetCardDataDict[
                    'second_target_bottom_dim'] = second_target_bottom_dim
                targetCardDataDict[
                    'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution
                targetCardDataDict['best_second_target'] = levels[
                    best_second_target_index]
                targetCardDataDict[
                    'best_second_target_count'] = second_target_contributions[
                        best_second_target_index]
                targetCardDataDict['best_second_target_percent'] = round(
                    second_target_contributions[best_second_target_index] *
                    100.0 / sum(second_target_contributions), 2)
                targetCardDataDict['worst_second_target'] = levels[
                    worst_second_target_index]
                targetCardDataDict['worst_second_target_percent'] = round(
                    second_target_contributions[worst_second_target_index] *
                    100.0 / sum(second_target_contributions), 2)

                card2Data = []
                targetLevelContributions = [
                    table.get_value(targetLevel, i) for i in levels
                ]
                card2Heading = '<h3>Distribution of ' + self._target_dimension + ' (' + targetLevel + ') across ' + self._analysed_dimension + "</h3>"
                chart, bubble = self.generate_distribution_card_chart(
                    targetLevel, targetLevelContributions, levels,
                    level_counts, total)
                card2ChartData = NormalChartData(data=chart["data"])
                card2ChartJson = ChartJson()
                card2ChartJson.set_data(card2ChartData.get_data())
                card2ChartJson.set_chart_type("combination")
                card2ChartJson.set_types({
                    "total": "bar",
                    "percentage": "line"
                })
                card2ChartJson.set_legend({
                    "total": "# of " + targetLevel,
                    "percentage": "% of " + targetLevel
                })
                card2ChartJson.set_axes({
                    "x": "key",
                    "y": "total",
                    "y2": "percentage"
                })
                card2ChartJson.set_label_text({
                    "x": " ",
                    "y": "Count",
                    "y2": "Percentage"
                })
                print "self._binTargetCol & self._binAnalyzedCol : ", self._binTargetCol, self._binAnalyzedCol
                if (self._binTargetCol == True & self._binAnalyzedCol ==
                        False):
                    print "Only Target Column is Binned"
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2_binned_target.html',
                            targetCardDataDict), self._blockSplitter)
                elif (self._binTargetCol == True & self._binAnalyzedCol ==
                      True):
                    print "Target Column and IV is Binned"
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2_binned_target_and_IV.html',
                            targetCardDataDict), self._blockSplitter)
                else:
                    print "In Else, self._binTargetCol should be False : ", self._binTargetCol
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2.html', targetCardDataDict),
                        self._blockSplitter)

                card2Data.append(HtmlData(data=card2Heading))
                statistical_info_array = [
                    ("Test Type", "Chi-Square"),
                    ("Chi-Square statistic",
                     str(round(self._chisquare_result.get_stat(), 3))),
                    ("P-Value",
                     str(round(self._chisquare_result.get_pvalue(), 3))),
                    ("Inference",
                     "Chi-squared analysis shows a significant association between {} (target) and {}."
                     .format(self._target_dimension, self._analysed_dimension))
                ]
                statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                    statistical_info_array)

                card2Data.append(
                    C3ChartData(data=card2ChartJson,
                                info=statistical_info_array))
                card2Data += output2
                card2BubbleData = "<div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div><div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div>".format(
                    bubble[0]["value"], bubble[0]["text"], bubble[1]["value"],
                    bubble[1]["text"])
                card2Data.append(HtmlData(data=card2BubbleData))
                targetCard = NormalCard()
                targetCard.set_card_data(card2Data)
                targetCard.set_card_name("{} : Distribution of {}".format(
                    self._analysed_dimension, targetLevel))
                self._targetCards.append(targetCard)
                dict_for_test[targetLevel] = targetCardDataDict
        out = {'data_dict': data_dict, 'target_dict': dict_for_test}

        return out

    # def generate_card2_narratives(self):

    def generate_distribution_card_chart(self, __target,
                                         __target_contributions, levels,
                                         levels_count, total):
        """Build the chart rows and the two headline bubbles for one target level.

        Args:
            __target: Name of the target level being described; concatenated
                into the legend and bubble texts.
            __target_contributions: List with the count of ``__target``
                records in each level, aligned with ``levels``.
            levels: Level names of the analysed dimension.
            levels_count: Record count of each level, aligned with ``levels``.
            total: Not used by this method; kept so existing callers keep
                working.

        Returns:
            A ``(chart_data, bubble_data)`` tuple. ``chart_data`` is a dict
            holding the legend under ``'label'`` and, under ``'data'``, one
            ``{"key", "total", "percentage"}`` row per level. ``bubble_data``
            is a two-item list of ``{'value', 'text'}`` dicts: first the level
            contributing the largest count of ``__target``, then the level
            with the highest ``__target`` percentage.

        Raises:
            ValueError: If ``__target_contributions`` is empty.
        """
        # Percentage of each level's records that belong to the target level.
        percentages = [
            count * 100.0 / level_total
            for count, level_total in zip(__target_contributions, levels_count)
        ]
        rows = [
            {"key": level, "total": count, "percentage": pct}
            for level, count, pct in zip(levels, __target_contributions,
                                         percentages)
        ]
        chart_data = {
            'label': {
                'total': '# of ' + __target,
                'percentage': '% of ' + __target
            },
            'data': rows
        }

        # Bubble 1: the level holding the largest share of all target records.
        # On ties ``index`` returns the first matching level.
        top_count = max(__target_contributions)
        top_count_level = levels[__target_contributions.index(top_count)]
        share_bubble = {
            'value': str(
                round(top_count * 100.0 / sum(__target_contributions), 1)) + '%',
            'text': 'Overall ' + __target + ' comes from ' + top_count_level
        }

        # Bubble 2: the level whose own records are most often the target.
        top_rate = max(percentages)
        top_rate_level = levels[percentages.index(top_rate)]
        rate_bubble = {
            'value': str(round(top_rate, 1)) + '%',
            'text': top_rate_level + ' has the highest rate of ' + __target
        }

        return chart_data, [share_bubble, rate_bubble]

    def generate_card1_table1(self):
        """Assemble the rows of the first card-1 table.

        Returns:
            A list of rows. The first row is the analysed dimension name
            followed by the contingency table's column-one levels. Each
            following row starts with one entry of ``column_two_values`` and
            continues with the values found at the same position in every
            list of ``table_percent_by_column``.
        """
        contingency = self._chiSquareTable
        percent_columns = contingency.table_percent_by_column
        first_column = contingency.column_two_values
        heading = [self._analysed_dimension
                   ] + contingency.get_column_one_levels()

        # Transpose [labels, col_1, col_2, ...] into one list per label.
        body_rows = []
        for row in zip(*([first_column] + percent_columns)):
            body_rows.append(list(row))
        return [heading] + body_rows

    def generate_card1_table2(self):
        """Build the detailed count/percentage table for card 1.

        For every level of the analysed dimension four rows are produced:
        the raw counts (tagged ``'bold'``), the percentages within the
        analysed dimension, the percentages within the target dimension and
        the percentages of the grand total. Each row ends with a total cell.

        The four contingency matrices are read as ``matrix[i][idx]`` where
        ``idx`` is the position of the level in ``get_column_two_levels()``;
        the values of one level are laid out under the column-one levels.

        Returns:
            A dict with keys ``'header'``, ``'header1'``, ``'data'`` (one
            dict per row, keyed by ``'Tag'`` plus ``header``), ``'label'``
            (the analysed dimension name) and ``'data1'`` (the same rows as
            lists, preceded by ``['Tag'] + header1``).
        """
        contingency = self._chiSquareTable
        target_levels = contingency.get_column_one_levels()
        dim_levels = contingency.get_column_two_levels()

        # Transpose every matrix once, outside the loop. ``list`` keeps the
        # result indexable on Python 3, where ``zip`` returns an iterator.
        counts_by_level = list(zip(*contingency.table))
        pct_total_by_level = list(zip(*contingency.table_percent))
        pct_row_by_level = list(zip(*contingency.table_percent_by_row))
        pct_col_by_level = list(zip(*contingency.table_percent_by_column))

        header1 = [self._analysed_dimension] + target_levels + ['Total']
        # NOTE(review): placeholder labels the original marked "TODO remove".
        # They are kept because they are part of the returned structure; the
        # ``data`` dicts are keyed by them, so with more than two target
        # levels ``zip`` below silently drops the surplus cells.
        header = ['State', 'Active', 'Churn', 'Total']
        row_keys = ['Tag'] + header
        data = []
        data1 = [['Tag'] + header1]

        for idx, lvl in enumerate(dim_levels):
            counts = counts_by_level[idx]
            pct_total = pct_total_by_level[idx]
            # Share of the grand total held by this level; used as the
            # total cell of both the "within target" and "of Total" rows.
            level_share = round(sum(pct_total), 2)

            level_rows = [
                ['bold', lvl] + list(counts) + [sum(counts)],
                ['', 'As % within ' + self._analysed_dimension] +
                list(pct_col_by_level[idx]) + [100.0],
                ['', 'As % within ' + self._target_dimension] +
                list(pct_row_by_level[idx]) + [level_share],
                ['', 'As % of Total'] + list(pct_total) + [level_share],
            ]
            for row in level_rows:
                data.append(dict(zip(row_keys, row)))
                data1.append(row)

        out = {
            'header': header,
            'header1': header1,
            'data': data,
            'label': self._analysed_dimension,
            'data1': data1
        }
        return out
Ejemplo n.º 8
0
class ChiSquareAnalysis(object):
    def __init__(self,
                 df_context,
                 df_helper,
                 chisquare_result,
                 target_dimension,
                 analysed_dimension,
                 significant_variables,
                 num_analysed_variables,
                 data_frame,
                 measure_columns,
                 base_dir,
                 appid=None,
                 target_chisquare_result=None):
        """Store the analysis inputs and build the narrative cards.

        Args:
            df_context: Dataframe context; its ``_pandas_flag`` attribute and
                ``get_custom_analysis_details()`` are read here.
            df_helper: Dataframe helper; ``get_numeric_columns()`` supplies
                the measure columns.
            chisquare_result: Chi-square result; ``get_contingency_table()``
                supplies the contingency table.
            target_dimension: Name of the target column.
            analysed_dimension: Name of the column analysed against the
                target.
            significant_variables: Iterable of significant column names. The
                original value is stored as-is; a de-duplicated copy without
                ``analysed_dimension`` is used to pick the second-level
                dimensions.
            num_analysed_variables: Stored for later use by the narratives.
            data_frame: Data the narratives are computed from.
            measure_columns: Not used; the measure columns are taken from
                ``df_helper`` instead. Kept for caller compatibility.
            base_dir: Template base directory passed to the template helper.
            appid: ``None`` builds card 1 plus the target cards; ``"1"`` or
                ``"2"`` attach card 1 only; any other value generates
                nothing.
            target_chisquare_result: Optional result object, only stored.
        """
        self._blockSplitter = "|~NEWBLOCK~|"
        self._highlightFlag = "|~HIGHLIGHT~|"
        self._dimensionNode = NarrativesTree()
        self._dimensionNode.set_name(target_dimension)
        self._data_frame = data_frame
        self._dataframe_context = df_context
        self._pandas_flag = df_context._pandas_flag
        self._dataframe_helper = df_helper
        self._chisquare_result = chisquare_result
        self._target_dimension = target_dimension
        self._analysed_dimension = analysed_dimension
        self._significant_variables = significant_variables
        self._target_chisquare_result = target_chisquare_result
        self._measure_columns = self._dataframe_helper.get_numeric_columns()
        self._chiSquareLevelLimit = GLOBALSETTINGS.CHISQUARELEVELLIMIT

        self._num_analysed_variables = num_analysed_variables
        self._chiSquareTable = chisquare_result.get_contingency_table()

        # De-duplicate while keeping the caller's order. Slicing a set (as
        # done previously) made the chosen dimensions depend on set
        # iteration order instead of the order the caller supplied.
        seen = {analysed_dimension}
        candidates = []
        for variable in significant_variables:
            if variable not in seen:
                seen.add(variable)
                candidates.append(variable)
        # Up to 3 dimensions normally, 5 when more than 20 are available.
        limit = 3 if len(candidates) <= 20 else 5
        self._second_level_dimensions = candidates[:limit]

        print(self._second_level_dimensions)

        self._appid = appid
        self._card1 = NormalCard()
        self._targetCards = []
        self._base_dir = base_dir

        self._binTargetCol = False
        self._binAnalyzedCol = False
        print("--------Chi-Square Narratives for ", analysed_dimension,
              "---------")
        custom_details = self._dataframe_context.get_custom_analysis_details()
        if custom_details is not None:
            binned_columns = [x["colName"] for x in custom_details]
            print("analysed_dimension : ", self._analysed_dimension)
            if self._target_dimension in binned_columns:
                self._binTargetCol = True
            # A numeric analysed column is treated as binned as well.
            if (self._analysed_dimension in binned_columns
                    or self._analysed_dimension in self._measure_columns):
                self._binAnalyzedCol = True

        if self._appid is None or self._appid in ("1", "2"):
            self._generate_narratives()
            cards = [self._card1]
            if self._appid is None:
                # Target cards are filled in by _generate_narratives above.
                cards += self._targetCards
            self._dimensionNode.add_cards(cards)
            self._dimensionNode.set_name("{}".format(analysed_dimension))

    def get_dimension_node(self):
        """Return the dimension node as data parsed from its JSON form.

        The node is serialised with
        ``CommonUtils.convert_python_object_to_json`` and the resulting
        string is parsed back with ``json.loads``.
        """
        serialized_node = CommonUtils.convert_python_object_to_json(
            self._dimensionNode)
        return json.loads(serialized_node)

    def get_dimension_card1(self):
        return self._card1

    def _generate_narratives(self):
        chisquare_result = self._chisquare_result
        target_dimension = self._target_dimension
        analysed_dimension = self._analysed_dimension
        significant_variables = self._significant_variables
        num_analysed_variables = self._num_analysed_variables
        table = self._chiSquareTable
        total = self._chiSquareTable.get_total()

        levels = self._chiSquareTable.get_column_two_levels()
        level_counts = self._chiSquareTable.get_column_total()
        levels_count_sum = sum(level_counts)
        levels_percentages = [
            old_div(i * 100.0, levels_count_sum) for i in level_counts
        ]
        sorted_levels = sorted(zip(level_counts, levels), reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]

        top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        bottom_dim = sorted_levels[-1][1]
        bottom_dim_contribution = sorted_levels[-1][0]
        bottom_dims = [
            y for x, y in sorted_levels if x == bottom_dim_contribution
        ]

        target_levels = self._chiSquareTable.get_column_one_levels()

        target_counts = self._chiSquareTable.get_row_total()
        sorted_target_levels = sorted(zip(target_counts, target_levels),
                                      reverse=True)

        top_target_count, top_target = sorted_target_levels[0]
        second_target_count, second_target = sorted_target_levels[1]

        top_target_contributions = [
            table.get_value(top_target, i) for i in levels
        ]
        sum_top_target = sum(top_target_contributions)

        sorted_levels = sorted(zip(top_target_contributions, levels),
                               reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]
        top_target_top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        top_target_top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        top_target_bottom_dim = sorted_levels[-1][1]
        top_target_bottom_dim_contribution = sorted_levels[-1][0]

        top_target_percentages = [
            old_div(i * 100.0, sum_top_target)
            for i in top_target_contributions
        ]
        best_top_target_index = top_target_contributions.index(
            max(top_target_contributions))
        worst_top_target_index = top_target_contributions.index(
            min(top_target_contributions))
        top_target_differences = [
            x - y for x, y in zip(levels_percentages, top_target_percentages)
        ]
        if len(top_target_differences) > 6:
            tops = 2
            bottoms = -2
        elif len(top_target_differences) > 4:
            tops = 2
            bottoms = -1
        else:
            tops = 1
            bottoms = -1
        sorted_ = sorted(enumerate(top_target_differences),
                         key=lambda x: x[1],
                         reverse=True)
        best_top_difference_indices = [x for x, y in sorted_[:tops]]
        worst_top_difference_indices = [x for x, y in sorted_[bottoms:]]

        top_target_shares = [
            old_div(x * 100.0, y)
            for x, y in zip(top_target_contributions, level_counts)
        ]
        max_top_target_shares = max(top_target_shares)
        best_top_target_share_index = [
            idx for idx, val in enumerate(top_target_shares)
            if val == max_top_target_shares
        ]
        level_counts_threshold = old_div(
            sum(level_counts) * 0.05, len(level_counts))
        min_top_target_shares = min([
            x for x, y in zip(top_target_shares, level_counts)
            if y >= level_counts_threshold
        ])
        if max_top_target_shares == min_top_target_shares:
            worst_top_target_share_index = []
        else:
            worst_top_target_share_index = [
                idx for idx, val in enumerate(top_target_shares)
                if val == min_top_target_shares
            ]
        overall_top_percentage = old_div(sum_top_target * 100.0, total)

        second_target_contributions = [
            table.get_value(second_target, i) for i in levels
        ]
        sum_second_target = sum(second_target_contributions)

        sorted_levels = sorted(zip(second_target_contributions, levels),
                               reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]
        second_target_top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        second_target_top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        second_target_bottom_dim = sorted_levels[-1][1]
        second_target_bottom_dim_contribution = sorted_levels[-1][0]

        second_target_percentages = [
            old_div(i * 100.0, sum_second_target)
            for i in second_target_contributions
        ]
        best_second_target_index = second_target_contributions.index(
            max(second_target_contributions))
        worst_second_target_index = second_target_contributions.index(
            min(second_target_contributions))
        second_target_differences = [
            x - y
            for x, y in zip(levels_percentages, second_target_percentages)
        ]
        if len(second_target_differences) > 6:
            tops = 2
            bottoms = -2
        elif len(second_target_differences) > 4:
            tops = 2
            bottoms = -1
        else:
            tops = 1
            bottoms = -1
        sorted_ = sorted(enumerate(second_target_differences),
                         key=lambda x: x[1],
                         reverse=True)
        best_second_difference_indices = [x for x, y in sorted_[:tops]]
        worst_second_difference_indices = [x for x, y in sorted_[bottoms:]]

        second_target_shares = [
            old_div(x * 100.0, y)
            for x, y in zip(second_target_contributions, level_counts)
        ]
        max_second_target_shares = max(second_target_shares)
        best_second_target_share_index = [
            idx for idx, val in enumerate(second_target_shares)
            if val == max_second_target_shares
        ]
        level_counts_threshold = old_div(
            sum(level_counts) * 0.05, len(level_counts))
        if min(second_target_shares) == 0:
            min_second_target_shares = min([
                x for x, y in zip(second_target_shares, level_counts) if x != 0
            ])
        else:
            min_second_target_shares = min([
                x for x, y in zip(second_target_shares, level_counts)
                if y >= level_counts_threshold
            ])
        # worst_second_target_share_index = second_target_shares.index(min_second_target_shares)
        if max_second_target_shares == min_second_target_shares:
            worst_second_target_share_index = []
        else:
            worst_second_target_share_index = [
                idx for idx, val in enumerate(second_target_shares)
                if val == min_second_target_shares
            ]
        overall_second_percentage = old_div(sum_second_target * 100.0, total)

        targetCardDataDict = {}
        targetCardDataDict['target'] = target_dimension
        targetCardDataDict['colname'] = analysed_dimension
        targetCardDataDict['num_significant'] = len(significant_variables)
        targetCardDataDict['plural_colname'] = NarrativesUtils.pluralize(
            analysed_dimension)

        targetCardDataDict["blockSplitter"] = self._blockSplitter
        targetCardDataDict["binTargetCol"] = self._binTargetCol
        targetCardDataDict["binAnalyzedCol"] = self._binAnalyzedCol
        targetCardDataDict['highlightFlag'] = self._highlightFlag
        targetCardDataDict['levels'] = levels

        data_dict = {}
        data_dict[
            'best_second_difference'] = best_second_difference_indices  ##these changed
        data_dict['worst_second_difference'] = worst_second_difference_indices
        data_dict['best_top_difference'] = best_top_difference_indices
        data_dict['worst_top_difference'] = worst_top_difference_indices
        data_dict['levels_percentages'] = levels_percentages
        data_dict['top_target_percentages'] = top_target_percentages
        data_dict['second_target_percentages'] = second_target_percentages
        data_dict['levels'] = levels
        data_dict['best_top_share'] = best_top_target_share_index
        data_dict['worst_top_share'] = worst_top_target_share_index
        data_dict['best_second_share'] = best_second_target_share_index
        data_dict['worst_second_share'] = worst_second_target_share_index
        data_dict['top_target_shares'] = top_target_shares
        data_dict['second_target_shares'] = second_target_shares
        data_dict['overall_second'] = overall_second_percentage
        data_dict['overall_top'] = overall_top_percentage

        data_dict['num_significant'] = len(significant_variables)
        data_dict['colname'] = analysed_dimension
        data_dict['plural_colname'] = NarrativesUtils.pluralize(
            analysed_dimension)
        data_dict['target'] = target_dimension
        data_dict['top_levels'] = top_dims
        data_dict['top_levels_percent'] = round(
            old_div(top_dims_contribution * 100.0, total), 1)
        data_dict['bottom_level'] = bottom_dim
        data_dict['bottom_levels'] = bottom_dims
        data_dict['bottom_level_percent'] = round(
            old_div(bottom_dim_contribution * 100, sum(level_counts)), 2)
        data_dict['second_target'] = second_target
        data_dict['second_target_top_dims'] = second_target_top_dims
        data_dict['second_target_top_dims_contribution'] = old_div(
            second_target_top_dims_contribution * 100.0,
            sum(second_target_contributions))
        data_dict['second_target_bottom_dim'] = second_target_bottom_dim
        data_dict[
            'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution
        data_dict['best_second_target'] = levels[best_second_target_index]
        data_dict['best_second_target_count'] = second_target_contributions[
            best_second_target_index]
        data_dict['best_second_target_percent'] = round(
            old_div(
                second_target_contributions[best_second_target_index] * 100.0,
                sum(second_target_contributions)), 2)
        data_dict['worst_second_target'] = levels[worst_second_target_index]
        data_dict['worst_second_target_percent'] = round(
            old_div(
                second_target_contributions[worst_second_target_index] * 100.0,
                sum(second_target_contributions)), 2)

        data_dict['top_target'] = top_target
        data_dict['top_target_top_dims'] = top_target_top_dims
        data_dict['top_target_top_dims_contribution'] = old_div(
            top_target_top_dims_contribution * 100.0,
            sum(top_target_contributions))
        data_dict['top_target_bottom_dim'] = top_target_bottom_dim
        data_dict[
            'top_target_bottom_dim_contribution'] = top_target_bottom_dim_contribution
        data_dict['best_top_target'] = levels[best_top_target_index]
        data_dict['best_top_target_count'] = top_target_contributions[
            best_top_target_index]
        data_dict['best_top_target_percent'] = round(
            old_div(top_target_contributions[best_top_target_index] * 100.0,
                    sum(top_target_contributions)), 2)
        data_dict['worst_top_target'] = levels[worst_top_target_index]
        data_dict['worst_top_target_percent'] = round(
            old_div(top_target_contributions[worst_top_target_index] * 100.0,
                    sum(top_target_contributions)), 2)

        data_dict["blockSplitter"] = self._blockSplitter
        data_dict["binTargetCol"] = self._binTargetCol
        data_dict["binAnalyzedCol"] = self._binAnalyzedCol
        data_dict['highlightFlag'] = self._highlightFlag

        # print "_"*60
        # print "DATA DICT - ", data_dict
        # print "_"*60

        ###############
        #     CARD1   #
        ###############

        print("self._binTargetCol & self._binAnalyzedCol : ",
              self._binTargetCol, self._binAnalyzedCol)
        if len(data_dict['worst_second_share']) == 0:
            output = NarrativesUtils.block_splitter(
                NarrativesUtils.get_template_output(
                    self._base_dir, 'card1_binned_target_worst_second.html',
                    data_dict),
                self._blockSplitter,
                highlightFlag=self._highlightFlag)
        else:
            if (self._binTargetCol == True & self._binAnalyzedCol == False):
                print("Only Target Column is Binned, : ", self._binTargetCol)
                output = NarrativesUtils.block_splitter(
                    NarrativesUtils.get_template_output(
                        self._base_dir, 'card1_binned_target.html', data_dict),
                    self._blockSplitter,
                    highlightFlag=self._highlightFlag)
            elif (self._binTargetCol == True & self._binAnalyzedCol == True):
                print("Target Column and IV is Binned : ", self._binTargetCol,
                      self._binAnalyzedCol)
                output = NarrativesUtils.block_splitter(
                    NarrativesUtils.get_template_output(
                        self._base_dir, 'card1_binned_target_and_IV.html',
                        data_dict),
                    self._blockSplitter,
                    highlightFlag=self._highlightFlag)
            else:
                output = NarrativesUtils.block_splitter(
                    NarrativesUtils.get_template_output(
                        self._base_dir, 'card1.html', data_dict),
                    self._blockSplitter,
                    highlightFlag=self._highlightFlag)

        targetDimCard1Data = []
        targetDimcard1Heading = '<h3>Impact of ' + self._analysed_dimension + '  on ' + self._target_dimension + "</h3>"

        toggledata = ToggleData()

        targetDimTable1Data = self.generate_card1_table1()
        targetDimCard1Table1 = TableData()
        targetDimCard1Table1.set_table_type("heatMap")
        targetDimCard1Table1.set_table_data(targetDimTable1Data)
        toggledata.set_toggleon_data({
            "data": {
                "tableData": targetDimTable1Data,
                "tableType": "heatMap"
            },
            "dataType": "table"
        })

        targetDimTable2Data = self.generate_card1_table2()
        targetDimCard1Table2 = TableData()
        targetDimCard1Table2.set_table_type("normal")
        table2Data = targetDimTable2Data["data1"]
        table2Data = [
            innerList[1:] for innerList in table2Data
            if innerList[0].strip() != ""
        ]
        targetDimCard1Table2.set_table_data(table2Data)

        toggledata.set_toggleoff_data({
            "data": {
                "tableData": table2Data,
                "tableType": "heatMap"
            },
            "dataType": "table"
        })

        targetDimCard1Data.append(HtmlData(data=targetDimcard1Heading))
        targetDimCard1Data.append(toggledata)
        targetDimCard1Data += output

        self._card1.set_card_data(targetDimCard1Data)
        self._card1.set_card_name("{}: Relationship with {}".format(
            self._analysed_dimension, self._target_dimension))

        ###############
        #     CARD2   #
        ###############

        if self._appid == None:

            key_factors = ''
            num_key_factors = len(self._second_level_dimensions)

            if len(self._second_level_dimensions) == 5:
                key_factors = ', '.join(
                    self._second_level_dimensions[:4]
                ) + ' and ' + self._second_level_dimensions[4]
            elif len(self._second_level_dimensions) == 4:
                key_factors = ', '.join(
                    self._second_level_dimensions[:3]
                ) + ' and ' + self._second_level_dimensions[3]
            elif len(self._second_level_dimensions) == 3:
                key_factors = ', '.join(
                    self._second_level_dimensions[:2]
                ) + ' and ' + self._second_level_dimensions[2]
            elif len(self._second_level_dimensions) == 2:
                key_factors = ' and '.join(self._second_level_dimensions)
            elif len(self._second_level_dimensions) == 1:
                key_factors = self._second_level_dimensions[0]

            targetCardDataDict['num_key_factors'] = num_key_factors
            targetCardDataDict['key_factors'] = key_factors
            dict_for_test = {}
            for tupleObj in sorted_target_levels[:self._chiSquareLevelLimit]:
                targetLevel = tupleObj[1]

                targetCardDataDict['random_card2'] = random.randint(1, 100)
                targetCardDataDict['random_card4'] = random.randint(1, 100)

                second_target_contributions = [
                    table.get_value(targetLevel, i) for i in levels
                ]
                sum_second_target = sum(second_target_contributions)

                sorted_levels = sorted(zip(second_target_contributions,
                                           levels),
                                       reverse=True)

                level_differences = [0.0] + [
                    sorted_levels[i][0] - sorted_levels[i + 1][0]
                    for i in range(len(sorted_levels) - 1)
                ]
                level_diff_index = level_differences.index(
                    max(level_differences)) if level_differences.index(
                        max(level_differences)) > 0 else len(
                            level_differences
                        )  ##added for pipeline keyerror issue
                second_target_top_dims = [
                    j for i, j in sorted_levels[:level_diff_index]
                ]
                second_target_top_dims_contribution = sum([
                    i for i, j in sorted_levels[:level_differences.
                                                index(max(level_differences))]
                ])
                second_target_bottom_dim = sorted_levels[-1][1]
                second_target_bottom_dim_contribution = sorted_levels[-1][0]

                second_target_percentages = [
                    old_div(i * 100.0, sum_second_target)
                    for i in second_target_contributions
                ]
                best_second_target_index = second_target_contributions.index(
                    max(second_target_contributions))
                worst_second_target_index = second_target_contributions.index(
                    min(second_target_contributions))
                second_target_differences = [
                    x - y for x, y in zip(levels_percentages,
                                          second_target_percentages)
                ]
                if len(second_target_differences) > 6:
                    tops = 2
                    bottoms = -2
                elif len(second_target_differences) > 4:
                    tops = 2
                    bottoms = -1
                else:
                    tops = 1
                    bottoms = -1
                sorted_ = sorted(enumerate(second_target_differences),
                                 key=lambda x: x[1],
                                 reverse=True)
                best_second_difference_indices = [x for x, y in sorted_[:tops]]
                worst_second_difference_indices = [
                    x for x, y in sorted_[bottoms:]
                ]

                second_target_shares = [
                    old_div(x * 100.0, y)
                    for x, y in zip(second_target_contributions, level_counts)
                ]
                max_second_target_shares = max(second_target_shares)
                best_second_target_share_index = [
                    idx for idx, val in enumerate(second_target_shares)
                    if val == max_second_target_shares
                ]
                level_counts_threshold = old_div(
                    sum(level_counts) * 0.05, len(level_counts))
                min_second_target_shares = min([
                    x for x, y in zip(second_target_shares, level_counts)
                    if y >= level_counts_threshold
                ])
                worst_second_target_share_index = [
                    idx for idx, val in enumerate(second_target_shares)
                    if val == min_second_target_shares
                ]
                overall_second_percentage = old_div(sum_second_target * 100.0,
                                                    total)

                # DataFrame for contribution calculation
                if self._pandas_flag:
                    df_second_target = self._data_frame[(
                        self._data_frame[self._target_dimension] == targetLevel
                    ) & (self._data_frame[self._analysed_dimension] ==
                         second_target_top_dims[0])][
                             self._second_level_dimensions]
                    df_second_dim = self._data_frame[(
                        self._data_frame[self._analysed_dimension] ==
                        second_target_top_dims[0]
                    )][self._second_level_dimensions]
                else:
                    df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                                            filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                                            select(self._second_level_dimensions).toPandas()
                    df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                                        select(self._second_level_dimensions).toPandas()

                # if self._chisquare_result.get_splits():
                #     splits = self._chisquare_result.get_splits()
                #     idx = self._chiSquareTable.get_bin_names(splits).index(second_target_top_dims[0])
                #     idx1 = self._chiSquareTable.get_bin_names(splits).index(top_target_top_dims[0])
                #     splits[len(splits)-1] = splits[len(splits)-1]+1
                #     df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                #                         filter(col(self._analysed_dimension)>=splits[idx]).filter(col(self._analysed_dimension)<splits[idx+1]).\
                #                         select(self._second_level_dimensions).toPandas()
                #     df_second_dim = self._data_frame.filter(col(self._analysed_dimension)>=splits[idx]).\
                #                     filter(col(self._analysed_dimension)<splits[idx+1]).\
                #                     select(self._second_level_dimensions).toPandas()
                # else:
                #     df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                #                         filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                #                         select(self._second_level_dimensions).toPandas()
                #     df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                #                     select(self._second_level_dimensions).toPandas()

                # print self._data_frame.select('Sales').show()

                distribution_second = []
                d_l = []
                for d in self._second_level_dimensions:
                    grouped = df_second_target.groupby(d).agg({d: 'count'})
                    contributions = df_second_dim.groupby(d).agg({d: 'count'})
                    contribution_index = list(contributions.index)
                    contributions_val = contributions[d].tolist()
                    contributions_list = dict(
                        list(zip(contribution_index, contributions_val)))
                    index_list = list(grouped.index)
                    grouped_list = grouped[d].tolist()
                    contributions_percent_list = [
                        round(old_div(y * 100.0, contributions_list[x]), 2)
                        for x, y in zip(index_list, grouped_list)
                    ]
                    sum_ = grouped[d].sum()
                    diffs = [0] + [
                        grouped_list[i] - grouped_list[i + 1]
                        for i in range(len(grouped_list) - 1)
                    ]
                    max_diff = diffs.index(max(diffs))
                    grouped_dict = dict(list(zip(index_list, grouped_list)))

                    for val in contribution_index:
                        if val not in list(grouped_dict.keys()):
                            grouped_dict[val] = 0
                        else:
                            pass

                    index_list = []
                    grouped_list = []
                    contributions_val = []

                    for key in list(grouped_dict.keys()):
                        index_list.append(str(key))
                        grouped_list.append(grouped_dict[key])
                        contributions_val.append(contributions_list[key])
                    '''
                    print "="*70
                    print "GROUPED - ", grouped
                    print "INDEX LIST - ", index_list
                    print "GROUPED LIST - ", grouped_list
                    print "GROUPED DICT - ", grouped_dict
                    print "CONTRIBUTIONS - ", contributions
                    print "CONTRIBUTION INDEX - ", contribution_index
                    print "CONTRIBUTIONS VAL - ", contributions_val
                    print "CONTRIBUTIONS LIST - ", contributions_list
                    print "CONTRIBUTIONS PERCENT LIST - ", contributions_percent_list
                    print "SUM - ", sum_
                    print "DIFFS - ", diffs
                    print "MAX DIFF - ", max_diff
                    print "="*70
                    '''

                    informative_dict = {
                        "levels": index_list,
                        "positive_class_contribution": grouped_list,
                        "positive_plus_others": contributions_val
                    }

                    informative_df = pd.DataFrame(informative_dict)
                    informative_df["percentage_horizontal"] = old_div(
                        informative_df["positive_class_contribution"] * 100,
                        informative_df["positive_plus_others"])
                    informative_df["percentage_vertical"] = old_div(
                        informative_df["positive_class_contribution"] * 100,
                        sum_)
                    informative_df.sort_values(["percentage_vertical"],
                                               inplace=True,
                                               ascending=False)
                    informative_df = informative_df.reset_index(drop=True)

                    percentage_vertical_sorted = list(
                        informative_df["percentage_vertical"])
                    percentage_horizontal_sorted = list(
                        informative_df["percentage_horizontal"])
                    levels_sorted = list(informative_df["levels"])

                    differences_list = []
                    for i in range(1, len(percentage_vertical_sorted)):
                        difference = percentage_vertical_sorted[
                            i - 1] - percentage_vertical_sorted[i]
                        differences_list.append(round(difference, 2))
                    '''
                    print "-"*70
                    print "DIFFERENCES LIST - ", differences_list
                    print "-"*70
                    '''

                    index_txt = ''
                    if differences_list:
                        if differences_list[0] >= 30:
                            print("showing 1st case")
                            index_txt = levels_sorted[0]
                            max_diff_equivalent = 1
                        else:
                            if len(differences_list) >= 2:
                                if differences_list[1] >= 10:
                                    print("showing 1st and 2nd case")
                                    index_txt = levels_sorted[0] + '(' + str(
                                        round(percentage_vertical_sorted[0], 1)
                                    ) + '%)' + ' and ' + levels_sorted[
                                        1] + '(' + str(
                                            round(
                                                percentage_vertical_sorted[1],
                                                1)) + '%)'
                                    max_diff_equivalent = 2
                                else:
                                    print("showing 3rd case")
                                    index_txt = 'including ' + levels_sorted[
                                        0] + '(' + str(
                                            round(
                                                percentage_vertical_sorted[0],
                                                1)
                                        ) + '%)' + ' and ' + levels_sorted[
                                            1] + '(' + str(
                                                round(
                                                    percentage_vertical_sorted[
                                                        1], 1)) + '%)'
                                    max_diff_equivalent = 3
                            else:
                                print("showing 3rd case")
                                index_txt = 'including ' + levels_sorted[
                                    0] + '(' + str(
                                        round(percentage_vertical_sorted[0], 1)
                                    ) + '%)' + ' and ' + levels_sorted[
                                        1] + '(' + str(
                                            round(
                                                percentage_vertical_sorted[1],
                                                1)) + '%)'
                                max_diff_equivalent = 3

                    else:
                        max_diff_equivalent = 0
                    '''
                    print "-"*70
                    print informative_df.head(25)
                    print "-"*70
                    '''

                    distribution_second.append({
                        'contributions': [
                            round(i, 2) for i in
                            percentage_vertical_sorted[:max_diff_equivalent]
                        ],
                        'levels':
                        levels_sorted[:max_diff_equivalent],
                        'variation':
                        random.randint(1, 100),
                        'index_txt':
                        index_txt,
                        'd':
                        d,
                        'contributions_percent':
                        percentage_horizontal_sorted
                    })
                '''
                  print "DISTRIBUTION SECOND - ", distribution_second
                  print "<>"*50
                  '''
                targetCardDataDict['distribution_second'] = distribution_second
                targetCardDataDict['second_target'] = targetLevel
                targetCardDataDict[
                    'second_target_top_dims'] = second_target_top_dims
                targetCardDataDict[
                    'second_target_top_dims_contribution'] = old_div(
                        second_target_top_dims_contribution * 100.0,
                        sum(second_target_contributions))
                targetCardDataDict[
                    'second_target_bottom_dim'] = second_target_bottom_dim
                targetCardDataDict[
                    'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution
                targetCardDataDict['best_second_target'] = levels[
                    best_second_target_index]
                targetCardDataDict[
                    'best_second_target_count'] = second_target_contributions[
                        best_second_target_index]
                targetCardDataDict['best_second_target_percent'] = round(
                    old_div(
                        second_target_contributions[best_second_target_index] *
                        100.0, sum(second_target_contributions)), 2)
                targetCardDataDict['worst_second_target'] = levels[
                    worst_second_target_index]
                targetCardDataDict['worst_second_target_percent'] = round(
                    old_div(
                        second_target_contributions[worst_second_target_index]
                        * 100.0, sum(second_target_contributions)), 2)

                card2Data = []
                targetLevelContributions = [
                    table.get_value(targetLevel, i) for i in levels
                ]
                impact_target_thershold = old_div(
                    sum(targetLevelContributions) * 0.02,
                    len(targetLevelContributions))
                card2Heading = '<h3>Key Drivers of ' + self._target_dimension + ' (' + targetLevel + ')' + "</h3>"
                chart, bubble = self.generate_distribution_card_chart(
                    targetLevel, targetLevelContributions, levels,
                    level_counts, total, impact_target_thershold)
                card2ChartData = NormalChartData(data=chart["data"])
                "rounding the chartdata values for key drivers tab inside table percentage(table data)"
                for d in card2ChartData.get_data():
                    d['percentage'] = round(d['percentage'], 2)
                    d_l.append(d)
                card2ChartJson = ChartJson()
                card2ChartJson.set_data(d_l)
                card2ChartJson.set_chart_type("combination")
                card2ChartJson.set_types({
                    "total": "bar",
                    "percentage": "line"
                })
                card2ChartJson.set_legend({
                    "total": "# of " + targetLevel,
                    "percentage": "% of " + targetLevel
                })
                card2ChartJson.set_axes({
                    "x": "key",
                    "y": "total",
                    "y2": "percentage"
                })
                card2ChartJson.set_label_text({
                    "x": " ",
                    "y": "Count",
                    "y2": "Percentage"
                })
                print("self._binTargetCol & self._binAnalyzedCol : ",
                      self._binTargetCol, self._binAnalyzedCol)
                if (self._binTargetCol == True & self._binAnalyzedCol ==
                        False):
                    print("Only Target Column is Binned")
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2_binned_target.html',
                            targetCardDataDict), self._blockSplitter)
                elif (self._binTargetCol == True & self._binAnalyzedCol ==
                      True):
                    print("Target Column and IV is Binned")
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2_binned_target_and_IV.html',
                            targetCardDataDict), self._blockSplitter)
                else:
                    print("In Else, self._binTargetCol should be False : ",
                          self._binTargetCol)
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2.html', targetCardDataDict),
                        self._blockSplitter)

                card2Data.append(HtmlData(data=card2Heading))
                statistical_info_array = [
                    ("Test Type", "Chi-Square"),
                    ("Chi-Square statistic",
                     str(round(self._chisquare_result.get_stat(), 3))),
                    ("P-Value",
                     str(round(self._chisquare_result.get_pvalue(), 3))),
                    ("Inference",
                     "Chi-squared analysis shows a significant association between {} (target) and {}."
                     .format(self._target_dimension, self._analysed_dimension))
                ]
                statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                    statistical_info_array)

                card2Data.append(
                    C3ChartData(data=card2ChartJson,
                                info=statistical_info_array))
                card2Data += output2
                card2BubbleData = "<div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div><div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div>".format(
                    bubble[0]["value"], bubble[0]["text"], bubble[1]["value"],
                    bubble[1]["text"])
                card2Data.append(HtmlData(data=card2BubbleData))
                targetCard = NormalCard()
                targetCard.set_card_data(card2Data)
                targetCard.set_card_name("{} : Distribution of {}".format(
                    self._analysed_dimension, targetLevel))
                self._targetCards.append(targetCard)
                dict_for_test[targetLevel] = targetCardDataDict
        out = {'data_dict': data_dict, 'target_dict': dict_for_test}

        return out

    # def generate_card2_narratives(self):

    def generate_distribution_card_chart(self, __target,
                                         __target_contributions, levels,
                                         levels_count, total, thershold):
        """Build the chart rows and the two headline bubbles for one target level.

        Args:
            __target: Name of the target level; it is concatenated into the
                legend labels and the bubble texts, so it must be a string.
            __target_contributions: Count of ``__target`` for each entry of
                ``levels``. Must support ``.index`` (e.g. a list).
            levels: Level names, aligned with ``__target_contributions``.
            levels_count: Per-level totals, aligned with ``levels``; used as
                the denominator of each level's percentage.
            total: Not used by this method; kept so the signature stays
                unchanged for existing callers.
            thershold: A level is only considered for the "highest rate"
                bubble when its contribution is strictly greater than this.

        Returns:
            tuple: ``(chart_data, bubble_data)`` where

            * ``chart_data`` is ``{'label': {...}, 'data': [...]}`` and each
              data row is ``{"key": level, "total": count,
              "percentage": count * 100.0 / level_total}``;
            * ``bubble_data`` is a two-element list of dicts with the keys
              ``'value'`` and ``'text'``. The first describes the level
              with the largest contribution (its share of all
              contributions, one decimal). The second describes the level
              with the highest percentage among those above ``thershold``;
              both of its fields are empty strings when no level qualifies.

        Raises:
            ValueError: If ``__target_contributions`` is empty.
            ZeroDivisionError: With plain Python numbers, if an entry of
                ``levels_count`` or the sum of the contributions is zero.
        """
        label = {'total': '# of ' + __target, 'percentage': '% of ' + __target}

        # The numerator is always a float (``x * 100.0``), so true division
        # yields exactly what ``old_div`` would return for these operands.
        rates = [
            x * 100.0 / y
            for x, y in zip(__target_contributions, levels_count)
        ]
        chart_rows = [
            {"key": level, "total": count, "percentage": rate}
            for level, count, rate in zip(levels, __target_contributions,
                                          rates)
        ]
        chart_data = {'label': label, 'data': chart_rows}

        # Bubble 1: share of the single largest contributor.
        top_count = max(__target_contributions)
        top_share = round(top_count * 100.0 / sum(__target_contributions), 1)
        top_index = __target_contributions.index(top_count)
        share_bubble = {
            'value': str(top_share) + '%',
            'text': 'Overall ' + __target + ' comes from ' + levels[top_index],
        }

        # Bubble 2: highest rate among levels whose contribution clears the
        # threshold. The strict comparison keeps the first level on ties,
        # and the -1 starting point matches the original sentinel.
        best_rate = -1
        best_index = None
        for idx, (count, rate) in enumerate(
                zip(__target_contributions, rates)):
            if count > thershold and rate > best_rate:
                best_rate = rate
                best_index = idx

        if best_index is None:
            # No level qualified: keep both keys present so callers that
            # index ['value'] / ['text'] do not fail with KeyError.
            rate_bubble = {'value': '', 'text': ''}
        else:
            rate_bubble = {
                'value': str(round(best_rate)) + '%',
                'text': levels[best_index] + ' has the highest rate of ' +
                __target,
            }

        return chart_data, [share_bubble, rate_bubble]

    def generate_card1_table1(self):
        """Assemble the row-wise percentage table shown on card 1.

        The first row is the header: the analysed dimension name followed
        by the values returned by ``get_column_one_levels()``. Each later
        row is one entry of ``column_two_values`` followed by the entry at
        the same position in every list of ``table_percent_by_column``.

        Returns:
            list[list]: The header row followed by one list per data row.
        """
        chi_table = self._chiSquareTable
        percent_columns = chi_table.table_percent_by_column
        row_labels = chi_table.column_two_values
        heading = [self._analysed_dimension
                   ] + chi_table.get_column_one_levels()

        # Put the labels in front, then transpose so that every label ends
        # up next to its own percentages.
        stacked = [row_labels] + percent_columns
        rows = [heading]
        for zipped in zip(*stacked):
            rows.append(list(zipped))
        return rows

    def generate_card1_table2(self):
        """Build the detailed contingency table: counts plus three percentage views.

        For every level returned by ``get_column_two_levels()`` four rows
        are produced, in this order:

        1. raw counts, tagged ``'bold'``, with their sum as the last cell;
        2. ``'As % within <analysed dimension>'`` from
           ``table_percent_by_column``, with ``100.0`` as the last cell;
        3. ``'As % within <target dimension>'`` from
           ``table_percent_by_row``;
        4. ``'As % of Total'`` from ``table_percent``.

        Rows 3 and 4 both end with the rounded sum of the level's
        ``table_percent`` values.

        Returns:
            dict: with the keys

            * ``'header'``  - the hard-coded legacy header (see note below);
            * ``'header1'`` - analysed dimension, target levels, ``'Total'``;
            * ``'data'``    - every row as a dict keyed by
              ``['Tag'] + header``;
            * ``'label'``   - the analysed dimension name;
            * ``'data1'``   - every row as a list, preceded by
              ``['Tag'] + header1``.

        Raises:
            IndexError: If there are more levels than table columns.
        """
        chi_table = self._chiSquareTable
        target_levels = chi_table.get_column_one_levels()
        dim_levels = chi_table.get_column_two_levels()

        # Transpose each table once instead of once per level inside the
        # loop; entry ``idx`` then holds the values of level ``idx`` across
        # all target levels.
        counts_by_level = list(zip(*chi_table.table))
        pct_of_total_by_level = list(zip(*chi_table.table_percent))
        pct_by_row_by_level = list(zip(*chi_table.table_percent_by_row))
        pct_by_column_by_level = list(zip(*chi_table.table_percent_by_column))

        header1 = [self._analysed_dimension] + target_levels + ['Total']
        # NOTE(review): this header is hard-coded, so the dicts in 'data'
        # are keyed by 'State'/'Active'/'Churn' whatever the real names
        # are, and zip() silently drops cells when there are more than two
        # target levels. Left unchanged because the consumers of 'header'
        # and 'data' are not visible from this method.
        header = ['State', 'Active', 'Churn', 'Total']  #TODO remove
        first_row = ['Tag'] + header
        data = []
        data1 = [['Tag'] + header1]

        def _add_row(tag, row_label, values, last_cell):
            # Record one row in both output shapes (dict and plain list).
            row = [tag] + [row_label] + list(values) + [last_cell]
            data.append(dict(zip(first_row, row)))
            data1.append(row)

        for idx, lvl in enumerate(dim_levels):
            counts = counts_by_level[idx]
            _add_row('bold', lvl, counts, sum(counts))

            _add_row('', 'As % within ' + self._analysed_dimension,
                     pct_by_column_by_level[idx], 100.0)

            pct_of_total = pct_of_total_by_level[idx]
            pct_total_sum = round(sum(pct_of_total), 2)
            _add_row('', 'As % within ' + self._target_dimension,
                     pct_by_row_by_level[idx], pct_total_sum)

            _add_row('', 'As % of Total', pct_of_total, pct_total_sum)

        out = {
            'header': header,
            'header1': header1,
            'data': data,
            'label': self._analysed_dimension,
            'data1': data1
        }
        return out
Ejemplo n.º 9
0
    def _generate_narratives(self):
        chisquare_result = self._chisquare_result
        target_dimension = self._target_dimension
        analysed_dimension = self._analysed_dimension
        significant_variables = self._significant_variables
        num_analysed_variables = self._num_analysed_variables
        table = self._chiSquareTable
        total = self._chiSquareTable.get_total()

        levels = self._chiSquareTable.get_column_two_levels()
        level_counts = self._chiSquareTable.get_column_total()
        levels_count_sum = sum(level_counts)
        levels_percentages = [
            old_div(i * 100.0, levels_count_sum) for i in level_counts
        ]
        sorted_levels = sorted(zip(level_counts, levels), reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]

        top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        bottom_dim = sorted_levels[-1][1]
        bottom_dim_contribution = sorted_levels[-1][0]
        bottom_dims = [
            y for x, y in sorted_levels if x == bottom_dim_contribution
        ]

        target_levels = self._chiSquareTable.get_column_one_levels()

        target_counts = self._chiSquareTable.get_row_total()
        sorted_target_levels = sorted(zip(target_counts, target_levels),
                                      reverse=True)

        top_target_count, top_target = sorted_target_levels[0]
        second_target_count, second_target = sorted_target_levels[1]

        top_target_contributions = [
            table.get_value(top_target, i) for i in levels
        ]
        sum_top_target = sum(top_target_contributions)

        sorted_levels = sorted(zip(top_target_contributions, levels),
                               reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]
        top_target_top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        top_target_top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        top_target_bottom_dim = sorted_levels[-1][1]
        top_target_bottom_dim_contribution = sorted_levels[-1][0]

        top_target_percentages = [
            old_div(i * 100.0, sum_top_target)
            for i in top_target_contributions
        ]
        best_top_target_index = top_target_contributions.index(
            max(top_target_contributions))
        worst_top_target_index = top_target_contributions.index(
            min(top_target_contributions))
        top_target_differences = [
            x - y for x, y in zip(levels_percentages, top_target_percentages)
        ]
        if len(top_target_differences) > 6:
            tops = 2
            bottoms = -2
        elif len(top_target_differences) > 4:
            tops = 2
            bottoms = -1
        else:
            tops = 1
            bottoms = -1
        sorted_ = sorted(enumerate(top_target_differences),
                         key=lambda x: x[1],
                         reverse=True)
        best_top_difference_indices = [x for x, y in sorted_[:tops]]
        worst_top_difference_indices = [x for x, y in sorted_[bottoms:]]

        top_target_shares = [
            old_div(x * 100.0, y)
            for x, y in zip(top_target_contributions, level_counts)
        ]
        max_top_target_shares = max(top_target_shares)
        best_top_target_share_index = [
            idx for idx, val in enumerate(top_target_shares)
            if val == max_top_target_shares
        ]
        level_counts_threshold = old_div(
            sum(level_counts) * 0.05, len(level_counts))
        min_top_target_shares = min([
            x for x, y in zip(top_target_shares, level_counts)
            if y >= level_counts_threshold
        ])
        if max_top_target_shares == min_top_target_shares:
            worst_top_target_share_index = []
        else:
            worst_top_target_share_index = [
                idx for idx, val in enumerate(top_target_shares)
                if val == min_top_target_shares
            ]
        overall_top_percentage = old_div(sum_top_target * 100.0, total)

        second_target_contributions = [
            table.get_value(second_target, i) for i in levels
        ]
        sum_second_target = sum(second_target_contributions)

        sorted_levels = sorted(zip(second_target_contributions, levels),
                               reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]
        second_target_top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        second_target_top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        second_target_bottom_dim = sorted_levels[-1][1]
        second_target_bottom_dim_contribution = sorted_levels[-1][0]

        second_target_percentages = [
            old_div(i * 100.0, sum_second_target)
            for i in second_target_contributions
        ]
        best_second_target_index = second_target_contributions.index(
            max(second_target_contributions))
        worst_second_target_index = second_target_contributions.index(
            min(second_target_contributions))
        second_target_differences = [
            x - y
            for x, y in zip(levels_percentages, second_target_percentages)
        ]
        if len(second_target_differences) > 6:
            tops = 2
            bottoms = -2
        elif len(second_target_differences) > 4:
            tops = 2
            bottoms = -1
        else:
            tops = 1
            bottoms = -1
        sorted_ = sorted(enumerate(second_target_differences),
                         key=lambda x: x[1],
                         reverse=True)
        best_second_difference_indices = [x for x, y in sorted_[:tops]]
        worst_second_difference_indices = [x for x, y in sorted_[bottoms:]]

        second_target_shares = [
            old_div(x * 100.0, y)
            for x, y in zip(second_target_contributions, level_counts)
        ]
        max_second_target_shares = max(second_target_shares)
        best_second_target_share_index = [
            idx for idx, val in enumerate(second_target_shares)
            if val == max_second_target_shares
        ]
        level_counts_threshold = old_div(
            sum(level_counts) * 0.05, len(level_counts))
        if min(second_target_shares) == 0:
            min_second_target_shares = min([
                x for x, y in zip(second_target_shares, level_counts) if x != 0
            ])
        else:
            min_second_target_shares = min([
                x for x, y in zip(second_target_shares, level_counts)
                if y >= level_counts_threshold
            ])
        # worst_second_target_share_index = second_target_shares.index(min_second_target_shares)
        if max_second_target_shares == min_second_target_shares:
            worst_second_target_share_index = []
        else:
            worst_second_target_share_index = [
                idx for idx, val in enumerate(second_target_shares)
                if val == min_second_target_shares
            ]
        overall_second_percentage = old_div(sum_second_target * 100.0, total)

        targetCardDataDict = {}
        targetCardDataDict['target'] = target_dimension
        targetCardDataDict['colname'] = analysed_dimension
        targetCardDataDict['num_significant'] = len(significant_variables)
        targetCardDataDict['plural_colname'] = NarrativesUtils.pluralize(
            analysed_dimension)

        targetCardDataDict["blockSplitter"] = self._blockSplitter
        targetCardDataDict["binTargetCol"] = self._binTargetCol
        targetCardDataDict["binAnalyzedCol"] = self._binAnalyzedCol
        targetCardDataDict['highlightFlag'] = self._highlightFlag
        targetCardDataDict['levels'] = levels

        data_dict = {}
        data_dict[
            'best_second_difference'] = best_second_difference_indices  ##these changed
        data_dict['worst_second_difference'] = worst_second_difference_indices
        data_dict['best_top_difference'] = best_top_difference_indices
        data_dict['worst_top_difference'] = worst_top_difference_indices
        data_dict['levels_percentages'] = levels_percentages
        data_dict['top_target_percentages'] = top_target_percentages
        data_dict['second_target_percentages'] = second_target_percentages
        data_dict['levels'] = levels
        data_dict['best_top_share'] = best_top_target_share_index
        data_dict['worst_top_share'] = worst_top_target_share_index
        data_dict['best_second_share'] = best_second_target_share_index
        data_dict['worst_second_share'] = worst_second_target_share_index
        data_dict['top_target_shares'] = top_target_shares
        data_dict['second_target_shares'] = second_target_shares
        data_dict['overall_second'] = overall_second_percentage
        data_dict['overall_top'] = overall_top_percentage

        data_dict['num_significant'] = len(significant_variables)
        data_dict['colname'] = analysed_dimension
        data_dict['plural_colname'] = NarrativesUtils.pluralize(
            analysed_dimension)
        data_dict['target'] = target_dimension
        data_dict['top_levels'] = top_dims
        data_dict['top_levels_percent'] = round(
            old_div(top_dims_contribution * 100.0, total), 1)
        data_dict['bottom_level'] = bottom_dim
        data_dict['bottom_levels'] = bottom_dims
        data_dict['bottom_level_percent'] = round(
            old_div(bottom_dim_contribution * 100, sum(level_counts)), 2)
        data_dict['second_target'] = second_target
        data_dict['second_target_top_dims'] = second_target_top_dims
        data_dict['second_target_top_dims_contribution'] = old_div(
            second_target_top_dims_contribution * 100.0,
            sum(second_target_contributions))
        data_dict['second_target_bottom_dim'] = second_target_bottom_dim
        data_dict[
            'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution
        data_dict['best_second_target'] = levels[best_second_target_index]
        data_dict['best_second_target_count'] = second_target_contributions[
            best_second_target_index]
        data_dict['best_second_target_percent'] = round(
            old_div(
                second_target_contributions[best_second_target_index] * 100.0,
                sum(second_target_contributions)), 2)
        data_dict['worst_second_target'] = levels[worst_second_target_index]
        data_dict['worst_second_target_percent'] = round(
            old_div(
                second_target_contributions[worst_second_target_index] * 100.0,
                sum(second_target_contributions)), 2)

        data_dict['top_target'] = top_target
        data_dict['top_target_top_dims'] = top_target_top_dims
        data_dict['top_target_top_dims_contribution'] = old_div(
            top_target_top_dims_contribution * 100.0,
            sum(top_target_contributions))
        data_dict['top_target_bottom_dim'] = top_target_bottom_dim
        data_dict[
            'top_target_bottom_dim_contribution'] = top_target_bottom_dim_contribution
        data_dict['best_top_target'] = levels[best_top_target_index]
        data_dict['best_top_target_count'] = top_target_contributions[
            best_top_target_index]
        data_dict['best_top_target_percent'] = round(
            old_div(top_target_contributions[best_top_target_index] * 100.0,
                    sum(top_target_contributions)), 2)
        data_dict['worst_top_target'] = levels[worst_top_target_index]
        data_dict['worst_top_target_percent'] = round(
            old_div(top_target_contributions[worst_top_target_index] * 100.0,
                    sum(top_target_contributions)), 2)

        data_dict["blockSplitter"] = self._blockSplitter
        data_dict["binTargetCol"] = self._binTargetCol
        data_dict["binAnalyzedCol"] = self._binAnalyzedCol
        data_dict['highlightFlag'] = self._highlightFlag

        # print "_"*60
        # print "DATA DICT - ", data_dict
        # print "_"*60

        ###############
        #     CARD1   #
        ###############

        print("self._binTargetCol & self._binAnalyzedCol : ",
              self._binTargetCol, self._binAnalyzedCol)
        if len(data_dict['worst_second_share']) == 0:
            output = NarrativesUtils.block_splitter(
                NarrativesUtils.get_template_output(
                    self._base_dir, 'card1_binned_target_worst_second.html',
                    data_dict),
                self._blockSplitter,
                highlightFlag=self._highlightFlag)
        else:
            if (self._binTargetCol == True & self._binAnalyzedCol == False):
                print("Only Target Column is Binned, : ", self._binTargetCol)
                output = NarrativesUtils.block_splitter(
                    NarrativesUtils.get_template_output(
                        self._base_dir, 'card1_binned_target.html', data_dict),
                    self._blockSplitter,
                    highlightFlag=self._highlightFlag)
            elif (self._binTargetCol == True & self._binAnalyzedCol == True):
                print("Target Column and IV is Binned : ", self._binTargetCol,
                      self._binAnalyzedCol)
                output = NarrativesUtils.block_splitter(
                    NarrativesUtils.get_template_output(
                        self._base_dir, 'card1_binned_target_and_IV.html',
                        data_dict),
                    self._blockSplitter,
                    highlightFlag=self._highlightFlag)
            else:
                output = NarrativesUtils.block_splitter(
                    NarrativesUtils.get_template_output(
                        self._base_dir, 'card1.html', data_dict),
                    self._blockSplitter,
                    highlightFlag=self._highlightFlag)

        targetDimCard1Data = []
        targetDimcard1Heading = '<h3>Impact of ' + self._analysed_dimension + '  on ' + self._target_dimension + "</h3>"

        toggledata = ToggleData()

        targetDimTable1Data = self.generate_card1_table1()
        targetDimCard1Table1 = TableData()
        targetDimCard1Table1.set_table_type("heatMap")
        targetDimCard1Table1.set_table_data(targetDimTable1Data)
        toggledata.set_toggleon_data({
            "data": {
                "tableData": targetDimTable1Data,
                "tableType": "heatMap"
            },
            "dataType": "table"
        })

        targetDimTable2Data = self.generate_card1_table2()
        targetDimCard1Table2 = TableData()
        targetDimCard1Table2.set_table_type("normal")
        table2Data = targetDimTable2Data["data1"]
        table2Data = [
            innerList[1:] for innerList in table2Data
            if innerList[0].strip() != ""
        ]
        targetDimCard1Table2.set_table_data(table2Data)

        toggledata.set_toggleoff_data({
            "data": {
                "tableData": table2Data,
                "tableType": "heatMap"
            },
            "dataType": "table"
        })

        targetDimCard1Data.append(HtmlData(data=targetDimcard1Heading))
        targetDimCard1Data.append(toggledata)
        targetDimCard1Data += output

        self._card1.set_card_data(targetDimCard1Data)
        self._card1.set_card_name("{}: Relationship with {}".format(
            self._analysed_dimension, self._target_dimension))

        ###############
        #     CARD2   #
        ###############

        if self._appid == None:

            key_factors = ''
            num_key_factors = len(self._second_level_dimensions)

            if len(self._second_level_dimensions) == 5:
                key_factors = ', '.join(
                    self._second_level_dimensions[:4]
                ) + ' and ' + self._second_level_dimensions[4]
            elif len(self._second_level_dimensions) == 4:
                key_factors = ', '.join(
                    self._second_level_dimensions[:3]
                ) + ' and ' + self._second_level_dimensions[3]
            elif len(self._second_level_dimensions) == 3:
                key_factors = ', '.join(
                    self._second_level_dimensions[:2]
                ) + ' and ' + self._second_level_dimensions[2]
            elif len(self._second_level_dimensions) == 2:
                key_factors = ' and '.join(self._second_level_dimensions)
            elif len(self._second_level_dimensions) == 1:
                key_factors = self._second_level_dimensions[0]

            targetCardDataDict['num_key_factors'] = num_key_factors
            targetCardDataDict['key_factors'] = key_factors
            dict_for_test = {}
            for tupleObj in sorted_target_levels[:self._chiSquareLevelLimit]:
                targetLevel = tupleObj[1]

                targetCardDataDict['random_card2'] = random.randint(1, 100)
                targetCardDataDict['random_card4'] = random.randint(1, 100)

                second_target_contributions = [
                    table.get_value(targetLevel, i) for i in levels
                ]
                sum_second_target = sum(second_target_contributions)

                sorted_levels = sorted(zip(second_target_contributions,
                                           levels),
                                       reverse=True)

                level_differences = [0.0] + [
                    sorted_levels[i][0] - sorted_levels[i + 1][0]
                    for i in range(len(sorted_levels) - 1)
                ]
                level_diff_index = level_differences.index(
                    max(level_differences)) if level_differences.index(
                        max(level_differences)) > 0 else len(
                            level_differences
                        )  ##added for pipeline keyerror issue
                second_target_top_dims = [
                    j for i, j in sorted_levels[:level_diff_index]
                ]
                second_target_top_dims_contribution = sum([
                    i for i, j in sorted_levels[:level_differences.
                                                index(max(level_differences))]
                ])
                second_target_bottom_dim = sorted_levels[-1][1]
                second_target_bottom_dim_contribution = sorted_levels[-1][0]

                second_target_percentages = [
                    old_div(i * 100.0, sum_second_target)
                    for i in second_target_contributions
                ]
                best_second_target_index = second_target_contributions.index(
                    max(second_target_contributions))
                worst_second_target_index = second_target_contributions.index(
                    min(second_target_contributions))
                second_target_differences = [
                    x - y for x, y in zip(levels_percentages,
                                          second_target_percentages)
                ]
                if len(second_target_differences) > 6:
                    tops = 2
                    bottoms = -2
                elif len(second_target_differences) > 4:
                    tops = 2
                    bottoms = -1
                else:
                    tops = 1
                    bottoms = -1
                sorted_ = sorted(enumerate(second_target_differences),
                                 key=lambda x: x[1],
                                 reverse=True)
                best_second_difference_indices = [x for x, y in sorted_[:tops]]
                worst_second_difference_indices = [
                    x for x, y in sorted_[bottoms:]
                ]

                second_target_shares = [
                    old_div(x * 100.0, y)
                    for x, y in zip(second_target_contributions, level_counts)
                ]
                max_second_target_shares = max(second_target_shares)
                best_second_target_share_index = [
                    idx for idx, val in enumerate(second_target_shares)
                    if val == max_second_target_shares
                ]
                level_counts_threshold = old_div(
                    sum(level_counts) * 0.05, len(level_counts))
                min_second_target_shares = min([
                    x for x, y in zip(second_target_shares, level_counts)
                    if y >= level_counts_threshold
                ])
                worst_second_target_share_index = [
                    idx for idx, val in enumerate(second_target_shares)
                    if val == min_second_target_shares
                ]
                overall_second_percentage = old_div(sum_second_target * 100.0,
                                                    total)

                # DataFrame for contribution calculation
                if self._pandas_flag:
                    df_second_target = self._data_frame[(
                        self._data_frame[self._target_dimension] == targetLevel
                    ) & (self._data_frame[self._analysed_dimension] ==
                         second_target_top_dims[0])][
                             self._second_level_dimensions]
                    df_second_dim = self._data_frame[(
                        self._data_frame[self._analysed_dimension] ==
                        second_target_top_dims[0]
                    )][self._second_level_dimensions]
                else:
                    df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                                            filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                                            select(self._second_level_dimensions).toPandas()
                    df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                                        select(self._second_level_dimensions).toPandas()

                # if self._chisquare_result.get_splits():
                #     splits = self._chisquare_result.get_splits()
                #     idx = self._chiSquareTable.get_bin_names(splits).index(second_target_top_dims[0])
                #     idx1 = self._chiSquareTable.get_bin_names(splits).index(top_target_top_dims[0])
                #     splits[len(splits)-1] = splits[len(splits)-1]+1
                #     df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                #                         filter(col(self._analysed_dimension)>=splits[idx]).filter(col(self._analysed_dimension)<splits[idx+1]).\
                #                         select(self._second_level_dimensions).toPandas()
                #     df_second_dim = self._data_frame.filter(col(self._analysed_dimension)>=splits[idx]).\
                #                     filter(col(self._analysed_dimension)<splits[idx+1]).\
                #                     select(self._second_level_dimensions).toPandas()
                # else:
                #     df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                #                         filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                #                         select(self._second_level_dimensions).toPandas()
                #     df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                #                     select(self._second_level_dimensions).toPandas()

                # print self._data_frame.select('Sales').show()

                distribution_second = []
                d_l = []
                for d in self._second_level_dimensions:
                    grouped = df_second_target.groupby(d).agg({d: 'count'})
                    contributions = df_second_dim.groupby(d).agg({d: 'count'})
                    contribution_index = list(contributions.index)
                    contributions_val = contributions[d].tolist()
                    contributions_list = dict(
                        list(zip(contribution_index, contributions_val)))
                    index_list = list(grouped.index)
                    grouped_list = grouped[d].tolist()
                    contributions_percent_list = [
                        round(old_div(y * 100.0, contributions_list[x]), 2)
                        for x, y in zip(index_list, grouped_list)
                    ]
                    sum_ = grouped[d].sum()
                    diffs = [0] + [
                        grouped_list[i] - grouped_list[i + 1]
                        for i in range(len(grouped_list) - 1)
                    ]
                    max_diff = diffs.index(max(diffs))
                    grouped_dict = dict(list(zip(index_list, grouped_list)))

                    for val in contribution_index:
                        if val not in list(grouped_dict.keys()):
                            grouped_dict[val] = 0
                        else:
                            pass

                    index_list = []
                    grouped_list = []
                    contributions_val = []

                    for key in list(grouped_dict.keys()):
                        index_list.append(str(key))
                        grouped_list.append(grouped_dict[key])
                        contributions_val.append(contributions_list[key])
                    '''
                    print "="*70
                    print "GROUPED - ", grouped
                    print "INDEX LIST - ", index_list
                    print "GROUPED LIST - ", grouped_list
                    print "GROUPED DICT - ", grouped_dict
                    print "CONTRIBUTIONS - ", contributions
                    print "CONTRIBUTION INDEX - ", contribution_index
                    print "CONTRIBUTIONS VAL - ", contributions_val
                    print "CONTRIBUTIONS LIST - ", contributions_list
                    print "CONTRIBUTIONS PERCENT LIST - ", contributions_percent_list
                    print "SUM - ", sum_
                    print "DIFFS - ", diffs
                    print "MAX DIFF - ", max_diff
                    print "="*70
                    '''

                    informative_dict = {
                        "levels": index_list,
                        "positive_class_contribution": grouped_list,
                        "positive_plus_others": contributions_val
                    }

                    informative_df = pd.DataFrame(informative_dict)
                    informative_df["percentage_horizontal"] = old_div(
                        informative_df["positive_class_contribution"] * 100,
                        informative_df["positive_plus_others"])
                    informative_df["percentage_vertical"] = old_div(
                        informative_df["positive_class_contribution"] * 100,
                        sum_)
                    informative_df.sort_values(["percentage_vertical"],
                                               inplace=True,
                                               ascending=False)
                    informative_df = informative_df.reset_index(drop=True)

                    percentage_vertical_sorted = list(
                        informative_df["percentage_vertical"])
                    percentage_horizontal_sorted = list(
                        informative_df["percentage_horizontal"])
                    levels_sorted = list(informative_df["levels"])

                    differences_list = []
                    for i in range(1, len(percentage_vertical_sorted)):
                        difference = percentage_vertical_sorted[
                            i - 1] - percentage_vertical_sorted[i]
                        differences_list.append(round(difference, 2))
                    '''
                    print "-"*70
                    print "DIFFERENCES LIST - ", differences_list
                    print "-"*70
                    '''

                    index_txt = ''
                    if differences_list:
                        if differences_list[0] >= 30:
                            print("showing 1st case")
                            index_txt = levels_sorted[0]
                            max_diff_equivalent = 1
                        else:
                            if len(differences_list) >= 2:
                                if differences_list[1] >= 10:
                                    print("showing 1st and 2nd case")
                                    index_txt = levels_sorted[0] + '(' + str(
                                        round(percentage_vertical_sorted[0], 1)
                                    ) + '%)' + ' and ' + levels_sorted[
                                        1] + '(' + str(
                                            round(
                                                percentage_vertical_sorted[1],
                                                1)) + '%)'
                                    max_diff_equivalent = 2
                                else:
                                    print("showing 3rd case")
                                    index_txt = 'including ' + levels_sorted[
                                        0] + '(' + str(
                                            round(
                                                percentage_vertical_sorted[0],
                                                1)
                                        ) + '%)' + ' and ' + levels_sorted[
                                            1] + '(' + str(
                                                round(
                                                    percentage_vertical_sorted[
                                                        1], 1)) + '%)'
                                    max_diff_equivalent = 3
                            else:
                                print("showing 3rd case")
                                index_txt = 'including ' + levels_sorted[
                                    0] + '(' + str(
                                        round(percentage_vertical_sorted[0], 1)
                                    ) + '%)' + ' and ' + levels_sorted[
                                        1] + '(' + str(
                                            round(
                                                percentage_vertical_sorted[1],
                                                1)) + '%)'
                                max_diff_equivalent = 3

                    else:
                        max_diff_equivalent = 0
                    '''
                    print "-"*70
                    print informative_df.head(25)
                    print "-"*70
                    '''

                    distribution_second.append({
                        'contributions': [
                            round(i, 2) for i in
                            percentage_vertical_sorted[:max_diff_equivalent]
                        ],
                        'levels':
                        levels_sorted[:max_diff_equivalent],
                        'variation':
                        random.randint(1, 100),
                        'index_txt':
                        index_txt,
                        'd':
                        d,
                        'contributions_percent':
                        percentage_horizontal_sorted
                    })
                '''
                  print "DISTRIBUTION SECOND - ", distribution_second
                  print "<>"*50
                  '''
                targetCardDataDict['distribution_second'] = distribution_second
                targetCardDataDict['second_target'] = targetLevel
                targetCardDataDict[
                    'second_target_top_dims'] = second_target_top_dims
                targetCardDataDict[
                    'second_target_top_dims_contribution'] = old_div(
                        second_target_top_dims_contribution * 100.0,
                        sum(second_target_contributions))
                targetCardDataDict[
                    'second_target_bottom_dim'] = second_target_bottom_dim
                targetCardDataDict[
                    'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution
                targetCardDataDict['best_second_target'] = levels[
                    best_second_target_index]
                targetCardDataDict[
                    'best_second_target_count'] = second_target_contributions[
                        best_second_target_index]
                targetCardDataDict['best_second_target_percent'] = round(
                    old_div(
                        second_target_contributions[best_second_target_index] *
                        100.0, sum(second_target_contributions)), 2)
                targetCardDataDict['worst_second_target'] = levels[
                    worst_second_target_index]
                targetCardDataDict['worst_second_target_percent'] = round(
                    old_div(
                        second_target_contributions[worst_second_target_index]
                        * 100.0, sum(second_target_contributions)), 2)

                card2Data = []
                targetLevelContributions = [
                    table.get_value(targetLevel, i) for i in levels
                ]
                impact_target_thershold = old_div(
                    sum(targetLevelContributions) * 0.02,
                    len(targetLevelContributions))
                card2Heading = '<h3>Key Drivers of ' + self._target_dimension + ' (' + targetLevel + ')' + "</h3>"
                chart, bubble = self.generate_distribution_card_chart(
                    targetLevel, targetLevelContributions, levels,
                    level_counts, total, impact_target_thershold)
                card2ChartData = NormalChartData(data=chart["data"])
                "rounding the chartdata values for key drivers tab inside table percentage(table data)"
                for d in card2ChartData.get_data():
                    d['percentage'] = round(d['percentage'], 2)
                    d_l.append(d)
                card2ChartJson = ChartJson()
                card2ChartJson.set_data(d_l)
                card2ChartJson.set_chart_type("combination")
                card2ChartJson.set_types({
                    "total": "bar",
                    "percentage": "line"
                })
                card2ChartJson.set_legend({
                    "total": "# of " + targetLevel,
                    "percentage": "% of " + targetLevel
                })
                card2ChartJson.set_axes({
                    "x": "key",
                    "y": "total",
                    "y2": "percentage"
                })
                card2ChartJson.set_label_text({
                    "x": " ",
                    "y": "Count",
                    "y2": "Percentage"
                })
                print("self._binTargetCol & self._binAnalyzedCol : ",
                      self._binTargetCol, self._binAnalyzedCol)
                if (self._binTargetCol == True & self._binAnalyzedCol ==
                        False):
                    print("Only Target Column is Binned")
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2_binned_target.html',
                            targetCardDataDict), self._blockSplitter)
                elif (self._binTargetCol == True & self._binAnalyzedCol ==
                      True):
                    print("Target Column and IV is Binned")
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2_binned_target_and_IV.html',
                            targetCardDataDict), self._blockSplitter)
                else:
                    print("In Else, self._binTargetCol should be False : ",
                          self._binTargetCol)
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2.html', targetCardDataDict),
                        self._blockSplitter)

                card2Data.append(HtmlData(data=card2Heading))
                statistical_info_array = [
                    ("Test Type", "Chi-Square"),
                    ("Chi-Square statistic",
                     str(round(self._chisquare_result.get_stat(), 3))),
                    ("P-Value",
                     str(round(self._chisquare_result.get_pvalue(), 3))),
                    ("Inference",
                     "Chi-squared analysis shows a significant association between {} (target) and {}."
                     .format(self._target_dimension, self._analysed_dimension))
                ]
                statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                    statistical_info_array)

                card2Data.append(
                    C3ChartData(data=card2ChartJson,
                                info=statistical_info_array))
                card2Data += output2
                card2BubbleData = "<div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div><div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div>".format(
                    bubble[0]["value"], bubble[0]["text"], bubble[1]["value"],
                    bubble[1]["text"])
                card2Data.append(HtmlData(data=card2BubbleData))
                targetCard = NormalCard()
                targetCard.set_card_data(card2Data)
                targetCard.set_card_name("{} : Distribution of {}".format(
                    self._analysed_dimension, targetLevel))
                self._targetCards.append(targetCard)
                dict_for_test[targetLevel] = targetCardDataDict
        out = {'data_dict': data_dict, 'target_dict': dict_for_test}

        return out
Ejemplo n.º 10
0
    def _generate_summary(self):
        """Build the main "Predicting Key Drivers" card of the decision tree.

        Publishes a "Generating Prediction rules" progress message, then
        assembles one card out of: the narrative summary (rendered from
        ``decisiontreesummary.html``, or ``decisiontreescore.html`` when the
        story runs on scored data), a donut chart, a dropdown listing the
        target levels, the prediction-rule table and, when the result setter
        supplies one, the unique-identifier table. The card is added to
        ``self._decisionTreeNode``; nothing is returned.

        Reads ``self._table`` (rules per target level) together with
        ``self.success_percent``, ``self.total_predictions`` and
        ``self.successful_predictions``, all indexed by target level.
        """
        rules_dict = self._table
        # Kept as an ``== True`` comparison on purpose: the original treated
        # any value other than True (None, strings, ...) as "not scored".
        isScoredStory = self._dataframe_context.get_story_on_scored_data() == True
        data_dict = {
            "blockSplitter": self._blockSplitter,
            "targetcol": self._colname,
        }
        probabilityCutoff = 75
        probabilityGroups = [{
            "probability": probabilityCutoff,
            "count": 0,
            "range": [probabilityCutoff, 100]
        }, {
            "probability": probabilityCutoff - 1,
            "count": 0,
            "range": [0, probabilityCutoff - 1]
        }]
        strongGroup, mixedGroup = probabilityGroups
        tableArray = [[
            "Prediction Rule", "Probability", "Prediction", "Freq", "group",
            "richRules"
        ]]
        dropdownData = []
        probabilityArrayAll = []
        targetLevel = self._dataframe_context.get_target_level_for_model()

        self._completionStatus = self._dataframe_context.get_completion_status()
        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName,
            "custom",
            "info",
            "Generating Prediction rules",
            self._completionStatus,
            self._completionStatus,
            display=True)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=False)
        self._dataframe_context.update_completion_status(
            self._completionStatus)

        # These lookups do not depend on the target level, so resolve them
        # once instead of once per loop iteration.
        analysisType = self._dataframe_context.get_analysis_type()
        targetCol = self._dataframe_context.get_result_column()
        customDetails = self._dataframe_context.get_custom_analysis_details()
        binFlag = (customDetails is not None
                   and targetCol in [x["colName"] for x in customDetails])

        # The model's target level (if present) is listed first so that it is
        # the pre-selected dropdown entry.
        targetValues = [x for x in rules_dict.keys() if x == targetLevel
                        ] + [x for x in rules_dict.keys() if x != targetLevel]
        for idx, target in enumerate(targetValues):
            if isScoredStory:
                displayName = "{} : {}".format(self._colname, target)
            else:
                displayName = target
            dropdownData.append({
                "displayName": displayName,
                "name": target,
                "selected": idx == 0,
                "id": idx + 1
            })

            rulesArray = rules_dict[target]
            freqArray = self.total_predictions[target]
            success = self.successful_predictions[target]
            success_percent = self.success_percent[target]
            probabilityArray = [round(x, 2) for x in success_percent]
            probabilityArrayAll += probabilityArray
            groupArray = [
                "strong" if x >= probabilityCutoff else "mixed"
                for x in probabilityArray
            ]
            # Count the groups from the labels themselves. The previous
            # range test ([75, 100] and [0, 74]) silently dropped every rule
            # whose probability fell strictly between 74 and 75 even though
            # it is labelled "mixed" in the table.
            strongCount = groupArray.count("strong")
            strongGroup["count"] += strongCount
            mixedGroup["count"] += len(groupArray) - strongCount

            predictionArray = [target] * len(rulesArray)
            richRulesArray = []
            crudeRuleArray = []
            for idx2, crudeRule in enumerate(rulesArray):
                richRule, crudeRule = NarrativesUtils.generate_rules(
                    self._colname,
                    target,
                    crudeRule,
                    freqArray[idx2],
                    success[idx2],
                    success_percent[idx2],
                    analysisType,
                    binFlag=binFlag)
                richRulesArray.append(richRule)
                crudeRuleArray.append(crudeRule)
            probabilityLabels = [
                humanize.apnumber(x) + "%" if x >= 10 else str(int(x)) + "%"
                for x in probabilityArray
            ]
            tableArray += [
                list(row)
                for row in zip(crudeRuleArray, probabilityLabels,
                               predictionArray, freqArray, groupArray,
                               richRulesArray)
            ]

        donutChartMaxLevel = 10
        if isScoredStory:
            # Scored data: the donut shows how many rules land in each
            # configured probability range.
            probabilityRangeForChart = GLOBALSETTINGS.PROBABILITY_RANGE_FOR_DONUT_CHART
            chartDict = dict.fromkeys(probabilityRangeForChart.keys(), 0)
            for val in probabilityArrayAll:
                for grps, grpRange in probabilityRangeForChart.items():
                    if val > grpRange[0] and val <= grpRange[1]:
                        chartDict[grps] = chartDict[grps] + 1
        else:
            # Otherwise the donut shows the total predictions per level.
            chartDict = {
                k: sum(v)
                for k, v in self.total_predictions.items()
            }
        chartDict = {k: v for k, v in chartDict.items() if v != 0}
        if len(chartDict) > donutChartMaxLevel:
            chartDict = NarrativesUtils.restructure_donut_chart_data(
                chartDict, nLevels=donutChartMaxLevel)
        chartData = NormalChartData([chartDict]).get_data()
        chartJson = ChartJson(data=chartData)
        chartJson.set_title(self._colname)
        chartJson.set_chart_type("donut")
        mainCardChart = C3ChartData(data=chartJson)
        mainCardChart.set_width_percent(45)

        dropdownDict = {
            "dataType": "dropdown",
            "label": "Showing prediction rules for",
            "data": dropdownData
        }

        data_dict["probabilityGroups"] = probabilityGroups
        if not isScoredStory:
            maincardSummary = NarrativesUtils.get_template_output(
                self._base_dir, 'decisiontreesummary.html', data_dict)
        else:
            # Aggregate the table's "Freq" column per predicted level
            # (columns 2 and 3 of every non-header row).
            levelCountDict = {}
            for row in tableArray[1:]:
                levelCountDict[row[2]] = levelCountDict.get(row[2], 0) + row[3]
            total = float(sum(levelCountDict.values()))
            levelCountTuple = [{
                "name": k,
                "count": v,
                # Guard against an all-zero frequency table.
                "percentage": round(v * 100 / total, 2) if total else 0.0
            } for k, v in levelCountDict.items()]
            percentageArray = [x["percentage"] for x in levelCountTuple]
            percentageArray = NarrativesUtils.ret_smart_round(percentageArray)
            levelCountTuple = [{
                "name": obj["name"],
                "count": obj["count"],
                "percentage": str(percentageArray[pos]) + "%"
            } for pos, obj in enumerate(levelCountTuple)]
            data_dict["nlevel"] = len(levelCountDict)
            # Single-argument print calls behave the same on Python 2 and 3.
            print("levelCountTuple %s" % (levelCountTuple, ))
            print("levelCountDict %s" % (levelCountDict, ))
            if targetLevel in levelCountDict:
                data_dict["topLevel"] = [
                    x for x in levelCountTuple if x["name"] == targetLevel
                ][0]
                if len(levelCountTuple) > 1:
                    data_dict["secondLevel"] = max(
                        [x for x in levelCountTuple
                         if x["name"] != targetLevel],
                        key=lambda x: x["count"])
                else:
                    data_dict["secondLevel"] = None
            else:
                data_dict["topLevel"] = levelCountTuple[0]
                if len(levelCountTuple) > 1:
                    data_dict["secondLevel"] = levelCountTuple[1]
                else:
                    data_dict["secondLevel"] = None
            print(data_dict)
            maincardSummary = NarrativesUtils.get_template_output(
                self._base_dir, 'decisiontreescore.html', data_dict)

        main_card = NormalCard()
        main_card_data = []
        main_card_data += NarrativesUtils.block_splitter(
            maincardSummary, self._blockSplitter)
        main_card_data.append(mainCardChart)
        main_card_data.append(dropdownDict)

        main_card_table = TableData()
        if isScoredStory:
            main_card_table.set_table_width(75)
        main_card_table.set_table_data(tableArray)
        main_card_table.set_table_type("popupDecisionTreeTable")
        main_card_data.append(main_card_table)
        uidTable = self._result_setter.get_unique_identifier_table()
        if uidTable is not None:
            main_card_data.append(uidTable)
        else:
            # No identifier table next to it: let the rule table span the card.
            main_card_table.set_table_width(100)
        main_card.set_card_data(main_card_data)
        main_card.set_card_name("Predicting Key Drivers of {}".format(
            self._colname))
        self._decisionTreeNode.add_a_card(main_card)
Ejemplo n.º 11
0
    def _generate_summary(self):
        """Build the main "Predicting Key Drivers" card of the decision tree.

        Publishes a "Generating Prediction rules" progress message, then
        assembles one card out of the narrative summary (rendered from
        ``decisiontreesummary.html``), a donut chart of total predictions per
        target, a dropdown listing the targets and the prediction-rule table.
        The card is added to ``self._decisionTreeNode``; nothing is returned.

        Reads ``self._table`` (rules per target) together with
        ``self.success_percent``, ``self.total_predictions`` and
        ``self.successful_predictions``, all indexed by target. The text
        before the first ":" of each target key is what the table and the
        dropdown ``name``/``searchTerm`` fields show.
        """
        rules_dict = self._table
        data_dict = {
            "blockSplitter": self._blockSplitter,
            "targetcol": self._colname,
        }
        probabilityCutoff = 75
        probabilityGroups = [{
            "probability": probabilityCutoff,
            "count": 0,
            "range": [probabilityCutoff, 100]
        }, {
            "probability": probabilityCutoff - 1,
            "count": 0,
            "range": [0, probabilityCutoff - 1]
        }]
        strongGroup, mixedGroup = probabilityGroups
        tableArray = [[
            "Prediction Rule", "Probability", "Prediction", "Freq", "group",
            "richRules"
        ]]
        dropdownData = []
        chartDict = {}

        self._completionStatus = self._dataframe_context.get_completion_status()
        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName,
            "custom",
            "info",
            "Generating Prediction rules",
            self._completionStatus,
            self._completionStatus,
            display=True)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=False)

        # These lookups do not depend on the target, so resolve them once
        # instead of once per loop iteration.
        analysisType = self._dataframe_context.get_analysis_type()
        targetCol = self._dataframe_context.get_result_column()
        customDetails = self._dataframe_context.get_custom_analysis_details()
        binFlag = (customDetails is not None
                   and targetCol in [x["colName"] for x in customDetails])

        for idx, target in enumerate(rules_dict.keys()):
            targetToDisplayInTable = target.split(":")[0].strip()
            dropdownData.append({
                "displayName": target,
                "name": targetToDisplayInTable,
                "searchTerm": targetToDisplayInTable,
                # Only the first entry is pre-selected.
                "selected": idx == 0,
                "id": idx + 1
            })

            rulesArray = rules_dict[target]
            freqArray = self.total_predictions[target]
            success = self.successful_predictions[target]
            success_percent = self.success_percent[target]
            probabilityArray = [round(x, 2) for x in success_percent]
            groupArray = [
                "strong" if x >= probabilityCutoff else "mixed"
                for x in probabilityArray
            ]
            # Count the groups from the labels themselves. The previous
            # range test ([75, 100] and [0, 74]) silently dropped every rule
            # whose probability fell strictly between 74 and 75 even though
            # it is labelled "mixed" in the table.
            strongCount = groupArray.count("strong")
            strongGroup["count"] += strongCount
            mixedGroup["count"] += len(groupArray) - strongCount

            predictionArray = [targetToDisplayInTable] * len(rulesArray)
            chartDict[target] = sum(freqArray)
            richRulesArray = []
            crudeRuleArray = []
            for idx2, crudeRule in enumerate(rulesArray):
                richRule, crudeRule = NarrativesUtils.generate_rules(
                    self._colname,
                    target,
                    crudeRule,
                    freqArray[idx2],
                    success[idx2],
                    success_percent[idx2],
                    analysisType,
                    binFlag=binFlag)
                richRulesArray.append(richRule)
                crudeRuleArray.append(crudeRule)
            probabilityLabels = [
                humanize.apnumber(x) + "%" if x >= 10 else str(int(x)) + "%"
                for x in probabilityArray
            ]
            tableArray += [
                list(row)
                for row in zip(crudeRuleArray, probabilityLabels,
                               predictionArray, freqArray, groupArray,
                               richRulesArray)
            ]

        donutChartMaxLevel = 10
        if len(chartDict) > donutChartMaxLevel:
            chartDict = NarrativesUtils.restructure_donut_chart_data(
                chartDict, nLevels=donutChartMaxLevel)
        chartData = NormalChartData([chartDict]).get_data()
        chartJson = ChartJson(data=chartData)
        chartJson.set_title(self._colname)
        chartJson.set_chart_type("donut")
        mainCardChart = C3ChartData(data=chartJson)
        mainCardChart.set_width_percent(45)

        dropdownDict = {
            "dataType": "dropdown",
            "label": "Showing prediction rules for",
            "data": dropdownData
        }

        data_dict["probabilityGroups"] = probabilityGroups
        maincardSummary = NarrativesUtils.get_template_output(
            self._base_dir, 'decisiontreesummary.html', data_dict)

        main_card = NormalCard()
        main_card_data = []
        main_card_data += NarrativesUtils.block_splitter(
            maincardSummary, self._blockSplitter)
        main_card_data.append(mainCardChart)
        main_card_data.append(dropdownDict)

        main_card_table = TableData()
        main_card_table.set_table_data(tableArray)
        main_card_table.set_table_type("popupDecisionTreeTable")
        main_card_data.append(main_card_table)
        main_card.set_card_data(main_card_data)
        main_card.set_card_name("Predicting Key Drivers of {}".format(
            self._colname))
        self._decisionTreeNode.add_a_card(main_card)
Ejemplo n.º 12
0
class BusinessCard(object):
    """
    Functionalities
    """
    def __init__(self, story_result, meta_parser, result_setter,
                 dataframe_context, dataframe_helper, start_time,
                 analysis_type):
        """Keep the collaborators and prepare an empty "Overview" card.

        Args:
            story_result: story structure walked by the counting methods
                (they index it with ``'listOfNodes'``).
            meta_parser: provides row/column counts and unique-value counts.
            result_setter: stored as-is for later use.
            dataframe_context: provides the result (target) column name.
            dataframe_helper: provides string/numeric column lists.
            start_time: ``time.time()``-style timestamp the elapsed time is
                measured from in ``set_params``.
            analysis_type: ``'dimension'`` or ``'measure'``.
        """
        # Run configuration.
        self.analysis_type = analysis_type
        self.start_time = start_time

        # Collaborators supplied by the caller.
        self._story_result = story_result
        self._meta_parser = meta_parser
        self._result_setter = result_setter
        self._dataframe_context = dataframe_context
        self._dataframe_helper = dataframe_helper

        # Presentation state filled in by the get_summary_* methods.
        self.subheader = "Impact"
        self.businessCardData = []
        overview_card = NormalCard()
        overview_card.set_card_name("Overview")
        self.business_card1 = overview_card

    def set_params(self):
        """Compute and cache every metric used by the summary methods.

        The assignment order matters: ``get_number_analysis`` reads the
        target-level, dimension, measure and prediction-rule counts, and
        ``get_number_queries`` / ``get_time_analyst`` read ``analysis_list``
        and the ``number_analysis_dict`` that ``get_number_analysis`` builds.
        ``analysis_list`` is only assigned for the ``'dimension'`` and
        ``'measure'`` analysis types.
        """
        result_column = self._dataframe_context.get_result_column()
        self.target_levels = self._dataframe_helper.get_num_unique_values(
            result_column)
        self.number_variables = self.get_number_variables()
        self.number_measures = self.get_number_measures()
        self.number_dimensions = self.get_number_dimensions()

        # Analyses that make up a story of each supported type.
        dimension_analyses = [
            "overview_rules", "association_summary", "association_rules",
            "prediction_rules"
        ]
        measure_analyses = [
            "overview_rules", "performance_summary", "performance_rules",
            "influencers_summary", "influencers_rules", "prediction_rules"
        ]
        if self.analysis_type == 'dimension':
            self.analysis_list = dimension_analyses
        elif self.analysis_type == 'measure':
            self.analysis_list = measure_analyses

        # Size of the data and of the generated story.
        self.data_points = self.get_number_data_points()
        self.number_charts = self.get_number_charts()
        self.number_prediction_rules = self.get_number_prediction_rules()
        self.number_pages = self.get_number_pages()
        self.number_analysis = self.get_number_analysis()
        self.number_queries = self.get_number_queries()

        # Elapsed time versus the estimated manual effort.
        elapsed = time.time() - self.start_time
        self.time_mAdvisor = elapsed
        self.time_analyst = self.get_time_analyst()
        self.time_saved = self.get_time_saved()
        self.impact_on_productivity = self.get_impact_on_productivity()

    def get_number_charts(self):
        return json.dumps(self._story_result, indent=2).count("c3Chart")

    def get_number_analysis(self):
        if self.analysis_type == 'dimension':
            significant_variables_levels = {"None": 0}
            for each in self._story_result['listOfNodes']:
                try:
                    if each['name'] == 'Key Drivers':
                        for node in each['listOfNodes']:
                            significant_variables_levels[node['name']] = [
                                self._meta_parser.get_num_unique_values(
                                    node['name']) if node['name']
                                in self._dataframe_helper.get_string_columns()
                                else 5
                            ][0]
                except:
                    for key in each.keys():
                        if not key.startswith('maxdepth'):
                            if each['name'] == 'Key Drivers':
                                for node in each['listOfNodes']:
                                    significant_variables_levels[
                                        node['name']] = [
                                            self._meta_parser.
                                            get_num_unique_values(node['name'])
                                            if node['name']
                                            in self._dataframe_helper.
                                            get_string_columns() else 5
                                        ][0]
            self.number_analysis_dict = {}
            self.number_analysis_dict[
                "overview_rules"] = self.target_levels * 2
            self.number_analysis_dict['association_summary'] = (
                self.number_dimensions + self.number_measures) * 2
            self.number_analysis_dict["association_rules"] = sum(
                significant_variables_levels.values()) * 6
            self.number_analysis_dict[
                "prediction_rules"] = self.number_prediction_rules * 5
            return sum(self.number_analysis_dict.values())
        elif self.analysis_type == 'measure':
            significant_variables_levels = {"None": 0}
            for each in self._story_result['listOfNodes']:
                if each['name'] == 'Performance':
                    for node in each['listOfNodes']:
                        significant_variables_levels[node['name']] = [
                            self._dataframe_helper.get_num_unique_values(
                                node['name']) if node['name']
                            in self._dataframe_helper.get_string_columns() else
                            5
                        ][0]
            self.number_analysis_dict = {}
            self.number_analysis_dict[
                "overview_rules"] = self.target_levels * 2
            self.number_analysis_dict["performance_summary"] = (
                self.number_dimensions + self.number_measures) * 2
            self.number_analysis_dict["performance_rules"] = sum(
                significant_variables_levels.values()) * 6
            self.number_analysis_dict[
                "prediction_rules"] = self.number_prediction_rules * 5
            self.number_analysis_dict[
                "influencers_summary"] = self.number_measures * 2
            self.number_analysis_dict["influencers_rules"] = 8
            return sum(self.number_analysis_dict.values())

    def get_number_queries(self):
        if self.analysis_type == 'dimension':
            queries_per_analysis_dict = {
                "overview_rules": 15,
                "association_summary": 120,
                "association_rules": 600,
                "prediction_rules": 200
            }
        elif self.analysis_type == 'measure':
            queries_per_analysis_dict = {
                "overview_rules": 15,
                "performance_summary": 120,
                "performance_rules": 600,
                "influencers_summary": 100,
                "influencers_rules": 80,
                "prediction_rules": 200
            }
        sum = 0
        for analysis in self.analysis_list:
            sum += self.number_analysis_dict[
                analysis] * queries_per_analysis_dict[analysis]
        return sum

    def get_number_prediction_rules(self):
        num_prediction_rules = 0
        for each_node in self._story_result['listOfNodes']:
            try:
                if each_node['name'] == 'Prediction':
                    for card in each_node['listOfCards'][0]['cardData']:
                        if card['dataType'] == 'table':
                            num_prediction_rules = len(
                                card['data']['tableData'])
            except:
                for key in each_node.keys():
                    if key.startswith('maxdepth'):
                        if each_node['maxdepth3'][
                                'name'] == 'Prediction' or each_node[
                                    'maxdepth4'][
                                        'name'] == 'Prediction' or each_node[
                                            'maxdepth5'][
                                                'name'] == 'Prediction':
                            for Depth in range(3, 6):
                                for card in each_node['maxdepth' + str(
                                        Depth)]['listOfCards'][0]['cardData']:
                                    if card['dataType'] == 'table':
                                        num_prediction_rules += len(
                                            card['data']['tableData'])
        return num_prediction_rules

    def get_number_pages(self):
        sum = 0
        for each in self._story_result['listOfNodes']:
            try:
                if each['listOfNodes']:
                    for items in each['listOfNodes']:
                        sum += len(items['listOfCards'])
                    sum += len(each['listOfCards'])
                else:
                    sum += len(each['listOfCards'])
            except:
                for key in each.keys():
                    if key.startswith('maxdepth'):
                        if each['maxdepth3']['listOfNodes'] or each[
                                'maxdepth4']['listOfNodes'] or each[
                                    'maxdepth5']['listOfNodes']:
                            for Depth in range(3, 6):
                                for items in each['maxdepth' +
                                                  str(Depth)]['listOfNodes']:
                                    sum += len(
                                        items['maxdepth' +
                                              str(Depth)]['listOfCards'])
                                sum += len(each['maxdepth' +
                                                str(Depth)]['listOfCards'])
                        else:
                            for Depth in range(3, 6):
                                sum += len(each['maxdepth' +
                                                str(Depth)]['listOfCards'])
        return sum

    def get_number_data_points(self):
        return self._meta_parser.get_num_rows(
        ) * self._meta_parser.get_num_columns()

    def get_number_variables(self):
        return self._meta_parser.get_num_columns()

    def get_number_dimensions(self):
        self.number_dimensions = len(
            self._dataframe_helper.get_string_columns())
        return self.number_dimensions

    def get_number_measures(self):
        self.number_measures = len(
            self._dataframe_helper.get_numeric_columns())
        return self.number_measures

    def get_time_analyst(self):
        if self.analysis_type == 'dimension':
            time_per_analysis_dict = {
                "overview_rules": 10,
                "association_summary": 120,
                "association_rules": 180,
                "prediction_rules": 300
            }
        elif self.analysis_type == 'measure':
            time_per_analysis_dict = {
                "overview_rules": 10,
                "performance_summary": 120,
                "performance_rules": 180,
                "influencers_summary": 120,
                "influencers_rules": 180,
                "prediction_rules": 300
            }
        sum = 0
        for analysis in self.analysis_list:
            sum += self.number_analysis_dict[
                analysis] * time_per_analysis_dict[analysis]
        return sum

    def get_time_saved(self):
        '''
        Total Time Saved - 21 Hrs ( Productitvity Gain = Time taken by data scientist - time taken by mAdvisor)
        '''
        return self.time_analyst - self.time_mAdvisor

    def get_impact_on_productivity(self):
        """Return the analyst-to-mAdvisor time ratio as text, e.g. "3.5X".

        Impact on productivity = time taken by data scientist / time taken
        by mAdvisor, rounded to one decimal and suffixed with "X".
        """
        ratio = old_div(self.time_analyst, self.time_mAdvisor)
        rounded_ratio = round(ratio, 1)
        return str(rounded_ratio) + "X"

    def get_summary_data(self):
        '''
        Append a DataBox holding the six headline name/value pairs to
        self.businessCardData. Returns None.
        '''
        # (label, displayed value) pairs, in display order.
        labelled_values = [
            ("Total Data Points", str(self.data_points)),
            ("Number of Queries", str(self.number_queries)),
            ("Number of Analysis", str(self.number_analysis)),
            ("Total Pages", str(self.number_pages)),
            ("Total Time Saved", CommonUtils.humanize_time(self.time_saved)),
            ("Impact on Productivity", str(self.impact_on_productivity)),
        ]
        summaryData = [{
            "name": label,
            "value": shown
        } for label, shown in labelled_values]
        summary_box = DataBox(data=summaryData)
        self.businessCardData.append(summary_box)

    def get_summary_para(self):
        '''
        Build the HTML summary (the analyst-vs-mAdvisor time blocks followed
        by the congratulation paragraph), wrap it in an HtmlData object and
        append that to self.businessCardData. Returns None.
        '''
        # Templates are kept verbatim; only the point of formatting moved.
        intro_template = """<blockquote><p>
        <b>Great Job !!!</b> You have analysed the dataset that contains {} variables after executing about <b>{}</b> analytics queries and <b>{}</b> Statistical and ML analysis in parallel. Using mAdvisor, you have completed the analysis within <b>{}</b> which would have required around <b>{}</b>.
        </p></blockquote>
        """

        images_template = """<div class="col-md-6">
            <div class="d_analyst_block">
                <span class="d_analyst_img"></span>
                <h1 class="pull-left xs-mt-40 xs-ml-10">
                    <small>Data Analyst <span class="bImpact_time_icon xs-ml-10"></span></small>
                    <br>
                    <small>{}</small>
                </h1>
            </div>
        </div>
        <div class="col-md-6">
            <div class="d_m_block">
                <span class="d_m_img"></span>
                <h1 class="pull-left xs-mt-40 xs-ml-10"><span class="bImpact_time_icon"></span><br>
                    <small>{}</small>
                </h1>
            </div>
        </div>
        <div class="clearfix xs-m-50"></div>

           """

        wrapper_template = """
        <div class="row">
            <div class="col-md-8 col-md-offset-2 xs-mt-20">
                {}{}
            </div>
        </div>
        """

        intro_html = intro_template.format(
            self.number_variables,
            self.number_queries,
            self.number_analysis,
            CommonUtils.humanize_time(self.time_mAdvisor),
            CommonUtils.humanize_time(self.time_analyst))

        images_html = images_template.format(
            CommonUtils.humanize_time(self.time_analyst),
            CommonUtils.humanize_time(self.time_mAdvisor))

        # The image blocks come first, then the paragraph.
        combined_html = wrapper_template.format(images_html, intro_html)

        html_card = HtmlData(data=combined_html)
        self.businessCardData.append(html_card)

    def Run(self):
        '''
        Build the "Impact" narrative node and register it with the result
        setter.

        Steps, in order: create a NarrativesTree named "Impact", call
        self.set_params(), let get_summary_data() and get_summary_para()
        append their cards to self.businessCardData, attach that data to
        self.business_card1, add the card to the node and pass the node to
        self._result_setter.set_business_impact_node().
        '''
        print("In Run of BusinessCard")
        self._businessImpactNode = NarrativesTree()
        self._businessImpactNode.set_name("Impact")

        self.set_params()

        # NOTE(review): both helpers work by appending to
        # self.businessCardData and have no return statement, so `summary`
        # and `summary_para` are None here.
        summary = self.get_summary_data()
        summary_para = self.get_summary_para()

        self.business_card1.set_card_data(self.businessCardData)
        self._businessImpactNode.add_a_card(self.business_card1)
        self._result_setter.set_business_impact_node(self._businessImpactNode)