Python TableData.set_table_data 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: bi.common

클래스/타입: TableData

메소드/함수: set_table_data

hotexamples.com에서의 예제들: 7

Python TableData.set_table_data - 예제 7개를 찾았습니다. 오픈소스 프로젝트에서 추출한 Python bi.common.TableData.set_table_data의 실제 사용 예제 가운데 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 개선하는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

TableData(8)

set_table_data(7)

set_table_type(7)

set_table_width(3)

set_table_top_header(1)

예제 #1

파일 보기

    def generate_narratives(self):
        """Build the regression narrative cards and attach them to the story.

        A "Key Influencers" main card is created from the
        ``regression_main_card.html`` template together with a bar chart of
        the coefficients in ``self._all_coeffs``.  Then, for every column in
        ``self.significant_measures``, a node is created that holds an impact
        card and -- when ``self._run_dimension_level_regression`` is set -- a
        "Key Areas where it Matters" card with up to two heat-map tables.

        Results are written to ``self.narratives``, ``self._regressionNode``
        and ``self._story_narrative``; the executive summary is updated from
        the first significant measure.  Nothing is returned.

        Raises:
            IndexError: if ``self._all_coeffs`` is empty, because the largest
                and smallest coefficients are read unconditionally.
        """
        regression_narrative_obj = LinearRegressionNarrative(
                                    self._df_regression_result,
                                    self._correlations,
                                    self._dataframe_helper,
                                    self._dataframe_context,
                                    self._metaParser,
                                    self._spark
                                    )
        main_card_data = regression_narrative_obj.generate_main_card_data()
        main_card_narrative = NarrativesUtils.get_template_output(self._base_dir,\
                                                        'regression_main_card.html',main_card_data)

        # Dict-based representation of the main card kept on self.narratives.
        self.narratives['main_card'] = {}
        self.narratives["main_card"]['paragraphs'] = NarrativesUtils.paragraph_splitter(main_card_narrative)
        self.narratives["main_card"]['header'] = 'Key Measures that affect ' + self.result_column
        self.narratives["main_card"]['chart'] = {}
        self.narratives["main_card"]['chart']['heading'] = ''
        self.narratives["main_card"]['chart']['data'] = [[i for i,j in self._all_coeffs],
                                                         [j['coefficient'] for i,j in self._all_coeffs]]
        self.narratives["main_card"]['chart']['label'] = {'x':'Measure Name',
                                                            'y': 'Change in ' + self.result_column + ' per unit increase'}

        # Card-object representation of the same main card.
        main_card = NormalCard()
        main_card_header = HtmlData(data = '<h3>Key Measures that affect ' + self.result_column+"</h3>")
        main_card_paragraphs = NarrativesUtils.block_splitter(main_card_narrative,self._blockSplitter)
        main_card_chart_data = [{"key":name,"value":stats['coefficient']} for name,stats in self._all_coeffs]
        main_card_chart = NormalChartData(data=main_card_chart_data)
        mainCardChartJson = ChartJson()
        mainCardChartJson.set_data(main_card_chart.get_data())
        mainCardChartJson.set_label_text({'x':'Influencing Factors','y': 'Change in ' + self.result_column + ' per unit increase'})
        mainCardChartJson.set_chart_type("bar")
        mainCardChartJson.set_axes({"x":"key","y":"value"})
        mainCardChartJson.set_yaxis_number_format(".2f")

        # Sorted by coefficient, largest first, so [0] / [-1] are the extremes.
        chart_data = sorted(main_card_chart_data,key=lambda x:x["value"],reverse=True)
        statistical_info_array=[
            ("Test Type","Regression"),
            ("Effect Size","Coefficients"),
            ("Max Effect Size",chart_data[0]["key"]),
            ("Min Effect Size",chart_data[-1]["key"]),
            ]
        statistical_inference = ""
        if len(chart_data) == 1:
            statistical_inference = "{} is the only variable that have significant influence over {} (Target) having an \
             Effect size of {}".format(chart_data[0]["key"],self._dataframe_context.get_result_column(),round(chart_data[0]["value"],4))
        elif len(chart_data) == 2:
            statistical_inference = "There are two variables ({} and {}) that have significant influence over {} (Target) and the \
             Effect size ranges are {} and {} respectively".format(chart_data[0]["key"],chart_data[1]["key"],self._dataframe_context.get_result_column(),round(chart_data[0]["value"],4),round(chart_data[1]["value"],4))
        else:
            statistical_inference = "There are {} variables that have significant influence over {} (Target) and the \
             Effect size ranges from {} to {}".format(len(chart_data),self._dataframe_context.get_result_column(),round(chart_data[0]["value"],4),round(chart_data[-1]["value"],4))
        if statistical_inference != "":
            statistical_info_array.append(("Inference",statistical_inference))
        statistical_info_array = NarrativesUtils.statistical_info_array_formatter(statistical_info_array)
        main_card.set_card_data(data = [main_card_header]+main_card_paragraphs+[C3ChartData(data=mainCardChartJson,info=statistical_info_array)])
        main_card.set_card_name("Key Influencers")
        self._regressionNode.add_a_card(main_card)

        for count, measure_column in enumerate(self.significant_measures):
            sigMeasureNode = NarrativesTree()
            sigMeasureNode.set_name(measure_column)
            measureCard1 = NormalCard()
            measureCard1.set_card_name("{}: Impact on {}".format(measure_column,self.result_column))
            measureCard1Data = []
            if self._run_dimension_level_regression:
                measureCard2 = NormalCard()
                measureCard2.set_card_name("Key Areas where it Matters")
                measureCard2Data = []

            measure_column_cards = {}
            card1data = regression_narrative_obj.generate_card1_data(measure_column)
            card1heading = "<h3>Impact of "+measure_column+" on "+self.result_column+"</h3>"
            card1data.update({"blockSplitter":self._blockSplitter})
            card1narrative = NarrativesUtils.get_template_output(self._base_dir,\
                                                            'regression_card1.html',card1data)

            card1paragraphs = NarrativesUtils.block_splitter(card1narrative,self._blockSplitter)
            card0 = {"paragraphs":card1paragraphs}
            card0["charts"] = {}
            card0['charts']['chart2']={}
            card0['charts']['chart1']={}
            card0["heading"] = card1heading
            measure_column_cards['card0'] = card0

            measureCard1Header = HtmlData(data=card1heading)
            measureCard1Data += [measureCard1Header]
            measureCard1Data += card1paragraphs

            if self._run_dimension_level_regression:
                print("running narratives for key area dict")
                # NOTE(review): this is recomputed for every measure although
                # it takes no per-measure argument -- confirm whether it can
                # be hoisted out of the loop.
                self._dim_regression = self.run_regression_for_dimension_levels()
                card2table, card2data=regression_narrative_obj.generate_card2_data(measure_column,self._dim_regression)
                card2data.update({"blockSplitter":self._blockSplitter})
                card2narrative = NarrativesUtils.get_template_output(self._base_dir,\
                                                            'regression_card2.html',card2data)
                card2paragraphs = NarrativesUtils.block_splitter(card2narrative,self._blockSplitter)

                card1 = {'tables': card2table, 'paragraphs' : card2paragraphs,
                        'heading' : 'Key Areas where ' + measure_column + ' matters'}
                measure_column_cards['card1'] = card1

                measureCard2Data += card2paragraphs
                if "table1" in card2table:
                    table1data = regression_narrative_obj.convert_table_data(card2table["table1"])
                    card2Table1 = TableData()
                    card2Table1.set_table_data(table1data)
                    card2Table1.set_table_type("heatMap")
                    card2Table1.set_table_top_header(card2table["table1"]["heading"])
                    card2Table1Json = json.loads(CommonUtils.convert_python_object_to_json(card2Table1))
                    # The first table is placed at index 3 of the card data.
                    measureCard2Data.insert(3,card2Table1Json)

                if "table2" in card2table:
                    table2data = regression_narrative_obj.convert_table_data(card2table["table2"])
                    card2Table2 = TableData()
                    card2Table2.set_table_data(table2data)
                    card2Table2.set_table_type("heatMap")
                    card2Table2.set_table_top_header(card2table["table2"]["heading"])
                    card2Table2Json = json.loads(CommonUtils.convert_python_object_to_json(card2Table2))
                    measureCard2Data.append(card2Table2Json)

            progressMessage = CommonUtils.create_progress_message_object(self._analysisName,"custom","info","Analyzing Key Influencers",self._completionStatus,self._completionStatus,display=True)
            CommonUtils.save_progress_message(self._messageURL,progressMessage,ignore=False)
            card4data = regression_narrative_obj.generate_card4_data(self.result_column,measure_column)
            card4data.update({"blockSplitter":self._blockSplitter})
            card4narrative = NarrativesUtils.get_template_output(self._base_dir,\
                                                                'regression_card4.html',card4data)
            card4paragraphs = NarrativesUtils.block_splitter(card4narrative,self._blockSplitter)
            card0['paragraphs'] = card1paragraphs+card4paragraphs
            card4Chart = card4data["charts"]
            statistical_info_array=[
                ("Test Type","Regression"),
                ("Coefficient",str(round(self._df_regression_result.get_coeff(measure_column),2))),
                ("P-Value","<= 0.05"),
                ("Intercept",str(round(self._df_regression_result.get_intercept(),2))),
                ("R Square ",str(round(self._df_regression_result.get_rsquare(),2))),
                ]
            coeff = self._df_regression_result.get_coeff(measure_column)
            # NOTE(review): for coeff <= 0 the sentence reports a "decrease of
            # <non-positive number>" -- confirm the intended wording.
            if coeff > 0:
                inferenceTuple = ("Inference","For every additional unit of increase in {} there will be an increase of {} units in {} (target).".format(measure_column,str(round(coeff,2)),self._dataframe_context.get_result_column()))
            else:
                inferenceTuple = ("Inference","For every additional unit of decrease in {} there will be an decrease of {} units in {} (target).".format(measure_column,str(round(coeff,2)),self._dataframe_context.get_result_column()))
            statistical_info_array.append(inferenceTuple)
            statistical_info_array = NarrativesUtils.statistical_info_array_formatter(statistical_info_array)

            # The chart is placed at index 2 of the card-4 paragraphs.
            card4paragraphs.insert(2,C3ChartData(data=card4Chart,info=statistical_info_array))
            measureCard1Data += card4paragraphs

            self.narratives['cards'].append(measure_column_cards)

            # Only the first significant measure feeds the executive summary;
            # its chart payload is dropped before it is handed over.
            if count == 0:
                card4data.pop("charts")
                self._result_setter.update_executive_summary_data(card4data)
            measureCard1.set_card_data(measureCard1Data)
            if self._run_dimension_level_regression:
                measureCard2.set_card_data(measureCard2Data)
                sigMeasureNode.add_cards([measureCard1,measureCard2])
            else:
                # Without the else, measureCard1 was added a second time
                # whenever dimension-level regression was enabled.
                sigMeasureNode.add_cards([measureCard1])
            self._regressionNode.add_a_node(sigMeasureNode)
        self._story_narrative.add_a_node(self._regressionNode)

예제 #2

파일 보기

    def Predict(self):
        """Score the input data with the trained model and build result cards.

        Steps, in order:

        1. Publish the "initialization" progress message.
        2. Load the pickled model, dummy-encode the data and predict class and
           probability for every row (``sklearn`` environment only).
        3. Write the scored data to ``<score path>/data.csv`` and, when a
           unique-identifier column is available, register a table holding the
           top five rows per predicted class.
        4. Publish the "prediction" progress message.
        5. If at least two classes were predicted, run the decision-tree
           analysis on the scored data; otherwise build a single summary card
           with a donut chart of the level counts.

        Nothing is returned; results are handed to ``self._result_setter``,
        ``self._metaParser`` and ``self._dataframe_context``.

        Raises:
            NotImplementedError: if ``self._mlEnv`` is not ``"sklearn"``.
        """
        self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight(
        )
        self._scriptStages = {
            "initialization": {
                "summary": "Initialized the Random Forest Scripts",
                "weight": 2
            },
            "prediction": {
                "summary": "Random Forest Model Prediction Finished",
                "weight": 2
            },
            "frequency": {
                "summary": "descriptive analysis finished",
                "weight": 2
            },
            "chisquare": {
                "summary": "chi Square analysis finished",
                "weight": 4
            },
            "completion": {
                "summary": "all analysis finished",
                "weight": 4
            },
        }

        self._completionStatus += old_div(
            self._scriptWeightDict[self._analysisName]["total"] *
            self._scriptStages["initialization"]["weight"], 10)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "initialization",\
                                    "info",\
                                    self._scriptStages["initialization"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsg)
        self._dataframe_context.update_completion_status(
            self._completionStatus)
        level_counts_train = self._dataframe_context.get_level_count_dict()
        cat_cols = self._dataframe_helper.get_string_columns()
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()
        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})
        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns) - set(allDateCols))
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        result_column = self._dataframe_context.get_result_column()
        test_data_path = self._dataframe_context.get_input_file()

        if self._mlEnv != "sklearn":
            # Only the sklearn path builds df, score and score_data_path; any
            # other environment used to fail later with an unbound-variable
            # error, so fail here with an explicit message instead.
            raise NotImplementedError(
                "Predict only supports the 'sklearn' environment, got {!r}".
                format(self._mlEnv))

        # Paths starting with "file" have their first 7 characters stripped
        # (presumably a "file://" scheme prefix -- confirm with the caller).
        score_data_path = self._dataframe_context.get_score_path(
        ) + "/data.csv"
        if score_data_path.startswith("file"):
            score_data_path = score_data_path[7:]
        trained_model_path = self._dataframe_context.get_model_path()
        trained_model_path += "/" + self._dataframe_context.get_model_for_scoring(
        ) + ".pkl"
        if trained_model_path.startswith("file"):
            trained_model_path = trained_model_path[7:]
        score_summary_path = self._dataframe_context.get_score_path(
        ) + "/Summary/summary.json"
        if score_summary_path.startswith("file"):
            score_summary_path = score_summary_path[7:]
        # joblib.load unpickles arbitrary objects; the model file must come
        # from a trusted location.
        trained_model = joblib.load(trained_model_path)
        df = self._data_frame.toPandas()
        model_columns = self._dataframe_context.get_model_features()
        pandas_df = MLUtils.create_dummy_columns(
            df, [x for x in categorical_columns if x != result_column])
        pandas_df = MLUtils.fill_missing_columns(pandas_df, model_columns,
                                                 result_column)
        if uid_col:
            pandas_df = pandas_df[[
                x for x in pandas_df.columns if x != uid_col
            ]]
        y_score = trained_model.predict(pandas_df)
        y_prob = trained_model.predict_proba(pandas_df)
        y_prob = MLUtils.calculate_predicted_probability(y_prob)
        y_prob = list([round(x, 2) for x in y_prob])
        score = {
            "predicted_class": y_score,
            "predicted_probability": y_prob
        }

        df["predicted_class"] = score["predicted_class"]
        labelMappingDict = self._dataframe_context.get_label_map()
        df["predicted_class"] = df["predicted_class"].apply(
            lambda x: labelMappingDict[x] if x is not None else "NA")
        df["predicted_probability"] = score["predicted_probability"]
        self._score_summary[
            "prediction_split"] = MLUtils.calculate_scored_probability_stats(
                df)
        self._score_summary["result_column"] = result_column
        # The predicted class replaces any existing result column.
        if result_column in df.columns:
            df.drop(result_column, axis=1, inplace=True)
        df = df.rename(index=str, columns={"predicted_class": result_column})
        df.to_csv(score_data_path, header=True, index=False)
        uidCol = self._dataframe_context.get_uid_column()
        if uidCol is None:
            uidCols = self._metaParser.get_suggested_uid_columns()
            if len(uidCols) > 0:
                uidCol = uidCols[0]
        uidTableData = []
        predictedClasses = list(df[result_column].unique())
        if uidCol:
            if uidCol in df.columns:
                for level in predictedClasses:
                    levelDf = df[df[result_column] == level]
                    # sort_values returns a new frame, so the column
                    # assignment below never writes through a slice of df.
                    levelDf = levelDf[[
                        uidCol, "predicted_probability", result_column
                    ]].sort_values(by="predicted_probability",
                                   ascending=False)
                    levelDf["predicted_probability"] = levelDf[
                        "predicted_probability"].apply(
                            lambda x: humanize.apnumber(x * 100) + "%"
                            if x * 100 >= 10 else str(int(x * 100)) + "%")
                    # Keep the five most probable rows of each class.
                    uidTableData.append(levelDf[:5])
                uidTableData = pd.concat(uidTableData)
                uidTableData = [list(arr) for arr in list(uidTableData.values)]
                uidTableData = [[uidCol, "Probability", result_column]
                                ] + uidTableData
                uidTable = TableData()
                uidTable.set_table_width(25)
                uidTable.set_table_data(uidTableData)
                uidTable.set_table_type("normalHideColumn")
                self._result_setter.set_unique_identifier_table(
                    json.loads(
                        CommonUtils.convert_python_object_to_json(uidTable)))

        self._completionStatus += old_div(
            self._scriptWeightDict[self._analysisName]["total"] *
            self._scriptStages["prediction"]["weight"], 10)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "prediction",\
                                    "info",\
                                    self._scriptStages["prediction"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsg)
        self._dataframe_context.update_completion_status(
            self._completionStatus)

        print("STARTING DIMENSION ANALYSIS ...")
        columns_to_drop = []

        # When an explicit column list is configured everything else is
        # dropped; otherwise only the probability column is dropped.
        columns_to_keep = self._dataframe_context.get_score_consider_columns()
        if len(columns_to_keep) > 0:
            columns_to_drop = list(set(df.columns) - set(columns_to_keep))
        else:
            columns_to_drop += ["predicted_probability"]
        columns_to_drop = [
            x for x in columns_to_drop
            if x in df.columns and x != result_column
        ]
        print("columns_to_drop", columns_to_drop)
        df.drop(columns_to_drop, axis=1, inplace=True)

        resultColLevelCount = dict(df[result_column].value_counts())
        self._metaParser.update_column_dict(
            result_column, {
                "LevelCount": resultColLevelCount,
                "numberOfUniqueValues": len(list(resultColLevelCount.keys()))
            })
        self._dataframe_context.set_story_on_scored_data(True)
        SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                            sparkSession=self._spark)
        spark_scored_df = SQLctx.createDataFrame(df)
        # TODO update metadata for the newly created dataframe
        self._dataframe_context.update_consider_columns(columns_to_keep)
        df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context,
                                    self._metaParser)
        df_helper.set_params()
        spark_scored_df = df_helper.get_data_frame()

        if len(predictedClasses) >= 2:
            try:
                df_decision_tree_obj = DecisionTrees(
                    spark_scored_df,
                    df_helper,
                    self._dataframe_context,
                    self._spark,
                    self._metaParser,
                    scriptWeight=self._scriptWeightDict,
                    analysisName=self._analysisName).test_all(
                        dimension_columns=[result_column])
                narratives_obj = CommonUtils.as_dict(
                    DecisionTreeNarrative(result_column,
                                          df_decision_tree_obj,
                                          self._dataframe_helper,
                                          self._dataframe_context,
                                          self._metaParser,
                                          self._result_setter,
                                          story_narrative=None,
                                          analysisName=self._analysisName,
                                          scriptWeight=self._scriptWeightDict))
                print(narratives_obj)
            except Exception as e:
                # Best-effort analysis: report the reason and carry on, but
                # let KeyboardInterrupt / SystemExit propagate.
                print("DecisionTree Analysis Failed ", e)
        else:
            data_dict = {
                "npred": len(predictedClasses),
                "nactual": len(list(labelMappingDict.values()))
            }
            if data_dict["nactual"] > 2:
                # The dict must exist before it is filled; previously this
                # branch raised UnboundLocalError on the first assignment.
                levelCountDict = {}
                levelCountDict[predictedClasses[0]] = resultColLevelCount[
                    predictedClasses[0]]
                levelCountDict["Others"] = sum([
                    v for k, v in list(resultColLevelCount.items())
                    if k != predictedClasses[0]
                ])
            else:
                # Same object as resultColLevelCount: the zero-count entry
                # added below is visible through both names.
                levelCountDict = resultColLevelCount
                otherClass = list(
                    set(labelMappingDict.values()) - set(predictedClasses))[0]
                levelCountDict[otherClass] = 0

                print(levelCountDict)

            total = float(
                sum([x for x in list(levelCountDict.values()) if x is not None]))
            levelCountTuple = [({
                "name":
                k,
                "count":
                v,
                "percentage":
                humanize.apnumber(old_div(v * 100, total)) +
                "%" if old_div(v * 100, total) >= 10 else
                str(int(old_div(v * 100, total))) + "%"
            }) for k, v in list(levelCountDict.items()) if v is not None]
            levelCountTuple = sorted(levelCountTuple,
                                     key=lambda x: x["count"],
                                     reverse=True)
            data_dict["blockSplitter"] = "|~NEWBLOCK~|"
            data_dict["targetcol"] = result_column
            data_dict["nlevel"] = len(list(levelCountDict.keys()))
            data_dict["topLevel"] = levelCountTuple[0]
            data_dict["secondLevel"] = levelCountTuple[1]
            maincardSummary = NarrativesUtils.get_template_output(
                "/apps/", 'scorewithoutdtree.html', data_dict)

            main_card = NormalCard()
            main_card_data = []
            main_card_narrative = NarrativesUtils.block_splitter(
                maincardSummary, "|~NEWBLOCK~|")
            main_card_data += main_card_narrative

            chartData = NormalChartData([levelCountDict]).get_data()
            chartJson = ChartJson(data=chartData)
            chartJson.set_title(result_column)
            chartJson.set_chart_type("donut")
            mainCardChart = C3ChartData(data=chartJson)
            mainCardChart.set_width_percent(33)
            main_card_data.append(mainCardChart)

            uidTable = self._result_setter.get_unique_identifier_table()
            if uidTable is not None:
                main_card_data.append(uidTable)
            main_card.set_card_data(main_card_data)
            main_card.set_card_name(
                "Predicting Key Drivers of {}".format(result_column))
            self._result_setter.set_score_dtree_cards([main_card], {})

예제 #3

파일 보기

파일: naive_bayes_pyspark.py 프로젝트: Srinidhi-SA/mAdvisordevSpark

    def Predict(self):
        """Score ``self._data_frame`` with a saved pipeline and publish the results.

        Steps, in order:

        1. Report the "initialization" progress stage.
        2. Load the trained pipeline, transform the data frame and map the
           numeric prediction back to the label levels.
        3. Write the scored rows to ``<score path>/data.csv`` and fill
           ``self._score_summary``.
        4. When a unique-identifier column and a predicted probability are
           available, build a table holding the five most probable rows of
           every predicted class and hand it to the result setter.
        5. Report the "prediction" progress stage.
        6. Run the decision-tree analysis when at least two classes were
           predicted; otherwise build a single summary card (narrative,
           donut chart and, if present, the identifier table).

        Returns:
            None. Output is delivered through ``self._result_setter``,
            ``self._score_summary``, ``self._metaParser``,
            ``self._dataframe_context`` and the CSV file written to disk.
        """
        self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight(
        )
        # Relative weight of every stage; progress is advanced by
        # total * weight / 10 when a stage finishes.
        self._scriptStages = {
            "initialization": {
                "summary": "Initialized the Naive Bayes Scripts",
                "weight": 2
            },
            "prediction": {
                "summary": "Spark ML Naive Bayes Model Prediction Finished",
                "weight": 2
            },
            "frequency": {
                "summary": "descriptive analysis finished",
                "weight": 2
            },
            "chisquare": {
                "summary": "chi Square analysis finished",
                "weight": 4
            },
            "completion": {
                "summary": "all analysis finished",
                "weight": 4
            },
        }

        self._completionStatus += self._scriptWeightDict[self._analysisName][
            "total"] * self._scriptStages["initialization"]["weight"] / 10
        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName,
            "initialization",
            "info",
            self._scriptStages["initialization"]["summary"],
            self._completionStatus,
            self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL, progressMessage)
        self._dataframe_context.update_completion_status(
            self._completionStatus)

        SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                            sparkSession=self._spark)
        dataSanity = True
        level_counts_train = self._dataframe_context.get_level_count_dict()
        categorical_columns = self._dataframe_helper.get_string_columns()
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        time_dimension_columns = self._dataframe_helper.get_timestamp_columns()
        result_column = self._dataframe_context.get_result_column()
        categorical_columns = [
            x for x in categorical_columns if x != result_column
        ]

        # Compare the level counts seen at training time with the ones in
        # the data being scored. NOTE(review): dataSanity is computed but
        # never read afterwards in this method.
        level_counts_score = CommonUtils.get_level_count_dict(
            self._data_frame,
            categorical_columns,
            self._dataframe_context.get_column_separator(),
            output_type="dict",
            dataType="spark")
        for key in level_counts_train:
            if key in level_counts_score:
                if level_counts_train[key] != level_counts_score[key]:
                    dataSanity = False
            else:
                dataSanity = False

        test_data_path = self._dataframe_context.get_input_file()
        score_data_path = self._dataframe_context.get_score_path(
        ) + "/data.csv"
        # Replace the last path component of the model path with
        # "<slug>/<model chosen for scoring>".
        trained_model_path = self._dataframe_context.get_model_path()
        trained_model_path = "/".join(
            trained_model_path.split("/")[:-1]
        ) + "/" + self._slug + "/" + self._dataframe_context.get_model_for_scoring(
        )

        pipelineModel = MLUtils.load_pipeline(trained_model_path)

        df = self._data_frame
        transformed = pipelineModel.transform(df)
        label_indexer_dict = MLUtils.read_string_indexer_mapping(
            trained_model_path, SQLctx)
        # Translate the numeric prediction back into its label.
        prediction_to_levels = udf(lambda x: label_indexer_dict[x],
                                   StringType())
        transformed = transformed.withColumn(
            result_column, prediction_to_levels(transformed.prediction))

        output_columns = (categorical_columns + time_dimension_columns +
                          numerical_columns)
        if "probability" in transformed.columns:
            probability_dataframe = transformed.select(
                [result_column, "probability"]).toPandas()
            probability_dataframe = probability_dataframe.rename(
                index=str, columns={result_column: "predicted_class"})
            # The predicted probability is the largest entry of the
            # per-class probability vector.
            probability_dataframe[
                "predicted_probability"] = probability_dataframe[
                    "probability"].apply(lambda x: max(x))
            self._score_summary[
                "prediction_split"] = MLUtils.calculate_scored_probability_stats(
                    probability_dataframe)
            self._score_summary["result_column"] = result_column
            scored_dataframe = transformed.select(
                output_columns + [result_column, "probability"]).toPandas()
            scored_dataframe['predicted_probability'] = probability_dataframe[
                "predicted_probability"].values
        else:
            self._score_summary["prediction_split"] = []
            self._score_summary["result_column"] = result_column
            scored_dataframe = transformed.select(
                output_columns + [result_column]).toPandas()

        labelMappingDict = self._dataframe_context.get_label_map()
        if score_data_path.startswith("file"):
            # Drop the first 7 characters (the length of "file://") so
            # pandas receives a plain filesystem path.
            score_data_path = score_data_path[7:]
        scored_dataframe.to_csv(score_data_path, header=True, index=False)

        uidCol = self._dataframe_context.get_uid_column()
        if uidCol is None:
            uidCols = self._metaParser.get_suggested_uid_columns()
            if len(uidCols) > 0:
                uidCol = uidCols[0]
        uidTableData = []
        predictedClasses = list(scored_dataframe[result_column].unique())
        # The identifier table needs the predicted probability, which only
        # exists when the model produced a "probability" column above.
        has_probability = "predicted_probability" in scored_dataframe.columns
        if uidCol and uidCol in df.columns and has_probability:
            for level in predictedClasses:
                levelDf = scored_dataframe[scored_dataframe[result_column]
                                           == level]
                # sort_values returns a new frame, so the column assignment
                # below does not write into a slice of scored_dataframe.
                levelDf = levelDf[[
                    uidCol, "predicted_probability", result_column
                ]].sort_values(by="predicted_probability", ascending=False)
                levelDf["predicted_probability"] = levelDf[
                    "predicted_probability"].apply(
                        lambda x: humanize.apnumber(x * 100) + "%"
                        if x * 100 >= 10 else str(int(x * 100)) + "%")
                # Keep the five most probable rows of this class.
                uidTableData.append(levelDf[:5])
            uidTableData = pd.concat(uidTableData)
            uidTableData = [list(arr) for arr in list(uidTableData.values)]
            uidTableData = [[uidCol, "Probability", result_column]
                            ] + uidTableData
            uidTable = TableData()
            uidTable.set_table_width(25)
            uidTable.set_table_data(uidTableData)
            uidTable.set_table_type("normalHideColumn")
            self._result_setter.set_unique_identifier_table(
                json.loads(
                    CommonUtils.convert_python_object_to_json(uidTable)))

        self._completionStatus += self._scriptWeightDict[self._analysisName][
            "total"] * self._scriptStages["prediction"]["weight"] / 10
        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName,
            "prediction",
            "info",
            self._scriptStages["prediction"]["summary"],
            self._completionStatus,
            self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL, progressMessage)
        self._dataframe_context.update_completion_status(
            self._completionStatus)

        print("STARTING DIMENSION ANALYSIS ...")
        columns_to_drop = []

        columns_to_keep = self._dataframe_context.get_score_consider_columns()

        if len(columns_to_keep) > 0:
            columns_to_drop = list(set(df.columns) - set(columns_to_keep))
        else:
            columns_to_drop += ["predicted_probability"]

        scored_df = transformed.select(output_columns + [result_column])
        columns_to_drop = [
            x for x in columns_to_drop if x in scored_df.columns
        ]
        modified_df = scored_df.select(
            [x for x in scored_df.columns if x not in columns_to_drop])
        # Number of scored rows per predicted level.
        resultColLevelCount = dict(
            modified_df.groupby(result_column).count().collect())
        self._metaParser.update_column_dict(
            result_column, {
                "LevelCount": resultColLevelCount,
                "numberOfUniqueValues": len(resultColLevelCount.keys())
            })
        self._dataframe_context.set_story_on_scored_data(True)

        self._dataframe_context.update_consider_columns(columns_to_keep)
        df_helper = DataFrameHelper(modified_df, self._dataframe_context,
                                    self._metaParser)
        df_helper.set_params()
        spark_scored_df = df_helper.get_data_frame()

        if len(predictedClasses) >= 2:
            # Best effort: a failing decision-tree analysis is reported on
            # stdout and does not abort the scoring run.
            try:
                df_decision_tree_obj = DecisionTrees(
                    spark_scored_df,
                    df_helper,
                    self._dataframe_context,
                    self._spark,
                    self._metaParser,
                    scriptWeight=self._scriptWeightDict,
                    analysisName=self._analysisName).test_all(
                        dimension_columns=[result_column])
                narratives_obj = CommonUtils.as_dict(
                    DecisionTreeNarrative(result_column,
                                          df_decision_tree_obj,
                                          self._dataframe_helper,
                                          self._dataframe_context,
                                          self._metaParser,
                                          self._result_setter,
                                          story_narrative=None,
                                          analysisName=self._analysisName,
                                          scriptWeight=self._scriptWeightDict))
                print(narratives_obj)
            except Exception as e:
                print("DecisionTree Analysis Failed ", str(e))
        else:
            data_dict = {
                "npred": len(predictedClasses),
                "nactual": len(labelMappingDict.values())
            }

            if data_dict["nactual"] > 2:
                # The dict must exist before it is filled by key; without
                # this initialisation the branch raised UnboundLocalError.
                levelCountDict = {}
                levelCountDict[predictedClasses[0]] = resultColLevelCount[
                    predictedClasses[0]]
                levelCountDict["Others"] = sum([
                    v for k, v in resultColLevelCount.items()
                    if k != predictedClasses[0]
                ])
            else:
                # NOTE(review): this aliases resultColLevelCount, so the
                # zero-count entry added below is visible through both names.
                levelCountDict = resultColLevelCount
                # NOTE(review): assumes the label map holds a level that was
                # not predicted; otherwise the [0] lookup raises IndexError.
                otherClass = list(
                    set(labelMappingDict.values()) - set(predictedClasses))[0]
                levelCountDict[otherClass] = 0

                print(levelCountDict)

            total = float(
                sum([x for x in levelCountDict.values() if x is not None]))
            levelCountTuple = [({
                "name":
                k,
                "count":
                v,
                "percentage":
                humanize.apnumber(v * 100 / total) + "%"
            }) for k, v in levelCountDict.items() if v is not None]
            levelCountTuple = sorted(levelCountTuple,
                                     key=lambda x: x["count"],
                                     reverse=True)
            data_dict["blockSplitter"] = "|~NEWBLOCK~|"
            data_dict["targetcol"] = result_column
            data_dict["nlevel"] = len(levelCountDict.keys())
            data_dict["topLevel"] = levelCountTuple[0]
            data_dict["secondLevel"] = levelCountTuple[1]
            maincardSummary = NarrativesUtils.get_template_output(
                "/apps/", 'scorewithoutdtree.html', data_dict)

            main_card = NormalCard()
            main_card_data = []
            main_card_narrative = NarrativesUtils.block_splitter(
                maincardSummary, "|~NEWBLOCK~|")
            main_card_data += main_card_narrative

            chartData = NormalChartData([levelCountDict]).get_data()
            chartJson = ChartJson(data=chartData)
            chartJson.set_title(result_column)
            chartJson.set_chart_type("donut")
            mainCardChart = C3ChartData(data=chartJson)
            mainCardChart.set_width_percent(33)
            main_card_data.append(mainCardChart)

            uidTable = self._result_setter.get_unique_identifier_table()
            if uidTable is not None:
                main_card_data.append(uidTable)
            main_card.set_card_data(main_card_data)
            main_card.set_card_name(
                "Predicting Key Drivers of {}".format(result_column))
            self._result_setter.set_score_dtree_cards([main_card], {})

예제 #4

파일 보기

파일: chisquare.py 프로젝트: Srinidhi-SA/Madvisorhadoop

    def _generate_narratives(self):
        chisquare_result = self._chisquare_result
        target_dimension = self._target_dimension
        analysed_dimension = self._analysed_dimension
        significant_variables = self._significant_variables
        num_analysed_variables = self._num_analysed_variables
        table = self._chiSquareTable
        total = self._chiSquareTable.get_total()

        levels = self._chiSquareTable.get_column_two_levels()
        level_counts = self._chiSquareTable.get_column_total()
        levels_count_sum = sum(level_counts)
        levels_percentages = [
            i * 100.0 / levels_count_sum for i in level_counts
        ]
        sorted_levels = sorted(zip(level_counts, levels), reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]

        top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        bottom_dim = sorted_levels[-1][1]
        bottom_dim_contribution = sorted_levels[-1][0]
        bottom_dims = [
            y for x, y in sorted_levels if x == bottom_dim_contribution
        ]

        target_levels = self._chiSquareTable.get_column_one_levels()

        target_counts = self._chiSquareTable.get_row_total()
        sorted_target_levels = sorted(zip(target_counts, target_levels),
                                      reverse=True)

        top_target_count, top_target = sorted_target_levels[0]
        second_target_count, second_target = sorted_target_levels[1]

        top_target_contributions = [
            table.get_value(top_target, i) for i in levels
        ]
        sum_top_target = sum(top_target_contributions)

        sorted_levels = sorted(zip(top_target_contributions, levels),
                               reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]
        top_target_top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        top_target_top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        top_target_bottom_dim = sorted_levels[-1][1]
        top_target_bottom_dim_contribution = sorted_levels[-1][0]

        top_target_percentages = [
            i * 100.0 / sum_top_target for i in top_target_contributions
        ]
        best_top_target_index = top_target_contributions.index(
            max(top_target_contributions))
        worst_top_target_index = top_target_contributions.index(
            min(top_target_contributions))
        top_target_differences = [
            x - y for x, y in zip(levels_percentages, top_target_percentages)
        ]
        if len(top_target_differences) > 6:
            tops = 2
            bottoms = -2
        elif len(top_target_differences) > 4:
            tops = 2
            bottoms = -1
        else:
            tops = 1
            bottoms = -1
        sorted_ = sorted(enumerate(top_target_differences),
                         key=lambda x: x[1],
                         reverse=True)
        best_top_difference_indices = [x for x, y in sorted_[:tops]]
        worst_top_difference_indices = [x for x, y in sorted_[bottoms:]]

        top_target_shares = [
            x * 100.0 / y
            for x, y in zip(top_target_contributions, level_counts)
        ]
        max_top_target_shares = max(top_target_shares)
        best_top_target_share_index = [
            idx for idx, val in enumerate(top_target_shares)
            if val == max_top_target_shares
        ]
        level_counts_threshold = sum(level_counts) * 0.05 / len(level_counts)
        min_top_target_shares = min([
            x for x, y in zip(top_target_shares, level_counts)
            if y >= level_counts_threshold
        ])
        worst_top_target_share_index = [
            idx for idx, val in enumerate(top_target_shares)
            if val == min_top_target_shares
        ]
        overall_top_percentage = sum_top_target * 100.0 / total

        second_target_contributions = [
            table.get_value(second_target, i) for i in levels
        ]
        sum_second_target = sum(second_target_contributions)

        sorted_levels = sorted(zip(second_target_contributions, levels),
                               reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]
        second_target_top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        second_target_top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        second_target_bottom_dim = sorted_levels[-1][1]
        second_target_bottom_dim_contribution = sorted_levels[-1][0]

        second_target_percentages = [
            i * 100.0 / sum_second_target for i in second_target_contributions
        ]
        best_second_target_index = second_target_contributions.index(
            max(second_target_contributions))
        worst_second_target_index = second_target_contributions.index(
            min(second_target_contributions))
        second_target_differences = [
            x - y
            for x, y in zip(levels_percentages, second_target_percentages)
        ]
        if len(second_target_differences) > 6:
            tops = 2
            bottoms = -2
        elif len(second_target_differences) > 4:
            tops = 2
            bottoms = -1
        else:
            tops = 1
            bottoms = -1
        sorted_ = sorted(enumerate(second_target_differences),
                         key=lambda x: x[1],
                         reverse=True)
        best_second_difference_indices = [x for x, y in sorted_[:tops]]
        worst_second_difference_indices = [x for x, y in sorted_[bottoms:]]

        second_target_shares = [
            x * 100.0 / y
            for x, y in zip(second_target_contributions, level_counts)
        ]
        max_second_target_shares = max(second_target_shares)
        best_second_target_share_index = [
            idx for idx, val in enumerate(second_target_shares)
            if val == max_second_target_shares
        ]
        level_counts_threshold = sum(level_counts) * 0.05 / len(level_counts)
        min_second_target_shares = min([
            x for x, y in zip(second_target_shares, level_counts)
            if y >= level_counts_threshold
        ])
        # worst_second_target_share_index = second_target_shares.index(min_second_target_shares)
        worst_second_target_share_index = [
            idx for idx, val in enumerate(second_target_shares)
            if val == min_second_target_shares
        ]
        overall_second_percentage = sum_second_target * 100.0 / total

        targetCardDataDict = {}
        targetCardDataDict['target'] = target_dimension
        targetCardDataDict['colname'] = analysed_dimension
        targetCardDataDict['num_significant'] = len(significant_variables)
        targetCardDataDict['plural_colname'] = NarrativesUtils.pluralize(
            analysed_dimension)

        targetCardDataDict["blockSplitter"] = self._blockSplitter
        targetCardDataDict["binTargetCol"] = self._binTargetCol
        targetCardDataDict["binAnalyzedCol"] = self._binAnalyzedCol
        targetCardDataDict['highlightFlag'] = self._highlightFlag
        targetCardDataDict['levels'] = levels

        data_dict = {}
        data_dict[
            'best_second_difference'] = best_second_difference_indices  ##these changed
        data_dict['worst_second_difference'] = worst_second_difference_indices
        data_dict['best_top_difference'] = best_top_difference_indices
        data_dict['worst_top_difference'] = worst_top_difference_indices
        data_dict['levels_percentages'] = levels_percentages
        data_dict['top_target_percentages'] = top_target_percentages
        data_dict['second_target_percentages'] = second_target_percentages
        data_dict['levels'] = levels
        data_dict['best_top_share'] = best_top_target_share_index
        data_dict['worst_top_share'] = worst_top_target_share_index
        data_dict['best_second_share'] = best_second_target_share_index
        data_dict['worst_second_share'] = worst_second_target_share_index
        data_dict['top_target_shares'] = top_target_shares
        data_dict['second_target_shares'] = second_target_shares
        data_dict['overall_second'] = overall_second_percentage
        data_dict['overall_top'] = overall_top_percentage

        data_dict['num_significant'] = len(significant_variables)
        data_dict['colname'] = analysed_dimension
        data_dict['plural_colname'] = NarrativesUtils.pluralize(
            analysed_dimension)
        data_dict['target'] = target_dimension
        data_dict['top_levels'] = top_dims
        data_dict['top_levels_percent'] = round(
            top_dims_contribution * 100.0 / total, 1)
        data_dict['bottom_level'] = bottom_dim
        data_dict['bottom_levels'] = bottom_dims
        data_dict['bottom_level_percent'] = round(
            bottom_dim_contribution * 100 / sum(level_counts), 2)
        data_dict['second_target'] = second_target
        data_dict['second_target_top_dims'] = second_target_top_dims
        data_dict[
            'second_target_top_dims_contribution'] = second_target_top_dims_contribution * 100.0 / sum(
                second_target_contributions)
        data_dict['second_target_bottom_dim'] = second_target_bottom_dim
        data_dict[
            'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution
        data_dict['best_second_target'] = levels[best_second_target_index]
        data_dict['best_second_target_count'] = second_target_contributions[
            best_second_target_index]
        data_dict['best_second_target_percent'] = round(
            second_target_contributions[best_second_target_index] * 100.0 /
            sum(second_target_contributions), 2)
        data_dict['worst_second_target'] = levels[worst_second_target_index]
        data_dict['worst_second_target_percent'] = round(
            second_target_contributions[worst_second_target_index] * 100.0 /
            sum(second_target_contributions), 2)

        data_dict['top_target'] = top_target
        data_dict['top_target_top_dims'] = top_target_top_dims
        data_dict[
            'top_target_top_dims_contribution'] = top_target_top_dims_contribution * 100.0 / sum(
                top_target_contributions)
        data_dict['top_target_bottom_dim'] = top_target_bottom_dim
        data_dict[
            'top_target_bottom_dim_contribution'] = top_target_bottom_dim_contribution
        data_dict['best_top_target'] = levels[best_top_target_index]
        data_dict['best_top_target_count'] = top_target_contributions[
            best_top_target_index]
        data_dict['best_top_target_percent'] = round(
            top_target_contributions[best_top_target_index] * 100.0 /
            sum(top_target_contributions), 2)
        data_dict['worst_top_target'] = levels[worst_top_target_index]
        data_dict['worst_top_target_percent'] = round(
            top_target_contributions[worst_top_target_index] * 100.0 /
            sum(top_target_contributions), 2)

        data_dict["blockSplitter"] = self._blockSplitter
        data_dict["binTargetCol"] = self._binTargetCol
        data_dict["binAnalyzedCol"] = self._binAnalyzedCol
        data_dict['highlightFlag'] = self._highlightFlag

        ###############
        #     CARD1   #
        ###############

        print "self._binTargetCol & self._binAnalyzedCol : ", self._binTargetCol, self._binAnalyzedCol
        if (self._binTargetCol == True & self._binAnalyzedCol == False):
            print "Only Target Column is Binned, : ", self._binTargetCol
            output = NarrativesUtils.block_splitter(
                NarrativesUtils.get_template_output(
                    self._base_dir, 'card1_binned_target.html', data_dict),
                self._blockSplitter,
                highlightFlag=self._highlightFlag)
        elif (self._binTargetCol == True & self._binAnalyzedCol == True):
            print "Target Column and IV is Binned : ", self._binTargetCol, self._binAnalyzedCol
            output = NarrativesUtils.block_splitter(
                NarrativesUtils.get_template_output(
                    self._base_dir, 'card1_binned_target_and_IV.html',
                    data_dict),
                self._blockSplitter,
                highlightFlag=self._highlightFlag)
        else:
            output = NarrativesUtils.block_splitter(
                NarrativesUtils.get_template_output(self._base_dir,
                                                    'card1.html', data_dict),
                self._blockSplitter,
                highlightFlag=self._highlightFlag)

        targetDimCard1Data = []
        targetDimcard1Heading = '<h3>Relationship between ' + self._target_dimension + '  and ' + self._analysed_dimension + "</h3>"

        toggledata = ToggleData()

        targetDimTable1Data = self.generate_card1_table1()
        targetDimCard1Table1 = TableData()
        targetDimCard1Table1.set_table_type("heatMap")
        targetDimCard1Table1.set_table_data(targetDimTable1Data)
        toggledata.set_toggleon_data({
            "data": {
                "tableData": targetDimTable1Data,
                "tableType": "heatMap"
            },
            "dataType": "table"
        })

        targetDimTable2Data = self.generate_card1_table2()
        targetDimCard1Table2 = TableData()
        targetDimCard1Table2.set_table_type("normal")
        table2Data = targetDimTable2Data["data1"]
        table2Data = [
            innerList[1:] for innerList in table2Data
            if innerList[0].strip() != ""
        ]
        targetDimCard1Table2.set_table_data(table2Data)

        toggledata.set_toggleoff_data({
            "data": {
                "tableData": table2Data,
                "tableType": "heatMap"
            },
            "dataType": "table"
        })

        targetDimCard1Data.append(HtmlData(data=targetDimcard1Heading))
        targetDimCard1Data.append(toggledata)
        targetDimCard1Data += output

        self._card1.set_card_data(targetDimCard1Data)
        self._card1.set_card_name("{}: Relationship with {}".format(
            self._analysed_dimension, self._target_dimension))

        ###############
        #     CARD2   #
        ###############

        if self._appid == None:

            key_factors = ''
            num_key_factors = len(self._second_level_dimensions)

            if len(self._second_level_dimensions) == 5:
                key_factors = ', '.join(
                    self._second_level_dimensions[:4]
                ) + ' and ' + self._second_level_dimensions[4]
            elif len(self._second_level_dimensions) == 4:
                key_factors = ', '.join(
                    self._second_level_dimensions[:3]
                ) + ' and ' + self._second_level_dimensions[3]
            elif len(self._second_level_dimensions) == 3:
                key_factors = ', '.join(
                    self._second_level_dimensions[:2]
                ) + ' and ' + self._second_level_dimensions[2]
            elif len(self._second_level_dimensions) == 2:
                key_factors = ' and '.join(self._second_level_dimensions)
            elif len(self._second_level_dimensions) == 1:
                key_factors = self._second_level_dimensions[0]

            targetCardDataDict['num_key_factors'] = num_key_factors
            targetCardDataDict['key_factors'] = key_factors
            dict_for_test = {}
            for tupleObj in sorted_target_levels[:self._chiSquareLevelLimit]:
                targetLevel = tupleObj[1]

                targetCardDataDict['random_card2'] = random.randint(1, 100)
                targetCardDataDict['random_card4'] = random.randint(1, 100)

                second_target_contributions = [
                    table.get_value(targetLevel, i) for i in levels
                ]
                sum_second_target = sum(second_target_contributions)

                sorted_levels = sorted(zip(second_target_contributions,
                                           levels),
                                       reverse=True)

                level_differences = [0.0] + [
                    sorted_levels[i][0] - sorted_levels[i + 1][0]
                    for i in range(len(sorted_levels) - 1)
                ]
                second_target_top_dims = [
                    j for i, j in sorted_levels[:level_differences.
                                                index(max(level_differences))]
                ]
                second_target_top_dims_contribution = sum([
                    i for i, j in sorted_levels[:level_differences.
                                                index(max(level_differences))]
                ])
                second_target_bottom_dim = sorted_levels[-1][1]
                second_target_bottom_dim_contribution = sorted_levels[-1][0]

                second_target_percentages = [
                    i * 100.0 / sum_second_target
                    for i in second_target_contributions
                ]
                best_second_target_index = second_target_contributions.index(
                    max(second_target_contributions))
                worst_second_target_index = second_target_contributions.index(
                    min(second_target_contributions))
                second_target_differences = [
                    x - y for x, y in zip(levels_percentages,
                                          second_target_percentages)
                ]
                if len(second_target_differences) > 6:
                    tops = 2
                    bottoms = -2
                elif len(second_target_differences) > 4:
                    tops = 2
                    bottoms = -1
                else:
                    tops = 1
                    bottoms = -1
                sorted_ = sorted(enumerate(second_target_differences),
                                 key=lambda x: x[1],
                                 reverse=True)
                best_second_difference_indices = [x for x, y in sorted_[:tops]]
                worst_second_difference_indices = [
                    x for x, y in sorted_[bottoms:]
                ]

                second_target_shares = [
                    x * 100.0 / y
                    for x, y in zip(second_target_contributions, level_counts)
                ]
                max_second_target_shares = max(second_target_shares)
                best_second_target_share_index = [
                    idx for idx, val in enumerate(second_target_shares)
                    if val == max_second_target_shares
                ]
                level_counts_threshold = sum(level_counts) * 0.05 / len(
                    level_counts)
                min_second_target_shares = min([
                    x for x, y in zip(second_target_shares, level_counts)
                    if y >= level_counts_threshold
                ])
                worst_second_target_share_index = [
                    idx for idx, val in enumerate(second_target_shares)
                    if val == min_second_target_shares
                ]
                overall_second_percentage = sum_second_target * 100.0 / total

                # DataFrame for contribution calculation

                df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                                        filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                                        select(self._second_level_dimensions).toPandas()
                df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                                    select(self._second_level_dimensions).toPandas()

                # if self._chisquare_result.get_splits():
                #     splits = self._chisquare_result.get_splits()
                #     idx = self._chiSquareTable.get_bin_names(splits).index(second_target_top_dims[0])
                #     idx1 = self._chiSquareTable.get_bin_names(splits).index(top_target_top_dims[0])
                #     splits[len(splits)-1] = splits[len(splits)-1]+1
                #     df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                #                         filter(col(self._analysed_dimension)>=splits[idx]).filter(col(self._analysed_dimension)<splits[idx+1]).\
                #                         select(self._second_level_dimensions).toPandas()
                #     df_second_dim = self._data_frame.filter(col(self._analysed_dimension)>=splits[idx]).\
                #                     filter(col(self._analysed_dimension)<splits[idx+1]).\
                #                     select(self._second_level_dimensions).toPandas()
                # else:
                #     df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                #                         filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                #                         select(self._second_level_dimensions).toPandas()
                #     df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                #                     select(self._second_level_dimensions).toPandas()

                # print self._data_frame.select('Sales').show()

                distribution_second = []
                for d in self._second_level_dimensions:

                    grouped = df_second_target.groupby(d).agg({
                        d: 'count'
                    }).sort_values(d, ascending=False)
                    contributions = df_second_dim.groupby(d).agg({d: 'count'})
                    contribution_index = list(contributions.index)
                    contributions_val = contributions[d].tolist()
                    contributions_list = dict(
                        zip(contribution_index, contributions_val))
                    index_list = list(grouped.index)
                    grouped_list = grouped[d].tolist()
                    contributions_percent_list = [
                        round(y * 100.0 / contributions_list[x], 2)
                        for x, y in zip(index_list, grouped_list)
                    ]
                    sum_ = grouped[d].sum()
                    diffs = [0] + [
                        grouped_list[i] - grouped_list[i + 1]
                        for i in range(len(grouped_list) - 1)
                    ]
                    max_diff = diffs.index(max(diffs))

                    index_txt = ''
                    if max_diff == 1:
                        index_txt = index_list[0]
                    elif max_diff == 2:
                        index_txt = index_list[0] + '(' + str(
                            round(grouped_list[0] * 100.0 / sum_, 1)
                        ) + '%)' + ' and ' + index_list[1] + '(' + str(
                            round(grouped_list[1] * 100.0 / sum_, 1)) + '%)'
                    elif max_diff > 2:
                        index_txt = 'including ' + index_list[0] + '(' + str(
                            round(grouped_list[0] * 100.0 / sum_, 1)
                        ) + '%)' + ' and ' + index_list[1] + '(' + str(
                            round(grouped_list[1] * 100.0 / sum_, 1)) + '%)'
                    distribution_second.append({'contributions':[round(i*100.0/sum_,2) for i in grouped_list[:max_diff]],\
                                            'levels': index_list[:max_diff],'variation':random.randint(1,100),\
                                            'index_txt': index_txt, 'd':d,'contributions_percent':contributions_percent_list})

                targetCardDataDict['distribution_second'] = distribution_second
                targetCardDataDict['second_target'] = targetLevel
                targetCardDataDict[
                    'second_target_top_dims'] = second_target_top_dims
                targetCardDataDict[
                    'second_target_top_dims_contribution'] = second_target_top_dims_contribution * 100.0 / sum(
                        second_target_contributions)
                targetCardDataDict[
                    'second_target_bottom_dim'] = second_target_bottom_dim
                targetCardDataDict[
                    'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution
                targetCardDataDict['best_second_target'] = levels[
                    best_second_target_index]
                targetCardDataDict[
                    'best_second_target_count'] = second_target_contributions[
                        best_second_target_index]
                targetCardDataDict['best_second_target_percent'] = round(
                    second_target_contributions[best_second_target_index] *
                    100.0 / sum(second_target_contributions), 2)
                targetCardDataDict['worst_second_target'] = levels[
                    worst_second_target_index]
                targetCardDataDict['worst_second_target_percent'] = round(
                    second_target_contributions[worst_second_target_index] *
                    100.0 / sum(second_target_contributions), 2)

                card2Data = []
                targetLevelContributions = [
                    table.get_value(targetLevel, i) for i in levels
                ]
                card2Heading = '<h3>Distribution of ' + self._target_dimension + ' (' + targetLevel + ') across ' + self._analysed_dimension + "</h3>"
                chart, bubble = self.generate_distribution_card_chart(
                    targetLevel, targetLevelContributions, levels,
                    level_counts, total)
                card2ChartData = NormalChartData(data=chart["data"])
                card2ChartJson = ChartJson()
                card2ChartJson.set_data(card2ChartData.get_data())
                card2ChartJson.set_chart_type("combination")
                card2ChartJson.set_types({
                    "total": "bar",
                    "percentage": "line"
                })
                card2ChartJson.set_legend({
                    "total": "# of " + targetLevel,
                    "percentage": "% of " + targetLevel
                })
                card2ChartJson.set_axes({
                    "x": "key",
                    "y": "total",
                    "y2": "percentage"
                })
                card2ChartJson.set_label_text({
                    "x": " ",
                    "y": "Count",
                    "y2": "Percentage"
                })
                print "self._binTargetCol & self._binAnalyzedCol : ", self._binTargetCol, self._binAnalyzedCol
                if (self._binTargetCol == True & self._binAnalyzedCol ==
                        False):
                    print "Only Target Column is Binned"
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2_binned_target.html',
                            targetCardDataDict), self._blockSplitter)
                elif (self._binTargetCol == True & self._binAnalyzedCol ==
                      True):
                    print "Target Column and IV is Binned"
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2_binned_target_and_IV.html',
                            targetCardDataDict), self._blockSplitter)
                else:
                    print "In Else, self._binTargetCol should be False : ", self._binTargetCol
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2.html', targetCardDataDict),
                        self._blockSplitter)

                card2Data.append(HtmlData(data=card2Heading))
                statistical_info_array = [
                    ("Test Type", "Chi-Square"),
                    ("Chi-Square statistic",
                     str(round(self._chisquare_result.get_stat(), 3))),
                    ("P-Value",
                     str(round(self._chisquare_result.get_pvalue(), 3))),
                    ("Inference",
                     "Chi-squared analysis shows a significant association between {} (target) and {}."
                     .format(self._target_dimension, self._analysed_dimension))
                ]
                statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                    statistical_info_array)

                card2Data.append(
                    C3ChartData(data=card2ChartJson,
                                info=statistical_info_array))
                card2Data += output2
                card2BubbleData = "<div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div><div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div>".format(
                    bubble[0]["value"], bubble[0]["text"], bubble[1]["value"],
                    bubble[1]["text"])
                card2Data.append(HtmlData(data=card2BubbleData))
                targetCard = NormalCard()
                targetCard.set_card_data(card2Data)
                targetCard.set_card_name("{} : Distribution of {}".format(
                    self._analysed_dimension, targetLevel))
                self._targetCards.append(targetCard)
                dict_for_test[targetLevel] = targetCardDataDict
        out = {'data_dict': data_dict, 'target_dict': dict_for_test}

        return out

예제 #5

파일 보기

파일: chisquare.py 프로젝트: Srinidhi-SA/mAdvisordevSpark

    def _generate_narratives(self):
        chisquare_result = self._chisquare_result
        target_dimension = self._target_dimension
        analysed_dimension = self._analysed_dimension
        significant_variables = self._significant_variables
        num_analysed_variables = self._num_analysed_variables
        table = self._chiSquareTable
        total = self._chiSquareTable.get_total()

        levels = self._chiSquareTable.get_column_two_levels()
        level_counts = self._chiSquareTable.get_column_total()
        levels_count_sum = sum(level_counts)
        levels_percentages = [
            old_div(i * 100.0, levels_count_sum) for i in level_counts
        ]
        sorted_levels = sorted(zip(level_counts, levels), reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]

        top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        bottom_dim = sorted_levels[-1][1]
        bottom_dim_contribution = sorted_levels[-1][0]
        bottom_dims = [
            y for x, y in sorted_levels if x == bottom_dim_contribution
        ]

        target_levels = self._chiSquareTable.get_column_one_levels()

        target_counts = self._chiSquareTable.get_row_total()
        sorted_target_levels = sorted(zip(target_counts, target_levels),
                                      reverse=True)

        top_target_count, top_target = sorted_target_levels[0]
        second_target_count, second_target = sorted_target_levels[1]

        top_target_contributions = [
            table.get_value(top_target, i) for i in levels
        ]
        sum_top_target = sum(top_target_contributions)

        sorted_levels = sorted(zip(top_target_contributions, levels),
                               reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]
        top_target_top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        top_target_top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        top_target_bottom_dim = sorted_levels[-1][1]
        top_target_bottom_dim_contribution = sorted_levels[-1][0]

        top_target_percentages = [
            old_div(i * 100.0, sum_top_target)
            for i in top_target_contributions
        ]
        best_top_target_index = top_target_contributions.index(
            max(top_target_contributions))
        worst_top_target_index = top_target_contributions.index(
            min(top_target_contributions))
        top_target_differences = [
            x - y for x, y in zip(levels_percentages, top_target_percentages)
        ]
        if len(top_target_differences) > 6:
            tops = 2
            bottoms = -2
        elif len(top_target_differences) > 4:
            tops = 2
            bottoms = -1
        else:
            tops = 1
            bottoms = -1
        sorted_ = sorted(enumerate(top_target_differences),
                         key=lambda x: x[1],
                         reverse=True)
        best_top_difference_indices = [x for x, y in sorted_[:tops]]
        worst_top_difference_indices = [x for x, y in sorted_[bottoms:]]

        top_target_shares = [
            old_div(x * 100.0, y)
            for x, y in zip(top_target_contributions, level_counts)
        ]
        max_top_target_shares = max(top_target_shares)
        best_top_target_share_index = [
            idx for idx, val in enumerate(top_target_shares)
            if val == max_top_target_shares
        ]
        level_counts_threshold = old_div(
            sum(level_counts) * 0.05, len(level_counts))
        min_top_target_shares = min([
            x for x, y in zip(top_target_shares, level_counts)
            if y >= level_counts_threshold
        ])
        if max_top_target_shares == min_top_target_shares:
            worst_top_target_share_index = []
        else:
            worst_top_target_share_index = [
                idx for idx, val in enumerate(top_target_shares)
                if val == min_top_target_shares
            ]
        overall_top_percentage = old_div(sum_top_target * 100.0, total)

        second_target_contributions = [
            table.get_value(second_target, i) for i in levels
        ]
        sum_second_target = sum(second_target_contributions)

        sorted_levels = sorted(zip(second_target_contributions, levels),
                               reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]
        second_target_top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        second_target_top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        second_target_bottom_dim = sorted_levels[-1][1]
        second_target_bottom_dim_contribution = sorted_levels[-1][0]

        second_target_percentages = [
            old_div(i * 100.0, sum_second_target)
            for i in second_target_contributions
        ]
        best_second_target_index = second_target_contributions.index(
            max(second_target_contributions))
        worst_second_target_index = second_target_contributions.index(
            min(second_target_contributions))
        second_target_differences = [
            x - y
            for x, y in zip(levels_percentages, second_target_percentages)
        ]
        if len(second_target_differences) > 6:
            tops = 2
            bottoms = -2
        elif len(second_target_differences) > 4:
            tops = 2
            bottoms = -1
        else:
            tops = 1
            bottoms = -1
        sorted_ = sorted(enumerate(second_target_differences),
                         key=lambda x: x[1],
                         reverse=True)
        best_second_difference_indices = [x for x, y in sorted_[:tops]]
        worst_second_difference_indices = [x for x, y in sorted_[bottoms:]]

        second_target_shares = [
            old_div(x * 100.0, y)
            for x, y in zip(second_target_contributions, level_counts)
        ]
        max_second_target_shares = max(second_target_shares)
        best_second_target_share_index = [
            idx for idx, val in enumerate(second_target_shares)
            if val == max_second_target_shares
        ]
        level_counts_threshold = old_div(
            sum(level_counts) * 0.05, len(level_counts))
        if min(second_target_shares) == 0:
            min_second_target_shares = min([
                x for x, y in zip(second_target_shares, level_counts) if x != 0
            ])
        else:
            min_second_target_shares = min([
                x for x, y in zip(second_target_shares, level_counts)
                if y >= level_counts_threshold
            ])
        # worst_second_target_share_index = second_target_shares.index(min_second_target_shares)
        if max_second_target_shares == min_second_target_shares:
            worst_second_target_share_index = []
        else:
            worst_second_target_share_index = [
                idx for idx, val in enumerate(second_target_shares)
                if val == min_second_target_shares
            ]
        overall_second_percentage = old_div(sum_second_target * 100.0, total)

        targetCardDataDict = {}
        targetCardDataDict['target'] = target_dimension
        targetCardDataDict['colname'] = analysed_dimension
        targetCardDataDict['num_significant'] = len(significant_variables)
        targetCardDataDict['plural_colname'] = NarrativesUtils.pluralize(
            analysed_dimension)

        targetCardDataDict["blockSplitter"] = self._blockSplitter
        targetCardDataDict["binTargetCol"] = self._binTargetCol
        targetCardDataDict["binAnalyzedCol"] = self._binAnalyzedCol
        targetCardDataDict['highlightFlag'] = self._highlightFlag
        targetCardDataDict['levels'] = levels

        data_dict = {}
        data_dict[
            'best_second_difference'] = best_second_difference_indices  ##these changed
        data_dict['worst_second_difference'] = worst_second_difference_indices
        data_dict['best_top_difference'] = best_top_difference_indices
        data_dict['worst_top_difference'] = worst_top_difference_indices
        data_dict['levels_percentages'] = levels_percentages
        data_dict['top_target_percentages'] = top_target_percentages
        data_dict['second_target_percentages'] = second_target_percentages
        data_dict['levels'] = levels
        data_dict['best_top_share'] = best_top_target_share_index
        data_dict['worst_top_share'] = worst_top_target_share_index
        data_dict['best_second_share'] = best_second_target_share_index
        data_dict['worst_second_share'] = worst_second_target_share_index
        data_dict['top_target_shares'] = top_target_shares
        data_dict['second_target_shares'] = second_target_shares
        data_dict['overall_second'] = overall_second_percentage
        data_dict['overall_top'] = overall_top_percentage

        data_dict['num_significant'] = len(significant_variables)
        data_dict['colname'] = analysed_dimension
        data_dict['plural_colname'] = NarrativesUtils.pluralize(
            analysed_dimension)
        data_dict['target'] = target_dimension
        data_dict['top_levels'] = top_dims
        data_dict['top_levels_percent'] = round(
            old_div(top_dims_contribution * 100.0, total), 1)
        data_dict['bottom_level'] = bottom_dim
        data_dict['bottom_levels'] = bottom_dims
        data_dict['bottom_level_percent'] = round(
            old_div(bottom_dim_contribution * 100, sum(level_counts)), 2)
        data_dict['second_target'] = second_target
        data_dict['second_target_top_dims'] = second_target_top_dims
        data_dict['second_target_top_dims_contribution'] = old_div(
            second_target_top_dims_contribution * 100.0,
            sum(second_target_contributions))
        data_dict['second_target_bottom_dim'] = second_target_bottom_dim
        data_dict[
            'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution
        data_dict['best_second_target'] = levels[best_second_target_index]
        data_dict['best_second_target_count'] = second_target_contributions[
            best_second_target_index]
        data_dict['best_second_target_percent'] = round(
            old_div(
                second_target_contributions[best_second_target_index] * 100.0,
                sum(second_target_contributions)), 2)
        data_dict['worst_second_target'] = levels[worst_second_target_index]
        data_dict['worst_second_target_percent'] = round(
            old_div(
                second_target_contributions[worst_second_target_index] * 100.0,
                sum(second_target_contributions)), 2)

        data_dict['top_target'] = top_target
        data_dict['top_target_top_dims'] = top_target_top_dims
        data_dict['top_target_top_dims_contribution'] = old_div(
            top_target_top_dims_contribution * 100.0,
            sum(top_target_contributions))
        data_dict['top_target_bottom_dim'] = top_target_bottom_dim
        data_dict[
            'top_target_bottom_dim_contribution'] = top_target_bottom_dim_contribution
        data_dict['best_top_target'] = levels[best_top_target_index]
        data_dict['best_top_target_count'] = top_target_contributions[
            best_top_target_index]
        data_dict['best_top_target_percent'] = round(
            old_div(top_target_contributions[best_top_target_index] * 100.0,
                    sum(top_target_contributions)), 2)
        data_dict['worst_top_target'] = levels[worst_top_target_index]
        data_dict['worst_top_target_percent'] = round(
            old_div(top_target_contributions[worst_top_target_index] * 100.0,
                    sum(top_target_contributions)), 2)

        data_dict["blockSplitter"] = self._blockSplitter
        data_dict["binTargetCol"] = self._binTargetCol
        data_dict["binAnalyzedCol"] = self._binAnalyzedCol
        data_dict['highlightFlag'] = self._highlightFlag

        # print "_"*60
        # print "DATA DICT - ", data_dict
        # print "_"*60

        ###############
        #     CARD1   #
        ###############

        print("self._binTargetCol & self._binAnalyzedCol : ",
              self._binTargetCol, self._binAnalyzedCol)
        if len(data_dict['worst_second_share']) == 0:
            output = NarrativesUtils.block_splitter(
                NarrativesUtils.get_template_output(
                    self._base_dir, 'card1_binned_target_worst_second.html',
                    data_dict),
                self._blockSplitter,
                highlightFlag=self._highlightFlag)
        else:
            if (self._binTargetCol == True & self._binAnalyzedCol == False):
                print("Only Target Column is Binned, : ", self._binTargetCol)
                output = NarrativesUtils.block_splitter(
                    NarrativesUtils.get_template_output(
                        self._base_dir, 'card1_binned_target.html', data_dict),
                    self._blockSplitter,
                    highlightFlag=self._highlightFlag)
            elif (self._binTargetCol == True & self._binAnalyzedCol == True):
                print("Target Column and IV is Binned : ", self._binTargetCol,
                      self._binAnalyzedCol)
                output = NarrativesUtils.block_splitter(
                    NarrativesUtils.get_template_output(
                        self._base_dir, 'card1_binned_target_and_IV.html',
                        data_dict),
                    self._blockSplitter,
                    highlightFlag=self._highlightFlag)
            else:
                output = NarrativesUtils.block_splitter(
                    NarrativesUtils.get_template_output(
                        self._base_dir, 'card1.html', data_dict),
                    self._blockSplitter,
                    highlightFlag=self._highlightFlag)

        targetDimCard1Data = []
        targetDimcard1Heading = '<h3>Impact of ' + self._analysed_dimension + '  on ' + self._target_dimension + "</h3>"

        toggledata = ToggleData()

        targetDimTable1Data = self.generate_card1_table1()
        targetDimCard1Table1 = TableData()
        targetDimCard1Table1.set_table_type("heatMap")
        targetDimCard1Table1.set_table_data(targetDimTable1Data)
        toggledata.set_toggleon_data({
            "data": {
                "tableData": targetDimTable1Data,
                "tableType": "heatMap"
            },
            "dataType": "table"
        })

        targetDimTable2Data = self.generate_card1_table2()
        targetDimCard1Table2 = TableData()
        targetDimCard1Table2.set_table_type("normal")
        table2Data = targetDimTable2Data["data1"]
        table2Data = [
            innerList[1:] for innerList in table2Data
            if innerList[0].strip() != ""
        ]
        targetDimCard1Table2.set_table_data(table2Data)

        toggledata.set_toggleoff_data({
            "data": {
                "tableData": table2Data,
                "tableType": "heatMap"
            },
            "dataType": "table"
        })

        targetDimCard1Data.append(HtmlData(data=targetDimcard1Heading))
        targetDimCard1Data.append(toggledata)
        targetDimCard1Data += output

        self._card1.set_card_data(targetDimCard1Data)
        self._card1.set_card_name("{}: Relationship with {}".format(
            self._analysed_dimension, self._target_dimension))

        ###############
        #     CARD2   #
        ###############

        if self._appid == None:

            key_factors = ''
            num_key_factors = len(self._second_level_dimensions)

            if len(self._second_level_dimensions) == 5:
                key_factors = ', '.join(
                    self._second_level_dimensions[:4]
                ) + ' and ' + self._second_level_dimensions[4]
            elif len(self._second_level_dimensions) == 4:
                key_factors = ', '.join(
                    self._second_level_dimensions[:3]
                ) + ' and ' + self._second_level_dimensions[3]
            elif len(self._second_level_dimensions) == 3:
                key_factors = ', '.join(
                    self._second_level_dimensions[:2]
                ) + ' and ' + self._second_level_dimensions[2]
            elif len(self._second_level_dimensions) == 2:
                key_factors = ' and '.join(self._second_level_dimensions)
            elif len(self._second_level_dimensions) == 1:
                key_factors = self._second_level_dimensions[0]

            targetCardDataDict['num_key_factors'] = num_key_factors
            targetCardDataDict['key_factors'] = key_factors
            dict_for_test = {}
            for tupleObj in sorted_target_levels[:self._chiSquareLevelLimit]:
                targetLevel = tupleObj[1]

                targetCardDataDict['random_card2'] = random.randint(1, 100)
                targetCardDataDict['random_card4'] = random.randint(1, 100)

                second_target_contributions = [
                    table.get_value(targetLevel, i) for i in levels
                ]
                sum_second_target = sum(second_target_contributions)

                sorted_levels = sorted(zip(second_target_contributions,
                                           levels),
                                       reverse=True)

                level_differences = [0.0] + [
                    sorted_levels[i][0] - sorted_levels[i + 1][0]
                    for i in range(len(sorted_levels) - 1)
                ]
                level_diff_index = level_differences.index(
                    max(level_differences)) if level_differences.index(
                        max(level_differences)) > 0 else len(
                            level_differences
                        )  ##added for pipeline keyerror issue
                second_target_top_dims = [
                    j for i, j in sorted_levels[:level_diff_index]
                ]
                second_target_top_dims_contribution = sum([
                    i for i, j in sorted_levels[:level_differences.
                                                index(max(level_differences))]
                ])
                second_target_bottom_dim = sorted_levels[-1][1]
                second_target_bottom_dim_contribution = sorted_levels[-1][0]

                second_target_percentages = [
                    old_div(i * 100.0, sum_second_target)
                    for i in second_target_contributions
                ]
                best_second_target_index = second_target_contributions.index(
                    max(second_target_contributions))
                worst_second_target_index = second_target_contributions.index(
                    min(second_target_contributions))
                second_target_differences = [
                    x - y for x, y in zip(levels_percentages,
                                          second_target_percentages)
                ]
                if len(second_target_differences) > 6:
                    tops = 2
                    bottoms = -2
                elif len(second_target_differences) > 4:
                    tops = 2
                    bottoms = -1
                else:
                    tops = 1
                    bottoms = -1
                sorted_ = sorted(enumerate(second_target_differences),
                                 key=lambda x: x[1],
                                 reverse=True)
                best_second_difference_indices = [x for x, y in sorted_[:tops]]
                worst_second_difference_indices = [
                    x for x, y in sorted_[bottoms:]
                ]

                second_target_shares = [
                    old_div(x * 100.0, y)
                    for x, y in zip(second_target_contributions, level_counts)
                ]
                max_second_target_shares = max(second_target_shares)
                best_second_target_share_index = [
                    idx for idx, val in enumerate(second_target_shares)
                    if val == max_second_target_shares
                ]
                level_counts_threshold = old_div(
                    sum(level_counts) * 0.05, len(level_counts))
                min_second_target_shares = min([
                    x for x, y in zip(second_target_shares, level_counts)
                    if y >= level_counts_threshold
                ])
                worst_second_target_share_index = [
                    idx for idx, val in enumerate(second_target_shares)
                    if val == min_second_target_shares
                ]
                overall_second_percentage = old_div(sum_second_target * 100.0,
                                                    total)

                # DataFrame for contribution calculation
                if self._pandas_flag:
                    df_second_target = self._data_frame[(
                        self._data_frame[self._target_dimension] == targetLevel
                    ) & (self._data_frame[self._analysed_dimension] ==
                         second_target_top_dims[0])][
                             self._second_level_dimensions]
                    df_second_dim = self._data_frame[(
                        self._data_frame[self._analysed_dimension] ==
                        second_target_top_dims[0]
                    )][self._second_level_dimensions]
                else:
                    df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                                            filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                                            select(self._second_level_dimensions).toPandas()
                    df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                                        select(self._second_level_dimensions).toPandas()

                # if self._chisquare_result.get_splits():
                #     splits = self._chisquare_result.get_splits()
                #     idx = self._chiSquareTable.get_bin_names(splits).index(second_target_top_dims[0])
                #     idx1 = self._chiSquareTable.get_bin_names(splits).index(top_target_top_dims[0])
                #     splits[len(splits)-1] = splits[len(splits)-1]+1
                #     df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                #                         filter(col(self._analysed_dimension)>=splits[idx]).filter(col(self._analysed_dimension)<splits[idx+1]).\
                #                         select(self._second_level_dimensions).toPandas()
                #     df_second_dim = self._data_frame.filter(col(self._analysed_dimension)>=splits[idx]).\
                #                     filter(col(self._analysed_dimension)<splits[idx+1]).\
                #                     select(self._second_level_dimensions).toPandas()
                # else:
                #     df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                #                         filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                #                         select(self._second_level_dimensions).toPandas()
                #     df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                #                     select(self._second_level_dimensions).toPandas()

                # print self._data_frame.select('Sales').show()

                distribution_second = []
                d_l = []
                for d in self._second_level_dimensions:
                    grouped = df_second_target.groupby(d).agg({d: 'count'})
                    contributions = df_second_dim.groupby(d).agg({d: 'count'})
                    contribution_index = list(contributions.index)
                    contributions_val = contributions[d].tolist()
                    contributions_list = dict(
                        list(zip(contribution_index, contributions_val)))
                    index_list = list(grouped.index)
                    grouped_list = grouped[d].tolist()
                    contributions_percent_list = [
                        round(old_div(y * 100.0, contributions_list[x]), 2)
                        for x, y in zip(index_list, grouped_list)
                    ]
                    sum_ = grouped[d].sum()
                    diffs = [0] + [
                        grouped_list[i] - grouped_list[i + 1]
                        for i in range(len(grouped_list) - 1)
                    ]
                    max_diff = diffs.index(max(diffs))
                    grouped_dict = dict(list(zip(index_list, grouped_list)))

                    for val in contribution_index:
                        if val not in list(grouped_dict.keys()):
                            grouped_dict[val] = 0
                        else:
                            pass

                    index_list = []
                    grouped_list = []
                    contributions_val = []

                    for key in list(grouped_dict.keys()):
                        index_list.append(str(key))
                        grouped_list.append(grouped_dict[key])
                        contributions_val.append(contributions_list[key])
                    '''
                    print "="*70
                    print "GROUPED - ", grouped
                    print "INDEX LIST - ", index_list
                    print "GROUPED LIST - ", grouped_list
                    print "GROUPED DICT - ", grouped_dict
                    print "CONTRIBUTIONS - ", contributions
                    print "CONTRIBUTION INDEX - ", contribution_index
                    print "CONTRIBUTIONS VAL - ", contributions_val
                    print "CONTRIBUTIONS LIST - ", contributions_list
                    print "CONTRIBUTIONS PERCENT LIST - ", contributions_percent_list
                    print "SUM - ", sum_
                    print "DIFFS - ", diffs
                    print "MAX DIFF - ", max_diff
                    print "="*70
                    '''

                    informative_dict = {
                        "levels": index_list,
                        "positive_class_contribution": grouped_list,
                        "positive_plus_others": contributions_val
                    }

                    informative_df = pd.DataFrame(informative_dict)
                    informative_df["percentage_horizontal"] = old_div(
                        informative_df["positive_class_contribution"] * 100,
                        informative_df["positive_plus_others"])
                    informative_df["percentage_vertical"] = old_div(
                        informative_df["positive_class_contribution"] * 100,
                        sum_)
                    informative_df.sort_values(["percentage_vertical"],
                                               inplace=True,
                                               ascending=False)
                    informative_df = informative_df.reset_index(drop=True)

                    percentage_vertical_sorted = list(
                        informative_df["percentage_vertical"])
                    percentage_horizontal_sorted = list(
                        informative_df["percentage_horizontal"])
                    levels_sorted = list(informative_df["levels"])

                    differences_list = []
                    for i in range(1, len(percentage_vertical_sorted)):
                        difference = percentage_vertical_sorted[
                            i - 1] - percentage_vertical_sorted[i]
                        differences_list.append(round(difference, 2))
                    '''
                    print "-"*70
                    print "DIFFERENCES LIST - ", differences_list
                    print "-"*70
                    '''

                    index_txt = ''
                    if differences_list:
                        if differences_list[0] >= 30:
                            print("showing 1st case")
                            index_txt = levels_sorted[0]
                            max_diff_equivalent = 1
                        else:
                            if len(differences_list) >= 2:
                                if differences_list[1] >= 10:
                                    print("showing 1st and 2nd case")
                                    index_txt = levels_sorted[0] + '(' + str(
                                        round(percentage_vertical_sorted[0], 1)
                                    ) + '%)' + ' and ' + levels_sorted[
                                        1] + '(' + str(
                                            round(
                                                percentage_vertical_sorted[1],
                                                1)) + '%)'
                                    max_diff_equivalent = 2
                                else:
                                    print("showing 3rd case")
                                    index_txt = 'including ' + levels_sorted[
                                        0] + '(' + str(
                                            round(
                                                percentage_vertical_sorted[0],
                                                1)
                                        ) + '%)' + ' and ' + levels_sorted[
                                            1] + '(' + str(
                                                round(
                                                    percentage_vertical_sorted[
                                                        1], 1)) + '%)'
                                    max_diff_equivalent = 3
                            else:
                                print("showing 3rd case")
                                index_txt = 'including ' + levels_sorted[
                                    0] + '(' + str(
                                        round(percentage_vertical_sorted[0], 1)
                                    ) + '%)' + ' and ' + levels_sorted[
                                        1] + '(' + str(
                                            round(
                                                percentage_vertical_sorted[1],
                                                1)) + '%)'
                                max_diff_equivalent = 3

                    else:
                        max_diff_equivalent = 0
                    '''
                    print "-"*70
                    print informative_df.head(25)
                    print "-"*70
                    '''

                    distribution_second.append({
                        'contributions': [
                            round(i, 2) for i in
                            percentage_vertical_sorted[:max_diff_equivalent]
                        ],
                        'levels':
                        levels_sorted[:max_diff_equivalent],
                        'variation':
                        random.randint(1, 100),
                        'index_txt':
                        index_txt,
                        'd':
                        d,
                        'contributions_percent':
                        percentage_horizontal_sorted
                    })
                '''
                  print "DISTRIBUTION SECOND - ", distribution_second
                  print "<>"*50
                  '''
                targetCardDataDict['distribution_second'] = distribution_second
                targetCardDataDict['second_target'] = targetLevel
                targetCardDataDict[
                    'second_target_top_dims'] = second_target_top_dims
                targetCardDataDict[
                    'second_target_top_dims_contribution'] = old_div(
                        second_target_top_dims_contribution * 100.0,
                        sum(second_target_contributions))
                targetCardDataDict[
                    'second_target_bottom_dim'] = second_target_bottom_dim
                targetCardDataDict[
                    'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution
                targetCardDataDict['best_second_target'] = levels[
                    best_second_target_index]
                targetCardDataDict[
                    'best_second_target_count'] = second_target_contributions[
                        best_second_target_index]
                targetCardDataDict['best_second_target_percent'] = round(
                    old_div(
                        second_target_contributions[best_second_target_index] *
                        100.0, sum(second_target_contributions)), 2)
                targetCardDataDict['worst_second_target'] = levels[
                    worst_second_target_index]
                targetCardDataDict['worst_second_target_percent'] = round(
                    old_div(
                        second_target_contributions[worst_second_target_index]
                        * 100.0, sum(second_target_contributions)), 2)

                card2Data = []
                targetLevelContributions = [
                    table.get_value(targetLevel, i) for i in levels
                ]
                impact_target_thershold = old_div(
                    sum(targetLevelContributions) * 0.02,
                    len(targetLevelContributions))
                card2Heading = '<h3>Key Drivers of ' + self._target_dimension + ' (' + targetLevel + ')' + "</h3>"
                chart, bubble = self.generate_distribution_card_chart(
                    targetLevel, targetLevelContributions, levels,
                    level_counts, total, impact_target_thershold)
                card2ChartData = NormalChartData(data=chart["data"])
                "rounding the chartdata values for key drivers tab inside table percentage(table data)"
                for d in card2ChartData.get_data():
                    d['percentage'] = round(d['percentage'], 2)
                    d_l.append(d)
                card2ChartJson = ChartJson()
                card2ChartJson.set_data(d_l)
                card2ChartJson.set_chart_type("combination")
                card2ChartJson.set_types({
                    "total": "bar",
                    "percentage": "line"
                })
                card2ChartJson.set_legend({
                    "total": "# of " + targetLevel,
                    "percentage": "% of " + targetLevel
                })
                card2ChartJson.set_axes({
                    "x": "key",
                    "y": "total",
                    "y2": "percentage"
                })
                card2ChartJson.set_label_text({
                    "x": " ",
                    "y": "Count",
                    "y2": "Percentage"
                })
                print("self._binTargetCol & self._binAnalyzedCol : ",
                      self._binTargetCol, self._binAnalyzedCol)
                if (self._binTargetCol == True & self._binAnalyzedCol ==
                        False):
                    print("Only Target Column is Binned")
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2_binned_target.html',
                            targetCardDataDict), self._blockSplitter)
                elif (self._binTargetCol == True & self._binAnalyzedCol ==
                      True):
                    print("Target Column and IV is Binned")
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2_binned_target_and_IV.html',
                            targetCardDataDict), self._blockSplitter)
                else:
                    print("In Else, self._binTargetCol should be False : ",
                          self._binTargetCol)
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2.html', targetCardDataDict),
                        self._blockSplitter)

                card2Data.append(HtmlData(data=card2Heading))
                statistical_info_array = [
                    ("Test Type", "Chi-Square"),
                    ("Chi-Square statistic",
                     str(round(self._chisquare_result.get_stat(), 3))),
                    ("P-Value",
                     str(round(self._chisquare_result.get_pvalue(), 3))),
                    ("Inference",
                     "Chi-squared analysis shows a significant association between {} (target) and {}."
                     .format(self._target_dimension, self._analysed_dimension))
                ]
                statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                    statistical_info_array)

                card2Data.append(
                    C3ChartData(data=card2ChartJson,
                                info=statistical_info_array))
                card2Data += output2
                card2BubbleData = "<div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div><div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div>".format(
                    bubble[0]["value"], bubble[0]["text"], bubble[1]["value"],
                    bubble[1]["text"])
                card2Data.append(HtmlData(data=card2BubbleData))
                targetCard = NormalCard()
                targetCard.set_card_data(card2Data)
                targetCard.set_card_name("{} : Distribution of {}".format(
                    self._analysed_dimension, targetLevel))
                self._targetCards.append(targetCard)
                dict_for_test[targetLevel] = targetCardDataDict
        out = {'data_dict': data_dict, 'target_dict': dict_for_test}

        return out

예제 #6

파일 보기

파일: decision_tree.py 프로젝트: Srinidhi-SA/Madvisorhadoop

    def _generate_summary(self):
        """Build the main decision-tree card and add it to ``self._decisionTreeNode``.

        The method:

        1. publishes a "Generating Prediction rules" progress message and
           writes the completion status back to the dataframe context;
        2. walks every target level in ``self._table`` (the model's target
           level first), adding one dropdown entry per level and one table row
           per rule;
        3. builds a donut chart, bucketed by probability range for scored-data
           stories and by total predictions per level otherwise;
        4. renders the summary template ('decisiontreescore.html' for
           scored-data stories, 'decisiontreesummary.html' otherwise);
        5. assembles narrative, chart, dropdown, rules table and (when present)
           the unique-identifier table into a ``NormalCard``.

        Returns:
            None. The result is the card added to ``self._decisionTreeNode``.
        """
        probabilityCutoff = 75
        donutChartMaxLevel = 10
        context = self._dataframe_context
        rules_dict = self._table

        data_dict = {
            "blockSplitter": self._blockSplitter,
            "targetcol": self._colname
        }
        # Index 0 collects "strong" rules (>= cutoff), index 1 the "mixed" ones.
        probabilityGroups = [{
            "probability": probabilityCutoff,
            "count": 0,
            "range": [probabilityCutoff, 100]
        }, {
            "probability": probabilityCutoff - 1,
            "count": 0,
            "range": [0, probabilityCutoff - 1]
        }]
        tableArray = [[
            "Prediction Rule", "Probability", "Prediction", "Freq", "group",
            "richRules"
        ]]
        dropdownData = []
        probabilityArrayAll = []
        targetLevel = context.get_target_level_for_model()

        self._completionStatus = context.get_completion_status()
        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName,
            "custom",
            "info",
            "Generating Prediction rules",
            self._completionStatus,
            self._completionStatus,
            display=True)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=False)
        context.update_completion_status(self._completionStatus)

        # Values that do not change per target level are read once up front.
        # NOTE(review): assumes these context getters are side-effect free.
        # The explicit "== True" is kept on purpose: only a literal True (or
        # 1) switches on the scored-data layout, exactly as before.
        isScoredStory = context.get_story_on_scored_data() == True  # noqa: E712
        analysisType = context.get_analysis_type()
        targetCol = context.get_result_column()
        customDetails = context.get_custom_analysis_details()
        binFlag = (customDetails is not None and
                   targetCol in [x["colName"] for x in customDetails])

        # The model's target level is listed (and pre-selected) first.
        targetValues = (
            [x for x in rules_dict.keys() if x == targetLevel] +
            [x for x in rules_dict.keys() if x != targetLevel])
        for idx, target in enumerate(targetValues):
            if isScoredStory:
                displayName = "{} : {}".format(self._colname, target)
            else:
                displayName = target
            dropdownData.append({
                "displayName": displayName,
                "name": target,
                "selected": idx == 0,
                "id": idx + 1
            })

            rulesArray = rules_dict[target]
            freqArray = self.total_predictions[target]
            success = self.successful_predictions[target]
            success_percent = self.success_percent[target]

            probabilityArray = [round(x, 2) for x in success_percent]
            probabilityArrayAll += probabilityArray
            groupArray = [
                "strong" if x >= probabilityCutoff else "mixed"
                for x in probabilityArray
            ]
            # Count by label so the two groups always add up to the number of
            # rules. Matching against the integer "range" bounds dropped any
            # probability strictly between cutoff - 1 and cutoff (e.g. 74.5),
            # even though such a rule is labelled "mixed" above.
            strongCount = groupArray.count("strong")
            probabilityGroups[0]["count"] += strongCount
            probabilityGroups[1]["count"] += len(groupArray) - strongCount

            richRulesArray = []
            crudeRuleArray = []
            for idx2, crudeRule in enumerate(rulesArray):
                richRule, crudeRule = NarrativesUtils.generate_rules(
                    self._colname,
                    target,
                    crudeRule,
                    freqArray[idx2],
                    success[idx2],
                    success_percent[idx2],
                    analysisType,
                    binFlag=binFlag)
                richRulesArray.append(richRule)
                crudeRuleArray.append(crudeRule)

            probabilityLabels = [
                (humanize.apnumber(x) + "%") if x >= 10 else
                (str(int(x)) + "%") for x in probabilityArray
            ]
            predictionArray = [target] * len(rulesArray)
            # Column order must match the header row of tableArray.
            tableArray.extend(
                list(row) for row in zip(crudeRuleArray, probabilityLabels,
                                         predictionArray, freqArray,
                                         groupArray, richRulesArray))

        if isScoredStory:
            # Number of rules whose probability falls in each configured
            # (low, high] bucket.
            probabilityRangeForChart = GLOBALSETTINGS.PROBABILITY_RANGE_FOR_DONUT_CHART
            chartDict = dict.fromkeys(probabilityRangeForChart.keys(), 0)
            for val in probabilityArrayAll:
                for grps, grpRange in probabilityRangeForChart.items():
                    if grpRange[0] < val <= grpRange[1]:
                        chartDict[grps] += 1
        else:
            chartDict = {
                k: sum(v)
                for k, v in self.total_predictions.items()
            }
        # Empty slices are left out of the donut chart.
        chartDict = {k: v for k, v in chartDict.items() if v != 0}
        if len(chartDict) > donutChartMaxLevel:
            chartDict = NarrativesUtils.restructure_donut_chart_data(
                chartDict, nLevels=donutChartMaxLevel)
        chartData = NormalChartData([chartDict]).get_data()
        chartJson = ChartJson(data=chartData)
        chartJson.set_title(self._colname)
        chartJson.set_chart_type("donut")
        mainCardChart = C3ChartData(data=chartJson)
        mainCardChart.set_width_percent(45)

        dropdownDict = {
            "dataType": "dropdown",
            "label": "Showing prediction rules for",
            "data": dropdownData
        }

        data_dict["probabilityGroups"] = probabilityGroups
        if not isScoredStory:
            maincardSummary = NarrativesUtils.get_template_output(
                self._base_dir, 'decisiontreesummary.html', data_dict)
        else:
            # Sum the "Freq" column (index 3) per "Prediction" level (index 2).
            predictedLevelCountDict = {}
            for row in tableArray[1:]:
                predictedLevelCountDict.setdefault(row[2], []).append(row[3])
            levelCountDict = {
                k: sum(v)
                for k, v in predictedLevelCountDict.items()
            }
            total = float(sum(levelCountDict.values()))
            # Guard the division: if every frequency is zero, report 0.0
            # instead of raising ZeroDivisionError.
            percentageArray = NarrativesUtils.ret_smart_round([
                round(v * 100 / total, 2) if total else 0.0
                for v in levelCountDict.values()
            ])
            levelCountTuple = [{
                "name": k,
                "count": v,
                "percentage": str(percentageArray[pos]) + "%"
            } for pos, (k, v) in enumerate(levelCountDict.items())]
            data_dict["nlevel"] = len(levelCountDict)
            # Debug output, written so that it emits the same text under both
            # the Python 2 print statement and the Python 3 print function.
            print("levelCountTuple %s" % (levelCountTuple,))
            print("levelCountDict %s" % (levelCountDict,))
            if targetLevel in levelCountDict:
                data_dict["topLevel"] = [
                    x for x in levelCountTuple if x["name"] == targetLevel
                ][0]
                if len(levelCountTuple) > 1:
                    data_dict["secondLevel"] = max(
                        [x for x in levelCountTuple
                         if x["name"] != targetLevel],
                        key=lambda x: x["count"])
                else:
                    data_dict["secondLevel"] = None
            else:
                data_dict["topLevel"] = levelCountTuple[0]
                if len(levelCountTuple) > 1:
                    data_dict["secondLevel"] = levelCountTuple[1]
                else:
                    data_dict["secondLevel"] = None
            print(data_dict)
            maincardSummary = NarrativesUtils.get_template_output(
                self._base_dir, 'decisiontreescore.html', data_dict)

        main_card = NormalCard()
        main_card_data = list(
            NarrativesUtils.block_splitter(maincardSummary,
                                           self._blockSplitter))
        main_card_data.append(mainCardChart)
        main_card_data.append(dropdownDict)

        main_card_table = TableData()
        if isScoredStory:
            main_card_table.set_table_width(75)
        main_card_table.set_table_data(tableArray)
        main_card_table.set_table_type("popupDecisionTreeTable")
        main_card_data.append(main_card_table)
        uidTable = self._result_setter.get_unique_identifier_table()
        if uidTable is not None:
            main_card_data.append(uidTable)
        else:
            # With no identifier table next to it, the rules table takes the
            # full width.
            main_card_table.set_table_width(100)
        main_card.set_card_data(main_card_data)
        main_card.set_card_name("Predicting Key Drivers of {}".format(
            self._colname))
        self._decisionTreeNode.add_a_card(main_card)

예제 #7

파일 보기

파일: decision_tree.py 프로젝트: Srinidhi-SA/Madvisorhadoop

    def _generate_summary(self):
        data_dict = {}
        rules_dict = self._table
        data_dict["blockSplitter"] = self._blockSplitter
        data_dict["targetcol"] = self._colname
        groups = rules_dict.keys()
        probabilityCutoff = 75
        probabilityGroups = [{
            "probability": probabilityCutoff,
            "count": 0,
            "range": [probabilityCutoff, 100]
        }, {
            "probability": probabilityCutoff - 1,
            "count": 0,
            "range": [0, probabilityCutoff - 1]
        }]
        tableArray = [[
            "Prediction Rule", "Probability", "Prediction", "Freq", "group",
            "richRules"
        ]]
        dropdownData = []
        chartDict = {}
        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName,
            "custom",
            "info",
            "Generating Prediction rules",
            self._completionStatus,
            self._completionStatus,
            display=True)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=False)

        for idx, target in enumerate(rules_dict.keys()):
            targetToDisplayInTable = target.split(":")[0].strip()
            if idx == 0:
                dropdownData.append({
                    "displayName": target,
                    "name": targetToDisplayInTable,
                    "searchTerm": targetToDisplayInTable,
                    "selected": True,
                    "id": idx + 1
                })
            else:
                dropdownData.append({
                    "displayName": target,
                    "name": targetToDisplayInTable,
                    "searchTerm": targetToDisplayInTable,
                    "selected": False,
                    "id": idx + 1
                })
            rulesArray = rules_dict[target]
            probabilityArray = [
                round(x, 2) for x in self.success_percent[target]
            ]
            groupArray = [
                "strong" if x >= probabilityCutoff else "mixed"
                for x in probabilityArray
            ]
            for idx2, obj in enumerate(probabilityGroups):
                grpCount = len([
                    x for x in probabilityArray
                    if x >= obj["range"][0] and x <= obj["range"][1]
                ])
                obj["count"] += grpCount
                probabilityGroups[idx2] = obj
            predictionArray = [targetToDisplayInTable] * len(rulesArray)
            freqArray = self.total_predictions[target]
            chartDict[target] = sum(freqArray)
            success = self.successful_predictions[target]
            success_percent = self.success_percent[target]
            richRulesArray = []
            crudeRuleArray = []
            analysisType = self._dataframe_context.get_analysis_type()
            targetCol = self._dataframe_context.get_result_column()
            binFlag = False
            if self._dataframe_context.get_custom_analysis_details() != None:
                binnedColObj = [
                    x["colName"] for x in
                    self._dataframe_context.get_custom_analysis_details()
                ]
                if binnedColObj != None and targetCol in binnedColObj:
                    binFlag = True
            for idx2, crudeRule in enumerate(rulesArray):
                richRule, crudeRule = NarrativesUtils.generate_rules(
                    self._colname,
                    target,
                    crudeRule,
                    freqArray[idx2],
                    success[idx2],
                    success_percent[idx2],
                    analysisType,
                    binFlag=binFlag)
                richRulesArray.append(richRule)
                crudeRuleArray.append(crudeRule)
            probabilityArray = map(
                lambda x: humanize.apnumber(x) + "%"
                if x >= 10 else str(int(x)) + "%", probabilityArray)
            # targetArray = zip(rulesArray,probabilityArray,predictionArray,freqArray,groupArray)
            targetArray = zip(crudeRuleArray, probabilityArray,
                              predictionArray, freqArray, groupArray,
                              richRulesArray)
            targetArray = [list(x) for x in targetArray]
            tableArray += targetArray

        donutChartMaxLevel = 10
        if len(chartDict) > donutChartMaxLevel:
            chartDict = NarrativesUtils.restructure_donut_chart_data(
                chartDict, nLevels=donutChartMaxLevel)
        chartData = NormalChartData([chartDict]).get_data()
        chartJson = ChartJson(data=chartData)
        chartJson.set_title(self._colname)
        chartJson.set_chart_type("donut")
        mainCardChart = C3ChartData(data=chartJson)
        mainCardChart.set_width_percent(45)
        # mainCardChart = {"dataType": "c3Chart","widthPercent":33 ,"data": {"data": [chartDict],"title":self._colname,"axes":{},"label_text":{},"legend":{},"yAxisNumberFormat": ".2s","types":None,"axisRotation":False, "chart_type": "donut"}}

        dropdownDict = {
            "dataType": "dropdown",
            "label": "Showing prediction rules for",
            "data": dropdownData
        }

        data_dict["probabilityGroups"] = probabilityGroups

        maincardSummary = NarrativesUtils.get_template_output(self._base_dir,\
                                                    'decisiontreesummary.html',data_dict)
        main_card = NormalCard()
        main_card_data = []
        main_card_narrative = NarrativesUtils.block_splitter(
            maincardSummary, self._blockSplitter)
        main_card_data += main_card_narrative

        main_card_data.append(mainCardChart)
        main_card_data.append(dropdownDict)

        main_card_table = TableData()
        main_card_table.set_table_data(tableArray)
        main_card_table.set_table_type("popupDecisionTreeTable")
        main_card_data.append(main_card_table)
        main_card.set_card_data(main_card_data)
        main_card.set_card_name("Predicting Key Drivers of {}".format(
            self._colname))
        self._decisionTreeNode.add_a_card(main_card)