예제 #1
0
class RegressionNarrative(object):
    """Build the narrative tree (cards, charts, tables) for a linear
    regression analysis of the result (target) measure.

    The constructor does all the work as a side effect: it ranks the
    regression coefficients by absolute effect size, generates the
    narrative cards via ``generate_narratives``, attaches the resulting
    node to the result setter / story narrative, and publishes start/end
    progress messages.
    """

    def __init__(self, df_helper, df_context, result_setter, spark, df_regression_result, correlations, story_narrative, meta_parser):
        self._metaParser = meta_parser
        self._result_setter = result_setter
        self._story_narrative = story_narrative
        self._df_regression_result = df_regression_result
        self._correlations = correlations
        self._dataframe_helper = df_helper
        self._dataframe_context = df_context
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER

        self._measure_columns = self._dataframe_helper.get_numeric_columns()
        self._dimension_columns = self._dataframe_helper.get_string_columns()
        self._date_columns = self._dataframe_context.get_date_columns()
        self._uid_col = self._dataframe_context.get_uid_column()
        # Never treat the uid column or date columns as categorical predictors.
        if self._metaParser.check_column_isin_ignored_suggestion(self._uid_col):
            self._dimension_columns = list(set(self._dimension_columns) - {self._uid_col})
        if len(self._date_columns) > 0:
            self._dimension_columns = list(set(self._dimension_columns) - set(self._date_columns))
        self._spark = spark
        self.measures = []
        self.result_column = self._dataframe_helper.resultcolumn

        # Rank all coefficients by absolute effect size (largest first) and
        # keep only the measures that are significant at the 5% level.
        self.all_coefficients = self._df_regression_result.get_all_coeff()
        all_coeff = [(name, stats) for name, stats in self.all_coefficients.items()]
        all_coeff = sorted(all_coeff, key=lambda item: abs(item[1]["coefficient"]), reverse=True)
        self._all_coeffs = all_coeff
        self.significant_measures = [name for name, stats in all_coeff if stats['p_value'] <= 0.05]
        print(self.significant_measures)
        print("regression narratives started")
        # NOTE: was "...column + 'Performance Report'" -- missing separator.
        self.narratives = {"heading": self.result_column + " Performance Report",
                           "main_card": {},
                           "cards": []
                           }
        self._base_dir = "/regression/"
        self._run_dimension_level_regression = False

        self._regressionNode = NarrativesTree()

        self._completionStatus = self._dataframe_context.get_completion_status()
        self._analysisName = self._dataframe_context.get_analysis_name()
        self._messageURL = self._dataframe_context.get_message_url()
        self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight()
        self._scriptStages = {
            "regressionNarrativeStart": {
                "summary": "Started The Regression Narratives",
                "weight": 1
                },
            "regressionNarrativeEnd": {
                "summary": "Narratives For Regression Finished",
                "weight": 0
                },
            }
        self._publish_stage_progress("regressionNarrativeStart")

        self.generate_narratives()
        self._regressionNode.set_name("Influencers")
        self._result_setter.set_regression_node(self._regressionNode)

        self._publish_stage_progress("regressionNarrativeEnd")

    def _publish_stage_progress(self, stage):
        """Advance the completion status by *stage*'s weighted share of this
        analysis and publish the corresponding progress message."""
        self._completionStatus += old_div(self._scriptWeightDict[self._analysisName]["narratives"] * self._scriptStages[stage]["weight"], 10)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,
                                                                     stage,
                                                                     "info",
                                                                     self._scriptStages[stage]["summary"],
                                                                     self._completionStatus,
                                                                     self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL, progressMessage)
        self._dataframe_context.update_completion_status(self._completionStatus)

    def generate_narratives(self):
        """Build the main "Key Influencers" card plus one card per
        significant measure, and attach everything to the regression node.

        Side effects: populates ``self.narratives``, appends cards/nodes to
        ``self._regressionNode``, pushes the first measure's card4 data into
        the executive summary, and emits a progress message per measure.
        """
        regression_narrative_obj = LinearRegressionNarrative(
                                    self._df_regression_result,
                                    self._correlations,
                                    self._dataframe_helper,
                                    self._dataframe_context,
                                    self._metaParser,
                                    self._spark
                                    )
        main_card_data = regression_narrative_obj.generate_main_card_data()
        main_card_narrative = NarrativesUtils.get_template_output(self._base_dir,
                                                                  'regression_main_card.html', main_card_data)
        self.narratives['main_card'] = {}
        self.narratives["main_card"]['paragraphs'] = NarrativesUtils.paragraph_splitter(main_card_narrative)
        self.narratives["main_card"]['header'] = 'Key Measures that affect ' + self.result_column
        self.narratives["main_card"]['chart'] = {}
        self.narratives["main_card"]['chart']['heading'] = ''
        self.narratives["main_card"]['chart']['data'] = [[name for name, stats in self._all_coeffs],
                                                         [stats['coefficient'] for name, stats in self._all_coeffs]]
        self.narratives["main_card"]['chart']['label'] = {'x': 'Measure Name',
                                                          'y': 'Change in ' + self.result_column + ' per unit increase'}

        main_card = NormalCard()
        main_card_header = HtmlData(data='<h3>Key Measures that affect ' + self.result_column + "</h3>")
        main_card_paragraphs = NarrativesUtils.block_splitter(main_card_narrative, self._blockSplitter)
        main_card_chart_data = [{"key": name, "value": stats['coefficient']} for name, stats in self._all_coeffs]
        main_card_chart = NormalChartData(data=main_card_chart_data)
        mainCardChartJson = ChartJson()
        mainCardChartJson.set_data(main_card_chart.get_data())
        mainCardChartJson.set_label_text({'x': 'Influencing Factors', 'y': 'Change in ' + self.result_column + ' per unit increase'})
        mainCardChartJson.set_chart_type("bar")
        mainCardChartJson.set_axes({"x": "key", "y": "value"})
        mainCardChartJson.set_yaxis_number_format(".2f")
        chart_data = sorted(main_card_chart_data, key=lambda entry: entry["value"], reverse=True)
        statistical_info_array = [
            ("Test Type", "Regression"),
            ("Effect Size", "Coefficients"),
            ("Max Effect Size", chart_data[0]["key"]),
            ("Min Effect Size", chart_data[-1]["key"]),
            ]
        # Was a dead, misspelled assignment ("statistical_inferenc"); the
        # branches below always rebind, but initialize defensively anyway.
        statistical_inference = ""
        if len(chart_data) == 1:
            statistical_inference = "{} is the only variable that has significant influence over {} (Target) having an \
             Effect size of {}".format(chart_data[0]["key"], self._dataframe_context.get_result_column(), round(chart_data[0]["value"], 4))
        elif len(chart_data) == 2:
            statistical_inference = "There are two variables ({} and {}) that have significant influence over {} (Target) and the \
             Effect size ranges are {} and {} respectively".format(chart_data[0]["key"], chart_data[1]["key"], self._dataframe_context.get_result_column(), round(chart_data[0]["value"], 4), round(chart_data[1]["value"], 4))
        else:
            statistical_inference = "There are {} variables that have significant influence over {} (Target) and the \
             Effect size ranges from {} to {}".format(len(chart_data), self._dataframe_context.get_result_column(), round(chart_data[0]["value"], 4), round(chart_data[-1]["value"], 4))
        if statistical_inference != "":
            statistical_info_array.append(("Inference", statistical_inference))
        statistical_info_array = NarrativesUtils.statistical_info_array_formatter(statistical_info_array)
        main_card.set_card_data(data=[main_card_header] + main_card_paragraphs + [C3ChartData(data=mainCardChartJson, info=statistical_info_array)])
        main_card.set_card_name("Key Influencers")
        self._regressionNode.add_a_card(main_card)

        count = 0
        for measure_column in self.significant_measures:
            sigMeasureNode = NarrativesTree()
            sigMeasureNode.set_name(measure_column)
            measureCard1 = NormalCard()
            measureCard1.set_card_name("{}: Impact on {}".format(measure_column, self.result_column))
            measureCard1Data = []
            if self._run_dimension_level_regression:
                measureCard2 = NormalCard()
                measureCard2.set_card_name("Key Areas where it Matters")
                measureCard2Data = []

            measure_column_cards = {}
            card1data = regression_narrative_obj.generate_card1_data(measure_column)
            card1heading = "<h3>Impact of " + measure_column + " on " + self.result_column + "</h3>"
            card1data.update({"blockSplitter": self._blockSplitter})
            card1narrative = NarrativesUtils.get_template_output(self._base_dir,
                                                                 'regression_card1.html', card1data)

            card1paragraphs = NarrativesUtils.block_splitter(card1narrative, self._blockSplitter)
            card0 = {"paragraphs": card1paragraphs}
            card0["charts"] = {}
            card0['charts']['chart2'] = {}
            card0['charts']['chart1'] = {}
            card0["heading"] = card1heading
            measure_column_cards['card0'] = card0

            measureCard1Header = HtmlData(data=card1heading)
            measureCard1Data += [measureCard1Header]
            measureCard1Data += card1paragraphs

            if self._run_dimension_level_regression:
                print("running narratives for key area dict")
                self._dim_regression = self.run_regression_for_dimension_levels()
                card2table, card2data = regression_narrative_obj.generate_card2_data(measure_column, self._dim_regression)
                card2data.update({"blockSplitter": self._blockSplitter})
                card2narrative = NarrativesUtils.get_template_output(self._base_dir,
                                                                     'regression_card2.html', card2data)
                card2paragraphs = NarrativesUtils.block_splitter(card2narrative, self._blockSplitter)

                measure_column_cards['card1'] = {'tables': card2table, 'paragraphs': card2paragraphs,
                                                 'heading': 'Key Areas where ' + measure_column + ' matters'}
                measureCard2Data += card2paragraphs
                # Heat-map tables, when present, are spliced in at fixed
                # positions among the narrative paragraphs.
                if "table1" in card2table:
                    card2Table1 = TableData()
                    card2Table1.set_table_data(regression_narrative_obj.convert_table_data(card2table["table1"]))
                    card2Table1.set_table_type("heatMap")
                    card2Table1.set_table_top_header(card2table["table1"]["heading"])
                    card2Table1Json = json.loads(CommonUtils.convert_python_object_to_json(card2Table1))
                    measureCard2Data.insert(3, card2Table1Json)

                if "table2" in card2table:
                    card2Table2 = TableData()
                    card2Table2.set_table_data(regression_narrative_obj.convert_table_data(card2table["table2"]))
                    card2Table2.set_table_type("heatMap")
                    card2Table2.set_table_top_header(card2table["table2"]["heading"])
                    card2Table2Json = json.loads(CommonUtils.convert_python_object_to_json(card2Table2))
                    measureCard2Data.append(card2Table2Json)

            progressMessage = CommonUtils.create_progress_message_object(self._analysisName, "custom", "info", "Analyzing Key Influencers", self._completionStatus, self._completionStatus, display=True)
            CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=False)
            card4data = regression_narrative_obj.generate_card4_data(self.result_column, measure_column)
            card4data.update({"blockSplitter": self._blockSplitter})
            card4narrative = NarrativesUtils.get_template_output(self._base_dir,
                                                                 'regression_card4.html', card4data)
            card4paragraphs = NarrativesUtils.block_splitter(card4narrative, self._blockSplitter)
            card0['paragraphs'] = card1paragraphs + card4paragraphs
            card4Chart = card4data["charts"]
            statistical_info_array = [
                ("Test Type", "Regression"),
                ("Coefficient", str(round(self._df_regression_result.get_coeff(measure_column), 2))),
                ("P-Value", "<= 0.05"),
                ("Intercept", str(round(self._df_regression_result.get_intercept(), 2))),
                ("R Square ", str(round(self._df_regression_result.get_rsquare(), 2))),
                ]
            coeff = self._df_regression_result.get_coeff(measure_column)
            if coeff > 0:
                inferenceTuple = ("Inference", "For every additional unit of increase in {} there will be an increase of {} units in {} (target).".format(measure_column, str(round(coeff, 2)), self._dataframe_context.get_result_column()))
            else:
                # Fixed grammar: "an decrease" -> "a decrease".
                inferenceTuple = ("Inference", "For every additional unit of decrease in {} there will be a decrease of {} units in {} (target).".format(measure_column, str(round(coeff, 2)), self._dataframe_context.get_result_column()))
            statistical_info_array.append(inferenceTuple)
            statistical_info_array = NarrativesUtils.statistical_info_array_formatter(statistical_info_array)

            card4paragraphs.insert(2, C3ChartData(data=card4Chart, info=statistical_info_array))
            measureCard1Data += card4paragraphs

            self.narratives['cards'].append(measure_column_cards)

            # Only the first (strongest-effect) significant measure feeds the
            # executive summary.
            if count == 0:
                card4data.pop("charts")
                self._result_setter.update_executive_summary_data(card4data)
            count += 1
            measureCard1.set_card_data(measureCard1Data)
            # Fixed: measureCard1 was previously added twice when the
            # dimension-level branch ran (no "else" on the second add).
            if self._run_dimension_level_regression:
                measureCard2.set_card_data(measureCard2Data)
                sigMeasureNode.add_cards([measureCard1, measureCard2])
            else:
                sigMeasureNode.add_cards([measureCard1])
            self._regressionNode.add_a_node(sigMeasureNode)
        self._story_narrative.add_a_node(self._regressionNode)

    def run_regression_for_dimension_levels(self):
        """Run a separate linear regression for every level of (up to) the
        top five significant dimensions.

        Returns
        -------
        dict
            ``{dimension_column: {level: {"intercept", "rmse", "rsquare",
            "coeff"}}}``; levels whose regression fails (fit returns None)
            fall back to all-zero statistics.
        """
        print("Running regression for Dimension Levels")
        significant_dimensions = self._dataframe_helper.get_significant_dimension()
        print("significant_dimensions:", significant_dimensions)
        if significant_dimensions != {}:
            sig_dims = sorted(significant_dimensions.items(), key=lambda item: item[1], reverse=True)
            cat_columns = [name for name, score in sig_dims[:5]]
        else:
            cat_columns = self._dimension_columns[:5]
        # "Agent Name" is excluded explicitly -- presumably a
        # high-cardinality identifier-like column; TODO confirm.
        cat_columns = [name for name in cat_columns if name != "Agent Name"]
        print("Running regression for below 5 dimensions")
        print(cat_columns)
        regression_result_dimension_cols = {}
        for col in cat_columns:
            print("For Column:", col)
            column_levels = list(self._metaParser.get_unique_level_dict(col).keys())
            level_regression_result = {}
            print("No of levels in this column", len(column_levels))
            for level in column_levels:
                print("Filtering data for level:", level)
                filtered_df = self._dataframe_helper.filter_dataframe(col, level)
                result = LinearRegression(filtered_df, self._dataframe_helper, self._dataframe_context, self._metaParser, self._spark).fit(self._dataframe_context.get_result_column())
                if result is None:
                    result = {"intercept": 0.0,
                              "rmse": 0.0,
                              "rsquare": 0.0,
                              "coeff": 0.0
                              }
                else:
                    result = {"intercept": result.get_intercept(),
                              "rmse": result.get_root_mean_square_error(),
                              "rsquare": result.get_rsquare(),
                              "coeff": result.get_all_coeff()
                              }
                level_regression_result[level] = result
            regression_result_dimension_cols[col] = level_regression_result
        return regression_result_dimension_cols
예제 #2
0
class ChiSquareNarratives:
    """Build chi-square ("Association") narrative cards for every target
    dimension present in the supplied chi-square result.

    The constructor first re-bins every measure column of the data frame
    into the string ranges used by the contingency tables (so measures can
    be analysed as dimensions), then generates all narratives as a side
    effect, emitting progress messages along the way.
    """
    #@accepts(object, int, DFChiSquareResult ,ContextSetter)

    def __init__(self,
                 df_helper,
                 df_chisquare_result,
                 spark,
                 df_context,
                 data_frame,
                 story_narrative,
                 result_setter,
                 scriptWeight=None,
                 analysisName=None):
        self._story_narrative = story_narrative
        self._result_setter = result_setter
        self._data_frame = data_frame
        self._dataframe_context = df_context
        self._dataframe_helper = df_helper
        self._storyOnScoredData = self._dataframe_context.get_story_on_scored_data()
        self._measure_columns = df_helper.get_numeric_columns()
        self._df_chisquare = df_chisquare_result
        self._df_chisquare_result = df_chisquare_result.get_result()
        self.narratives = {}
        self._appid = df_context.get_app_id()
        self._chiSquareNode = NarrativesTree()
        self._chiSquareNode.set_name("Association")
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        self._noOfSigDimsToShow = GLOBALSETTINGS.CHISQUARESIGNIFICANTDIMENSIONTOSHOW
        self._base_dir = "/chisquare/"
        self._spark = spark

        # ---- Measure-to-dimension conversion: rewrite each measure column's
        # numeric values as the "low to high" range string of the contingency
        # table bin they fall into. ----
        pandas_df = self._data_frame.toPandas()
        # list() so the keys view is indexable on Python 3 as well.
        target_dimension = list(self._df_chisquare_result.keys())

        bin_data = {}
        for col in self._measure_columns:
            chisquare_result = self._df_chisquare.get_chisquare_result(target_dimension[0], col)
            bin_data[col] = chisquare_result.get_contingency_table().get_column_two_levels()

        for bin_col in bin_data:
            # Bins are applied sequentially; once a cell holds a range string
            # it no longer matches later numeric comparisons. Assumes the bin
            # ranges are disjoint -- TODO confirm.
            for split in bin_data[bin_col]:
                bounds = split.split('to')
                lower = float(bounds[0].replace(',', ''))
                upper = float(bounds[1].replace(',', ''))
                mask = (pandas_df[bin_col] >= lower) & (pandas_df[bin_col] < upper)
                # .loc avoids pandas chained-indexing assignment, which is
                # not guaranteed to write through (SettingWithCopyWarning).
                pandas_df.loc[mask, bin_col] = split

        # Everything becomes a string column in the rebuilt Spark frame.
        fields = [
            StructField(field_name, StringType(), True)
            for field_name in pandas_df.columns
        ]
        schema = StructType(fields)

        SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                            sparkSession=self._spark)
        self._data_frame = SQLctx.createDataFrame(pandas_df, schema)

        # App-specific template sub-directories.
        if self._appid is not None:
            if self._appid == "1":
                self._base_dir += "appid1/"
            elif self._appid == "2":
                self._base_dir += "appid2/"

        self._completionStatus = self._dataframe_context.get_completion_status()
        if analysisName is None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName

        self._messageURL = self._dataframe_context.get_message_url()
        if scriptWeight is None:
            self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight()
        else:
            self._scriptWeightDict = scriptWeight
        self._analysisDict = self._dataframe_context.get_analysis_dict()
        if self._analysisDict != {}:
            self._nColsToUse = self._analysisDict[self._analysisName]["noOfColumnsToUse"]
        else:
            self._nColsToUse = None

        self._scriptStages = {
            "initialization": {
                "summary": "Initialized the Frequency Narratives",
                "weight": 0
            },
            "summarygeneration": {
                "summary": "summary generation finished",
                "weight": 10
            },
            "completion": {
                "summary": "Frequency Stats Narratives done",
                "weight": 0
            },
        }
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "initialization",
            "info",
            display=False,
            weightKey="narratives")

        self._generate_narratives()

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "summarygeneration",
            "info",
            display=False,
            weightKey="narratives")

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "completion",
            "info",
            display=False,
            weightKey="narratives")

    def _generate_narratives(self):
        """Generate the main association card per target dimension, then one
        detail card/node per significant analysed dimension (via
        ChiSquareAnalysis). Significance threshold: p-value <= 0.05; effect
        size is Cramer's V."""
        for target_dimension in self._df_chisquare_result.keys():
            target_chisquare_result = self._df_chisquare_result[target_dimension]
            # list() keeps this materialized for len() and template rendering.
            analysed_variables = list(target_chisquare_result.keys())
            # Significant variables, ordered by descending effect size.
            significant_variables = [
                dim for dim in target_chisquare_result.keys()
                if target_chisquare_result[dim].get_pvalue() <= 0.05
            ]
            effect_sizes = [
                target_chisquare_result[dim].get_effect_size()
                for dim in significant_variables
            ]

            effect_size_dict = dict(zip(significant_variables, effect_sizes))
            significant_variables = [
                name
                for (size, name) in sorted(zip(effect_sizes, significant_variables),
                                           reverse=True)
            ]

            num_analysed_variables = len(analysed_variables)
            num_significant_variables = len(significant_variables)
            self.narratives['main_card'] = {}
            self.narratives['main_card'][
                'heading'] = 'Relationship between ' + target_dimension + ' and other factors'
            self.narratives['main_card']['paragraphs'] = {}
            data_dict = {
                'num_variables': num_analysed_variables,
                'num_significant_variables': num_significant_variables,
                'significant_variables': significant_variables,
                'target': target_dimension,
                'analysed_dimensions': analysed_variables,
                'blockSplitter': self._blockSplitter
            }  # feeds both paragraphs of the main card template
            paragraph = {
                'header': '',
                'content': NarrativesUtils.get_template_output(
                    self._base_dir, 'main_card.html', data_dict)
            }
            self.narratives['main_card']['paragraphs'] = [paragraph]
            self.narratives['cards'] = []
            chart = {
                'header':
                'Strength of association between ' + target_dimension +
                ' and other dimensions'
            }
            chart['data'] = effect_size_dict
            chart['label_text'] = {
                'x': 'Dimensions',
                'y': 'Effect Size (Cramers-V)'
            }

            chart_data = []
            chartDataValues = []
            for name, effect in effect_size_dict.items():
                chart_data.append({"key": name, "value": float(effect)})
                chartDataValues.append(float(effect))
            chart_data = sorted(chart_data,
                                key=lambda entry: entry["value"],
                                reverse=True)
            chart_json = ChartJson()
            chart_json.set_data(chart_data)
            chart_json.set_chart_type("bar")
            chart_json.set_label_text({
                'x': '  ',
                'y': 'Effect Size (Cramers-V)'
            })
            chart_json.set_axis_rotation(True)
            chart_json.set_axes({"x": "key", "y": "value"})
            chart_json.set_yaxis_number_format(
                NarrativesUtils.select_y_axis_format(chartDataValues))
            self.narratives['main_card']['chart'] = chart

            main_card = NormalCard()
            header = "<h3>Strength of association between " + target_dimension + " and other dimensions</h3>"
            main_card_data = [HtmlData(data=header)]
            main_card_narrative = NarrativesUtils.get_template_output(
                self._base_dir, 'main_card.html', data_dict)
            main_card_narrative = NarrativesUtils.block_splitter(
                main_card_narrative, self._blockSplitter)
            main_card_data += main_card_narrative
            if len(chart_data) > 0:
                statistical_info_array = [
                    ("Test Type", "Chi-Square"),
                    ("Effect Size", "Cramer's V"),
                    ("Max Effect Size", chart_data[0]["key"]),
                    ("Min Effect Size", chart_data[-1]["key"]),
                ]
                # Was a dead, misspelled assignment ("statistical_inferenc");
                # the branches below always rebind, but initialize anyway.
                statistical_inference = ""
                if len(chart_data) == 1:
                    statistical_inference = "{} is the only variable that has significant association with the {} (Target) having an \
                     Effect size of {}".format(
                        chart_data[0]["key"],
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["value"], 4))
                elif len(chart_data) == 2:
                    statistical_inference = "There are two variables ({} and {}) that have significant association with the {} (Target) and the \
                     Effect size ranges are {} and {} respectively".format(
                        chart_data[0]["key"], chart_data[1]["key"],
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["value"], 4),
                        round(chart_data[1]["value"], 4))
                else:
                    statistical_inference = "There are {} variables that have significant association with the {} (Target) and the \
                     Effect size ranges from {} to {}".format(
                        len(chart_data),
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["value"], 4),
                        round(chart_data[-1]["value"], 4))
                if statistical_inference != "":
                    statistical_info_array.append(
                        ("Inference", statistical_inference))
                statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                    statistical_info_array)
            else:
                statistical_info_array = []
            main_card_data.append(
                C3ChartData(data=chart_json, info=statistical_info_array))
            main_card.set_card_data(main_card_data)
            main_card.set_card_name("Key Influencers")

            if self._storyOnScoredData is not True:
                self._chiSquareNode.add_a_card(main_card)
                self._result_setter.add_a_score_chi_card(main_card)

            # print() with a single formatted argument behaves identically
            # under Python 2's print statement and Python 3's print function.
            print("target_dimension %s" % (target_dimension,))
            if self._appid == '2' and num_significant_variables > 5:
                significant_variables = significant_variables[:5]
            else:
                if self._nColsToUse is not None:
                    significant_variables = significant_variables[:self._nColsToUse]

            CommonUtils.create_update_and_save_progress_message(
                self._dataframe_context,
                self._scriptWeightDict,
                self._scriptStages,
                self._analysisName,
                "custom",
                "info",
                display=True,
                customMsg="Analyzing key drivers",
                weightKey="narratives")
            for analysed_dimension in significant_variables[:self._noOfSigDimsToShow]:
                chisquare_result = self._df_chisquare.get_chisquare_result(
                    target_dimension, analysed_dimension)
                if self._appid == '2':
                    print("APPID 2 is used")
                    card = ChiSquareAnalysis(
                        self._dataframe_context, self._dataframe_helper,
                        chisquare_result, target_dimension, analysed_dimension,
                        significant_variables, num_analysed_variables,
                        self._data_frame, self._measure_columns,
                        self._base_dir, None, target_chisquare_result)
                    self._result_setter.add_a_score_chi_card(
                        json.loads(
                            CommonUtils.convert_python_object_to_json(
                                card.get_dimension_card1())))

                elif self._appid == '1':
                    print("APPID 1 is used")
                    card = ChiSquareAnalysis(
                        self._dataframe_context, self._dataframe_helper,
                        chisquare_result, target_dimension, analysed_dimension,
                        significant_variables, num_analysed_variables,
                        self._data_frame, self._measure_columns,
                        self._base_dir, None, target_chisquare_result)
                    self._result_setter.add_a_score_chi_card(
                        json.loads(
                            CommonUtils.convert_python_object_to_json(
                                card.get_dimension_card1())))
                else:
                    target_dimension_card = ChiSquareAnalysis(
                        self._dataframe_context, self._dataframe_helper,
                        chisquare_result, target_dimension, analysed_dimension,
                        significant_variables, num_analysed_variables,
                        self._data_frame, self._measure_columns,
                        self._base_dir, None, target_chisquare_result)
                    self.narratives['cards'].append(target_dimension_card)
                    self._chiSquareNode.add_a_node(
                        target_dimension_card.get_dimension_node())
        self._story_narrative.add_a_node(self._chiSquareNode)
        self._result_setter.set_chisquare_node(self._chiSquareNode)
예제 #3
0
class ChiSquareNarratives(object):
    """Builds the "Key Drivers" narrative section from chi-square results.

    On construction this class:
      1. Bins every numeric (measure) column of the input frame into the
         level labels used by the chi-square contingency tables, so measures
         can be treated as dimensions downstream.
      2. Computes per-column feature importances with a depth-5 decision
         tree (sklearn or Spark ML, depending on ``df_context._pandas_flag``).
      3. Emits one narrative card per significant dimension via
         ``ChiSquareAnalysis`` and attaches everything to the story tree.
    """
    #@accepts(object, int, DFChiSquareResult ,ContextSetter)
    def __init__(self,
                 df_helper,
                 df_chisquare_result,
                 spark,
                 df_context,
                 data_frame,
                 story_narrative,
                 result_setter,
                 scriptWeight=None,
                 analysisName=None):
        """Wires up context objects, bins measure columns, and runs narrative generation.

        Parameters
        ----------
        df_helper : dataframe helper exposing get_numeric_columns().
        df_chisquare_result : chi-square result holder; get_result() yields
            a dict keyed by target dimension.
        spark : active SparkSession (used only when not in pandas mode).
        df_context : context object; _pandas_flag selects pandas vs Spark paths.
        data_frame : the data being narrated (pandas or Spark frame).
        story_narrative, result_setter : sinks the generated nodes/cards go to.
        scriptWeight, analysisName : optional overrides for progress tracking.
        """
        self._story_narrative = story_narrative
        self._result_setter = result_setter
        self._data_frame = data_frame
        self._dataframe_context = df_context
        self._pandas_flag = df_context._pandas_flag
        self._dataframe_helper = df_helper
        self._storyOnScoredData = self._dataframe_context.get_story_on_scored_data(
        )
        self._measure_columns = df_helper.get_numeric_columns()
        self._df_chisquare = df_chisquare_result
        self._df_chisquare_result = df_chisquare_result.get_result()
        self.narratives = {}
        self._appid = df_context.get_app_id()
        self._chiSquareNode = NarrativesTree()
        self._chiSquareNode.set_name("Key Drivers")
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        # Cap on how many significant dimensions get their own card.
        self._noOfSigDimsToShow = GLOBALSETTINGS.CHISQUARESIGNIFICANTDIMENSIONTOSHOW
        self._base_dir = "/chisquare/"
        self._spark = spark

        ############################DataFrame Measure to Dimesion Column#####################

        # Work on a pandas copy so the original frame is untouched while binning.
        if self._pandas_flag:
            pandas_df = self._data_frame.copy(deep=True)
        else:
            pandas_df = self._data_frame.toPandas()
        target_dimension = list(self._df_chisquare_result.keys())

        # For every measure column, collect the bin labels used by the
        # chi-square contingency table against the first target dimension.
        bin_data = {}
        for col in self._measure_columns:
            if self._df_chisquare.get_chisquare_result(target_dimension[0],
                                                       col):
                chisquare_result = self._df_chisquare.get_chisquare_result(
                    target_dimension[0], col)
                bin_data[col] = chisquare_result.get_contingency_table(
                ).get_column_two_levels()

        # Replace raw numeric values with their bin label. Labels are assumed
        # to look like "<low> to <high>" (possibly with thousands separators);
        # splitting on the literal substring 'to' yields the two bounds.
        # NOTE(review): this breaks if a bound's text itself contains "to" —
        # presumably the binner never produces that; confirm.
        for bin_col in list(bin_data.keys()):
            for split in bin_data[bin_col]:
                val = split.split('to')
                # pandas_df[bin_col][(float(pandas_df[bin_col])>=float(val[0].replace(',',''))) & (float(pandas_df[bin_col])<float(val[1].replace(',','')))] =  split
                row_value = list(pandas_df[bin_col])
                temp = []
                for row_value_ in row_value:
                    # Values already replaced by a label (str) pass through;
                    # numeric values in [low, high) get this bin's label.
                    if not isinstance(row_value_, str)  and  \
                      (float(row_value_) >= float(val[0].replace(',','')))   and   \
                      (float(row_value_) <  float(val[1].replace(',',''))):
                        temp.append(split)
                    else:
                        temp.append(row_value_)
                pandas_df[bin_col] = temp
        # NOTE(review): in the pandas path the binned frame is discarded (the
        # assignment below is commented out), so self._data_frame keeps raw
        # numeric measures — confirm this asymmetry with the Spark path is
        # deliberate.
        if self._pandas_flag:
            pass
            # self._data_frame = pandas_df
        else:
            # Rebuild a Spark DataFrame from the binned pandas frame; every
            # column becomes StringType since bins are text labels.
            fields = [
                StructField(field_name, StringType(), True)
                for field_name in pandas_df.columns
            ]
            schema = StructType(fields)

            SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                                sparkSession=self._spark)
            self._data_frame = SQLctx.createDataFrame(pandas_df, schema)

        # print self._data_frame
        ############################DataFrame Measure to Dimesion Column#####################

        # App-specific template sub-directories for narrative rendering.
        if self._appid != None:
            if self._appid == "1":
                self._base_dir += "appid1/"
            elif self._appid == "2":
                self._base_dir += "appid2/"

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        if analysisName == None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName

        self._messageURL = self._dataframe_context.get_message_url()
        if scriptWeight == None:
            self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight(
            )
        else:
            self._scriptWeightDict = scriptWeight
        self._analysisDict = self._dataframe_context.get_analysis_dict()
        if self._analysisDict != {}:
            self._nColsToUse = self._analysisDict[
                self._analysisName]["noOfColumnsToUse"]
        else:
            self._nColsToUse = None

        # Progress-message stages reported to the job tracker; only
        # "summarygeneration" carries weight.
        self._scriptStages = {
            "initialization": {
                "summary": "Initialized the Frequency Narratives",
                "weight": 0
            },
            "summarygeneration": {
                "summary": "Summary Generation Finished",
                "weight": 4
            },
            "completion": {
                "summary": "Frequency Stats Narratives Done",
                "weight": 0
            },
        }
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "initialization",
            "info",
            display=False,
            weightKey="narratives")
        # Feature importances drive both the effect-size chart and the
        # selection of significant variables.
        self.new_effect_size, self.signi_dict = self.feat_imp_threshold(
            target_dimension)
        self._generate_narratives()

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "summarygeneration",
            "info",
            display=False,
            weightKey="narratives")

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "completion",
            "info",
            display=False,
            weightKey="narratives")

    def feat_imp_threshold(self,
                           target_dimension,
                           dummy_Cols=True,
                           label_encoding=False):
        """Computes decision-tree feature importances for the target dimension.

        Pipeline: data validation -> preprocessing -> feature engineering
        (one-hot by default, label-encoding if requested) -> depth-5 decision
        tree (sklearn in pandas mode, Spark ML otherwise). Importances of the
        engineered columns are summed back onto the original column names.

        Returns
        -------
        tuple(dict, dict)
            (feat_imp_dict, si_var_thresh): importances per engineered
            feature, and the top categorical variables whose cumulative
            importance stays below 0.8.

        NOTE(review): the except blocks reference self.LOGGER, self.errorURL
        and self.ignoreMsg, none of which are assigned in __init__ — the
        error-reporting path would itself raise AttributeError; verify
        against the full class/subclasses.
        NOTE(review): if both dummy_Cols and label_encoding are False,
        clean_df is never bound and the fit below fails; current callers use
        the defaults, so this is latent.
        """
        # Infer problem type from the target column's dtype.
        if self._pandas_flag:
            if is_numeric_dtype(self._data_frame[target_dimension[0]]):
                self.app_type = 'regression'
            elif is_string_dtype(self._data_frame[target_dimension[0]]):
                self.app_type = 'classification'
        else:
            if self._data_frame.select(
                    target_dimension[0]).dtypes[0][1] == 'string':
                self.app_type = 'classification'
            elif self._data_frame.select(
                    target_dimension[0]).dtypes[0][1] in ['int', 'double']:
                self.app_type = 'regression'
        try:
            DataValidation_obj = DataValidation(self._data_frame,
                                                target_dimension[0],
                                                self.app_type,
                                                self._pandas_flag)
            DataValidation_obj.data_validation_run()
        except Exception as e:
            CommonUtils.print_errors_and_store_traceback(
                self.LOGGER, "datavalidation", e)
            CommonUtils.save_error_messages(self.errorURL,
                                            self.app_type,
                                            e,
                                            ignore=self.ignoreMsg)
        try:
            DataPreprocessingAutoML_obj = DataPreprocessingAutoML(
                DataValidation_obj.data_frame, DataValidation_obj.target,
                DataValidation_obj.data_change_dict,
                DataValidation_obj.numeric_cols,
                DataValidation_obj.dimension_cols,
                DataValidation_obj.datetime_cols,
                DataValidation_obj.problem_type, self._pandas_flag)
            DataPreprocessingAutoML_obj.data_preprocessing_run()
        except Exception as e:
            CommonUtils.print_errors_and_store_traceback(
                self.LOGGER, "dataPreprocessing", e)
            CommonUtils.save_error_messages(self.errorURL,
                                            self.app_type,
                                            e,
                                            ignore=self.ignoreMsg)
        preprocess_df = DataPreprocessingAutoML_obj.data_frame
        FeatureEngineeringAutoML_obj = FeatureEngineeringAutoML(
            DataPreprocessingAutoML_obj.data_frame,
            DataPreprocessingAutoML_obj.target,
            DataPreprocessingAutoML_obj.data_change_dict,
            DataPreprocessingAutoML_obj.numeric_cols,
            DataPreprocessingAutoML_obj.dimension_cols,
            DataPreprocessingAutoML_obj.datetime_cols,
            DataPreprocessingAutoML_obj.problem_type, self._pandas_flag)
        # NOTE(review): datetime_cols is presumably a list, so "!= 0" is
        # always True (even for []); "if datetime_cols:" was likely intended.
        if FeatureEngineeringAutoML_obj.datetime_cols != 0:
            FeatureEngineeringAutoML_obj.date_column_split(
                FeatureEngineeringAutoML_obj.datetime_cols)
        if dummy_Cols:
            # One-hot encode categorical columns (sklearn or pyspark variant).
            if self._pandas_flag:
                FeatureEngineeringAutoML_obj.sk_one_hot_encoding(
                    FeatureEngineeringAutoML_obj.dimension_cols)
                clean_df = FeatureEngineeringAutoML_obj.data_frame
            else:
                FeatureEngineeringAutoML_obj.pyspark_one_hot_encoding(
                    FeatureEngineeringAutoML_obj.dimension_cols)
                clean_df = FeatureEngineeringAutoML_obj.data_frame
        if label_encoding:
            if self._pandas_flag:
                # Label-encode each categorical column in place, then drop the
                # original column.
                # NOTE(review): positional axis arg drop(col, 1) was removed in
                # pandas 2.x (use axis=1 / columns=); works only on older pandas.
                for column_name in FeatureEngineeringAutoML_obj.dimension_cols:
                    preprocess_df[
                        column_name +
                        '_label_encoded'] = LabelEncoder().fit_transform(
                            preprocess_df[column_name])
                    preprocess_df = preprocess_df.drop(column_name, 1)
                clean_df = preprocess_df
            else:
                FeatureEngineeringAutoML_obj.pyspark_label_encoding(
                    FeatureEngineeringAutoML_obj.dimension_cols)
                clean_df = FeatureEngineeringAutoML_obj.data_frame
        if self._pandas_flag:
            # Fit a shallow sklearn tree on the numeric engineered features.
            # NOTE(review): same pandas-2 caveat for drop(target, 1) here.
            ind_var = clean_df.drop(target_dimension[0], 1)
            ind_var = ind_var[ind_var._get_numeric_data().columns]
            target = clean_df[target_dimension[0]]
            # NOTE(review): a classifier is used even when app_type ==
            # 'regression' — confirm regression targets are discrete enough.
            dtree = DecisionTreeClassifier(criterion='gini',
                                           max_depth=5,
                                           random_state=42)
            dtree.fit(ind_var, target)
            feat_imp_dict = {}
            for feature, importance in zip(list(ind_var.columns),
                                           dtree.feature_importances_):
                feat_imp_dict[feature] = round(importance, 2)
        else:
            # Spark ML path: assemble numeric features, index the label, fit a
            # tree, then read importances back off the feature metadata.
            num_var = [
                col[0] for col in clean_df.dtypes
                if ((col[1] == 'int') | (col[1] == 'double'))
                & (col[0] != target_dimension[0])
            ]
            num_var = [col for col in num_var if not col.endswith('indexed')]
            # Distinct-value counts determine maxBins (must exceed the largest
            # categorical cardinality).
            labels_count = [
                len(clean_df.select(col).distinct().collect())
                for col in num_var
            ]
            # labels_count = [len(clean_df.agg((F.collect_set(col).alias(col))).first().asDict()[col]) for col in num_var]
            labels_count.sort()
            max_count = labels_count[-1]
            label_indexes = StringIndexer(inputCol=target_dimension[0],
                                          outputCol='label',
                                          handleInvalid='keep')
            assembler = VectorAssembler(inputCols=num_var,
                                        outputCol="features")
            model = pysparkDecisionTreeClassifier(labelCol="label",
                                                  featuresCol="features",
                                                  seed=8464,
                                                  impurity='gini',
                                                  maxDepth=5,
                                                  maxBins=max_count + 2)
            pipe = Pipeline(stages=[assembler, label_indexes, model])
            mod_fit = pipe.fit(clean_df)
            df2 = mod_fit.transform(clean_df)
            # Flatten the ml_attr metadata (numeric + binary attrs) to map
            # vector slot index -> feature name.
            list_extract = []
            for i in df2.schema["features"].metadata["ml_attr"]["attrs"]:
                list_extract = list_extract + df2.schema["features"].metadata[
                    "ml_attr"]["attrs"][i]
            varlist = pd.DataFrame(list_extract)
            varlist['score'] = varlist['idx'].apply(
                lambda x: mod_fit.stages[-1].featureImportances[x])
            feat_imp_dict = pd.Series(varlist.score.values,
                                      index=varlist.name).to_dict()
        # Fold engineered-feature importances back onto original columns:
        # an engineered name that splits into exactly two pieces around the
        # original column name is treated as derived from it.
        # NOTE(review): substring matching — a column whose name is contained
        # in another column's name would also absorb its importances; verify.
        feat_imp_ori_dict = {}
        actual_cols = list(self._data_frame.columns)
        actual_cols.remove(target_dimension[0])
        for col in actual_cols:
            fea_imp_ori_list = []
            for col_imp in feat_imp_dict:
                temp = col_imp.split(col, -1)
                if len(temp) == 2:
                    fea_imp_ori_list.append(feat_imp_dict[col_imp])
            feat_imp_ori_dict.update({col: sum(fea_imp_ori_list)})
        sort_dict = dict(
            sorted(feat_imp_ori_dict.items(), key=lambda x: x[1],
                   reverse=True))
        # Identify categorical columns of the (untouched) original frame.
        if self._pandas_flag:
            # NOTE(review): pd.to_datetime(errors='ignore') is deprecated in
            # pandas 2.x — date-like object columns are reclassified here so
            # they drop out of cat_var.
            self._data_frame = self._data_frame.apply(
                lambda col: pd.to_datetime(col, errors='ignore')
                if col.dtypes == object else col,
                axis=0)
            cat_var = [
                key for key in dict(self._data_frame.dtypes)
                if dict(self._data_frame.dtypes)[key] in ['object']
            ]
        else:
            cat_var = [
                col[0] for col in self._data_frame.dtypes if col[1] == 'string'
            ]
        cat_var.remove(target_dimension[0])
        si_var_dict = {
            key: value
            for key, value in sort_dict.items() if key in cat_var
        }
        # Keep top categorical variables until cumulative importance hits 0.8.
        threshold = 0
        si_var_thresh = {}
        for key, value in si_var_dict.items():
            threshold = threshold + value
            if threshold < 0.8:
                si_var_thresh[key] = value
        return feat_imp_dict, si_var_thresh

    def _generate_narratives(self):
        """
        generate main card narrative and remaining cards are generated by calling ChiSquareAnalysis class for each of analyzed dimensions

        For every target dimension: builds the "Key Factors" main card (template
        narrative + feature-importance bar chart + statistical inference), then
        one drill-down card/node per significant variable. appid '1'/'2' route
        cards to the score-chi sink; otherwise cards go onto the chi-square node.
        """
        for target_dimension in list(self._df_chisquare_result.keys()):
            target_chisquare_result = self._df_chisquare_result[
                target_dimension]
            analysed_variables = list(
                target_chisquare_result.keys())  ## List of all analyzed var.
            # List of significant var out of analyzed var.
            # significant_variables = [dim for dim in list(target_chisquare_result.keys()) if target_chisquare_result[dim].get_pvalue()<=0.05]
            effect_size_dict = self.new_effect_size
            significant_variables = list(self.signi_dict.keys())
            effect_sizes = list(self.signi_dict.values())
            # Order by descending effect size, keeping only strictly positive
            # (after 2-dp rounding) contributors.
            significant_variables = [
                y
                for (x, y) in sorted(zip(effect_sizes, significant_variables),
                                     reverse=True) if round(float(x), 2) > 0
            ]
            #insignificant_variables = [i for i in self._df_chisquare_result[target_dimension] if i['pv']>0.05]

            num_analysed_variables = len(analysed_variables)
            num_significant_variables = len(significant_variables)
            self.narratives['main_card'] = {}
            self.narratives['main_card'][
                'heading'] = 'Relationship between ' + target_dimension + ' and other factors'
            self.narratives['main_card']['paragraphs'] = {}
            data_dict = {
                'num_variables': num_analysed_variables,
                'num_significant_variables': num_significant_variables,
                'significant_variables': significant_variables,
                'target': target_dimension,
                'analysed_dimensions': analysed_variables,
                'blockSplitter': self._blockSplitter
            }  # for both para 1 and para 2
            paragraph = {}
            paragraph['header'] = ''

            paragraph['content'] = NarrativesUtils.get_template_output(
                self._base_dir, 'main_card.html', data_dict)
            self.narratives['main_card']['paragraphs'] = [paragraph]
            self.narratives['cards'] = []
            chart = {
                'header':
                'Strength of association between ' + target_dimension +
                ' and other dimensions'
            }
            chart['data'] = effect_size_dict
            chart['label_text'] = {
                'x': 'Dimensions',
                'y': 'Feature Importance'
            }

            # Build the bar-chart payload, dropping near-zero importances.
            chart_data = []
            chartDataValues = []
            for k, v in list(effect_size_dict.items()):
                "rounding the chart data for keydrivers tab"
                if round(float(v), 2) > 0:
                    chart_data.append({
                        "Attribute": k,
                        "Effect_Size": round(float(v), 2)
                    })
                    chartDataValues.append(round(float(v), 2))
            chart_data = sorted(chart_data,
                                key=lambda x: x["Effect_Size"],
                                reverse=True)
            chart_json = ChartJson()
            chart_json.set_data(chart_data)
            chart_json.set_chart_type("bar")
            # chart_json.set_label_text({'x':'Dimensions','y':'Effect Size (Cramers-V)'})
            chart_json.set_label_text({'x': '  ', 'y': 'Feature Importance'})
            chart_json.set_axis_rotation(True)
            chart_json.set_axes({"x": "Attribute", "y": "Feature Importance"})
            chart_json.set_yaxis_number_format(".2f")
            # chart_json.set_yaxis_number_format(NarrativesUtils.select_y_axis_format(chartDataValues))
            self.narratives['main_card']['chart'] = chart

            main_card = NormalCard()
            header = "<h3>Key Factors that drive " + target_dimension + "</h3>"
            main_card_data = [HtmlData(data=header)]
            main_card_narrative = NarrativesUtils.get_template_output(
                self._base_dir, 'main_card.html', data_dict)
            main_card_narrative = NarrativesUtils.block_splitter(
                main_card_narrative, self._blockSplitter)
            main_card_data += main_card_narrative
            # st_info = ["Test : Chi Square", "Threshold for p-value : 0.05", "Effect Size : Cramer's V"]
            # print "chartdata",chart_data
            # Wording of the statistical inference depends on how many
            # variables made it into the chart (1 / 2 / many).
            if len(chart_data) > 0:
                statistical_info_array = [
                    ("Test Type", "Chi-Square"),
                    ("Effect Size", "Cramer's V"),
                    ("Max Effect Size", chart_data[0]["Attribute"]),
                    ("Min Effect Size", chart_data[-1]["Attribute"]),
                ]
                # NOTE(review): "statistical_inferenc" (missing final 'e') is
                # dead — every branch below assigns statistical_inference.
                statistical_inferenc = ""
                if len(chart_data) == 1:
                    statistical_inference = "{} is the only variable that have significant association with the {} (Target) having an \
                     Effect size of {}".format(
                        chart_data[0]["Attribute"],
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["Effect_Size"], 4))
                elif len(chart_data) == 2:
                    statistical_inference = "There are two variables ({} and {}) that have significant association with the {} (Target) and the \
                     Effect size ranges are {} and {} respectively".format(
                        chart_data[0]["Attribute"], chart_data[1]["Attribute"],
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["Effect_Size"], 4),
                        round(chart_data[1]["Effect_Size"], 4))
                else:
                    statistical_inference = "There are {} variables that have significant association with the {} (Target) and the \
                     Effect size ranges from {} to {}".format(
                        len(chart_data),
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["Effect_Size"], 4),
                        round(chart_data[-1]["Effect_Size"], 4))
                if statistical_inference != "":
                    statistical_info_array.append(
                        ("Inference", statistical_inference))
                statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                    statistical_info_array)
            else:
                statistical_info_array = []
            main_card_data.append(
                C3ChartData(data=chart_json, info=statistical_info_array))
            main_card.set_card_data(main_card_data)
            main_card.set_card_name("Key Influencers")

            # Scored-data stories skip the main card entirely.
            if self._storyOnScoredData != True:
                self._chiSquareNode.add_a_card(main_card)
                self._result_setter.add_a_score_chi_card(main_card)

            print("target_dimension", target_dimension)
            # NOTE(review): when _appid == '2' and >5 significant variables,
            # nColsToUse_temp is never assigned, so the slice below raises
            # UnboundLocalError on the first target dimension — confirm appid
            # '2' actually reaches this path in production.
            if self._appid == '2' and num_significant_variables > 5:
                significant_variables = significant_variables[:5]
            else:
                if self._nColsToUse != None:
                    significant_variables = significant_variables[:self.
                                                                  _nColsToUse]
                    nColsToUse_temp = self._nColsToUse
                else:
                    nColsToUse_temp = self._noOfSigDimsToShow

            CommonUtils.create_update_and_save_progress_message(
                self._dataframe_context,
                self._scriptWeightDict,
                self._scriptStages,
                self._analysisName,
                "custom",
                "info",
                display=True,
                customMsg="Analyzing key drivers",
                weightKey="narratives")
            # One drill-down analysis per significant dimension (capped).
            for analysed_dimension in significant_variables[:nColsToUse_temp]:
                chisquare_result = self._df_chisquare.get_chisquare_result(
                    target_dimension, analysed_dimension)
                if self._appid == '2':
                    print("APPID 2 is used")
                    card = ChiSquareAnalysis(
                        self._dataframe_context, self._dataframe_helper,
                        chisquare_result, target_dimension, analysed_dimension,
                        significant_variables, num_analysed_variables,
                        self._data_frame, self._measure_columns,
                        self._base_dir, None, target_chisquare_result)
                    # self.narratives['cards'].append(card)
                    self._result_setter.add_a_score_chi_card(
                        json.loads(
                            CommonUtils.convert_python_object_to_json(
                                card.get_dimension_card1())))

                elif self._appid == '1':
                    print("APPID 1 is used")
                    card = ChiSquareAnalysis(
                        self._dataframe_context, self._dataframe_helper,
                        chisquare_result, target_dimension, analysed_dimension,
                        significant_variables, num_analysed_variables,
                        self._data_frame, self._measure_columns,
                        self._base_dir, None, target_chisquare_result)
                    # self.narratives['cards'].append(card)
                    self._result_setter.add_a_score_chi_card(
                        json.loads(
                            CommonUtils.convert_python_object_to_json(
                                card.get_dimension_card1())))
                else:
                    # Default apps: the card becomes a node under "Key Drivers".
                    target_dimension_card = ChiSquareAnalysis(
                        self._dataframe_context, self._dataframe_helper,
                        chisquare_result, target_dimension, analysed_dimension,
                        significant_variables, num_analysed_variables,
                        self._data_frame, self._measure_columns,
                        self._base_dir, None, target_chisquare_result)
                    self.narratives['cards'].append(target_dimension_card)
                    self._chiSquareNode.add_a_node(
                        target_dimension_card.get_dimension_node())
        self._story_narrative.add_a_node(self._chiSquareNode)
        self._result_setter.set_chisquare_node(self._chiSquareNode)
# ----- Example #4 (scraped sample separator; original marker: "예제 #4", vote count 0) -----
        data_dict_overall["price_trend"] = stockPriceTrendArrayFormatted

        data_dict_overall["avg_sentiment_score"] = data_dict_overall["avg_sentiment_score"]/number_stocks
        data_dict_overall["stock_value_change"] = data_dict_overall["stock_value_change"]/number_stocks
        data_dict_overall["stock_percent_change"] = data_dict_overall["stock_percent_change"]/number_stocks

        data_dict_overall["number_articles_by_concept"] = self.get_number_articles_per_concept(data_dict_overall["nArticlesAndSentimentsPerConcept"])

        key, value = max(data_dict_overall["max_value_change"].iteritems(), key = lambda p: p[1])
        data_dict_overall["max_value_change_overall"] = (self.get_capitalized_name(key),value)
        key, value = min(data_dict_overall["max_value_change"].iteritems(), key = lambda p: p[1])
        data_dict_overall["min_value_change_overall"] = (self.get_capitalized_name(key),value)

        key,value = max(data_dict_overall["max_sentiment_change"].iteritems(), key = lambda p: p[1])
        data_dict_overall["max_sentiment_change_overall"] = (self.get_capitalized_name(key),value)

        # print data_dict_overall
        finalResult = NarrativesTree()
        overviewNode = NarrativesTree()
        stockNode = NarrativesTree()
        overviewNode.set_name("Overview")
        stockNode.set_name("Single Stock Analysis")
        overviewCard = MLUtils.stock_sense_overview_card(data_dict_overall)
        overviewNode.add_a_card(overviewCard)
        finalResult.add_a_node(overviewNode)
        individualStockNodes = MLUtils.stock_sense_individual_stock_cards(stockDict)
        stockNode.add_nodes(individualStockNodes)
        finalResult.add_a_node(stockNode)

        return finalResult
# ----- Example #5 (scraped sample separator; original marker: "예제 #5", vote count 0) -----
class AnovaNarratives(object):
    """Generates narrative cards for one-way ANOVA results.

    For each measure column in ``df_anova_result`` this builds an
    "Overview of Key Factors" card (significant dimensions, an effect-size
    bar chart and a statistical inference) and, for every significant
    dimension, a drill-down node produced by ``OneWayAnovaNarratives``.
    The assembled node tree is attached to the story narrative and
    registered on the result setter.
    """

    # p-value threshold below which a dimension counts as significant.
    ALPHA = 0.05

    # Keys used when assembling the ``narratives`` dictionary.
    KEY_SUMMARY = 'summary'
    KEY_NARRATIVES = 'narratives'
    KEY_TAKEAWAY = 'key_takeaway'
    DRILL_DOWN = 'drill_down_narrative'
    KEY_CARD = 'card'
    KEY_HEADING = 'heading'
    KEY_SUBHEADING = 'header'
    KEY_CHART = 'charts'
    KEY_PARAGRAPH = 'paragraphs'
    KEY_PARA_HEADER = 'header'
    KEY_PARA_CONTENT = 'content'
    KEY_BUBBLE = 'bubble_data'

    # @accepts(object, DFAnovaResult, DataFrameHelper)
    def __init__(self,
                 df_anova_result,
                 df_helper,
                 df_context,
                 result_setter,
                 story_narrative,
                 scriptWeight=None,
                 analysisName=None):
        """Build all ANOVA narratives and register them.

        Parameters
        ----------
        df_anova_result : DFAnovaResult (project type)
            Per-measure ANOVA results to narrate.
        df_helper : DataFrameHelper (project type)
            Kept on the instance for downstream use.
        df_context : dataframe context
            Supplies analysis name/dict, completion status and message URL.
        result_setter : result collector
            Receives the finished ANOVA node via ``set_anova_node``.
        story_narrative : NarrativesTree
            The story tree the ANOVA node is appended to.
        scriptWeight : dict, optional
            Progress-weight dict; defaults to the measure-analysis
            weights from ``df_context``.
        analysisName : str, optional
            Defaults to the analysis name stored in ``df_context``.
        """
        self._story_narrative = story_narrative
        self._result_setter = result_setter
        self._dataframe_context = df_context
        self._df_anova_result = df_anova_result
        self._df_helper = df_helper
        self.narratives = {}
        self.narratives['variables'] = ''
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        self._base_dir = "/anova/"

        self._analysisDict = self._dataframe_context.get_analysis_dict()

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        self._messageURL = self._dataframe_context.get_message_url()
        # PEP 8: compare against None with "is"/"is not", not "=="/"!=".
        if analysisName is None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName
        if scriptWeight is None:
            self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight(
            )
        else:
            self._scriptWeightDict = scriptWeight
        self._scriptStages = {
            "anovaNarrativeStart": {
                "summary": "Started The Anova Narratives",
                "weight": 0
            },
            "anovaNarrativeEnd": {
                "summary": "Narratives For Anova Finished",
                "weight": 10
            },
        }
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "anovaNarrativeStart",
            "info",
            display=False,
            emptyBin=False,
            customMsg=None,
            weightKey="narratives")

        self._generate_narratives()

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "anovaNarrativeEnd",
            "info",
            display=False,
            emptyBin=False,
            customMsg=None,
            weightKey="narratives")

        # Only publish the ANOVA node when at least one card was produced.
        if self._anovaNodes.get_card_count() > 0:
            self._story_narrative.add_a_node(self._anovaNodes)
            #self._generate_take_away()
            self._result_setter.set_anova_node(self._anovaNodes)

    def _generate_narratives(self):
        """Create the overview card (and drill-downs) for every measure."""
        # Optional cap on the number of significant dimensions narrated;
        # a missing or malformed analysis config means "no cap".
        try:
            nColsToUse = self._analysisDict[
                self._analysisName]["noOfColumnsToUse"]
        except (KeyError, TypeError):
            nColsToUse = None
        self._anovaNodes = NarrativesTree()
        self._anovaNodes.set_name("Performance")
        for measure_column in self._df_anova_result.get_measure_columns():
            measure_anova_result = self._df_anova_result.get_measure_result(
                measure_column)
            significant_dimensions_dict, insignificant_dimensions = measure_anova_result.get_OneWayAnovaSignificantDimensions(
            )
            # Significant dimensions ordered by decreasing effect size.
            significant_dimensions = [
                k for k, v in sorted(list(significant_dimensions_dict.items()),
                                     key=lambda x: -x[1])
            ]
            if nColsToUse is not None:
                significant_dimensions = significant_dimensions[:nColsToUse]
            num_significant_dimensions = len(significant_dimensions)
            num_insignificant_dimensions = len(insignificant_dimensions)
            print("num_significant_dimensions", num_significant_dimensions)
            if num_significant_dimensions > 0:
                mainCard = NormalCard(name="Overview of Key Factors")
                data_c3 = []
                for sig_dim in significant_dimensions:
                    data_c3.append({
                        'dimension':
                        sig_dim,
                        'effect_size':
                        float(significant_dimensions_dict[sig_dim])
                    })
                self.narratives = {}
                self.narratives[AnovaNarratives.
                                KEY_HEADING] = "%s Performance Analysis" % (
                                    measure_column, )
                self.narratives['main_card'] = {}
                self.narratives['cards'] = []
                self.narratives['main_card'][
                    AnovaNarratives.
                    KEY_SUBHEADING] = "Relationship between %s and other Dimensions" % (
                        measure_column)
                self.narratives['main_card'][
                    AnovaNarratives.KEY_PARAGRAPH] = []
                data_dict = { \
                                'significant_dimensions' : significant_dimensions,
                                'insignificant_dimensions' : insignificant_dimensions,
                                'num_significant_dimensions' : num_significant_dimensions,
                                'num_insignificant_dimensions' : num_insignificant_dimensions,
                                'num_dimensions' : num_significant_dimensions+num_insignificant_dimensions,
                                'target' : measure_column \
                            }
                output = {'header': ''}
                output['content'] = NarrativesUtils.get_template_output(
                    self._base_dir, 'anova_template_1.html', data_dict)
                self.narratives['main_card'][
                    AnovaNarratives.KEY_PARAGRAPH].append(output)
                output1 = {'header': ''}
                output1['content'] = NarrativesUtils.get_template_output(
                    self._base_dir, 'anova_template_2.html', data_dict)
                lines = []
                lines += NarrativesUtils.block_splitter(
                    output['content'], self._blockSplitter)
                data_c3 = NormalChartData(data_c3)
                chart_data = data_c3.get_data()
                # When any effect size is tiny, pass the values as strings
                # so the y-axis number-format heuristic can handle them.
                chartDataValues = []
                effect_size_values = []
                for obj in chart_data:
                    effect_size_values.append(obj["effect_size"])
                chart_data_min = min(effect_size_values)
                if chart_data_min < 0.00001:
                    for obj in chart_data:
                        chartDataValues.append(str(obj["effect_size"]))
                else:
                    for obj in chart_data:
                        chartDataValues.append(obj["effect_size"])
                chart_json = ChartJson(data=chart_data,
                                       axes={
                                           'x': 'dimension',
                                           'y': 'effect_size'
                                       },
                                       label_text={
                                           'x': '',
                                           'y':
                                           'Effect Size (scaled exp values)'
                                       },
                                       chart_type='bar')
                chart_json.set_axis_rotation(True)
                chart_json.set_yaxis_number_format(
                    NarrativesUtils.select_y_axis_format(chartDataValues))
                # chart_data is sorted by decreasing effect size, so the
                # first/last entries are the max/min effect dimensions.
                statistical_info_array = [
                    ("Test Type", "ANOVA"),
                    ("Effect Size", "ETA squared"),
                    ("Max Effect Size", chart_data[0]["dimension"]),
                    ("Min Effect Size", chart_data[-1]["dimension"]),
                ]
                # Fix: the original initialised a misspelt dead variable
                # ("statistical_inferenc"); initialise the real name.
                statistical_inference = ""
                if len(chart_data) == 1:
                    statistical_inference = "{} is the only variable that have significant association with the {} (Target) having an \
                     Effect size of {}".format(
                        chart_data[0]["dimension"],
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["effect_size"], 4))
                elif len(chart_data) == 2:
                    statistical_inference = "There are two variables ({} and {}) that have significant association with the {} (Target) and the \
                     Effect size ranges are {} and {} respectively".format(
                        chart_data[0]["dimension"], chart_data[1]["dimension"],
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["effect_size"], 4),
                        round(chart_data[1]["effect_size"], 4))
                else:
                    statistical_inference = "There are {} variables that have significant association with the {} (Target) and the \
                     Effect size ranges from {} to {}".format(
                        len(chart_data),
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["effect_size"], 4),
                        round(chart_data[-1]["effect_size"], 4))
                if statistical_inference != "":
                    statistical_info_array.append(
                        ("Inference", statistical_inference))
                statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                    statistical_info_array)
                lines += [
                    C3ChartData(data=chart_json, info=statistical_info_array)
                ]
                lines += NarrativesUtils.block_splitter(
                    output1['content'], self._blockSplitter)
                mainCard.set_card_data(lines)
                self._anovaNodes.add_a_card(mainCard)
                self.narratives['main_card'][
                    AnovaNarratives.KEY_PARAGRAPH].append(output1)
                self.narratives['main_card'][AnovaNarratives.KEY_CHART] = {}
                effect_size_chart = {
                    'heading': '',
                    'labels': {
                        'Dimension': 'Effect Size'
                    },
                    'data': significant_dimensions_dict
                }
                print(significant_dimensions_dict)
                self.narratives['main_card'][AnovaNarratives.KEY_CHART][
                    'effect_size'] = effect_size_chart
                progressMessage = CommonUtils.create_progress_message_object(
                    self._analysisName,
                    "custom",
                    "info",
                    "Analyzing Key Drivers",
                    self._completionStatus,
                    self._completionStatus,
                    display=True)
                CommonUtils.save_progress_message(self._messageURL,
                                                  progressMessage,
                                                  ignore=False)
                self._generate_dimension_narratives(significant_dimensions,
                                                    measure_anova_result,
                                                    measure_column)
            else:
                # No significant dimension: emit a plain informational card.
                mainCard = NormalCard(name="Overview of Key Factors")
                cardText = HtmlData(
                    "There are no dimensions in the dataset that have significant influence on {}"
                    .format(measure_column))
                mainCard.set_card_data([cardText])
                self._anovaNodes.add_a_card(mainCard)

    def _generate_dimension_narratives(self, significant_dimensions,
                                       measure_anova_result, measure):
        """Build one drill-down narrative node per significant dimension.

        Each ``OneWayAnovaNarratives`` instance fills its ``dimensionNode``
        (added to ``self._anovaNodes``) as a side effect of construction.
        """
        self.narratives['cards'] = []
        anova_trend_result = measure_anova_result.get_trend_data()
        if len(significant_dimensions) == 0:
            self.narratives['cards'].append({
                'card1': '',
                'card2': '',
                'card3': ''
            })
        self.narratives['variables'] = significant_dimensions
        for dimension in significant_dimensions:
            dimensionNode = NarrativesTree(name=dimension)
            narratives = OneWayAnovaNarratives(self._dataframe_context,
                                               measure, dimension,
                                               measure_anova_result,
                                               anova_trend_result,
                                               self._result_setter,
                                               dimensionNode, self._base_dir)
            self._anovaNodes.add_a_node(dimensionNode)
            self.narratives['cards'].append(narratives)