예제 #1
0
 def generate_narratives(self):
     """Build the executive summary and store its paragraphs on the instance.

     Reads the executive-summary data from ``self._result_setter`` and the
     significant dimensions from ``self._dataframe_helper``, then adds two
     keys to the summary data before handing it to the
     ``executive_summary.html`` template:

     * ``sig_dims``   -- dimension names ordered by the absolute value of
       their score, largest first.
     * ``anova_data`` -- the summary entries of those dimensions that are
       present in the summary data, in the same order.

     The template output is passed through
     ``NarrativesUtils.paragraph_splitter`` and the result is stored in
     ``self.executive_summary``. Nothing is returned.
     """
     narrative_data_dict = self._result_setter.get_executive_summary_data()
     # The helper may report "no significant dimensions" as None (other
     # callers in this code base guard against that), so treat None like an
     # empty mapping instead of letting sorted() raise TypeError.
     significance_by_dimension = (
         self._dataframe_helper.get_significant_dimension() or {})
     sig_dims = sorted(
         significance_by_dimension,
         key=lambda name: abs(significance_by_dimension[name]),
         reverse=True)
     # Evaluated before the two keys below are added, so only entries that
     # were already in the summary data can be picked up.
     anova_data = [
         narrative_data_dict[name] for name in sig_dims
         if name in narrative_data_dict
     ]
     narrative_data_dict["sig_dims"] = sig_dims
     narrative_data_dict["anova_data"] = anova_data
     executive_summary = NarrativesUtils.get_template_output(
         self._base_dir, 'executive_summary.html', narrative_data_dict)
     self.executive_summary = NarrativesUtils.paragraph_splitter(
         executive_summary)
예제 #2
0
    @staticmethod
    def _split_levels_at_largest_gap(values, labels):
        """Rank labels by value and split the ranking at its largest drop.

        The ``(value, label)`` pairs are sorted in descending order (ties on
        the value fall back to comparing the labels). The drop between each
        pair of neighbours is measured and the ranking is cut at the first
        occurrence of the largest drop; everything ahead of the cut forms
        the "top" group. When no drop is positive (all values equal, or a
        single entry) the top group is empty.

        Args:
            values: numeric value of each label, aligned with ``labels``.
            labels: the labels being ranked.

        Returns:
            A tuple ``(top_labels, top_sum, bottom_label, bottom_value)``
            where ``top_sum`` is the sum of the values in the top group and
            the ``bottom_*`` items describe the last entry of the ranking.

        Raises:
            IndexError: if there is nothing to rank.
        """
        ranked = sorted(zip(values, labels), reverse=True)
        # gaps[k] is the drop from entry k-1 to entry k; the leading 0.0
        # keeps the positions aligned with ``ranked``.
        gaps = [0.0] + [
            ranked[pos][0] - ranked[pos + 1][0]
            for pos in range(len(ranked) - 1)
        ]
        head = ranked[:gaps.index(max(gaps))]
        top_labels = [label for _, label in head]
        top_sum = sum([value for value, _ in head])
        bottom_value, bottom_label = ranked[-1]
        return top_labels, top_sum, bottom_label, bottom_value

    def _profile_target_level(self, table, target, levels, level_counts,
                              levels_percentages, total):
        """Collect the per-level statistics of one target level.

        Args:
            table: object exposing ``get_value(target, level)``.
            target: the target level being profiled.
            levels: levels of the analysed dimension.
            level_counts: overall count of each entry of ``levels``.
            levels_percentages: overall percentage of each entry of
                ``levels``.
            total: grand total used for the overall percentage.

        Returns:
            A dict with the keys ``contributions``, ``top_dims``,
            ``top_dims_contribution``, ``bottom_dim``,
            ``bottom_dim_contribution``, ``percentages``, ``best_index``,
            ``worst_index``, ``differences``, ``shares``,
            ``best_share_index``, ``worst_share_index`` and
            ``overall_percent``. All ``*_index`` values are positions in
            ``levels``.
        """
        contributions = [table.get_value(target, level) for level in levels]
        target_total = sum(contributions)
        (top_dims, top_dims_contribution, bottom_dim,
         bottom_dim_contribution) = self._split_levels_at_largest_gap(
             contributions, levels)
        # Distribution of this target across the analysed levels.
        percentages = [
            value * 100.0 / target_total for value in contributions
        ]
        # Overall level percentage minus this target's level percentage.
        differences = [
            overall - own
            for overall, own in zip(levels_percentages, percentages)
        ]
        # Share of each analysed level that belongs to this target.
        shares = [
            value * 100.0 / count
            for value, count in zip(contributions, level_counts)
        ]
        return {
            'contributions': contributions,
            'top_dims': top_dims,
            'top_dims_contribution': top_dims_contribution,
            'bottom_dim': bottom_dim,
            'bottom_dim_contribution': bottom_dim_contribution,
            'percentages': percentages,
            'best_index': contributions.index(max(contributions)),
            'worst_index': contributions.index(min(contributions)),
            'differences': differences,
            'shares': shares,
            'best_share_index': shares.index(max(shares)),
            'worst_share_index': shares.index(min(shares)),
            'overall_percent': target_total * 100.0 / total,
        }

    def _generate_narratives_card1(self):
        """Populate ``self.card1`` for the target / analysed dimension pair.

        Rows of ``self._table`` are the target dimension and columns are the
        analysed dimension. The method picks the two most frequent target
        levels, profiles each of them across the analysed levels, feeds the
        resulting data to the ``card1.html`` template and stores the
        heading, paragraphs, an empty chart list and the table itself on
        ``self.card1`` before calling ``self.generate_card1_chart()``.

        Raises:
            ValueError: if the table has fewer than two target levels.
        """
        target_dimension = self._target_dimension
        analysed_dimension = self._analysed_dimension
        significant_variables = self._significant_variables
        table = self._table
        total = table.get_total()

        # Two most frequent target levels (ties broken by level label).
        ranked_targets = sorted(
            zip(table.get_row_total(), table.get_column_one_levels()),
            reverse=True)
        if len(ranked_targets) < 2:
            raise ValueError(
                'card1 narrative needs at least two target levels')
        top_target = ranked_targets[0][1]
        second_target = ranked_targets[1][1]

        # Overall distribution of the analysed dimension.
        levels = table.get_column_two_levels()
        level_counts = table.get_column_total()
        levels_count_sum = sum(level_counts)
        levels_percentages = [
            count * 100.0 / levels_count_sum for count in level_counts
        ]
        (top_dims, top_dims_contribution, bottom_dim,
         bottom_dim_contribution) = self._split_levels_at_largest_gap(
             level_counts, levels)

        # Only the first entry of the "best" and "worst" selections is ever
        # reported, so just the offset of the "worst" pick depends on the
        # number of levels.
        # NOTE(review): the two targets use different thresholds (4 vs 6)
        # and opposite sort directions; kept exactly as before -- confirm
        # whether that asymmetry is intended.
        top = self._profile_target_level(table, top_target, levels,
                                         level_counts, levels_percentages,
                                         total)
        ranked_top_diffs = sorted(enumerate(top['differences']),
                                  key=lambda pair: pair[1],
                                  reverse=True)
        worst_top_offset = -2 if len(ranked_top_diffs) > 4 else -1

        second = self._profile_target_level(table, second_target, levels,
                                            level_counts, levels_percentages,
                                            total)
        ranked_second_diffs = sorted(enumerate(second['differences']),
                                     key=lambda pair: pair[1])
        worst_second_offset = -2 if len(ranked_second_diffs) > 6 else -1

        top_counts = top['contributions']
        second_counts = second['contributions']
        data_dict = {
            'best_second_difference': ranked_second_diffs[0][0],
            'worst_second_difference':
            ranked_second_diffs[worst_second_offset][0],
            'best_top_difference': ranked_top_diffs[0][0],
            'worst_top_difference': ranked_top_diffs[worst_top_offset][0],
            'levels_percentages': levels_percentages,
            'top_target_percentages': top['percentages'],
            'second_target_percentages': second['percentages'],
            'levels': levels,
            'best_top_share': top['best_share_index'],
            'worst_top_share': top['worst_share_index'],
            'best_second_share': second['best_share_index'],
            'worst_second_share': second['worst_share_index'],
            'top_target_shares': top['shares'],
            'second_target_shares': second['shares'],
            'overall_second': second['overall_percent'],
            'overall_top': top['overall_percent'],
            'num_significant': len(significant_variables),
            'colname': analysed_dimension,
            'target': target_dimension,
            'top_levels': top_dims,
            'top_levels_percent': NarrativesUtils.round_number(
                top_dims_contribution * 100.0 / total),
            'bottom_level': bottom_dim,
            # NOTE(review): this is the raw count of the bottom level, not
            # a percentage of ``total`` like 'top_levels_percent' -- confirm
            # against card1.html before changing it.
            'bottom_level_percent': round(bottom_dim_contribution, 2),
            'second_target': second_target,
            'second_target_top_dims': second['top_dims'],
            'second_target_top_dims_contribution':
            second['top_dims_contribution'],
            'second_target_bottom_dim': second['bottom_dim'],
            'second_target_bottom_dim_contribution':
            second['bottom_dim_contribution'],
            'best_second_target': levels[second['best_index']],
            'best_second_target_count': second_counts[second['best_index']],
            'best_second_target_percent': round(
                second_counts[second['best_index']] * 100.0 / total, 2),
            'worst_second_target': levels[second['worst_index']],
            'worst_second_target_percent': round(
                second_counts[second['worst_index']] * 100.0 / total, 2),
            'top_target': top_target,
            'top_target_top_dims': top['top_dims'],
            'top_target_top_dims_contribution': top['top_dims_contribution'],
            'top_target_bottom_dim': top['bottom_dim'],
            'top_target_bottom_dim_contribution':
            top['bottom_dim_contribution'],
            'best_top_target': levels[top['best_index']],
            'best_top_target_count': top_counts[top['best_index']],
            'best_top_target_percent': round(
                top_counts[top['best_index']] * 100.0 / total, 2),
            'worst_top_target': levels[top['worst_index']],
            'worst_top_target_percent': round(
                top_counts[top['worst_index']] * 100.0 / total, 2),
        }

        output = NarrativesUtils.paragraph_splitter(
            NarrativesUtils.get_template_output(self._base_dir, 'card1.html',
                                                data_dict))
        self.card1[
            'heading'] = 'Relationship between ' + self._target_dimension + '  and ' + self._analysed_dimension
        self.card1['paragraphs'] = output
        self.card1['chart'] = []
        self.card1['heat_map'] = self._table
        self.generate_card1_chart()
예제 #3
0
    def __init__(self, df_helper, df_context, result_setter, spark,
                 story_narrative, meta_parser):
        self._story_narrative = story_narrative
        self._result_setter = result_setter
        self._spark = spark
        self._dataframe_helper = df_helper
        self._dataframe_context = df_context
        self._pandas_flag = df_context._pandas_flag
        self._data_frame = df_helper.get_data_frame()
        self._num_significant_digits = NarrativesUtils.get_significant_digit_settings(
            "trend")
        self._metaParser = meta_parser

        self._result_column = self._dataframe_context.get_result_column()
        self._string_columns = self._dataframe_helper.get_string_columns()
        self._timestamp_columns = self._dataframe_helper.get_timestamp_columns(
        )

        # self._selected_date_columns = None
        self._selected_date_columns = self._dataframe_context.get_selected_date_columns(
        )
        self._all_date_columns = self._dataframe_context.get_date_columns()
        self._string_columns = list(
            set(self._string_columns) - set(self._all_date_columns))

        self._dateFormatDetected = False
        self._existingDateFormat = None
        self._dateFormatConversionDict = NarrativesUtils.date_formats_mapping_dict(
        )
        self._dateColumnFormatDict = df_context.get_date_format_dict()
        if self._dataframe_context.get_requested_date_format() != None:
            self._requestedDateFormat = df_context.get_requested_date_format()
        else:
            self._requestedDateFormat = None

        self._analysistype = self._dataframe_context.get_analysis_type()
        self._trendSettings = self._dataframe_context.get_trend_settings()
        self._trendSpecificMeasure = False
        if self._trendSettings != None:
            if self._analysistype == "dimension" and self._trendSettings[
                    "name"] != "Count":
                self._trendSpecificMeasure = True
                self._analysistype = "measure"
                self._result_column = self._trendSettings["selectedMeasure"]
            elif self._analysistype == "measure" and self._trendSettings[
                    "name"] != "Count":
                self._result_column = self._trendSettings["selectedMeasure"]

        self._trend_subsection = self._result_setter.get_trend_section_name()
        self._regression_trend_card = None
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        self._highlightFlag = "|~HIGHLIGHT~|"
        self._trend_on_td_column = False
        self._number_of_dimensions_to_consider = 10

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        self._analysisName = self._dataframe_context.get_analysis_name()
        self._messageURL = self._dataframe_context.get_message_url()
        if self._analysistype == "dimension":
            self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight(
            )
            self._scriptStages = {
                "initialization": {
                    "summary": "Initialized The Frequency Narratives",
                    "weight": 0
                },
                "summarygeneration": {
                    "summary": "Summary Generation Finished",
                    "weight": 4
                },
                "completion": {
                    "summary": "Frequency Stats Narratives Done",
                    "weight": 0
                },
            }
        elif self._analysistype == "measure":
            if self._trendSpecificMeasure:
                self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight(
                )
            else:
                self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight(
                )
            self._scriptStages = {
                "trendNarrativeStart": {
                    "summary": "Started The Descriptive Stats Narratives",
                    "weight": 1
                },
                "trendNarrativeEnd": {
                    "summary": "Narratives For Descriptive Stats Finished",
                    "weight": 0
                },
            }

        self._base_dir = "/trend/"
        if self._pandas_flag and self._selected_date_columns and not self._dateColumnFormatDict and not self._timestamp_columns:
            for column in self._selected_date_columns:
                uniqueVals = self._data_frame[column].astype(
                    str).unique().tolist()
                metaHelperInstance = MetaDataHelper(self._data_frame,
                                                    self._data_frame.shape[0])
                if len(uniqueVals
                       ) > 0 and metaHelperInstance.get_datetime_format_pandas(
                           [
                               self._data_frame.sort_values(
                                   by=column, ascending=False)[column][0]
                           ]) != None:
                    dateColumnFormat = metaHelperInstance.get_datetime_format_pandas(
                        uniqueVals)
                    self._dateColumnFormatDict.update(
                        {column: dateColumnFormat})
        dateColCheck = NarrativesUtils.check_date_column_formats(self._selected_date_columns,\
                                                    self._timestamp_columns,\
                                                    self._dateColumnFormatDict,\
                                                    self._dateFormatConversionDict,
                                                    self._requestedDateFormat)
        print(dateColCheck)

        self._dateFormatDetected = dateColCheck["dateFormatDetected"]
        self._trend_on_td_column = dateColCheck["trendOnTdCol"]
        if self._dateFormatDetected:
            self._requestedDateFormat = dateColCheck["requestedDateFormat"]
            self._existingDateFormat = dateColCheck["existingDateFormat"]
            # self._date_column_suggested is the column used for trend
            self._date_column_suggested = dateColCheck["suggestedDateColumn"]
        if self._existingDateFormat:
            self._data_frame, dataRangeStats = NarrativesUtils.calculate_data_range_stats(
                self._data_frame, self._existingDateFormat,
                self._date_column_suggested, self._trend_on_td_column,
                self._pandas_flag)
            print(dataRangeStats)
            self._durationString = dataRangeStats["durationString"]
            self._duration = dataRangeStats["duration"]
            self._dataLevel = dataRangeStats["dataLevel"]
            first_date = dataRangeStats["firstDate"]
            last_date = dataRangeStats["lastDate"]

            if self._timestamp_columns != None:
                if self._selected_date_columns == None:
                    self._selected_date_columns = self._timestamp_columns
                else:
                    self._selected_date_columns += self._timestamp_columns
        if self._pandas_flag:
            pass
        else:
            if self._trend_subsection == "regression":
                if self._selected_date_columns != None:
                    if self._dateFormatDetected:
                        trend_subsection_data = self._result_setter.get_trend_section_data(
                        )
                        measure_column = trend_subsection_data[
                            "measure_column"]
                        result_column = trend_subsection_data["result_column"]
                        base_dir = trend_subsection_data["base_dir"]

                        card3heading = 'How ' + result_column + ' and ' + measure_column + ' changed over time'
                        if self._dataLevel == "day":
                            grouped_data = self._data_frame.groupBy(
                                "suggestedDate").agg({
                                    measure_column: 'sum',
                                    result_column: 'sum'
                                })
                            grouped_data = grouped_data.withColumnRenamed(
                                grouped_data.columns[-1], result_column)
                            grouped_data = grouped_data.withColumnRenamed(
                                grouped_data.columns[-2], measure_column)
                            grouped_data = grouped_data.withColumn(
                                "year_month",
                                udf(lambda x: x.strftime("%b-%y"))(
                                    "suggestedDate"))
                            grouped_data = grouped_data.orderBy(
                                "suggestedDate", ascending=True)
                            grouped_data = grouped_data.withColumnRenamed(
                                grouped_data.columns[0], "key")
                            grouped_data = grouped_data.toPandas()
                        elif self._dataLevel == "month":
                            grouped_data = self._data_frame.groupBy(
                                "year_month").agg({
                                    measure_column: 'sum',
                                    result_column: 'sum'
                                })
                            grouped_data = grouped_data.withColumnRenamed(
                                grouped_data.columns[-1], result_column)
                            grouped_data = grouped_data.withColumnRenamed(
                                grouped_data.columns[-2], measure_column)
                            grouped_data = grouped_data.withColumn(
                                "suggestedDate",
                                udf(lambda x: datetime.strptime(x, "%b-%y"))(
                                    "year_month"))
                            grouped_data = grouped_data.orderBy(
                                "suggestedDate", ascending=True)
                            grouped_data = grouped_data.withColumnRenamed(
                                "suggestedDate", "key")
                            grouped_data = grouped_data.select([
                                "key", measure_column, result_column,
                                "year_month"
                            ]).toPandas()
                            grouped_data["key"] = grouped_data[
                                "year_month"].apply(
                                    lambda x: datetime.strptime(x, "%b-%y"
                                                                ).date())

                        trend_narrative_obj = TrendNarrative(
                            self._result_column, self._date_column_suggested,
                            grouped_data, self._existingDateFormat,
                            self._requestedDateFormat, self._base_dir,
                            self._metaParser)

                        card3data = trend_narrative_obj.generate_regression_trend_data(
                            grouped_data, measure_column, result_column,
                            self._dataLevel, self._durationString)

                        card3narrative = NarrativesUtils.get_template_output(base_dir,\
                                                                        'regression_card3.html',card3data)

                        card3chart = trend_narrative_obj.generate_regression_trend_chart(
                            grouped_data, self._dataLevel)
                        card3paragraphs = NarrativesUtils.paragraph_splitter(
                            card3narrative)
                        card2 = {
                            'charts': card3chart,
                            'paragraphs': card3paragraphs,
                            'heading': card3heading
                        }
                        self.set_regression_trend_card_data(card2)
                    else:
                        print("NO DATE FORMAT DETECTED")
                else:
                    print("NO DATE COLUMNS PRESENT")

        if self._analysistype == "measure":
            self._completionStatus += old_div(
                self._scriptWeightDict[self._analysisName]["total"] *
                self._scriptStages["trendNarrativeStart"]["weight"], 10)
            progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                        "trendNarrativeStart",\
                                        "info",\
                                        self._scriptStages["trendNarrativeStart"]["summary"],\
                                        self._completionStatus,\
                                        self._completionStatus)
            CommonUtils.save_progress_message(self._messageURL,
                                              progressMessage)
            self._dataframe_context.update_completion_status(
                self._completionStatus)
            # self._startMeasureTrend = self._result_setter.get_trend_section_completion_status()
            self._startMeasureTrend = True

            if self._startMeasureTrend == True:
                self.narratives = {
                    "SectionHeading": "",
                    "card1": {},
                    "card2": {},
                    "card3": {}
                }
                if self._selected_date_columns != None:
                    if self._dateFormatDetected:
                        grouped_data = NarrativesUtils.get_grouped_data_for_trend(
                            self._data_frame, self._dataLevel,
                            self._result_column, self._analysistype,
                            self._pandas_flag)
                        if self._pandas_flag:
                            self._data_frame = self._data_frame.drop(
                                self._date_column_suggested, axis=1)
                        else:
                            self._data_frame = self._data_frame.drop(
                                self._date_column_suggested)
                        # self._data_frame = self._data_frame.withColumnRenamed("year_month", self._date_column_suggested)

                        significant_dimensions = []
                        significant_dimension_dict = df_helper.get_significant_dimension(
                        )
                        if significant_dimension_dict != {} and significant_dimension_dict != None:
                            significant_dimension_tuple = tuple(
                                significant_dimension_dict.items())
                            significant_dimension_tuple = sorted(
                                significant_dimension_tuple,
                                key=lambda x: x[1],
                                reverse=True)
                            significant_dimensions = [
                                x[0] for x in
                                significant_dimension_tuple[:self.
                                                            _number_of_dimensions_to_consider]
                            ]
                        else:
                            significant_dimensions = self._string_columns[:self
                                                                          .
                                                                          _number_of_dimensions_to_consider]
                        print("significant_dimensions", significant_dimensions)
                        trend_narrative_obj = TrendNarrative(
                            self._result_column, self._date_column_suggested,
                            grouped_data, self._existingDateFormat,
                            self._requestedDateFormat, self._base_dir,
                            self._metaParser)
                        # grouped_data.to_csv("/home/gulshan/marlabs/datasets/trend_grouped_pandas.csv",index=False)
                        dataDict = trend_narrative_obj.generateDataDict(
                            grouped_data, self._dataLevel,
                            self._durationString)
                        # # update reference time with max value
                        reference_time = dataDict["reference_time"]
                        dataDict["duration"] = self._duration
                        dataDict["dataLevel"] = self._dataLevel
                        dataDict["durationString"] = self._durationString
                        dataDict[
                            "significant_dimensions"] = significant_dimensions
                        if len(significant_dimensions) > 0:
                            if self._dataLevel == "day":
                                datetimeformat = self._existingDateFormat
                            elif self._dataLevel == "month":
                                datetimeformat = "%b-%y"
                            # xtraData = trend_narrative_obj.get_xtra_calculations(self._data_frame,grouped_data,significant_dimensions,self._date_column_suggested,self._result_column,self._existingDateFormat,reference_time,self._dataLevel, self._pandas_flag)
                            xtraData = trend_narrative_obj.get_xtra_calculations(
                                self._data_frame, grouped_data,
                                significant_dimensions,
                                self._date_column_suggested,
                                self._result_column, datetimeformat,
                                reference_time, self._dataLevel,
                                self._pandas_flag)
                            if xtraData != None:
                                dataDict.update(xtraData)
                        # print 'Trend dataDict:  %s' %(json.dumps(dataDict, indent=2))
                        self._result_setter.update_executive_summary_data(
                            dataDict)
                        dataDict.update({
                            "blockSplitter": self._blockSplitter,
                            "highlightFlag": self._highlightFlag
                        })

                        summary1 = NarrativesUtils.get_template_output(self._base_dir,\
                                                                        'measure_trend_card1.html',dataDict)
                        summary2 = NarrativesUtils.get_template_output(self._base_dir,\
                                                                        'measure_trend_card2.html',dataDict)
                        measureTrendCard = NormalCard()
                        measureTrendcard1Data = NarrativesUtils.block_splitter(
                            summary1,
                            self._blockSplitter,
                            highlightFlag=self._highlightFlag)
                        measureTrendcard2Data = NarrativesUtils.block_splitter(
                            summary2, self._blockSplitter)
                        # print measureTrendcard1Data

                        bubbledata = dataDict["bubbleData"]
                        # print bubbledata
                        card1BubbleData = "<div class='col-md-6 col-xs-12 xs-p-20'><h2 class='text-center'><span>{}</span><br/><small>{}</small></h2></div><div class='col-md-6 col-xs-12 xs-p-20'><h2 class='text-center'><span>{}</span><br/><small>{}</small></h2></div>".format(
                            bubbledata[0]["value"], bubbledata[0]["text"],
                            bubbledata[1]["value"], bubbledata[1]["text"])
                        # print card1BubbleData

                        trend_chart_data = list(
                            grouped_data[["key",
                                          "value"]].T.to_dict().values())
                        trend_chart_data = sorted(trend_chart_data,
                                                  key=lambda x: x["key"])
                        card1chartdata = {"actual": [], "predicted": []}

                        if self._dataLevel == "day":
                            card1chartdata["actual"] = [{
                                "key": str(val["key"]),
                                "value": val["value"]
                            } for val in trend_chart_data]
                        elif self._dataLevel == "month":
                            card1chartdata["actual"] = [{
                                "key":
                                val["key"].strftime("%b-%y"),
                                "value":
                                val["value"]
                            } for val in trend_chart_data]

                        if self._duration < 365:
                            prediction_window = 3
                        else:
                            prediction_window = 6
                        predicted_values = trend_narrative_obj.get_forecast_values(
                            grouped_data["value"],
                            prediction_window)[len(grouped_data["value"]):]
                        predicted_values = [
                            round(x, self._num_significant_digits)
                            for x in predicted_values
                        ]

                        forecasted_data = []
                        forecasted_data.append(card1chartdata["actual"][-1])
                        forecasted_dates = []
                        # forecast_start_time = datetime.strptime(card1chartdata["actual"][-1]["key"],"%b-%y")
                        if self._dataLevel == "month":
                            forecast_start_time = datetime.strptime(
                                card1chartdata["actual"][-1]["key"], "%b-%y")
                        elif self._dataLevel == "day":
                            try:
                                forecast_start_time = datetime.strptime(
                                    card1chartdata["actual"][-1]["key"],
                                    "%Y-%m-%d")
                            except:
                                forecast_start_time = datetime.strptime(
                                    card1chartdata["actual"][-1]["key"],
                                    '%Y-%m-%d %H:%M:%S')
                        for val in range(prediction_window):
                            if self._dataLevel == "month":
                                key = forecast_start_time + relativedelta(
                                    months=1 + val)
                                forecasted_dates.append(key)
                            elif self._dataLevel == "day":
                                key = forecast_start_time + relativedelta(
                                    days=1 + val)
                                forecasted_dates.append(key)
                        forecasted_list = list(
                            zip(forecasted_dates, predicted_values))
                        if self._dataLevel == "month":
                            forecasted_list = [{
                                "key": val[0].strftime("%b-%y"),
                                "value": val[1]
                            } for val in forecasted_list]
                        elif self._dataLevel == "day":
                            forecasted_list = [{
                                "key":
                                val[0].strftime("%Y-%m-%d"),
                                "value":
                                val[1]
                            } for val in forecasted_list]
                        forecasted_data += forecasted_list
                        card1chartdata["predicted"] = forecasted_data
                        # print json.dumps(card1chartdata,indent=2)
                        card1chartdata = ScatterChartData(data=card1chartdata)
                        chartJson = ChartJson()
                        chartJson.set_data(card1chartdata.get_data())
                        chartJson.set_label_text({
                            'x': ' ',
                            'y': 'No. of Observations'
                        })
                        chartJson.set_legend({
                            "actual": "Observed",
                            "predicted": "Forecast"
                        })
                        chartJson.set_chart_type("scatter_line")
                        chartJson.set_axes({"x": "key", "y": "value"})
                        chartJson.set_yaxis_number_format(".2f")
                        st_info = [
                            "Trend Analysis",
                            "Forecast Method : Holt Winters Method"
                        ]
                        measureTrendcard1Data.insert(
                            1, C3ChartData(data=chartJson, info=st_info))
                        measureTrendcard1Data.append(
                            HtmlData(data=card1BubbleData))
                        cardData = measureTrendcard1Data + measureTrendcard2Data
                        measureTrendCard.set_card_data(cardData)
                        measureTrendCard.set_card_name("Trend Analysis")
                        trendStoryNode = NarrativesTree(
                            "Trend", None, [], [measureTrendCard])
                        self._story_narrative.add_a_node(trendStoryNode)
                        self._result_setter.set_trend_node(trendStoryNode)

                        # prediction_data = [{"key":x["key"],"value":x["value"]} for x in trend_chart_data]
                        # last_val = prediction_data[-1]
                        # last_val.update({"predicted_value":last_val["value"]})
                        # prediction_data[-1] = last_val
                        #
                        # for val in range(prediction_window):
                        #     dataLevel = dataDict["dataLevel"]
                        #     if self._dataLevel == "month":
                        #         last_key = prediction_data[-1]["key"]
                        #         key = last_key+relativedelta(months=1)
                        #         prediction_data.append({"key":key,"predicted_value":predicted_values[val]})
                        #         forecasted_data.append({"key":key,"value":predicted_values[val]})
                        #     elif self._dataLevel == "day":
                        #         last_key = prediction_data[-1]["key"]
                        #         key = last_key+relativedelta(days=1)
                        #         prediction_data.append({"key":key,"predicted_value":predicted_values[val]})
                        # prediction_data_copy = prediction_data
                        # prediction_data = []
                        # for val in prediction_data_copy:
                        #     val["key"] = val["key"].strftime("%b-%y")
                        #     prediction_data.append(val)

                        # forecastDataDict = {"startForecast":predicted_values[0],
                        #                     "endForecast":predicted_values[prediction_window-1],
                        #                     "measure":dataDict["measure"],
                        #                     "forecast":True,
                        #                     "forecast_percentage": round((predicted_values[prediction_window-1]-predicted_values[0])/predicted_values[0],self._num_significant_digits),
                        #                     "prediction_window_text": str(prediction_window) + " months"
                        #                     }
                        #
                        # self._result_setter.update_executive_summary_data(forecastDataDict)
                        # summary3 = NarrativesUtils.get_template_output(self._base_dir,\
                        # 'trend_narrative_card3.html',forecastDataDict)
                        self._completionStatus += old_div(
                            self._scriptWeightDict[self._analysisName]["total"]
                            *
                            self._scriptStages["trendNarrativeEnd"]["weight"],
                            10)
                        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                                    "trendNarrativeEnd",\
                                                    "info",\
                                                    self._scriptStages["trendNarrativeEnd"]["summary"],\
                                                    self._completionStatus,\
                                                    self._completionStatus)
                        CommonUtils.save_progress_message(
                            self._messageURL, progressMessage)
                        self._dataframe_context.update_completion_status(
                            self._completionStatus)
                    else:
                        # self._result_setter.update_executive_summary_data({"trend_present":False})
                        print("Trend Analysis for Measure Failed")
                        print("#" * 20 + "Trend Analysis Error" + "#" * 20)
                        print(
                            "No date format for the date column %s was detected."
                            % (self._date_column_suggested))
                        print("#" * 60)
                        self._completionStatus += self._scriptWeightDict[
                            self._analysisName]["total"]
                        self._dataframe_context.update_completion_status(
                            completionStatus)
                        progressMessage = CommonUtils.create_progress_message_object("Trend","failedState","error",\
                                        "Trend Failed As "+"No Date Format For The Date Column %s Was Detected !!!" %(self._date_column_suggested),\
                                        completionStatus,completionStatus)
                        CommonUtils.save_progress_message(
                            messageURL, progressMessage)
                        self._dataframe_context.update_completion_status(
                            self._completionStatus)
                else:
                    # self._result_setter.update_executive_summary_data({"trend_present":False})
                    print("Trend Analysis for Measure Failed")
                    print("#" * 20 + "Trend Analysis Error" + "#" * 20)
                    print("No date column present for Trend Analysis.")
                    print("#" * 60)
                    self._completionStatus += self._scriptWeightDict[
                        self._analysisName]["total"]
                    self._dataframe_context.update_completion_status(
                        completionStatus)
                    progressMessage = CommonUtils.create_progress_message_object("Trend","failedState","error",\
                                    "No Date Column Present",\
                                    completionStatus,completionStatus)
                    CommonUtils.save_progress_message(messageURL,
                                                      progressMessage)
                    self._dataframe_context.update_completion_status(
                        self._completionStatus)
            else:
                print("overall Trend not Started YET")

        elif self._analysistype == "dimension":
            print("Dimension Trend Started")
            self._completionStatus += old_div(
                self._scriptWeightDict[self._analysisName]["total"] *
                self._scriptStages["initialization"]["weight"], 10)
            progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                        "initialization",\
                                        "info",\
                                        self._scriptStages["initialization"]["summary"],\
                                        self._completionStatus,\
                                        self._completionStatus)
            CommonUtils.save_progress_message(self._messageURL,
                                              progressMessage)
            self._dataframe_context.update_completion_status(
                self._completionStatus)

            self.narratives = {"card0": {}}
            if self._selected_date_columns != None:
                if self._dateFormatDetected:
                    # result_column_levels = [x[0] for x in self._data_frame.select(self._result_column).distinct().collect()]
                    try:
                        result_column_levels = self._metaParser.get_unique_level_names(
                            self._result_column)
                    except:
                        if self._pandas_flag:
                            result_column_levels = list(
                                self._data_frame[self._result_column].unique())
                        else:
                            result_column_levels = [
                                x[0] for x in self._data_frame.select(
                                    self._result_column).distinct().collect()
                            ]
                            # result_column_levels = self._data_frame.agg((F.collect_set(self._result_column).alias(self._result_column))).first().asDict()[self._result_column]

                    print("-" * 100)
                    # TODO Implement meta parser getter here
                    print(result_column_levels)
                    if self._pandas_flag:
                        level_count_df = self._data_frame[
                            self._result_column].value_counts()[0:2]
                        top2levels = list(level_count_df.index)
                    else:
                        level_count_df = self._data_frame.groupBy(
                            self._result_column).count().orderBy(
                                "count", ascending=False)
                        level_count_df_rows = level_count_df.collect()
                        top2levels = [
                            level_count_df_rows[0][0],
                            level_count_df_rows[1][0]
                        ]
                    cardData = []
                    chart_data = {}
                    cardData1 = []
                    c3_chart = {"dataType": "c3Chart", "data": {}}
                    print("#" * 40)
                    overall_count = NarrativesUtils.get_grouped_count_data_for_dimension_trend(
                        self._data_frame, self._dataLevel, self._result_column,
                        self._pandas_flag)
                    print("#" * 40)
                    for idx, level in enumerate(top2levels):
                        print("calculations in progress for the level :- ",
                              level)
                        if self._pandas_flag:
                            leveldf = self._data_frame[self._data_frame[
                                self._result_column] == level]
                        else:
                            leveldf = self._data_frame.filter(
                                col(self._result_column) == level)
                        grouped_data = NarrativesUtils.get_grouped_data_for_trend(
                            leveldf, self._dataLevel, self._result_column,
                            self._analysistype, self._pandas_flag)
                        grouped_data.rename(columns={"value": "value_count"},
                                            inplace=True)
                        grouped_data = pd.merge(grouped_data,
                                                overall_count,
                                                on='key',
                                                how='left')
                        # grouped_data["value"] = grouped_data["value_count"].apply(lambda x:round(x*100/float(self._data_frame.count()),self._num_significant_digits))
                        grouped_data["value"] = old_div(
                            grouped_data["value_count"],
                            grouped_data["totalCount"])
                        grouped_data["value"] = grouped_data["value"].apply(
                            lambda x: round(x * 100, self.
                                            _num_significant_digits))
                        if self._pandas_flag:
                            leveldf = leveldf.drop(self._date_column_suggested,
                                                   axis=1)
                            leveldf = leveldf.rename(
                                columns={
                                    "year_month": self._date_column_suggested
                                })
                            if "year_month" not in leveldf.columns:
                                leveldf["year_month"] = leveldf[
                                    self._date_column_suggested]
                            leveldf["value_col"] = 1
                        else:
                            leveldf = leveldf.drop(self._date_column_suggested)
                            leveldf = leveldf.withColumnRenamed(
                                "year_month", self._date_column_suggested)
                            if "year_month" not in leveldf.columns:
                                leveldf = leveldf.withColumn(
                                    "year_month",
                                    col(self._date_column_suggested))
                            leveldf = leveldf.withColumn('value_col', lit(1))

                        trend_narrative_obj = TrendNarrative(
                            self._result_column, self._date_column_suggested,
                            grouped_data, self._existingDateFormat,
                            self._requestedDateFormat, self._base_dir,
                            self._metaParser)
                        dataDict = trend_narrative_obj.generateDataDict(
                            grouped_data, self._dataLevel,
                            self._durationString)
                        dataDict["target_column"] = dataDict["measure"]
                        dataDict["measure"] = level
                        dataDict["duration"] = self._duration
                        dataDict["dataLevel"] = self._dataLevel
                        dataDict["durationString"] = self._durationString
                        # grouped_data.to_csv("/home/gulshan/marlabs/datasets/grouped_data"+str(idx))
                        # print json.dumps(dataDict,indent=2)
                        significant_dimensions = []
                        significant_dimension_dict = df_helper.get_chisquare_significant_dimension(
                        )
                        if significant_dimension_dict != {} and significant_dimension_dict != None:
                            significant_dimension_tuple = tuple(
                                significant_dimension_dict.items())
                            significant_dimension_tuple = sorted(
                                significant_dimension_tuple,
                                key=lambda x: x[1],
                                reverse=True)
                            significant_dimensions = [
                                x[0] for x in
                                significant_dimension_tuple[:self.
                                                            _number_of_dimensions_to_consider]
                            ]
                        else:
                            significant_dimensions = self._string_columns[:self
                                                                          .
                                                                          _number_of_dimensions_to_consider]
                        print("significant_dimensions", significant_dimensions)
                        reference_time = dataDict["reference_time"]
                        dataDict[
                            "significant_dimensions"] = significant_dimensions
                        if len(significant_dimensions) > 0:
                            st = time.time()
                            xtraData = trend_narrative_obj.get_xtra_calculations(
                                leveldf, grouped_data, significant_dimensions,
                                self._date_column_suggested, "value_col",
                                self._existingDateFormat, reference_time,
                                self._dataLevel, self._pandas_flag)
                            print("time for get_xtra_calculations",
                                  time.time() - st)
                            if xtraData != None:
                                dataDict.update(xtraData)
                        dimensionCount = trend_narrative_obj.generate_dimension_extra_narrative(
                            grouped_data, dataDict, self._dataLevel)
                        if dimensionCount != None:
                            dataDict.update(dimensionCount)

                        dataDict.update({
                            "level_index": idx,
                            "blockSplitter": self._blockSplitter,
                            "highlightFlag": self._highlightFlag
                        })

                        self._result_setter.update_executive_summary_data(
                            dataDict)
                        trendStory = NarrativesUtils.get_template_output(self._base_dir,\
                                                                        'dimension_trend.html',dataDict)
                        blocks = NarrativesUtils.block_splitter(
                            trendStory, self._blockSplitter)

                        if idx != 0:
                            cardData1 += blocks[2:]
                        else:
                            cardData1 += blocks

                        trend_chart_data = [
                            x for x in list(grouped_data[
                                ["key", "value"]].T.to_dict().values())
                            if x['key'] != None
                        ]
                        trend_chart_data = sorted(trend_chart_data,
                                                  key=lambda x: x["key"])
                        card1chartdata = trend_chart_data
                        if self._dataLevel == "day":
                            card1chartdata = [{
                                "key": str(val["key"]),
                                "value": val["value"]
                            } for val in card1chartdata]
                        elif self._dataLevel == "month":
                            card1chartdata = [{
                                "key":
                                val["key"].strftime("%b-%y"),
                                "value":
                                val["value"]
                            } for val in card1chartdata]
                        chart_data[level] = card1chartdata

                    labels = {
                        "x": "key",
                        "y": list(chart_data.keys())[0],
                        "y2": list(chart_data.keys())[1]
                    }
                    c3Chart = {
                        "data": chart_data,
                        "format": "%b-%y",
                        "label": labels,
                        "label_text": {
                            "x": "Time",
                            "y": "Percentage of " + labels["y"],
                            "y2": "Percentage of " + labels["y2"]
                        }
                    }

                    c3_chart["data"] = c3Chart
                    multiLineData = []
                    for idx in range(len(chart_data[top2levels[0]])):
                        key = chart_data[top2levels[0]][idx]["key"]
                        value = chart_data[top2levels[0]][idx]["value"]
                        try:
                            value1 = chart_data[top2levels[1]][idx]["value"]
                        except:
                            value1 = 0
                        multiLineData.append({
                            "key": key,
                            top2levels[0]: value,
                            top2levels[1]: value1
                        })
                    chartData = NormalChartData(multiLineData)
                    chartJson = ChartJson()
                    chartJson.set_data(chartData.get_data())
                    chartJson.set_label_text(c3Chart["label_text"])
                    chartJson.set_legend(c3Chart["label"])
                    chartJson.set_chart_type("line")
                    chartJson.set_yaxis_number_format(".2f")
                    chartJson.set_axes(labels)
                    st_info = [
                        "Trend Analysis",
                        "Forecast Method : Holt Winters Method"
                    ]
                    cardData1.insert(1,
                                     C3ChartData(data=chartJson, info=st_info))
                    trendCard = NormalCard(name="Trend Analysis",
                                           slug=None,
                                           cardData=cardData1)
                    trendStoryNode = NarrativesTree("Trend", None, [],
                                                    [trendCard])
                    self._story_narrative.add_a_node(trendStoryNode)
                    self._result_setter.set_trend_node(trendStoryNode)
                    self._completionStatus += old_div(
                        self._scriptWeightDict[self._analysisName]["total"] *
                        self._scriptStages["summarygeneration"]["weight"], 10)
                    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                                "summarygeneration",\
                                                "info",\
                                                self._scriptStages["summarygeneration"]["summary"],\
                                                self._completionStatus,\
                                                self._completionStatus)
                    CommonUtils.save_progress_message(self._messageURL,
                                                      progressMessage)
                    self._dataframe_context.update_completion_status(
                        self._completionStatus)

                    self._completionStatus += old_div(
                        self._scriptWeightDict[self._analysisName]["total"] *
                        self._scriptStages["completion"]["weight"], 10)
                    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                                "completion",\
                                                "info",\
                                                self._scriptStages["completion"]["summary"],\
                                                self._completionStatus,\
                                                self._completionStatus)
                    CommonUtils.save_progress_message(self._messageURL,
                                                      progressMessage)
                    self._dataframe_context.update_completion_status(
                        self._completionStatus)
                else:
                    self._result_setter.update_executive_summary_data(
                        {"trend_present": False})
                    print("Trend Analysis for Dimension Failed")
                    print("#" * 20 + "Trend Analysis Error" + "#" * 20)
                    if self._date_column_suggested:
                        print(
                            "No date format for the date column %s was detected."
                            % (self._date_column_suggested))
                    print("#" * 60)
                    self._completionStatus += self._scriptWeightDict[
                        self._analysisName]["total"]
                    self._dataframe_context.update_completion_status(
                        self._completionStatus)
                    progressMessage = CommonUtils.create_progress_message_object("Trend","failedState","error",\
                                    "Trend Failed As "+"No Date Format For The Date Column %s Was Detected !!!" %(self._date_column_suggested),\
                                    self._completionStatus,self._completionStatus)
                    CommonUtils.save_progress_message(messageURL,
                                                      progressMessage)
                    self._dataframe_context.update_completion_status(
                        self._completionStatus)

            else:
                self._result_setter.update_executive_summary_data(
                    {"trend_present": False})
                print("Trend Analysis for Dimension Failed")
                print("#" * 20 + "Trend Analysis Error" + "#" * 20)
                print("No date column present for Trend Analysis.")
                print("#" * 60)
                self._completionStatus += self._scriptWeightDict[
                    self._analysisName]["total"]
                self._dataframe_context.update_completion_status(
                    self._completionStatus)
                progressMessage = CommonUtils.create_progress_message_object("Trend","failedState","error",\
                                "No Date Column Present",\
                                self._completionStatus,self._completionStatus)
                CommonUtils.save_progress_message(messageURL, progressMessage)
                self._dataframe_context.update_completion_status(
                    self._completionStatus)
예제 #4
0
    def generate_narratives(self):
        regression_narrative_obj = LinearRegressionNarrative(
                                    self._df_regression_result,
                                    self._correlations,
                                    self._dataframe_helper,
                                    self._dataframe_context,
                                    self._metaParser,
                                    self._spark
                                    )
        main_card_data = regression_narrative_obj.generate_main_card_data()
        main_card_narrative = NarrativesUtils.get_template_output(self._base_dir,\
                                                        'regression_main_card.html',main_card_data)
        self.narratives['main_card'] = {}
        self.narratives["main_card"]['paragraphs'] = NarrativesUtils.paragraph_splitter(main_card_narrative)
        self.narratives["main_card"]['header'] = 'Key Measures that affect ' + self.result_column
        self.narratives["main_card"]['chart'] = {}
        self.narratives["main_card"]['chart']['heading'] = ''
        self.narratives["main_card"]['chart']['data'] = [[i for i,j in self._all_coeffs],
                                                         [j['coefficient'] for i,j in self._all_coeffs]]
        self.narratives["main_card"]['chart']['label'] = {'x':'Measure Name',
                                                            'y': 'Change in ' + self.result_column + ' per unit increase'}

        main_card = NormalCard()
        main_card_header = HtmlData(data = '<h3>Key Measures that affect ' + self.result_column+"</h3>")
        main_card_paragraphs = NarrativesUtils.block_splitter(main_card_narrative,self._blockSplitter)
        main_card_chart_data = [{"key":val[0],"value":val[1]} for val in zip([i for i,j in self._all_coeffs],[j['coefficient'] for i,j in self._all_coeffs])]
        main_card_chart = NormalChartData(data=main_card_chart_data)
        mainCardChartJson = ChartJson()
        mainCardChartJson.set_data(main_card_chart.get_data())
        mainCardChartJson.set_label_text({'x':'Influencing Factors','y': 'Change in ' + self.result_column + ' per unit increase'})
        mainCardChartJson.set_chart_type("bar")
        mainCardChartJson.set_axes({"x":"key","y":"value"})
        mainCardChartJson.set_yaxis_number_format(".2f")
        # st_info = ["Test : Regression","Threshold for p-value: 0.05", "Effect Size: Regression Coefficient"]
        chart_data = sorted(main_card_chart_data,key=lambda x:x["value"],reverse=True)
        statistical_info_array=[
            ("Test Type","Regression"),
            ("Effect Size","Coefficients"),
            ("Max Effect Size",chart_data[0]["key"]),
            ("Min Effect Size",chart_data[-1]["key"]),
            ]
        statistical_inferenc = ""
        if len(chart_data) == 1:
            statistical_inference = "{} is the only variable that have significant influence over {} (Target) having an \
             Effect size of {}".format(chart_data[0]["key"],self._dataframe_context.get_result_column(),round(chart_data[0]["value"],4))
        elif len(chart_data) == 2:
            statistical_inference = "There are two variables ({} and {}) that have significant influence over {} (Target) and the \
             Effect size ranges are {} and {} respectively".format(chart_data[0]["key"],chart_data[1]["key"],self._dataframe_context.get_result_column(),round(chart_data[0]["value"],4),round(chart_data[1]["value"],4))
        else:
            statistical_inference = "There are {} variables that have significant influence over {} (Target) and the \
             Effect size ranges from {} to {}".format(len(chart_data),self._dataframe_context.get_result_column(),round(chart_data[0]["value"],4),round(chart_data[-1]["value"],4))
        if statistical_inference != "":
            statistical_info_array.append(("Inference",statistical_inference))
        statistical_info_array = NarrativesUtils.statistical_info_array_formatter(statistical_info_array)
        main_card.set_card_data(data = [main_card_header]+main_card_paragraphs+[C3ChartData(data=mainCardChartJson,info=statistical_info_array)])
        main_card.set_card_name("Key Influencers")
        self._regressionNode.add_a_card(main_card)


        count = 0
        for measure_column in self.significant_measures:
            sigMeasureNode = NarrativesTree()
            sigMeasureNode.set_name(measure_column)
            measureCard1 = NormalCard()
            measureCard1.set_card_name("{}: Impact on {}".format(measure_column,self.result_column))
            measureCard1Data = []
            if self._run_dimension_level_regression:
                measureCard2 = NormalCard()
                measureCard2.set_card_name("Key Areas where it Matters")
                measureCard2Data = []

            measure_column_cards = {}
            card0 = {}
            card1data = regression_narrative_obj.generate_card1_data(measure_column)
            card1heading = "<h3>Impact of "+measure_column+" on "+self.result_column+"</h3>"
            measureCard1Header = HtmlData(data=card1heading)
            card1data.update({"blockSplitter":self._blockSplitter})
            card1narrative = NarrativesUtils.get_template_output(self._base_dir,\
                                                            'regression_card1.html',card1data)

            card1paragraphs = NarrativesUtils.block_splitter(card1narrative,self._blockSplitter)
            card0 = {"paragraphs":card1paragraphs}
            card0["charts"] = {}
            card0['charts']['chart2']={}
            # card0['charts']['chart2']['data']=card1data["chart_data"]
            # card0['charts']['chart2']['heading'] = ''
            # card0['charts']['chart2']['labels'] = {}
            card0['charts']['chart1']={}
            card0["heading"] = card1heading
            measure_column_cards['card0'] = card0

            measureCard1Header = HtmlData(data=card1heading)
            measureCard1Data += [measureCard1Header]
            measureCard1para = card1paragraphs
            measureCard1Data += measureCard1para

            if self._run_dimension_level_regression:
                print("running narratives for key area dict")
                self._dim_regression = self.run_regression_for_dimension_levels()
                card2table, card2data=regression_narrative_obj.generate_card2_data(measure_column,self._dim_regression)
                card2data.update({"blockSplitter":self._blockSplitter})
                card2narrative = NarrativesUtils.get_template_output(self._base_dir,\
                                                            'regression_card2.html',card2data)
                card2paragraphs = NarrativesUtils.block_splitter(card2narrative,self._blockSplitter)

                card1 = {'tables': card2table, 'paragraphs' : card2paragraphs,
                        'heading' : 'Key Areas where ' + measure_column + ' matters'}
                measure_column_cards['card1'] = card1

                measureCard2Data += card2paragraphs
                if "table1" in card2table:
                    table1data = regression_narrative_obj.convert_table_data(card2table["table1"])
                    card2Table1 = TableData()
                    card2Table1.set_table_data(table1data)
                    card2Table1.set_table_type("heatMap")
                    card2Table1.set_table_top_header(card2table["table1"]["heading"])
                    card2Table1Json = json.loads(CommonUtils.convert_python_object_to_json(card2Table1))
                    # measureCard2Data.insert(3,card2Table1)
                    measureCard2Data.insert(3,card2Table1Json)

                if "table2" in card2table:
                    table2data = regression_narrative_obj.convert_table_data(card2table["table2"])
                    card2Table2 = TableData()
                    card2Table2.set_table_data(table2data)
                    card2Table2.set_table_type("heatMap")
                    card2Table2.set_table_top_header(card2table["table2"]["heading"])
                    # measureCard2Data.insert(5,card2Table2)
                    card2Table2Json = json.loads(CommonUtils.convert_python_object_to_json(card2Table2))
                    # measureCard2Data.append(card2Table2)
                    measureCard2Data.append(card2Table2Json)


            # self._result_setter.set_trend_section_data({"result_column":self.result_column,
            #                                             "measure_column":measure_column,
            #                                             "base_dir":self._base_dir
            #                                             })
            # trend_narratives_obj = TimeSeriesNarrative(self._dataframe_helper, self._dataframe_context, self._result_setter, self._spark, self._story_narrative)
            # card2 =  trend_narratives_obj.get_regression_trend_card_data()
            # if card2:
            #     measure_column_cards['card2'] = card2
            #
            #
            # card3 = {}
            progressMessage = CommonUtils.create_progress_message_object(self._analysisName,"custom","info","Analyzing Key Influencers",self._completionStatus,self._completionStatus,display=True)
            CommonUtils.save_progress_message(self._messageURL,progressMessage,ignore=False)
            card4data = regression_narrative_obj.generate_card4_data(self.result_column,measure_column)
            card4data.update({"blockSplitter":self._blockSplitter})
            # card4heading = "Sensitivity Analysis: Effect of "+self.result_column+" on Segments of "+measure_column
            card4narrative = NarrativesUtils.get_template_output(self._base_dir,\
                                                                'regression_card4.html',card4data)
            card4paragraphs = NarrativesUtils.block_splitter(card4narrative,self._blockSplitter)
            # card3 = {"paragraphs":card4paragraphs}
            card0['paragraphs'] = card1paragraphs+card4paragraphs
            card4Chart = card4data["charts"]
            # st_info = ["Test : Regression", "Variables : "+ self.result_column +", "+measure_column,"Intercept : "+str(round(self._df_regression_result.get_intercept(),2)), "Regression Coefficient : "+ str(round(self._df_regression_result.get_coeff(measure_column),2))]
            statistical_info_array=[
                ("Test Type","Regression"),
                ("Coefficient",str(round(self._df_regression_result.get_coeff(measure_column),2))),
                ("P-Value","<= 0.05"),
                ("Intercept",str(round(self._df_regression_result.get_intercept(),2))),
                ("R Square ",str(round(self._df_regression_result.get_rsquare(),2))),
                ]
            inferenceTuple = ()
            coeff = self._df_regression_result.get_coeff(measure_column)
            if coeff > 0:
                inferenceTuple = ("Inference","For every additional unit of increase in {} there will be an increase of {} units in {} (target).".format(measure_column,str(round(coeff,2)),self._dataframe_context.get_result_column()))
            else:
                inferenceTuple = ("Inference","For every additional unit of decrease in {} there will be an decrease of {} units in {} (target).".format(measure_column,str(round(coeff,2)),self._dataframe_context.get_result_column()))
            if len(inferenceTuple) > 0:
                statistical_info_array.append(inferenceTuple)
            statistical_info_array = NarrativesUtils.statistical_info_array_formatter(statistical_info_array)

            card4paragraphs.insert(2,C3ChartData(data=card4Chart,info=statistical_info_array))
            measureCard1Data += card4paragraphs

            self.narratives['cards'].append(measure_column_cards)

            if count == 0:
                card4data.pop("charts")
                self._result_setter.update_executive_summary_data(card4data)
            count += 1
            measureCard1.set_card_data(measureCard1Data)
            if self._run_dimension_level_regression:
                measureCard2.set_card_data(measureCard2Data)
                sigMeasureNode.add_cards([measureCard1,measureCard2])
            sigMeasureNode.add_cards([measureCard1])
            self._regressionNode.add_a_node(sigMeasureNode)
        # self._result_setter.set_trend_section_completion_status(True)
        self._story_narrative.add_a_node(self._regressionNode)
    def chisquare_trend(self,column_name,base_dir):
        """Build trend narratives comparing ``column_name`` levels with the top target levels.

        Runs only when ``self._date_columns`` is set and a date format was
        detected.  For each of the two most frequent values of
        ``self._result_column`` it:

        * counts the matching rows per date and cross-tabulates them against
          the levels of ``column_name`` (a numeric column is first cut into
          five bins labelled "<low> to <high>"),
        * derives first-to-last percentage growth per level and overall, and
          tags each level as positive (> 3), negative (< -3) or stable,
        * picks the level with the largest total as the "top dimension" and
          collects its peak / low / streak figures,
        * renders ``chisquare_trend.html`` and appends the resulting
          paragraphs plus a chart payload to the output list.

        The collected cards are finally printed as indented JSON.

        Args:
            column_name: Column whose levels are trended against the target.
            base_dir: Not read here; the template is resolved through
                ``self._base_dir``.

        Side effects:
            For a numeric ``column_name``, ``self._data_frame`` is rebound to
            a frame in which that column is cast to double.
        """
        if self._date_columns is not None:
            if self._dateFormatDetected:
                output = []
                date_column = self._date_column_suggested
                chisquare_column = column_name
                result_column = self._result_column
                # Looked up once: the same answer is needed again for every target level,
                # and bin_dict below exists exactly when this is True.
                is_numeric = chisquare_column in self._dataframe_helper.get_numeric_columns()
                if is_numeric:
                    min_max = self._data_frame.select([FN.min(chisquare_column), FN.max(chisquare_column)]).collect()
                    minval = min_max[0][0]
                    maxval = min_max[0][1]
                    step = (maxval - minval) / 5.0
                    # Outer edges are widened to whole numbers so min and max fall inside a bin.
                    splits = [math.floor(minval), minval + step, minval + (step * 2), minval + (step * 3), minval + (step * 4), math.ceil(maxval)]
                    bucketizer = Bucketizer(splits=splits,inputCol=chisquare_column,outputCol="BINNED_COL")
                    self._data_frame = self._data_frame.withColumn(chisquare_column, self._data_frame[chisquare_column].cast(DoubleType()))
                    bucketedData = bucketizer.transform(self._data_frame)
                    # Swap the raw column for its bin index, keeping the original column name.
                    df = bucketedData.select([col for col in bucketedData.columns if col != chisquare_column])
                    df = df.withColumnRenamed("BINNED_COL",chisquare_column)
                    ranges = [str(low)+" to "+str(high) for low,high in zip(splits,splits[1:])]
                    # bin index -> human readable range label
                    bin_dict = dict(enumerate(ranges))
                else:
                    df = self._data_frame

                df = df.select([date_column,chisquare_column,result_column]).toPandas()
                df["suggestedDate"] = df[date_column].apply(lambda x: datetime.strptime(x,self._existingDateFormat))
                df["year_month"] = df["suggestedDate"].apply(lambda x:x.strftime("%b-%y"))
                result_column_count = df[result_column].value_counts()
                # value_counts() is sorted descending, so these are the two most frequent target levels.
                top2levels = result_column_count[:2].index
                for level in top2levels:
                    filtered_df = df.loc[df[result_column] == level]
                    # Row count per distinct date string for this target level.
                    grouped_result = pd.DataFrame(filtered_df[date_column].value_counts()).reset_index()
                    grouped_result.columns=[date_column,"value"]
                    grouped_result["year_month"] = grouped_result[date_column].apply(lambda x: datetime.strptime(x,self._existingDateFormat).strftime("%b-%y"))
                    crosstab_df = pd.DataFrame(pd.crosstab(filtered_df["suggestedDate"],filtered_df[chisquare_column])).reset_index()
                    if is_numeric:
                        # Replace bin indices in the header by their range labels.
                        crosstab_columns = crosstab_df.columns
                        chisquare_levels = [bin_dict[x] for x in crosstab_columns[1:]]
                        crosstab_df.columns = [crosstab_columns[0]]+chisquare_levels
                    else:
                        chisquare_levels = crosstab_df.columns[1:]

                    crosstab_df["year_month"] = crosstab_df["suggestedDate"].apply(lambda x:x.strftime("%b-%y"))
                    final_df = pd.merge(grouped_result,crosstab_df, how='outer', on=['year_month'])
                    final_df.sort_values(by="suggestedDate",ascending=True,inplace=True)
                    # Positional and label indices must coincide for the argmax/argmin lookups below.
                    final_df.reset_index(drop=True,inplace=True)
                    # NOTE(review): a zero previous value makes this division either raise or
                    # yield inf/nan depending on the scalar type - confirm the desired handling.
                    final_df["overallPerChange"] = [0]+[round((x-y)*100/float(y),self._num_significant_digits) for x,y in zip(final_df["value"].iloc[1:],final_df["value"])]

                    growth_dict = {}
                    for val in chisquare_levels:
                        first_val = final_df[val].iloc[0]
                        last_val = final_df[val].iloc[-1]
                        level_growth = round(((last_val-first_val)*100/float(first_val)),self._num_significant_digits)
                        growth_dict[val]  = {}
                        growth_dict[val]["growth"] = level_growth
                        # A level that starts from zero is treated as growing.
                        if level_growth > 3 or first_val == 0:
                            growth_dict[val]["growthType"] = "positive"
                        elif level_growth < -3:
                            growth_dict[val]["growthType"] = "negative"
                        else:
                            growth_dict[val]["growthType"] = "stable"
                        growth_dict[val]["total"] = sum(final_df[val])
                    overall_first = final_df["value"].iloc[0]
                    overall_last = final_df["value"].iloc[-1]
                    growth_dict["overall"] = {}
                    # First-to-last percentage change, same formula as the per-level growth above.
                    growth_dict["overall"]["growth"] = round(((overall_last-overall_first)*100/float(overall_first)),self._num_significant_digits)
                    data_dict = {}
                    total_tuple = [(k,v["total"]) for k,v in growth_dict.items() if k != "overall"]
                    sorted_total_tuple = sorted(total_tuple,key=lambda x:x[1],reverse=True)
                    # Level with the largest total count across the whole period.
                    top_dimension = sorted_total_tuple[0][0]
                    final_df["topDimensionPerChange"] = [0]+[round((x-y)*100/float(y),self._num_significant_digits) for x,y in zip(final_df[top_dimension].iloc[1:],final_df[top_dimension])]
                    data_dict["dimension"] = chisquare_column
                    data_dict["correlation"] = final_df["value"].corr(final_df[top_dimension])
                    data_dict["subset_increase_percent"] = growth_dict[top_dimension]["growth"]
                    data_dict["overall_increase_percent"] = growth_dict["overall"]["growth"]
                    data_dict["target"] = level
                    data_dict["top_dimension"] = top_dimension
                    overall_peak_index = np.argmax(final_df["value"])
                    overall_low_index = np.argmin(final_df["value"])
                    top_dimension_peak_index = np.argmax(final_df[top_dimension])
                    top_dimension_low_index = np.argmin(final_df[top_dimension])
                    data_dict["overallPeakValue"] = final_df["value"][overall_peak_index]
                    data_dict["overallLowestValue"] = final_df["value"][overall_low_index]
                    data_dict["overallPeakTime"] = final_df["year_month"][overall_peak_index]
                    data_dict["overallLowestTime"] = final_df["year_month"][overall_low_index]
                    data_dict["overallPeakIncrease"] = final_df["overallPerChange"][overall_peak_index]
                    data_dict["topDimensionPeakValue"] = final_df[top_dimension][top_dimension_peak_index]
                    data_dict["topDimensionLowestValue"] = final_df[top_dimension][top_dimension_low_index]
                    data_dict["topDimensionPeakTime"] = final_df["year_month"][top_dimension_peak_index]
                    data_dict["topDimensionLowestTime"] = final_df["year_month"][top_dimension_low_index]
                    data_dict["topDimensionPeakIncrease"] = final_df["topDimensionPerChange"][top_dimension_peak_index]
                    data_dict["overall_streak"] = NarrativesUtils.streak_data(final_df,overall_peak_index,overall_low_index,\
                                                    "overallPerChange","value")
                    data_dict["top_dimension_streak"] = NarrativesUtils.streak_data(final_df,top_dimension_peak_index,top_dimension_low_index,\
                                                    "topDimensionPerChange",top_dimension)
                    data_dict["num_positive_growth_dimensions"] = 0
                    data_dict["positive_growth_dimensions"] = []
                    data_dict["positive_growth_values"] = []
                    data_dict["num_negative_growth_dimensions"] = 0
                    data_dict["negative_growth_dimensions"] = []
                    data_dict["negative_growth_values"] = []
                    data_dict["num_stable_growth_dimensions"] = 0
                    data_dict["stable_growth_dimensions"] = []
                    data_dict["stable_growth_values"] = []
                    data_dict["overall_growth_rate"] = growth_dict["overall"]["growth"]
                    data_dict["total_levels"] = len(chisquare_levels)
                    # Bucket every level by the growth type assigned above.
                    for val in chisquare_levels:
                        if growth_dict[val]["growthType"] == "positive":
                            data_dict["num_positive_growth_dimensions"] += 1
                            data_dict["positive_growth_dimensions"].append(val)
                            data_dict["positive_growth_values"].append(growth_dict[val]["growth"])
                        elif growth_dict[val]["growthType"] == "negative":
                            data_dict["num_negative_growth_dimensions"] += 1
                            data_dict["negative_growth_dimensions"].append(val)
                            data_dict["negative_growth_values"].append(growth_dict[val]["growth"])
                        else:
                            data_dict["num_stable_growth_dimensions"] += 1
                            data_dict["stable_growth_dimensions"].append(val)
                            data_dict["stable_growth_values"].append(growth_dict[val]["growth"])
                    summary1 = NarrativesUtils.get_template_output(self._base_dir,\
                                                                    'chisquare_trend.html',data_dict)
                    # tolist() yields native Python numbers; numpy integer scalars would make
                    # the json.dumps call below raise TypeError.
                    chart_data = {
                        "data":[
                            ["time"]+final_df["year_month"].tolist(),
                            [result_column]+final_df["value"].tolist(),
                            [top_dimension]+final_df[top_dimension].tolist(),
                        ],
                        "header":["time",result_column,top_dimension],
                    }

                    paragraphs = NarrativesUtils.paragraph_splitter(summary1)
                    card_data = {"paragraphs":paragraphs,"chart":chart_data}
                    output.append([card_data])
                print(json.dumps(output,indent=2))