Beispiel #1
0
    def _generate_summary(self):
        """Build the descriptive-statistics summary card and attach it.

        Renders 'descr_stats_summary.html' with column counts and names taken
        from the dataframe helper, keeps the rendered text on ``self.summary``
        and adds one ``SummaryCard`` to both ``self._story_narrative`` and
        ``self._headNode``.
        """
        ignored_columns = self._dataframe_context.get_ignore_column_suggestions()
        if ignored_columns is None:
            ignored_columns = []

        # Fetch every column list once; it feeds both the names and the counts.
        helper = self._dataframe_helper
        numeric_columns = helper.get_numeric_columns()
        string_columns = helper.get_string_columns()
        timestamp_columns = helper.get_timestamp_columns()

        data_dict = {
            "n_c": helper.get_num_columns(),
            "n_m": len(numeric_columns),
            "n_d": len(string_columns),
            "n_td": len(timestamp_columns),
            "c": self._column_name,
            "d": string_columns,
            "m": numeric_columns,
            "td": timestamp_columns,
            "observations": helper.get_num_rows(),
            "ignorecolumns": ignored_columns,
            # Total is the sum of the three typed groups, not get_num_columns().
            "n_t": (len(string_columns) + len(numeric_columns)
                    + len(timestamp_columns)),
        }
        self.summary = NarrativesUtils.get_template_output(
            self._base_dir, 'descr_stats_summary.html', data_dict)

        summary_card = SummaryCard(name='Summary', slug=None, cardData=None)
        summary_card.set_no_of_measures(data_dict["n_m"])
        summary_card.set_no_of_dimensions(data_dict["n_d"])
        summary_card.set_no_of_time_dimensions(data_dict["n_td"])
        summary_card.set_summary_html(
            NarrativesUtils.block_splitter(self.summary, self._blockSplitter))

        # The same card object is registered with both containers.
        self._story_narrative.add_a_card(summary_card)
        self._headNode.add_a_card(summary_card)
 def _generate_analysis(self):
     """Render drill-down narratives for the top level of every dimension.

     For each dimension the level with the highest mean and the level with
     the highest total are looked up in that dimension's anova narrative.
     ``self.analysis[dim]`` always receives an "avg" narrative; a "sum"
     narrative is rendered only when the two levels differ, otherwise
     "sum" stays an empty string.
     """
     overall_aggregation = self.get_aggregared_count(
         self.df, self.dimension_columns)
     projected = self.df.select(
         *(self.dimension_columns + [self.measure_column]))

     for dim in self.dimension_columns:
         anova_narr = self.anova_narratives[dim]
         means = anova_narr.group_by_mean
         totals = anova_narr.group_by_total
         top_by_avg = max(means, key=means.get)
         top_by_sum = max(totals, key=totals.get)

         narratives = {"sum": "", "avg": ""}
         self.analysis[dim] = narratives
         levels_differ = top_by_avg != top_by_sum

         # The average-level context is needed in both cases.
         avg_context = self.generate_inner_data_dict(
             top_by_avg, projected.filter(projected[dim] == top_by_avg),
             dim, self.dimension_columns, overall_aggregation, anova_narr)
         avg_context["dim1"] = dim
         if levels_differ:
             # Only the two-level case passes the measure name along.
             avg_context["measure_column"] = self.measure_column
         avg_context["highest_level_by_avg"] = top_by_avg

         sum_context = None
         if levels_differ:
             sum_context = self.generate_inner_data_dict(
                 top_by_sum, projected.filter(projected[dim] == top_by_sum),
                 dim, self.dimension_columns, overall_aggregation,
                 anova_narr)
             sum_context["dim1"] = dim
             sum_context["measure_column"] = self.measure_column
             sum_context["highest_level_by_sum"] = top_by_sum

         narratives["avg"] = NarrativesUtils.get_template_output(
             self._base_dir, 'anova_drilldown_avg.html', avg_context)
         if sum_context is not None:
             # NOTE(review): the sum narrative is rendered with the *avg*
             # template, exactly as before - confirm this is intentional.
             narratives["sum"] = NarrativesUtils.get_template_output(
                 self._base_dir, 'anova_drilldown_avg.html', sum_context)
    def _generate_summary(self):
        """Build the summary card, splitting string columns by date-likeness.

        Each string column is classified as a time dimension when the sample
        data has at least one value for it and the meta-data helper detects a
        datetime format for the column's largest value; every other string
        column counts as a plain dimension. The resulting counts and names
        are rendered through 'descr_stats_summary.html', stored on
        ``self.summary`` and attached as one ``SummaryCard`` to both
        ``self._story_narrative`` and ``self._headNode``.
        """
        ignored_columns = self._dataframe_context.get_ignore_column_suggestions()
        if ignored_columns is None:
            ignored_columns = []

        metaHelperInstance = MetaDataHelper(self._data_frame, self._total_rows)
        sampleData = metaHelperInstance.get_sample_data()
        try:
            sampleData = sampleData.toPandas()
        except AttributeError:
            # No toPandas() on the sample - presumably it is already a pandas
            # frame, so it is used as-is.
            pass

        helper = self._dataframe_helper
        string_columns = helper.get_string_columns()
        numeric_columns = helper.get_numeric_columns()

        time_dimensions = []
        plain_dimensions = []
        for column in string_columns:
            if not sampleData[column].unique().tolist():
                plain_dimensions.append(column)
                continue
            if self._pandas_flag:
                # .iloc[0] is the first row *after* sorting; a plain [0] is a
                # label lookup and would return the unsorted first row.
                largest = self._data_frame[column].sort_values(
                    ascending=False).iloc[0]
                detected = metaHelperInstance.get_datetime_format_pandas(
                    [largest])
            else:
                largest = self._data_frame.orderBy(
                    [column], ascending=[False]).select(column).first()[0]
                detected = metaHelperInstance.get_datetime_format([largest])
            if detected is not None:
                time_dimensions.append(column)
            else:
                plain_dimensions.append(column)

        data_dict = {
            "n_c": helper.get_num_columns(),
            "n_m": len(numeric_columns),
            "n_d": len(plain_dimensions),
            "n_td": len(time_dimensions),
            "c": self._column_name,
            "d": plain_dimensions,
            "m": numeric_columns,
            "td": time_dimensions,
            "observations": helper.get_num_rows(),
            "ignorecolumns": ignored_columns,
            # The total still comes from the helper's own column typing, not
            # from the date-likeness split above.
            "n_t": (len(string_columns) + len(numeric_columns)
                    + len(helper.get_timestamp_columns())),
        }
        self.summary = NarrativesUtils.get_template_output(
            self._base_dir, 'descr_stats_summary.html', data_dict)

        summary_card = SummaryCard(name='Summary', slug=None, cardData=None)
        summary_card.set_no_of_measures(data_dict["n_m"])
        summary_card.set_no_of_dimensions(data_dict["n_d"])
        summary_card.set_no_of_time_dimensions(data_dict["n_td"])
        summary_card.set_summary_html(
            NarrativesUtils.block_splitter(self.summary, self._blockSplitter))
        self._story_narrative.add_a_card(summary_card)
        self._headNode.add_a_card(summary_card)
Beispiel #4
0
    def _generate_summary(self):
        """Build the dimension-report summary card and attach it.

        Renders 'dimension_report_summary.html' with column counts and names
        from the dataframe helper, splits the rendered text with
        ``self._blockSplitter`` and adds one ``SummaryCard`` (named after
        ``self.header``, card name "overall summary card") to both
        ``self._story_narrative`` and ``self._headNode``.
        """
        ignored_columns = self._dataframe_context.get_ignore_column_suggestions()
        if ignored_columns is None:
            ignored_columns = []

        # Fetch every column list once; it feeds both the names and the counts.
        helper = self._dataframe_helper
        numeric_columns = helper.get_numeric_columns()
        string_columns = helper.get_string_columns()
        timestamp_columns = helper.get_timestamp_columns()

        data_dict = {
            "n_c": len(helper.get_columns()),
            "n_m": len(numeric_columns),
            "n_d": len(string_columns),
            "n_td": len(timestamp_columns),
            "c": self._column_name,
            "d": string_columns,
            "m": numeric_columns,
            "td": timestamp_columns,
            "observations": helper.get_num_rows(),
            "ignorecolumns": ignored_columns,
            # Total is the sum of the three typed groups.
            "n_t": (len(string_columns) + len(numeric_columns)
                    + len(timestamp_columns)),
            "blockSplitter": self._blockSplitter,
        }
        rendered = NarrativesUtils.get_template_output(
            self._base_dir, 'dimension_report_summary.html', data_dict)
        summary = NarrativesUtils.block_splitter(rendered, self._blockSplitter)

        summary_card = SummaryCard(name=self.header, slug=None, cardData=None)
        summary_card.set_no_of_measures(data_dict["n_m"])
        summary_card.set_no_of_dimensions(data_dict["n_d"])
        summary_card.set_no_of_time_dimensions(data_dict["n_td"])
        summary_card.set_summary_html(summary)
        summary_card.set_card_name("overall summary card")

        # The same card object is registered with both containers.
        self._story_narrative.add_a_card(summary_card)
        self._headNode.add_a_card(summary_card)
 def _generate_analysis_para1(self):
     """Return the rendered 'distribution_narratives.html' paragraph.

     The template receives the measure's min/max (rounded to integers),
     total and mean, the outlier figures from the five-point summary, the
     numeric column list and the dataframe's row and column counts.
     """
     descr_stats = self._measure_descr_stats
     five_point = self._five_point_summary_stats
     helper = self._dataframe_helper

     template_values = dict(
         cols=helper.get_num_columns(),
         min=int(round(descr_stats.get_min(), 0)),
         max=int(round(descr_stats.get_max(), 0)),
         n=five_point.get_num_outliers(),
         l=five_point.get_left_outliers(),
         r=five_point.get_right_outliers(),
         m=helper.get_numeric_columns(),
         total=NarrativesUtils.round_number(descr_stats.get_total(), 0),
         avg=NarrativesUtils.round_number(descr_stats.get_mean(), 2),
         # "o" carries the same outlier count as "n".
         o=five_point.get_num_outliers(),
         col_name=self._column_name,
         rows=helper.get_num_rows(),
     )
     return NarrativesUtils.get_template_output(
         self._base_dir, 'distribution_narratives.html', template_values)
Beispiel #6
0
 def generate_narratives(self):
     """Render the executive summary and store its paragraphs.

     Significant dimensions are ordered by the absolute value of their
     score, largest first. The ordered names go into the summary data under
     "sig_dims"; the summary entries already present for those names go
     under "anova_data". The dict returned by the result setter is updated
     in place, then rendered through 'executive_summary.html' and split
     into paragraphs kept on ``self.executive_summary``.
     """
     summary_data = self._result_setter.get_executive_summary_data()
     significance = self._dataframe_helper.get_significant_dimension()

     ranked_dims = sorted(
         significance,
         key=lambda name: abs(significance[name]),
         reverse=True)

     # Collect the per-dimension entries before adding the two new keys so
     # the new keys can never be mistaken for dimension names.
     anova_data = [summary_data[name] for name in ranked_dims
                   if name in summary_data]
     summary_data["sig_dims"] = list(ranked_dims)
     summary_data["anova_data"] = anova_data

     rendered = NarrativesUtils.get_template_output(
         self._base_dir, 'executive_summary.html', summary_data)
     self.executive_summary = NarrativesUtils.paragraph_splitter(rendered)
 def _generate_take_away(self):
     """Return the histogram take-away narrative for the measure column.

     Finds the narrowest run of consecutive histogram buckets holding at
     least 75% of the dataframe's rows (the fullest run wins among equally
     narrow ones) and renders 'histogram_takeaway.html' with its span.

     Returns:
         The rendered template when the histogram has more than three
         buckets, otherwise the placeholder string 'Takeaway entered'.
     """
     output = 'Takeaway entered'
     histogram_buckets = self._measure_descr_stats.get_histogram()
     num_bins = len(histogram_buckets)
     if num_bins <= 3:
         # The template is only rendered for more than three buckets, so
         # skip the search (and its division by the bucket count) entirely.
         return output

     threshold = self._dataframe_helper.get_num_rows() * 0.75
     bucket_counts = [bucket['num_records'] for bucket in histogram_buckets]

     # Fallback when no run reaches the threshold: the whole histogram.
     # Using a valid last index keeps the percentage below at most 100.
     start, end = 0, num_bins - 1
     for bin_size in range(1, num_bins + 1):
         best_total = 0
         found = False
         for i in range(num_bins - bin_size + 1):
             window_total = sum(bucket_counts[i:i + bin_size])
             if window_total >= threshold and window_total > best_total:
                 best_total = window_total
                 start, end = i, i + bin_size - 1
                 found = True
         if found:
             break

     bin_size_75 = old_div((end - start + 1) * 100, num_bins)
     data_dict = {"num_bins": num_bins,
                  "seventy_five": bin_size_75,
                  "col_name": self._column_name,
                  "c_col_name": self._capitalized_column_name,
                  "skew": self._measure_descr_stats.get_skew(),
                  "start": histogram_buckets[start]['start_value'],
                  "end": histogram_buckets[end]['end_value']
                  }
     return NarrativesUtils.get_template_output(
         self._base_dir, 'histogram_takeaway.html', data_dict)
Beispiel #8
0
    def generate_top_dimension_narratives(self):
        """Render the key-factor narrative for the top level of a dimension.

        Takes the top three significant dimensions of the top-level anova
        result, computes each one's level contributions (plus the rounded sum
        of the first three), renders the matching anova template and stores
        the result as block-split lines on ``self._anovaCard1`` and as a
        header/content paragraph on ``self.card1``.
        """
        topLevelAnova = self._measure_anova_result.get_topLevelDfAnovaResult(
            self._dimension_column)
        top_level = topLevelAnova.get_top_level_name()
        # Each entry is a tuple of (dimension name, anovaResult, effect_size).
        top_level_sig_dimensions = topLevelAnova.get_top_significant_dimensions(
            3)
        significant_dimensions = [x[0] for x in top_level_sig_dimensions]
        print(significant_dimensions)

        contributorDict = {}
        for obj in top_level_sig_dimensions:
            levelContribution = self.compute_level_contributions(
                obj[1].get_level_dataframe())
            totalCont = round(np.sum([c[1] for c in levelContribution[:3]]), 2)
            contributorDict[obj[0]] = {
                "level": levelContribution,
                "total": totalCont,
            }
        print(contributorDict)

        print("data dict started")
        data_dict = {
            'sig_dims': significant_dimensions,
            'num_sig_dims': len(significant_dimensions),
            'contributorDict': contributorDict,
            'target': self._measure_column,
            'dimension': self._dimension_column,
            'top_level': top_level,
            'highlightFlag': self._highlightFlag,
            'blockSplitter': self._blockSplitter
        }

        # A binned analysed column gets its own template and a header that
        # also names the dimension.
        if self._binAnalyzedCol == True:
            header = ('Key Factors influencing ' + self._measure_column +
                      ' from ' + self._dimension_column + ' - ' + top_level)
            template_name = 'anova_template_4_binned_IV.html'
        else:
            header = ('Key Factors influencing ' + self._measure_column +
                      ' from ' + top_level)
            template_name = 'anova_template_4.html'

        output = {
            'header': header,
            'content': [
                NarrativesUtils.get_template_output(
                    self._base_dir, template_name, data_dict)
            ]
        }

        lines = []
        lines += NarrativesUtils.block_splitter(
            '<h4>' + output['header'] + '</h4>', self._blockSplitter)
        for cnt in output['content']:
            lines += NarrativesUtils.block_splitter(
                cnt, self._blockSplitter, highlightFlag=self._highlightFlag)
        self._anovaCard1.add_card_data(lines)
        self.card1.add_paragraph(dict(output))
 def _generate_narratives(self):
     """Build the "Performance" narrative tree from the anova results.

     For every measure column the significant dimensions are ordered by
     descending effect size and optionally truncated to the configured
     "noOfColumnsToUse". When at least one remains, an overview card is
     built (intro text, effect-size bar chart with statistical info,
     follow-up text), ``self.narratives`` is reset and filled for that
     measure, a progress message is saved and the per-dimension narratives
     are generated. Otherwise a card stating that no dimension is
     significant is added. Cards are collected on ``self._anovaNodes``.
     """
     try:
         nColsToUse = self._analysisDict[
             self._analysisName]["noOfColumnsToUse"]
     except Exception:
         # Best-effort lookup: any missing or malformed setting means
         # "no limit on the number of dimensions".
         nColsToUse = None
     self._anovaNodes = NarrativesTree()
     self._anovaNodes.set_name("Performance")
     for measure_column in self._df_anova_result.get_measure_columns():
         measure_anova_result = self._df_anova_result.get_measure_result(
             measure_column)
         significant_dimensions_dict, insignificant_dimensions = \
             measure_anova_result.get_OneWayAnovaSignificantDimensions()
         # Dimension names ordered by descending effect size.
         significant_dimensions = [
             k for k, v in sorted(list(significant_dimensions_dict.items()),
                                  key=lambda x: -x[1])
         ]
         if nColsToUse is not None:
             significant_dimensions = significant_dimensions[:nColsToUse]
         num_significant_dimensions = len(significant_dimensions)
         num_insignificant_dimensions = len(insignificant_dimensions)
         print("num_significant_dimensions", num_significant_dimensions)
         if num_significant_dimensions > 0:
             mainCard = NormalCard(name="Overview of Key Factors")
             data_c3 = [{
                 'dimension': sig_dim,
                 'effect_size': float(significant_dimensions_dict[sig_dim])
             } for sig_dim in significant_dimensions]
             # self.narratives is rebuilt from scratch for each measure.
             self.narratives = {}
             self.narratives[AnovaNarratives.
                             KEY_HEADING] = "%s Performance Analysis" % (
                                 measure_column, )
             self.narratives['main_card'] = {}
             self.narratives['cards'] = []
             self.narratives['main_card'][
                 AnovaNarratives.
                 KEY_SUBHEADING] = "Relationship between %s and other Dimensions" % (
                     measure_column)
             self.narratives['main_card'][
                 AnovaNarratives.KEY_PARAGRAPH] = []
             data_dict = {
                 'significant_dimensions': significant_dimensions,
                 'insignificant_dimensions': insignificant_dimensions,
                 'num_significant_dimensions': num_significant_dimensions,
                 'num_insignificant_dimensions': num_insignificant_dimensions,
                 'num_dimensions': num_significant_dimensions + num_insignificant_dimensions,
                 'target': measure_column
             }
             output = {'header': ''}
             output['content'] = NarrativesUtils.get_template_output(
                 self._base_dir, 'anova_template_1.html', data_dict)
             self.narratives['main_card'][
                 AnovaNarratives.KEY_PARAGRAPH].append(output)
             output1 = {'header': ''}
             output1['content'] = NarrativesUtils.get_template_output(
                 self._base_dir, 'anova_template_2.html', data_dict)
             lines = []
             lines += NarrativesUtils.block_splitter(
                 output['content'], self._blockSplitter)
             data_c3 = NormalChartData(data_c3)
             chart_data = data_c3.get_data()
             effect_size_values = [obj["effect_size"] for obj in chart_data]
             # Very small effect sizes are handed to the axis-format
             # selector as strings instead of numbers.
             if min(effect_size_values) < 0.00001:
                 chartDataValues = [str(v) for v in effect_size_values]
             else:
                 chartDataValues = list(effect_size_values)
             chart_json = ChartJson(data=chart_data,
                                    axes={
                                        'x': 'dimension',
                                        'y': 'effect_size'
                                    },
                                    label_text={
                                        'x': '',
                                        'y':
                                        'Effect Size (scaled exp values)'
                                    },
                                    chart_type='bar')
             chart_json.set_axis_rotation(True)
             chart_json.set_yaxis_number_format(
                 NarrativesUtils.select_y_axis_format(chartDataValues))
             # NOTE(review): first/last entries are reported as max/min,
             # which presumes chart_data keeps the descending order -
             # confirm against NormalChartData.get_data().
             statistical_info_array = [
                 ("Test Type", "ANOVA"),
                 ("Effect Size", "ETA squared"),
                 ("Max Effect Size", chart_data[0]["dimension"]),
                 ("Min Effect Size", chart_data[-1]["dimension"]),
             ]
             if len(chart_data) == 1:
                 statistical_inference = "{} is the only variable that have significant association with the {} (Target) having an \
                  Effect size of {}".format(
                     chart_data[0]["dimension"],
                     self._dataframe_context.get_result_column(),
                     round(chart_data[0]["effect_size"], 4))
             elif len(chart_data) == 2:
                 statistical_inference = "There are two variables ({} and {}) that have significant association with the {} (Target) and the \
                  Effect size ranges are {} and {} respectively".format(
                     chart_data[0]["dimension"], chart_data[1]["dimension"],
                     self._dataframe_context.get_result_column(),
                     round(chart_data[0]["effect_size"], 4),
                     round(chart_data[1]["effect_size"], 4))
             else:
                 statistical_inference = "There are {} variables that have significant association with the {} (Target) and the \
                  Effect size ranges from {} to {}".format(
                     len(chart_data),
                     self._dataframe_context.get_result_column(),
                     round(chart_data[0]["effect_size"], 4),
                     round(chart_data[-1]["effect_size"], 4))
             # Every branch above produces a non-empty sentence.
             statistical_info_array.append(
                 ("Inference", statistical_inference))
             statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                 statistical_info_array)
             lines += [
                 C3ChartData(data=chart_json, info=statistical_info_array)
             ]
             lines += NarrativesUtils.block_splitter(
                 output1['content'], self._blockSplitter)
             mainCard.set_card_data(lines)
             self._anovaNodes.add_a_card(mainCard)
             self.narratives['main_card'][
                 AnovaNarratives.KEY_PARAGRAPH].append(output1)
             self.narratives['main_card'][AnovaNarratives.KEY_CHART] = {}
             effect_size_chart = {
                 'heading': '',
                 'labels': {
                     'Dimension': 'Effect Size'
                 },
                 'data': significant_dimensions_dict
             }
             print(significant_dimensions_dict)
             self.narratives['main_card'][AnovaNarratives.KEY_CHART][
                 'effect_size'] = effect_size_chart
             progressMessage = CommonUtils.create_progress_message_object(
                 self._analysisName,
                 "custom",
                 "info",
                 "Analyzing Key Drivers",
                 self._completionStatus,
                 self._completionStatus,
                 display=True)
             CommonUtils.save_progress_message(self._messageURL,
                                               progressMessage,
                                               ignore=False)
             self._generate_dimension_narratives(significant_dimensions,
                                                 measure_anova_result,
                                                 measure_column)
         else:
             mainCard = NormalCard(name="Overview of Key Factors")
             cardText = HtmlData(
                 "There are no dimensions in the dataset that have significant influence on {}"
                 .format(measure_column))
             mainCard.set_card_data([cardText])
             self._anovaNodes.add_a_card(mainCard)
Beispiel #10
0
    def _generate_analysis_para2(self):
        """Return the histogram narrative and, for skewed data, fill card2.

        Finds the narrowest run of consecutive histogram buckets holding at
        least 75% of the dataframe's rows (the fullest run wins among equally
        narrow ones), combines it with quartile sums, means and frequencies
        into the template data and renders 'histogram_narrative.html'. The
        skew is pushed to the executive-summary data. When the absolute skew
        exceeds 0.1, ``self.card2`` is set to the 'descriptive_card2.html'
        blocks plus a cumulative observations-vs-total chart.

        The histogram must contain at least one bucket, since the first
        bucket is read unconditionally.
        """
        descr_stats = self._measure_descr_stats
        histogram_buckets = descr_stats.get_histogram()
        num_bins = len(histogram_buckets)
        num_rows = self._dataframe_helper.get_num_rows()
        threshold = num_rows * 0.75
        bucket_counts = [bucket['num_records'] for bucket in histogram_buckets]

        # Fallback when no run reaches the threshold: the whole histogram.
        # Using a valid last index keeps the percentage below at most 100.
        s = 0
        start, end = 0, num_bins - 1
        for bin_size in range(1, num_bins + 1):
            found = False
            for i in range(num_bins - bin_size + 1):
                s_t = sum(bucket_counts[i:i + bin_size])
                if s_t >= threshold and s_t > s:
                    s = s_t
                    start, end = i, i + bin_size - 1
                    found = True
            if found:
                break
        bin_size_75 = old_div((end - start + 1) * 100, num_bins)
        s = old_div(s * 100, num_rows)
        start_value = histogram_buckets[start]['start_value']
        end_value = histogram_buckets[end]['end_value']

        # Extremes among the first (up to) three buckets; slicing keeps this
        # safe for histograms with fewer than three buckets.
        leading_counts = bucket_counts[:3]
        lowest = min(leading_counts)
        highest = max(leading_counts)

        quartile_sums = self._five_point_summary_stats.get_sums()
        quartile_means = self._five_point_summary_stats.get_means()
        quartile_frequencies = self._five_point_summary_stats.get_frequencies()
        total = descr_stats.get_total()
        avg = descr_stats.get_mean()
        counts = descr_stats.get_num_values()
        skew = descr_stats.get_skew()

        def share_of_total(num_records):
            # Bucket record count as a percentage of the measure total.
            return NarrativesUtils.round_number(
                old_div(num_records * 100, total),
                MeasureColumnNarrative.MAX_FRACTION_DIGITS)

        data_dict = {"histogram" : histogram_buckets,
                    "per_cont_hist1" : share_of_total(bucket_counts[0]),
                    # None when the histogram has no second bucket.
                    "per_cont_hist2" : share_of_total(bucket_counts[1]) if num_bins > 1 else None,
                    "lowest_cont" : share_of_total(lowest),
                    "highest_cont" : share_of_total(highest),
                    "num_bins" : num_bins,
                    "seventy_five" : bin_size_75,
                    "col_name" : self._column_name,
                    "skew" : skew,
                    "three_quarter_percent" : round(s,2),
                    "start_value" : start_value,
                    "end_value" : end_value,
                    "measure_colname":self._column_name,
                    "q4_cont" : NarrativesUtils.round_number(old_div(quartile_frequencies['q4']*100.0,counts), 2),
                    "q1_cont" : NarrativesUtils.round_number(old_div(quartile_frequencies['q1']*100.0,counts), 2),
                    "q4_frac" : NarrativesUtils.round_number(old_div(quartile_sums['q4']*100.0,total), 2),
                    "q1_frac" : NarrativesUtils.round_number(old_div(quartile_sums['q1']*100.0,total), 2),
                    "q4_sum" : NarrativesUtils.round_number(quartile_sums['q4'], 2),
                    "q4_mean" : NarrativesUtils.round_number(quartile_means['q4'], 2),
                    "q1_sum" : NarrativesUtils.round_number(quartile_sums['q1'], 2),
                    "q4_overall_mean" : round(old_div(quartile_means['q4']*1.0,avg), 2),
                    "total" : NarrativesUtils.round_number(total,2),
                    "avg" : NarrativesUtils.round_number(avg,2),
                    "highlightFlag":self._highlightFlag,
                    "blockSplitter":self._blockSplitter
        }
        try:
            data_dict["q4_q1_mean"] = round(old_div(quartile_means['q4']*1.0,quartile_means['q1']), 1)
        except (ArithmeticError, TypeError, KeyError):
            # A zero, missing or non-numeric q1 mean leaves the ratio undefined.
            data_dict["q4_q1_mean"] = None

        self._result_setter.update_executive_summary_data({"skew":data_dict["skew"]})
        if abs(skew)>0.1:
            content = NarrativesUtils.get_template_output(self._base_dir,\
                                            'descriptive_card2.html',data_dict)
            blocks = NarrativesUtils.block_splitter(content,self._blockSplitter,highlightFlag=self._highlightFlag)
            self.card2 = {}
            self.card2['data'] = {
                                    'heading': 'Concentration of High & Low segments',
                                    'content': blocks
                                }
            quartiles = ['q1','q2','q3','q4']
            # Leading 0.0 makes both cumulative series start at the origin.
            observations = [0.0] + [old_div(quartile_frequencies[i]*100.0,counts) for i in quartiles]
            totals = [0.0] + [old_div(quartile_sums[i]*100.0,total) for i in quartiles]
            chart = {'x-label': '% of Observations',
                    'y-label': '% of Total '+self._column_name+' (Cumulative)',
                    'x': list(NarrativesUtils.accumu(observations)),
                    'y': list(NarrativesUtils.accumu(totals))}
            self.card2['chart'] = chart
        output = NarrativesUtils.get_template_output(self._base_dir,\
                                        'histogram_narrative.html',data_dict)
        return output
Beispiel #11
0
    def generate_narratives(self):
        """Assemble the regression narratives and attach them to the story tree.

        Renders the main "Key Influencers" card (template text plus a bar
        chart built from the coefficients in ``self._all_coeffs``), then
        builds one ``NarrativesTree`` node per entry of
        ``self.significant_measures``.  Each node holds an impact card and,
        when ``self._run_dimension_level_regression`` is set, an additional
        "Key Areas where it Matters" card.

        Side effects: fills ``self.narratives['main_card']``, appends one dict
        per measure to ``self.narratives['cards']``, saves a progress message
        for every measure, pushes the first measure's card-4 data (without its
        charts) into the executive summary, and finally adds
        ``self._regressionNode`` to ``self._story_narrative``.

        Returns nothing.
        """
        regression_narrative_obj = LinearRegressionNarrative(
            self._df_regression_result,
            self._correlations,
            self._dataframe_helper,
            self._dataframe_context,
            self._metaParser,
            self._spark
        )

        def heatmap_table_json(table):
            # Convert one card-2 table description into the JSON-ready heat map.
            table_rows = regression_narrative_obj.convert_table_data(table)
            heatmap = TableData()
            heatmap.set_table_data(table_rows)
            heatmap.set_table_type("heatMap")
            heatmap.set_table_top_header(table["heading"])
            return json.loads(CommonUtils.convert_python_object_to_json(heatmap))

        main_card_data = regression_narrative_obj.generate_main_card_data()
        main_card_narrative = NarrativesUtils.get_template_output(
            self._base_dir, 'regression_main_card.html', main_card_data)

        # Both the dict-based narrative and the card chart need these lists.
        coeff_names = [name for name, _ in self._all_coeffs]
        coeff_values = [stats['coefficient'] for _, stats in self._all_coeffs]

        self.narratives['main_card'] = {}
        self.narratives["main_card"]['paragraphs'] = NarrativesUtils.paragraph_splitter(main_card_narrative)
        self.narratives["main_card"]['header'] = 'Key Measures that affect ' + self.result_column
        self.narratives["main_card"]['chart'] = {
            'heading': '',
            'data': [coeff_names, coeff_values],
            'label': {'x': 'Measure Name',
                      'y': 'Change in ' + self.result_column + ' per unit increase'},
        }

        main_card = NormalCard()
        main_card_header = HtmlData(data='<h3>Key Measures that affect ' + self.result_column + "</h3>")
        main_card_paragraphs = NarrativesUtils.block_splitter(main_card_narrative, self._blockSplitter)
        main_card_chart_data = [{"key": name, "value": value}
                                for name, value in zip(coeff_names, coeff_values)]
        main_card_chart = NormalChartData(data=main_card_chart_data)
        mainCardChartJson = ChartJson()
        mainCardChartJson.set_data(main_card_chart.get_data())
        mainCardChartJson.set_label_text({'x': 'Influencing Factors',
                                          'y': 'Change in ' + self.result_column + ' per unit increase'})
        mainCardChartJson.set_chart_type("bar")
        mainCardChartJson.set_axes({"x": "key", "y": "value"})
        mainCardChartJson.set_yaxis_number_format(".2f")

        # Largest coefficient first, so [0] is the max effect and [-1] the min.
        chart_data = sorted(main_card_chart_data, key=lambda x: x["value"], reverse=True)
        statistical_info_array = [
            ("Test Type", "Regression"),
            ("Effect Size", "Coefficients"),
        ]
        # With no coefficients there is no max/min effect and nothing to infer;
        # indexing chart_data would raise IndexError.
        if chart_data:
            statistical_info_array.append(("Max Effect Size", chart_data[0]["key"]))
            statistical_info_array.append(("Min Effect Size", chart_data[-1]["key"]))
            target_name = self._dataframe_context.get_result_column()
            # NOTE: the run of spaces inside these messages is part of the
            # original text (it came from a backslash line continuation).
            if len(chart_data) == 1:
                statistical_inference = (
                    "{} is the only variable that have significant influence over {} (Target) having an "
                    "             Effect size of {}"
                ).format(chart_data[0]["key"], target_name, round(chart_data[0]["value"], 4))
            elif len(chart_data) == 2:
                statistical_inference = (
                    "There are two variables ({} and {}) that have significant influence over {} (Target) and the "
                    "             Effect size ranges are {} and {} respectively"
                ).format(chart_data[0]["key"], chart_data[1]["key"], target_name,
                         round(chart_data[0]["value"], 4), round(chart_data[1]["value"], 4))
            else:
                statistical_inference = (
                    "There are {} variables that have significant influence over {} (Target) and the "
                    "             Effect size ranges from {} to {}"
                ).format(len(chart_data), target_name,
                         round(chart_data[0]["value"], 4), round(chart_data[-1]["value"], 4))
            statistical_info_array.append(("Inference", statistical_inference))
        statistical_info_array = NarrativesUtils.statistical_info_array_formatter(statistical_info_array)
        main_card.set_card_data(data=[main_card_header] + main_card_paragraphs
                                + [C3ChartData(data=mainCardChartJson, info=statistical_info_array)])
        main_card.set_card_name("Key Influencers")
        self._regressionNode.add_a_card(main_card)

        for count, measure_column in enumerate(self.significant_measures):
            sigMeasureNode = NarrativesTree()
            sigMeasureNode.set_name(measure_column)
            measureCard1 = NormalCard()
            measureCard1.set_card_name("{}: Impact on {}".format(measure_column, self.result_column))
            if self._run_dimension_level_regression:
                measureCard2 = NormalCard()
                measureCard2.set_card_name("Key Areas where it Matters")
                measureCard2Data = []

            # Card 1: overall impact of this measure on the target.
            card1data = regression_narrative_obj.generate_card1_data(measure_column)
            card1heading = "<h3>Impact of " + measure_column + " on " + self.result_column + "</h3>"
            card1data.update({"blockSplitter": self._blockSplitter})
            card1narrative = NarrativesUtils.get_template_output(
                self._base_dir, 'regression_card1.html', card1data)
            card1paragraphs = NarrativesUtils.block_splitter(card1narrative, self._blockSplitter)

            card0 = {
                "paragraphs": card1paragraphs,
                "charts": {"chart2": {}, "chart1": {}},
                "heading": card1heading,
            }
            measure_column_cards = {'card0': card0}

            measureCard1Data = [HtmlData(data=card1heading)]
            measureCard1Data += card1paragraphs

            if self._run_dimension_level_regression:
                print("running narratives for key area dict")
                # NOTE(review): this call takes no per-measure argument yet is
                # repeated for every measure; it looks hoistable - confirm it
                # has no per-call side effects before moving it.
                self._dim_regression = self.run_regression_for_dimension_levels()
                card2table, card2data = regression_narrative_obj.generate_card2_data(
                    measure_column, self._dim_regression)
                card2data.update({"blockSplitter": self._blockSplitter})
                card2narrative = NarrativesUtils.get_template_output(
                    self._base_dir, 'regression_card2.html', card2data)
                card2paragraphs = NarrativesUtils.block_splitter(card2narrative, self._blockSplitter)

                measure_column_cards['card1'] = {
                    'tables': card2table,
                    'paragraphs': card2paragraphs,
                    'heading': 'Key Areas where ' + measure_column + ' matters',
                }

                measureCard2Data += card2paragraphs
                if "table1" in card2table:
                    # First heat map sits at position 3 of the card content.
                    measureCard2Data.insert(3, heatmap_table_json(card2table["table1"]))
                if "table2" in card2table:
                    # Second heat map closes the card.
                    measureCard2Data.append(heatmap_table_json(card2table["table2"]))

            progressMessage = CommonUtils.create_progress_message_object(
                self._analysisName, "custom", "info", "Analyzing Key Influencers",
                self._completionStatus, self._completionStatus, display=True)
            CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=False)

            # Card 4: sensitivity narrative, appended to the same impact card.
            card4data = regression_narrative_obj.generate_card4_data(self.result_column, measure_column)
            card4data.update({"blockSplitter": self._blockSplitter})
            card4narrative = NarrativesUtils.get_template_output(
                self._base_dir, 'regression_card4.html', card4data)
            card4paragraphs = NarrativesUtils.block_splitter(card4narrative, self._blockSplitter)
            # Built before the chart is inserted below, so the dict-based
            # narrative carries text blocks only.
            card0['paragraphs'] = card1paragraphs + card4paragraphs
            card4Chart = card4data["charts"]

            coeff = self._df_regression_result.get_coeff(measure_column)
            statistical_info_array = [
                ("Test Type", "Regression"),
                ("Coefficient", str(round(coeff, 2))),
                ("P-Value", "<= 0.05"),
                ("Intercept", str(round(self._df_regression_result.get_intercept(), 2))),
                ("R Square ", str(round(self._df_regression_result.get_rsquare(), 2))),
            ]
            if coeff > 0:
                inference_text = "For every additional unit of increase in {} there will be an increase of {} units in {} (target).".format(measure_column, str(round(coeff, 2)), self._dataframe_context.get_result_column())
            else:
                inference_text = "For every additional unit of decrease in {} there will be an decrease of {} units in {} (target).".format(measure_column, str(round(coeff, 2)), self._dataframe_context.get_result_column())
            statistical_info_array.append(("Inference", inference_text))
            statistical_info_array = NarrativesUtils.statistical_info_array_formatter(statistical_info_array)

            card4paragraphs.insert(2, C3ChartData(data=card4Chart, info=statistical_info_array))
            measureCard1Data += card4paragraphs

            self.narratives['cards'].append(measure_column_cards)

            if count == 0:
                # Only the first measure feeds the executive summary, and the
                # chart payload is stripped before it is handed over.
                card4data.pop("charts")
                self._result_setter.update_executive_summary_data(card4data)

            measureCard1.set_card_data(measureCard1Data)
            # Register each card exactly once: previously the single-card call
            # ran unconditionally, so measureCard1 was added a second time
            # whenever the dimension-level card was present.
            if self._run_dimension_level_regression:
                measureCard2.set_card_data(measureCard2Data)
                sigMeasureNode.add_cards([measureCard1, measureCard2])
            else:
                sigMeasureNode.add_cards([measureCard1])
            self._regressionNode.add_a_node(sigMeasureNode)

        self._story_narrative.add_a_node(self._regressionNode)
    def chisquare_trend(self,column_name,base_dir):
        """Build trend cards relating ``column_name`` to the two most frequent target levels.

        Does nothing unless ``self._date_columns`` is set and
        ``self._dateFormatDetected`` is truthy.  A numeric ``column_name`` is
        first bucketed into five ranges (this also replaces
        ``self._data_frame`` with a copy whose column is cast to double).
        For each of the two most frequent values of ``self._result_column``
        the method merges per-date counts with a date x level cross-tab,
        computes growth figures, renders ``chisquare_trend.html`` and
        collects a ``{"paragraphs", "chart"}`` card.

        Args:
            column_name: column analysed against the target.
            base_dir: unused here; templates are read from ``self._base_dir``.

        Returns:
            None.  The collected cards are only printed as JSON.
            NOTE(review): the cards are never returned or stored - confirm
            whether callers are expected to receive them.
        """
        if self._date_columns is None or not self._dateFormatDetected:
            return

        output = []
        date_column = self._date_column_suggested
        chisquare_column = column_name
        result_column = self._result_column
        num_digits = self._num_significant_digits
        # Looked up once; the same answer drives the relabelling inside the loop.
        is_numeric = chisquare_column in self._dataframe_helper.get_numeric_columns()

        if is_numeric:
            min_max = self._data_frame.select([FN.min(chisquare_column), FN.max(chisquare_column)]).collect()
            minval = min_max[0][0]
            maxval = min_max[0][1]
            step = (maxval - minval) / 5.0
            # Five equal-width bins; the outer edges are widened to whole numbers.
            splits = [math.floor(minval), minval + step, minval + (step * 2),
                      minval + (step * 3), minval + (step * 4), math.ceil(maxval)]
            bucketizer = Bucketizer(splits=splits,inputCol=chisquare_column,outputCol="BINNED_COL")
            self._data_frame = self._data_frame.withColumn(chisquare_column, self._data_frame[chisquare_column].cast(DoubleType()))
            bucketedData = bucketizer.transform(self._data_frame)
            # Swap the raw column for its bin index, keeping the original name.
            df = bucketedData.select([col for col in bucketedData.columns if col != chisquare_column])
            df = df.withColumnRenamed("BINNED_COL",chisquare_column)
            ranges = [str(low) + " to " + str(high) for low, high in zip(splits, splits[1:])]
            # Bin index -> human readable range label.
            bin_dict = dict(enumerate(ranges))
        else:
            df = self._data_frame

        df = df.select([date_column,chisquare_column,result_column]).toPandas()
        df["suggestedDate"] = df[date_column].apply(lambda x: datetime.strptime(x,self._existingDateFormat))
        df["year_month"] = df["suggestedDate"].apply(lambda x:x.strftime("%b-%y"))
        result_column_count = df[result_column].value_counts()
        top2levels = result_column_count[:2].index

        for level in top2levels:
            filtered_df = df.loc[df[result_column] == level]
            grouped_result = pd.DataFrame(filtered_df[date_column].value_counts()).reset_index()
            grouped_result.columns=[date_column,"value"]
            grouped_result["year_month"] = grouped_result[date_column].apply(lambda x: datetime.strptime(x,self._existingDateFormat).strftime("%b-%y"))
            crosstab_df = pd.DataFrame(pd.crosstab(filtered_df["suggestedDate"],filtered_df[chisquare_column])).reset_index()
            if is_numeric:
                # Replace bin indices in the header with their range labels.
                crosstab_columns = crosstab_df.columns
                chisquare_levels = [bin_dict[x] for x in crosstab_columns[1:]]
                crosstab_df.columns = [crosstab_columns[0]]+chisquare_levels
            else:
                chisquare_levels = crosstab_df.columns[1:]

            crosstab_df["year_month"] = crosstab_df["suggestedDate"].apply(lambda x:x.strftime("%b-%y"))
            final_df = pd.merge(grouped_result,crosstab_df, how='outer', on=['year_month'])
            final_df.sort_values(by="suggestedDate",ascending=True,inplace=True)
            final_df.reset_index(drop=True,inplace=True)
            final_df["overallPerChange"] = [0]+[round((x-y)*100/float(y),num_digits) for x,y in zip(final_df["value"].iloc[1:],final_df["value"])]

            # Per-level growth between the first and the last row, in percent.
            growth_dict = {}
            for val in chisquare_levels:
                first_val = final_df[val].iloc[0]
                last_val = final_df[val].iloc[-1]
                growth = round(((last_val-first_val)*100/float(first_val)),num_digits)
                if growth > 3 or first_val == 0:
                    growth_type = "positive"
                    print(growth)
                elif growth < -3:
                    growth_type = "negative"
                else:
                    growth_type = "stable"
                growth_dict[val] = {"growth": growth,
                                    "growthType": growth_type,
                                    "total": sum(final_df[val])}

            # Overall growth uses the same percentage formula as the levels.
            # The previous expression lacked parentheses and the *100, so it
            # evaluated to ``last - first/first`` (i.e. ``last - 1``).  It is
            # kept in its own variable so a level named "overall" cannot
            # overwrite it.
            first_total = final_df["value"].iloc[0]
            last_total = final_df["value"].iloc[-1]
            overall_growth = round(((last_total-first_total)*100/float(first_total)),num_digits)

            # The level with the largest total is the one charted and narrated.
            total_tuple = [(val, growth_dict[val]["total"]) for val in chisquare_levels]
            sorted_total_tuple = sorted(total_tuple,key=lambda x:x[1],reverse=True)
            top_dimension = sorted_total_tuple[0][0]
            final_df["topDimensionPerChange"] = [0]+[round((x-y)*100/float(y),num_digits) for x,y in zip(final_df[top_dimension].iloc[1:],final_df[top_dimension])]

            overall_peak_index = np.argmax(final_df["value"])
            overall_low_index = np.argmin(final_df["value"])
            top_dimension_peak_index = np.argmax(final_df[top_dimension])
            top_dimension_low_index = np.argmin(final_df[top_dimension])

            data_dict = {}
            data_dict["dimension"] = chisquare_column
            data_dict["correlation"] = final_df["value"].corr(final_df[top_dimension])
            data_dict["subset_increase_percent"] = growth_dict[top_dimension]["growth"]
            data_dict["overall_increase_percent"] = overall_growth
            data_dict["target"] = level
            data_dict["top_dimension"] = top_dimension
            data_dict["overallPeakValue"] = final_df["value"][overall_peak_index]
            data_dict["overallLowestValue"] = final_df["value"][overall_low_index]
            data_dict["overallPeakTime"] = final_df["year_month"][overall_peak_index]
            data_dict["overallLowestTime"] = final_df["year_month"][overall_low_index]
            data_dict["overallPeakIncrease"] = final_df["overallPerChange"][overall_peak_index]
            data_dict["topDimensionPeakValue"] = final_df[top_dimension][top_dimension_peak_index]
            data_dict["topDimensionLowestValue"] = final_df[top_dimension][top_dimension_low_index]
            data_dict["topDimensionPeakTime"] = final_df["year_month"][top_dimension_peak_index]
            data_dict["topDimensionLowestTime"] = final_df["year_month"][top_dimension_low_index]
            data_dict["topDimensionPeakIncrease"] = final_df["topDimensionPerChange"][top_dimension_peak_index]
            data_dict["overall_streak"] = NarrativesUtils.streak_data(
                final_df,overall_peak_index,overall_low_index,"overallPerChange","value")
            data_dict["top_dimension_streak"] = NarrativesUtils.streak_data(
                final_df,top_dimension_peak_index,top_dimension_low_index,"topDimensionPerChange",top_dimension)

            # Bucket every level by growth type for the template.
            for growth_type in ("positive", "negative", "stable"):
                data_dict["num_" + growth_type + "_growth_dimensions"] = 0
                data_dict[growth_type + "_growth_dimensions"] = []
                data_dict[growth_type + "_growth_values"] = []
            data_dict["overall_growth_rate"] = overall_growth
            data_dict["total_levels"] = len(chisquare_levels)
            for val in chisquare_levels:
                growth_type = growth_dict[val]["growthType"]
                data_dict["num_" + growth_type + "_growth_dimensions"] += 1
                data_dict[growth_type + "_growth_dimensions"].append(val)
                data_dict[growth_type + "_growth_values"].append(growth_dict[val]["growth"])

            summary1 = NarrativesUtils.get_template_output(self._base_dir,\
                                                            'chisquare_trend.html',data_dict)

            # Column-oriented chart payload: each inner list starts with its label.
            row_count = final_df.shape[0]
            chart_data = {"data":[],"header":[]}
            chart_data["header"] = ["time",result_column,top_dimension]
            chart_data["data"] = [
                ["time"] + [final_df["year_month"].iloc[idx] for idx in range(row_count)],
                [result_column] + [final_df["value"].iloc[idx] for idx in range(row_count)],
                [top_dimension] + [final_df[top_dimension].iloc[idx] for idx in range(row_count)],
            ]

            paragraphs = NarrativesUtils.paragraph_splitter(summary1)
            card_data = {"paragraphs":paragraphs,"chart":chart_data}
            output.append([card_data])
        print(json.dumps(output,indent=2))
    def _generate_summary(self):
        data_dict = {}
        rules_dict = self._table
        data_dict["blockSplitter"] = self._blockSplitter
        data_dict["targetcol"] = self._colname
        groups = rules_dict.keys()
        probabilityCutoff = 75
        probabilityGroups = [{
            "probability": probabilityCutoff,
            "count": 0,
            "range": [probabilityCutoff, 100]
        }, {
            "probability": probabilityCutoff - 1,
            "count": 0,
            "range": [0, probabilityCutoff - 1]
        }]
        tableArray = [[
            "Prediction Rule", "Probability", "Prediction", "Freq", "group",
            "richRules"
        ]]
        dropdownData = []
        chartDict = {}
        targetLevel = self._dataframe_context.get_target_level_for_model()
        probabilityArrayAll = []

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName,
            "custom",
            "info",
            "Generating Prediction rules",
            self._completionStatus,
            self._completionStatus,
            display=True)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=False)
        self._dataframe_context.update_completion_status(
            self._completionStatus)
        targetValues = [x for x in rules_dict.keys() if x == targetLevel
                        ] + [x for x in rules_dict.keys() if x != targetLevel]
        for idx, target in enumerate(targetValues):
            if idx == 0:
                if self._dataframe_context.get_story_on_scored_data() != True:
                    dropdownData.append({
                        "displayName": target,
                        "name": target,
                        "selected": True,
                        "id": idx + 1
                    })
                else:
                    dropdownData.append({
                        "displayName":
                        "{} : {}".format(self._colname, target),
                        "name":
                        target,
                        "selected":
                        True,
                        "id":
                        idx + 1
                    })
            else:
                if self._dataframe_context.get_story_on_scored_data() != True:
                    dropdownData.append({
                        "displayName": target,
                        "name": target,
                        "selected": False,
                        "id": idx + 1
                    })
                else:
                    dropdownData.append({
                        "displayName":
                        "{} : {}".format(self._colname, target),
                        "name":
                        target,
                        "selected":
                        False,
                        "id":
                        idx + 1
                    })
            rulesArray = rules_dict[target]
            probabilityArray = [
                round(x, 2) for x in self.success_percent[target]
            ]
            probabilityArrayAll += probabilityArray
            groupArray = [
                "strong" if x >= probabilityCutoff else "mixed"
                for x in probabilityArray
            ]
            for idx2, obj in enumerate(probabilityGroups):
                grpCount = len([
                    x for x in probabilityArray
                    if x >= obj["range"][0] and x <= obj["range"][1]
                ])
                obj["count"] += grpCount
                probabilityGroups[idx2] = obj
            predictionArray = [target] * len(rulesArray)
            freqArray = self.total_predictions[target]
            chartDict[target] = sum(freqArray)
            success = self.successful_predictions[target]
            success_percent = self.success_percent[target]
            richRulesArray = []
            crudeRuleArray = []
            analysisType = self._dataframe_context.get_analysis_type()
            targetCol = self._dataframe_context.get_result_column()
            binFlag = False
            if self._dataframe_context.get_custom_analysis_details() != None:
                binnedColObj = [
                    x["colName"] for x in
                    self._dataframe_context.get_custom_analysis_details()
                ]
                if binnedColObj != None and targetCol in binnedColObj:
                    binFlag = True
            for idx2, crudeRule in enumerate(rulesArray):
                richRule, crudeRule = NarrativesUtils.generate_rules(
                    self._colname,
                    target,
                    crudeRule,
                    freqArray[idx2],
                    success[idx2],
                    success_percent[idx2],
                    analysisType,
                    binFlag=binFlag)
                richRulesArray.append(richRule)
                crudeRuleArray.append(crudeRule)
            probabilityArray = map(
                lambda x: humanize.apnumber(x) + "%"
                if x >= 10 else str(int(x)) + "%", probabilityArray)
            # targetArray = zip(richRulesArray,probabilityArray,predictionArray,freqArray,groupArray)
            targetArray = zip(crudeRuleArray, probabilityArray,
                              predictionArray, freqArray, groupArray,
                              richRulesArray)
            targetArray = [list(x) for x in targetArray]
            tableArray += targetArray

        donutChartMaxLevel = 10
        if self._dataframe_context.get_story_on_scored_data() == True:
            chartDict = {}
            probabilityRangeForChart = GLOBALSETTINGS.PROBABILITY_RANGE_FOR_DONUT_CHART
            chartDict = dict(
                zip(probabilityRangeForChart.keys(),
                    [0] * len(probabilityRangeForChart)))
            for val in probabilityArrayAll:
                for grps, grpRange in probabilityRangeForChart.items():
                    if val > grpRange[0] and val <= grpRange[1]:
                        chartDict[grps] = chartDict[grps] + 1
            chartDict = {k: v for k, v in chartDict.items() if v != 0}
        else:
            chartDict = dict([(k, sum(v))
                              for k, v in self.total_predictions.items()])
            chartDict = {k: v for k, v in chartDict.items() if v != 0}
        if len(chartDict) > donutChartMaxLevel:
            chartDict = NarrativesUtils.restructure_donut_chart_data(
                chartDict, nLevels=donutChartMaxLevel)
        chartData = NormalChartData([chartDict]).get_data()
        chartJson = ChartJson(data=chartData)
        chartJson.set_title(self._colname)
        chartJson.set_chart_type("donut")
        mainCardChart = C3ChartData(data=chartJson)
        mainCardChart.set_width_percent(45)
        # mainCardChart = {"dataType": "c3Chart","widthPercent":33 ,"data": {"data": [chartDict],"title":self._colname,"axes":{},"label_text":{},"legend":{},"yAxisNumberFormat": ".2s","types":None,"axisRotation":False, "chart_type": "donut"}}

        dropdownDict = {
            "dataType": "dropdown",
            "label": "Showing prediction rules for",
            "data": dropdownData
        }

        data_dict["probabilityGroups"] = probabilityGroups
        if self._dataframe_context.get_story_on_scored_data() != True:
            maincardSummary = NarrativesUtils.get_template_output(self._base_dir,\
                                                        'decisiontreesummary.html',data_dict)
        else:
            predictedLevelcountArray = [(x[2], x[3]) for x in tableArray[1:]]
            predictedLevelCountDict = {}
            # predictedLevelcountDict = defaultdict(predictedLevelcountArray)
            for val in predictedLevelcountArray:
                predictedLevelCountDict.setdefault(val[0], []).append(val[1])

            levelCountDict = {}
            for k, v in predictedLevelCountDict.items():
                levelCountDict[k] = sum(v)
            # levelCountDict = self._metaParser.get_unique_level_dict(self._colname)
            total = float(
                sum([x for x in levelCountDict.values() if x != None]))
            levelCountTuple = [{
                "name": k,
                "count": v,
                "percentage": round(v * 100 / total, 2)
            } for k, v in levelCountDict.items() if v != None]
            percentageArray = [x["percentage"] for x in levelCountTuple]
            percentageArray = NarrativesUtils.ret_smart_round(percentageArray)
            levelCountTuple = [{
                "name": obj["name"],
                "count": obj["count"],
                "percentage": str(percentageArray[idx]) + "%"
            } for idx, obj in enumerate(levelCountTuple)]
            data_dict["nlevel"] = len(levelCountDict)
            print "levelCountTuple", levelCountTuple
            print "levelCountDict", levelCountDict
            if targetLevel in levelCountDict:
                data_dict["topLevel"] = [
                    x for x in levelCountTuple if x["name"] == targetLevel
                ][0]
                if len(levelCountTuple) > 1:
                    data_dict["secondLevel"] = max([
                        x for x in levelCountTuple if x["name"] != targetLevel
                    ],
                                                   key=lambda x: x["count"])
                else:
                    data_dict["secondLevel"] = None
            else:
                data_dict["topLevel"] = levelCountTuple[0]
                if len(levelCountTuple) > 1:
                    data_dict["secondLevel"] = levelCountTuple[1]
                else:
                    data_dict["secondLevel"] = None
            print data_dict
            maincardSummary = NarrativesUtils.get_template_output(
                self._base_dir, 'decisiontreescore.html', data_dict)
        main_card = NormalCard()
        main_card_data = []
        main_card_narrative = NarrativesUtils.block_splitter(
            maincardSummary, self._blockSplitter)
        main_card_data += main_card_narrative

        main_card_data.append(mainCardChart)
        main_card_data.append(dropdownDict)

        main_card_table = TableData()
        if self._dataframe_context.get_story_on_scored_data() == True:
            main_card_table.set_table_width(75)
        main_card_table.set_table_data(tableArray)
        main_card_table.set_table_type("popupDecisionTreeTable")
        main_card_data.append(main_card_table)
        uidTable = self._result_setter.get_unique_identifier_table()
        if uidTable != None:
            main_card_data.append(uidTable)
        else:
            main_card_table.set_table_width(100)
        main_card.set_card_data(main_card_data)
        main_card.set_card_name("Predicting Key Drivers of {}".format(
            self._colname))
        self._decisionTreeNode.add_a_card(main_card)
    def Predict(self):
        """Score ``self._data_frame`` with the stored model and publish results.

        Steps, in order:

        1. Report the "initialization" progress stage.
        2. Load the saved pipeline, transform the input frame and map the
           numeric ``prediction`` column back to class labels, stored under
           the result column name.
        3. Write the scored rows to ``<score path>/data.csv``. When a
           unique-identifier column and class probabilities are both
           available, register a table holding the five highest-probability
           rows of every predicted class.
        4. Report the "prediction" progress stage.
        5. If at least two classes were predicted, run the decision-tree
           analysis on the scored data. Otherwise build one summary card
           (narrative plus donut chart) and pass it to the result setter.

        Returns:
            None. Results are delivered through ``self._score_summary``,
            ``self._result_setter``, ``self._metaParser``, the dataframe
            context and the CSV file written to the score path.
        """
        self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight(
        )
        self._scriptStages = {
            "initialization": {
                "summary": "Initialized the Naive Bayes Scripts",
                "weight": 2
            },
            "prediction": {
                "summary": "Spark ML Naive Bayes Model Prediction Finished",
                "weight": 2
            },
            "frequency": {
                "summary": "descriptive analysis finished",
                "weight": 2
            },
            "chisquare": {
                "summary": "chi Square analysis finished",
                "weight": 4
            },
            "completion": {
                "summary": "all analysis finished",
                "weight": 4
            },
        }

        def report_stage(stage):
            # Advance the completion counter by the stage's share of the
            # script weight, then publish and persist the new status.
            self._completionStatus += self._scriptWeightDict[
                self._analysisName]["total"] * self._scriptStages[stage][
                    "weight"] / 10
            progressMessage = CommonUtils.create_progress_message_object(
                self._analysisName, stage, "info",
                self._scriptStages[stage]["summary"], self._completionStatus,
                self._completionStatus)
            CommonUtils.save_progress_message(self._messageURL,
                                              progressMessage)
            self._dataframe_context.update_completion_status(
                self._completionStatus)

        def format_probability(probability):
            # Render a 0..1 probability as a whole-number percentage string.
            percent = probability * 100
            if percent >= 10:
                return humanize.apnumber(percent) + "%"
            return str(int(percent)) + "%"

        report_stage("initialization")

        SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                            sparkSession=self._spark)
        level_counts_train = self._dataframe_context.get_level_count_dict()
        categorical_columns = self._dataframe_helper.get_string_columns()
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        time_dimension_columns = self._dataframe_helper.get_timestamp_columns()
        result_column = self._dataframe_context.get_result_column()
        categorical_columns = [
            x for x in categorical_columns if x != result_column
        ]

        level_counts_score = CommonUtils.get_level_count_dict(
            self._data_frame,
            categorical_columns,
            self._dataframe_context.get_column_separator(),
            output_type="dict",
            dataType="spark")
        # True only when every training column is present in the scoring data
        # with identical level counts.
        # NOTE(review): this flag is computed but never consumed inside this
        # method; confirm whether a caller was meant to act on it.
        dataSanity = all(
            key in level_counts_score
            and level_counts_train[key] == level_counts_score[key]
            for key in level_counts_train)

        score_data_path = self._dataframe_context.get_score_path(
        ) + "/data.csv"
        # The pipeline lives in a sibling directory of the configured model
        # path: <parent>/<slug>/<model selected for scoring>.
        trained_model_path = self._dataframe_context.get_model_path()
        trained_model_path = "/".join(
            trained_model_path.split("/")[:-1]
        ) + "/" + self._slug + "/" + self._dataframe_context.get_model_for_scoring(
        )

        pipelineModel = MLUtils.load_pipeline(trained_model_path)

        df = self._data_frame
        transformed = pipelineModel.transform(df)
        label_indexer_dict = MLUtils.read_string_indexer_mapping(
            trained_model_path, SQLctx)
        prediction_to_levels = udf(lambda x: label_indexer_dict[x],
                                   StringType())
        transformed = transformed.withColumn(
            result_column, prediction_to_levels(transformed.prediction))

        base_columns = (categorical_columns + time_dimension_columns +
                        numerical_columns + [result_column])
        self._score_summary["result_column"] = result_column
        if "probability" in transformed.columns:
            probability_dataframe = transformed.select(
                [result_column, "probability"]).toPandas()
            probability_dataframe = probability_dataframe.rename(
                index=str, columns={result_column: "predicted_class"})
            # The winning class probability is the largest vector entry.
            probability_dataframe[
                "predicted_probability"] = probability_dataframe[
                    "probability"].apply(lambda x: max(x))
            self._score_summary[
                "prediction_split"] = MLUtils.calculate_scored_probability_stats(
                    probability_dataframe)
            scored_dataframe = transformed.select(
                base_columns + ["probability"]).toPandas()
            scored_dataframe['predicted_probability'] = probability_dataframe[
                "predicted_probability"].values
        else:
            self._score_summary["prediction_split"] = []
            scored_dataframe = transformed.select(base_columns).toPandas()

        labelMappingDict = self._dataframe_context.get_label_map()
        # Strip only a real "file://" scheme; a bare "file" prefix may be part
        # of an ordinary path and must not be truncated.
        if score_data_path.startswith("file://"):
            score_data_path = score_data_path[len("file://"):]
        scored_dataframe.to_csv(score_data_path, header=True, index=False)

        uidCol = self._dataframe_context.get_uid_column()
        if uidCol is None:
            uidCols = self._metaParser.get_suggested_uid_columns()
            if len(uidCols) > 0:
                uidCol = uidCols[0]
        predictedClasses = list(scored_dataframe[result_column].unique())
        # The table needs the identifier and the probability column in the
        # scored frame; models without a "probability" output do not provide
        # the latter, so the table is skipped instead of raising KeyError.
        canBuildUidTable = (uidCol and uidCol in df.columns
                            and uidCol in scored_dataframe.columns and
                            "predicted_probability" in scored_dataframe.columns)
        if canBuildUidTable:
            topRowsPerLevel = []
            for level in predictedClasses:
                levelDf = scored_dataframe.loc[
                    scored_dataframe[result_column] == level,
                    [uidCol, "predicted_probability", result_column]]
                # Work on an explicit copy of the five best rows so that the
                # formatting below never writes into a slice of the scored
                # frame.
                levelDf = levelDf.sort_values(by="predicted_probability",
                                              ascending=False).head(5).copy()
                levelDf["predicted_probability"] = levelDf[
                    "predicted_probability"].apply(format_probability)
                topRowsPerLevel.append(levelDf)
            uidTableData = pd.concat(topRowsPerLevel)
            uidTableData = [list(row) for row in uidTableData.values]
            uidTableData = [[uidCol, "Probability", result_column]
                            ] + uidTableData
            uidTable = TableData()
            uidTable.set_table_width(25)
            uidTable.set_table_data(uidTableData)
            uidTable.set_table_type("normalHideColumn")
            self._result_setter.set_unique_identifier_table(
                json.loads(
                    CommonUtils.convert_python_object_to_json(uidTable)))

        report_stage("prediction")

        print("STARTING DIMENSION ANALYSIS ...")
        columns_to_keep = self._dataframe_context.get_score_consider_columns()

        if len(columns_to_keep) > 0:
            columns_to_drop = list(set(df.columns) - set(columns_to_keep))
        else:
            columns_to_drop = ["predicted_probability"]

        scored_df = transformed.select(base_columns)
        # The result column must survive: it is grouped on right below and is
        # the target of the downstream analysis.
        columns_to_drop = [
            x for x in columns_to_drop
            if x in scored_df.columns and x != result_column
        ]
        modified_df = scored_df.select(
            [x for x in scored_df.columns if x not in columns_to_drop])
        resultColLevelCount = dict(
            modified_df.groupby(result_column).count().collect())
        self._metaParser.update_column_dict(
            result_column, {
                "LevelCount": resultColLevelCount,
                "numberOfUniqueValues": len(resultColLevelCount.keys())
            })
        self._dataframe_context.set_story_on_scored_data(True)

        self._dataframe_context.update_consider_columns(columns_to_keep)
        df_helper = DataFrameHelper(modified_df, self._dataframe_context,
                                    self._metaParser)
        df_helper.set_params()
        spark_scored_df = df_helper.get_data_frame()

        if len(predictedClasses) >= 2:
            # Best effort: a failure here is reported but does not abort the
            # scoring run.
            try:
                df_decision_tree_obj = DecisionTrees(
                    spark_scored_df,
                    df_helper,
                    self._dataframe_context,
                    self._spark,
                    self._metaParser,
                    scriptWeight=self._scriptWeightDict,
                    analysisName=self._analysisName).test_all(
                        dimension_columns=[result_column])
                narratives_obj = CommonUtils.as_dict(
                    DecisionTreeNarrative(result_column,
                                          df_decision_tree_obj,
                                          self._dataframe_helper,
                                          self._dataframe_context,
                                          self._metaParser,
                                          self._result_setter,
                                          story_narrative=None,
                                          analysisName=self._analysisName,
                                          scriptWeight=self._scriptWeightDict))
                print(narratives_obj)
            except Exception as e:
                print("DecisionTree Analysis Failed ", str(e))
        else:
            data_dict = {
                "npred": len(predictedClasses),
                "nactual": len(labelMappingDict.values())
            }

            if data_dict["nactual"] > 2:
                # Start from an empty mapping; writing into an unbound name
                # here raised UnboundLocalError.
                levelCountDict = {}
                levelCountDict[predictedClasses[0]] = resultColLevelCount[
                    predictedClasses[0]]
                levelCountDict["Others"] = sum([
                    v for k, v in resultColLevelCount.items()
                    if k != predictedClasses[0]
                ])
            else:
                # NOTE(review): this aliases resultColLevelCount, so the zero
                # entry added below is also visible through the dict handed
                # to update_column_dict above; confirm that is intended.
                levelCountDict = resultColLevelCount
                missingClasses = list(
                    set(labelMappingDict.values()) - set(predictedClasses))
                # Add the class that was never predicted with a zero count,
                # when the label map actually contains one.
                if missingClasses:
                    levelCountDict[missingClasses[0]] = 0

                print(levelCountDict)

            total = float(
                sum([x for x in levelCountDict.values() if x is not None]))
            levelCountTuple = [({
                "name":
                k,
                "count":
                v,
                "percentage":
                humanize.apnumber(v * 100 / total) + "%"
            }) for k, v in levelCountDict.items() if v is not None]
            levelCountTuple = sorted(levelCountTuple,
                                     key=lambda x: x["count"],
                                     reverse=True)
            data_dict["blockSplitter"] = "|~NEWBLOCK~|"
            data_dict["targetcol"] = result_column
            data_dict["nlevel"] = len(levelCountDict)
            data_dict["topLevel"] = levelCountTuple[0]
            # A single level leaves no runner-up to report.
            if len(levelCountTuple) > 1:
                data_dict["secondLevel"] = levelCountTuple[1]
            else:
                data_dict["secondLevel"] = None
            maincardSummary = NarrativesUtils.get_template_output(
                "/apps/", 'scorewithoutdtree.html', data_dict)

            main_card = NormalCard()
            main_card_data = []
            main_card_narrative = NarrativesUtils.block_splitter(
                maincardSummary, "|~NEWBLOCK~|")
            main_card_data += main_card_narrative

            chartData = NormalChartData([levelCountDict]).get_data()
            chartJson = ChartJson(data=chartData)
            chartJson.set_title(result_column)
            chartJson.set_chart_type("donut")
            mainCardChart = C3ChartData(data=chartJson)
            mainCardChart.set_width_percent(33)
            main_card_data.append(mainCardChart)

            uidTable = self._result_setter.get_unique_identifier_table()
            if uidTable is not None:
                main_card_data.append(uidTable)
            main_card.set_card_data(main_card_data)
            main_card.set_card_name(
                "Predicting Key Drivers of {}".format(result_column))
            self._result_setter.set_score_dtree_cards([main_card], {})
Beispiel #15
0
    def __init__(self, df_helper, df_context, result_setter, spark,
                 story_narrative, meta_parser):
        self._story_narrative = story_narrative
        self._result_setter = result_setter
        self._spark = spark
        self._dataframe_helper = df_helper
        self._dataframe_context = df_context
        self._pandas_flag = df_context._pandas_flag
        self._data_frame = df_helper.get_data_frame()
        self._num_significant_digits = NarrativesUtils.get_significant_digit_settings(
            "trend")
        self._metaParser = meta_parser

        self._result_column = self._dataframe_context.get_result_column()
        self._string_columns = self._dataframe_helper.get_string_columns()
        self._timestamp_columns = self._dataframe_helper.get_timestamp_columns(
        )

        # self._selected_date_columns = None
        self._selected_date_columns = self._dataframe_context.get_selected_date_columns(
        )
        self._all_date_columns = self._dataframe_context.get_date_columns()
        self._string_columns = list(
            set(self._string_columns) - set(self._all_date_columns))

        self._dateFormatDetected = False
        self._existingDateFormat = None
        self._dateFormatConversionDict = NarrativesUtils.date_formats_mapping_dict(
        )
        self._dateColumnFormatDict = df_context.get_date_format_dict()
        if self._dataframe_context.get_requested_date_format() != None:
            self._requestedDateFormat = df_context.get_requested_date_format()
        else:
            self._requestedDateFormat = None

        self._analysistype = self._dataframe_context.get_analysis_type()
        self._trendSettings = self._dataframe_context.get_trend_settings()
        self._trendSpecificMeasure = False
        if self._trendSettings != None:
            if self._analysistype == "dimension" and self._trendSettings[
                    "name"] != "Count":
                self._trendSpecificMeasure = True
                self._analysistype = "measure"
                self._result_column = self._trendSettings["selectedMeasure"]
            elif self._analysistype == "measure" and self._trendSettings[
                    "name"] != "Count":
                self._result_column = self._trendSettings["selectedMeasure"]

        self._trend_subsection = self._result_setter.get_trend_section_name()
        self._regression_trend_card = None
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        self._highlightFlag = "|~HIGHLIGHT~|"
        self._trend_on_td_column = False
        self._number_of_dimensions_to_consider = 10

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        self._analysisName = self._dataframe_context.get_analysis_name()
        self._messageURL = self._dataframe_context.get_message_url()
        if self._analysistype == "dimension":
            self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight(
            )
            self._scriptStages = {
                "initialization": {
                    "summary": "Initialized The Frequency Narratives",
                    "weight": 0
                },
                "summarygeneration": {
                    "summary": "Summary Generation Finished",
                    "weight": 4
                },
                "completion": {
                    "summary": "Frequency Stats Narratives Done",
                    "weight": 0
                },
            }
        elif self._analysistype == "measure":
            if self._trendSpecificMeasure:
                self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight(
                )
            else:
                self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight(
                )
            self._scriptStages = {
                "trendNarrativeStart": {
                    "summary": "Started The Descriptive Stats Narratives",
                    "weight": 1
                },
                "trendNarrativeEnd": {
                    "summary": "Narratives For Descriptive Stats Finished",
                    "weight": 0
                },
            }

        self._base_dir = "/trend/"
        if self._pandas_flag and self._selected_date_columns and not self._dateColumnFormatDict and not self._timestamp_columns:
            for column in self._selected_date_columns:
                uniqueVals = self._data_frame[column].astype(
                    str).unique().tolist()
                metaHelperInstance = MetaDataHelper(self._data_frame,
                                                    self._data_frame.shape[0])
                if len(uniqueVals
                       ) > 0 and metaHelperInstance.get_datetime_format_pandas(
                           [
                               self._data_frame.sort_values(
                                   by=column, ascending=False)[column][0]
                           ]) != None:
                    dateColumnFormat = metaHelperInstance.get_datetime_format_pandas(
                        uniqueVals)
                    self._dateColumnFormatDict.update(
                        {column: dateColumnFormat})
        dateColCheck = NarrativesUtils.check_date_column_formats(self._selected_date_columns,\
                                                    self._timestamp_columns,\
                                                    self._dateColumnFormatDict,\
                                                    self._dateFormatConversionDict,
                                                    self._requestedDateFormat)
        print(dateColCheck)

        self._dateFormatDetected = dateColCheck["dateFormatDetected"]
        self._trend_on_td_column = dateColCheck["trendOnTdCol"]
        if self._dateFormatDetected:
            self._requestedDateFormat = dateColCheck["requestedDateFormat"]
            self._existingDateFormat = dateColCheck["existingDateFormat"]
            # self._date_column_suggested is the column used for trend
            self._date_column_suggested = dateColCheck["suggestedDateColumn"]
        if self._existingDateFormat:
            self._data_frame, dataRangeStats = NarrativesUtils.calculate_data_range_stats(
                self._data_frame, self._existingDateFormat,
                self._date_column_suggested, self._trend_on_td_column,
                self._pandas_flag)
            print(dataRangeStats)
            self._durationString = dataRangeStats["durationString"]
            self._duration = dataRangeStats["duration"]
            self._dataLevel = dataRangeStats["dataLevel"]
            first_date = dataRangeStats["firstDate"]
            last_date = dataRangeStats["lastDate"]

            if self._timestamp_columns != None:
                if self._selected_date_columns == None:
                    self._selected_date_columns = self._timestamp_columns
                else:
                    self._selected_date_columns += self._timestamp_columns
        if self._pandas_flag:
            pass
        else:
            if self._trend_subsection == "regression":
                if self._selected_date_columns != None:
                    if self._dateFormatDetected:
                        trend_subsection_data = self._result_setter.get_trend_section_data(
                        )
                        measure_column = trend_subsection_data[
                            "measure_column"]
                        result_column = trend_subsection_data["result_column"]
                        base_dir = trend_subsection_data["base_dir"]

                        card3heading = 'How ' + result_column + ' and ' + measure_column + ' changed over time'
                        if self._dataLevel == "day":
                            grouped_data = self._data_frame.groupBy(
                                "suggestedDate").agg({
                                    measure_column: 'sum',
                                    result_column: 'sum'
                                })
                            grouped_data = grouped_data.withColumnRenamed(
                                grouped_data.columns[-1], result_column)
                            grouped_data = grouped_data.withColumnRenamed(
                                grouped_data.columns[-2], measure_column)
                            grouped_data = grouped_data.withColumn(
                                "year_month",
                                udf(lambda x: x.strftime("%b-%y"))(
                                    "suggestedDate"))
                            grouped_data = grouped_data.orderBy(
                                "suggestedDate", ascending=True)
                            grouped_data = grouped_data.withColumnRenamed(
                                grouped_data.columns[0], "key")
                            grouped_data = grouped_data.toPandas()
                        elif self._dataLevel == "month":
                            grouped_data = self._data_frame.groupBy(
                                "year_month").agg({
                                    measure_column: 'sum',
                                    result_column: 'sum'
                                })
                            grouped_data = grouped_data.withColumnRenamed(
                                grouped_data.columns[-1], result_column)
                            grouped_data = grouped_data.withColumnRenamed(
                                grouped_data.columns[-2], measure_column)
                            grouped_data = grouped_data.withColumn(
                                "suggestedDate",
                                udf(lambda x: datetime.strptime(x, "%b-%y"))(
                                    "year_month"))
                            grouped_data = grouped_data.orderBy(
                                "suggestedDate", ascending=True)
                            grouped_data = grouped_data.withColumnRenamed(
                                "suggestedDate", "key")
                            grouped_data = grouped_data.select([
                                "key", measure_column, result_column,
                                "year_month"
                            ]).toPandas()
                            grouped_data["key"] = grouped_data[
                                "year_month"].apply(
                                    lambda x: datetime.strptime(x, "%b-%y"
                                                                ).date())

                        trend_narrative_obj = TrendNarrative(
                            self._result_column, self._date_column_suggested,
                            grouped_data, self._existingDateFormat,
                            self._requestedDateFormat, self._base_dir,
                            self._metaParser)

                        card3data = trend_narrative_obj.generate_regression_trend_data(
                            grouped_data, measure_column, result_column,
                            self._dataLevel, self._durationString)

                        card3narrative = NarrativesUtils.get_template_output(base_dir,\
                                                                        'regression_card3.html',card3data)

                        card3chart = trend_narrative_obj.generate_regression_trend_chart(
                            grouped_data, self._dataLevel)
                        card3paragraphs = NarrativesUtils.paragraph_splitter(
                            card3narrative)
                        card2 = {
                            'charts': card3chart,
                            'paragraphs': card3paragraphs,
                            'heading': card3heading
                        }
                        self.set_regression_trend_card_data(card2)
                    else:
                        print("NO DATE FORMAT DETECTED")
                else:
                    print("NO DATE COLUMNS PRESENT")

        if self._analysistype == "measure":
            self._completionStatus += old_div(
                self._scriptWeightDict[self._analysisName]["total"] *
                self._scriptStages["trendNarrativeStart"]["weight"], 10)
            progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                        "trendNarrativeStart",\
                                        "info",\
                                        self._scriptStages["trendNarrativeStart"]["summary"],\
                                        self._completionStatus,\
                                        self._completionStatus)
            CommonUtils.save_progress_message(self._messageURL,
                                              progressMessage)
            self._dataframe_context.update_completion_status(
                self._completionStatus)
            # self._startMeasureTrend = self._result_setter.get_trend_section_completion_status()
            self._startMeasureTrend = True

            if self._startMeasureTrend == True:
                self.narratives = {
                    "SectionHeading": "",
                    "card1": {},
                    "card2": {},
                    "card3": {}
                }
                if self._selected_date_columns != None:
                    if self._dateFormatDetected:
                        grouped_data = NarrativesUtils.get_grouped_data_for_trend(
                            self._data_frame, self._dataLevel,
                            self._result_column, self._analysistype,
                            self._pandas_flag)
                        if self._pandas_flag:
                            self._data_frame = self._data_frame.drop(
                                self._date_column_suggested, axis=1)
                        else:
                            self._data_frame = self._data_frame.drop(
                                self._date_column_suggested)
                        # self._data_frame = self._data_frame.withColumnRenamed("year_month", self._date_column_suggested)

                        significant_dimensions = []
                        significant_dimension_dict = df_helper.get_significant_dimension(
                        )
                        if significant_dimension_dict != {} and significant_dimension_dict != None:
                            significant_dimension_tuple = tuple(
                                significant_dimension_dict.items())
                            significant_dimension_tuple = sorted(
                                significant_dimension_tuple,
                                key=lambda x: x[1],
                                reverse=True)
                            significant_dimensions = [
                                x[0] for x in
                                significant_dimension_tuple[:self.
                                                            _number_of_dimensions_to_consider]
                            ]
                        else:
                            significant_dimensions = self._string_columns[:self
                                                                          .
                                                                          _number_of_dimensions_to_consider]
                        print("significant_dimensions", significant_dimensions)
                        trend_narrative_obj = TrendNarrative(
                            self._result_column, self._date_column_suggested,
                            grouped_data, self._existingDateFormat,
                            self._requestedDateFormat, self._base_dir,
                            self._metaParser)
                        # grouped_data.to_csv("/home/gulshan/marlabs/datasets/trend_grouped_pandas.csv",index=False)
                        dataDict = trend_narrative_obj.generateDataDict(
                            grouped_data, self._dataLevel,
                            self._durationString)
                        # # update reference time with max value
                        reference_time = dataDict["reference_time"]
                        dataDict["duration"] = self._duration
                        dataDict["dataLevel"] = self._dataLevel
                        dataDict["durationString"] = self._durationString
                        dataDict[
                            "significant_dimensions"] = significant_dimensions
                        if len(significant_dimensions) > 0:
                            if self._dataLevel == "day":
                                datetimeformat = self._existingDateFormat
                            elif self._dataLevel == "month":
                                datetimeformat = "%b-%y"
                            # xtraData = trend_narrative_obj.get_xtra_calculations(self._data_frame,grouped_data,significant_dimensions,self._date_column_suggested,self._result_column,self._existingDateFormat,reference_time,self._dataLevel, self._pandas_flag)
                            xtraData = trend_narrative_obj.get_xtra_calculations(
                                self._data_frame, grouped_data,
                                significant_dimensions,
                                self._date_column_suggested,
                                self._result_column, datetimeformat,
                                reference_time, self._dataLevel,
                                self._pandas_flag)
                            if xtraData != None:
                                dataDict.update(xtraData)
                        # print 'Trend dataDict:  %s' %(json.dumps(dataDict, indent=2))
                        self._result_setter.update_executive_summary_data(
                            dataDict)
                        dataDict.update({
                            "blockSplitter": self._blockSplitter,
                            "highlightFlag": self._highlightFlag
                        })

                        summary1 = NarrativesUtils.get_template_output(self._base_dir,\
                                                                        'measure_trend_card1.html',dataDict)
                        summary2 = NarrativesUtils.get_template_output(self._base_dir,\
                                                                        'measure_trend_card2.html',dataDict)
                        measureTrendCard = NormalCard()
                        measureTrendcard1Data = NarrativesUtils.block_splitter(
                            summary1,
                            self._blockSplitter,
                            highlightFlag=self._highlightFlag)
                        measureTrendcard2Data = NarrativesUtils.block_splitter(
                            summary2, self._blockSplitter)
                        # print measureTrendcard1Data

                        bubbledata = dataDict["bubbleData"]
                        # print bubbledata
                        card1BubbleData = "<div class='col-md-6 col-xs-12 xs-p-20'><h2 class='text-center'><span>{}</span><br/><small>{}</small></h2></div><div class='col-md-6 col-xs-12 xs-p-20'><h2 class='text-center'><span>{}</span><br/><small>{}</small></h2></div>".format(
                            bubbledata[0]["value"], bubbledata[0]["text"],
                            bubbledata[1]["value"], bubbledata[1]["text"])
                        # print card1BubbleData

                        trend_chart_data = list(
                            grouped_data[["key",
                                          "value"]].T.to_dict().values())
                        trend_chart_data = sorted(trend_chart_data,
                                                  key=lambda x: x["key"])
                        card1chartdata = {"actual": [], "predicted": []}

                        if self._dataLevel == "day":
                            card1chartdata["actual"] = [{
                                "key": str(val["key"]),
                                "value": val["value"]
                            } for val in trend_chart_data]
                        elif self._dataLevel == "month":
                            card1chartdata["actual"] = [{
                                "key":
                                val["key"].strftime("%b-%y"),
                                "value":
                                val["value"]
                            } for val in trend_chart_data]

                        if self._duration < 365:
                            prediction_window = 3
                        else:
                            prediction_window = 6
                        predicted_values = trend_narrative_obj.get_forecast_values(
                            grouped_data["value"],
                            prediction_window)[len(grouped_data["value"]):]
                        predicted_values = [
                            round(x, self._num_significant_digits)
                            for x in predicted_values
                        ]

                        forecasted_data = []
                        forecasted_data.append(card1chartdata["actual"][-1])
                        forecasted_dates = []
                        # forecast_start_time = datetime.strptime(card1chartdata["actual"][-1]["key"],"%b-%y")
                        if self._dataLevel == "month":
                            forecast_start_time = datetime.strptime(
                                card1chartdata["actual"][-1]["key"], "%b-%y")
                        elif self._dataLevel == "day":
                            try:
                                forecast_start_time = datetime.strptime(
                                    card1chartdata["actual"][-1]["key"],
                                    "%Y-%m-%d")
                            except:
                                forecast_start_time = datetime.strptime(
                                    card1chartdata["actual"][-1]["key"],
                                    '%Y-%m-%d %H:%M:%S')
                        for val in range(prediction_window):
                            if self._dataLevel == "month":
                                key = forecast_start_time + relativedelta(
                                    months=1 + val)
                                forecasted_dates.append(key)
                            elif self._dataLevel == "day":
                                key = forecast_start_time + relativedelta(
                                    days=1 + val)
                                forecasted_dates.append(key)
                        forecasted_list = list(
                            zip(forecasted_dates, predicted_values))
                        if self._dataLevel == "month":
                            forecasted_list = [{
                                "key": val[0].strftime("%b-%y"),
                                "value": val[1]
                            } for val in forecasted_list]
                        elif self._dataLevel == "day":
                            forecasted_list = [{
                                "key":
                                val[0].strftime("%Y-%m-%d"),
                                "value":
                                val[1]
                            } for val in forecasted_list]
                        forecasted_data += forecasted_list
                        card1chartdata["predicted"] = forecasted_data
                        # print json.dumps(card1chartdata,indent=2)
                        card1chartdata = ScatterChartData(data=card1chartdata)
                        chartJson = ChartJson()
                        chartJson.set_data(card1chartdata.get_data())
                        chartJson.set_label_text({
                            'x': ' ',
                            'y': 'No. of Observations'
                        })
                        chartJson.set_legend({
                            "actual": "Observed",
                            "predicted": "Forecast"
                        })
                        chartJson.set_chart_type("scatter_line")
                        chartJson.set_axes({"x": "key", "y": "value"})
                        chartJson.set_yaxis_number_format(".2f")
                        st_info = [
                            "Trend Analysis",
                            "Forecast Method : Holt Winters Method"
                        ]
                        measureTrendcard1Data.insert(
                            1, C3ChartData(data=chartJson, info=st_info))
                        measureTrendcard1Data.append(
                            HtmlData(data=card1BubbleData))
                        cardData = measureTrendcard1Data + measureTrendcard2Data
                        measureTrendCard.set_card_data(cardData)
                        measureTrendCard.set_card_name("Trend Analysis")
                        trendStoryNode = NarrativesTree(
                            "Trend", None, [], [measureTrendCard])
                        self._story_narrative.add_a_node(trendStoryNode)
                        self._result_setter.set_trend_node(trendStoryNode)

                        # prediction_data = [{"key":x["key"],"value":x["value"]} for x in trend_chart_data]
                        # last_val = prediction_data[-1]
                        # last_val.update({"predicted_value":last_val["value"]})
                        # prediction_data[-1] = last_val
                        #
                        # for val in range(prediction_window):
                        #     dataLevel = dataDict["dataLevel"]
                        #     if self._dataLevel == "month":
                        #         last_key = prediction_data[-1]["key"]
                        #         key = last_key+relativedelta(months=1)
                        #         prediction_data.append({"key":key,"predicted_value":predicted_values[val]})
                        #         forecasted_data.append({"key":key,"value":predicted_values[val]})
                        #     elif self._dataLevel == "day":
                        #         last_key = prediction_data[-1]["key"]
                        #         key = last_key+relativedelta(days=1)
                        #         prediction_data.append({"key":key,"predicted_value":predicted_values[val]})
                        # prediction_data_copy = prediction_data
                        # prediction_data = []
                        # for val in prediction_data_copy:
                        #     val["key"] = val["key"].strftime("%b-%y")
                        #     prediction_data.append(val)

                        # forecastDataDict = {"startForecast":predicted_values[0],
                        #                     "endForecast":predicted_values[prediction_window-1],
                        #                     "measure":dataDict["measure"],
                        #                     "forecast":True,
                        #                     "forecast_percentage": round((predicted_values[prediction_window-1]-predicted_values[0])/predicted_values[0],self._num_significant_digits),
                        #                     "prediction_window_text": str(prediction_window) + " months"
                        #                     }
                        #
                        # self._result_setter.update_executive_summary_data(forecastDataDict)
                        # summary3 = NarrativesUtils.get_template_output(self._base_dir,\
                        # 'trend_narrative_card3.html',forecastDataDict)
                        self._completionStatus += old_div(
                            self._scriptWeightDict[self._analysisName]["total"]
                            *
                            self._scriptStages["trendNarrativeEnd"]["weight"],
                            10)
                        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                                    "trendNarrativeEnd",\
                                                    "info",\
                                                    self._scriptStages["trendNarrativeEnd"]["summary"],\
                                                    self._completionStatus,\
                                                    self._completionStatus)
                        CommonUtils.save_progress_message(
                            self._messageURL, progressMessage)
                        self._dataframe_context.update_completion_status(
                            self._completionStatus)
                    else:
                        # self._result_setter.update_executive_summary_data({"trend_present":False})
                        print("Trend Analysis for Measure Failed")
                        print("#" * 20 + "Trend Analysis Error" + "#" * 20)
                        print(
                            "No date format for the date column %s was detected."
                            % (self._date_column_suggested))
                        print("#" * 60)
                        self._completionStatus += self._scriptWeightDict[
                            self._analysisName]["total"]
                        self._dataframe_context.update_completion_status(
                            completionStatus)
                        progressMessage = CommonUtils.create_progress_message_object("Trend","failedState","error",\
                                        "Trend Failed As "+"No Date Format For The Date Column %s Was Detected !!!" %(self._date_column_suggested),\
                                        completionStatus,completionStatus)
                        CommonUtils.save_progress_message(
                            messageURL, progressMessage)
                        self._dataframe_context.update_completion_status(
                            self._completionStatus)
                else:
                    # self._result_setter.update_executive_summary_data({"trend_present":False})
                    print("Trend Analysis for Measure Failed")
                    print("#" * 20 + "Trend Analysis Error" + "#" * 20)
                    print("No date column present for Trend Analysis.")
                    print("#" * 60)
                    self._completionStatus += self._scriptWeightDict[
                        self._analysisName]["total"]
                    self._dataframe_context.update_completion_status(
                        completionStatus)
                    progressMessage = CommonUtils.create_progress_message_object("Trend","failedState","error",\
                                    "No Date Column Present",\
                                    completionStatus,completionStatus)
                    CommonUtils.save_progress_message(messageURL,
                                                      progressMessage)
                    self._dataframe_context.update_completion_status(
                        self._completionStatus)
            else:
                print("overall Trend not Started YET")

        elif self._analysistype == "dimension":
            print("Dimension Trend Started")
            self._completionStatus += old_div(
                self._scriptWeightDict[self._analysisName]["total"] *
                self._scriptStages["initialization"]["weight"], 10)
            progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                        "initialization",\
                                        "info",\
                                        self._scriptStages["initialization"]["summary"],\
                                        self._completionStatus,\
                                        self._completionStatus)
            CommonUtils.save_progress_message(self._messageURL,
                                              progressMessage)
            self._dataframe_context.update_completion_status(
                self._completionStatus)

            self.narratives = {"card0": {}}
            if self._selected_date_columns != None:
                if self._dateFormatDetected:
                    # result_column_levels = [x[0] for x in self._data_frame.select(self._result_column).distinct().collect()]
                    try:
                        result_column_levels = self._metaParser.get_unique_level_names(
                            self._result_column)
                    except:
                        if self._pandas_flag:
                            result_column_levels = list(
                                self._data_frame[self._result_column].unique())
                        else:
                            result_column_levels = [
                                x[0] for x in self._data_frame.select(
                                    self._result_column).distinct().collect()
                            ]
                            # result_column_levels = self._data_frame.agg((F.collect_set(self._result_column).alias(self._result_column))).first().asDict()[self._result_column]

                    print("-" * 100)
                    # TODO Implement meta parser getter here
                    print(result_column_levels)
                    if self._pandas_flag:
                        level_count_df = self._data_frame[
                            self._result_column].value_counts()[0:2]
                        top2levels = list(level_count_df.index)
                    else:
                        level_count_df = self._data_frame.groupBy(
                            self._result_column).count().orderBy(
                                "count", ascending=False)
                        level_count_df_rows = level_count_df.collect()
                        top2levels = [
                            level_count_df_rows[0][0],
                            level_count_df_rows[1][0]
                        ]
                    cardData = []
                    chart_data = {}
                    cardData1 = []
                    c3_chart = {"dataType": "c3Chart", "data": {}}
                    print("#" * 40)
                    overall_count = NarrativesUtils.get_grouped_count_data_for_dimension_trend(
                        self._data_frame, self._dataLevel, self._result_column,
                        self._pandas_flag)
                    print("#" * 40)
                    for idx, level in enumerate(top2levels):
                        print("calculations in progress for the level :- ",
                              level)
                        if self._pandas_flag:
                            leveldf = self._data_frame[self._data_frame[
                                self._result_column] == level]
                        else:
                            leveldf = self._data_frame.filter(
                                col(self._result_column) == level)
                        grouped_data = NarrativesUtils.get_grouped_data_for_trend(
                            leveldf, self._dataLevel, self._result_column,
                            self._analysistype, self._pandas_flag)
                        grouped_data.rename(columns={"value": "value_count"},
                                            inplace=True)
                        grouped_data = pd.merge(grouped_data,
                                                overall_count,
                                                on='key',
                                                how='left')
                        # grouped_data["value"] = grouped_data["value_count"].apply(lambda x:round(x*100/float(self._data_frame.count()),self._num_significant_digits))
                        grouped_data["value"] = old_div(
                            grouped_data["value_count"],
                            grouped_data["totalCount"])
                        grouped_data["value"] = grouped_data["value"].apply(
                            lambda x: round(x * 100, self.
                                            _num_significant_digits))
                        if self._pandas_flag:
                            leveldf = leveldf.drop(self._date_column_suggested,
                                                   axis=1)
                            leveldf = leveldf.rename(
                                columns={
                                    "year_month": self._date_column_suggested
                                })
                            if "year_month" not in leveldf.columns:
                                leveldf["year_month"] = leveldf[
                                    self._date_column_suggested]
                            leveldf["value_col"] = 1
                        else:
                            leveldf = leveldf.drop(self._date_column_suggested)
                            leveldf = leveldf.withColumnRenamed(
                                "year_month", self._date_column_suggested)
                            if "year_month" not in leveldf.columns:
                                leveldf = leveldf.withColumn(
                                    "year_month",
                                    col(self._date_column_suggested))
                            leveldf = leveldf.withColumn('value_col', lit(1))

                        trend_narrative_obj = TrendNarrative(
                            self._result_column, self._date_column_suggested,
                            grouped_data, self._existingDateFormat,
                            self._requestedDateFormat, self._base_dir,
                            self._metaParser)
                        dataDict = trend_narrative_obj.generateDataDict(
                            grouped_data, self._dataLevel,
                            self._durationString)
                        dataDict["target_column"] = dataDict["measure"]
                        dataDict["measure"] = level
                        dataDict["duration"] = self._duration
                        dataDict["dataLevel"] = self._dataLevel
                        dataDict["durationString"] = self._durationString
                        # grouped_data.to_csv("/home/gulshan/marlabs/datasets/grouped_data"+str(idx))
                        # print json.dumps(dataDict,indent=2)
                        significant_dimensions = []
                        significant_dimension_dict = df_helper.get_chisquare_significant_dimension(
                        )
                        if significant_dimension_dict != {} and significant_dimension_dict != None:
                            significant_dimension_tuple = tuple(
                                significant_dimension_dict.items())
                            significant_dimension_tuple = sorted(
                                significant_dimension_tuple,
                                key=lambda x: x[1],
                                reverse=True)
                            significant_dimensions = [
                                x[0] for x in
                                significant_dimension_tuple[:self.
                                                            _number_of_dimensions_to_consider]
                            ]
                        else:
                            significant_dimensions = self._string_columns[:self
                                                                          .
                                                                          _number_of_dimensions_to_consider]
                        print("significant_dimensions", significant_dimensions)
                        reference_time = dataDict["reference_time"]
                        dataDict[
                            "significant_dimensions"] = significant_dimensions
                        if len(significant_dimensions) > 0:
                            st = time.time()
                            xtraData = trend_narrative_obj.get_xtra_calculations(
                                leveldf, grouped_data, significant_dimensions,
                                self._date_column_suggested, "value_col",
                                self._existingDateFormat, reference_time,
                                self._dataLevel, self._pandas_flag)
                            print("time for get_xtra_calculations",
                                  time.time() - st)
                            if xtraData != None:
                                dataDict.update(xtraData)
                        dimensionCount = trend_narrative_obj.generate_dimension_extra_narrative(
                            grouped_data, dataDict, self._dataLevel)
                        if dimensionCount != None:
                            dataDict.update(dimensionCount)

                        dataDict.update({
                            "level_index": idx,
                            "blockSplitter": self._blockSplitter,
                            "highlightFlag": self._highlightFlag
                        })

                        self._result_setter.update_executive_summary_data(
                            dataDict)
                        trendStory = NarrativesUtils.get_template_output(self._base_dir,\
                                                                        'dimension_trend.html',dataDict)
                        blocks = NarrativesUtils.block_splitter(
                            trendStory, self._blockSplitter)

                        if idx != 0:
                            cardData1 += blocks[2:]
                        else:
                            cardData1 += blocks

                        trend_chart_data = [
                            x for x in list(grouped_data[
                                ["key", "value"]].T.to_dict().values())
                            if x['key'] != None
                        ]
                        trend_chart_data = sorted(trend_chart_data,
                                                  key=lambda x: x["key"])
                        card1chartdata = trend_chart_data
                        if self._dataLevel == "day":
                            card1chartdata = [{
                                "key": str(val["key"]),
                                "value": val["value"]
                            } for val in card1chartdata]
                        elif self._dataLevel == "month":
                            card1chartdata = [{
                                "key":
                                val["key"].strftime("%b-%y"),
                                "value":
                                val["value"]
                            } for val in card1chartdata]
                        chart_data[level] = card1chartdata

                    labels = {
                        "x": "key",
                        "y": list(chart_data.keys())[0],
                        "y2": list(chart_data.keys())[1]
                    }
                    c3Chart = {
                        "data": chart_data,
                        "format": "%b-%y",
                        "label": labels,
                        "label_text": {
                            "x": "Time",
                            "y": "Percentage of " + labels["y"],
                            "y2": "Percentage of " + labels["y2"]
                        }
                    }

                    c3_chart["data"] = c3Chart
                    multiLineData = []
                    for idx in range(len(chart_data[top2levels[0]])):
                        key = chart_data[top2levels[0]][idx]["key"]
                        value = chart_data[top2levels[0]][idx]["value"]
                        try:
                            value1 = chart_data[top2levels[1]][idx]["value"]
                        except:
                            value1 = 0
                        multiLineData.append({
                            "key": key,
                            top2levels[0]: value,
                            top2levels[1]: value1
                        })
                    chartData = NormalChartData(multiLineData)
                    chartJson = ChartJson()
                    chartJson.set_data(chartData.get_data())
                    chartJson.set_label_text(c3Chart["label_text"])
                    chartJson.set_legend(c3Chart["label"])
                    chartJson.set_chart_type("line")
                    chartJson.set_yaxis_number_format(".2f")
                    chartJson.set_axes(labels)
                    st_info = [
                        "Trend Analysis",
                        "Forecast Method : Holt Winters Method"
                    ]
                    cardData1.insert(1,
                                     C3ChartData(data=chartJson, info=st_info))
                    trendCard = NormalCard(name="Trend Analysis",
                                           slug=None,
                                           cardData=cardData1)
                    trendStoryNode = NarrativesTree("Trend", None, [],
                                                    [trendCard])
                    self._story_narrative.add_a_node(trendStoryNode)
                    self._result_setter.set_trend_node(trendStoryNode)
                    self._completionStatus += old_div(
                        self._scriptWeightDict[self._analysisName]["total"] *
                        self._scriptStages["summarygeneration"]["weight"], 10)
                    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                                "summarygeneration",\
                                                "info",\
                                                self._scriptStages["summarygeneration"]["summary"],\
                                                self._completionStatus,\
                                                self._completionStatus)
                    CommonUtils.save_progress_message(self._messageURL,
                                                      progressMessage)
                    self._dataframe_context.update_completion_status(
                        self._completionStatus)

                    self._completionStatus += old_div(
                        self._scriptWeightDict[self._analysisName]["total"] *
                        self._scriptStages["completion"]["weight"], 10)
                    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                                "completion",\
                                                "info",\
                                                self._scriptStages["completion"]["summary"],\
                                                self._completionStatus,\
                                                self._completionStatus)
                    CommonUtils.save_progress_message(self._messageURL,
                                                      progressMessage)
                    self._dataframe_context.update_completion_status(
                        self._completionStatus)
                else:
                    self._result_setter.update_executive_summary_data(
                        {"trend_present": False})
                    print("Trend Analysis for Dimension Failed")
                    print("#" * 20 + "Trend Analysis Error" + "#" * 20)
                    if self._date_column_suggested:
                        print(
                            "No date format for the date column %s was detected."
                            % (self._date_column_suggested))
                    print("#" * 60)
                    self._completionStatus += self._scriptWeightDict[
                        self._analysisName]["total"]
                    self._dataframe_context.update_completion_status(
                        self._completionStatus)
                    progressMessage = CommonUtils.create_progress_message_object("Trend","failedState","error",\
                                    "Trend Failed As "+"No Date Format For The Date Column %s Was Detected !!!" %(self._date_column_suggested),\
                                    self._completionStatus,self._completionStatus)
                    CommonUtils.save_progress_message(messageURL,
                                                      progressMessage)
                    self._dataframe_context.update_completion_status(
                        self._completionStatus)

            else:
                self._result_setter.update_executive_summary_data(
                    {"trend_present": False})
                print("Trend Analysis for Dimension Failed")
                print("#" * 20 + "Trend Analysis Error" + "#" * 20)
                print("No date column present for Trend Analysis.")
                print("#" * 60)
                self._completionStatus += self._scriptWeightDict[
                    self._analysisName]["total"]
                self._dataframe_context.update_completion_status(
                    self._completionStatus)
                progressMessage = CommonUtils.create_progress_message_object("Trend","failedState","error",\
                                "No Date Column Present",\
                                self._completionStatus,self._completionStatus)
                CommonUtils.save_progress_message(messageURL, progressMessage)
                self._dataframe_context.update_completion_status(
                    self._completionStatus)
Beispiel #16
0
    def Predict(self):
        """Score the input data with the trained model and publish the results.

        The method, in order:

        1. reports the "initialization" stage;
        2. (sklearn only) loads the pickled model, builds the feature frame,
           and predicts a class and a probability for every row;
        3. writes the scored frame to ``<score path>/data.csv`` with the
           predicted class stored under the result column name;
        4. registers a table with the five highest-probability rows per
           predicted class, when a UID column is available in the frame;
        5. reports the "prediction" stage;
        6. if at least two classes were predicted, runs the decision-tree
           analysis and its narrative; otherwise builds a single summary
           card (text + donut chart + UID table) and registers it.

        Progress is published through ``CommonUtils.save_progress_message``
        and ``self._dataframe_context.update_completion_status``.
        The method returns ``None``; results are delivered via
        ``self._result_setter``, ``self._score_summary`` and the CSV file.
        """
        self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight(
        )
        self._scriptStages = {
            "initialization": {
                "summary": "Initialized the Random Forest Scripts",
                "weight": 2
            },
            "prediction": {
                "summary": "Random Forest Model Prediction Finished",
                "weight": 2
            },
            "frequency": {
                "summary": "descriptive analysis finished",
                "weight": 2
            },
            "chisquare": {
                "summary": "chi Square analysis finished",
                "weight": 4
            },
            "completion": {
                "summary": "all analysis finished",
                "weight": 4
            },
        }

        def report_stage(stage):
            # Add the stage's share (weight / 10 of this analysis' total) to
            # the completion status and publish it.
            self._completionStatus += old_div(
                self._scriptWeightDict[self._analysisName]["total"] *
                self._scriptStages[stage]["weight"], 10)
            progressMessage = CommonUtils.create_progress_message_object(
                self._analysisName, stage, "info",
                self._scriptStages[stage]["summary"],
                self._completionStatus, self._completionStatus)
            CommonUtils.save_progress_message(self._messageURL,
                                              progressMessage,
                                              ignore=self._ignoreMsg)
            self._dataframe_context.update_completion_status(
                self._completionStatus)

        report_stage("initialization")

        # Categorical feature columns: string columns minus the UID column
        # (when it is an ignored suggestion) and minus all date columns.
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()
        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})
        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns) - set(allDateCols))
        result_column = self._dataframe_context.get_result_column()

        if self._mlEnv == "spark":
            # NOTE(review): nothing is computed for the spark environment, so
            # `df`, `score` and `score_data_path` stay undefined and the code
            # below raises NameError. Confirm whether spark scoring is
            # expected to be supported here.
            pass
        elif self._mlEnv == "sklearn":
            # Paths may start with "file"; the first 7 characters are dropped
            # in that case (presumably a "file://" URI prefix).
            score_data_path = self._dataframe_context.get_score_path(
            ) + "/data.csv"
            if score_data_path.startswith("file"):
                score_data_path = score_data_path[7:]
            trained_model_path = self._dataframe_context.get_model_path()
            trained_model_path += "/" + self._dataframe_context.get_model_for_scoring(
            ) + ".pkl"
            if trained_model_path.startswith("file"):
                trained_model_path = trained_model_path[7:]
            # NOTE(security): joblib.load unpickles the file; only load
            # models from trusted locations.
            trained_model = joblib.load(trained_model_path)
            df = self._data_frame.toPandas()
            model_columns = self._dataframe_context.get_model_features()
            pandas_df = MLUtils.create_dummy_columns(
                df, [x for x in categorical_columns if x != result_column])
            pandas_df = MLUtils.fill_missing_columns(pandas_df, model_columns,
                                                     result_column)
            if uid_col:
                pandas_df = pandas_df[[
                    x for x in pandas_df.columns if x != uid_col
                ]]
            y_score = trained_model.predict(pandas_df)
            y_prob = trained_model.predict_proba(pandas_df)
            y_prob = MLUtils.calculate_predicted_probability(y_prob)
            y_prob = [round(x, 2) for x in y_prob]
            score = {
                "predicted_class": y_score,
                "predicted_probability": y_prob
            }

        # Attach predictions to the original frame and map the model's
        # encoded classes back to their labels.
        df["predicted_class"] = score["predicted_class"]
        labelMappingDict = self._dataframe_context.get_label_map()
        df["predicted_class"] = df["predicted_class"].apply(
            lambda x: labelMappingDict[x] if x is not None else "NA")
        df["predicted_probability"] = score["predicted_probability"]
        self._score_summary[
            "prediction_split"] = MLUtils.calculate_scored_probability_stats(
                df)
        self._score_summary["result_column"] = result_column
        # The predicted class replaces any existing result column.
        if result_column in df.columns:
            df.drop(result_column, axis=1, inplace=True)
        df = df.rename(index=str, columns={"predicted_class": result_column})
        df.to_csv(score_data_path, header=True, index=False)

        # Fall back to the first suggested UID column when none is configured.
        uidCol = self._dataframe_context.get_uid_column()
        if uidCol is None:
            uidCols = self._metaParser.get_suggested_uid_columns()
            if len(uidCols) > 0:
                uidCol = uidCols[0]
        uidTableData = []
        predictedClasses = list(df[result_column].unique())
        if uidCol:
            if uidCol in df.columns:
                for level in predictedClasses:
                    levelDf = df[df[result_column] == level]
                    levelDf = levelDf[[
                        uidCol, "predicted_probability", result_column
                    ]]
                    # Non-inplace sort returns a fresh frame, so the column
                    # assignment below does not write into a slice of `df`.
                    levelDf = levelDf.sort_values(by="predicted_probability",
                                                  ascending=False)
                    levelDf["predicted_probability"] = levelDf[
                        "predicted_probability"].apply(
                            lambda x: humanize.apnumber(x * 100) + "%"
                            if x * 100 >= 10 else str(int(x * 100)) + "%")
                    # Keep the five most probable rows of this class.
                    uidTableData.append(levelDf[:5])
                uidTableData = pd.concat(uidTableData)
                uidTableData = [list(arr) for arr in list(uidTableData.values)]
                uidTableData = [[uidCol, "Probability", result_column]
                                ] + uidTableData
                uidTable = TableData()
                uidTable.set_table_width(25)
                uidTable.set_table_data(uidTableData)
                uidTable.set_table_type("normalHideColumn")
                self._result_setter.set_unique_identifier_table(
                    json.loads(
                        CommonUtils.convert_python_object_to_json(uidTable)))

        report_stage("prediction")

        print("STARTING DIMENSION ANALYSIS ...")
        columns_to_drop = []

        # With an explicit consider-list, drop everything outside it;
        # otherwise only the probability column is dropped. The result
        # column is always kept.
        columns_to_keep = self._dataframe_context.get_score_consider_columns(
        ) or []
        if len(columns_to_keep) > 0:
            columns_to_drop = list(set(df.columns) - set(columns_to_keep))
        else:
            columns_to_drop += ["predicted_probability"]
        columns_to_drop = [
            x for x in columns_to_drop
            if x in df.columns and x != result_column
        ]
        print("columns_to_drop", columns_to_drop)
        df.drop(columns_to_drop, axis=1, inplace=True)

        resultColLevelCount = dict(df[result_column].value_counts())
        self._metaParser.update_column_dict(
            result_column, {
                "LevelCount": resultColLevelCount,
                "numberOfUniqueValues": len(list(resultColLevelCount.keys()))
            })
        self._dataframe_context.set_story_on_scored_data(True)
        SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                            sparkSession=self._spark)
        spark_scored_df = SQLctx.createDataFrame(df)
        # TODO update metadata for the newly created dataframe
        self._dataframe_context.update_consider_columns(columns_to_keep)
        df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context,
                                    self._metaParser)
        df_helper.set_params()
        spark_scored_df = df_helper.get_data_frame()

        # The "frequency", "chisquare" and "completion" stages declared in
        # self._scriptStages are not executed or reported by this method.
        if len(predictedClasses) >= 2:
            try:
                df_decision_tree_obj = DecisionTrees(
                    spark_scored_df,
                    df_helper,
                    self._dataframe_context,
                    self._spark,
                    self._metaParser,
                    scriptWeight=self._scriptWeightDict,
                    analysisName=self._analysisName).test_all(
                        dimension_columns=[result_column])
                narratives_obj = CommonUtils.as_dict(
                    DecisionTreeNarrative(result_column,
                                          df_decision_tree_obj,
                                          self._dataframe_helper,
                                          self._dataframe_context,
                                          self._metaParser,
                                          self._result_setter,
                                          story_narrative=None,
                                          analysisName=self._analysisName,
                                          scriptWeight=self._scriptWeightDict))
                print(narratives_obj)
            except Exception as err:
                # Best effort: a failed decision tree must not abort scoring,
                # but the cause is reported instead of being swallowed.
                print("DecisionTree Analysis Failed ", err)
        else:
            # Fewer than two predicted classes: a decision tree is
            # meaningless, so build a single summary card instead.
            data_dict = {
                "npred": len(predictedClasses),
                "nactual": len(list(labelMappingDict.values()))
            }
            if data_dict["nactual"] > 2:
                # Start from an empty mapping; the original indexed into an
                # unbound name here and raised UnboundLocalError.
                levelCountDict = {}
                levelCountDict[predictedClasses[0]] = resultColLevelCount[
                    predictedClasses[0]]
                levelCountDict["Others"] = sum([
                    v for k, v in list(resultColLevelCount.items())
                    if k != predictedClasses[0]
                ])
            else:
                # Intentionally aliases resultColLevelCount: the class that
                # was never predicted is added to it with a count of zero.
                levelCountDict = resultColLevelCount
                otherClass = list(
                    set(labelMappingDict.values()) - set(predictedClasses))[0]
                levelCountDict[otherClass] = 0

                print(levelCountDict)

            total = float(
                sum([x for x in list(levelCountDict.values())
                     if x is not None]))
            levelCountTuple = []
            for k, v in list(levelCountDict.items()):
                if v is None:
                    continue
                pct = old_div(v * 100, total)
                levelCountTuple.append({
                    "name": k,
                    "count": v,
                    "percentage": humanize.apnumber(pct) + "%"
                    if pct >= 10 else str(int(pct)) + "%"
                })
            levelCountTuple = sorted(levelCountTuple,
                                     key=lambda x: x["count"],
                                     reverse=True)
            data_dict["blockSplitter"] = "|~NEWBLOCK~|"
            data_dict["targetcol"] = result_column
            data_dict["nlevel"] = len(list(levelCountDict.keys()))
            data_dict["topLevel"] = levelCountTuple[0]
            data_dict["secondLevel"] = levelCountTuple[1]
            maincardSummary = NarrativesUtils.get_template_output(
                "/apps/", 'scorewithoutdtree.html', data_dict)

            main_card = NormalCard()
            main_card_data = []
            main_card_narrative = NarrativesUtils.block_splitter(
                maincardSummary, "|~NEWBLOCK~|")
            main_card_data += main_card_narrative

            chartData = NormalChartData([levelCountDict]).get_data()
            chartJson = ChartJson(data=chartData)
            chartJson.set_title(result_column)
            chartJson.set_chart_type("donut")
            mainCardChart = C3ChartData(data=chartJson)
            mainCardChart.set_width_percent(33)
            main_card_data.append(mainCardChart)

            uidTable = self._result_setter.get_unique_identifier_table()
            if uidTable is not None:
                main_card_data.append(uidTable)
            main_card.set_card_data(main_card_data)
            main_card.set_card_name(
                "Predicting Key Drivers of {}".format(result_column))
            self._result_setter.set_score_dtree_cards([main_card], {})
Beispiel #17
0
    def _generate_analysis(self):
        """Build the distribution card for the dimension column.

        Parses the frequency data in ``self._dimension_col_freq_dict``,
        derives summary figures (largest/smallest level, average count,
        levels above average, levels that make up 75% of observations),
        renders the two ``dimension_distribution`` templates, and assembles
        a card made of: template 1 blocks, a bar chart of counts per level,
        template 2 blocks, and an HTML snippet with the largest/smallest
        shares.

        Side effects: sets ``self.count`` and ``self.subheader``, adds the
        card to ``self._dimensionSummaryNode`` and registers it through
        ``self._result_setter.set_score_freq_card``.

        Returns:
            The list of card elements that was placed in the card.
        """
        lines = []
        # self._dimension_col_freq_dict is passed to json.loads, so it is
        # presumably a JSON string shaped like
        # {colname: {colname: {row_key: level}, "count": {row_key: n}}}
        # -- TODO confirm against the producer.
        freq_dict = json.loads(self._dimension_col_freq_dict)
        colname = self._colname
        levels = freq_dict[colname][colname]
        count = freq_dict[colname]['count']

        cols_to_bin = self._dataframe_helper.get_cols_to_bin()
        # Single pre-formatted argument: prints the same text whether
        # `print` is the Python 2 statement or the Python 3 function.
        print("self._dataframe_helper.get_cols_to_bin() %s" % (cols_to_bin, ))

        if colname in cols_to_bin:
            # Binned columns are ordered by level name using a natural sort,
            # so that digit runs compare as numbers.
            def alphanum_key(key):
                return [
                    int(c) if c.isdigit() else c
                    for c in re.split('([0-9]+)', key)
                ]

            count_by_level = {}
            for k, v in levels.items():
                count_by_level[v] = count[k]
            freq_data = [{
                "key": each,
                "Count": count_by_level[each]
            } for each in sorted(levels.values(), key=alphanum_key)]
        else:
            # Everything else is ordered by descending count.
            freq_data = sorted(({
                "key": v,
                "Count": count[k]
            } for k, v in levels.items()),
                               key=lambda x: x["Count"],
                               reverse=True)

        total_count = sum(count.values())
        max_key = max(count, key=count.get)
        min_key = min(count, key=count.get)

        data_dict = {"colname": self._colname}
        data_dict["plural_colname"] = pattern.en.pluralize(
            data_dict["colname"])
        data_dict["blockSplitter"] = self._blockSplitter
        data_dict["max"] = {"key": levels[max_key], "val": count[max_key]}
        data_dict["min"] = {"key": levels[min_key], "val": count[min_key]}
        # Materialised as a list so it is a real sequence on Python 3 too.
        data_dict["keys"] = list(levels.values())
        data_dict["avg"] = round(total_count / float(len(count)), 2)
        data_dict["above_avg"] = [
            levels[key] for key in count if count[key] > data_dict["avg"]
        ]
        data_dict["per_bigger_avg"] = round(
            data_dict["max"]["val"] / float(data_dict["avg"]), 4)
        data_dict["per_bigger_low"] = round(
            data_dict["max"]["val"] / float(data_dict["min"]["val"]), 4)
        uniq_val = list(set(count.values()))
        data_dict["n_uniq"] = len(uniq_val)
        if len(uniq_val) == 1:
            data_dict["count"] = uniq_val[0]

        if len(data_dict["keys"]) >= 3:
            # Walk the levels from most to least frequent until they cover
            # at least 75% of all observations.
            percent_75 = total_count * 0.75
            kv = sorted(count.items(),
                        key=operator.itemgetter(1),
                        reverse=True)
            kv_75 = []
            temp_sum = 0
            for k, v in kv:
                temp_sum = temp_sum + v
                kv_75.append((levels[k], v))
                if temp_sum >= percent_75:
                    break
            data_dict["percent_contr"] = round(
                temp_sum * 100.0 / float(total_count), 2)
            data_dict["kv_75"] = len(kv_75)
            data_dict["kv_75_cat"] = [k for k, v in kv_75]

        largest_text = " %s is the largest with %s observations" % (
            data_dict["max"]["key"],
            NarrativesUtils.round_number(data_dict["max"]["val"]))
        smallest_text = " %s is the smallest with %s observations" % (
            data_dict["min"]["key"],
            NarrativesUtils.round_number(data_dict["min"]["val"]))
        largest_per = round(
            data_dict["max"]["val"] * 100.0 / float(total_count), 2)
        data_dict['largest_per'] = largest_per
        smallest_per = round(
            data_dict["min"]["val"] * 100.0 / float(total_count), 2)
        self.count = {
            "largest": [largest_text,
                        str(round(largest_per, 1)) + '%'],
            "smallest": [smallest_text,
                         str(round(smallest_per, 1)) + '%']
        }
        # The heading is the same regardless of the number of levels.
        self.subheader = 'Distribution of ' + self._capitalized_column_name

        output1 = NarrativesUtils.get_template_output(
            self._base_dir, 'dimension_distribution1.html', data_dict)
        output1 = NarrativesUtils.block_splitter(output1, self._blockSplitter)
        output2 = NarrativesUtils.get_template_output(
            self._base_dir, 'dimension_distribution2.html', data_dict)
        output2 = NarrativesUtils.block_splitter(output2, self._blockSplitter)

        chart_data = NormalChartData(data=freq_data)
        chart_json = ChartJson()
        chart_json.set_data(chart_data.get_data())
        chart_json.set_chart_type("bar")
        chart_json.set_axes({"x": "key", "y": "Count"})
        chart_json.set_label_text({'x': ' ', 'y': 'No. of Observations'})
        chart_json.set_yaxis_number_format(".2s")

        lines += output1
        lines += [C3ChartData(data=chart_json)]
        lines += output2
        bubble_data = "<div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}%</span><br /><small>{}</small></h2></div><div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}%</span><br /><small>{}</small></h2></div>".format(
            largest_per, largest_text, smallest_per, smallest_text)
        lines.append(HtmlData(data=bubble_data))

        dimensionCard1 = NormalCard(name=self.subheader,
                                    slug=None,
                                    cardData=lines)
        self._dimensionSummaryNode.add_a_card(dimensionCard1)
        self._result_setter.set_score_freq_card(
            json.loads(
                CommonUtils.convert_python_object_to_json(dimensionCard1)))
        return lines
Beispiel #18
0
    def _generate_narratives(self):
        """Render the chi-square narratives for the analysed dimension.

        Reads the percentage and contingency tables from
        ``self._chisquare_result``, replaces bin indices by bin names when
        the result carries splits, and computes:

        * the categories (most frequent first) that together reach at least
          50% of the cumulative percentage;
        * the lowest contributing category;
        * for every row of the by-target table, which column holds its
          maximum and its minimum share.

        Side effects: when splits are present, ``column_two_values`` of the
        three tables are rewritten in place; ``self.table``,
        ``self.analysis`` and ``self.sub_heading`` are set. Returns ``None``.
        """
        chisquare_result = self._chisquare_result
        target_dimension = self._target_dimension
        analysed_dimension = self._analysed_dimension
        significant_variables = self._significant_variables
        num_analysed_variables = self._num_analysed_variables

        splits = chisquare_result.get_splits()
        percentage_table = chisquare_result.get_rounded_percentage_table()
        contingency_table = chisquare_result.get_contingency_table()
        percentage_table_by_target = chisquare_result.get_rounded_percentage_table_by_target(
        )

        if splits:
            # column_two_values hold bin indices (possibly as float-like
            # strings); swap them for the readable bin names.
            bin_names = self._get_bin_names(splits)
            for table in (percentage_table, contingency_table,
                          percentage_table_by_target):
                table.column_two_values = [
                    bin_names[int(float(i))] for i in table.column_two_values
                ]

        # Column totals of the percentage table, largest first.
        num_categories = len(percentage_table.column_two_values)
        cumulative_percent = {}
        for i, column_two_value in enumerate(
                percentage_table.column_two_values):
            cumulative_percent[column_two_value] = sum(
                row_data[i] for row_data in percentage_table.table)
        cumulative_percent = sorted(cumulative_percent.items(),
                                    key=operator.itemgetter(1),
                                    reverse=True)

        # Smallest leading set of categories reaching at least 50%.
        half_observation_categories = []
        half_observation_percent = 0
        for c, p in cumulative_percent:
            half_observation_percent = half_observation_percent + p
            half_observation_categories.append(c)
            if half_observation_percent >= 50:
                break

        lowest_contributor = cumulative_percent[-1][0]
        lowest_contributor_percent = cumulative_percent[-1][1]

        self.table = percentage_table_by_target

        # Group the row categories by the column in which each row peaks
        # (maximums) and bottoms out (minimums). setdefault replaces
        # dict.has_key, which no longer exists on Python 3.
        maximums = {}
        minimums = {}
        for idx, t in enumerate(self.table.table):
            present_cat = self.table.column_one_values[idx]
            row_max = max(t)
            row_min = min(t)
            maxi = self.table.column_two_values[t.index(row_max)]
            mini = self.table.column_two_values[t.index(row_min)]
            maximums.setdefault(maxi, []).append(present_cat + '(' +
                                                 str(row_max) + '%)')
            minimums.setdefault(mini, []).append(present_cat + '(' +
                                                 str(row_min) + '%)')
        maximums = dict((k, combine(v)) for k, v in maximums.items())
        minimums = dict((k, combine(v)) for k, v in minimums.items())

        data_dict = {
            'num_variables': num_analysed_variables,
            'num_significant_variables': len(significant_variables),
            'significant_variables': significant_variables,
            'target_dimension': target_dimension,
            'fifty_percent_categories': half_observation_categories,
            'fifty_percent_contribution': round(half_observation_percent, 2),
            'lowest_contributor': lowest_contributor,
            'lowest_contributor_percent': round(lowest_contributor_percent, 2),
            'num_categories': num_categories,
            'analysed_dimension': analysed_dimension,
            'maximums': maximums,
            'minimums': minimums
        }
        analysis1 = NarrativesUtils.get_template_output(
            self._base_dir, 'chisquare_template3.html', data_dict)
        analysis2 = NarrativesUtils.get_template_output(
            self._base_dir, 'chisquare_template4.html', data_dict)

        self.analysis = {
            'title1': '',
            'analysis1': analysis1,
            'title2': '',
            'analysis2': analysis2
        }
        self.sub_heading = analysed_dimension.title()
Beispiel #19
0
    def _generate_card1(self):
        """Build the "Impact of <dimension> on <measure>" overview card.

        Reads the per-level totals and averages from
        ``self._dimension_anova_result.get_level_dataframe()``, charts them,
        decides whether the measure total is spread evenly across the levels,
        picks the leading and trailing levels when it is not, renders the
        overview narrative template and hands the chart, narrative and card
        to ``self._result_setter``.

        Side effects:
            * sets ``self._anovaCard1`` and ``self.card1``;
            * sets ``self._card3_required = True`` when there are five or
              more levels;
            * calls ``set_anova_chart_on_scored_data``,
              ``set_anova_narrative_on_scored_data`` and
              ``set_anova_cards_regression_score`` on
              ``self._result_setter``.

        Returns:
            None.
        """
        self._anovaCard1 = NormalCard(name='Impact on ' +
                                      self._measure_column_capitalized)
        heading = ('<h3>' + self._measure_column_capitalized +
                   ': Impact of ' + self._dimension_column_capitalized +
                   ' on ' + self._measure_column_capitalized + '</h3>')
        lines = []
        lines += NarrativesUtils.block_splitter(heading, self._blockSplitter)
        self.card1 = Card('Impact of ' + self._dimension_column_capitalized +
                          ' on ' + self._measure_column_capitalized)

        # NOTE(review): the columns are used with ``.argmax()`` below, so they
        # are presumably pandas Series / numpy arrays -- confirm against
        # ``get_level_dataframe``.
        dim_table = self._dimension_anova_result.get_level_dataframe()
        keys = dim_table['levels']
        totals = dim_table['total']
        means = dim_table['average']
        if len(keys) >= 5:
            self._card3_required = True

        group_by_total = {}
        group_by_mean = {}
        for level, level_total, level_mean in zip(keys, totals, means):
            group_by_total[level] = level_total
            group_by_mean[level] = level_mean

        # Each chart gets its own labels dict so they never share state.
        chart1 = chart(data=group_by_total,
                       labels={
                           self._dimension_column_capitalized:
                           self._measure_column_capitalized
                       })
        chart2 = chart(data=group_by_mean,
                       labels={
                           self._dimension_column_capitalized:
                           self._measure_column_capitalized
                       })
        self.card1.add_chart('group_by_total', chart1)
        self.card1.add_chart('group_by_mean', chart2)

        statistical_info_array = [
            ("Test Type", "ANOVA"), ("P-Value", "0.05"),
            ("F Value",
             str(round(self._dimension_anova_result.get_f_value(), 2))),
            ("Inference",
             "There is a significant effect of {} on {} (target).".format(
                 self._dimension_column_capitalized,
                 self._measure_column_capitalized))
        ]
        statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
            statistical_info_array)
        card1_chart1 = C3ChartData(data=self._get_c3chart_card1_chart1(
            group_by_total, group_by_mean),
                                   info=statistical_info_array)
        self._result_setter.set_anova_chart_on_scored_data(
            {self._dimension_column: card1_chart1})
        lines.append(card1_chart1)

        # Level with the largest total, and level with the largest average.
        top_total_pos = totals.argmax()
        top_group_by_total = keys[top_total_pos]
        sum_top_group_by_total = max(totals)
        avg_top_group_by_total = means[top_total_pos]

        top_mean_pos = means.argmax()
        top_group_by_mean = keys[top_mean_pos]
        sum_top_group_by_mean = totals[top_mean_pos]
        avg_top_group_by_mean = max(means)

        # Rank levels by total, largest first.
        groups_by_total = sorted(zip(totals, keys), reverse=True)
        sorted_total = sorted(totals, reverse=True)
        sum_total = sum(totals)
        five_percent_total = 0.05 * sum_total
        fifteen_percent_total = 0.15 * sum_total

        # Compare the top half with the bottom half of the ranking. With an
        # odd number of levels the middle level is counted in both halves.
        ranked_count = len(groups_by_total)
        fifty_percent_index = (ranked_count + 1) // 2
        top_fifty_total = sum(sorted_total[:fifty_percent_index])
        bottom_fifty_total = sum(sorted_total[ranked_count // 2:])
        uniformly_distributed = not (
            top_fifty_total - bottom_fifty_total >= fifteen_percent_total)

        if uniformly_distributed:
            top_groups = []
            top_groups_contribution = 0
            bottom_groups = []
            bottom_groups_contribution = 0
        elif ranked_count > 2:
            # Gaps between consecutive ranked totals within the top half.
            diffs = [
                sorted_total[i] - sorted_total[i + 1]
                for i in range(fifty_percent_index)
            ]
            # The top group is cut at the widest gap, ignoring the very first
            # gap. Search only positions >= 1 so that a tie with diffs[0]
            # cannot drag the cut back to position 0.
            max_diff_index = max(range(1, len(diffs)),
                                 key=diffs.__getitem__)
            top_groups = [
                level for _, level in groups_by_total[:max_diff_index + 1]
            ]
            top_groups_contribution = sum(
                sorted_total[:max_diff_index + 1]) * 100 / sum_total

            # Collect levels from the bottom of the ranking (never the top
            # one) until they add up to at least 5% of the overall total.
            bottom_groups = []
            bottom_total = 0
            for level_total, level in groups_by_total[:0:-1]:
                bottom_groups.append(level)
                bottom_total = bottom_total + level_total
                if bottom_total >= five_percent_total:
                    break
            bottom_groups_contribution = bottom_total * 100 / sum_total
        else:
            top_groups = [groups_by_total[0][1]]
            top_groups_contribution = groups_by_total[0][0] * 100 / sum_total
            if ranked_count > 1:
                bottom_groups = [groups_by_total[1][1]]
                bottom_groups_contribution = (groups_by_total[1][0] * 100 /
                                              sum_total)
            else:
                # A single level with a non-positive total lands here; there
                # is no second level to report as the bottom group.
                bottom_groups = []
                bottom_groups_contribution = 0

        num_groups = len(keys)

        data_dict = {
            'uniformly_distributed':
            uniformly_distributed,
            'top_groups':
            top_groups,
            'num_top_groups':
            len(top_groups),
            'top_groups_percent':
            NarrativesUtils.round_number(top_groups_contribution, 2),
            'dimension_name':
            self._dimension_column,
            'plural_dimension_name':
            NarrativesUtils.pluralize(self._dimension_column),
            'measure_name':
            self._measure_column,
            'best_category_by_mean':
            top_group_by_mean,
            'best_category_by_mean_cont':
            round(100.0 * sum_top_group_by_mean / sum_total, 2),
            'best_category_by_mean_avg':
            NarrativesUtils.round_number(avg_top_group_by_mean, 2),
            'best_category_by_total':
            top_group_by_total,
            'best_category_by_total_cont':
            round(100.0 * sum_top_group_by_total / sum_total, 2),
            'best_category_by_total_avg':
            NarrativesUtils.round_number(avg_top_group_by_total, 2),
            'best_category_by_total_sum':
            NarrativesUtils.round_number(sum_top_group_by_total, 2),
            'bottom_groups':
            bottom_groups,
            'num_bottom_groups':
            len(bottom_groups),
            'bottom_groups_percent':
            NarrativesUtils.round_number(bottom_groups_contribution, 2),
            'num_groups':
            num_groups
        }

        # NOTE(review): kept as an equality test because the flag's type is
        # not visible here; a plain truthiness test could behave differently.
        if self._binAnalyzedCol == True:
            template_name = 'anova_template_3_binned_IV.html'
        else:
            template_name = 'anova_template_3.html'
        narrativeText = NarrativesUtils.get_template_output(
            self._base_dir, template_name, data_dict)
        output = {'header': 'Overview', 'content': [narrativeText]}
        self._result_setter.set_anova_narrative_on_scored_data(
            {self._dimension_column: narrativeText})

        lines += NarrativesUtils.block_splitter(narrativeText,
                                                self._blockSplitter)
        self._anovaCard1.set_card_data(lines)
        self.card1.add_paragraph(dict(output))
        self._result_setter.set_anova_cards_regression_score(self.card1)
    def _generate_narratives(self):
        chisquare_result = self._chisquare_result
        target_dimension = self._target_dimension
        analysed_dimension = self._analysed_dimension
        significant_variables = self._significant_variables
        num_analysed_variables = self._num_analysed_variables
        table = self._chiSquareTable
        total = self._chiSquareTable.get_total()

        levels = self._chiSquareTable.get_column_two_levels()
        level_counts = self._chiSquareTable.get_column_total()
        levels_count_sum = sum(level_counts)
        levels_percentages = [
            old_div(i * 100.0, levels_count_sum) for i in level_counts
        ]
        sorted_levels = sorted(zip(level_counts, levels), reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]

        top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        bottom_dim = sorted_levels[-1][1]
        bottom_dim_contribution = sorted_levels[-1][0]
        bottom_dims = [
            y for x, y in sorted_levels if x == bottom_dim_contribution
        ]

        target_levels = self._chiSquareTable.get_column_one_levels()

        target_counts = self._chiSquareTable.get_row_total()
        sorted_target_levels = sorted(zip(target_counts, target_levels),
                                      reverse=True)

        top_target_count, top_target = sorted_target_levels[0]
        second_target_count, second_target = sorted_target_levels[1]

        top_target_contributions = [
            table.get_value(top_target, i) for i in levels
        ]
        sum_top_target = sum(top_target_contributions)

        sorted_levels = sorted(zip(top_target_contributions, levels),
                               reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]
        top_target_top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        top_target_top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        top_target_bottom_dim = sorted_levels[-1][1]
        top_target_bottom_dim_contribution = sorted_levels[-1][0]

        top_target_percentages = [
            old_div(i * 100.0, sum_top_target)
            for i in top_target_contributions
        ]
        best_top_target_index = top_target_contributions.index(
            max(top_target_contributions))
        worst_top_target_index = top_target_contributions.index(
            min(top_target_contributions))
        top_target_differences = [
            x - y for x, y in zip(levels_percentages, top_target_percentages)
        ]
        if len(top_target_differences) > 6:
            tops = 2
            bottoms = -2
        elif len(top_target_differences) > 4:
            tops = 2
            bottoms = -1
        else:
            tops = 1
            bottoms = -1
        sorted_ = sorted(enumerate(top_target_differences),
                         key=lambda x: x[1],
                         reverse=True)
        best_top_difference_indices = [x for x, y in sorted_[:tops]]
        worst_top_difference_indices = [x for x, y in sorted_[bottoms:]]

        top_target_shares = [
            old_div(x * 100.0, y)
            for x, y in zip(top_target_contributions, level_counts)
        ]
        max_top_target_shares = max(top_target_shares)
        best_top_target_share_index = [
            idx for idx, val in enumerate(top_target_shares)
            if val == max_top_target_shares
        ]
        level_counts_threshold = old_div(
            sum(level_counts) * 0.05, len(level_counts))
        min_top_target_shares = min([
            x for x, y in zip(top_target_shares, level_counts)
            if y >= level_counts_threshold
        ])
        if max_top_target_shares == min_top_target_shares:
            worst_top_target_share_index = []
        else:
            worst_top_target_share_index = [
                idx for idx, val in enumerate(top_target_shares)
                if val == min_top_target_shares
            ]
        overall_top_percentage = old_div(sum_top_target * 100.0, total)

        second_target_contributions = [
            table.get_value(second_target, i) for i in levels
        ]
        sum_second_target = sum(second_target_contributions)

        sorted_levels = sorted(zip(second_target_contributions, levels),
                               reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]
        second_target_top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        second_target_top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        second_target_bottom_dim = sorted_levels[-1][1]
        second_target_bottom_dim_contribution = sorted_levels[-1][0]

        second_target_percentages = [
            old_div(i * 100.0, sum_second_target)
            for i in second_target_contributions
        ]
        best_second_target_index = second_target_contributions.index(
            max(second_target_contributions))
        worst_second_target_index = second_target_contributions.index(
            min(second_target_contributions))
        second_target_differences = [
            x - y
            for x, y in zip(levels_percentages, second_target_percentages)
        ]
        if len(second_target_differences) > 6:
            tops = 2
            bottoms = -2
        elif len(second_target_differences) > 4:
            tops = 2
            bottoms = -1
        else:
            tops = 1
            bottoms = -1
        sorted_ = sorted(enumerate(second_target_differences),
                         key=lambda x: x[1],
                         reverse=True)
        best_second_difference_indices = [x for x, y in sorted_[:tops]]
        worst_second_difference_indices = [x for x, y in sorted_[bottoms:]]

        second_target_shares = [
            old_div(x * 100.0, y)
            for x, y in zip(second_target_contributions, level_counts)
        ]
        max_second_target_shares = max(second_target_shares)
        best_second_target_share_index = [
            idx for idx, val in enumerate(second_target_shares)
            if val == max_second_target_shares
        ]
        level_counts_threshold = old_div(
            sum(level_counts) * 0.05, len(level_counts))
        if min(second_target_shares) == 0:
            min_second_target_shares = min([
                x for x, y in zip(second_target_shares, level_counts) if x != 0
            ])
        else:
            min_second_target_shares = min([
                x for x, y in zip(second_target_shares, level_counts)
                if y >= level_counts_threshold
            ])
        # worst_second_target_share_index = second_target_shares.index(min_second_target_shares)
        if max_second_target_shares == min_second_target_shares:
            worst_second_target_share_index = []
        else:
            worst_second_target_share_index = [
                idx for idx, val in enumerate(second_target_shares)
                if val == min_second_target_shares
            ]
        overall_second_percentage = old_div(sum_second_target * 100.0, total)

        targetCardDataDict = {}
        targetCardDataDict['target'] = target_dimension
        targetCardDataDict['colname'] = analysed_dimension
        targetCardDataDict['num_significant'] = len(significant_variables)
        targetCardDataDict['plural_colname'] = NarrativesUtils.pluralize(
            analysed_dimension)

        targetCardDataDict["blockSplitter"] = self._blockSplitter
        targetCardDataDict["binTargetCol"] = self._binTargetCol
        targetCardDataDict["binAnalyzedCol"] = self._binAnalyzedCol
        targetCardDataDict['highlightFlag'] = self._highlightFlag
        targetCardDataDict['levels'] = levels

        data_dict = {}
        data_dict[
            'best_second_difference'] = best_second_difference_indices  ##these changed
        data_dict['worst_second_difference'] = worst_second_difference_indices
        data_dict['best_top_difference'] = best_top_difference_indices
        data_dict['worst_top_difference'] = worst_top_difference_indices
        data_dict['levels_percentages'] = levels_percentages
        data_dict['top_target_percentages'] = top_target_percentages
        data_dict['second_target_percentages'] = second_target_percentages
        data_dict['levels'] = levels
        data_dict['best_top_share'] = best_top_target_share_index
        data_dict['worst_top_share'] = worst_top_target_share_index
        data_dict['best_second_share'] = best_second_target_share_index
        data_dict['worst_second_share'] = worst_second_target_share_index
        data_dict['top_target_shares'] = top_target_shares
        data_dict['second_target_shares'] = second_target_shares
        data_dict['overall_second'] = overall_second_percentage
        data_dict['overall_top'] = overall_top_percentage

        data_dict['num_significant'] = len(significant_variables)
        data_dict['colname'] = analysed_dimension
        data_dict['plural_colname'] = NarrativesUtils.pluralize(
            analysed_dimension)
        data_dict['target'] = target_dimension
        data_dict['top_levels'] = top_dims
        data_dict['top_levels_percent'] = round(
            old_div(top_dims_contribution * 100.0, total), 1)
        data_dict['bottom_level'] = bottom_dim
        data_dict['bottom_levels'] = bottom_dims
        data_dict['bottom_level_percent'] = round(
            old_div(bottom_dim_contribution * 100, sum(level_counts)), 2)
        data_dict['second_target'] = second_target
        data_dict['second_target_top_dims'] = second_target_top_dims
        data_dict['second_target_top_dims_contribution'] = old_div(
            second_target_top_dims_contribution * 100.0,
            sum(second_target_contributions))
        data_dict['second_target_bottom_dim'] = second_target_bottom_dim
        data_dict[
            'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution
        data_dict['best_second_target'] = levels[best_second_target_index]
        data_dict['best_second_target_count'] = second_target_contributions[
            best_second_target_index]
        data_dict['best_second_target_percent'] = round(
            old_div(
                second_target_contributions[best_second_target_index] * 100.0,
                sum(second_target_contributions)), 2)
        data_dict['worst_second_target'] = levels[worst_second_target_index]
        data_dict['worst_second_target_percent'] = round(
            old_div(
                second_target_contributions[worst_second_target_index] * 100.0,
                sum(second_target_contributions)), 2)

        data_dict['top_target'] = top_target
        data_dict['top_target_top_dims'] = top_target_top_dims
        data_dict['top_target_top_dims_contribution'] = old_div(
            top_target_top_dims_contribution * 100.0,
            sum(top_target_contributions))
        data_dict['top_target_bottom_dim'] = top_target_bottom_dim
        data_dict[
            'top_target_bottom_dim_contribution'] = top_target_bottom_dim_contribution
        data_dict['best_top_target'] = levels[best_top_target_index]
        data_dict['best_top_target_count'] = top_target_contributions[
            best_top_target_index]
        data_dict['best_top_target_percent'] = round(
            old_div(top_target_contributions[best_top_target_index] * 100.0,
                    sum(top_target_contributions)), 2)
        data_dict['worst_top_target'] = levels[worst_top_target_index]
        data_dict['worst_top_target_percent'] = round(
            old_div(top_target_contributions[worst_top_target_index] * 100.0,
                    sum(top_target_contributions)), 2)

        data_dict["blockSplitter"] = self._blockSplitter
        data_dict["binTargetCol"] = self._binTargetCol
        data_dict["binAnalyzedCol"] = self._binAnalyzedCol
        data_dict['highlightFlag'] = self._highlightFlag

        # print "_"*60
        # print "DATA DICT - ", data_dict
        # print "_"*60

        ###############
        #     CARD1   #
        ###############

        print("self._binTargetCol & self._binAnalyzedCol : ",
              self._binTargetCol, self._binAnalyzedCol)
        if len(data_dict['worst_second_share']) == 0:
            output = NarrativesUtils.block_splitter(
                NarrativesUtils.get_template_output(
                    self._base_dir, 'card1_binned_target_worst_second.html',
                    data_dict),
                self._blockSplitter,
                highlightFlag=self._highlightFlag)
        else:
            if (self._binTargetCol == True & self._binAnalyzedCol == False):
                print("Only Target Column is Binned, : ", self._binTargetCol)
                output = NarrativesUtils.block_splitter(
                    NarrativesUtils.get_template_output(
                        self._base_dir, 'card1_binned_target.html', data_dict),
                    self._blockSplitter,
                    highlightFlag=self._highlightFlag)
            elif (self._binTargetCol == True & self._binAnalyzedCol == True):
                print("Target Column and IV is Binned : ", self._binTargetCol,
                      self._binAnalyzedCol)
                output = NarrativesUtils.block_splitter(
                    NarrativesUtils.get_template_output(
                        self._base_dir, 'card1_binned_target_and_IV.html',
                        data_dict),
                    self._blockSplitter,
                    highlightFlag=self._highlightFlag)
            else:
                output = NarrativesUtils.block_splitter(
                    NarrativesUtils.get_template_output(
                        self._base_dir, 'card1.html', data_dict),
                    self._blockSplitter,
                    highlightFlag=self._highlightFlag)

        targetDimCard1Data = []
        targetDimcard1Heading = '<h3>Impact of ' + self._analysed_dimension + '  on ' + self._target_dimension + "</h3>"

        toggledata = ToggleData()

        targetDimTable1Data = self.generate_card1_table1()
        targetDimCard1Table1 = TableData()
        targetDimCard1Table1.set_table_type("heatMap")
        targetDimCard1Table1.set_table_data(targetDimTable1Data)
        toggledata.set_toggleon_data({
            "data": {
                "tableData": targetDimTable1Data,
                "tableType": "heatMap"
            },
            "dataType": "table"
        })

        targetDimTable2Data = self.generate_card1_table2()
        targetDimCard1Table2 = TableData()
        targetDimCard1Table2.set_table_type("normal")
        table2Data = targetDimTable2Data["data1"]
        table2Data = [
            innerList[1:] for innerList in table2Data
            if innerList[0].strip() != ""
        ]
        targetDimCard1Table2.set_table_data(table2Data)

        toggledata.set_toggleoff_data({
            "data": {
                "tableData": table2Data,
                "tableType": "heatMap"
            },
            "dataType": "table"
        })

        targetDimCard1Data.append(HtmlData(data=targetDimcard1Heading))
        targetDimCard1Data.append(toggledata)
        targetDimCard1Data += output

        self._card1.set_card_data(targetDimCard1Data)
        self._card1.set_card_name("{}: Relationship with {}".format(
            self._analysed_dimension, self._target_dimension))

        ###############
        #     CARD2   #
        ###############

        if self._appid == None:

            key_factors = ''
            num_key_factors = len(self._second_level_dimensions)

            if len(self._second_level_dimensions) == 5:
                key_factors = ', '.join(
                    self._second_level_dimensions[:4]
                ) + ' and ' + self._second_level_dimensions[4]
            elif len(self._second_level_dimensions) == 4:
                key_factors = ', '.join(
                    self._second_level_dimensions[:3]
                ) + ' and ' + self._second_level_dimensions[3]
            elif len(self._second_level_dimensions) == 3:
                key_factors = ', '.join(
                    self._second_level_dimensions[:2]
                ) + ' and ' + self._second_level_dimensions[2]
            elif len(self._second_level_dimensions) == 2:
                key_factors = ' and '.join(self._second_level_dimensions)
            elif len(self._second_level_dimensions) == 1:
                key_factors = self._second_level_dimensions[0]

            targetCardDataDict['num_key_factors'] = num_key_factors
            targetCardDataDict['key_factors'] = key_factors
            dict_for_test = {}
            for tupleObj in sorted_target_levels[:self._chiSquareLevelLimit]:
                targetLevel = tupleObj[1]

                targetCardDataDict['random_card2'] = random.randint(1, 100)
                targetCardDataDict['random_card4'] = random.randint(1, 100)

                second_target_contributions = [
                    table.get_value(targetLevel, i) for i in levels
                ]
                sum_second_target = sum(second_target_contributions)

                sorted_levels = sorted(zip(second_target_contributions,
                                           levels),
                                       reverse=True)

                level_differences = [0.0] + [
                    sorted_levels[i][0] - sorted_levels[i + 1][0]
                    for i in range(len(sorted_levels) - 1)
                ]
                level_diff_index = level_differences.index(
                    max(level_differences)) if level_differences.index(
                        max(level_differences)) > 0 else len(
                            level_differences
                        )  ##added for pipeline keyerror issue
                second_target_top_dims = [
                    j for i, j in sorted_levels[:level_diff_index]
                ]
                second_target_top_dims_contribution = sum([
                    i for i, j in sorted_levels[:level_differences.
                                                index(max(level_differences))]
                ])
                second_target_bottom_dim = sorted_levels[-1][1]
                second_target_bottom_dim_contribution = sorted_levels[-1][0]

                second_target_percentages = [
                    old_div(i * 100.0, sum_second_target)
                    for i in second_target_contributions
                ]
                best_second_target_index = second_target_contributions.index(
                    max(second_target_contributions))
                worst_second_target_index = second_target_contributions.index(
                    min(second_target_contributions))
                second_target_differences = [
                    x - y for x, y in zip(levels_percentages,
                                          second_target_percentages)
                ]
                if len(second_target_differences) > 6:
                    tops = 2
                    bottoms = -2
                elif len(second_target_differences) > 4:
                    tops = 2
                    bottoms = -1
                else:
                    tops = 1
                    bottoms = -1
                sorted_ = sorted(enumerate(second_target_differences),
                                 key=lambda x: x[1],
                                 reverse=True)
                best_second_difference_indices = [x for x, y in sorted_[:tops]]
                worst_second_difference_indices = [
                    x for x, y in sorted_[bottoms:]
                ]

                second_target_shares = [
                    old_div(x * 100.0, y)
                    for x, y in zip(second_target_contributions, level_counts)
                ]
                max_second_target_shares = max(second_target_shares)
                best_second_target_share_index = [
                    idx for idx, val in enumerate(second_target_shares)
                    if val == max_second_target_shares
                ]
                level_counts_threshold = old_div(
                    sum(level_counts) * 0.05, len(level_counts))
                min_second_target_shares = min([
                    x for x, y in zip(second_target_shares, level_counts)
                    if y >= level_counts_threshold
                ])
                worst_second_target_share_index = [
                    idx for idx, val in enumerate(second_target_shares)
                    if val == min_second_target_shares
                ]
                overall_second_percentage = old_div(sum_second_target * 100.0,
                                                    total)

                # DataFrame for contribution calculation
                if self._pandas_flag:
                    df_second_target = self._data_frame[(
                        self._data_frame[self._target_dimension] == targetLevel
                    ) & (self._data_frame[self._analysed_dimension] ==
                         second_target_top_dims[0])][
                             self._second_level_dimensions]
                    df_second_dim = self._data_frame[(
                        self._data_frame[self._analysed_dimension] ==
                        second_target_top_dims[0]
                    )][self._second_level_dimensions]
                else:
                    df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                                            filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                                            select(self._second_level_dimensions).toPandas()
                    df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                                        select(self._second_level_dimensions).toPandas()

                # if self._chisquare_result.get_splits():
                #     splits = self._chisquare_result.get_splits()
                #     idx = self._chiSquareTable.get_bin_names(splits).index(second_target_top_dims[0])
                #     idx1 = self._chiSquareTable.get_bin_names(splits).index(top_target_top_dims[0])
                #     splits[len(splits)-1] = splits[len(splits)-1]+1
                #     df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                #                         filter(col(self._analysed_dimension)>=splits[idx]).filter(col(self._analysed_dimension)<splits[idx+1]).\
                #                         select(self._second_level_dimensions).toPandas()
                #     df_second_dim = self._data_frame.filter(col(self._analysed_dimension)>=splits[idx]).\
                #                     filter(col(self._analysed_dimension)<splits[idx+1]).\
                #                     select(self._second_level_dimensions).toPandas()
                # else:
                #     df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                #                         filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                #                         select(self._second_level_dimensions).toPandas()
                #     df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                #                     select(self._second_level_dimensions).toPandas()

                # print self._data_frame.select('Sales').show()

                distribution_second = []
                d_l = []
                for d in self._second_level_dimensions:
                    grouped = df_second_target.groupby(d).agg({d: 'count'})
                    contributions = df_second_dim.groupby(d).agg({d: 'count'})
                    contribution_index = list(contributions.index)
                    contributions_val = contributions[d].tolist()
                    contributions_list = dict(
                        list(zip(contribution_index, contributions_val)))
                    index_list = list(grouped.index)
                    grouped_list = grouped[d].tolist()
                    contributions_percent_list = [
                        round(old_div(y * 100.0, contributions_list[x]), 2)
                        for x, y in zip(index_list, grouped_list)
                    ]
                    sum_ = grouped[d].sum()
                    diffs = [0] + [
                        grouped_list[i] - grouped_list[i + 1]
                        for i in range(len(grouped_list) - 1)
                    ]
                    max_diff = diffs.index(max(diffs))
                    grouped_dict = dict(list(zip(index_list, grouped_list)))

                    for val in contribution_index:
                        if val not in list(grouped_dict.keys()):
                            grouped_dict[val] = 0
                        else:
                            pass

                    index_list = []
                    grouped_list = []
                    contributions_val = []

                    for key in list(grouped_dict.keys()):
                        index_list.append(str(key))
                        grouped_list.append(grouped_dict[key])
                        contributions_val.append(contributions_list[key])
                    '''
                    print "="*70
                    print "GROUPED - ", grouped
                    print "INDEX LIST - ", index_list
                    print "GROUPED LIST - ", grouped_list
                    print "GROUPED DICT - ", grouped_dict
                    print "CONTRIBUTIONS - ", contributions
                    print "CONTRIBUTION INDEX - ", contribution_index
                    print "CONTRIBUTIONS VAL - ", contributions_val
                    print "CONTRIBUTIONS LIST - ", contributions_list
                    print "CONTRIBUTIONS PERCENT LIST - ", contributions_percent_list
                    print "SUM - ", sum_
                    print "DIFFS - ", diffs
                    print "MAX DIFF - ", max_diff
                    print "="*70
                    '''

                    informative_dict = {
                        "levels": index_list,
                        "positive_class_contribution": grouped_list,
                        "positive_plus_others": contributions_val
                    }

                    informative_df = pd.DataFrame(informative_dict)
                    informative_df["percentage_horizontal"] = old_div(
                        informative_df["positive_class_contribution"] * 100,
                        informative_df["positive_plus_others"])
                    informative_df["percentage_vertical"] = old_div(
                        informative_df["positive_class_contribution"] * 100,
                        sum_)
                    informative_df.sort_values(["percentage_vertical"],
                                               inplace=True,
                                               ascending=False)
                    informative_df = informative_df.reset_index(drop=True)

                    percentage_vertical_sorted = list(
                        informative_df["percentage_vertical"])
                    percentage_horizontal_sorted = list(
                        informative_df["percentage_horizontal"])
                    levels_sorted = list(informative_df["levels"])

                    differences_list = []
                    for i in range(1, len(percentage_vertical_sorted)):
                        difference = percentage_vertical_sorted[
                            i - 1] - percentage_vertical_sorted[i]
                        differences_list.append(round(difference, 2))
                    '''
                    print "-"*70
                    print "DIFFERENCES LIST - ", differences_list
                    print "-"*70
                    '''

                    index_txt = ''
                    if differences_list:
                        if differences_list[0] >= 30:
                            print("showing 1st case")
                            index_txt = levels_sorted[0]
                            max_diff_equivalent = 1
                        else:
                            if len(differences_list) >= 2:
                                if differences_list[1] >= 10:
                                    print("showing 1st and 2nd case")
                                    index_txt = levels_sorted[0] + '(' + str(
                                        round(percentage_vertical_sorted[0], 1)
                                    ) + '%)' + ' and ' + levels_sorted[
                                        1] + '(' + str(
                                            round(
                                                percentage_vertical_sorted[1],
                                                1)) + '%)'
                                    max_diff_equivalent = 2
                                else:
                                    print("showing 3rd case")
                                    index_txt = 'including ' + levels_sorted[
                                        0] + '(' + str(
                                            round(
                                                percentage_vertical_sorted[0],
                                                1)
                                        ) + '%)' + ' and ' + levels_sorted[
                                            1] + '(' + str(
                                                round(
                                                    percentage_vertical_sorted[
                                                        1], 1)) + '%)'
                                    max_diff_equivalent = 3
                            else:
                                print("showing 3rd case")
                                index_txt = 'including ' + levels_sorted[
                                    0] + '(' + str(
                                        round(percentage_vertical_sorted[0], 1)
                                    ) + '%)' + ' and ' + levels_sorted[
                                        1] + '(' + str(
                                            round(
                                                percentage_vertical_sorted[1],
                                                1)) + '%)'
                                max_diff_equivalent = 3

                    else:
                        max_diff_equivalent = 0
                    '''
                    print "-"*70
                    print informative_df.head(25)
                    print "-"*70
                    '''

                    distribution_second.append({
                        'contributions': [
                            round(i, 2) for i in
                            percentage_vertical_sorted[:max_diff_equivalent]
                        ],
                        'levels':
                        levels_sorted[:max_diff_equivalent],
                        'variation':
                        random.randint(1, 100),
                        'index_txt':
                        index_txt,
                        'd':
                        d,
                        'contributions_percent':
                        percentage_horizontal_sorted
                    })
                '''
                  print "DISTRIBUTION SECOND - ", distribution_second
                  print "<>"*50
                  '''
                targetCardDataDict['distribution_second'] = distribution_second
                targetCardDataDict['second_target'] = targetLevel
                targetCardDataDict[
                    'second_target_top_dims'] = second_target_top_dims
                targetCardDataDict[
                    'second_target_top_dims_contribution'] = old_div(
                        second_target_top_dims_contribution * 100.0,
                        sum(second_target_contributions))
                targetCardDataDict[
                    'second_target_bottom_dim'] = second_target_bottom_dim
                targetCardDataDict[
                    'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution
                targetCardDataDict['best_second_target'] = levels[
                    best_second_target_index]
                targetCardDataDict[
                    'best_second_target_count'] = second_target_contributions[
                        best_second_target_index]
                targetCardDataDict['best_second_target_percent'] = round(
                    old_div(
                        second_target_contributions[best_second_target_index] *
                        100.0, sum(second_target_contributions)), 2)
                targetCardDataDict['worst_second_target'] = levels[
                    worst_second_target_index]
                targetCardDataDict['worst_second_target_percent'] = round(
                    old_div(
                        second_target_contributions[worst_second_target_index]
                        * 100.0, sum(second_target_contributions)), 2)

                card2Data = []
                targetLevelContributions = [
                    table.get_value(targetLevel, i) for i in levels
                ]
                impact_target_thershold = old_div(
                    sum(targetLevelContributions) * 0.02,
                    len(targetLevelContributions))
                card2Heading = '<h3>Key Drivers of ' + self._target_dimension + ' (' + targetLevel + ')' + "</h3>"
                chart, bubble = self.generate_distribution_card_chart(
                    targetLevel, targetLevelContributions, levels,
                    level_counts, total, impact_target_thershold)
                card2ChartData = NormalChartData(data=chart["data"])
                "rounding the chartdata values for key drivers tab inside table percentage(table data)"
                for d in card2ChartData.get_data():
                    d['percentage'] = round(d['percentage'], 2)
                    d_l.append(d)
                card2ChartJson = ChartJson()
                card2ChartJson.set_data(d_l)
                card2ChartJson.set_chart_type("combination")
                card2ChartJson.set_types({
                    "total": "bar",
                    "percentage": "line"
                })
                card2ChartJson.set_legend({
                    "total": "# of " + targetLevel,
                    "percentage": "% of " + targetLevel
                })
                card2ChartJson.set_axes({
                    "x": "key",
                    "y": "total",
                    "y2": "percentage"
                })
                card2ChartJson.set_label_text({
                    "x": " ",
                    "y": "Count",
                    "y2": "Percentage"
                })
                print("self._binTargetCol & self._binAnalyzedCol : ",
                      self._binTargetCol, self._binAnalyzedCol)
                if (self._binTargetCol == True & self._binAnalyzedCol ==
                        False):
                    print("Only Target Column is Binned")
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2_binned_target.html',
                            targetCardDataDict), self._blockSplitter)
                elif (self._binTargetCol == True & self._binAnalyzedCol ==
                      True):
                    print("Target Column and IV is Binned")
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2_binned_target_and_IV.html',
                            targetCardDataDict), self._blockSplitter)
                else:
                    print("In Else, self._binTargetCol should be False : ",
                          self._binTargetCol)
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2.html', targetCardDataDict),
                        self._blockSplitter)

                card2Data.append(HtmlData(data=card2Heading))
                statistical_info_array = [
                    ("Test Type", "Chi-Square"),
                    ("Chi-Square statistic",
                     str(round(self._chisquare_result.get_stat(), 3))),
                    ("P-Value",
                     str(round(self._chisquare_result.get_pvalue(), 3))),
                    ("Inference",
                     "Chi-squared analysis shows a significant association between {} (target) and {}."
                     .format(self._target_dimension, self._analysed_dimension))
                ]
                statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                    statistical_info_array)

                card2Data.append(
                    C3ChartData(data=card2ChartJson,
                                info=statistical_info_array))
                card2Data += output2
                card2BubbleData = "<div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div><div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div>".format(
                    bubble[0]["value"], bubble[0]["text"], bubble[1]["value"],
                    bubble[1]["text"])
                card2Data.append(HtmlData(data=card2BubbleData))
                targetCard = NormalCard()
                targetCard.set_card_data(card2Data)
                targetCard.set_card_name("{} : Distribution of {}".format(
                    self._analysed_dimension, targetLevel))
                self._targetCards.append(targetCard)
                dict_for_test[targetLevel] = targetCardDataDict
        out = {'data_dict': data_dict, 'target_dict': dict_for_test}

        return out
Beispiel #21
0
    def _generate_narratives(self):
        chisquare_result = self._chisquare_result
        target_dimension = self._target_dimension
        analysed_dimension = self._analysed_dimension
        significant_variables = self._significant_variables
        num_analysed_variables = self._num_analysed_variables
        table = self._chiSquareTable
        total = self._chiSquareTable.get_total()

        levels = self._chiSquareTable.get_column_two_levels()
        level_counts = self._chiSquareTable.get_column_total()
        levels_count_sum = sum(level_counts)
        levels_percentages = [
            i * 100.0 / levels_count_sum for i in level_counts
        ]
        sorted_levels = sorted(zip(level_counts, levels), reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]

        top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        bottom_dim = sorted_levels[-1][1]
        bottom_dim_contribution = sorted_levels[-1][0]
        bottom_dims = [
            y for x, y in sorted_levels if x == bottom_dim_contribution
        ]

        target_levels = self._chiSquareTable.get_column_one_levels()

        target_counts = self._chiSquareTable.get_row_total()
        sorted_target_levels = sorted(zip(target_counts, target_levels),
                                      reverse=True)

        top_target_count, top_target = sorted_target_levels[0]
        second_target_count, second_target = sorted_target_levels[1]

        top_target_contributions = [
            table.get_value(top_target, i) for i in levels
        ]
        sum_top_target = sum(top_target_contributions)

        sorted_levels = sorted(zip(top_target_contributions, levels),
                               reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]
        top_target_top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        top_target_top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        top_target_bottom_dim = sorted_levels[-1][1]
        top_target_bottom_dim_contribution = sorted_levels[-1][0]

        top_target_percentages = [
            i * 100.0 / sum_top_target for i in top_target_contributions
        ]
        best_top_target_index = top_target_contributions.index(
            max(top_target_contributions))
        worst_top_target_index = top_target_contributions.index(
            min(top_target_contributions))
        top_target_differences = [
            x - y for x, y in zip(levels_percentages, top_target_percentages)
        ]
        if len(top_target_differences) > 6:
            tops = 2
            bottoms = -2
        elif len(top_target_differences) > 4:
            tops = 2
            bottoms = -1
        else:
            tops = 1
            bottoms = -1
        sorted_ = sorted(enumerate(top_target_differences),
                         key=lambda x: x[1],
                         reverse=True)
        best_top_difference_indices = [x for x, y in sorted_[:tops]]
        worst_top_difference_indices = [x for x, y in sorted_[bottoms:]]

        top_target_shares = [
            x * 100.0 / y
            for x, y in zip(top_target_contributions, level_counts)
        ]
        max_top_target_shares = max(top_target_shares)
        best_top_target_share_index = [
            idx for idx, val in enumerate(top_target_shares)
            if val == max_top_target_shares
        ]
        level_counts_threshold = sum(level_counts) * 0.05 / len(level_counts)
        min_top_target_shares = min([
            x for x, y in zip(top_target_shares, level_counts)
            if y >= level_counts_threshold
        ])
        worst_top_target_share_index = [
            idx for idx, val in enumerate(top_target_shares)
            if val == min_top_target_shares
        ]
        overall_top_percentage = sum_top_target * 100.0 / total

        second_target_contributions = [
            table.get_value(second_target, i) for i in levels
        ]
        sum_second_target = sum(second_target_contributions)

        sorted_levels = sorted(zip(second_target_contributions, levels),
                               reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]
        second_target_top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        second_target_top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        second_target_bottom_dim = sorted_levels[-1][1]
        second_target_bottom_dim_contribution = sorted_levels[-1][0]

        second_target_percentages = [
            i * 100.0 / sum_second_target for i in second_target_contributions
        ]
        best_second_target_index = second_target_contributions.index(
            max(second_target_contributions))
        worst_second_target_index = second_target_contributions.index(
            min(second_target_contributions))
        second_target_differences = [
            x - y
            for x, y in zip(levels_percentages, second_target_percentages)
        ]
        if len(second_target_differences) > 6:
            tops = 2
            bottoms = -2
        elif len(second_target_differences) > 4:
            tops = 2
            bottoms = -1
        else:
            tops = 1
            bottoms = -1
        sorted_ = sorted(enumerate(second_target_differences),
                         key=lambda x: x[1],
                         reverse=True)
        best_second_difference_indices = [x for x, y in sorted_[:tops]]
        worst_second_difference_indices = [x for x, y in sorted_[bottoms:]]

        second_target_shares = [
            x * 100.0 / y
            for x, y in zip(second_target_contributions, level_counts)
        ]
        max_second_target_shares = max(second_target_shares)
        best_second_target_share_index = [
            idx for idx, val in enumerate(second_target_shares)
            if val == max_second_target_shares
        ]
        level_counts_threshold = sum(level_counts) * 0.05 / len(level_counts)
        min_second_target_shares = min([
            x for x, y in zip(second_target_shares, level_counts)
            if y >= level_counts_threshold
        ])
        # worst_second_target_share_index = second_target_shares.index(min_second_target_shares)
        worst_second_target_share_index = [
            idx for idx, val in enumerate(second_target_shares)
            if val == min_second_target_shares
        ]
        overall_second_percentage = sum_second_target * 100.0 / total

        targetCardDataDict = {}
        targetCardDataDict['target'] = target_dimension
        targetCardDataDict['colname'] = analysed_dimension
        targetCardDataDict['num_significant'] = len(significant_variables)
        targetCardDataDict['plural_colname'] = NarrativesUtils.pluralize(
            analysed_dimension)

        targetCardDataDict["blockSplitter"] = self._blockSplitter
        targetCardDataDict["binTargetCol"] = self._binTargetCol
        targetCardDataDict["binAnalyzedCol"] = self._binAnalyzedCol
        targetCardDataDict['highlightFlag'] = self._highlightFlag
        targetCardDataDict['levels'] = levels

        data_dict = {}
        data_dict[
            'best_second_difference'] = best_second_difference_indices  ##these changed
        data_dict['worst_second_difference'] = worst_second_difference_indices
        data_dict['best_top_difference'] = best_top_difference_indices
        data_dict['worst_top_difference'] = worst_top_difference_indices
        data_dict['levels_percentages'] = levels_percentages
        data_dict['top_target_percentages'] = top_target_percentages
        data_dict['second_target_percentages'] = second_target_percentages
        data_dict['levels'] = levels
        data_dict['best_top_share'] = best_top_target_share_index
        data_dict['worst_top_share'] = worst_top_target_share_index
        data_dict['best_second_share'] = best_second_target_share_index
        data_dict['worst_second_share'] = worst_second_target_share_index
        data_dict['top_target_shares'] = top_target_shares
        data_dict['second_target_shares'] = second_target_shares
        data_dict['overall_second'] = overall_second_percentage
        data_dict['overall_top'] = overall_top_percentage

        data_dict['num_significant'] = len(significant_variables)
        data_dict['colname'] = analysed_dimension
        data_dict['plural_colname'] = NarrativesUtils.pluralize(
            analysed_dimension)
        data_dict['target'] = target_dimension
        data_dict['top_levels'] = top_dims
        data_dict['top_levels_percent'] = round(
            top_dims_contribution * 100.0 / total, 1)
        data_dict['bottom_level'] = bottom_dim
        data_dict['bottom_levels'] = bottom_dims
        data_dict['bottom_level_percent'] = round(
            bottom_dim_contribution * 100 / sum(level_counts), 2)
        data_dict['second_target'] = second_target
        data_dict['second_target_top_dims'] = second_target_top_dims
        data_dict[
            'second_target_top_dims_contribution'] = second_target_top_dims_contribution * 100.0 / sum(
                second_target_contributions)
        data_dict['second_target_bottom_dim'] = second_target_bottom_dim
        data_dict[
            'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution
        data_dict['best_second_target'] = levels[best_second_target_index]
        data_dict['best_second_target_count'] = second_target_contributions[
            best_second_target_index]
        data_dict['best_second_target_percent'] = round(
            second_target_contributions[best_second_target_index] * 100.0 /
            sum(second_target_contributions), 2)
        data_dict['worst_second_target'] = levels[worst_second_target_index]
        data_dict['worst_second_target_percent'] = round(
            second_target_contributions[worst_second_target_index] * 100.0 /
            sum(second_target_contributions), 2)

        data_dict['top_target'] = top_target
        data_dict['top_target_top_dims'] = top_target_top_dims
        data_dict[
            'top_target_top_dims_contribution'] = top_target_top_dims_contribution * 100.0 / sum(
                top_target_contributions)
        data_dict['top_target_bottom_dim'] = top_target_bottom_dim
        data_dict[
            'top_target_bottom_dim_contribution'] = top_target_bottom_dim_contribution
        data_dict['best_top_target'] = levels[best_top_target_index]
        data_dict['best_top_target_count'] = top_target_contributions[
            best_top_target_index]
        data_dict['best_top_target_percent'] = round(
            top_target_contributions[best_top_target_index] * 100.0 /
            sum(top_target_contributions), 2)
        data_dict['worst_top_target'] = levels[worst_top_target_index]
        data_dict['worst_top_target_percent'] = round(
            top_target_contributions[worst_top_target_index] * 100.0 /
            sum(top_target_contributions), 2)

        data_dict["blockSplitter"] = self._blockSplitter
        data_dict["binTargetCol"] = self._binTargetCol
        data_dict["binAnalyzedCol"] = self._binAnalyzedCol
        data_dict['highlightFlag'] = self._highlightFlag

        ###############
        #     CARD1   #
        ###############

        print "self._binTargetCol & self._binAnalyzedCol : ", self._binTargetCol, self._binAnalyzedCol
        if (self._binTargetCol == True & self._binAnalyzedCol == False):
            print "Only Target Column is Binned, : ", self._binTargetCol
            output = NarrativesUtils.block_splitter(
                NarrativesUtils.get_template_output(
                    self._base_dir, 'card1_binned_target.html', data_dict),
                self._blockSplitter,
                highlightFlag=self._highlightFlag)
        elif (self._binTargetCol == True & self._binAnalyzedCol == True):
            print "Target Column and IV is Binned : ", self._binTargetCol, self._binAnalyzedCol
            output = NarrativesUtils.block_splitter(
                NarrativesUtils.get_template_output(
                    self._base_dir, 'card1_binned_target_and_IV.html',
                    data_dict),
                self._blockSplitter,
                highlightFlag=self._highlightFlag)
        else:
            output = NarrativesUtils.block_splitter(
                NarrativesUtils.get_template_output(self._base_dir,
                                                    'card1.html', data_dict),
                self._blockSplitter,
                highlightFlag=self._highlightFlag)

        targetDimCard1Data = []
        targetDimcard1Heading = '<h3>Relationship between ' + self._target_dimension + '  and ' + self._analysed_dimension + "</h3>"

        toggledata = ToggleData()

        targetDimTable1Data = self.generate_card1_table1()
        targetDimCard1Table1 = TableData()
        targetDimCard1Table1.set_table_type("heatMap")
        targetDimCard1Table1.set_table_data(targetDimTable1Data)
        toggledata.set_toggleon_data({
            "data": {
                "tableData": targetDimTable1Data,
                "tableType": "heatMap"
            },
            "dataType": "table"
        })

        targetDimTable2Data = self.generate_card1_table2()
        targetDimCard1Table2 = TableData()
        targetDimCard1Table2.set_table_type("normal")
        table2Data = targetDimTable2Data["data1"]
        table2Data = [
            innerList[1:] for innerList in table2Data
            if innerList[0].strip() != ""
        ]
        targetDimCard1Table2.set_table_data(table2Data)

        toggledata.set_toggleoff_data({
            "data": {
                "tableData": table2Data,
                "tableType": "heatMap"
            },
            "dataType": "table"
        })

        targetDimCard1Data.append(HtmlData(data=targetDimcard1Heading))
        targetDimCard1Data.append(toggledata)
        targetDimCard1Data += output

        self._card1.set_card_data(targetDimCard1Data)
        self._card1.set_card_name("{}: Relationship with {}".format(
            self._analysed_dimension, self._target_dimension))

        ###############
        #     CARD2   #
        ###############

        if self._appid == None:

            key_factors = ''
            num_key_factors = len(self._second_level_dimensions)

            if len(self._second_level_dimensions) == 5:
                key_factors = ', '.join(
                    self._second_level_dimensions[:4]
                ) + ' and ' + self._second_level_dimensions[4]
            elif len(self._second_level_dimensions) == 4:
                key_factors = ', '.join(
                    self._second_level_dimensions[:3]
                ) + ' and ' + self._second_level_dimensions[3]
            elif len(self._second_level_dimensions) == 3:
                key_factors = ', '.join(
                    self._second_level_dimensions[:2]
                ) + ' and ' + self._second_level_dimensions[2]
            elif len(self._second_level_dimensions) == 2:
                key_factors = ' and '.join(self._second_level_dimensions)
            elif len(self._second_level_dimensions) == 1:
                key_factors = self._second_level_dimensions[0]

            targetCardDataDict['num_key_factors'] = num_key_factors
            targetCardDataDict['key_factors'] = key_factors
            dict_for_test = {}
            for tupleObj in sorted_target_levels[:self._chiSquareLevelLimit]:
                targetLevel = tupleObj[1]

                targetCardDataDict['random_card2'] = random.randint(1, 100)
                targetCardDataDict['random_card4'] = random.randint(1, 100)

                second_target_contributions = [
                    table.get_value(targetLevel, i) for i in levels
                ]
                sum_second_target = sum(second_target_contributions)

                sorted_levels = sorted(zip(second_target_contributions,
                                           levels),
                                       reverse=True)

                level_differences = [0.0] + [
                    sorted_levels[i][0] - sorted_levels[i + 1][0]
                    for i in range(len(sorted_levels) - 1)
                ]
                second_target_top_dims = [
                    j for i, j in sorted_levels[:level_differences.
                                                index(max(level_differences))]
                ]
                second_target_top_dims_contribution = sum([
                    i for i, j in sorted_levels[:level_differences.
                                                index(max(level_differences))]
                ])
                second_target_bottom_dim = sorted_levels[-1][1]
                second_target_bottom_dim_contribution = sorted_levels[-1][0]

                second_target_percentages = [
                    i * 100.0 / sum_second_target
                    for i in second_target_contributions
                ]
                best_second_target_index = second_target_contributions.index(
                    max(second_target_contributions))
                worst_second_target_index = second_target_contributions.index(
                    min(second_target_contributions))
                second_target_differences = [
                    x - y for x, y in zip(levels_percentages,
                                          second_target_percentages)
                ]
                if len(second_target_differences) > 6:
                    tops = 2
                    bottoms = -2
                elif len(second_target_differences) > 4:
                    tops = 2
                    bottoms = -1
                else:
                    tops = 1
                    bottoms = -1
                sorted_ = sorted(enumerate(second_target_differences),
                                 key=lambda x: x[1],
                                 reverse=True)
                best_second_difference_indices = [x for x, y in sorted_[:tops]]
                worst_second_difference_indices = [
                    x for x, y in sorted_[bottoms:]
                ]

                second_target_shares = [
                    x * 100.0 / y
                    for x, y in zip(second_target_contributions, level_counts)
                ]
                max_second_target_shares = max(second_target_shares)
                best_second_target_share_index = [
                    idx for idx, val in enumerate(second_target_shares)
                    if val == max_second_target_shares
                ]
                level_counts_threshold = sum(level_counts) * 0.05 / len(
                    level_counts)
                min_second_target_shares = min([
                    x for x, y in zip(second_target_shares, level_counts)
                    if y >= level_counts_threshold
                ])
                worst_second_target_share_index = [
                    idx for idx, val in enumerate(second_target_shares)
                    if val == min_second_target_shares
                ]
                overall_second_percentage = sum_second_target * 100.0 / total

                # DataFrame for contribution calculation

                df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                                        filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                                        select(self._second_level_dimensions).toPandas()
                df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                                    select(self._second_level_dimensions).toPandas()

                # if self._chisquare_result.get_splits():
                #     splits = self._chisquare_result.get_splits()
                #     idx = self._chiSquareTable.get_bin_names(splits).index(second_target_top_dims[0])
                #     idx1 = self._chiSquareTable.get_bin_names(splits).index(top_target_top_dims[0])
                #     splits[len(splits)-1] = splits[len(splits)-1]+1
                #     df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                #                         filter(col(self._analysed_dimension)>=splits[idx]).filter(col(self._analysed_dimension)<splits[idx+1]).\
                #                         select(self._second_level_dimensions).toPandas()
                #     df_second_dim = self._data_frame.filter(col(self._analysed_dimension)>=splits[idx]).\
                #                     filter(col(self._analysed_dimension)<splits[idx+1]).\
                #                     select(self._second_level_dimensions).toPandas()
                # else:
                #     df_second_target = self._data_frame.filter(col(self._target_dimension)==targetLevel).\
                #                         filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                #                         select(self._second_level_dimensions).toPandas()
                #     df_second_dim = self._data_frame.filter(col(self._analysed_dimension)==second_target_top_dims[0]).\
                #                     select(self._second_level_dimensions).toPandas()

                # print self._data_frame.select('Sales').show()

                distribution_second = []
                for d in self._second_level_dimensions:

                    grouped = df_second_target.groupby(d).agg({
                        d: 'count'
                    }).sort_values(d, ascending=False)
                    contributions = df_second_dim.groupby(d).agg({d: 'count'})
                    contribution_index = list(contributions.index)
                    contributions_val = contributions[d].tolist()
                    contributions_list = dict(
                        zip(contribution_index, contributions_val))
                    index_list = list(grouped.index)
                    grouped_list = grouped[d].tolist()
                    contributions_percent_list = [
                        round(y * 100.0 / contributions_list[x], 2)
                        for x, y in zip(index_list, grouped_list)
                    ]
                    sum_ = grouped[d].sum()
                    diffs = [0] + [
                        grouped_list[i] - grouped_list[i + 1]
                        for i in range(len(grouped_list) - 1)
                    ]
                    max_diff = diffs.index(max(diffs))

                    index_txt = ''
                    if max_diff == 1:
                        index_txt = index_list[0]
                    elif max_diff == 2:
                        index_txt = index_list[0] + '(' + str(
                            round(grouped_list[0] * 100.0 / sum_, 1)
                        ) + '%)' + ' and ' + index_list[1] + '(' + str(
                            round(grouped_list[1] * 100.0 / sum_, 1)) + '%)'
                    elif max_diff > 2:
                        index_txt = 'including ' + index_list[0] + '(' + str(
                            round(grouped_list[0] * 100.0 / sum_, 1)
                        ) + '%)' + ' and ' + index_list[1] + '(' + str(
                            round(grouped_list[1] * 100.0 / sum_, 1)) + '%)'
                    distribution_second.append({'contributions':[round(i*100.0/sum_,2) for i in grouped_list[:max_diff]],\
                                            'levels': index_list[:max_diff],'variation':random.randint(1,100),\
                                            'index_txt': index_txt, 'd':d,'contributions_percent':contributions_percent_list})

                targetCardDataDict['distribution_second'] = distribution_second
                targetCardDataDict['second_target'] = targetLevel
                targetCardDataDict[
                    'second_target_top_dims'] = second_target_top_dims
                targetCardDataDict[
                    'second_target_top_dims_contribution'] = second_target_top_dims_contribution * 100.0 / sum(
                        second_target_contributions)
                targetCardDataDict[
                    'second_target_bottom_dim'] = second_target_bottom_dim
                targetCardDataDict[
                    'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution
                targetCardDataDict['best_second_target'] = levels[
                    best_second_target_index]
                targetCardDataDict[
                    'best_second_target_count'] = second_target_contributions[
                        best_second_target_index]
                targetCardDataDict['best_second_target_percent'] = round(
                    second_target_contributions[best_second_target_index] *
                    100.0 / sum(second_target_contributions), 2)
                targetCardDataDict['worst_second_target'] = levels[
                    worst_second_target_index]
                targetCardDataDict['worst_second_target_percent'] = round(
                    second_target_contributions[worst_second_target_index] *
                    100.0 / sum(second_target_contributions), 2)

                card2Data = []
                targetLevelContributions = [
                    table.get_value(targetLevel, i) for i in levels
                ]
                card2Heading = '<h3>Distribution of ' + self._target_dimension + ' (' + targetLevel + ') across ' + self._analysed_dimension + "</h3>"
                chart, bubble = self.generate_distribution_card_chart(
                    targetLevel, targetLevelContributions, levels,
                    level_counts, total)
                card2ChartData = NormalChartData(data=chart["data"])
                card2ChartJson = ChartJson()
                card2ChartJson.set_data(card2ChartData.get_data())
                card2ChartJson.set_chart_type("combination")
                card2ChartJson.set_types({
                    "total": "bar",
                    "percentage": "line"
                })
                card2ChartJson.set_legend({
                    "total": "# of " + targetLevel,
                    "percentage": "% of " + targetLevel
                })
                card2ChartJson.set_axes({
                    "x": "key",
                    "y": "total",
                    "y2": "percentage"
                })
                card2ChartJson.set_label_text({
                    "x": " ",
                    "y": "Count",
                    "y2": "Percentage"
                })
                print "self._binTargetCol & self._binAnalyzedCol : ", self._binTargetCol, self._binAnalyzedCol
                if (self._binTargetCol == True & self._binAnalyzedCol ==
                        False):
                    print "Only Target Column is Binned"
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2_binned_target.html',
                            targetCardDataDict), self._blockSplitter)
                elif (self._binTargetCol == True & self._binAnalyzedCol ==
                      True):
                    print "Target Column and IV is Binned"
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2_binned_target_and_IV.html',
                            targetCardDataDict), self._blockSplitter)
                else:
                    print "In Else, self._binTargetCol should be False : ", self._binTargetCol
                    output2 = NarrativesUtils.block_splitter(
                        NarrativesUtils.get_template_output(
                            self._base_dir, 'card2.html', targetCardDataDict),
                        self._blockSplitter)

                card2Data.append(HtmlData(data=card2Heading))
                statistical_info_array = [
                    ("Test Type", "Chi-Square"),
                    ("Chi-Square statistic",
                     str(round(self._chisquare_result.get_stat(), 3))),
                    ("P-Value",
                     str(round(self._chisquare_result.get_pvalue(), 3))),
                    ("Inference",
                     "Chi-squared analysis shows a significant association between {} (target) and {}."
                     .format(self._target_dimension, self._analysed_dimension))
                ]
                statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                    statistical_info_array)

                card2Data.append(
                    C3ChartData(data=card2ChartJson,
                                info=statistical_info_array))
                card2Data += output2
                card2BubbleData = "<div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div><div class='col-md-6 col-xs-12'><h2 class='text-center'><span>{}</span><br /><small>{}</small></h2></div>".format(
                    bubble[0]["value"], bubble[0]["text"], bubble[1]["value"],
                    bubble[1]["text"])
                card2Data.append(HtmlData(data=card2BubbleData))
                targetCard = NormalCard()
                targetCard.set_card_data(card2Data)
                targetCard.set_card_name("{} : Distribution of {}".format(
                    self._analysed_dimension, targetLevel))
                self._targetCards.append(targetCard)
                dict_for_test[targetLevel] = targetCardDataDict
        out = {'data_dict': data_dict, 'target_dict': dict_for_test}

        return out
Beispiel #22
0
    def _generate_card2(self):
        """Build the "performance over time" content for the top dimension level.

        Compares the overall measure trend with the trend of the top level of
        ``self._dimension_column`` (as reported by the ANOVA result): their
        correlation, first-to-last growth, peak period and longest streak of
        above-average periods. The figures are rendered through the
        ``anova_template_6`` templates.

        Side effects:
            * the frames returned by the two ``get_grouped_data()`` calls get
              their columns renamed and gain ``prev`` / ``avg_diff`` columns;
            * a trend chart and the rendered paragraph are added to
              ``self.card1``;
            * the heading, chart and narrative blocks are added to
              ``self._anovaCard1``.

        Raises:
            ValueError: if ``self._dataLevel`` is neither "month" nor "day".
        """
        subset_df = self._dimension_trend_data.get_grouped_data()
        overall_df = self._overall_trend_data.get_grouped_data()
        total_measure = 'Total ' + self._measure_column_capitalized
        if len(overall_df.columns) == 3:
            overall_df.columns = ["key", total_measure, "year_month"]
        else:
            overall_df.columns = ["key", total_measure]
        top_level_name = self._measure_anova_result.get_topLevelDfAnovaResult(
            self._dimension_column).get_top_level_name()
        subset_measure = top_level_name + ' ' + self._measure_column_capitalized
        if len(subset_df.columns) == 3:
            subset_df.columns = ['key', subset_measure, "year_month"]
        else:
            subset_df.columns = ['key', subset_measure]
        inner_join = overall_df.merge(subset_df[['key', subset_measure]],
                                      how='inner',
                                      on='key')
        inner_join["key"] = inner_join["key"].apply(lambda x: str(x))
        correlation = inner_join[[total_measure, subset_measure
                                  ]].corr()[total_measure][subset_measure]

        # The column holding the time-period label depends on the data level.
        # Fail loudly here instead of hitting an unbound local further down.
        if self._dataLevel == "month":
            time_column = 'year_month'
        elif self._dataLevel == "day":
            time_column = 'key'
        else:
            raise ValueError(
                "Unsupported data level for trend card: {!r}".format(
                    self._dataLevel))

        time_periods = list(inner_join[time_column])
        total_values = list(inner_join[total_measure])
        subset_values = list(inner_join[subset_measure])
        data = {
            'Time Period': time_periods,
            total_measure: total_values,
            subset_measure: subset_values
        }
        data_c3 = [['Time Period'] + time_periods,
                   [total_measure] + total_values,
                   [subset_measure] + subset_values]
        chart1 = chart(data=data)
        chart1.add_data_c3(data_c3)
        self.card1.add_chart('trend_chart', chart1)

        # 100.0 keeps the ratio a float even when the measure is integer-typed.
        overall_increase_percent = (overall_df[total_measure].iloc[-1] * 100.0 /
                                    overall_df[total_measure].iloc[0]) - 100
        subset_increase_percent = (subset_df[subset_measure].iloc[-1] * 100.0 /
                                   subset_df[subset_measure].iloc[0]) - 100

        # Each series gets its own peak statistics; the overall increase must
        # be derived from the overall series, not from the subset series.
        (overall_peak_value, overall_peak_date,
         overall_peak_increase) = self._card2_peak_summary(
             overall_df, total_measure, time_column)
        (subset_peak_value, subset_peak_date,
         subset_peak_increase) = self._card2_peak_summary(
             subset_df, subset_measure, time_column)

        overall_df['avg_diff'] = overall_df[total_measure] - overall_df[
            total_measure].mean()
        subset_df['avg_diff'] = subset_df[subset_measure] - subset_df[
            subset_measure].mean()

        # self.streaks() is expected to return a frame with a 'u_streak' column.
        overall_df = self.streaks(overall_df, 'avg_diff')
        subset_df = self.streaks(subset_df, 'avg_diff')

        (overall_streak_length, overall_longest_streak_start_date,
         overall_longest_streak_end_date,
         overall_longest_streak_contribution) = self._card2_streak_summary(
             overall_df, total_measure, time_column)
        (subset_streak_length, subset_longest_streak_start_date,
         subset_longest_streak_end_date,
         subset_longest_streak_contribution) = self._card2_streak_summary(
             subset_df, subset_measure, time_column)

        data_dict = {
            'correlation': correlation,
            'overall_increase_percent': round(overall_increase_percent, 2),
            'subset_increase_percent': round(subset_increase_percent, 2),
            'overall_peak_value':
            NarrativesUtils.round_number(overall_peak_value, 2),
            'overall_peak_date': overall_peak_date,
            'overall_peak_increase': round(overall_peak_increase, 2),
            'overall_streak_length': overall_streak_length,
            'overall_streak_start_date': overall_longest_streak_start_date,
            'overall_streak_end_date': overall_longest_streak_end_date,
            'overall_streak_contribution':
            round(overall_longest_streak_contribution, 2),
            'subset_peak_value':
            NarrativesUtils.round_number(subset_peak_value, 2),
            'subset_peak_date': subset_peak_date,
            'subset_peak_increase': round(subset_peak_increase, 2),
            'subset_streak_length': subset_streak_length,
            'subset_streak_start_date': subset_longest_streak_start_date,
            'subset_streak_end_date': subset_longest_streak_end_date,
            'subset_streak_contribution':
            round(subset_longest_streak_contribution, 2),
            'target': self._measure_column,
            'top_dimension': top_level_name,
            'dimension': self._dimension_column,
        }

        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("data_dict - For anova_template_6 -------------------")
        print(data_dict)

        # A binned analysed column gets its own template and a heading that
        # also names the dimension column.
        if self._binAnalyzedCol == True:
            print("Binned IV")
            heading_subject = self._dimension_column + " - " + top_level_name
            template_name = 'anova_template_6_binned_IV.html'
        else:
            heading_subject = top_level_name
            template_name = 'anova_template_6.html'
        output = {}
        output[
            'header'] = "<h4>" + heading_subject + "'s " + self._measure_column + " Performance over time" + "</h4>"
        output['content'] = []
        output['content'].append(
            NarrativesUtils.get_template_output(self._base_dir, template_name,
                                                data_dict))

        lines = []
        lines += [HtmlData(data=output['header'])]
        lines += [
            C3ChartData(
                self._get_c3chart_trend(data, 'Time Period', total_measure,
                                        subset_measure))
        ]
        for cnt in output['content']:
            lines += NarrativesUtils.block_splitter(cnt, self._blockSplitter)
        self._anovaCard1.add_card_data(lines)
        self.card1.add_paragraph(dict(output))

    def _card2_peak_summary(self, frame, measure, time_column):
        """Return (peak value, peak period label, % change into the peak).

        Adds a ``prev`` column (the measure shifted by one row) to ``frame``.
        The change is 0 when the peak row has no preceding row.
        """
        # idxmax() returns the index *label* of the maximum, which is what
        # .loc expects; this is the label-based lookup .ix performed.
        peak_index = frame[measure].idxmax()
        peak_value = frame[measure].loc[peak_index]
        peak_date = frame[time_column].loc[peak_index]

        frame['prev'] = frame[measure].shift(1)
        previous_value = frame['prev'].loc[peak_index]
        if math.isnan(previous_value):
            peak_increase = 0
        else:
            peak_increase = (peak_value / previous_value) * 100 - 100
        return peak_value, peak_date, peak_increase

    def _card2_streak_summary(self, frame, measure, time_column):
        """Return (length, start label, end label, % contribution) of the longest streak.

        The streak ends at the row with the largest ``u_streak`` value and
        spans ``length`` rows backwards from there. The contribution is the
        share of the column total made up by the measure over those rows.
        """
        end_index = frame['u_streak'].idxmax()
        streak_length = int(frame['u_streak'].loc[end_index])

        # Accumulate the measure over every row of the streak: the end row
        # plus the (length - 1) rows before it, reached via shift().
        streak_total = frame[measure].loc[end_index]
        for offset in range(1, streak_length):
            streak_total = streak_total + frame[measure].shift(offset).loc[
                end_index]
        contribution = streak_total * 100.0 / frame[measure].sum()

        end_date = frame[time_column].loc[end_index]
        start_date = frame[time_column].shift(streak_length -
                                              1).loc[end_index]
        return streak_length, start_date, end_date, contribution
Beispiel #23
0
    def generate_trending_comments(self):
        """Classify dimension levels by first-to-last growth and render the narrative.

        Adds an ``increase`` column (percentage change from the ``first`` to
        the ``last`` measure value) to the grouped trend frame, then splits
        the levels into positive (> 3), negative (< -2) and stable (between
        -2 and 3 inclusive) growth groups and feeds them to
        ``anova_template_7.html``.

        NOTE(review): the rendered output is built but neither returned nor
        stored in the code visible here.
        """
        trend_frame = self._trend_result.get_grouped_data(
            self._dimension_column)
        trend_frame['increase'] = (
            trend_frame['measure']['last'] - trend_frame['measure']['first']
        ) * 100 / trend_frame['measure']['first']

        # Boolean masks for the three growth bands.
        is_growing = trend_frame['increase'] > 3
        is_shrinking = trend_frame['increase'] < -2
        is_stable = ((trend_frame['increase'] >= -2)
                     & (trend_frame['increase'] <= 3))

        positive_growth_dimensions = trend_frame['dimension'].ix[is_growing]
        negative_growth_dimensions = trend_frame['dimension'].ix[is_shrinking]
        stable_growth_dimensions = trend_frame['dimension'].ix[is_stable]
        positive_growth_values = trend_frame['increase'].ix[is_growing]
        negative_growth_values = trend_frame['increase'].ix[is_shrinking]

        # Order level names by their growth value: strongest gain first for
        # the positive group, steepest decline first for the negative group.
        ranked_gainers = sorted(zip(positive_growth_values,
                                    positive_growth_dimensions),
                                reverse=True)
        ranked_decliners = sorted(
            zip(negative_growth_values, negative_growth_dimensions))
        positive_growth_dimensions = [level for _, level in ranked_gainers]
        negative_growth_dimensions = [level for _, level in ranked_decliners]
        positive_growth_values = sorted(positive_growth_values, reverse=True)
        negative_growth_values = sorted(negative_growth_values)

        overall_growth_rate = self._trend_result.get_overall_growth_percent()

        rounded_gains = [
            NarrativesUtils.round_number(value, 2)
            for value in positive_growth_values
        ]
        rounded_declines = [
            NarrativesUtils.round_number(value, 2)
            for value in negative_growth_values
        ]
        data_dict = {
            'positive_growth_dimensions': positive_growth_dimensions,
            'negative_growth_dimensions': negative_growth_dimensions,
            'stable_growth_dimensions': stable_growth_dimensions,
            'positive_growth_values': rounded_gains,
            'negative_growth_values': rounded_declines,
            'num_positive_growth_dimensions': len(positive_growth_dimensions),
            'num_negative_growth_dimensions': len(negative_growth_dimensions),
            'num_stable_growth_dimensions': len(stable_growth_dimensions),
            'target': self._measure_column,
            'dimension': self._dimension_column,
            'overall_growth_rate':
            NarrativesUtils.round_number(overall_growth_rate),
        }

        rendered = NarrativesUtils.get_template_output(
            self._base_dir, 'anova_template_7.html', data_dict)
        output = {'header': "", 'content': [rendered]}
Beispiel #24
0
    def _generate_analysis2(self):
        """Render the two distribution narratives for the analysed dimension.

        ``self._dimension_col_freq_dict`` is parsed with ``json.loads`` and is
        read as ``parsed[colname][colname]`` (key -> level name) and
        ``parsed[colname]['count']`` (key -> frequency).

        Side effects:
            Sets ``self.count`` (largest/smallest level text with percentage
            share) and ``self.subheader``.

        Returns:
            list: the rendered output of ``dimension_distribution1.html`` and
            ``dimension_distribution2.html``, in that order.
        """
        freq_dict = json.loads(self._dimension_col_freq_dict)
        colname = self._colname
        data_dict = {"colname": colname}
        data_dict["plural_colname"] = pattern.en.pluralize(
            data_dict["colname"])

        level_names = freq_dict[colname][colname]
        count = freq_dict[colname]['count']
        # The grand total is reused by several statistics below.
        total_count = sum(count.values())

        max_key = max(count, key=count.get)
        min_key = min(count, key=count.get)
        data_dict["max"] = {"key": level_names[max_key], "val": count[max_key]}
        data_dict["min"] = {"key": level_names[min_key], "val": count[min_key]}
        data_dict["keys"] = list(level_names.values())
        data_dict["avg"] = round(total_count / float(len(count)), 2)
        data_dict["above_avg"] = [
            level_names[key] for key, freq in count.items()
            if freq > data_dict["avg"]
        ]
        data_dict["per_bigger_avg"] = round(
            data_dict["max"]["val"] / float(data_dict["avg"]), 2)
        data_dict["per_bigger_low"] = round(
            data_dict["max"]["val"] / float(data_dict["min"]["val"]), 2)

        uniq_val = list(set(count.values()))
        data_dict["n_uniq"] = len(uniq_val)
        if len(uniq_val) == 1:
            # Every level has the same frequency.
            data_dict["count"] = uniq_val[0]

        if len(data_dict["keys"]) >= 2:
            # Walk the levels from most to least frequent (never including the
            # last one) until they cover at least 75% of all observations.
            percent_75 = total_count * 0.75
            kv = sorted(count.items(),
                        key=operator.itemgetter(1),
                        reverse=True)
            kv_75 = []
            temp_sum = 0
            for k, v in kv[:-1]:
                temp_sum = temp_sum + v
                kv_75.append((level_names[k], v))
                if temp_sum >= percent_75:
                    break
            data_dict["percent_contr"] = round(
                temp_sum * 100 / float(total_count), 2)
            data_dict["kv_75"] = len(kv_75)
            data_dict["kv_75_cat"] = [k for k, v in kv_75]

        largest_text = " %s is the largest with %s observations" % (
            data_dict["max"]["key"],
            str(NarrativesUtils.round_number(data_dict["max"]["val"])))
        smallest_text = " %s is the smallest with %s observations" % (
            data_dict["min"]["key"],
            str(NarrativesUtils.round_number(data_dict["min"]["val"])))
        largest_per = NarrativesUtils.round_number(
            data_dict["max"]["val"] / float(total_count), 2) * 100
        smallest_per = NarrativesUtils.round_number(
            data_dict["min"]["val"] / float(total_count), 2) * 100
        data_dict['largest_per'] = largest_per
        self.count = {
            "largest": [largest_text,
                        str(round(largest_per, 0)) + '%'],
            "smallest": [smallest_text,
                         str(round(smallest_per, 0)) + '%']
        }
        self.subheader = "Snapshot of " + data_dict["colname"]

        output1 = NarrativesUtils.get_template_output(
            self._base_dir, 'dimension_distribution1.html', data_dict)
        output2 = NarrativesUtils.get_template_output(
            self._base_dir, 'dimension_distribution2.html', data_dict)
        return [output1, output2]
Beispiel #25
0
    def _generate_card3(self):
        """
        Build the "Performance Decision Matrix" card for the current
        dimension/measure pair.

        Side effects (in order):
          * creates ``self._anovaCard3`` and ``self.card3``;
          * adds 'increase' and 'contribution' columns to the grouped data
            frame returned by ``self._dimension_trend_data``;
          * sets ``self._contribution_limit`` and ``self._increase_limit``;
          * attaches the scatter chart and the rendered
            'anova_template_5.html' narrative to both cards;
          * pushes the red-alert levels to ``self._result_setter``.
        """
        dim_title = self._dimension_column_capitalized
        self._anovaCard3 = NormalCard(name=dim_title + '- Decision Matrix')

        matrix_title = (dim_title + '-' + self._measure_column_capitalized +
                        ' Performance Decision Matrix')
        self.card3 = Card(matrix_title)

        # The same introduction is used for the legacy card paragraph and for
        # the block-split HTML of the new card.
        intro_text = ''.join([
            'Based on the absolute ', self._measure_column,
            ' values and the overall growth rates, mAdvisor presents the decision matrix for ',
            self._measure_column, ' for ', self._dimension_column,
            ' as displayed below.'
        ])
        self.card3.add_paragraph({'header': '', 'content': intro_text})

        lines = list(
            NarrativesUtils.block_splitter(
                '<h3>' + matrix_title + '</h3><br>' + intro_text,
                self._blockSplitter))

        grouped_data_frame = self._dimension_trend_data.get_grouped_data()
        pivot_df = self._dimension_trend_data.get_level_pivot()

        # Period-over-period percentage change; the first period has no
        # predecessor and is pinned to 0.
        values = grouped_data_frame["value"]
        pct_changes = [
            round((current - previous) * 100 / float(previous), 2)
            for current, previous in zip(values.iloc[1:], values)
        ]
        grouped_data_frame['increase'] = [0] + pct_changes
        grouped_data_frame['contribution'] = grouped_data_frame[
            'value'] * 100 / float(grouped_data_frame['value'].sum())

        self._contribution_limit = grouped_data_frame['contribution'].mean()
        self._increase_limit = max(0.0, grouped_data_frame['increase'].mean())

        # Every pivot column except the bookkeeping ones is a dimension level.
        dimensionLevel = list(set(pivot_df.columns) - {"year_month", "key"})
        print(dimensionLevel)

        overall_total = np.nansum(grouped_data_frame["value"])
        level_shares = []
        level_growths = []
        for level in dimensionLevel:
            level_series = pivot_df[level]
            level_shares.append(
                float(np.nansum(level_series)) * 100 / overall_total)
            # Growth is measured between the first and last non-NaN values.
            observed = list(level_series[~np.isnan(level_series)])
            level_growths.append(
                float(observed[-1] - observed[0]) * 100 / observed[0])

        tempDf = pd.DataFrame({
            "dimension": dimensionLevel,
            "increase": level_growths,
            "contribution": level_shares
        })
        tempDf['category'] = tempDf.apply(self.get_category, axis=1)

        contribution_col = list(tempDf['contribution'])
        increase_col = list(tempDf['increase'])
        dimension_col = list(tempDf['dimension'])
        category_col = list(tempDf['category'])
        data = {
            'Share of ' + self._measure_column: contribution_col,
            self._measure_column_capitalized + ' growth': increase_col,
            self._dimension_column: dimension_col,
            'Category': category_col,
        }

        # Order the scatter points by share (ties fall through to the
        # remaining tuple members).
        ordered_rows = sorted(
            zip(contribution_col, increase_col, dimension_col, category_col))
        share = [row[0] for row in ordered_rows]
        growth = [row[1] for row in ordered_rows]
        label = [row[2] for row in ordered_rows]

        # The chart legend shows "Playing Safe" and "Opportunity Bay"
        # swapped relative to the computed category; other names pass through.
        legend_swap = {
            "Playing Safe": "Opportunity Bay",
            "Opportunity Bay": "Playing Safe",
        }
        category_legend = [
            legend_swap.get(row[3], row[3]) for row in ordered_rows
        ]

        data_c3 = [
            ['Growth'] + growth,
            ['Share'] + share,
            [self._dimension_column] + label,
            ['Category'] + category_legend,
        ]
        decisionMatrixChartJson = ChartJson(
            data=NormalChartData(data_c3).get_data(),
            chart_type='scatter_tooltip')
        decisionMatrixChartJson.set_legend(
            {"legendWillNotBeUsed": "legendWillNotBeUsed"})
        decisionMatrixChartJson.set_label_text({
            'x': 'Percentage share of ' + self._measure_column,
            'y': "Growth over time"
        })
        lines.append(C3ChartData(decisionMatrixChartJson))

        chart_data = chart(data=data, labels={})
        chart_data.add_data_c3(data_c3)
        self.card3.add_chart('decision_matrix', chart_data)

        def _levels_in(category_name):
            # Dimension levels whose (un-swapped) category matches.
            return list(
                tempDf['dimension'][tempDf['category'] == category_name])

        leaders_club = _levels_in('Leaders Club')
        playing_safe = _levels_in('Playing Safe')
        opportunity_bay = _levels_in('Opportunity Bay')
        red_alert = _levels_in('Red Alert')

        data_dict = {
            'leaders_club': leaders_club,
            'playing_safe': playing_safe,
            'opportunity_bay': opportunity_bay,
            'red_alert': red_alert,
            'num_leaders_club': len(leaders_club),
            'num_playing_safe': len(playing_safe),
            'num_opportunity_bay': len(opportunity_bay),
            'num_red_alert': len(red_alert),
            'target': self._measure_column,
            'dimension': self._dimension_column
        }
        self._result_setter.update_executive_summary_data({
            self._dimension_column: {
                "num_red_alert": len(red_alert),
                "red_alert": red_alert
            }
        })

        rendered = NarrativesUtils.get_template_output(
            self._base_dir, 'anova_template_5.html', data_dict)
        output = {'header': '', 'content': [rendered]}
        self.card3.add_paragraph(output)
        for fragment in output['content']:
            lines.extend(
                NarrativesUtils.block_splitter(fragment, self._blockSplitter))
        self._anovaCard3.set_card_data(lines)
Beispiel #26
0
    def _generate_narratives_card1(self):
        """
        Build the template context for 'card1.html' and populate ``self.card1``.

        The method reads the contingency table in ``self._table``, picks the
        two target levels with the largest row totals, and for the overall
        distribution and each of those two targets computes: the levels
        ranked above the largest drop in counts, the smallest level,
        percentage distributions, and best/worst positions.  The rendered
        template is split into paragraphs and stored in ``self.card1``
        together with a heading, an empty chart list and the table itself;
        finally ``self.generate_card1_chart()`` is called.

        Raises ValueError/IndexError when the table has fewer than two target
        levels (two-element unpacking and ``sorted_target_levels[1]`` below).
        """
        # NOTE(review): chisquare_result, num_analysed_variables, table_counts,
        # table_percent, table_percent_by_row, table_percent_by_column,
        # analysed_dimension_distribution and the two *_index names unpacked
        # below are assigned but never read again in this method.
        chisquare_result = self._chisquare_result
        target_dimension = self._target_dimension
        analysed_dimension = self._analysed_dimension
        significant_variables = self._significant_variables
        num_analysed_variables = self._num_analysed_variables
        table = self._table
        total = self._table.get_total()
        table_counts = self._table.table
        table_percent = self._table.table_percent
        # Per the original author's note: when the table is created, rows are
        # the target dimension and columns are the analysed dimension.
        table_percent_by_row = self._table.table_percent_by_row
        table_percent_by_column = self._table.table_percent_by_column
        target_distribution = self._table.get_row_total()
        analysed_dimension_distribution = self._table.get_column_total()
        # (position, row total) pairs, largest row total first.
        sorted_ = sorted(enumerate(target_distribution),
                         reverse=True,
                         key=lambda x: x[1])
        top_target_index, second_top_target_index = [x[0] for x in sorted_[:2]]

        # ---- Overall distribution of the analysed dimension ----
        levels = self._table.get_column_two_levels()
        level_counts = self._table.get_column_total()
        levels_count_sum = sum(level_counts)
        levels_percentages = [
            i * 100.0 / levels_count_sum for i in level_counts
        ]
        sorted_levels = sorted(zip(level_counts, levels), reverse=True)
        # level_differences[i] (i >= 1) is the drop in count from rank i-1 to
        # rank i; index 0 is a 0.0 placeholder.
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]
        # Levels ranked above the largest drop.  If every drop is 0 (or there
        # is a single level) the index is 0 and this list is empty.
        top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        bottom_dim = sorted_levels[-1][1]
        bottom_dim_contribution = sorted_levels[-1][0]

        # ---- Rank the target levels by row total ----
        target_levels = self._table.get_column_one_levels()
        target_counts = self._table.get_row_total()
        sorted_target_levels = sorted(zip(target_counts, target_levels),
                                      reverse=True)
        top_target_count, top_target = sorted_target_levels[0]
        second_target_count, second_target = sorted_target_levels[1]

        # ---- Top target: values across the analysed-dimension levels ----
        top_target_contributions = [
            table.get_value(top_target, i) for i in levels
        ]
        sum_top_target = sum(top_target_contributions)

        # sorted_levels / level_differences are rebound here for the top
        # target; the overall versions above are no longer available.
        sorted_levels = sorted(zip(top_target_contributions, levels),
                               reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]
        top_target_top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        top_target_top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        top_target_bottom_dim = sorted_levels[-1][1]
        top_target_bottom_dim_contribution = sorted_levels[-1][0]

        top_target_percentages = [
            i * 100.0 / sum_top_target for i in top_target_contributions
        ]
        best_top_target_index = top_target_contributions.index(
            max(top_target_contributions))
        worst_top_target_index = top_target_contributions.index(
            min(top_target_contributions))
        # Overall share of each level minus its share within the top target.
        top_target_differences = [
            x - y for x, y in zip(levels_percentages, top_target_percentages)
        ]
        # How many positions to keep from each end of the sorted differences.
        if len(top_target_differences) > 4:
            tops = 2
            bottoms = -2
        elif len(top_target_differences) == 4:
            tops = 2
            bottoms = -1
        else:
            tops = 1
            bottoms = -1
        # Sorted by difference, largest first.
        sorted_ = sorted(enumerate(top_target_differences),
                         key=lambda x: x[1],
                         reverse=True)
        best_top_difference_indices = [x for x, y in sorted_[:tops]]
        worst_top_difference_indices = [x for x, y in sorted_[bottoms:]]

        # Top target's value as a percentage of each level's column total.
        top_target_shares = [
            x * 100.0 / y
            for x, y in zip(top_target_contributions, level_counts)
        ]
        best_top_target_share_index = top_target_shares.index(
            max(top_target_shares))
        worst_top_target_share_index = top_target_shares.index(
            min(top_target_shares))
        overall_top_percentage = sum_top_target * 100.0 / total

        # ---- Second target: same computations as for the top target ----
        second_target_contributions = [
            table.get_value(second_target, i) for i in levels
        ]
        sum_second_target = sum(second_target_contributions)

        sorted_levels = sorted(zip(second_target_contributions, levels),
                               reverse=True)
        level_differences = [0.0] + [
            sorted_levels[i][0] - sorted_levels[i + 1][0]
            for i in range(len(sorted_levels) - 1)
        ]
        second_target_top_dims = [
            j for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ]
        second_target_top_dims_contribution = sum([
            i for i, j in
            sorted_levels[:level_differences.index(max(level_differences))]
        ])
        second_target_bottom_dim = sorted_levels[-1][1]
        second_target_bottom_dim_contribution = sorted_levels[-1][0]

        second_target_percentages = [
            i * 100.0 / sum_second_target for i in second_target_contributions
        ]
        best_second_target_index = second_target_contributions.index(
            max(second_target_contributions))
        worst_second_target_index = second_target_contributions.index(
            min(second_target_contributions))
        second_target_differences = [
            x - y
            for x, y in zip(levels_percentages, second_target_percentages)
        ]
        # NOTE(review): the cut-offs here (> 6 / > 4) differ from the top
        # target section (> 4 / == 4), and the sort below is ascending whereas
        # the top target sort is descending, so "best"/"worst" pick opposite
        # ends for the two targets.  Confirm whether this is intentional.
        if len(second_target_differences) > 6:
            tops = 2
            bottoms = -2
        elif len(second_target_differences) > 4:
            tops = 2
            bottoms = -1
        else:
            tops = 1
            bottoms = -1
        sorted_ = sorted(enumerate(second_target_differences),
                         key=lambda x: x[1])
        best_second_difference_indices = [x for x, y in sorted_[:tops]]
        worst_second_difference_indices = [x for x, y in sorted_[bottoms:]]

        second_target_shares = [
            x * 100.0 / y
            for x, y in zip(second_target_contributions, level_counts)
        ]
        best_second_target_share_index = second_target_shares.index(
            max(second_target_shares))
        worst_second_target_share_index = second_target_shares.index(
            min(second_target_shares))
        overall_second_percentage = sum_second_target * 100.0 / total

        # ---- Template context ----
        # The best_*/worst_* "difference" and "share" entries are positions
        # into `levels` (and the parallel percentage/share lists), not names.
        data_dict = {}
        data_dict['best_second_difference'] = best_second_difference_indices[0]
        data_dict['worst_second_difference'] = worst_second_difference_indices[
            0]
        data_dict['best_top_difference'] = best_top_difference_indices[0]
        data_dict['worst_top_difference'] = worst_top_difference_indices[0]
        data_dict['levels_percentages'] = levels_percentages
        data_dict['top_target_percentages'] = top_target_percentages
        data_dict['second_target_percentages'] = second_target_percentages
        data_dict['levels'] = levels
        data_dict['best_top_share'] = best_top_target_share_index
        data_dict['worst_top_share'] = worst_top_target_share_index
        data_dict['best_second_share'] = best_second_target_share_index
        data_dict['worst_second_share'] = worst_second_target_share_index
        data_dict['top_target_shares'] = top_target_shares
        data_dict['second_target_shares'] = second_target_shares
        data_dict['overall_second'] = overall_second_percentage
        data_dict['overall_top'] = overall_top_percentage

        data_dict['num_significant'] = len(significant_variables)
        data_dict['colname'] = analysed_dimension
        data_dict['target'] = target_dimension
        data_dict['top_levels'] = top_dims
        data_dict['top_levels_percent'] = NarrativesUtils.round_number(
            top_dims_contribution * 100.0 / total)
        data_dict['bottom_level'] = bottom_dim
        # NOTE(review): unlike 'top_levels_percent', this value is the
        # smallest column total rounded to 2 places and is not divided by
        # `total` -- confirm the template expects that.
        data_dict['bottom_level_percent'] = round(bottom_dim_contribution, 2)
        data_dict['second_target'] = second_target
        data_dict['second_target_top_dims'] = second_target_top_dims
        data_dict[
            'second_target_top_dims_contribution'] = second_target_top_dims_contribution
        data_dict['second_target_bottom_dim'] = second_target_bottom_dim
        data_dict[
            'second_target_bottom_dim_contribution'] = second_target_bottom_dim_contribution
        data_dict['best_second_target'] = levels[best_second_target_index]
        data_dict['best_second_target_count'] = second_target_contributions[
            best_second_target_index]
        data_dict['best_second_target_percent'] = round(
            second_target_contributions[best_second_target_index] * 100.0 /
            total, 2)
        data_dict['worst_second_target'] = levels[worst_second_target_index]
        data_dict['worst_second_target_percent'] = round(
            second_target_contributions[worst_second_target_index] * 100.0 /
            total, 2)

        data_dict['top_target'] = top_target
        data_dict['top_target_top_dims'] = top_target_top_dims
        data_dict[
            'top_target_top_dims_contribution'] = top_target_top_dims_contribution
        data_dict['top_target_bottom_dim'] = top_target_bottom_dim
        data_dict[
            'top_target_bottom_dim_contribution'] = top_target_bottom_dim_contribution
        data_dict['best_top_target'] = levels[best_top_target_index]
        data_dict['best_top_target_count'] = top_target_contributions[
            best_top_target_index]
        data_dict['best_top_target_percent'] = round(
            top_target_contributions[best_top_target_index] * 100.0 / total, 2)
        data_dict['worst_top_target'] = levels[worst_top_target_index]
        data_dict['worst_top_target_percent'] = round(
            top_target_contributions[worst_top_target_index] * 100.0 / total,
            2)

        # Render the narrative and store everything the card needs.
        output = NarrativesUtils.paragraph_splitter(
            NarrativesUtils.get_template_output(self._base_dir, 'card1.html',
                                                data_dict))
        self.card1[
            'heading'] = 'Relationship between ' + self._target_dimension + '  and ' + self._analysed_dimension
        self.card1['paragraphs'] = output
        self.card1['chart'] = []
        self.card1['heat_map'] = self._table
        self.generate_card1_chart()
Beispiel #27
0
    def _generate_narratives(self):
        """
        Generate the main "Key Influencers" card and the per-dimension cards.

        For every target dimension in ``self._df_chisquare_result``:
          * variables with p-value <= 0.05 are kept as significant and
            ordered by effect size, largest first;
          * the 'main_card.html' narrative, an effect-size bar chart and a
            statistical-info block are assembled into ``self.narratives``
            and a ``NormalCard``;
          * a ``ChiSquareAnalysis`` is built for each of the leading
            significant dimensions and routed either to the score cards
            (``self._appid`` '1' or '2') or to the chi-square node.

        Finally the chi-square node is attached to the story narrative and
        to the result setter.
        """
        for target_dimension in self._df_chisquare_result.keys():
            target_chisquare_result = self._df_chisquare_result[
                target_dimension]
            # All analysed variables for this target.
            analysed_variables = target_chisquare_result.keys()
            # Significant subset of the analysed variables.
            significant_variables = [
                dim for dim in target_chisquare_result.keys()
                if target_chisquare_result[dim].get_pvalue() <= 0.05
            ]
            effect_sizes = [
                target_chisquare_result[dim].get_effect_size()
                for dim in significant_variables
            ]

            effect_size_dict = dict(zip(significant_variables, effect_sizes))
            # Re-order the significant variables by effect size, descending.
            significant_variables = [
                y
                for (x, y) in sorted(zip(effect_sizes, significant_variables),
                                     reverse=True)
            ]

            num_analysed_variables = len(analysed_variables)
            num_significant_variables = len(significant_variables)

            data_dict = {
                'num_variables': num_analysed_variables,
                'num_significant_variables': num_significant_variables,
                'significant_variables': significant_variables,
                'target': target_dimension,
                'analysed_dimensions': analysed_variables,
                'blockSplitter': self._blockSplitter
            }
            # The same template/context pair feeds both the legacy
            # narratives dict and the card, so render it only once.
            main_card_narrative = NarrativesUtils.get_template_output(
                self._base_dir, 'main_card.html', data_dict)

            chart = {
                'header':
                'Strength of association between ' + target_dimension +
                ' and other dimensions',
                'data': effect_size_dict,
                'label_text': {
                    'x': 'Dimensions',
                    'y': 'Effect Size (Cramers-V)'
                }
            }
            self.narratives['main_card'] = {
                'heading':
                'Relationship between ' + target_dimension +
                ' and other factors',
                'paragraphs': [{
                    'header': '',
                    'content': main_card_narrative
                }],
                'chart': chart
            }
            self.narratives['cards'] = []

            chart_points = [{
                "key": k,
                "value": float(v)
            } for k, v in effect_size_dict.items()]
            chartDataValues = [point["value"] for point in chart_points]
            chart_data = sorted(chart_points,
                                key=lambda x: x["value"],
                                reverse=True)
            chart_json = ChartJson()
            chart_json.set_data(chart_data)
            chart_json.set_chart_type("bar")
            chart_json.set_label_text({
                'x': '  ',
                'y': 'Effect Size (Cramers-V)'
            })
            chart_json.set_axis_rotation(True)
            chart_json.set_axes({"x": "key", "y": "value"})
            chart_json.set_yaxis_number_format(
                NarrativesUtils.select_y_axis_format(chartDataValues))

            main_card = NormalCard()
            header = "<h3>Strength of association between " + target_dimension + " and other dimensions</h3>"
            main_card_data = [HtmlData(data=header)]
            main_card_data += NarrativesUtils.block_splitter(
                main_card_narrative, self._blockSplitter)

            if chart_data:
                strongest = chart_data[0]
                weakest = chart_data[-1]
                result_column = self._dataframe_context.get_result_column()
                statistical_info_array = [
                    ("Test Type", "Chi-Square"),
                    ("Effect Size", "Cramer's V"),
                    ("Max Effect Size", strongest["key"]),
                    ("Min Effect Size", weakest["key"]),
                ]
                # The sentences are built from adjacent literals: a backslash
                # continuation inside a single literal would embed the source
                # indentation in the displayed text.
                if len(chart_data) == 1:
                    statistical_inference = (
                        "{} is the only variable that have significant "
                        "association with the {} (Target) having an "
                        "Effect size of {}").format(
                            strongest["key"], result_column,
                            round(strongest["value"], 4))
                elif len(chart_data) == 2:
                    statistical_inference = (
                        "There are two variables ({} and {}) that have "
                        "significant association with the {} (Target) and "
                        "the Effect size ranges are {} and {} "
                        "respectively").format(
                            chart_data[0]["key"], chart_data[1]["key"],
                            result_column,
                            round(chart_data[0]["value"], 4),
                            round(chart_data[1]["value"], 4))
                else:
                    statistical_inference = (
                        "There are {} variables that have significant "
                        "association with the {} (Target) and the "
                        "Effect size ranges from {} to {}").format(
                            len(chart_data), result_column,
                            round(strongest["value"], 4),
                            round(weakest["value"], 4))
                statistical_info_array.append(
                    ("Inference", statistical_inference))
                statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                    statistical_info_array)
            else:
                statistical_info_array = []
            main_card_data.append(
                C3ChartData(data=chart_json, info=statistical_info_array))
            main_card.set_card_data(main_card_data)
            main_card.set_card_name("Key Influencers")

            # Kept as an equality test: the flag's type is not visible here.
            if self._storyOnScoredData != True:
                self._chiSquareNode.add_a_card(main_card)
                self._result_setter.add_a_score_chi_card(main_card)

            # Single-argument print keeps the output identical on Python 2
            # while also being valid Python 3.
            print("target_dimension %s" % (target_dimension, ))
            if self._appid == '2' and num_significant_variables > 5:
                significant_variables = significant_variables[:5]
            elif self._nColsToUse is not None:
                significant_variables = significant_variables[:self.
                                                              _nColsToUse]

            CommonUtils.create_update_and_save_progress_message(
                self._dataframe_context,
                self._scriptWeightDict,
                self._scriptStages,
                self._analysisName,
                "custom",
                "info",
                display=True,
                customMsg="Analyzing key drivers",
                weightKey="narratives")
            for analysed_dimension in significant_variables[:self.
                                                            _noOfSigDimsToShow]:
                chisquare_result = self._df_chisquare.get_chisquare_result(
                    target_dimension, analysed_dimension)
                # App ids '1' and '2' both publish the card as a score card;
                # every other app attaches it to the chi-square node.
                publish_as_score_card = self._appid in ('1', '2')
                if publish_as_score_card:
                    print("APPID %s is used" % (self._appid, ))
                card = ChiSquareAnalysis(
                    self._dataframe_context, self._dataframe_helper,
                    chisquare_result, target_dimension, analysed_dimension,
                    significant_variables, num_analysed_variables,
                    self._data_frame, self._measure_columns,
                    self._base_dir, None, target_chisquare_result)
                if publish_as_score_card:
                    self._result_setter.add_a_score_chi_card(
                        json.loads(
                            CommonUtils.convert_python_object_to_json(
                                card.get_dimension_card1())))
                else:
                    self.narratives['cards'].append(card)
                    self._chiSquareNode.add_a_node(card.get_dimension_node())
        self._story_narrative.add_a_node(self._chiSquareNode)
        self._result_setter.set_chisquare_node(self._chiSquareNode)
 def generate_summary(self, dataDict):
     """
     Render the 'trend_summary.html' template.

     Args:
         dataDict: template context passed to the renderer.

     Returns:
         The value returned by ``NarrativesUtils.get_template_output`` for
         'trend_summary.html'.
     """
     # The context must be the ``dataDict`` argument; the previous body
     # passed ``data_dict``, a name that is not defined in this method.
     return NarrativesUtils.get_template_output(
         self._base_dir, 'trend_summary.html', dataDict)
    def _generate_summary(self):
        data_dict = {}
        rules_dict = self._table
        data_dict["blockSplitter"] = self._blockSplitter
        data_dict["targetcol"] = self._colname
        groups = rules_dict.keys()
        probabilityCutoff = 75
        probabilityGroups = [{
            "probability": probabilityCutoff,
            "count": 0,
            "range": [probabilityCutoff, 100]
        }, {
            "probability": probabilityCutoff - 1,
            "count": 0,
            "range": [0, probabilityCutoff - 1]
        }]
        tableArray = [[
            "Prediction Rule", "Probability", "Prediction", "Freq", "group",
            "richRules"
        ]]
        dropdownData = []
        chartDict = {}
        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName,
            "custom",
            "info",
            "Generating Prediction rules",
            self._completionStatus,
            self._completionStatus,
            display=True)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=False)

        for idx, target in enumerate(rules_dict.keys()):
            targetToDisplayInTable = target.split(":")[0].strip()
            if idx == 0:
                dropdownData.append({
                    "displayName": target,
                    "name": targetToDisplayInTable,
                    "searchTerm": targetToDisplayInTable,
                    "selected": True,
                    "id": idx + 1
                })
            else:
                dropdownData.append({
                    "displayName": target,
                    "name": targetToDisplayInTable,
                    "searchTerm": targetToDisplayInTable,
                    "selected": False,
                    "id": idx + 1
                })
            rulesArray = rules_dict[target]
            probabilityArray = [
                round(x, 2) for x in self.success_percent[target]
            ]
            groupArray = [
                "strong" if x >= probabilityCutoff else "mixed"
                for x in probabilityArray
            ]
            for idx2, obj in enumerate(probabilityGroups):
                grpCount = len([
                    x for x in probabilityArray
                    if x >= obj["range"][0] and x <= obj["range"][1]
                ])
                obj["count"] += grpCount
                probabilityGroups[idx2] = obj
            predictionArray = [targetToDisplayInTable] * len(rulesArray)
            freqArray = self.total_predictions[target]
            chartDict[target] = sum(freqArray)
            success = self.successful_predictions[target]
            success_percent = self.success_percent[target]
            richRulesArray = []
            crudeRuleArray = []
            analysisType = self._dataframe_context.get_analysis_type()
            targetCol = self._dataframe_context.get_result_column()
            binFlag = False
            if self._dataframe_context.get_custom_analysis_details() != None:
                binnedColObj = [
                    x["colName"] for x in
                    self._dataframe_context.get_custom_analysis_details()
                ]
                if binnedColObj != None and targetCol in binnedColObj:
                    binFlag = True
            for idx2, crudeRule in enumerate(rulesArray):
                richRule, crudeRule = NarrativesUtils.generate_rules(
                    self._colname,
                    target,
                    crudeRule,
                    freqArray[idx2],
                    success[idx2],
                    success_percent[idx2],
                    analysisType,
                    binFlag=binFlag)
                richRulesArray.append(richRule)
                crudeRuleArray.append(crudeRule)
            probabilityArray = map(
                lambda x: humanize.apnumber(x) + "%"
                if x >= 10 else str(int(x)) + "%", probabilityArray)
            # targetArray = zip(rulesArray,probabilityArray,predictionArray,freqArray,groupArray)
            targetArray = zip(crudeRuleArray, probabilityArray,
                              predictionArray, freqArray, groupArray,
                              richRulesArray)
            targetArray = [list(x) for x in targetArray]
            tableArray += targetArray

        donutChartMaxLevel = 10
        if len(chartDict) > donutChartMaxLevel:
            chartDict = NarrativesUtils.restructure_donut_chart_data(
                chartDict, nLevels=donutChartMaxLevel)
        chartData = NormalChartData([chartDict]).get_data()
        chartJson = ChartJson(data=chartData)
        chartJson.set_title(self._colname)
        chartJson.set_chart_type("donut")
        mainCardChart = C3ChartData(data=chartJson)
        mainCardChart.set_width_percent(45)
        # mainCardChart = {"dataType": "c3Chart","widthPercent":33 ,"data": {"data": [chartDict],"title":self._colname,"axes":{},"label_text":{},"legend":{},"yAxisNumberFormat": ".2s","types":None,"axisRotation":False, "chart_type": "donut"}}

        dropdownDict = {
            "dataType": "dropdown",
            "label": "Showing prediction rules for",
            "data": dropdownData
        }

        data_dict["probabilityGroups"] = probabilityGroups

        maincardSummary = NarrativesUtils.get_template_output(self._base_dir,\
                                                    'decisiontreesummary.html',data_dict)
        main_card = NormalCard()
        main_card_data = []
        main_card_narrative = NarrativesUtils.block_splitter(
            maincardSummary, self._blockSplitter)
        main_card_data += main_card_narrative

        main_card_data.append(mainCardChart)
        main_card_data.append(dropdownDict)

        main_card_table = TableData()
        main_card_table.set_table_data(tableArray)
        main_card_table.set_table_type("popupDecisionTreeTable")
        main_card_data.append(main_card_table)
        main_card.set_card_data(main_card_data)
        main_card.set_card_name("Predicting Key Drivers of {}".format(
            self._colname))
        self._decisionTreeNode.add_a_card(main_card)