Code example #1
0
    def _generate_summary(self):
        """Build the dataset summary card and attach it to the story narrative.

        Splits the string columns into date-like columns (those whose
        largest value parses with a known datetime format) and plain
        dimensions, renders the counts through the
        ``descr_stats_summary.html`` template, and wraps the result in a
        ``SummaryCard`` registered on both the story narrative and the
        head node.
        """
        ignored_columns = self._dataframe_context.get_ignore_column_suggestions()
        if ignored_columns is None:
            ignored_columns = []

        metaHelperInstance = MetaDataHelper(self._data_frame, self._total_rows)
        sampleData = metaHelperInstance.get_sample_data()
        try:
            # In Spark mode the sample is a Spark DataFrame; a pandas frame
            # has no toPandas(), so only that specific failure is tolerated.
            sampleData = sampleData.toPandas()
        except AttributeError:
            pass

        date_columns = []       # string columns detected as time dimensions
        dimension_columns = []  # remaining categorical string columns
        if self._pandas_flag:
            for column in self._dataframe_helper.get_string_columns():
                uniqueVals = sampleData[column].unique().tolist()
                # NOTE: .iloc[0] (not [0]) — sort_values keeps the original
                # index, so a label-based [0] would return the *unsorted*
                # first row instead of the largest value.
                if uniqueVals and metaHelperInstance.get_datetime_format_pandas(
                        [self._data_frame[column].sort_values(ascending=False).iloc[0]]) is not None:
                    date_columns.append(column)
                else:
                    dimension_columns.append(column)
        else:
            for column in self._dataframe_helper.get_string_columns():
                uniqueVals = sampleData[column].unique().tolist()
                if uniqueVals and metaHelperInstance.get_datetime_format(
                        [self._data_frame.orderBy([column], ascending=[False]).select(column).first()[0]]) is not None:
                    date_columns.append(column)
                else:
                    dimension_columns.append(column)

        # Keys are the short names expected by descr_stats_summary.html.
        data_dict = {"n_c" : self._dataframe_helper.get_num_columns(),
                    "n_m" : len(self._dataframe_helper.get_numeric_columns()),
                    "n_d" : len(dimension_columns),
                    "n_td" : len(date_columns),
                    "c" : self._column_name,
                    "d" : dimension_columns,
                    "m" : self._dataframe_helper.get_numeric_columns(),
                    "td" : date_columns,
                    "observations" : self._dataframe_helper.get_num_rows(),
                    "ignorecolumns" : ignored_columns,
                    "n_t" : len(self._dataframe_helper.get_string_columns())+len(self._dataframe_helper.get_numeric_columns())+len(self._dataframe_helper.get_timestamp_columns())
        }
        self.summary = NarrativesUtils.get_template_output(self._base_dir,\
                                        'descr_stats_summary.html',data_dict)
        MeasureSummaryCard = SummaryCard(name='Summary',slug=None,cardData = None)
        MeasureSummaryCard.set_no_of_measures(data_dict["n_m"])
        MeasureSummaryCard.set_no_of_dimensions(data_dict["n_d"])
        MeasureSummaryCard.set_no_of_time_dimensions(data_dict["n_td"])
        MeasureSummaryCard.set_summary_html(NarrativesUtils.block_splitter(self.summary,self._blockSplitter))
        self._story_narrative.add_a_card(MeasureSummaryCard)
        self._headNode.add_a_card(MeasureSummaryCard)
Code example #2
0
    def __init__(self, df_helper, df_context, result_setter, spark,
                 story_narrative, meta_parser):
        self._story_narrative = story_narrative
        self._result_setter = result_setter
        self._spark = spark
        self._dataframe_helper = df_helper
        self._dataframe_context = df_context
        self._pandas_flag = df_context._pandas_flag
        self._data_frame = df_helper.get_data_frame()
        self._num_significant_digits = NarrativesUtils.get_significant_digit_settings(
            "trend")
        self._metaParser = meta_parser

        self._result_column = self._dataframe_context.get_result_column()
        self._string_columns = self._dataframe_helper.get_string_columns()
        self._timestamp_columns = self._dataframe_helper.get_timestamp_columns(
        )

        # self._selected_date_columns = None
        self._selected_date_columns = self._dataframe_context.get_selected_date_columns(
        )
        self._all_date_columns = self._dataframe_context.get_date_columns()
        self._string_columns = list(
            set(self._string_columns) - set(self._all_date_columns))

        self._dateFormatDetected = False
        self._existingDateFormat = None
        self._dateFormatConversionDict = NarrativesUtils.date_formats_mapping_dict(
        )
        self._dateColumnFormatDict = df_context.get_date_format_dict()
        if self._dataframe_context.get_requested_date_format() != None:
            self._requestedDateFormat = df_context.get_requested_date_format()
        else:
            self._requestedDateFormat = None

        self._analysistype = self._dataframe_context.get_analysis_type()
        self._trendSettings = self._dataframe_context.get_trend_settings()
        self._trendSpecificMeasure = False
        if self._trendSettings != None:
            if self._analysistype == "dimension" and self._trendSettings[
                    "name"] != "Count":
                self._trendSpecificMeasure = True
                self._analysistype = "measure"
                self._result_column = self._trendSettings["selectedMeasure"]
            elif self._analysistype == "measure" and self._trendSettings[
                    "name"] != "Count":
                self._result_column = self._trendSettings["selectedMeasure"]

        self._trend_subsection = self._result_setter.get_trend_section_name()
        self._regression_trend_card = None
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        self._highlightFlag = "|~HIGHLIGHT~|"
        self._trend_on_td_column = False
        self._number_of_dimensions_to_consider = 10

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        self._analysisName = self._dataframe_context.get_analysis_name()
        self._messageURL = self._dataframe_context.get_message_url()
        if self._analysistype == "dimension":
            self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight(
            )
            self._scriptStages = {
                "initialization": {
                    "summary": "Initialized The Frequency Narratives",
                    "weight": 0
                },
                "summarygeneration": {
                    "summary": "Summary Generation Finished",
                    "weight": 4
                },
                "completion": {
                    "summary": "Frequency Stats Narratives Done",
                    "weight": 0
                },
            }
        elif self._analysistype == "measure":
            if self._trendSpecificMeasure:
                self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight(
                )
            else:
                self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight(
                )
            self._scriptStages = {
                "trendNarrativeStart": {
                    "summary": "Started The Descriptive Stats Narratives",
                    "weight": 1
                },
                "trendNarrativeEnd": {
                    "summary": "Narratives For Descriptive Stats Finished",
                    "weight": 0
                },
            }

        self._base_dir = "/trend/"
        if self._pandas_flag and self._selected_date_columns and not self._dateColumnFormatDict and not self._timestamp_columns:
            for column in self._selected_date_columns:
                uniqueVals = self._data_frame[column].astype(
                    str).unique().tolist()
                metaHelperInstance = MetaDataHelper(self._data_frame,
                                                    self._data_frame.shape[0])
                if len(uniqueVals
                       ) > 0 and metaHelperInstance.get_datetime_format_pandas(
                           [
                               self._data_frame.sort_values(
                                   by=column, ascending=False)[column][0]
                           ]) != None:
                    dateColumnFormat = metaHelperInstance.get_datetime_format_pandas(
                        uniqueVals)
                    self._dateColumnFormatDict.update(
                        {column: dateColumnFormat})
        dateColCheck = NarrativesUtils.check_date_column_formats(self._selected_date_columns,\
                                                    self._timestamp_columns,\
                                                    self._dateColumnFormatDict,\
                                                    self._dateFormatConversionDict,
                                                    self._requestedDateFormat)
        print(dateColCheck)

        self._dateFormatDetected = dateColCheck["dateFormatDetected"]
        self._trend_on_td_column = dateColCheck["trendOnTdCol"]
        if self._dateFormatDetected:
            self._requestedDateFormat = dateColCheck["requestedDateFormat"]
            self._existingDateFormat = dateColCheck["existingDateFormat"]
            # self._date_column_suggested is the column used for trend
            self._date_column_suggested = dateColCheck["suggestedDateColumn"]
        if self._existingDateFormat:
            self._data_frame, dataRangeStats = NarrativesUtils.calculate_data_range_stats(
                self._data_frame, self._existingDateFormat,
                self._date_column_suggested, self._trend_on_td_column,
                self._pandas_flag)
            print(dataRangeStats)
            self._durationString = dataRangeStats["durationString"]
            self._duration = dataRangeStats["duration"]
            self._dataLevel = dataRangeStats["dataLevel"]
            first_date = dataRangeStats["firstDate"]
            last_date = dataRangeStats["lastDate"]

            if self._timestamp_columns != None:
                if self._selected_date_columns == None:
                    self._selected_date_columns = self._timestamp_columns
                else:
                    self._selected_date_columns += self._timestamp_columns
        if self._pandas_flag:
            pass
        else:
            if self._trend_subsection == "regression":
                if self._selected_date_columns != None:
                    if self._dateFormatDetected:
                        trend_subsection_data = self._result_setter.get_trend_section_data(
                        )
                        measure_column = trend_subsection_data[
                            "measure_column"]
                        result_column = trend_subsection_data["result_column"]
                        base_dir = trend_subsection_data["base_dir"]

                        card3heading = 'How ' + result_column + ' and ' + measure_column + ' changed over time'
                        if self._dataLevel == "day":
                            grouped_data = self._data_frame.groupBy(
                                "suggestedDate").agg({
                                    measure_column: 'sum',
                                    result_column: 'sum'
                                })
                            grouped_data = grouped_data.withColumnRenamed(
                                grouped_data.columns[-1], result_column)
                            grouped_data = grouped_data.withColumnRenamed(
                                grouped_data.columns[-2], measure_column)
                            grouped_data = grouped_data.withColumn(
                                "year_month",
                                udf(lambda x: x.strftime("%b-%y"))(
                                    "suggestedDate"))
                            grouped_data = grouped_data.orderBy(
                                "suggestedDate", ascending=True)
                            grouped_data = grouped_data.withColumnRenamed(
                                grouped_data.columns[0], "key")
                            grouped_data = grouped_data.toPandas()
                        elif self._dataLevel == "month":
                            grouped_data = self._data_frame.groupBy(
                                "year_month").agg({
                                    measure_column: 'sum',
                                    result_column: 'sum'
                                })
                            grouped_data = grouped_data.withColumnRenamed(
                                grouped_data.columns[-1], result_column)
                            grouped_data = grouped_data.withColumnRenamed(
                                grouped_data.columns[-2], measure_column)
                            grouped_data = grouped_data.withColumn(
                                "suggestedDate",
                                udf(lambda x: datetime.strptime(x, "%b-%y"))(
                                    "year_month"))
                            grouped_data = grouped_data.orderBy(
                                "suggestedDate", ascending=True)
                            grouped_data = grouped_data.withColumnRenamed(
                                "suggestedDate", "key")
                            grouped_data = grouped_data.select([
                                "key", measure_column, result_column,
                                "year_month"
                            ]).toPandas()
                            grouped_data["key"] = grouped_data[
                                "year_month"].apply(
                                    lambda x: datetime.strptime(x, "%b-%y"
                                                                ).date())

                        trend_narrative_obj = TrendNarrative(
                            self._result_column, self._date_column_suggested,
                            grouped_data, self._existingDateFormat,
                            self._requestedDateFormat, self._base_dir,
                            self._metaParser)

                        card3data = trend_narrative_obj.generate_regression_trend_data(
                            grouped_data, measure_column, result_column,
                            self._dataLevel, self._durationString)

                        card3narrative = NarrativesUtils.get_template_output(base_dir,\
                                                                        'regression_card3.html',card3data)

                        card3chart = trend_narrative_obj.generate_regression_trend_chart(
                            grouped_data, self._dataLevel)
                        card3paragraphs = NarrativesUtils.paragraph_splitter(
                            card3narrative)
                        card2 = {
                            'charts': card3chart,
                            'paragraphs': card3paragraphs,
                            'heading': card3heading
                        }
                        self.set_regression_trend_card_data(card2)
                    else:
                        print("NO DATE FORMAT DETECTED")
                else:
                    print("NO DATE COLUMNS PRESENT")

        if self._analysistype == "measure":
            self._completionStatus += old_div(
                self._scriptWeightDict[self._analysisName]["total"] *
                self._scriptStages["trendNarrativeStart"]["weight"], 10)
            progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                        "trendNarrativeStart",\
                                        "info",\
                                        self._scriptStages["trendNarrativeStart"]["summary"],\
                                        self._completionStatus,\
                                        self._completionStatus)
            CommonUtils.save_progress_message(self._messageURL,
                                              progressMessage)
            self._dataframe_context.update_completion_status(
                self._completionStatus)
            # self._startMeasureTrend = self._result_setter.get_trend_section_completion_status()
            self._startMeasureTrend = True

            if self._startMeasureTrend == True:
                self.narratives = {
                    "SectionHeading": "",
                    "card1": {},
                    "card2": {},
                    "card3": {}
                }
                if self._selected_date_columns != None:
                    if self._dateFormatDetected:
                        grouped_data = NarrativesUtils.get_grouped_data_for_trend(
                            self._data_frame, self._dataLevel,
                            self._result_column, self._analysistype,
                            self._pandas_flag)
                        if self._pandas_flag:
                            self._data_frame = self._data_frame.drop(
                                self._date_column_suggested, axis=1)
                        else:
                            self._data_frame = self._data_frame.drop(
                                self._date_column_suggested)
                        # self._data_frame = self._data_frame.withColumnRenamed("year_month", self._date_column_suggested)

                        significant_dimensions = []
                        significant_dimension_dict = df_helper.get_significant_dimension(
                        )
                        if significant_dimension_dict != {} and significant_dimension_dict != None:
                            significant_dimension_tuple = tuple(
                                significant_dimension_dict.items())
                            significant_dimension_tuple = sorted(
                                significant_dimension_tuple,
                                key=lambda x: x[1],
                                reverse=True)
                            significant_dimensions = [
                                x[0] for x in
                                significant_dimension_tuple[:self.
                                                            _number_of_dimensions_to_consider]
                            ]
                        else:
                            significant_dimensions = self._string_columns[:self
                                                                          .
                                                                          _number_of_dimensions_to_consider]
                        print("significant_dimensions", significant_dimensions)
                        trend_narrative_obj = TrendNarrative(
                            self._result_column, self._date_column_suggested,
                            grouped_data, self._existingDateFormat,
                            self._requestedDateFormat, self._base_dir,
                            self._metaParser)
                        # grouped_data.to_csv("/home/gulshan/marlabs/datasets/trend_grouped_pandas.csv",index=False)
                        dataDict = trend_narrative_obj.generateDataDict(
                            grouped_data, self._dataLevel,
                            self._durationString)
                        # # update reference time with max value
                        reference_time = dataDict["reference_time"]
                        dataDict["duration"] = self._duration
                        dataDict["dataLevel"] = self._dataLevel
                        dataDict["durationString"] = self._durationString
                        dataDict[
                            "significant_dimensions"] = significant_dimensions
                        if len(significant_dimensions) > 0:
                            if self._dataLevel == "day":
                                datetimeformat = self._existingDateFormat
                            elif self._dataLevel == "month":
                                datetimeformat = "%b-%y"
                            # xtraData = trend_narrative_obj.get_xtra_calculations(self._data_frame,grouped_data,significant_dimensions,self._date_column_suggested,self._result_column,self._existingDateFormat,reference_time,self._dataLevel, self._pandas_flag)
                            xtraData = trend_narrative_obj.get_xtra_calculations(
                                self._data_frame, grouped_data,
                                significant_dimensions,
                                self._date_column_suggested,
                                self._result_column, datetimeformat,
                                reference_time, self._dataLevel,
                                self._pandas_flag)
                            if xtraData != None:
                                dataDict.update(xtraData)
                        # print 'Trend dataDict:  %s' %(json.dumps(dataDict, indent=2))
                        self._result_setter.update_executive_summary_data(
                            dataDict)
                        dataDict.update({
                            "blockSplitter": self._blockSplitter,
                            "highlightFlag": self._highlightFlag
                        })

                        summary1 = NarrativesUtils.get_template_output(self._base_dir,\
                                                                        'measure_trend_card1.html',dataDict)
                        summary2 = NarrativesUtils.get_template_output(self._base_dir,\
                                                                        'measure_trend_card2.html',dataDict)
                        measureTrendCard = NormalCard()
                        measureTrendcard1Data = NarrativesUtils.block_splitter(
                            summary1,
                            self._blockSplitter,
                            highlightFlag=self._highlightFlag)
                        measureTrendcard2Data = NarrativesUtils.block_splitter(
                            summary2, self._blockSplitter)
                        # print measureTrendcard1Data

                        bubbledata = dataDict["bubbleData"]
                        # print bubbledata
                        card1BubbleData = "<div class='col-md-6 col-xs-12 xs-p-20'><h2 class='text-center'><span>{}</span><br/><small>{}</small></h2></div><div class='col-md-6 col-xs-12 xs-p-20'><h2 class='text-center'><span>{}</span><br/><small>{}</small></h2></div>".format(
                            bubbledata[0]["value"], bubbledata[0]["text"],
                            bubbledata[1]["value"], bubbledata[1]["text"])
                        # print card1BubbleData

                        trend_chart_data = list(
                            grouped_data[["key",
                                          "value"]].T.to_dict().values())
                        trend_chart_data = sorted(trend_chart_data,
                                                  key=lambda x: x["key"])
                        card1chartdata = {"actual": [], "predicted": []}

                        if self._dataLevel == "day":
                            card1chartdata["actual"] = [{
                                "key": str(val["key"]),
                                "value": val["value"]
                            } for val in trend_chart_data]
                        elif self._dataLevel == "month":
                            card1chartdata["actual"] = [{
                                "key":
                                val["key"].strftime("%b-%y"),
                                "value":
                                val["value"]
                            } for val in trend_chart_data]

                        if self._duration < 365:
                            prediction_window = 3
                        else:
                            prediction_window = 6
                        predicted_values = trend_narrative_obj.get_forecast_values(
                            grouped_data["value"],
                            prediction_window)[len(grouped_data["value"]):]
                        predicted_values = [
                            round(x, self._num_significant_digits)
                            for x in predicted_values
                        ]

                        forecasted_data = []
                        forecasted_data.append(card1chartdata["actual"][-1])
                        forecasted_dates = []
                        # forecast_start_time = datetime.strptime(card1chartdata["actual"][-1]["key"],"%b-%y")
                        if self._dataLevel == "month":
                            forecast_start_time = datetime.strptime(
                                card1chartdata["actual"][-1]["key"], "%b-%y")
                        elif self._dataLevel == "day":
                            try:
                                forecast_start_time = datetime.strptime(
                                    card1chartdata["actual"][-1]["key"],
                                    "%Y-%m-%d")
                            except:
                                forecast_start_time = datetime.strptime(
                                    card1chartdata["actual"][-1]["key"],
                                    '%Y-%m-%d %H:%M:%S')
                        for val in range(prediction_window):
                            if self._dataLevel == "month":
                                key = forecast_start_time + relativedelta(
                                    months=1 + val)
                                forecasted_dates.append(key)
                            elif self._dataLevel == "day":
                                key = forecast_start_time + relativedelta(
                                    days=1 + val)
                                forecasted_dates.append(key)
                        forecasted_list = list(
                            zip(forecasted_dates, predicted_values))
                        if self._dataLevel == "month":
                            forecasted_list = [{
                                "key": val[0].strftime("%b-%y"),
                                "value": val[1]
                            } for val in forecasted_list]
                        elif self._dataLevel == "day":
                            forecasted_list = [{
                                "key":
                                val[0].strftime("%Y-%m-%d"),
                                "value":
                                val[1]
                            } for val in forecasted_list]
                        forecasted_data += forecasted_list
                        card1chartdata["predicted"] = forecasted_data
                        # print json.dumps(card1chartdata,indent=2)
                        card1chartdata = ScatterChartData(data=card1chartdata)
                        chartJson = ChartJson()
                        chartJson.set_data(card1chartdata.get_data())
                        chartJson.set_label_text({
                            'x': ' ',
                            'y': 'No. of Observations'
                        })
                        chartJson.set_legend({
                            "actual": "Observed",
                            "predicted": "Forecast"
                        })
                        chartJson.set_chart_type("scatter_line")
                        chartJson.set_axes({"x": "key", "y": "value"})
                        chartJson.set_yaxis_number_format(".2f")
                        st_info = [
                            "Trend Analysis",
                            "Forecast Method : Holt Winters Method"
                        ]
                        measureTrendcard1Data.insert(
                            1, C3ChartData(data=chartJson, info=st_info))
                        measureTrendcard1Data.append(
                            HtmlData(data=card1BubbleData))
                        cardData = measureTrendcard1Data + measureTrendcard2Data
                        measureTrendCard.set_card_data(cardData)
                        measureTrendCard.set_card_name("Trend Analysis")
                        trendStoryNode = NarrativesTree(
                            "Trend", None, [], [measureTrendCard])
                        self._story_narrative.add_a_node(trendStoryNode)
                        self._result_setter.set_trend_node(trendStoryNode)

                        # prediction_data = [{"key":x["key"],"value":x["value"]} for x in trend_chart_data]
                        # last_val = prediction_data[-1]
                        # last_val.update({"predicted_value":last_val["value"]})
                        # prediction_data[-1] = last_val
                        #
                        # for val in range(prediction_window):
                        #     dataLevel = dataDict["dataLevel"]
                        #     if self._dataLevel == "month":
                        #         last_key = prediction_data[-1]["key"]
                        #         key = last_key+relativedelta(months=1)
                        #         prediction_data.append({"key":key,"predicted_value":predicted_values[val]})
                        #         forecasted_data.append({"key":key,"value":predicted_values[val]})
                        #     elif self._dataLevel == "day":
                        #         last_key = prediction_data[-1]["key"]
                        #         key = last_key+relativedelta(days=1)
                        #         prediction_data.append({"key":key,"predicted_value":predicted_values[val]})
                        # prediction_data_copy = prediction_data
                        # prediction_data = []
                        # for val in prediction_data_copy:
                        #     val["key"] = val["key"].strftime("%b-%y")
                        #     prediction_data.append(val)

                        # forecastDataDict = {"startForecast":predicted_values[0],
                        #                     "endForecast":predicted_values[prediction_window-1],
                        #                     "measure":dataDict["measure"],
                        #                     "forecast":True,
                        #                     "forecast_percentage": round((predicted_values[prediction_window-1]-predicted_values[0])/predicted_values[0],self._num_significant_digits),
                        #                     "prediction_window_text": str(prediction_window) + " months"
                        #                     }
                        #
                        # self._result_setter.update_executive_summary_data(forecastDataDict)
                        # summary3 = NarrativesUtils.get_template_output(self._base_dir,\
                        # 'trend_narrative_card3.html',forecastDataDict)
                        self._completionStatus += old_div(
                            self._scriptWeightDict[self._analysisName]["total"]
                            *
                            self._scriptStages["trendNarrativeEnd"]["weight"],
                            10)
                        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                                    "trendNarrativeEnd",\
                                                    "info",\
                                                    self._scriptStages["trendNarrativeEnd"]["summary"],\
                                                    self._completionStatus,\
                                                    self._completionStatus)
                        CommonUtils.save_progress_message(
                            self._messageURL, progressMessage)
                        self._dataframe_context.update_completion_status(
                            self._completionStatus)
                    else:
                        # self._result_setter.update_executive_summary_data({"trend_present":False})
                        print("Trend Analysis for Measure Failed")
                        print("#" * 20 + "Trend Analysis Error" + "#" * 20)
                        print(
                            "No date format for the date column %s was detected."
                            % (self._date_column_suggested))
                        print("#" * 60)
                        self._completionStatus += self._scriptWeightDict[
                            self._analysisName]["total"]
                        self._dataframe_context.update_completion_status(
                            completionStatus)
                        progressMessage = CommonUtils.create_progress_message_object("Trend","failedState","error",\
                                        "Trend Failed As "+"No Date Format For The Date Column %s Was Detected !!!" %(self._date_column_suggested),\
                                        completionStatus,completionStatus)
                        CommonUtils.save_progress_message(
                            messageURL, progressMessage)
                        self._dataframe_context.update_completion_status(
                            self._completionStatus)
                else:
                    # self._result_setter.update_executive_summary_data({"trend_present":False})
                    print("Trend Analysis for Measure Failed")
                    print("#" * 20 + "Trend Analysis Error" + "#" * 20)
                    print("No date column present for Trend Analysis.")
                    print("#" * 60)
                    self._completionStatus += self._scriptWeightDict[
                        self._analysisName]["total"]
                    self._dataframe_context.update_completion_status(
                        completionStatus)
                    progressMessage = CommonUtils.create_progress_message_object("Trend","failedState","error",\
                                    "No Date Column Present",\
                                    completionStatus,completionStatus)
                    CommonUtils.save_progress_message(messageURL,
                                                      progressMessage)
                    self._dataframe_context.update_completion_status(
                        self._completionStatus)
            else:
                print("overall Trend not Started YET")

        elif self._analysistype == "dimension":
            print("Dimension Trend Started")
            self._completionStatus += old_div(
                self._scriptWeightDict[self._analysisName]["total"] *
                self._scriptStages["initialization"]["weight"], 10)
            progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                        "initialization",\
                                        "info",\
                                        self._scriptStages["initialization"]["summary"],\
                                        self._completionStatus,\
                                        self._completionStatus)
            CommonUtils.save_progress_message(self._messageURL,
                                              progressMessage)
            self._dataframe_context.update_completion_status(
                self._completionStatus)

            self.narratives = {"card0": {}}
            if self._selected_date_columns != None:
                if self._dateFormatDetected:
                    # result_column_levels = [x[0] for x in self._data_frame.select(self._result_column).distinct().collect()]
                    try:
                        result_column_levels = self._metaParser.get_unique_level_names(
                            self._result_column)
                    except:
                        if self._pandas_flag:
                            result_column_levels = list(
                                self._data_frame[self._result_column].unique())
                        else:
                            result_column_levels = [
                                x[0] for x in self._data_frame.select(
                                    self._result_column).distinct().collect()
                            ]
                            # result_column_levels = self._data_frame.agg((F.collect_set(self._result_column).alias(self._result_column))).first().asDict()[self._result_column]

                    print("-" * 100)
                    # TODO Implement meta parser getter here
                    print(result_column_levels)
                    if self._pandas_flag:
                        level_count_df = self._data_frame[
                            self._result_column].value_counts()[0:2]
                        top2levels = list(level_count_df.index)
                    else:
                        level_count_df = self._data_frame.groupBy(
                            self._result_column).count().orderBy(
                                "count", ascending=False)
                        level_count_df_rows = level_count_df.collect()
                        top2levels = [
                            level_count_df_rows[0][0],
                            level_count_df_rows[1][0]
                        ]
                    cardData = []
                    chart_data = {}
                    cardData1 = []
                    c3_chart = {"dataType": "c3Chart", "data": {}}
                    print("#" * 40)
                    overall_count = NarrativesUtils.get_grouped_count_data_for_dimension_trend(
                        self._data_frame, self._dataLevel, self._result_column,
                        self._pandas_flag)
                    print("#" * 40)
                    for idx, level in enumerate(top2levels):
                        print("calculations in progress for the level :- ",
                              level)
                        if self._pandas_flag:
                            leveldf = self._data_frame[self._data_frame[
                                self._result_column] == level]
                        else:
                            leveldf = self._data_frame.filter(
                                col(self._result_column) == level)
                        grouped_data = NarrativesUtils.get_grouped_data_for_trend(
                            leveldf, self._dataLevel, self._result_column,
                            self._analysistype, self._pandas_flag)
                        grouped_data.rename(columns={"value": "value_count"},
                                            inplace=True)
                        grouped_data = pd.merge(grouped_data,
                                                overall_count,
                                                on='key',
                                                how='left')
                        # grouped_data["value"] = grouped_data["value_count"].apply(lambda x:round(x*100/float(self._data_frame.count()),self._num_significant_digits))
                        grouped_data["value"] = old_div(
                            grouped_data["value_count"],
                            grouped_data["totalCount"])
                        grouped_data["value"] = grouped_data["value"].apply(
                            lambda x: round(x * 100, self.
                                            _num_significant_digits))
                        if self._pandas_flag:
                            leveldf = leveldf.drop(self._date_column_suggested,
                                                   axis=1)
                            leveldf = leveldf.rename(
                                columns={
                                    "year_month": self._date_column_suggested
                                })
                            if "year_month" not in leveldf.columns:
                                leveldf["year_month"] = leveldf[
                                    self._date_column_suggested]
                            leveldf["value_col"] = 1
                        else:
                            leveldf = leveldf.drop(self._date_column_suggested)
                            leveldf = leveldf.withColumnRenamed(
                                "year_month", self._date_column_suggested)
                            if "year_month" not in leveldf.columns:
                                leveldf = leveldf.withColumn(
                                    "year_month",
                                    col(self._date_column_suggested))
                            leveldf = leveldf.withColumn('value_col', lit(1))

                        trend_narrative_obj = TrendNarrative(
                            self._result_column, self._date_column_suggested,
                            grouped_data, self._existingDateFormat,
                            self._requestedDateFormat, self._base_dir,
                            self._metaParser)
                        dataDict = trend_narrative_obj.generateDataDict(
                            grouped_data, self._dataLevel,
                            self._durationString)
                        dataDict["target_column"] = dataDict["measure"]
                        dataDict["measure"] = level
                        dataDict["duration"] = self._duration
                        dataDict["dataLevel"] = self._dataLevel
                        dataDict["durationString"] = self._durationString
                        # grouped_data.to_csv("/home/gulshan/marlabs/datasets/grouped_data"+str(idx))
                        # print json.dumps(dataDict,indent=2)
                        significant_dimensions = []
                        significant_dimension_dict = df_helper.get_chisquare_significant_dimension(
                        )
                        if significant_dimension_dict != {} and significant_dimension_dict != None:
                            significant_dimension_tuple = tuple(
                                significant_dimension_dict.items())
                            significant_dimension_tuple = sorted(
                                significant_dimension_tuple,
                                key=lambda x: x[1],
                                reverse=True)
                            significant_dimensions = [
                                x[0] for x in
                                significant_dimension_tuple[:self.
                                                            _number_of_dimensions_to_consider]
                            ]
                        else:
                            significant_dimensions = self._string_columns[:self
                                                                          .
                                                                          _number_of_dimensions_to_consider]
                        print("significant_dimensions", significant_dimensions)
                        reference_time = dataDict["reference_time"]
                        dataDict[
                            "significant_dimensions"] = significant_dimensions
                        if len(significant_dimensions) > 0:
                            st = time.time()
                            xtraData = trend_narrative_obj.get_xtra_calculations(
                                leveldf, grouped_data, significant_dimensions,
                                self._date_column_suggested, "value_col",
                                self._existingDateFormat, reference_time,
                                self._dataLevel, self._pandas_flag)
                            print("time for get_xtra_calculations",
                                  time.time() - st)
                            if xtraData != None:
                                dataDict.update(xtraData)
                        dimensionCount = trend_narrative_obj.generate_dimension_extra_narrative(
                            grouped_data, dataDict, self._dataLevel)
                        if dimensionCount != None:
                            dataDict.update(dimensionCount)

                        dataDict.update({
                            "level_index": idx,
                            "blockSplitter": self._blockSplitter,
                            "highlightFlag": self._highlightFlag
                        })

                        self._result_setter.update_executive_summary_data(
                            dataDict)
                        trendStory = NarrativesUtils.get_template_output(self._base_dir,\
                                                                        'dimension_trend.html',dataDict)
                        blocks = NarrativesUtils.block_splitter(
                            trendStory, self._blockSplitter)

                        if idx != 0:
                            cardData1 += blocks[2:]
                        else:
                            cardData1 += blocks

                        trend_chart_data = [
                            x for x in list(grouped_data[
                                ["key", "value"]].T.to_dict().values())
                            if x['key'] != None
                        ]
                        trend_chart_data = sorted(trend_chart_data,
                                                  key=lambda x: x["key"])
                        card1chartdata = trend_chart_data
                        if self._dataLevel == "day":
                            card1chartdata = [{
                                "key": str(val["key"]),
                                "value": val["value"]
                            } for val in card1chartdata]
                        elif self._dataLevel == "month":
                            card1chartdata = [{
                                "key":
                                val["key"].strftime("%b-%y"),
                                "value":
                                val["value"]
                            } for val in card1chartdata]
                        chart_data[level] = card1chartdata

                    labels = {
                        "x": "key",
                        "y": list(chart_data.keys())[0],
                        "y2": list(chart_data.keys())[1]
                    }
                    c3Chart = {
                        "data": chart_data,
                        "format": "%b-%y",
                        "label": labels,
                        "label_text": {
                            "x": "Time",
                            "y": "Percentage of " + labels["y"],
                            "y2": "Percentage of " + labels["y2"]
                        }
                    }

                    c3_chart["data"] = c3Chart
                    multiLineData = []
                    for idx in range(len(chart_data[top2levels[0]])):
                        key = chart_data[top2levels[0]][idx]["key"]
                        value = chart_data[top2levels[0]][idx]["value"]
                        try:
                            value1 = chart_data[top2levels[1]][idx]["value"]
                        except:
                            value1 = 0
                        multiLineData.append({
                            "key": key,
                            top2levels[0]: value,
                            top2levels[1]: value1
                        })
                    chartData = NormalChartData(multiLineData)
                    chartJson = ChartJson()
                    chartJson.set_data(chartData.get_data())
                    chartJson.set_label_text(c3Chart["label_text"])
                    chartJson.set_legend(c3Chart["label"])
                    chartJson.set_chart_type("line")
                    chartJson.set_yaxis_number_format(".2f")
                    chartJson.set_axes(labels)
                    st_info = [
                        "Trend Analysis",
                        "Forecast Method : Holt Winters Method"
                    ]
                    cardData1.insert(1,
                                     C3ChartData(data=chartJson, info=st_info))
                    trendCard = NormalCard(name="Trend Analysis",
                                           slug=None,
                                           cardData=cardData1)
                    trendStoryNode = NarrativesTree("Trend", None, [],
                                                    [trendCard])
                    self._story_narrative.add_a_node(trendStoryNode)
                    self._result_setter.set_trend_node(trendStoryNode)
                    self._completionStatus += old_div(
                        self._scriptWeightDict[self._analysisName]["total"] *
                        self._scriptStages["summarygeneration"]["weight"], 10)
                    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                                "summarygeneration",\
                                                "info",\
                                                self._scriptStages["summarygeneration"]["summary"],\
                                                self._completionStatus,\
                                                self._completionStatus)
                    CommonUtils.save_progress_message(self._messageURL,
                                                      progressMessage)
                    self._dataframe_context.update_completion_status(
                        self._completionStatus)

                    self._completionStatus += old_div(
                        self._scriptWeightDict[self._analysisName]["total"] *
                        self._scriptStages["completion"]["weight"], 10)
                    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                                "completion",\
                                                "info",\
                                                self._scriptStages["completion"]["summary"],\
                                                self._completionStatus,\
                                                self._completionStatus)
                    CommonUtils.save_progress_message(self._messageURL,
                                                      progressMessage)
                    self._dataframe_context.update_completion_status(
                        self._completionStatus)
                else:
                    self._result_setter.update_executive_summary_data(
                        {"trend_present": False})
                    print("Trend Analysis for Dimension Failed")
                    print("#" * 20 + "Trend Analysis Error" + "#" * 20)
                    if self._date_column_suggested:
                        print(
                            "No date format for the date column %s was detected."
                            % (self._date_column_suggested))
                    print("#" * 60)
                    self._completionStatus += self._scriptWeightDict[
                        self._analysisName]["total"]
                    self._dataframe_context.update_completion_status(
                        self._completionStatus)
                    progressMessage = CommonUtils.create_progress_message_object("Trend","failedState","error",\
                                    "Trend Failed As "+"No Date Format For The Date Column %s Was Detected !!!" %(self._date_column_suggested),\
                                    self._completionStatus,self._completionStatus)
                    CommonUtils.save_progress_message(messageURL,
                                                      progressMessage)
                    self._dataframe_context.update_completion_status(
                        self._completionStatus)

            else:
                self._result_setter.update_executive_summary_data(
                    {"trend_present": False})
                print("Trend Analysis for Dimension Failed")
                print("#" * 20 + "Trend Analysis Error" + "#" * 20)
                print("No date column present for Trend Analysis.")
                print("#" * 60)
                self._completionStatus += self._scriptWeightDict[
                    self._analysisName]["total"]
                self._dataframe_context.update_completion_status(
                    self._completionStatus)
                progressMessage = CommonUtils.create_progress_message_object("Trend","failedState","error",\
                                "No Date Column Present",\
                                self._completionStatus,self._completionStatus)
                CommonUtils.save_progress_message(messageURL, progressMessage)
                self._dataframe_context.update_completion_status(
                    self._completionStatus)
コード例 #3
0
 def __init__(self, df, dataframe_context):
     """Hold the dataframe/context pair and a MetaDataHelper sized to the frame.

     :param df: input dataframe (Spark, judging by the ``.count()`` call)
     :param dataframe_context: project context object, stored as-is
     """
     self._data_frame = df
     self._dataframe_context = dataframe_context
     # NOTE: .count() forces a full pass over the data at construction time
     total_rows = self._data_frame.count()
     self._metaHelperInstance = MetaDataHelper(self._data_frame, total_rows)
コード例 #4
0
class FeatureEngineeringHelper(object):
    """Contains Feature Engineering Operation Functions"""
    def __init__(self, df, dataframe_context):
        """Store the dataframe/context and build a MetaDataHelper over the full frame.

        :param df: input dataframe (Spark, judging by the ``.count()`` call)
        :param dataframe_context: project context object, stored as-is
        """
        self._data_frame = df
        self._dataframe_context = dataframe_context
        # NOTE: .count() forces a full pass over the data at construction time
        total_rows = self._data_frame.count()
        self._metaHelperInstance = MetaDataHelper(self._data_frame, total_rows)

        # self._dataframe_helper = dataframe_helper

    def to_date_(self,
                 col,
                 formats=GLOBALSETTINGS.
                 SUPPORTED_DATETIME_FORMATS["pyspark_formats"]):
        """Parse *col* against each supported date format, keeping the first success.

        Spark 2.2+ syntax (``to_date`` with a format argument); for earlier
        versions use ``unix_timestamp`` and a cast instead.
        """
        parse_attempts = [to_date(col, fmt) for fmt in formats]
        # coalesce picks the first non-null parse across all candidate formats
        return F.coalesce(*parse_attempts)

    def binning_all_measures(self, number_of_bins, consider_cols):
        """Create equal-sized bins for every requested measure column.

        :param number_of_bins: number of bins to create per column
        :param consider_cols: requested output columns; entries ending in
            ``"_bin"`` name the measure columns (minus the suffix) to bin
        :return: the dataframe with one ``<col>_bin`` column added per
            requested measure column
        """
        dfSchemaFields = self._data_frame.schema.fields
        numeric_columns = []
        # Requested bin columns arrive as "<name>_bin"; strip the suffix.
        cols_to_be_binned = [x[:-4] for x in consider_cols if x[-4:] == "_bin"]
        for field in dfSchemaFields:
            if ColumnType(type(field.dataType)).get_abstract_data_type(
            ) == ColumnType.MEASURE:
                numeric_columns.append(field.name)
        # BUG FIX: the original `for column_name in numeric_columns and
        # cols_to_be_binned:` evaluated the `and` of two lists, yielding just
        # cols_to_be_binned whenever numeric_columns was non-empty — so
        # non-measure requests were binned too. Take the real intersection.
        for column_name in [c for c in cols_to_be_binned
                            if c in numeric_columns]:
            self._data_frame = self.create_equal_sized_measure_bins(
                column_name, number_of_bins)
        return self._data_frame

    def create_bin_udf(self, dict):
        """Return a UDF mapping a value to the label of its containing bin.

        *dict* maps bin label -> ``[low, high]``. Null inputs map to the
        literal string ``"None"``; values outside every bin map to null.
        """
        bin_ranges = dict

        def _to_label(value):
            if value is None:
                return "None"
            for label, bounds in bin_ranges.items():
                if bounds[0] <= value <= bounds[1]:
                    return label
            # not inside any bin -> implicit None (null in the column)
            return None

        return udf(_to_label)

    # def binning_all_measures_sumeet(self, n_bins):
    #     dfSchemaFields = self._data_frame.schema.fields
    #     numeric_columns = []
    #     for field in dfSchemaFields:
    #         if ColumnType(type(field.dataType)).get_abstract_data_type() == ColumnType.MEASURE:
    #             numeric_columns.append(field.name)
    #     for column_name in numeric_columns:
    #         col_min = self._data_frame.select(F.min(column_name)).collect()[0][0]
    #         col_max = self._data_frame.select(F.max(column_name)).collect()[0][0]
    #         bins_unrounded = linspace(col_min, col_max, n_bins + 1)
    #
    #         bins = []
    #         bins.insert(0, col_min)
    #         for val in bins_unrounded[1:n_bins]:
    #             bins.append(round(val, 2))
    #         bins.append(col_max)
    #
    #         bucketizer = Bucketizer(splits = bins, inputCol = column_name, outputCol = column_name + "_binned")
    #         self._data_frame = bucketizer.transform(self._data_frame)
    #
    #         keys = []
    #         lists = []
    #         for val in range(0, n_bins):
    #             keys.append(str(bins[val]) + "-" + str(bins[val + 1]))
    #             list = []
    #             list.append(bins[val])
    #             list.append(bins[val + 1])
    #             lists.append(list)
    #
    #         dict = {}
    #         for i in range(0, n_bins):
    #             dict[keys[i]] = lists[i]
    #
    #         map_list = [x for x in range(n_bins)]
    #         dict_new = {}
    #         for n in range(0, n_bins):
    #             dict_new[map_list[n]] = keys[n]
    #
    #         def create_level_udf_sumeet(dict):
    #             def check_key(x, dict):
    #                 for key in dict.keys():
    #                     if x == key:
    #                         return dict[key]
    #             return udf(lambda x: check_key(x,dict))
    #
    #         self._data_frame = self._data_frame.withColumn(column_name + "_binned", create_level_udf_sumeet(dict_new)(col(column_name + "_binned")))
    #     return self._data_frame

    def create_level_udf(self, dict):
        """Return a UDF collapsing raw dimension values into named levels.

        :param dict: mapping of level name -> list of raw values belonging to
            that level
        :return: UDF that maps a value to its level name; values not claimed
            by any level (and nulls) pass through unchanged
        """
        # PERF: set membership is O(1) per row versus the original O(n)
        # list scan built by repeated list concatenation.
        selected_values = set()
        for key in dict:
            selected_values.update(dict[key])

        def check_key(x, dict):
            if x not in selected_values:
                # Preserves original behavior: unmapped values pass through;
                # with an empty mapping the original fell off the loop and
                # returned None, so keep that edge case intact.
                return x if dict else None
            for key in dict:
                if x in dict[key]:
                    return key

        return udf(lambda x: check_key(x, dict) if x != None else x)

    def create_new_levels_dimension(self, column_name, dict):
        """Append a ``<column_name>_level`` column holding the collapsed levels.

        :param column_name: dimension column to collapse
        :param dict: level name -> list of raw values, passed to
            :meth:`create_level_udf`
        :return: the updated dataframe
        """
        level_mapper = self.create_level_udf(dict)
        level_column = column_name + "_level"
        self._data_frame = self._data_frame.withColumn(
            level_column, level_mapper(col(column_name)))
        return self._data_frame

    def create_level_udf_time(self, dict, date_format):
        """Return a UDF bucketing date values into named date ranges.

        :param dict: level name -> ``[start, end]`` where both endpoints are
            ``'%d/%m/%Y'`` strings
        :param date_format: strptime format for string inputs — presumably the
            format detected upstream; if None, string inputs will raise inside
            the UDF (TODO confirm callers guard this)
        :return: UDF mapping a date value to its level name; nulls pass
            through, values outside every range map to null
        """
        def convert_to_date(value):
            # BUG FIX: the original had two identical `isinstance(value, str)`
            # branches (a py2 str/unicode split collapsed by 2to3); the dead
            # duplicate is removed.
            if isinstance(value, str):
                value = datetime.strptime(value, date_format)
            if isinstance(value, datetime):
                # BUG FIX: normalize datetime -> date. Comparing a datetime
                # against the date endpoints below raises TypeError on
                # Python 3.
                value = value.date()
            return value

        def convert_to_date_from_level_value(value):
            # Level endpoints are fixed-format strings; return a date object.
            value = datetime.strptime(value, '%d/%m/%Y')
            return datetime.date(value)

        def check_key(date, dict):
            date = convert_to_date(date)
            for key, value in list(dict.items()):
                lower = convert_to_date_from_level_value(value[0])
                upper = convert_to_date_from_level_value(value[1])
                # inclusive on both endpoints, as in the original
                if lower <= date <= upper:
                    return key

        return udf(lambda x: check_key(x, dict) if x != None else x)

    def create_new_levels_datetimes(self, col_for_timelevels, dict):
        """Append a ``<col>_t_level`` column bucketing a date column into ranges.

        :param col_for_timelevels: name of the raw date/time column
        :param dict: level name -> ``[start, end]`` range pair, passed to
            :meth:`create_level_udf_time`
        :return: the updated dataframe (temporary parse column dropped)
        """
        temp_column = col_for_timelevels + '_temp'
        # Parse the raw column so the format detector sees actual date values.
        self._data_frame = self._data_frame.withColumn(
            temp_column, self.to_date_(col_for_timelevels))
        # Sample up to 100 distinct non-null values for format detection.
        uniqueVals = self._data_frame.select(
            temp_column).distinct().na.drop().limit(100).collect()
        try:
            date_format = self._metaHelperInstance.get_datetime_format(
                uniqueVals)
        except TypeError:
            date_format = None
        level_udf = self.create_level_udf_time(dict, date_format)
        self._data_frame = self._data_frame.withColumn(
            col_for_timelevels + "_t_level", level_udf(col(temp_column)))
        self._data_frame = self._data_frame.drop(temp_column)
        return self._data_frame

    def create_bin_udf(self, dict):
        # NOTE(review): byte-for-byte duplicate of the create_bin_udf defined
        # earlier in this class; because it appears later, this copy is the one
        # actually bound on the class. Consider removing one of the two.
        def check_key(x, dict):
            # Return the label of the first [low, high] range containing x;
            # falls off the loop (None) when x is outside every bin.
            for key in list(dict.keys()):
                if (x >= dict[key][0] and x <= dict[key][1]):
                    return key

        # Null inputs are mapped to the literal string "None".
        return udf(lambda x: check_key(x, dict) if x != None else "None")

    def create_equal_sized_measure_bins(self, column_name, number_of_bins):
        """Add a ``<column_name>_bin`` column labelling rows with equal-width bins."""
        def create_dict_for_bin():
            # Single aggregation pass to get the column's min and max.
            min_max = self._data_frame.agg(
                F.min(column_name).alias('min'),
                F.max(column_name).alias('max')).collect()
            min_value = min_max[0]['min']
            max_value = min_max[0]['max']
            # NOTE(review): dividing by (number_of_bins - 1) combined with the
            # `temp <= max_value` loop below can emit number_of_bins intervals
            # with the last one overshooting max_value, and raises
            # ZeroDivisionError for number_of_bins == 1 — confirm intended.
            interval_size = (old_div((max_value - min_value) * 1.0,
                                     (number_of_bins - 1)))
            dict = {}
            temp = min_value
            # Walk upward in interval_size steps; floating-point accumulation
            # drift determines exactly how many bins are produced, so the
            # statement order here is deliberate.
            while temp <= max_value:
                # Key is the human-readable "low-high" label (rounded to 3dp);
                # value keeps the unrounded bounds for the bin UDF.
                dict[str(round(temp, 3)) + "-" +
                     str(round(temp + interval_size, 3))] = [
                         temp, temp + interval_size
                     ]
                temp = temp + interval_size
            return dict

        dict = create_dict_for_bin()
        self._data_frame = self._data_frame.withColumn(
            column_name + "_bin",
            self.create_bin_udf(dict)(col(column_name)))
        return self._data_frame

    def create_custom_measure_bins(self, column_name, list_of_intervals):
        """Add a ``<column_name>_c_bin`` column using caller-supplied interval edges.

        Extra bins are prepended/appended when the data's min/max fall outside
        the supplied edges.

        :param column_name: measure column to bin
        :param list_of_intervals: ascending interval edges
        :return: the updated dataframe
        """
        def build_bin_dict():
            # One aggregation pass for the observed min and max.
            stats = self._data_frame.agg(
                F.min(column_name).alias('min'),
                F.max(column_name).alias('max')).collect()
            lowest = stats[0]['min']
            highest = stats[0]['max']
            bins = {}
            # Cover data below the first supplied edge.
            if list_of_intervals[0] > lowest:
                bins[str(lowest) + "-" + str(list_of_intervals[0])] = [
                    lowest, list_of_intervals[0]
                ]
            # One bin per consecutive pair of supplied edges.
            for left, right in zip(list_of_intervals, list_of_intervals[1:]):
                bins[str(left) + "-" + str(right)] = [left, right]
            # Cover data above the last supplied edge.
            if list_of_intervals[-1] < highest:
                bins[str(list_of_intervals[-1]) + "-" + str(highest)] = [
                    list_of_intervals[-1], highest
                ]
            return bins

        bin_dict = build_bin_dict()
        self._data_frame = self._data_frame.withColumn(
            column_name + "_c_bin",
            self.create_bin_udf(bin_dict)(col(column_name)))
        return self._data_frame

    '''To be verified'''

    def replace_values_in_column(self, column_name, range, value):
        """Replace every cell of ``column_name`` equal to ``range`` with a
        substitute value, written to a new "treated" column.

        Parameters
        ----------
        column_name : str
            Column whose values are inspected.
        range :
            The exact cell value to replace (compared with ``==``).
            NOTE(review): despite the name this is matched as a single
            value, not an interval — confirm with callers.
        value :
            Either a literal replacement value, or one of the strings
            "median" / "mode" / "mean" to derive the replacement from the
            column itself.

        Returns
        -------
        The dataframe with one extra column named
        ``<column_name>_treated_<range><suffix>``.
        """
        # The previous implementation carried an `if False:` branch with an
        # unreachable interval-based variant; that dead code was removed and
        # the four identical withColumn calls collapsed into one.
        if value == "median":
            dp_helper_obj = DataPreprocessingHelper(self._data_frame,
                                                    self._dataframe_context)
            replace_value = dp_helper_obj.get_median(self._data_frame,
                                                     column_name)
            suffix = "_median"
        elif value == "mode":
            dp_helper_obj = DataPreprocessingHelper(self._data_frame,
                                                    self._dataframe_context)
            replace_value = dp_helper_obj.get_mode(self._data_frame,
                                                   column_name)
            suffix = "_mode"
        elif value == "mean":
            replace_value = self._data_frame.agg(avg(column_name)).first()[0]
            suffix = "_mean"
        else:
            replace_value = value
            suffix = "_" + str(replace_value)
        treated_column = column_name + "_treated_" + str(range) + suffix
        self._data_frame = self._data_frame.withColumn(
            treated_column,
            when(self._data_frame[column_name] == range,
                 replace_value).otherwise(self._data_frame[column_name]))
        return self._data_frame

    def standardize_column(self, column_name):
        """Append ``<column_name>_fs_standardized`` holding the z-score
        ((x - mean) / sample standard deviation) cast to float; nulls pass
        through unchanged."""
        stats = self._data_frame.select(
            F.mean(column_name), F.stddev_samp(column_name)).collect()[0]
        col_mean, col_sd = stats[0], stats[1]
        z_score = udf(lambda v: old_div((v - col_mean) * 1.0, col_sd)
                      if v is not None else v)
        out_col = column_name + "_fs_standardized"
        self._data_frame = self._data_frame.withColumn(
            out_col, z_score(col(column_name)))
        self._data_frame = self._data_frame.withColumn(
            out_col, self._data_frame[out_col].cast('float'))
        return self._data_frame

    '''Min-max scales the column ==> values produced lie in the [0, 1] range'''

    def normalize_column(self, column_name):
        """Append ``<column_name>_fs_normalized`` holding the min-max
        scaled value ``(x - min) / (max - min)`` cast to float; nulls pass
        through unchanged.

        NOTE(review): a constant column makes the denominator zero, which
        raises at UDF evaluation time — unchanged from the original.
        """
        def normalize_column_helper(col_min, col_max):
            # Locals renamed from min/max so the builtins are not shadowed.
            return udf(lambda x: old_div((x - col_min) * 1.0,
                                         (col_max - col_min))
                       if x is not None else x)

        col_max = self._data_frame.select(F.max(column_name)).collect()[0][0]
        col_min = self._data_frame.select(F.min(column_name)).collect()[0][0]
        self._data_frame = self._data_frame.withColumn(
            column_name + "_fs_normalized",
            normalize_column_helper(col_min, col_max)(col(column_name)))
        self._data_frame = self._data_frame.withColumn(
            column_name + "_fs_normalized",
            self._data_frame[column_name + "_fs_normalized"].cast('float'))
        return self._data_frame

    def replacerUDF(self, value, operation):
        """Return a Spark UDF applying ``operation`` with operand ``value``
        to each non-null cell (nulls pass through unchanged).

        Supported operations: "prod", "add", "subs", "divide",
        "Reciprocal", "NthRoot", "exponential", "logTransform" (log base
        10), "modulus". Returns None for an unrecognized operation.
        """
        if operation == "prod":
            return udf(lambda x: x * value if x is not None else x)
        if operation == "add":
            return udf(lambda x: x + value if x is not None else x)
        if operation == "subs":
            return udf(lambda x: x - value if x is not None else x)
        if operation == "divide":
            return udf(lambda x: old_div(x, value) if x is not None else x)
        if operation == "Reciprocal":
            return udf(lambda x: old_div(1, x) if x is not None else x)
        if operation == "NthRoot":
            # The original wrapped this in try/except, but constructing a
            # udf never evaluates the lambda, so the except branch was
            # dead. Guard the only failure mode (value == 0) explicitly
            # and fall back to the identity udf the dead branch intended.
            if value == 0:
                return udf(lambda x: x)
            return udf(lambda x: x**(1.0 / value) if x is not None else x)
        if operation == "exponential":
            return udf(lambda x: x**value if x is not None else x)
        if operation == "logTransform":
            return udf(lambda x: math.log(x, 10) if x is not None else x)
        if operation == "modulus":
            return udf(lambda x: abs(x) if x is not None else x)

    def logTransform_column(self, column_name):
        """Append ``<column_name>_vt_log_transformed`` = log10 of the
        column cast to float. When the column minimum is not strictly
        positive the values are first shifted by ``abs(min) + 1`` so the
        logarithm is defined; the shifted scratch column is dropped
        afterwards."""
        out_col = column_name + "_vt_log_transformed"
        column_min = self._data_frame.select(
            F.min(column_name)).collect()[0][0]
        if column_min > 0:
            self._data_frame = self._data_frame.withColumn(
                out_col,
                self.replacerUDF(10, "logTransform")(col(column_name)))
            self._data_frame = self._data_frame.withColumn(
                out_col, self._data_frame[out_col].cast('float'))
        else:
            shift = abs(column_min) + 1
            temp_col = column_name + "_temp_transformed"
            self._data_frame = self._data_frame.withColumn(
                temp_col, self.replacerUDF(shift, "add")(col(column_name)))
            self._data_frame = self._data_frame.withColumn(
                out_col, self.replacerUDF(10, "logTransform")(col(temp_col)))
            self._data_frame = self._data_frame.withColumn(
                out_col, self._data_frame[out_col].cast('float'))
            self._data_frame = self._data_frame.drop(temp_col)
        return self._data_frame

    def modulus_transform_column(self, column_name):
        """Append ``<column_name>_vt_modulus_transformed`` holding the
        absolute value of the column cast to float. (The ``10`` operand is
        ignored by the "modulus" operation.)"""
        new_col = column_name + "_vt_modulus_transformed"
        absolute = self.replacerUDF(10, "modulus")(col(column_name))
        self._data_frame = self._data_frame.withColumn(new_col, absolute)
        self._data_frame = self._data_frame.withColumn(
            new_col, self._data_frame[new_col].cast('float'))
        return self._data_frame

    def cuberoot_transform_column(self, column_name):
        """Append ``<column_name>_vt_cuberoot_transformed`` holding the
        cube root of the column cast to float."""
        new_col = column_name + "_vt_cuberoot_transformed"
        cube_root = self.replacerUDF(3, "NthRoot")(col(column_name))
        self._data_frame = self._data_frame.withColumn(new_col, cube_root)
        self._data_frame = self._data_frame.withColumn(
            new_col, self._data_frame[new_col].cast('float'))
        return self._data_frame

    def squareroot_transform_column(self, column_name):
        """Append ``<column_name>_vt_squareroot_transformed``: the square
        root of the column cast to float when every value is non-negative,
        otherwise a literal 0 column (the root is undefined for
        negatives)."""
        new_col = column_name + "_vt_squareroot_transformed"
        column_min = self._data_frame.select(
            F.min(column_name)).collect()[0][0]
        if column_min < 0:
            self._data_frame = self._data_frame.withColumn(new_col, F.lit(0))
        else:
            self._data_frame = self._data_frame.withColumn(
                new_col, self.replacerUDF(2, "NthRoot")(col(column_name)))
            self._data_frame = self._data_frame.withColumn(
                new_col, self._data_frame[new_col].cast('float'))
        return self._data_frame

    def label_encoding_column(self, column_name):
        """Append ``<column_name>_ed_label_encoded`` with StringIndexer
        labels; unseen/invalid values are kept (handleInvalid="keep")."""
        indexer_model = StringIndexer(
            inputCol=column_name,
            outputCol=column_name + "_ed_label_encoded",
            handleInvalid="keep").fit(self._data_frame)
        encoding_pipeline = Pipeline(stages=[indexer_model])
        self._data_frame = encoding_pipeline.fit(
            self._data_frame).transform(self._data_frame)
        return self._data_frame
#Need to check for an alternative for oneHot Encoding for Pyspark

    def onehot_encoding_column(self, column_name):
        """Append ``<column_name>_ed_one_hot_encoded`` (stringified
        vector): label-encode the column, one-hot encode the labels, then
        drop the intermediate label column."""
        label_col = column_name + "_ed_label_encoded"
        onehot_col = column_name + "_ed_one_hot_encoded"
        self._data_frame = self.label_encoding_column(column_name)
        one_hot = OneHotEncoder(dropLast=False,
                                inputCol=label_col,
                                outputCol=onehot_col)
        self._data_frame = one_hot.transform(self._data_frame)
        self._data_frame = self._data_frame.withColumn(
            onehot_col, self._data_frame[onehot_col].cast('string'))
        self._data_frame = self._data_frame.drop(label_col)
        return self._data_frame

    def character_count_string(self, column_name):
        """Append ``<column_name>_character_count``: the number of
        characters in each cell cast to float; null cells count as 0."""
        def character_count_string_helper():
            # len(x) replaces the original `x.count("") - 1`, an obscure
            # spelling of the same string length.
            return udf(lambda x: len(x) if x is not None else 0)

        self._data_frame = self._data_frame.withColumn(
            column_name + "_character_count",
            character_count_string_helper()(col(column_name)))
        self._data_frame = self._data_frame.withColumn(
            column_name + "_character_count",
            self._data_frame[column_name + "_character_count"].cast('float'))
        return self._data_frame

    def contains_word_helper(self, word):
        """Return a UDF flagging whether ``word`` occurs in the cell text
        (cell lower-cased before the search); null cells yield False."""
        return udf(lambda text: text is not None
                   and text.lower().find(word) != -1)

    def contains_word(self, column_name, word):
        """Append boolean column ``<column_name>_contains_<word>``.
        Note: only the cell text is lower-cased in the helper; ``word`` is
        matched as passed (the lowering call here is commented out)."""
        # word = word.lower()
        flag_column = column_name + "_contains_" + word
        checker = self.contains_word_helper(word)
        self._data_frame = self._data_frame.withColumn(
            flag_column, checker(col(column_name)))
        return self._data_frame

    '''Given that all datetime columns follow same string format == "dd/MM/yyyy" for date'''

    def convert_to_timestamp(self, datetime_col, timeformat):
        """Append ``<datetime_col>_timestamped`` by parsing the string
        column with the supplied ``timeformat`` pattern."""
        source = self._data_frame[datetime_col]
        self._data_frame = self._data_frame.withColumn(
            datetime_col + "_timestamped",
            to_timestamp(source, timeformat).alias(datetime_col))
        return self._data_frame

#Timeformat is hardcoded as "dd/MM/yyyy"

    def count_time_since(self, col_for_time_since, time_since_date):
        """Append ``<col_for_time_since>_time_since``: the number of days
        between ``time_since_date`` (a "dd/MM/yyyy" string) and each value
        of the column.

        The column is copied into a scratch column via ``self.to_date_``;
        when a datetime format can be detected from its distinct values,
        the scratch column is additionally parsed with
        ``datetime.strptime``. All scratch columns are dropped before
        returning.
        """
        temp_col = col_for_time_since + '_temp'

        def _append_day_diff():
            # Shared tail of both branches (was duplicated in try/except):
            # materialize the reference date, compute the day difference,
            # drop the scratch reference columns.
            self._data_frame = self._data_frame.withColumn(
                "TIME_SINCE_DATE", F.lit(time_since_date))
            self._data_frame = self._data_frame.withColumn(
                "TIME_SINCE_DATE(Timestamped)",
                to_timestamp(self._data_frame["TIME_SINCE_DATE"],
                             "dd/MM/yyyy"))
            self._data_frame = self._data_frame.withColumn(
                col_for_time_since + "_time_since",
                datediff(self._data_frame["TIME_SINCE_DATE(Timestamped)"],
                         self._data_frame[temp_col]))
            self._data_frame = self._data_frame.drop(
                "TIME_SINCE_DATE", "TIME_SINCE_DATE(Timestamped)")

        self._data_frame = self._data_frame.withColumn(
            temp_col, self.to_date_(col_for_time_since))
        uniqueVals = self._data_frame.select(
            temp_col).distinct().na.drop().limit(1000).collect()
        try:
            # get_datetime_format raises TypeError when no format can be
            # inferred; in that case the scratch column is used unparsed.
            date_format = self._metaHelperInstance.get_datetime_format(
                uniqueVals)
            to_date_udf = udf(
                lambda x: datetime.strptime(x, date_format)
                if x is not None else x, DateType())
            self._data_frame = self._data_frame.withColumn(
                temp_col, to_date_udf(col(temp_col)))
        except TypeError:
            pass
        _append_day_diff()
        self._data_frame = self._data_frame.drop(temp_col)
        return self._data_frame

#TODO - Check for timestamp conversion related issues if any

    def month_to_string(self, dict):
        """Return a UDF mapping a month number (any int-convertible value)
        to its name via the supplied lookup ``dict`` (e.g.
        ``{1: "January", ...}``); unmapped numbers and nulls yield null.

        NOTE: the parameter name shadows the builtin ``dict``; kept for
        backward compatibility with existing callers.
        """
        # dict.get replaces the original linear scan over the keys and
        # returns None for a missing key exactly as the scan did.
        return udf(lambda x: dict.get(int(x)) if x is not None else x)

#Timeformat is hardcoded as "dd/MM/yyyy"

    def extract_datetime_info(self, datetime_col, info_to_extract):
        """Append one column derived from ``datetime_col`` according to
        ``info_to_extract`` (one of "year", "month_of_year",
        "day_of_month", "day_of_year", "day_of_week", "week_of_year",
        "hour", "minute", "date"); any other value is a no-op.

        The column is copied into a scratch column via ``self.to_date_``;
        when a datetime format can be detected from its distinct values
        the scratch column is parsed with ``datetime.strptime`` first.
        The scratch column is dropped before returning.
        """
        temp_col = datetime_col + '_temp'

        def _as_timestamp():
            # Current scratch column parsed with the hardcoded dd/MM/yyyy
            # pattern; evaluated lazily inside each branch so it always
            # sees the latest dataframe.
            return to_timestamp(self._data_frame[temp_col], "dd/MM/yyyy")

        def _append_extracted():
            # One output column per requested piece of information. The
            # options are mutually exclusive, so an elif chain replaces
            # the original duplicated if blocks in try and except.
            if info_to_extract == "year":
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_year", year(_as_timestamp()))
            elif info_to_extract == "month_of_year":
                month_names = {
                    1: "January", 2: "February", 3: "March", 4: "April",
                    5: "May", 6: "June", 7: "July", 8: "August",
                    9: "September", 10: "October", 11: "November",
                    12: "December"
                }
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_month", month(_as_timestamp()))
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_etf_month_of_year",
                    self.month_to_string(month_names)(
                        col(datetime_col + "_month")))
            elif info_to_extract == "day_of_month":
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_day_of_month",
                    dayofmonth(_as_timestamp()))
            elif info_to_extract == "day_of_year":
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_day_of_year",
                    dayofyear(_as_timestamp()))
            elif info_to_extract == "day_of_week":
                # NOTE: operates on the raw scratch column, not the parsed
                # timestamp — kept exactly as in the original.
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_etf_day_of_week",
                    dayofweek(temp_col))
            elif info_to_extract == "week_of_year":
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_week_of_year",
                    weekofyear(_as_timestamp()))
            elif info_to_extract == "hour":
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_hour", hour(_as_timestamp()))
            elif info_to_extract == "minute":
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_minute", minute(_as_timestamp()))
            elif info_to_extract == "date":
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_date",
                    _as_timestamp().cast("date"))

        self._data_frame = self._data_frame.withColumn(
            temp_col, self.to_date_(datetime_col))
        uniqueVals = self._data_frame.select(
            temp_col).distinct().na.drop().limit(10).collect()
        try:
            date_format = self._metaHelperInstance.get_datetime_format(
                uniqueVals)
            to_date_udf = udf(
                lambda x: datetime.strptime(x, date_format)
                if x is not None else x, DateType())
            self._data_frame = self._data_frame.withColumn(
                temp_col,
                to_date_udf(self._data_frame[temp_col]).alias(temp_col))
        except TypeError:
            # No recognizable format — extract from the unparsed scratch
            # column exactly as the original except branch did.
            pass
        _append_extracted()
        self._data_frame = self._data_frame.drop(temp_col)
        return self._data_frame

    def is_weekend_helper(self):
        """Return a UDF mapping a day number to a weekend flag: True when
        the (int-converted) day number is 6 or greater, False otherwise;
        nulls pass through unchanged."""
        return udf(lambda day: (int(day) >= 6) if day is not None else day)

#Timeformat is hardcoded as "dd/MM/yyyy"

    def is_weekend(self, datetime_col):
        """Append boolean column ``<datetime_col>_is_weekend``.

        The datetime column is copied to a scratch column, optionally
        parsed with a detected format, reduced to a day number, and
        flagged via ``is_weekend_helper`` (day >= 6). All scratch columns
        are dropped before returning.

        NOTE(review): the day number comes from ``dayofmonth`` (1-31),
        not ``dayofweek`` — so the flag is True for days 6..31 of the
        month rather than for actual weekends. Looks like a bug; confirm
        intent before changing.
        """
        # self._metaHelperInstance = MetaDataHelper(self._data_frame, self._data_frame.count())
        self._data_frame = self._data_frame.withColumn(
            datetime_col + '_temp', self.to_date_(datetime_col))
        # Sample distinct values so the meta helper can sniff a datetime format.
        uniqueVals = self._data_frame.select(
            datetime_col + '_temp').distinct().na.drop().limit(10).collect()
        try:
            # get_datetime_format raises TypeError when no format is found.
            date_format = self._metaHelperInstance.get_datetime_format(
                uniqueVals)
            # Parse the scratch column with the detected format; nulls pass through.
            to_date_udf = udf(
                lambda x: datetime.strptime(x, date_format)
                if x != None else x, DateType())
            self._data_frame = self._data_frame.withColumn(
                datetime_col + '_temp',
                to_date_udf(col(datetime_col + '_temp')))
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_day", dayofmonth(datetime_col + '_temp'))
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_is_weekend",
                self.is_weekend_helper()(col(datetime_col + "_day")))
            self._data_frame = self._data_frame.drop(datetime_col + "_day")

        except TypeError:
            # No detectable format: flag directly from the unparsed scratch column.
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_day", dayofmonth(datetime_col + '_temp'))
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_is_weekend",
                self.is_weekend_helper()(col(datetime_col + "_day")))
            self._data_frame = self._data_frame.drop(datetime_col + "_day")
        self._data_frame = self._data_frame.drop(datetime_col + '_temp')

        # self._data_frame = self._data_frame.withColumn(datetime_col, to_timestamp(self._data_frame[datetime_col], "dd/MM/yyyy"))
        # self._data_frame = self._data_frame.withColumn(datetime_col, F.from_unixtime(F.unix_timestamp(self._data_frame[datetime_col]), "dd/MM/yyyy"))
        return self._data_frame
class FeatureEngineeringHelperPandas(object):
    """Contains Feature Engineering Operation Functions"""

    def __init__(self, df, dataframe_context):
        """Store the pandas dataframe and its context.

        A MetaDataHelper is built eagerly from the dataframe and its row
        count for later datetime-format detection.
        """
        self._data_frame = df
        self._metaHelperInstance = MetaDataHelper(self._data_frame, self._data_frame.shape[0])
        self._dataframe_context = dataframe_context

        # self._dataframe_helper = dataframe_helper

    def binning_all_measures(self, number_of_bins, consider_cols):
        numeric_columns = []
        cols_to_be_binned = [x[:-4] for x in consider_cols if x[-4:] == "_bin"]
        numeric_columns = [col for col in self._data_frame.columns if
                           self._data_frame[col].dtypes in ['int32', 'int64', 'float32', 'float64', 'int', 'float']]
        for column_name in numeric_columns and cols_to_be_binned:
            self._data_frame = self.create_equal_sized_measure_bins(column_name, number_of_bins)
        return self._data_frame

    def check_key(self, x, bin_label):
        """Return the label of the first bin whose [low, high] interval
        contains ``x``; the string "None" for a null ``x``; implicit None
        when no bin matches (quirk preserved from the original)."""
        if x is None:
            return "None"
        for label, (low, high) in bin_label.items():
            if low <= x <= high:
                return label

    def create_level(self, x, level_dict, selected_list):
        """Map ``x`` to the level whose member list contains it.

        Values outside ``selected_list`` are returned unchanged; members
        are mapped to the first level listing them. An empty
        ``level_dict`` yields implicit None (quirk preserved from the
        original)."""
        for level in level_dict:
            if x not in selected_list:
                return x
            if x in level_dict[level]:
                return level

    def create_new_levels_dimension(self, column_name, level_dict):
        selected_list = []
        for key in list(level_dict.keys()):
            selected_list = selected_list + level_dict[key]
        self._data_frame[column_name + "_level"] = self._data_frame[column_name].apply(self.create_level,
                                                                                       level_dict=level_dict,
                                                                                       selected_list=selected_list)
        return self._data_frame

    def create_level_udf_time(self, dict, date_format):
        """Placeholder for time-level mapping (pandas counterpart of the
        Spark helper); intentionally unimplemented, returns None."""
        pass

    def convert_to_date(self, value, date_format):
        """Parse ``value`` to a pandas Timestamp when it is a string,
        otherwise return it unchanged.

        Note: parsing always uses the hardcoded '%d/%m/%Y' pattern; the
        ``date_format`` parameter is accepted for interface compatibility
        but — as in the original — never used.
        """
        # The original had two identical `isinstance(value, str)`
        # branches; the duplicate dead branch was removed.
        if isinstance(value, str):
            return pd.to_datetime(value, format='%d/%m/%Y')
        return value

    def convert_to_date_from_level_value(self, value):
        """Parse a 'dd/mm/yyyy' level-boundary string into a Timestamp."""
        return pd.to_datetime(value, format='%d/%m/%Y')

    def check_key_date_bins(self, date, dict, date_format):
        if date is not None:
            date = self.convert_to_date(date, date_format)
            for key, value in list(dict.items()):
                val1_date = self.convert_to_date_from_level_value(value[0])
                val2_date = self.convert_to_date_from_level_value(value[1])
                date_range = [val1_date, val2_date]
                if date >= date_range[0] and date <= date_range[1]:
                    return key
                else:
                    return "None"

    def create_new_levels_datetimes(self, col_for_timelevels, dict):
        self._data_frame[col_for_timelevels + '_temp'] = pd.to_datetime(self._data_frame[col_for_timelevels],
                                                                        errors='ignore')
        unique_vals = self._data_frame[col_for_timelevels + '_temp'].head(15)
        try:
            date_format = self._metaHelperInstance.get_datetime_format_pandas(unique_vals)
        except:
            date_format = None
        self._data_frame[col_for_timelevels + '_t_level'] = self._data_frame[col_for_timelevels + '_temp'].apply(
            self.check_key_date_bins, dict=dict, date_format=date_format)
        self._data_frame = self._data_frame.drop(col_for_timelevels + '_temp', axis=1)
        return self._data_frame

    def create_equal_sized_measure_bins(self, column_name, number_of_bins):
        def create_dict_for_bin():
            min_value = np.min(self._data_frame[column_name])
            max_value = np.max(self._data_frame[column_name])
            interval_size = (old_div((max_value - min_value) * 1.0, (number_of_bins - 1)))
            bin_dict = {}
            temp = min_value
            while temp <= max_value:
                bin_dict[str(round(temp, 3)) + "-" + str(round(temp + interval_size, 3))] = [temp, temp + interval_size]
                temp = temp + interval_size
            return bin_dict

        bin_dict = create_dict_for_bin()
        self._data_frame[column_name + "_bin"] = self._data_frame[column_name].apply(self.check_key, bin_label=bin_dict)
        return self._data_frame

    def create_custom_measure_bins(self, column_name, list_of_intervals):
        def create_dict_for_bin():
            min_value = np.min(self._data_frame[column_name])
            max_value = np.max(self._data_frame[column_name])
            bin_dict = {}
            if list_of_intervals[0] > min_value:
                bin_dict[str(min_value) + "-" + str(list_of_intervals[0])] = [min_value, list_of_intervals[0]]
            for i in range(len(list_of_intervals)):
                if i + 2 <= len(list_of_intervals):
                    bin_dict[str(list_of_intervals[i]) + "-" + str(list_of_intervals[i + 1])] = [list_of_intervals[i],
                                                                                                 list_of_intervals[
                                                                                                     i + 1]]
            if list_of_intervals[-1] < max_value:
                bin_dict[str(list_of_intervals[-1]) + "-" + str(max_value)] = [list_of_intervals[-1], max_value]

            return bin_dict

        bin_dict = create_dict_for_bin()
        self._data_frame[column_name + "_c_bin"] = self._data_frame[column_name].apply(self.check_key,
                                                                                       bin_label=bin_dict)
        return self._data_frame

    def replace_values_in_column(self, column_name, range, value):
        """Replace every occurrence of `range` in `column_name` with a substitute.

        The result is stored in a new column named
        ``<column_name>_treated_<range>_<suffix>``. `value` selects the
        substitute: "median"/"mode" are looked up via
        DataPreprocessingHelperPandas, "mean" uses np.mean, and anything else
        is used literally (and becomes the column-name suffix).

        NOTE: the parameter name `range` shadows the builtin but is kept for
        caller compatibility.
        """
        if value in ("median", "mode"):
            dp_helper_obj = DataPreprocessingHelperPandas(self._data_frame, self._dataframe_context)
            if value == "median":
                replace_value = dp_helper_obj.get_median(self._data_frame, column_name)
            else:
                replace_value = dp_helper_obj.get_mode(self._data_frame, column_name)
            suffix = value
        elif value == "mean":
            replace_value = np.mean(self._data_frame[column_name])
            suffix = "mean"
        else:
            replace_value = value
            suffix = str(replace_value)
        treated_col = column_name + "_treated_" + str(range) + "_" + suffix
        self._data_frame[treated_col] = self._data_frame[column_name].apply(
            lambda x: replace_value if x == range else x)
        return self._data_frame

    def standardize_column(self, column_name):
        """Append a z-score standardized copy of `column_name`.

        The new column ``<column_name>_fs_standardized`` holds
        ``round((x - mean) / std, 3)``; None values pass through unchanged.

        Fix: the old helper returned a one-element *list* containing the
        lambda, which made ``Series.apply`` fall through to
        ``Series.aggregate``; the callable is now applied directly.
        """
        mean = self._data_frame[column_name].mean()
        std_dev = self._data_frame[column_name].std()
        self._data_frame[column_name + '_fs_standardized'] = self._data_frame[column_name].apply(
            lambda x: round(float((x - mean) * 1.0 / std_dev), 3) if x is not None else x)
        return self._data_frame

    def normalize_column(self, column_name):
        """Append a min-max normalized copy of `column_name`.

        The new column ``<column_name>_fs_normalized`` holds
        ``round((x - min) / (max - min), 3)``; None values pass through.

        Fix: the old helper returned a one-element *list* containing the
        lambda (routing ``Series.apply`` through ``Series.aggregate``) and
        its parameters shadowed the ``min``/``max`` builtins; the callable
        is now applied directly.
        """
        max_value, min_value = self._data_frame[column_name].max(), self._data_frame[column_name].min()
        self._data_frame[column_name + '_fs_normalized'] = self._data_frame[column_name].apply(
            lambda x: round(float((x - min_value) * 1.0 / (max_value - min_value)), 3) if x is not None else x)
        return self._data_frame

    def replacerUDF(self, value, operation):
        """Return a per-element transform callable for `operation`.

        The callable maps None to None and otherwise applies the arithmetic
        operation with `value` as the operand; it is intended for
        ``Series.apply``. Returns None for an unknown operation (preserving
        the original fall-through behaviour).

        Fixes: "divide" used ``int(x, value)`` and "Reciprocal" used
        ``int(1, x)`` — both raise TypeError at call time; they now divide.
        The lambdas are returned directly instead of wrapped in one-element
        lists (the list form pushed ``Series.apply`` through aggregate).
        The old try/except around "NthRoot" was dead code: building a lambda
        cannot raise.
        """
        if operation == "prod":
            return lambda x: x * value if x is not None else x
        if operation == "add":
            return lambda x: x + value if x is not None else x
        if operation == "subs":
            return lambda x: x - value if x is not None else x
        if operation == "divide":
            # BUG FIX: was int(x, value) -> TypeError; divide by `value`.
            return lambda x: x / value if x is not None else x
        if operation == "Reciprocal":
            # BUG FIX: was int(1, x) -> TypeError; take 1/x.
            return lambda x: 1.0 / x if x is not None else x
        if operation == "NthRoot":
            return lambda x: x ** (1.0 / value) if x is not None else x
        if operation == "exponential":
            return lambda x: x ** value if x is not None else x
        if operation == "logTransform":
            # `value` is ignored here; all callers request base-10 logs.
            return lambda x: math.log(x, 10) if x is not None else x
        if operation == "modulus":
            # `value` is ignored for absolute value.
            return lambda x: abs(x) if x is not None else x

    def logTransform_column(self, column_name):
        """Append a base-10 log-transformed copy of `column_name` as float.

        If the column minimum is <= 0, values are first shifted by
        ``abs(min) + 1`` so every value is strictly positive. The result is
        stored in ``<column_name>_vt_log_transformed``.

        Fix: previously went through replacerUDF's list-wrapped lambda
        (routing ``Series.apply`` through aggregate) and needed a scratch
        "_temp_transformed" column; the shift and log are now computed
        directly.
        """
        column_min = self._data_frame[column_name].min()
        values = self._data_frame[column_name]
        if column_min <= 0:
            # Shift so the smallest value becomes exactly 1 before the log.
            values = values + (abs(column_min) + 1)
        self._data_frame[column_name + "_vt_log_transformed"] = values.apply(
            lambda x: math.log(x, 10) if x is not None else x).astype('float')
        return self._data_frame

    def modulus_transform_column(self, column_name):
        """Append ``abs(x)`` of `column_name` as a float column named
        ``<column_name>_vt_modulus_transformed``.

        Fix: ``replacerUDF(10, "modulus")`` ignored its value argument and
        its list-wrapped lambda pushed ``Series.apply`` through aggregate;
        use the vectorized ``Series.abs()`` directly (NaN passes through,
        matching the old None guard for numeric columns).
        """
        self._data_frame[column_name + "_vt_modulus_transformed"] = \
            self._data_frame[column_name].abs().astype('float')
        return self._data_frame

    def cuberoot_transform_column(self, column_name):
        """Append the cube root of `column_name` as a float column named
        ``<column_name>_vt_cuberoot_transformed``; None passes through.

        Fix: previously went through replacerUDF's list-wrapped lambda
        (routing ``Series.apply`` through aggregate); apply the callable
        directly. As before, negative inputs raise when cast to float
        (``x ** (1/3)`` of a negative float is complex in Python 3).
        """
        self._data_frame[column_name + "_vt_cuberoot_transformed"] = self._data_frame[column_name].apply(
            lambda x: x ** (1.0 / 3) if x is not None else x).astype('float')
        return self._data_frame

    def squareroot_transform_column(self, column_name):
        """Append the square root of `column_name` as a float column named
        ``<column_name>_vt_squareroot_transformed``.

        If any value is negative the whole output column is the constant 0
        (the original fallback behaviour, preserved for compatibility).

        Fix: previously went through replacerUDF's list-wrapped lambda
        (routing ``Series.apply`` through aggregate); apply the callable
        directly (``x ** 0.5`` == ``x ** (1.0/2)``).
        """
        out_col = column_name + "_vt_squareroot_transformed"
        if self._data_frame[column_name].min() >= 0:
            self._data_frame[out_col] = self._data_frame[column_name].apply(
                lambda x: x ** 0.5 if x is not None else x).astype('float')
        else:
            self._data_frame[out_col] = 0
        return self._data_frame

    def label_encoding_column(self, column_name):
        """Append an integer label-encoded copy of `column_name` as
        ``<column_name>_ed_label_encoded``.

        Values are cast to str first so NaN and mixed types encode
        consistently.
        """
        as_text = self._data_frame[column_name].astype(str)
        self._data_frame[column_name + '_ed_label_encoded'] = LabelEncoder().fit_transform(as_text)
        return self._data_frame

    def onehot_encoding_column(self, column_name):
        """One-hot encode `column_name` (first level dropped), appending one
        int column per remaining level named ``<col>_<level>_one_hot``.

        Nulls are filled in place with the column mode before encoding.
        """
        if self._data_frame[column_name].isnull().any():
            self._data_frame[column_name].fillna(self._data_frame[column_name].mode()[0], inplace=True)
        encoder = OneHotEncoder_pandas(drop='first')
        encoded_array = encoder.fit_transform(self._data_frame[[column_name]]).toarray()
        feature_names = list(encoder.get_feature_names())
        dummies = pd.DataFrame(encoded_array, columns=feature_names, index=self._data_frame.index)
        for feature in feature_names:
            # Encoder feature names look like "<prefix>_<level>"; keep the level part.
            level = feature.partition('_')[2]
            self._data_frame[column_name + '_' + level + '_one_hot'] = dummies[feature].astype('int')
        return self._data_frame

    def character_count_string(self, column_name):
        """Append the character count of each string in `column_name` as a
        float column named ``<column_name>_character_count``; None counts
        as 0.

        (``len(s)`` is equivalent to the old ``s.count("") - 1``.)
        """
        char_counts = self._data_frame[column_name].apply(
            lambda text: len(text) if text is not None else 0)
        self._data_frame[column_name + "_character_count"] = char_counts.astype('float')
        return self._data_frame

    def contains_word(self, column_name, word):
        """Append a boolean column ``<column_name>_contains_<word>`` that is
        True for rows whose lower-cased text contains `word`; None rows are
        False. `word` is matched as-is (not lower-cased here).
        """
        flag_col = column_name + "_contains_" + word
        self._data_frame[flag_col] = self._data_frame[column_name].apply(
            lambda text: text is not None and word in text.lower())
        return self._data_frame

    def convert_to_timestamp(self, datetime_col, timeformat):
        """Convert `datetime_col` using `timeformat`.

        Not implemented — placeholder kept so callers can resolve the
        attribute without error.
        """
        pass

    def count_time_since(self, col_for_time_since, time_since_date):
        """Add ``<col>_time_since``: days (rounded up with np.ceil) from each
        date in `col_for_time_since` until `time_since_date`.

        `time_since_date` is parsed with the '%d/%m/%Y' format. Scratch
        columns TIME_SINCE_DATE, TIME_SINCE_DATE_Timestamped and
        ``<col>_temp`` are created and dropped before returning.
        """
        # Parse the source column up-front; unparseable values become NaT.
        self._data_frame[col_for_time_since + '_temp'] = pd.to_datetime(self._data_frame[col_for_time_since],
                                                                        errors='coerce')
        # Sample of raw values used only to sniff a date format below.
        unique_vals = self._data_frame[col_for_time_since].drop_duplicates().head(10)
        try:
            # NOTE(review): date_format is sniffed from the *raw* values but
            # then applied to the already-parsed '_temp' column, where it is
            # effectively a no-op — confirm intent.
            date_format = self._metaHelperInstance.get_datetime_format_pandas(unique_vals)
            self._data_frame['TIME_SINCE_DATE'] = time_since_date
            self._data_frame[col_for_time_since + '_temp'] = pd.to_datetime(
                self._data_frame[col_for_time_since + '_temp'], format=date_format)
            self._data_frame['TIME_SINCE_DATE_Timestamped'] = pd.to_datetime(self._data_frame['TIME_SINCE_DATE'],
                                                                             format='%d/%m/%Y')
            # Day difference = (reference - value), converted to whole days.
            self._data_frame[col_for_time_since + "_time_since"] = self._data_frame['TIME_SINCE_DATE_Timestamped'] - \
                                                                   self._data_frame[col_for_time_since + '_temp']
            self._data_frame[col_for_time_since + "_time_since"] = self._data_frame[
                                                                       col_for_time_since + "_time_since"] / np.timedelta64(
                1, 'D')
            self._data_frame[col_for_time_since + "_time_since"] = self._data_frame[
                col_for_time_since + "_time_since"].apply(np.ceil)
        except:
            # Fallback: same computation, but let pandas infer the reference
            # date format instead of using the sniffed one.
            self._data_frame['TIME_SINCE_DATE'] = time_since_date
            self._data_frame['TIME_SINCE_DATE_Timestamped'] = pd.to_datetime(self._data_frame['TIME_SINCE_DATE'],
                                                                             format='%d/%m/%Y',
                                                                             infer_datetime_format=True)
            self._data_frame[col_for_time_since + "_time_since"] = self._data_frame['TIME_SINCE_DATE_Timestamped'] - \
                                                                   self._data_frame[col_for_time_since + '_temp']
            self._data_frame[col_for_time_since + "_time_since"] = self._data_frame[
                                                                       col_for_time_since + "_time_since"] / np.timedelta64(
                1, 'D')
            self._data_frame[col_for_time_since + "_time_since"] = self._data_frame[
                col_for_time_since + "_time_since"].apply(np.ceil)
        # Drop the scratch columns before handing the frame back.
        self._data_frame = self._data_frame.drop(["TIME_SINCE_DATE", "TIME_SINCE_DATE_Timestamped"], axis=1)
        self._data_frame = self._data_frame.drop([col_for_time_since + '_temp'], axis=1)
        return self._data_frame

    def month_to_string(self, dict):
        """Map month values using `dict` (parameter shadows the builtin; name
        kept for caller compatibility).

        Not implemented — placeholder kept so callers can resolve the
        attribute without error.
        """
        pass

    def extract_datetime_info(self, datetime_col, info_to_extract):
        """Extract one datetime component of `datetime_col` into a new column.

        `info_to_extract` selects the component ("year", "month_of_year",
        "day_of_month", "day_of_year", "day_of_week", "week_of_year",
        "hour", "minute", "date"); any other value leaves the frame
        unchanged apart from the to_datetime conversion of `datetime_col`.

        Fix: the original duplicated the entire extraction ladder in both
        the try and the except branch — the try branch only added an unused
        date-format lookup and a misleading dangling ``else`` — so the
        ladder is collapsed into a single dispatch table.
        """
        # errors='ignore' keeps the column as-is when it cannot be parsed,
        # matching the original behaviour.
        self._data_frame[datetime_col] = pd.to_datetime(self._data_frame[datetime_col], errors='ignore')
        # info key -> (suffix of the new column, pandas .dt accessor attribute)
        # NOTE: .weekofyear was removed in pandas >= 2.0; retained to match
        # the original behaviour on the pandas version this file targets.
        component_map = {
            "year": ("_year", "year"),
            "month_of_year": ("_etf_month_of_year", "month"),
            "day_of_month": ("_day_of_month", "day"),
            "day_of_year": ("_day_of_year", "dayofyear"),
            "day_of_week": ("_etf_day_of_week", "dayofweek"),
            "week_of_year": ("_week_of_year", "weekofyear"),
            "hour": ("_hour", "hour"),
            "minute": ("_minute", "minute"),
            "date": ("_date", "date"),
        }
        if info_to_extract in component_map:
            suffix, attribute = component_map[info_to_extract]
            self._data_frame[datetime_col + suffix] = getattr(
                self._data_frame[datetime_col].dt, attribute)
        return self._data_frame

    def is_weekend_helper(self):
        """Not implemented — placeholder kept so callers can resolve the
        attribute without error.
        """
        pass

    def is_weekend(self, datetime_col):
        """Append ``<datetime_col>is_weekend`` with the string 'True' for
        Saturday/Sunday rows and 'False' otherwise.

        Unparseable dates become NaT; their dayofweek comparison is False,
        so they are labelled 'False'. (The output column name has no
        underscore before "is_weekend" — preserved for compatibility.)

        Fix: the old try branch computed dayofweek from the *source* column
        instead of the parsed scratch column, so for string columns it
        always raised and fell into the except branch (and its sniffed
        date_format was never used); compute from the parsed column
        directly.
        """
        temp_col = datetime_col + '_temp'
        self._data_frame[temp_col] = pd.to_datetime(self._data_frame[datetime_col], errors='coerce')
        self._data_frame[datetime_col + 'is_weekend'] = np.where(
            self._data_frame[temp_col].dt.dayofweek >= 5, 'True', 'False')
        self._data_frame = self._data_frame.drop(temp_col, axis=1)
        return self._data_frame
# Code example #6 (scrape artifact: "コード例 #6" marker, vote count "0" —
# commented out so the file remains valid Python)
    def run(self):
        self._start_time = time.time()
        metaHelperInstance = MetaDataHelper(self._data_frame, self._total_rows)
        sampleData = metaHelperInstance.get_sample_data()
        if not self._pandas_flag:
            sampleData = sampleData.toPandas()
        time_taken_sampling = time.time()-self._start_time
        self._completionStatus += self._scriptStages["sampling"]["weight"]
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "sampling",\
                                    "info",\
                                    self._scriptStages["sampling"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,progressMessage,ignore=self._ignoreMsgFlag)

        metaData = []
        metaData.append(MetaData(name="noOfRows",value=self._total_rows,display=True,displayName="Rows"))
        metaData.append(MetaData(name="noOfColumns",value=self._total_columns,display=True,displayName="Columns"))
        # self._percentage_columns = metaHelperInstance.get_percentage_columns(self._string_columns)
        separation_time=time.time()
        self._timestamp_string_columns=[]
        uniqueVals = []
        dateTimeSuggestions = {}
        if not self._pandas_flag:
            for column in self._string_columns:
                if self._column_type_dict[column]["actual"] != "boolean":
                    # uniqueVals = self._data_frame.select(column).na.drop().distinct().limit(10).collect()
                    uniqueVals = sampleData[column].unique().tolist()
                else:
                    uniqueVals = []
                ## TODO : remove pandas if not needed later
                if self._pandas_flag:
                    if len(uniqueVals) > 0 and metaHelperInstance.get_datetime_format_pandas([self._data_frame.sort_values(by=column,ascending=False)[column][0]])!=None:
                        dateColumnFormat = metaHelperInstance.get_datetime_format_pandas(uniqueVals)
                    else:
                        dateColumnFormat = None
                else:
                    if len(uniqueVals) > 0 and metaHelperInstance.get_datetime_format([self._data_frame.orderBy([column],ascending=[False]).select(column).first()[0]])!=None:
                        dateColumnFormat = metaHelperInstance.get_datetime_format(uniqueVals)
                    else:
                        dateColumnFormat = None

                if dateColumnFormat:
                    dateTimeSuggestions.update({column:dateColumnFormat})
                    data=ColumnData()
                    data.set_level_count_to_null()
                    data.set_chart_data_to_null()
                    data.set_date_suggestion_flag(True)
                    data.set_abstract_datatype("datetime")
                    data.set_actual_datatype("datetime")
                    self._timestamp_string_columns.append(column)
                    ## TO DO : remove pandas if not needed later
                    if self._pandas_flag:
                        self._data_frame[column] = pd.to_datetime(self._data_frame[column],format=dateColumnFormat)
                    else:
                        self._data_frame = self._data_frame.withColumn(column, self.to_date_(column))
        sampleData = metaHelperInstance.format_sampledata_timestamp_columns(sampleData,self._timestamp_columns,self._stripTimestamp)
        print("sampling takes",time_taken_sampling)
        self._string_columns = list(set(self._string_columns)-set(self._timestamp_string_columns))

        self._timestamp_columns = self._timestamp_columns+self._timestamp_string_columns
        # self.update_column_type_dict()

        print("time taken for separating date columns from string is :", time.time()-separation_time)


        # if len(self._percentage_columns)>0:
        #     self._data_frame = CommonUtils.convert_percentage_columns(self._data_frame,self._percentage_columns)
        #     self._numeric_columns = self._numeric_columns + self._percentage_columns
        #     self._string_columns = list(set(self._string_columns)-set(self._percentage_columns))
        #     self.update_column_type_dict()

        # self._dollar_columns = metaHelperInstance.get_dollar_columns(self._string_columns)
        # if len(self._dollar_columns)>0:
        #     self._data_frame = CommonUtils.convert_dollar_columns(self._data_frame,self._dollar_columns)
        #     self._numeric_columns = self._numeric_columns + self._dollar_columns
        #     self._string_columns = list(set(self._string_columns)-set(self._dollar_columns))
        #     self.update_column_type_dict()


        columnData = []
        headers = []

        self._start_time = time.time()
        print("Count of Numeric columns",len(self._numeric_columns))
        try:
            measureColumnStat,measureCharts = metaHelperInstance.calculate_measure_column_stats(self._data_frame,self._numeric_columns,binColumn=self._binned_stat_flag,pandas_flag=self._pandas_flag)
        except Exception as e:
            raise Exception(e)
        time_taken_measurestats = time.time()-self._start_time
        self._completionStatus += self._scriptStages["measurestats"]["weight"]
        print("measure stats takes",time_taken_measurestats)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "measurestats",\
                                    "info",\
                                    self._scriptStages["measurestats"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,progressMessage,ignore=self._ignoreMsgFlag)
        print("Count of DateTime columns",len(self._timestamp_columns))

        self._start_time = time.time()
        # time_columns=self._timestamp_columns
        # time_string_columns=self._timestamp_string_columns
        # original_timestamp_columns=list(set(self._timestamp_columns)-set(self._timestamp_string_columns))
        timeDimensionColumnStat,timeDimensionCharts, unprocessed_columns = metaHelperInstance.calculate_time_dimension_column_stats(self._data_frame,self._timestamp_columns,level_count_flag=self._level_count_flag,pandas_flag=self._pandas_flag)
        self._string_columns = self._string_columns + unprocessed_columns
        self._timestamp_columns = list(set(self._timestamp_columns) - set(unprocessed_columns))
        self.update_column_type_dict()


        if len(self._numeric_columns) > 1:
            # print "self._numeric_columns : ", self._numeric_columns
            metaData.append(MetaData(name="measures",value=len(self._numeric_columns),display=True,displayName="Measures"))
        else:
            metaData.append(MetaData(name="measures",value=len(self._numeric_columns),display=True,displayName="Measure"))
        if len(self._string_columns) > 1:
            metaData.append(MetaData(name="dimensions",value=len(self._string_columns+self._boolean_columns),display=True,displayName="Dimensions"))
        else:
            metaData.append(MetaData(name="dimensions",value=len(self._string_columns+self._boolean_columns),display=True,displayName="Dimension"))
        if len(self._timestamp_columns) > 1:
            metaData.append(MetaData(name="timeDimension",value=len(self._timestamp_columns),display=True,displayName="Time Dimensions"))
        else:
            metaData.append(MetaData(name="timeDimension",value=len(self._timestamp_columns),display=True,displayName="Time Dimension"))

        metaData.append(MetaData(name="measureColumns",value = self._numeric_columns,display=False))
        metaData.append(MetaData(name="dimensionColumns",value = self._string_columns+self._boolean_columns,display=False))
        metaData.append(MetaData(name="timeDimensionColumns",value = self._timestamp_columns,display=False))
        # metaData.append(MetaData(name="percentageColumns",value = self._percentage_columns,display=False))
        # metaData.append(MetaData(name="dollarColumns",value = self._dollar_columns,display=False))

        # timeDimensionColumnStat2,timeDimensionCharts2,unprocessed_columns = metaHelperInstance.calculate_time_dimension_column_stats_from_string(self._data_frame,self._timestamp_string_columns,level_count_flag=self._level_count_flag)
        # gc.collect()
        # timeDimensionColumnStat.update(timeDimensionColumnStat2)
        # timeDimensionCharts.update(timeDimensionCharts2)
        time_taken_tdstats = time.time()-self._start_time
        self._completionStatus += self._scriptStages["timedimensionstats"]["weight"]
        print("time dimension stats takes",time_taken_tdstats)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "timedimensionstats",\
                                    "info",\
                                    self._scriptStages["timedimensionstats"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,progressMessage,ignore=self._ignoreMsgFlag)

        self._start_time = time.time()
        try :
            dimensionColumnStat,dimensionCharts = metaHelperInstance.calculate_dimension_column_stats(self._data_frame,self._string_columns+self._boolean_columns,levelCount=self._level_count_flag,pandas_flag=self._pandas_flag)
        except Exception as e:
            raise Exception(e)
        self._dataSize["dimensionLevelCountDict"] = {k:[x for x in v if x["name"]=="numberOfUniqueValues"][0]["value"] for k,v in list(dimensionColumnStat.items())}
        self._dataSize["totalLevels"] = sum(self._dataSize["dimensionLevelCountDict"].values())

        time_taken_dimensionstats = time.time()-self._start_time
        self._completionStatus += self._scriptStages["dimensionstats"]["weight"]
        # print "dimension stats takes",time_taken_dimensionstats
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "dimensionstats",\
                                    "info",\
                                    self._scriptStages["dimensionstats"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,progressMessage,ignore=self._ignoreMsgFlag)

        self._start_time = time.time()
        ignoreColumnSuggestions = []
        ignoreColumnReason = []
        utf8ColumnSuggestion = []

        dup_cols = []
        #columns = self._data_frame.columns
        measureDupCols=self.checkDupColName(measureColumnStat)
        dimensionDupCols=self.checkDupColName(dimensionColumnStat)
        timeDimensionDupCols=self.checkDupColName(timeDimensionColumnStat)
        if self._pandas_flag:
            for i in measureDupCols:
                if self.checkDuplicateCols_pandas(i[0],i[1]) == True:
                    for j in i[1:]:
                        if dict(name="Duplicate",value=True) not in measureColumnStat[j]:
                            measureColumnStat[j].append(dict(name="Duplicate",value=i[0]))
            for i in dimensionDupCols:
                if self.checkDuplicateCols_pandas(i[0],i[1]) == True:
                    for j in i[1:]:
                        if dict(name="Duplicate",value=True) not in dimensionColumnStat[j]:
                            dimensionColumnStat[j].append(dict(name="Duplicate",value=i[0]))
            for i in timeDimensionDupCols:
                if self.checkDuplicateCols_pandas(i[0],i[1]) == True:
                    for j in i[1:]:
                        if dict(name="Duplicate",value=True) not in timeDimensionColumnStat[j]:
                            timeDimensionColumnStat[j].append(dict(name="Duplicate",value=i[0]))
        else:
            for i in measureDupCols:
                if self.checkDuplicateCols(i[0],i[1]) == True:
                    for j in i[1:]:
                        if dict(name="Duplicate",value=True) not in measureColumnStat[j]:
                            measureColumnStat[j].append(dict(name="Duplicate",value=i[0]))
            for i in dimensionDupCols:
                if self.checkDuplicateCols(i[0],i[1],True) == True:
                    for j in i[1:]:
                        if dict(name="Duplicate",value=True) not in dimensionColumnStat[j]:
                            dimensionColumnStat[j].append(dict(name="Duplicate",value=i[0]))
            for i in timeDimensionDupCols:
                if self.checkDuplicateCols(i[0],i[1]) == True:
                    for j in i[1:]:
                        if dict(name="Duplicate",value=True) not in timeDimensionColumnStat[j]:
                            timeDimensionColumnStat[j].append(dict(name="Duplicate",value=i[0]))

        for column in self._data_frame.columns:
            random_slug = uuid.uuid4().hex
            headers.append(ColumnHeader(name=column,slug=random_slug))
            data = ColumnData()
            data.set_slug(random_slug)
            data.set_name(column)
            data.set_abstract_datatype(self._column_type_dict[column]["abstract"])
            data.set_checker(True)
            changeflage=False
            columnStat = []
            columnChartData = None
            check_datatype_change=self.actual_col_datatype_update
            if len(check_datatype_change)!=0:
                for i in check_datatype_change:
                    if list(i.keys())[0]==column:
                        changeflage=True
                        changeType=i[column]
                        break
                    else:
                        changeflage=False
            else:
                changeflage=False
            if self._column_type_dict[column]["abstract"] == "measure":
                data.set_column_stats(measureColumnStat[column])
                data.set_column_chart(measureCharts[column])
                if changeflage:
                    data.set_actual_datatype("dimension")
                else:
                    data.set_actual_datatype(self._column_type_dict[column]["actual"])
            elif self._column_type_dict[column]["abstract"] == "dimension":
                data.set_column_stats(dimensionColumnStat[column])
                data.set_column_chart(dimensionCharts[column])
                if changeflage:
                    data.set_actual_datatype("measure")
                else:
                    data.set_actual_datatype(self._column_type_dict[column]["actual"])
            elif self._column_type_dict[column]["abstract"] == "datetime":
                data.set_column_stats(timeDimensionColumnStat[column])
                data.set_column_chart(timeDimensionCharts[column])
                if changeflage:
                    data.set_actual_datatype("dimension")
                else:
                    data.set_actual_datatype(self._column_type_dict[column]["actual"])
            if self._column_type_dict[column]["abstract"] == "measure":
                #if column not in self._real_columns:
                ignoreSuggestion,ignoreReason = metaHelperInstance.get_ignore_column_suggestions(self._data_frame,self._total_rows,column,"measure",measureColumnStat[column],max_levels=self._max_levels)
                if ignoreSuggestion:
                    ignoreColumnSuggestions.append(column)
                    ignoreColumnReason.append(ignoreReason)
                    #data.set_level_count_to_null()
                    #data.set_chart_data_to_null()
                    data.set_ignore_suggestion_flag(True)
                    data.set_ignore_suggestion_message(ignoreReason)
            elif self._column_type_dict[column]["abstract"] == "dimension":
                ignoreSuggestion,ignoreReason = metaHelperInstance.get_ignore_column_suggestions(self._data_frame,self._total_rows,column,"dimension",dimensionColumnStat[column],max_levels=self._max_levels)
                if ignoreSuggestion:
                    ignoreColumnSuggestions.append(column)
                    ignoreColumnReason.append(ignoreReason)
                    if ignoreReason=="Number of Levels are more than the defined thershold":
                        data.set_ignore_suggestion_preview_flag(False)
                    #data.set_level_count_to_null()
                    #data.set_chart_data_to_null()
                    data.set_ignore_suggestion_flag(True)
                    data.set_ignore_suggestion_message(ignoreReason)
                if self._level_count_flag:
                    utf8Suggestion = metaHelperInstance.get_utf8_suggestions(dimensionColumnStat[column])
                else:
                    utf8Suggestion = False
                if utf8Suggestion:
                    utf8ColumnSuggestion.append(column)
                    ignoreSuggestion,ignoreReason = metaHelperInstance.get_ignore_column_suggestions(self._data_frame,self._total_rows,column,"dimension",dimensionColumnStat[column],max_levels=self._max_levels)
                    if ignoreSuggestion:
                        ignoreColumnSuggestions.append(column)
                        ignoreColumnReason.append(ignoreReason)
                        #data.set_level_count_to_null()
                        #data.set_chart_data_to_null()
                        data.set_ignore_suggestion_flag(True)
                        data.set_ignore_suggestion_message(ignoreReason)

            elif self._column_type_dict[column]["abstract"] == "datetime":
                ignoreSuggestion,ignoreReason = metaHelperInstance.get_ignore_column_suggestions(self._data_frame,self._total_rows,column,"datetime",timeDimensionColumnStat[column],max_levels=self._max_levels)
                if ignoreSuggestion:
                    ignoreColumnSuggestions.append(column)
                    ignoreColumnReason.append(ignoreReason)
                    #data.set_level_count_to_null()
                    #data.set_chart_data_to_null()
                    data.set_ignore_suggestion_flag(True)
                    data.set_ignore_suggestion_message(ignoreReason)
            columnData.append(data)
            if len(uniqueVals) > 0:
                dateColumnFormat = metaHelperInstance.get_datetime_format(uniqueVals)
            else:
                dateColumnFormat = None
            if dateColumnFormat:
                dateTimeSuggestions.update({column:dateColumnFormat})
        for utfCol in utf8ColumnSuggestion:
            ignoreColumnSuggestions.append(utfCol)
            ignoreColumnReason.append("utf8 values present")
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,"custom","info","Validating Metadata Information",self._completionStatus,self._completionStatus,display=True)
        CommonUtils.save_progress_message(self._messageURL,progressMessage,ignore=self._ignoreMsgFlag)
        metaData.append(MetaData(name="ignoreColumnSuggestions",value = ignoreColumnSuggestions,display=False))
        metaData.append(MetaData(name="ignoreColumnReason",value = ignoreColumnReason,display=False))
        metaData.append(MetaData(name="utf8ColumnSuggestion",value = utf8ColumnSuggestion,display=False))
        metaData.append(MetaData(name="dateTimeSuggestions",value = dateTimeSuggestions,display=False))
        metaData.append(MetaData(name="dataSizeSummary",value = self._dataSize,display=False))
        dfMetaData = DfMetaData()
        dfMetaData.set_column_data(columnData)
        dfMetaData.set_header(headers)
        dfMetaData.set_meta_data(metaData)
        dfMetaData.set_sample_data(sampleData)

        time_taken_suggestions = time.time()-self._start_time
        self._completionStatus += self._scriptStages["suggestions"]["weight"]
        # print "suggestions take",time_taken_suggestions
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "suggestions",\
                                    "info",\
                                    self._scriptStages["suggestions"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,progressMessage,ignore=self._ignoreMsgFlag)
        self._dataframe_context.update_completion_status(self._completionStatus)
        return dfMetaData
Code example #7
0
    def run(self):
        """Compute dataset-level metadata and per-column stats and suggestions.

        Orchestrates the full metadata pipeline over the Spark DataFrame in
        ``self._data_frame``:

        1. Draw a sample, convert it to pandas, and format its timestamp
           columns for preview.
        2. Record row/column counts; detect percentage and dollar string
           columns and convert them to numeric measures (updating the
           measure/dimension column lists and the column-type dict).
        3. Compute statistics and chart data for measure, dimension and
           time-dimension columns.
        4. Per column, build a ``ColumnData`` entry and collect
           ignore / utf8 / datetime suggestions.
        5. Assemble everything into a ``DfMetaData`` and return it.

        Progress is reported after each stage through
        ``CommonUtils.save_progress_message``; ``self._completionStatus`` is
        advanced by each stage's configured weight.

        Returns:
            DfMetaData: column data, headers, metadata entries, and the
            formatted pandas sample.
        """
        # --- Stage 1: sampling -------------------------------------------
        self._start_time = time.time()
        metaHelperInstance = MetaDataHelper(self._data_frame, self._total_rows)
        sampleData = metaHelperInstance.get_sample_data()
        # Sample is a Spark DataFrame; bring it to the driver as pandas.
        sampleData = sampleData.toPandas()
        sampleData = metaHelperInstance.format_sampledata_timestamp_columns(
            sampleData, self._timestamp_columns, self._stripTimestamp)
        time_taken_sampling = time.time() - self._start_time
        self._completionStatus += self._scriptStages["sampling"]["weight"]
        print "sampling takes", time_taken_sampling
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "sampling",\
                                    "info",\
                                    self._scriptStages["sampling"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsgFlag)

        # --- Stage 2: dataset-level metadata -----------------------------
        metaData = []
        metaData.append(
            MetaData(name="noOfRows",
                     value=self._total_rows,
                     display=True,
                     displayName="Rows"))
        metaData.append(
            MetaData(name="noOfColumns",
                     value=self._total_columns,
                     display=True,
                     displayName="Columns"))
        # String columns holding percentage-like values get converted to
        # numeric measures and moved out of the dimension list.
        self._percentage_columns = metaHelperInstance.get_percentage_columns(
            self._string_columns)
        if len(self._percentage_columns) > 0:
            self._data_frame = CommonUtils.convert_percentage_columns(
                self._data_frame, self._percentage_columns)
            self._numeric_columns = self._numeric_columns + self._percentage_columns
            self._string_columns = list(
                set(self._string_columns) - set(self._percentage_columns))
            self.update_column_type_dict()

        # Same treatment for dollar-amount string columns.
        self._dollar_columns = metaHelperInstance.get_dollar_columns(
            self._string_columns)
        if len(self._dollar_columns) > 0:
            self._data_frame = CommonUtils.convert_dollar_columns(
                self._data_frame, self._dollar_columns)
            self._numeric_columns = self._numeric_columns + self._dollar_columns
            self._string_columns = list(
                set(self._string_columns) - set(self._dollar_columns))
            self.update_column_type_dict()

        # The if/else pairs below differ only in the singular/plural
        # display name ("Measure" vs "Measures", etc.).
        if len(self._numeric_columns) > 1:
            # print "self._numeric_columns : ", self._numeric_columns
            metaData.append(
                MetaData(name="measures",
                         value=len(self._numeric_columns),
                         display=True,
                         displayName="Measures"))
        else:
            metaData.append(
                MetaData(name="measures",
                         value=len(self._numeric_columns),
                         display=True,
                         displayName="Measure"))
        if len(self._string_columns) > 1:
            metaData.append(
                MetaData(name="dimensions",
                         value=len(self._string_columns +
                                   self._boolean_columns),
                         display=True,
                         displayName="Dimensions"))
        else:
            metaData.append(
                MetaData(name="dimensions",
                         value=len(self._string_columns +
                                   self._boolean_columns),
                         display=True,
                         displayName="Dimension"))
        if len(self._timestamp_columns) > 1:
            metaData.append(
                MetaData(name="timeDimension",
                         value=len(self._timestamp_columns),
                         display=True,
                         displayName="Time Dimensions"))
        else:
            metaData.append(
                MetaData(name="timeDimension",
                         value=len(self._timestamp_columns),
                         display=True,
                         displayName="Time Dimension"))

        # Hidden (display=False) entries carry the actual column lists.
        metaData.append(
            MetaData(name="measureColumns",
                     value=self._numeric_columns,
                     display=False))
        metaData.append(
            MetaData(name="dimensionColumns",
                     value=self._string_columns + self._boolean_columns,
                     display=False))
        metaData.append(
            MetaData(name="timeDimensionColumns",
                     value=self._timestamp_columns,
                     display=False))
        metaData.append(
            MetaData(name="percentageColumns",
                     value=self._percentage_columns,
                     display=False))
        metaData.append(
            MetaData(name="dollarColumns",
                     value=self._dollar_columns,
                     display=False))
        columnData = []
        headers = []

        # --- Stage 3a: measure column statistics -------------------------
        self._start_time = time.time()
        print "Count of Numeric columns", len(self._numeric_columns)
        measureColumnStat, measureCharts = metaHelperInstance.calculate_measure_column_stats(
            self._data_frame,
            self._numeric_columns,
            binColumn=self._binned_stat_flag)
        time_taken_measurestats = time.time() - self._start_time
        self._completionStatus += self._scriptStages["measurestats"]["weight"]
        print "measure stats takes", time_taken_measurestats
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "measurestats",\
                                    "info",\
                                    self._scriptStages["measurestats"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsgFlag)

        # --- Stage 3b: dimension column statistics -----------------------
        self._start_time = time.time()
        dimensionColumnStat, dimensionCharts = metaHelperInstance.calculate_dimension_column_stats(
            self._data_frame,
            self._string_columns + self._boolean_columns,
            levelCount=self._level_count_flag)
        # print dimensionColumnStat
        # Extract the "numberOfUniqueValues" stat per dimension column.
        # NOTE(review): Python-2 filter() returning a list is assumed here
        # (indexed with [0]); under Python 3 this would need list(...) first.
        self._dataSize["dimensionLevelCountDict"] = {
            k: filter(lambda x: x["name"] == "numberOfUniqueValues",
                      v)[0]["value"]
            for k, v in dimensionColumnStat.items()
        }
        self._dataSize["totalLevels"] = sum(
            self._dataSize["dimensionLevelCountDict"].values())

        time_taken_dimensionstats = time.time() - self._start_time
        self._completionStatus += self._scriptStages["dimensionstats"][
            "weight"]
        # print "dimension stats takes",time_taken_dimensionstats
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "dimensionstats",\
                                    "info",\
                                    self._scriptStages["dimensionstats"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsgFlag)

        # --- Stage 3c: time-dimension column statistics ------------------
        self._start_time = time.time()
        timeDimensionColumnStat, timeDimensionCharts = metaHelperInstance.calculate_time_dimension_column_stats(
            self._data_frame,
            self._timestamp_columns,
            level_count_flag=self._level_count_flag)
        time_taken_tdstats = time.time() - self._start_time
        self._completionStatus += self._scriptStages["timedimensionstats"][
            "weight"]
        # print "time dimension stats takes",time_taken_tdstats
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "timedimensionstats",\
                                    "info",\
                                    self._scriptStages["timedimensionstats"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsgFlag)

        # --- Stage 4: per-column ColumnData and suggestions --------------
        self._start_time = time.time()
        ignoreColumnSuggestions = []
        ignoreColumnReason = []
        utf8ColumnSuggestion = []
        dateTimeSuggestions = {}
        for column in self._data_frame.columns:
            # Each column gets a random slug that links its header to its data.
            random_slug = uuid.uuid4().hex
            headers.append(ColumnHeader(name=column, slug=random_slug))
            data = ColumnData()
            data.set_slug(random_slug)
            data.set_name(column)
            data.set_abstract_datatype(
                self._column_type_dict[column]["abstract"])

            columnStat = []
            columnChartData = None
            # Attach the stats/charts computed above for this column's type.
            if self._column_type_dict[column]["abstract"] == "measure":
                data.set_column_stats(measureColumnStat[column])
                data.set_column_chart(measureCharts[column])
                data.set_actual_datatype(
                    self._column_type_dict[column]["actual"])
            elif self._column_type_dict[column]["abstract"] == "dimension":
                data.set_column_stats(dimensionColumnStat[column])
                data.set_column_chart(dimensionCharts[column])
                data.set_actual_datatype(
                    self._column_type_dict[column]["actual"])
            elif self._column_type_dict[column]["abstract"] == "datetime":
                data.set_column_stats(timeDimensionColumnStat[column])
                data.set_column_chart(timeDimensionCharts[column])
                data.set_actual_datatype(
                    self._column_type_dict[column]["actual"])

            if self._column_type_dict[column]["abstract"] == "measure":
                # Only non-"real" measures are candidates for being ignored.
                if column not in self._real_columns:
                    ignoreSuggestion, ignoreReason = metaHelperInstance.get_ignore_column_suggestions(
                        self._data_frame,
                        column,
                        "measure",
                        measureColumnStat[column],
                        max_levels=self._max_levels)
                    if ignoreSuggestion:
                        ignoreColumnSuggestions.append(column)
                        ignoreColumnReason.append(ignoreReason)
                        data.set_level_count_to_null()
                        data.set_chart_data_to_null()
                        data.set_ignore_suggestion_flag(True)
                        data.set_ignore_suggestion_message(ignoreReason)

            elif self._column_type_dict[column]["abstract"] == "dimension":
                # utf8 detection only runs when level counts were computed.
                if self._level_count_flag:
                    utf8Suggestion = metaHelperInstance.get_utf8_suggestions(
                        dimensionColumnStat[column])
                else:
                    utf8Suggestion = False
                # Pull distinct non-null values to probe for a datetime format
                # (skipped for booleans, which cannot be dates).
                if self._column_type_dict[column]["actual"] != "boolean":
                    uniqueVals = self._data_frame.select(
                        column).distinct().na.drop().collect()
                else:
                    uniqueVals = []
                if len(uniqueVals) > 0:
                    dateColumnFormat = metaHelperInstance.get_datetime_format(
                        uniqueVals)
                else:
                    dateColumnFormat = None
                if dateColumnFormat:
                    # Dimension actually holds dates: reclassify via the
                    # date-suggestion flag and drop its dimension charts.
                    dateTimeSuggestions.update({column: dateColumnFormat})
                    data.set_level_count_to_null()
                    data.set_chart_data_to_null()
                    data.set_date_suggestion_flag(True)

                if utf8Suggestion:
                    utf8ColumnSuggestion.append(column)
                ignoreSuggestion, ignoreReason = metaHelperInstance.get_ignore_column_suggestions(
                    self._data_frame,
                    column,
                    "dimension",
                    dimensionColumnStat[column],
                    max_levels=self._max_levels)
                if ignoreSuggestion:
                    ignoreColumnSuggestions.append(column)
                    ignoreColumnReason.append(ignoreReason)
                    data.set_level_count_to_null()
                    data.set_chart_data_to_null()
                    data.set_ignore_suggestion_flag(True)
                    data.set_ignore_suggestion_message(ignoreReason)

            columnData.append(data)
        # Columns recognized as dates should not also be ignored; remove them
        # (and their paired reason, via the index captured before removal).
        for dateColumn in dateTimeSuggestions.keys():
            if dateColumn in ignoreColumnSuggestions:
                ignoreColIdx = ignoreColumnSuggestions.index(dateColumn)
                ignoreColumnSuggestions.remove(dateColumn)
                del (ignoreColumnReason[ignoreColIdx])
        # utf8 columns are always added to the ignore list with a fixed reason.
        for utfCol in utf8ColumnSuggestion:
            ignoreColumnSuggestions.append(utfCol)
            ignoreColumnReason.append("utf8 values present")
        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName,
            "custom",
            "info",
            "Validating Metadata Information",
            self._completionStatus,
            self._completionStatus,
            display=True)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsgFlag)
        metaData.append(
            MetaData(name="ignoreColumnSuggestions",
                     value=ignoreColumnSuggestions,
                     display=False))
        metaData.append(
            MetaData(name="ignoreColumnReason",
                     value=ignoreColumnReason,
                     display=False))
        metaData.append(
            MetaData(name="utf8ColumnSuggestion",
                     value=utf8ColumnSuggestion,
                     display=False))
        metaData.append(
            MetaData(name="dateTimeSuggestions",
                     value=dateTimeSuggestions,
                     display=False))
        metaData.append(
            MetaData(name="dataSizeSummary",
                     value=self._dataSize,
                     display=False))
        # --- Stage 5: assemble the result object -------------------------
        dfMetaData = DfMetaData()
        dfMetaData.set_column_data(columnData)
        dfMetaData.set_header(headers)
        dfMetaData.set_meta_data(metaData)
        dfMetaData.set_sample_data(sampleData)

        time_taken_suggestions = time.time() - self._start_time
        self._completionStatus += self._scriptStages["suggestions"]["weight"]
        # print "suggestions take",time_taken_suggestions
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "suggestions",\
                                    "info",\
                                    self._scriptStages["suggestions"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsgFlag)
        self._dataframe_context.update_completion_status(
            self._completionStatus)
        return dfMetaData