def _generate_summary(self):
    ignored_columns = self._dataframe_context.get_ignore_column_suggestions()
    if ignored_columns is None:
        ignored_columns = []
    metaHelperInstance = MetaDataHelper(self._data_frame, self._total_rows)
    sampleData = metaHelperInstance.get_sample_data()
    try:
        sampleData = sampleData.toPandas()
    except:
        pass
    l1 = []
    l2 = []
    if self._pandas_flag:
        for column in self._dataframe_helper.get_string_columns():
            uniqueVals = sampleData[column].unique().tolist()
            if len(uniqueVals) > 0 and metaHelperInstance.get_datetime_format_pandas(
                    [self._data_frame[column].sort_values(ascending=False)[0]]) is not None:
                dateColumnFormat = metaHelperInstance.get_datetime_format_pandas(uniqueVals)
                l1.append(column)
            else:
                dateColumnFormat = None
                l2.append(column)
        # l1 = self._dataframe_helper.get_timestamp_columns()
        # l2 = self._dataframe_helper.get_string_columns()
    else:
        for column in self._dataframe_helper.get_string_columns():
            uniqueVals = sampleData[column].unique().tolist()
            if len(uniqueVals) > 0 and metaHelperInstance.get_datetime_format(
                    [self._data_frame.orderBy([column], ascending=[False]).select(column).first()[0]]) is not None:
                dateColumnFormat = metaHelperInstance.get_datetime_format(uniqueVals)
                l1.append(column)
            else:
                dateColumnFormat = None
                l2.append(column)
    data_dict = {
        "n_c": self._dataframe_helper.get_num_columns(),
        "n_m": len(self._dataframe_helper.get_numeric_columns()),
        "n_d": len(l2),
        "n_td": len(l1),
        "c": self._column_name,
        "d": l2,
        "m": self._dataframe_helper.get_numeric_columns(),
        "td": l1,
        "observations": self._dataframe_helper.get_num_rows(),
        "ignorecolumns": ignored_columns,
        "n_t": len(self._dataframe_helper.get_string_columns()) +
               len(self._dataframe_helper.get_numeric_columns()) +
               len(self._dataframe_helper.get_timestamp_columns())
        # "n_t": self._dataframe_helper.get_num_columns() + len(ignored_columns)
    }
    self.summary = NarrativesUtils.get_template_output(self._base_dir,
                                                       'descr_stats_summary.html', data_dict)
    MeasureSummaryCard = SummaryCard(name='Summary', slug=None, cardData=None)
    MeasureSummaryCard.set_no_of_measures(data_dict["n_m"])
    MeasureSummaryCard.set_no_of_dimensions(data_dict["n_d"])
    MeasureSummaryCard.set_no_of_time_dimensions(data_dict["n_td"])
    MeasureSummaryCard.set_summary_html(NarrativesUtils.block_splitter(self.summary, self._blockSplitter))
    self._story_narrative.add_a_card(MeasureSummaryCard)
    self._headNode.add_a_card(MeasureSummaryCard)
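# Note on the scan above: date detection hands a sample of column values to
# MetaDataHelper.get_datetime_format_pandas / get_datetime_format, which (by
# assumption) tries a whitelist of datetime formats and returns the first one
# that parses every sample value, or None. Illustration of that contract:
#
#     detect(["2020-01-31", "2020-02-01"])  ->  "%Y-%m-%d"
#     detect(["foo", "bar"])                ->  None
#
# Columns with a detected format land in l1 (time dimensions); the rest land
# in l2 (plain dimensions).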
def __init__(self, df_helper, df_context, result_setter, spark, story_narrative, meta_parser):
    self._story_narrative = story_narrative
    self._result_setter = result_setter
    self._spark = spark
    self._dataframe_helper = df_helper
    self._dataframe_context = df_context
    self._pandas_flag = df_context._pandas_flag
    self._data_frame = df_helper.get_data_frame()
    self._num_significant_digits = NarrativesUtils.get_significant_digit_settings("trend")
    self._metaParser = meta_parser
    self._result_column = self._dataframe_context.get_result_column()
    self._string_columns = self._dataframe_helper.get_string_columns()
    self._timestamp_columns = self._dataframe_helper.get_timestamp_columns()
    # self._selected_date_columns = None
    self._selected_date_columns = self._dataframe_context.get_selected_date_columns()
    self._all_date_columns = self._dataframe_context.get_date_columns()
    self._string_columns = list(set(self._string_columns) - set(self._all_date_columns))
    self._dateFormatDetected = False
    self._existingDateFormat = None
    self._dateFormatConversionDict = NarrativesUtils.date_formats_mapping_dict()
    self._dateColumnFormatDict = df_context.get_date_format_dict()
    if self._dataframe_context.get_requested_date_format() is not None:
        self._requestedDateFormat = df_context.get_requested_date_format()
    else:
        self._requestedDateFormat = None
    self._analysistype = self._dataframe_context.get_analysis_type()
    self._trendSettings = self._dataframe_context.get_trend_settings()
    self._trendSpecificMeasure = False
    if self._trendSettings is not None:
        if self._analysistype == "dimension" and self._trendSettings["name"] != "Count":
            self._trendSpecificMeasure = True
            self._analysistype = "measure"
            self._result_column = self._trendSettings["selectedMeasure"]
        elif self._analysistype == "measure" and self._trendSettings["name"] != "Count":
            self._result_column = self._trendSettings["selectedMeasure"]
    self._trend_subsection = self._result_setter.get_trend_section_name()
    self._regression_trend_card = None
    self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
    self._highlightFlag = "|~HIGHLIGHT~|"
    self._trend_on_td_column = False
    self._number_of_dimensions_to_consider = 10
    self._completionStatus = self._dataframe_context.get_completion_status()
    self._analysisName = self._dataframe_context.get_analysis_name()
    self._messageURL = self._dataframe_context.get_message_url()
    if self._analysistype == "dimension":
        self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight()
        self._scriptStages = {
            "initialization": {"summary": "Initialized The Frequency Narratives", "weight": 0},
            "summarygeneration": {"summary": "Summary Generation Finished", "weight": 4},
            "completion": {"summary": "Frequency Stats Narratives Done", "weight": 0},
        }
    elif self._analysistype == "measure":
        if self._trendSpecificMeasure:
            self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight()
        else:
            self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight()
        self._scriptStages = {
            "trendNarrativeStart": {"summary": "Started The Descriptive Stats Narratives", "weight": 1},
            "trendNarrativeEnd": {"summary": "Narratives For Descriptive Stats Finished", "weight": 0},
        }
    self._base_dir = "/trend/"

    if self._pandas_flag and self._selected_date_columns and not self._dateColumnFormatDict and not self._timestamp_columns:
        for column in self._selected_date_columns:
            uniqueVals = self._data_frame[column].astype(str).unique().tolist()
            metaHelperInstance = MetaDataHelper(self._data_frame, self._data_frame.shape[0])
            if len(uniqueVals) > 0 and metaHelperInstance.get_datetime_format_pandas(
                    [self._data_frame.sort_values(by=column, ascending=False)[column][0]]) is not None:
                dateColumnFormat = metaHelperInstance.get_datetime_format_pandas(uniqueVals)
                self._dateColumnFormatDict.update({column: dateColumnFormat})
    dateColCheck = NarrativesUtils.check_date_column_formats(self._selected_date_columns,
                                                             self._timestamp_columns,
                                                             self._dateColumnFormatDict,
                                                             self._dateFormatConversionDict,
                                                             self._requestedDateFormat)
    print(dateColCheck)
    self._dateFormatDetected = dateColCheck["dateFormatDetected"]
    self._trend_on_td_column = dateColCheck["trendOnTdCol"]
    if self._dateFormatDetected:
        self._requestedDateFormat = dateColCheck["requestedDateFormat"]
        self._existingDateFormat = dateColCheck["existingDateFormat"]
        # self._date_column_suggested is the column used for trend
        self._date_column_suggested = dateColCheck["suggestedDateColumn"]
    if self._existingDateFormat:
        self._data_frame, dataRangeStats = NarrativesUtils.calculate_data_range_stats(
            self._data_frame, self._existingDateFormat, self._date_column_suggested,
            self._trend_on_td_column, self._pandas_flag)
        print(dataRangeStats)
        self._durationString = dataRangeStats["durationString"]
        self._duration = dataRangeStats["duration"]
        self._dataLevel = dataRangeStats["dataLevel"]
        first_date = dataRangeStats["firstDate"]
        last_date = dataRangeStats["lastDate"]
    if self._timestamp_columns is not None:
        if self._selected_date_columns is None:
            self._selected_date_columns = self._timestamp_columns
        else:
            self._selected_date_columns += self._timestamp_columns

    if self._pandas_flag:
        pass
    else:
        if self._trend_subsection == "regression":
            if self._selected_date_columns is not None:
                if self._dateFormatDetected:
                    trend_subsection_data = self._result_setter.get_trend_section_data()
                    measure_column = trend_subsection_data["measure_column"]
                    result_column = trend_subsection_data["result_column"]
                    base_dir = trend_subsection_data["base_dir"]
                    card3heading = 'How ' + result_column + ' and ' + measure_column + ' changed over time'
                    if self._dataLevel == "day":
                        grouped_data = self._data_frame.groupBy("suggestedDate").agg(
                            {measure_column: 'sum', result_column: 'sum'})
                        grouped_data = grouped_data.withColumnRenamed(grouped_data.columns[-1], result_column)
                        grouped_data = grouped_data.withColumnRenamed(grouped_data.columns[-2], measure_column)
                        grouped_data = grouped_data.withColumn(
                            "year_month", udf(lambda x: x.strftime("%b-%y"))("suggestedDate"))
                        grouped_data = grouped_data.orderBy("suggestedDate", ascending=True)
                        grouped_data = grouped_data.withColumnRenamed(grouped_data.columns[0], "key")
                        grouped_data = grouped_data.toPandas()
                    elif self._dataLevel == "month":
                        grouped_data = self._data_frame.groupBy("year_month").agg(
                            {measure_column: 'sum', result_column: 'sum'})
                        grouped_data = grouped_data.withColumnRenamed(grouped_data.columns[-1], result_column)
                        grouped_data = grouped_data.withColumnRenamed(grouped_data.columns[-2], measure_column)
                        grouped_data = grouped_data.withColumn(
                            "suggestedDate", udf(lambda x: datetime.strptime(x, "%b-%y"))("year_month"))
                        grouped_data = grouped_data.orderBy("suggestedDate", ascending=True)
                        grouped_data = grouped_data.withColumnRenamed("suggestedDate", "key")
                        grouped_data = grouped_data.select(
                            ["key", measure_column, result_column, "year_month"]).toPandas()
                        grouped_data["key"] = grouped_data["year_month"].apply(
                            lambda x: datetime.strptime(x, "%b-%y").date())
                    trend_narrative_obj = TrendNarrative(
                        self._result_column, self._date_column_suggested, grouped_data,
                        self._existingDateFormat, self._requestedDateFormat, self._base_dir,
                        self._metaParser)
                    card3data = trend_narrative_obj.generate_regression_trend_data(
                        grouped_data, measure_column, result_column, self._dataLevel, self._durationString)
                    card3narrative = NarrativesUtils.get_template_output(base_dir,
                                                                         'regression_card3.html', card3data)
                    card3chart = trend_narrative_obj.generate_regression_trend_chart(grouped_data, self._dataLevel)
                    card3paragraphs = NarrativesUtils.paragraph_splitter(card3narrative)
                    card2 = {'charts': card3chart, 'paragraphs': card3paragraphs, 'heading': card3heading}
                    self.set_regression_trend_card_data(card2)
                else:
                    print("NO DATE FORMAT DETECTED")
            else:
                print("NO DATE COLUMNS PRESENT")

    if self._analysistype == "measure":
        self._completionStatus += old_div(
            self._scriptWeightDict[self._analysisName]["total"] *
            self._scriptStages["trendNarrativeStart"]["weight"], 10)
        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName, "trendNarrativeStart", "info",
            self._scriptStages["trendNarrativeStart"]["summary"],
            self._completionStatus, self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL, progressMessage)
        self._dataframe_context.update_completion_status(self._completionStatus)
        # self._startMeasureTrend = self._result_setter.get_trend_section_completion_status()
        self._startMeasureTrend = True

        if self._startMeasureTrend == True:
            self.narratives = {"SectionHeading": "", "card1": {}, "card2": {}, "card3": {}}
            if self._selected_date_columns is not None:
                if self._dateFormatDetected:
                    grouped_data = NarrativesUtils.get_grouped_data_for_trend(
                        self._data_frame, self._dataLevel, self._result_column,
                        self._analysistype, self._pandas_flag)
                    if self._pandas_flag:
                        self._data_frame = self._data_frame.drop(self._date_column_suggested, axis=1)
                    else:
                        self._data_frame = self._data_frame.drop(self._date_column_suggested)
                    # self._data_frame = self._data_frame.withColumnRenamed("year_month", self._date_column_suggested)
                    significant_dimensions = []
                    significant_dimension_dict = df_helper.get_significant_dimension()
                    if significant_dimension_dict != {} and significant_dimension_dict is not None:
                        significant_dimension_tuple = tuple(significant_dimension_dict.items())
                        significant_dimension_tuple = sorted(significant_dimension_tuple,
                                                             key=lambda x: x[1], reverse=True)
                        significant_dimensions = [
                            x[0] for x in
                            significant_dimension_tuple[:self._number_of_dimensions_to_consider]]
                    else:
                        significant_dimensions = self._string_columns[:self._number_of_dimensions_to_consider]
                    print("significant_dimensions", significant_dimensions)
                    trend_narrative_obj = TrendNarrative(
                        self._result_column, self._date_column_suggested, grouped_data,
                        self._existingDateFormat, self._requestedDateFormat, self._base_dir,
                        self._metaParser)
                    # grouped_data.to_csv("/home/gulshan/marlabs/datasets/trend_grouped_pandas.csv",index=False)
                    dataDict = trend_narrative_obj.generateDataDict(grouped_data, self._dataLevel,
                                                                    self._durationString)
                    # update reference time with max value
                    reference_time = dataDict["reference_time"]
                    dataDict["duration"] = self._duration
                    dataDict["dataLevel"] = self._dataLevel
                    dataDict["durationString"] = self._durationString
                    dataDict["significant_dimensions"] = significant_dimensions
                    if len(significant_dimensions) > 0:
                        if self._dataLevel == "day":
                            datetimeformat = self._existingDateFormat
                        elif self._dataLevel == "month":
                            datetimeformat = "%b-%y"
                        # xtraData = trend_narrative_obj.get_xtra_calculations(self._data_frame,grouped_data,significant_dimensions,self._date_column_suggested,self._result_column,self._existingDateFormat,reference_time,self._dataLevel, self._pandas_flag)
                        xtraData = trend_narrative_obj.get_xtra_calculations(
                            self._data_frame, grouped_data, significant_dimensions,
                            self._date_column_suggested, self._result_column, datetimeformat,
                            reference_time, self._dataLevel, self._pandas_flag)
                        if xtraData is not None:
                            dataDict.update(xtraData)
                    # print 'Trend dataDict: %s' %(json.dumps(dataDict, indent=2))
                    self._result_setter.update_executive_summary_data(dataDict)
                    dataDict.update({"blockSplitter": self._blockSplitter,
                                     "highlightFlag": self._highlightFlag})
                    summary1 = NarrativesUtils.get_template_output(self._base_dir,
                                                                   'measure_trend_card1.html', dataDict)
                    summary2 = NarrativesUtils.get_template_output(self._base_dir,
                                                                   'measure_trend_card2.html', dataDict)
                    measureTrendCard = NormalCard()
                    measureTrendcard1Data = NarrativesUtils.block_splitter(
                        summary1, self._blockSplitter, highlightFlag=self._highlightFlag)
                    measureTrendcard2Data = NarrativesUtils.block_splitter(summary2, self._blockSplitter)
                    # print measureTrendcard1Data
                    bubbledata = dataDict["bubbleData"]
                    # print bubbledata
                    card1BubbleData = (
                        "<div class='col-md-6 col-xs-12 xs-p-20'><h2 class='text-center'><span>{}</span>"
                        "<br/><small>{}</small></h2></div>"
                        "<div class='col-md-6 col-xs-12 xs-p-20'><h2 class='text-center'><span>{}</span>"
                        "<br/><small>{}</small></h2></div>").format(
                        bubbledata[0]["value"], bubbledata[0]["text"],
                        bubbledata[1]["value"], bubbledata[1]["text"])
                    # print card1BubbleData
                    trend_chart_data = list(grouped_data[["key", "value"]].T.to_dict().values())
                    trend_chart_data = sorted(trend_chart_data, key=lambda x: x["key"])
                    card1chartdata = {"actual": [], "predicted": []}
                    if self._dataLevel == "day":
                        card1chartdata["actual"] = [{"key": str(val["key"]), "value": val["value"]}
                                                    for val in trend_chart_data]
                    elif self._dataLevel == "month":
                        card1chartdata["actual"] = [{"key": val["key"].strftime("%b-%y"), "value": val["value"]}
                                                    for val in trend_chart_data]
                    if self._duration < 365:
                        prediction_window = 3
                    else:
                        prediction_window = 6
                    predicted_values = trend_narrative_obj.get_forecast_values(
                        grouped_data["value"], prediction_window)[len(grouped_data["value"]):]
                    predicted_values = [round(x, self._num_significant_digits) for x in predicted_values]
                    forecasted_data = []
                    forecasted_data.append(card1chartdata["actual"][-1])
                    forecasted_dates = []
                    # forecast_start_time = datetime.strptime(card1chartdata["actual"][-1]["key"],"%b-%y")
                    if self._dataLevel == "month":
                        forecast_start_time = datetime.strptime(card1chartdata["actual"][-1]["key"], "%b-%y")
                    elif self._dataLevel == "day":
                        try:
                            forecast_start_time = datetime.strptime(card1chartdata["actual"][-1]["key"],
                                                                    "%Y-%m-%d")
                        except:
                            forecast_start_time = datetime.strptime(card1chartdata["actual"][-1]["key"],
                                                                    '%Y-%m-%d %H:%M:%S')
                    for val in range(prediction_window):
                        if self._dataLevel == "month":
                            key = forecast_start_time + relativedelta(months=1 + val)
                            forecasted_dates.append(key)
                        elif self._dataLevel == "day":
                            key = forecast_start_time + relativedelta(days=1 + val)
                            forecasted_dates.append(key)
                    forecasted_list = list(zip(forecasted_dates, predicted_values))
                    if self._dataLevel == "month":
                        forecasted_list = [{"key": val[0].strftime("%b-%y"), "value": val[1]}
                                           for val in forecasted_list]
                    elif self._dataLevel == "day":
                        forecasted_list = [{"key": val[0].strftime("%Y-%m-%d"), "value": val[1]}
                                           for val in forecasted_list]
                    forecasted_data += forecasted_list
                    card1chartdata["predicted"] = forecasted_data
                    # print json.dumps(card1chartdata,indent=2)
                    card1chartdata = ScatterChartData(data=card1chartdata)
                    chartJson = ChartJson()
                    chartJson.set_data(card1chartdata.get_data())
                    chartJson.set_label_text({'x': ' ', 'y': 'No. of Observations'})
                    chartJson.set_legend({"actual": "Observed", "predicted": "Forecast"})
                    chartJson.set_chart_type("scatter_line")
                    chartJson.set_axes({"x": "key", "y": "value"})
                    chartJson.set_yaxis_number_format(".2f")
                    st_info = ["Trend Analysis", "Forecast Method : Holt Winters Method"]
                    measureTrendcard1Data.insert(1, C3ChartData(data=chartJson, info=st_info))
                    measureTrendcard1Data.append(HtmlData(data=card1BubbleData))
                    cardData = measureTrendcard1Data + measureTrendcard2Data
                    measureTrendCard.set_card_data(cardData)
                    measureTrendCard.set_card_name("Trend Analysis")
                    trendStoryNode = NarrativesTree("Trend", None, [], [measureTrendCard])
                    self._story_narrative.add_a_node(trendStoryNode)
                    self._result_setter.set_trend_node(trendStoryNode)

                    # prediction_data = [{"key":x["key"],"value":x["value"]} for x in trend_chart_data]
                    # last_val = prediction_data[-1]
                    # last_val.update({"predicted_value":last_val["value"]})
                    # prediction_data[-1] = last_val
                    #
                    # for val in range(prediction_window):
                    #     dataLevel = dataDict["dataLevel"]
                    #     if self._dataLevel == "month":
                    #         last_key = prediction_data[-1]["key"]
                    #         key = last_key+relativedelta(months=1)
                    #         prediction_data.append({"key":key,"predicted_value":predicted_values[val]})
                    #         forecasted_data.append({"key":key,"value":predicted_values[val]})
                    #     elif self._dataLevel == "day":
                    #         last_key = prediction_data[-1]["key"]
                    #         key = last_key+relativedelta(days=1)
                    #         prediction_data.append({"key":key,"predicted_value":predicted_values[val]})
                    # prediction_data_copy = prediction_data
                    # prediction_data = []
                    # for val in prediction_data_copy:
                    #     val["key"] = val["key"].strftime("%b-%y")
                    #     prediction_data.append(val)
                    # forecastDataDict = {"startForecast":predicted_values[0],
                    #                     "endForecast":predicted_values[prediction_window-1],
                    #                     "measure":dataDict["measure"],
                    #                     "forecast":True,
                    #                     "forecast_percentage": round((predicted_values[prediction_window-1]-predicted_values[0])/predicted_values[0],self._num_significant_digits),
                    #                     "prediction_window_text": str(prediction_window) + " months"
                    #                     }
                    #
                    # self._result_setter.update_executive_summary_data(forecastDataDict)
                    # summary3 = NarrativesUtils.get_template_output(self._base_dir,\
                    #     'trend_narrative_card3.html',forecastDataDict)

                    self._completionStatus += old_div(
                        self._scriptWeightDict[self._analysisName]["total"] *
                        self._scriptStages["trendNarrativeEnd"]["weight"], 10)
                    progressMessage = CommonUtils.create_progress_message_object(
                        self._analysisName, "trendNarrativeEnd", "info",
                        self._scriptStages["trendNarrativeEnd"]["summary"],
                        self._completionStatus, self._completionStatus)
                    CommonUtils.save_progress_message(self._messageURL, progressMessage)
                    self._dataframe_context.update_completion_status(self._completionStatus)
                else:
                    # self._result_setter.update_executive_summary_data({"trend_present":False})
                    print("Trend Analysis for Measure Failed")
                    print("#" * 20 + "Trend Analysis Error" + "#" * 20)
                    print("No date format for the date column %s was detected." % (self._date_column_suggested))
                    print("#" * 60)
                    self._completionStatus += self._scriptWeightDict[self._analysisName]["total"]
                    self._dataframe_context.update_completion_status(self._completionStatus)
                    progressMessage = CommonUtils.create_progress_message_object(
                        "Trend", "failedState", "error",
                        "Trend Failed As " + "No Date Format For The Date Column %s Was Detected !!!" % (self._date_column_suggested),
                        self._completionStatus, self._completionStatus)
                    CommonUtils.save_progress_message(self._messageURL, progressMessage)
                    self._dataframe_context.update_completion_status(self._completionStatus)
            else:
                # self._result_setter.update_executive_summary_data({"trend_present":False})
                print("Trend Analysis for Measure Failed")
                print("#" * 20 + "Trend Analysis Error" + "#" * 20)
                print("No date column present for Trend Analysis.")
                print("#" * 60)
                self._completionStatus += self._scriptWeightDict[self._analysisName]["total"]
                self._dataframe_context.update_completion_status(self._completionStatus)
                progressMessage = CommonUtils.create_progress_message_object(
                    "Trend", "failedState", "error", "No Date Column Present",
                    self._completionStatus, self._completionStatus)
                CommonUtils.save_progress_message(self._messageURL, progressMessage)
                self._dataframe_context.update_completion_status(self._completionStatus)
        else:
            print("overall Trend not Started YET")

    elif self._analysistype == "dimension":
        print("Dimension Trend Started")
        self._completionStatus += old_div(
            self._scriptWeightDict[self._analysisName]["total"] *
            self._scriptStages["initialization"]["weight"], 10)
        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName, "initialization", "info",
            self._scriptStages["initialization"]["summary"],
            self._completionStatus, self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL, progressMessage)
        self._dataframe_context.update_completion_status(self._completionStatus)
        self.narratives = {"card0": {}}
        if self._selected_date_columns is not None:
            if self._dateFormatDetected:
                # result_column_levels = [x[0] for x in self._data_frame.select(self._result_column).distinct().collect()]
                try:
                    result_column_levels = self._metaParser.get_unique_level_names(self._result_column)
                except:
                    if self._pandas_flag:
                        result_column_levels = list(self._data_frame[self._result_column].unique())
                    else:
                        result_column_levels = [
                            x[0] for x in
                            self._data_frame.select(self._result_column).distinct().collect()]
                # result_column_levels = self._data_frame.agg((F.collect_set(self._result_column).alias(self._result_column))).first().asDict()[self._result_column]
                print("-" * 100)
                # TODO Implement meta parser getter here
                print(result_column_levels)
                if self._pandas_flag:
                    level_count_df = self._data_frame[self._result_column].value_counts()[0:2]
                    top2levels = list(level_count_df.index)
                else:
                    level_count_df = self._data_frame.groupBy(self._result_column).count().orderBy(
                        "count", ascending=False)
                    level_count_df_rows = level_count_df.collect()
                    top2levels = [level_count_df_rows[0][0], level_count_df_rows[1][0]]
                cardData = []
                chart_data = {}
                cardData1 = []
                c3_chart = {"dataType": "c3Chart", "data": {}}
                print("#" * 40)
                overall_count = NarrativesUtils.get_grouped_count_data_for_dimension_trend(
                    self._data_frame, self._dataLevel, self._result_column, self._pandas_flag)
                print("#" * 40)
                for idx, level in enumerate(top2levels):
                    print("calculations in progress for the level :- ", level)
                    if self._pandas_flag:
                        leveldf = self._data_frame[self._data_frame[self._result_column] == level]
                    else:
                        leveldf = self._data_frame.filter(col(self._result_column) == level)
                    grouped_data = NarrativesUtils.get_grouped_data_for_trend(
                        leveldf, self._dataLevel, self._result_column, self._analysistype,
                        self._pandas_flag)
                    grouped_data.rename(columns={"value": "value_count"}, inplace=True)
                    grouped_data = pd.merge(grouped_data, overall_count, on='key', how='left')
                    # grouped_data["value"] = grouped_data["value_count"].apply(lambda x:round(x*100/float(self._data_frame.count()),self._num_significant_digits))
                    grouped_data["value"] = old_div(grouped_data["value_count"], grouped_data["totalCount"])
                    grouped_data["value"] = grouped_data["value"].apply(
                        lambda x: round(x * 100, self._num_significant_digits))
                    if self._pandas_flag:
                        leveldf = leveldf.drop(self._date_column_suggested, axis=1)
                        leveldf = leveldf.rename(columns={"year_month": self._date_column_suggested})
                        if "year_month" not in leveldf.columns:
                            leveldf["year_month"] = leveldf[self._date_column_suggested]
                        leveldf["value_col"] = 1
                    else:
                        leveldf = leveldf.drop(self._date_column_suggested)
                        leveldf = leveldf.withColumnRenamed("year_month", self._date_column_suggested)
                        if "year_month" not in leveldf.columns:
                            leveldf = leveldf.withColumn("year_month", col(self._date_column_suggested))
                        leveldf = leveldf.withColumn('value_col', lit(1))
                    trend_narrative_obj = TrendNarrative(
                        self._result_column, self._date_column_suggested, grouped_data,
                        self._existingDateFormat, self._requestedDateFormat, self._base_dir,
                        self._metaParser)
                    dataDict = trend_narrative_obj.generateDataDict(grouped_data, self._dataLevel,
                                                                    self._durationString)
                    dataDict["target_column"] = dataDict["measure"]
                    dataDict["measure"] = level
                    dataDict["duration"] = self._duration
                    dataDict["dataLevel"] = self._dataLevel
                    dataDict["durationString"] = self._durationString
                    # grouped_data.to_csv("/home/gulshan/marlabs/datasets/grouped_data"+str(idx))
                    # print json.dumps(dataDict,indent=2)
                    significant_dimensions = []
                    significant_dimension_dict = df_helper.get_chisquare_significant_dimension()
                    if significant_dimension_dict != {} and significant_dimension_dict is not None:
                        significant_dimension_tuple = tuple(significant_dimension_dict.items())
                        significant_dimension_tuple = sorted(significant_dimension_tuple,
                                                             key=lambda x: x[1], reverse=True)
                        significant_dimensions = [
                            x[0] for x in
                            significant_dimension_tuple[:self._number_of_dimensions_to_consider]]
                    else:
                        significant_dimensions = self._string_columns[:self._number_of_dimensions_to_consider]
                    print("significant_dimensions", significant_dimensions)
                    reference_time = dataDict["reference_time"]
                    dataDict["significant_dimensions"] = significant_dimensions
                    if len(significant_dimensions) > 0:
                        st = time.time()
                        xtraData = trend_narrative_obj.get_xtra_calculations(
                            leveldf, grouped_data, significant_dimensions,
                            self._date_column_suggested, "value_col", self._existingDateFormat,
                            reference_time, self._dataLevel, self._pandas_flag)
                        print("time for get_xtra_calculations", time.time() - st)
                        if xtraData is not None:
                            dataDict.update(xtraData)
                    dimensionCount = trend_narrative_obj.generate_dimension_extra_narrative(
                        grouped_data, dataDict, self._dataLevel)
                    if dimensionCount is not None:
                        dataDict.update(dimensionCount)
                    dataDict.update({"level_index": idx,
                                     "blockSplitter": self._blockSplitter,
                                     "highlightFlag": self._highlightFlag})
                    self._result_setter.update_executive_summary_data(dataDict)
                    trendStory = NarrativesUtils.get_template_output(self._base_dir,
                                                                     'dimension_trend.html', dataDict)
                    blocks = NarrativesUtils.block_splitter(trendStory, self._blockSplitter)
                    if idx != 0:
                        cardData1 += blocks[2:]
                    else:
                        cardData1 += blocks
                    trend_chart_data = [x for x in
                                        list(grouped_data[["key", "value"]].T.to_dict().values())
                                        if x['key'] is not None]
                    trend_chart_data = sorted(trend_chart_data, key=lambda x: x["key"])
                    card1chartdata = trend_chart_data
                    if self._dataLevel == "day":
                        card1chartdata = [{"key": str(val["key"]), "value": val["value"]}
                                          for val in card1chartdata]
                    elif self._dataLevel == "month":
                        card1chartdata = [{"key": val["key"].strftime("%b-%y"), "value": val["value"]}
                                          for val in card1chartdata]
                    chart_data[level] = card1chartdata
                labels = {"x": "key",
                          "y": list(chart_data.keys())[0],
                          "y2": list(chart_data.keys())[1]}
                c3Chart = {"data": chart_data,
                           "format": "%b-%y",
                           "label": labels,
                           "label_text": {"x": "Time",
                                          "y": "Percentage of " + labels["y"],
                                          "y2": "Percentage of " + labels["y2"]}}
                c3_chart["data"] = c3Chart
                multiLineData = []
                for idx in range(len(chart_data[top2levels[0]])):
                    key = chart_data[top2levels[0]][idx]["key"]
                    value = chart_data[top2levels[0]][idx]["value"]
                    try:
                        value1 = chart_data[top2levels[1]][idx]["value"]
                    except:
                        value1 = 0
                    multiLineData.append({"key": key, top2levels[0]: value, top2levels[1]: value1})
                chartData = NormalChartData(multiLineData)
                chartJson = ChartJson()
                chartJson.set_data(chartData.get_data())
                chartJson.set_label_text(c3Chart["label_text"])
                chartJson.set_legend(c3Chart["label"])
                chartJson.set_chart_type("line")
                chartJson.set_yaxis_number_format(".2f")
                chartJson.set_axes(labels)
                st_info = ["Trend Analysis", "Forecast Method : Holt Winters Method"]
                cardData1.insert(1, C3ChartData(data=chartJson, info=st_info))
                trendCard = NormalCard(name="Trend Analysis", slug=None, cardData=cardData1)
                trendStoryNode = NarrativesTree("Trend", None, [], [trendCard])
                self._story_narrative.add_a_node(trendStoryNode)
                self._result_setter.set_trend_node(trendStoryNode)
                self._completionStatus += old_div(
                    self._scriptWeightDict[self._analysisName]["total"] *
                    self._scriptStages["summarygeneration"]["weight"], 10)
                progressMessage = CommonUtils.create_progress_message_object(
                    self._analysisName, "summarygeneration", "info",
                    self._scriptStages["summarygeneration"]["summary"],
                    self._completionStatus, self._completionStatus)
                CommonUtils.save_progress_message(self._messageURL, progressMessage)
                self._dataframe_context.update_completion_status(self._completionStatus)
                self._completionStatus += old_div(
                    self._scriptWeightDict[self._analysisName]["total"] *
                    self._scriptStages["completion"]["weight"], 10)
                progressMessage = CommonUtils.create_progress_message_object(
                    self._analysisName, "completion", "info",
                    self._scriptStages["completion"]["summary"],
                    self._completionStatus, self._completionStatus)
                CommonUtils.save_progress_message(self._messageURL, progressMessage)
                self._dataframe_context.update_completion_status(self._completionStatus)
            else:
                self._result_setter.update_executive_summary_data({"trend_present": False})
                print("Trend Analysis for Dimension Failed")
                print("#" * 20 + "Trend Analysis Error" + "#" * 20)
                if self._date_column_suggested:
                    print("No date format for the date column %s was detected." % (self._date_column_suggested))
                print("#" * 60)
                self._completionStatus += self._scriptWeightDict[self._analysisName]["total"]
                self._dataframe_context.update_completion_status(self._completionStatus)
                progressMessage = CommonUtils.create_progress_message_object(
                    "Trend", "failedState", "error",
                    "Trend Failed As " + "No Date Format For The Date Column %s Was Detected !!!" % (self._date_column_suggested),
                    self._completionStatus, self._completionStatus)
                CommonUtils.save_progress_message(self._messageURL, progressMessage)
                self._dataframe_context.update_completion_status(self._completionStatus)
        else:
            self._result_setter.update_executive_summary_data({"trend_present": False})
            print("Trend Analysis for Dimension Failed")
            print("#" * 20 + "Trend Analysis Error" + "#" * 20)
            print("No date column present for Trend Analysis.")
            print("#" * 60)
            self._completionStatus += self._scriptWeightDict[self._analysisName]["total"]
            self._dataframe_context.update_completion_status(self._completionStatus)
            progressMessage = CommonUtils.create_progress_message_object(
                "Trend", "failedState", "error", "No Date Column Present",
                self._completionStatus, self._completionStatus)
            CommonUtils.save_progress_message(self._messageURL, progressMessage)
            self._dataframe_context.update_completion_status(self._completionStatus)
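# Illustrative note for the to_date_ helper defined below (assumption: Spark >= 2.2,
# where to_date(col, fmt) yields NULL on a parse failure instead of raising): trying
# several formats and keeping the first successful parse reduces to a coalesce, e.g.
#
#     F.coalesce(to_date(col, "dd/MM/yyyy"), to_date(col, "yyyy-MM-dd"))
#
# is NULL only when no candidate format matches the value.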
class FeatureEngineeringHelper(object):
    """Contains Feature Engineering Operation Functions"""

    def __init__(self, df, dataframe_context):
        self._data_frame = df
        self._metaHelperInstance = MetaDataHelper(self._data_frame, self._data_frame.count())
        self._dataframe_context = dataframe_context
        # self._dataframe_helper = dataframe_helper

    def to_date_(self, col, formats=GLOBALSETTINGS.SUPPORTED_DATETIME_FORMATS["pyspark_formats"]):
        # Spark 2.2 or later syntax, for < 2.2 use unix_timestamp and cast
        return F.coalesce(*[to_date(col, f) for f in formats])

    def binning_all_measures(self, number_of_bins, consider_cols):
        dfSchemaFields = self._data_frame.schema.fields
        numeric_columns = []
        cols_to_be_binned = [x[:-4] for x in consider_cols if x[-4:] == "_bin"]
        for field in dfSchemaFields:
            if ColumnType(type(field.dataType)).get_abstract_data_type() == ColumnType.MEASURE:
                numeric_columns.append(field.name)
        # bin only the requested columns that are actually numeric
        for column_name in [c for c in cols_to_be_binned if c in numeric_columns]:
            self._data_frame = self.create_equal_sized_measure_bins(column_name, number_of_bins)
        return self._data_frame

    def create_bin_udf(self, dict):
        def check_key(x, dict):
            for key in list(dict.keys()):
                if (x >= dict[key][0] and x <= dict[key][1]):
                    return key
        return udf(lambda x: check_key(x, dict) if x is not None else "None")

    # def binning_all_measures_sumeet(self, n_bins):
    #     dfSchemaFields = self._data_frame.schema.fields
    #     numeric_columns = []
    #     for field in dfSchemaFields:
    #         if ColumnType(type(field.dataType)).get_abstract_data_type() == ColumnType.MEASURE:
    #             numeric_columns.append(field.name)
    #     for column_name in numeric_columns:
    #         col_min = self._data_frame.select(F.min(column_name)).collect()[0][0]
    #         col_max = self._data_frame.select(F.max(column_name)).collect()[0][0]
    #         bins_unrounded = linspace(col_min, col_max, n_bins + 1)
    #
    #         bins = []
    #         bins.insert(0, col_min)
    #         for val in bins_unrounded[1:n_bins]:
    #             bins.append(round(val, 2))
    #         bins.append(col_max)
    #
    #         bucketizer = Bucketizer(splits = bins, inputCol = column_name, outputCol = column_name + "_binned")
    #         self._data_frame = bucketizer.transform(self._data_frame)
    #
    #         keys = []
    #         lists = []
    #         for val in range(0, n_bins):
    #             keys.append(str(bins[val]) + "-" + str(bins[val + 1]))
    #             list = []
    #             list.append(bins[val])
    #             list.append(bins[val + 1])
    #             lists.append(list)
    #
    #         dict = {}
    #         for i in range(0, n_bins):
    #             dict[keys[i]] = lists[i]
    #
    #         map_list = [x for x in range(n_bins)]
    #         dict_new = {}
    #         for n in range(0, n_bins):
    #             dict_new[map_list[n]] = keys[n]
    #
    #         def create_level_udf_sumeet(dict):
    #             def check_key(x, dict):
    #                 for key in dict.keys():
    #                     if x == key:
    #                         return dict[key]
    #             return udf(lambda x: check_key(x,dict))
    #
    #         self._data_frame = self._data_frame.withColumn(column_name + "_binned", create_level_udf_sumeet(dict_new)(col(column_name + "_binned")))
    #     return self._data_frame

    def create_level_udf(self, dict):
        selected_list = []
        for key in list(dict.keys()):
            selected_list = selected_list + dict[key]

        def check_key(x, dict):
            for key in list(dict.keys()):
                if x in selected_list:
                    if x in dict[key]:
                        return key
                else:
                    return x
        return udf(lambda x: check_key(x, dict) if x is not None else x)

    def create_new_levels_dimension(self, column_name, dict):
        self._data_frame = self._data_frame.withColumn(
            column_name + "_level", self.create_level_udf(dict)(col(column_name)))
        return self._data_frame

    def create_level_udf_time(self, dict, date_format):
        def convert_to_date(value):
            if isinstance(value, str):
                value = datetime.strptime(value, date_format)
            return value

        def convert_to_date_from_level_value(value):
            value = datetime.strptime(value, '%d/%m/%Y')
            return datetime.date(value)

        def check_key(date, dict):
            date = convert_to_date(date)
            for key, value in list(dict.items()):
                val1_date = convert_to_date_from_level_value(value[0])
                val2_date = convert_to_date_from_level_value(value[1])
                date_range = [val1_date, val2_date]
                if (date >= date_range[0] and date <= date_range[1]):
                    return key
        return udf(lambda x: check_key(x, dict) if x is not None else x)

    def create_new_levels_datetimes(self, col_for_timelevels, dict):
        # self._metaHelperInstance = MetaDataHelper(self._data_frame, self._data_frame.count())
        self._data_frame = self._data_frame.withColumn(
            col_for_timelevels + '_temp', self.to_date_(col_for_timelevels))
        uniqueVals = self._data_frame.select(
            col_for_timelevels + '_temp').distinct().na.drop().limit(100).collect()
        try:
            date_format = self._metaHelperInstance.get_datetime_format(uniqueVals)
        except TypeError:
            date_format = None
        self._data_frame = self._data_frame.withColumn(
            col_for_timelevels + "_t_level",
            self.create_level_udf_time(dict, date_format)(col(col_for_timelevels + '_temp')))
        self._data_frame = self._data_frame.drop(col_for_timelevels + '_temp')
        return self._data_frame

    def create_equal_sized_measure_bins(self, column_name, number_of_bins):
        def create_dict_for_bin():
            min_max = self._data_frame.agg(F.min(column_name).alias('min'),
                                           F.max(column_name).alias('max')).collect()
            min_value = min_max[0]['min']
            max_value = min_max[0]['max']
            interval_size = (old_div((max_value - min_value) * 1.0, (number_of_bins - 1)))
            dict = {}
            temp = min_value
            while temp <= max_value:
                dict[str(round(temp, 3)) + "-" + str(round(temp + interval_size, 3))] = [
                    temp, temp + interval_size]
                temp = temp + interval_size
            return dict
        dict = create_dict_for_bin()
        self._data_frame = self._data_frame.withColumn(
            column_name + "_bin", self.create_bin_udf(dict)(col(column_name)))
        return self._data_frame

    def create_custom_measure_bins(self, column_name, list_of_intervals):
        def create_dict_for_bin():
            min_max = self._data_frame.agg(F.min(column_name).alias('min'),
                                           F.max(column_name).alias('max')).collect()
            min_value = min_max[0]['min']
            max_value = min_max[0]['max']
            dict = {}
            if list_of_intervals[0] > min_value:
                dict[str(min_value) + "-" + str(list_of_intervals[0])] = [min_value, list_of_intervals[0]]
            for i in range(len(list_of_intervals)):
                if i + 2 <= len(list_of_intervals):
                    dict[str(list_of_intervals[i]) + "-" + str(list_of_intervals[i + 1])] = [
                        list_of_intervals[i], list_of_intervals[i + 1]]
            if list_of_intervals[-1] < max_value:
                dict[str(list_of_intervals[-1]) + "-" + str(max_value)] = [list_of_intervals[-1], max_value]
            return dict
        dict = create_dict_for_bin()
        self._data_frame = self._data_frame.withColumn(
            column_name + "_c_bin", self.create_bin_udf(dict)(col(column_name)))
        return self._data_frame

    '''To be verified'''
    def replace_values_in_column(self, column_name, range, value):
        if False:
            if value == "median":
                dp_helper_obj = DataPreprocessingHelper(self._data_frame, self._dataframe_context)
                median_val = dp_helper_obj.get_median(self._data_frame, column_name)
                replace_value = median_val
                self._data_frame = self._data_frame.withColumn(
                    column_name,
                    when(((self._data_frame[column_name] >= range[0]) &
                          (self._data_frame[column_name] <= range[1])),
                         replace_value).otherwise(self._data_frame[column_name]))
            if value == "mode":
                dp_helper_obj = DataPreprocessingHelper(self._data_frame, self._dataframe_context)
                mode_val = dp_helper_obj.get_mode(self._data_frame, column_name)
                replace_value = mode_val
                self._data_frame = self._data_frame.withColumn(
                    column_name,
                    when(((self._data_frame[column_name] >= range[0]) &
                          (self._data_frame[column_name] <= range[1])),
                         replace_value).otherwise(self._data_frame[column_name]))
            else:
                replace_value = value
                self._data_frame = self._data_frame.withColumn(
                    column_name,
                    when(((self._data_frame[column_name] >= range[0]) &
                          (self._data_frame[column_name] <= range[1])),
                         replace_value).otherwise(self._data_frame[column_name]))
        else:
            if value == "median":
                dp_helper_obj = DataPreprocessingHelper(self._data_frame, self._dataframe_context)
                median_val = dp_helper_obj.get_median(self._data_frame, column_name)
                replace_value = median_val
                self._data_frame = self._data_frame.withColumn(
                    column_name + "_treated_" + str(range) + "_median",
                    when(self._data_frame[column_name] == range,
                         replace_value).otherwise(self._data_frame[column_name]))
            elif value == "mode":
                dp_helper_obj = DataPreprocessingHelper(self._data_frame, self._dataframe_context)
                mode_val = dp_helper_obj.get_mode(self._data_frame, column_name)
                replace_value = mode_val
                self._data_frame = self._data_frame.withColumn(
                    column_name + "_treated_" + str(range) + "_mode",
                    when(self._data_frame[column_name] == range,
                         replace_value).otherwise(self._data_frame[column_name]))
            elif value == "mean":
                dp_helper_obj = DataPreprocessingHelper(self._data_frame, self._dataframe_context)
                mean_value = self._data_frame.agg(avg(column_name)).first()[0]
                replace_value = mean_value
                self._data_frame = self._data_frame.withColumn(
                    column_name + "_treated_" + str(range) + "_mean",
                    when(self._data_frame[column_name] == range,
                         replace_value).otherwise(self._data_frame[column_name]))
            else:
                replace_value = value
                self._data_frame = self._data_frame.withColumn(
                    column_name + "_treated_" + str(range) + "_" + str(replace_value),
                    when(self._data_frame[column_name] == range,
                         replace_value).otherwise(self._data_frame[column_name]))
        return self._data_frame

    def standardize_column(self, column_name):
        def standardize_column_helper(mean, sd):
            return udf(lambda x: old_div((x - mean) * 1.0, sd) if x is not None else x)
        mean = self._data_frame.select(F.mean(column_name)).collect()[0][0]
        StdDev = self._data_frame.select(F.stddev_samp(column_name)).collect()[0][0]
        self._data_frame = self._data_frame.withColumn(
            column_name + "_fs_standardized",
            standardize_column_helper(mean, StdDev)(col(column_name)))
        self._data_frame = self._data_frame.withColumn(
            column_name + "_fs_standardized",
            self._data_frame[column_name + "_fs_standardized"].cast('float'))
        return self._data_frame

    '''Rounds off the returned value ==> values formed are either 0 or 1'''
    def normalize_column(self, column_name):
        def normalize_column_helper(min, max):
            return udf(lambda x: old_div((x - min) * 1.0, (max - min)) if x is not None else x)
        max = self._data_frame.select(F.max(column_name)).collect()[0][0]
        min = self._data_frame.select(F.min(column_name)).collect()[0][0]
        self._data_frame = self._data_frame.withColumn(
            column_name + "_fs_normalized",
            normalize_column_helper(min, max)(col(column_name)))
        self._data_frame = self._data_frame.withColumn(
            column_name + "_fs_normalized",
            self._data_frame[column_name + "_fs_normalized"].cast('float'))
        return self._data_frame

    def replacerUDF(self, value, operation):
        if operation == "prod":
            return udf(lambda x: x * value if x is not None else x)
        if operation == "add":
            return udf(lambda x: x + value if x is not None else x)
        if operation == "subs":
            return udf(lambda x: x - value if x is not None else x)
        if operation == "divide":
            return udf(lambda x: old_div(x, value) if x is not None else x)
        if operation == "Reciprocal":
            return udf(lambda x: old_div(1, x) if x is not None else x)
        if operation == "NthRoot":
            try:
                return udf(lambda x: x ** (1.0 / value) if x is not None else x)
            except:
                return udf(lambda x: x)
        if operation == "exponential":
            return udf(lambda x: x ** value if x is not None else x)
        if operation == "logTransform":
            return udf(lambda x: math.log(x, 10) if x is not None else x)
        if operation == "modulus":
            return udf(lambda x: abs(x) if x is not None else x)

    def logTransform_column(self, column_name):
        column_min = self._data_frame.select(F.min(column_name)).collect()[0][0]
        value_to_be_added = abs(column_min) + 1
        if column_min > 0:
            self._data_frame = self._data_frame.withColumn(
                column_name + "_vt_log_transformed",
                self.replacerUDF(10, "logTransform")(col(column_name)))
            self._data_frame = self._data_frame.withColumn(
                column_name + "_vt_log_transformed",
                self._data_frame[column_name + "_vt_log_transformed"].cast('float'))
        else:
            self._data_frame = self._data_frame.withColumn(
                column_name + "_temp_transformed",
                self.replacerUDF(value_to_be_added, "add")(col(column_name)))
            self._data_frame = self._data_frame.withColumn(
                column_name + "_vt_log_transformed",
                self.replacerUDF(10, "logTransform")(col(column_name + "_temp_transformed")))
            self._data_frame = self._data_frame.withColumn(
                column_name + "_vt_log_transformed",
                self._data_frame[column_name + "_vt_log_transformed"].cast('float'))
            self._data_frame = self._data_frame.drop(column_name + "_temp_transformed")
        return self._data_frame

    def modulus_transform_column(self, column_name):
        self._data_frame = self._data_frame.withColumn(
            column_name + "_vt_modulus_transformed",
            self.replacerUDF(10, "modulus")(col(column_name)))
        self._data_frame = self._data_frame.withColumn(
            column_name + "_vt_modulus_transformed",
            self._data_frame[column_name + "_vt_modulus_transformed"].cast('float'))
        return self._data_frame

    def cuberoot_transform_column(self, column_name):
        self._data_frame = self._data_frame.withColumn(
            column_name + "_vt_cuberoot_transformed",
            self.replacerUDF(3, "NthRoot")(col(column_name)))
        self._data_frame = self._data_frame.withColumn(
            column_name + "_vt_cuberoot_transformed",
            self._data_frame[column_name + "_vt_cuberoot_transformed"].cast('float'))
        return self._data_frame

    def squareroot_transform_column(self, column_name):
        column_min = self._data_frame.select(F.min(column_name)).collect()[0][0]
        if column_min >= 0:
            self._data_frame = self._data_frame.withColumn(
                column_name + "_vt_squareroot_transformed",
                self.replacerUDF(2, "NthRoot")(col(column_name)))
            self._data_frame = self._data_frame.withColumn(
                column_name + "_vt_squareroot_transformed",
                self._data_frame[column_name + "_vt_squareroot_transformed"].cast('float'))
        else:
            self._data_frame = self._data_frame.withColumn(
                column_name + "_vt_squareroot_transformed", F.lit(0))
        return self._data_frame

    def label_encoding_column(self, column_name):
        indexers = [StringIndexer(inputCol=column_name,
                                  outputCol=column_name + "_ed_label_encoded",
                                  handleInvalid="keep").fit(self._data_frame)]
        pipeline = Pipeline(stages=indexers)
        self._data_frame = pipeline.fit(self._data_frame).transform(self._data_frame)
        return self._data_frame

    #Need to check for an alternative for oneHot Encoding for Pyspark
    def onehot_encoding_column(self, column_name):
        self._data_frame = self.label_encoding_column(column_name)
        encoder = OneHotEncoder(dropLast=False,
                                inputCol=column_name + "_ed_label_encoded",
                                outputCol=column_name + "_ed_one_hot_encoded")
        self._data_frame = encoder.transform(self._data_frame)
        self._data_frame = self._data_frame.withColumn(
            column_name + "_ed_one_hot_encoded",
            self._data_frame[column_name + "_ed_one_hot_encoded"].cast('string'))
        self._data_frame = self._data_frame.drop(column_name + "_ed_label_encoded")
        return self._data_frame

    def character_count_string(self, column_name):
        def character_count_string_helper():
            return udf(lambda x: x.count("") - 1 if x is not None else 0)
        self._data_frame = self._data_frame.withColumn(
            column_name + "_character_count",
            character_count_string_helper()(col(column_name)))
        self._data_frame = self._data_frame.withColumn(
            column_name + "_character_count",
            self._data_frame[column_name + "_character_count"].cast('float'))
        return self._data_frame

    def contains_word_helper(self, word):
        return udf(lambda x: False if x is None or x.lower().find(word) == -1 else True)

    def contains_word(self, column_name, word):
        # word = word.lower()
        self._data_frame = self._data_frame.withColumn(
            column_name + "_contains_" + word,
            self.contains_word_helper(word)(col(column_name)))
        return self._data_frame

    '''Given that all datetime columns follow same string format == "dd/MM/yyyy" for date'''
    def convert_to_timestamp(self, datetime_col, timeformat):
        timestamped = datetime_col + "_timestamped"
        self._data_frame = self._data_frame.withColumn(
            timestamped,
            to_timestamp(self._data_frame[datetime_col], timeformat).alias(datetime_col))
        return self._data_frame

    #Timeformat is hardcoded as "dd/MM/yyyy"
    def count_time_since(self, col_for_time_since, time_since_date):
        '''Columns to be passed for calculating duration need to be in TimeStamped format'''
        '''time_since_date should be in dd/MM/yyyy format'''
        # print "COUNT TIME SINCE - "
        # self._metaHelperInstance = MetaDataHelper(self._data_frame, self._data_frame.count())
        self._data_frame = self._data_frame.withColumn(
            col_for_time_since + '_temp', self.to_date_(col_for_time_since))
        uniqueVals = self._data_frame.select(
            col_for_time_since + '_temp').distinct().na.drop().limit(1000).collect()
        try:
            date_format = self._metaHelperInstance.get_datetime_format(uniqueVals)
            self._data_frame = self._data_frame.withColumn("TIME_SINCE_DATE", F.lit(time_since_date))
            to_date_udf = udf(lambda x: datetime.strptime(x, date_format) if x is not None else x,
                              DateType())
            self._data_frame = self._data_frame.withColumn(
                col_for_time_since + '_temp', to_date_udf(col(col_for_time_since + '_temp')))
            self._data_frame = self._data_frame.withColumn(
                "TIME_SINCE_DATE(Timestamped)",
                to_timestamp(self._data_frame["TIME_SINCE_DATE"], "dd/MM/yyyy"))
            self._data_frame = self._data_frame.withColumn(
                col_for_time_since + "_time_since",
                datediff(self._data_frame["TIME_SINCE_DATE(Timestamped)"],
                         self._data_frame[col_for_time_since + '_temp']))
            self._data_frame = self._data_frame.drop("TIME_SINCE_DATE", "TIME_SINCE_DATE(Timestamped)")
        except TypeError:
            self._data_frame = self._data_frame.withColumn("TIME_SINCE_DATE", F.lit(time_since_date))
            self._data_frame = self._data_frame.withColumn(
                "TIME_SINCE_DATE(Timestamped)",
                to_timestamp(self._data_frame["TIME_SINCE_DATE"], "dd/MM/yyyy"))
            self._data_frame = self._data_frame.withColumn(
                col_for_time_since + "_time_since",
                datediff(self._data_frame["TIME_SINCE_DATE(Timestamped)"],
                         self._data_frame[col_for_time_since + '_temp']))
            self._data_frame = self._data_frame.drop("TIME_SINCE_DATE", "TIME_SINCE_DATE(Timestamped)")
        # self._data_frame = self._data_frame.withColumn(col_for_time_since, to_timestamp(self._data_frame[col_for_time_since], "dd/MM/yyyy").alias(col_for_time_since))
        # self._data_frame = self._data_frame.withColumn(col_for_time_since, F.from_unixtime(F.unix_timestamp(self._data_frame[col_for_time_since]), "dd/MM/yyyy").alias(col_for_time_since))
        self._data_frame = self._data_frame.drop(col_for_time_since + '_temp')
        return self._data_frame

    #TODO - Check for timestamp conversion related issues if any
    def month_to_string(self, dict):
        def month_to_string_helper(x, dict):
            for key in list(dict.keys()):
                if int(x) == key:
                    return dict[key]
        #return udf(lambda x: dict_for_month_helper(x,dict))
        return udf(lambda x: month_to_string_helper(x, dict) if x is not None else x)

    #Timeformat is hardcoded as "dd/MM/yyyy"
    def extract_datetime_info(self, datetime_col, info_to_extract):
        self._data_frame = self._data_frame.withColumn(
            datetime_col + '_temp', self.to_date_(datetime_col))
        timestamped = datetime_col + "_timestamped"
        # self._metaHelperInstance = MetaDataHelper(self._data_frame, self._data_frame.count())
        uniqueVals = self._data_frame.select(
            datetime_col + '_temp').distinct().na.drop().limit(10).collect()
        try:
            date_format = self._metaHelperInstance.get_datetime_format(uniqueVals)
            to_date_udf = udf(lambda x: datetime.strptime(x, date_format) if x is not None else x,
                              DateType())
            self._data_frame = self._data_frame.withColumn(
                datetime_col + '_temp',
                to_date_udf(self._data_frame[datetime_col + '_temp']).alias(datetime_col + '_temp'))
            if info_to_extract == "year":
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_year",
                    year(to_timestamp(self._data_frame[datetime_col + '_temp'], "dd/MM/yyyy")))
            if info_to_extract == "month_of_year":
                dict = {1: "January", 2: "February", 3: "March", 4: "April",
                        5: "May", 6: "June", 7: "July", 8: "August",
                        9: "September", 10: "October", 11: "November", 12: "December"}
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_month",
                    month(to_timestamp(self._data_frame[datetime_col + '_temp'], "dd/MM/yyyy")))
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_etf_month_of_year",
                    self.month_to_string(dict)(col(datetime_col + "_month")))
            if info_to_extract == "day_of_month":
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_day_of_month",
                    dayofmonth(to_timestamp(self._data_frame[datetime_col + '_temp'], "dd/MM/yyyy")))
            if info_to_extract == "day_of_year":
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_day_of_year",
                    dayofyear(to_timestamp(self._data_frame[datetime_col + '_temp'], "dd/MM/yyyy")))
            if info_to_extract == "day_of_week":
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_etf_day_of_week", dayofweek(datetime_col + '_temp'))
            if info_to_extract == "week_of_year":
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_week_of_year",
                    weekofyear(to_timestamp(self._data_frame[datetime_col + '_temp'], "dd/MM/yyyy")))
            if info_to_extract == "hour":
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_hour",
                    hour(to_timestamp(self._data_frame[datetime_col + '_temp'], "dd/MM/yyyy")))
            if info_to_extract == "minute":
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_minute",
                    minute(to_timestamp(self._data_frame[datetime_col + '_temp'], "dd/MM/yyyy")))
            if info_to_extract == "date":
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_date",
                    to_timestamp(self._data_frame[datetime_col + '_temp'], "dd/MM/yyyy").cast("date"))
            else:
                pass
        except TypeError:
            if info_to_extract == "year":
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_year",
                    year(to_timestamp(self._data_frame[datetime_col + '_temp'], "dd/MM/yyyy")))
            if info_to_extract == "month_of_year":
                dict = {1: "January", 2: "February", 3: "March", 4: "April",
                        5: "May", 6: "June", 7: "July", 8: "August",
                        9: "September", 10: "October", 11: "November", 12: "December"}
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_month",
                    month(to_timestamp(self._data_frame[datetime_col + '_temp'], "dd/MM/yyyy")))
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_etf_month_of_year",
                    self.month_to_string(dict)(col(datetime_col + "_month")))
            if info_to_extract == "day_of_month":
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_day_of_month",
                    dayofmonth(to_timestamp(self._data_frame[datetime_col + '_temp'], "dd/MM/yyyy")))
            if info_to_extract == "day_of_year":
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_day_of_year",
                    dayofyear(to_timestamp(self._data_frame[datetime_col + '_temp'], "dd/MM/yyyy")))
            if info_to_extract == "day_of_week":
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_etf_day_of_week", dayofweek(datetime_col + '_temp'))
            if info_to_extract == "week_of_year":
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_week_of_year",
                    weekofyear(to_timestamp(self._data_frame[datetime_col + '_temp'], "dd/MM/yyyy")))
            if info_to_extract == "hour":
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_hour",
                    hour(to_timestamp(self._data_frame[datetime_col + '_temp'], "dd/MM/yyyy")))
            if info_to_extract == "minute":
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_minute",
                    minute(to_timestamp(self._data_frame[datetime_col + '_temp'], "dd/MM/yyyy")))
            if info_to_extract == "date":
                self._data_frame = self._data_frame.withColumn(
                    datetime_col + "_date",
                    to_timestamp(self._data_frame[datetime_col + '_temp'], "dd/MM/yyyy").cast("date"))
            else:
                pass
        self._data_frame = self._data_frame.drop(datetime_col + '_temp')
        # self._data_frame = self._data_frame.withColumn(datetime_col, to_timestamp(self._data_frame[datetime_col +'_temp'], "dd/MM/yyyy"))
        # self._data_frame = self._data_frame.withColumn(datetime_col, F.from_unixtime(F.unix_timestamp(self._data_frame[datetime_col +'_temp']), "dd/MM/yyyy"))
        return self._data_frame

    def is_weekend_helper(self):
        def weekend_checker(x):
            if (int(x) < 6):
                return False
            else:
                return True
        return udf(lambda x: weekend_checker(x) if x is not None else x)

    #Timeformat is hardcoded as "dd/MM/yyyy"
    #TODO - weekend check currently keys off dayofmonth; dayofweek may be the intended source
    def is_weekend(self, datetime_col):
        # self._metaHelperInstance = MetaDataHelper(self._data_frame, self._data_frame.count())
        self._data_frame = self._data_frame.withColumn(
            datetime_col + '_temp', self.to_date_(datetime_col))
        uniqueVals = self._data_frame.select(
            datetime_col + '_temp').distinct().na.drop().limit(10).collect()
        try:
            date_format = self._metaHelperInstance.get_datetime_format(uniqueVals)
            to_date_udf = udf(lambda x: datetime.strptime(x, date_format) if x is not None else x,
                              DateType())
            self._data_frame = self._data_frame.withColumn(
                datetime_col + '_temp', to_date_udf(col(datetime_col + '_temp')))
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_day", dayofmonth(datetime_col + '_temp'))
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_is_weekend",
                self.is_weekend_helper()(col(datetime_col + "_day")))
            self._data_frame = self._data_frame.drop(datetime_col + "_day")
        except TypeError:
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_day", dayofmonth(datetime_col + '_temp'))
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_is_weekend",
                self.is_weekend_helper()(col(datetime_col + "_day")))
            self._data_frame = self._data_frame.drop(datetime_col + "_day")
        self._data_frame = self._data_frame.drop(datetime_col + '_temp')
        # self._data_frame = self._data_frame.withColumn(datetime_col, to_timestamp(self._data_frame[datetime_col], "dd/MM/yyyy"))
        # self._data_frame = self._data_frame.withColumn(datetime_col, F.from_unixtime(F.unix_timestamp(self._data_frame[datetime_col]), "dd/MM/yyyy"))
        return self._data_frame
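# Illustrative usage sketch for the Spark helper above (hypothetical column names;
# assumes an active SparkSession, a DataFrame `df` with a numeric "sales" column and
# a string "city" column, and a valid `dataframe_context`):
#
#     fe_helper = FeatureEngineeringHelper(df, dataframe_context)
#     df = fe_helper.create_equal_sized_measure_bins("sales", number_of_bins=5)
#     df = fe_helper.label_encoding_column("city")
#     df.select("sales_bin", "city_ed_label_encoded").show(5)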
class FeatureEngineeringHelperPandas(object): """Contains Feature Engineering Operation Functions""" def __init__(self, df, dataframe_context): self._data_frame = df self._metaHelperInstance = MetaDataHelper(self._data_frame, self._data_frame.shape[0]) self._dataframe_context = dataframe_context # self._dataframe_helper = dataframe_helper def binning_all_measures(self, number_of_bins, consider_cols): numeric_columns = [] cols_to_be_binned = [x[:-4] for x in consider_cols if x[-4:] == "_bin"] numeric_columns = [col for col in self._data_frame.columns if self._data_frame[col].dtypes in ['int32', 'int64', 'float32', 'float64', 'int', 'float']] for column_name in numeric_columns and cols_to_be_binned: self._data_frame = self.create_equal_sized_measure_bins(column_name, number_of_bins) return self._data_frame def check_key(self, x, bin_label): if x is not None: for key in list(bin_label.keys()): if x >= bin_label[key][0] and x <= bin_label[key][1]: return key else: return "None" def create_level(self, x, level_dict, selected_list): for key in list(level_dict.keys()): if x in selected_list: if x in level_dict[key]: return key else: return x def create_new_levels_dimension(self, column_name, level_dict): selected_list = [] for key in list(level_dict.keys()): selected_list = selected_list + level_dict[key] self._data_frame[column_name + "_level"] = self._data_frame[column_name].apply(self.create_level, level_dict=level_dict, selected_list=selected_list) return self._data_frame def create_level_udf_time(self, dict, date_format): pass def convert_to_date(self, value, date_format): if isinstance(value, str): value = pd.to_datetime(value, format='%d/%m/%Y') elif isinstance(value, str): value = pd.to_datetime(value, format='%d/%m/%Y') else: value = value return value def convert_to_date_from_level_value(self, value): value = pd.to_datetime(value, format='%d/%m/%Y') return value def check_key_date_bins(self, date, dict, date_format): if date is not None: date = self.convert_to_date(date, date_format) for key, value in list(dict.items()): val1_date = self.convert_to_date_from_level_value(value[0]) val2_date = self.convert_to_date_from_level_value(value[1]) date_range = [val1_date, val2_date] if date >= date_range[0] and date <= date_range[1]: return key else: return "None" def create_new_levels_datetimes(self, col_for_timelevels, dict): self._data_frame[col_for_timelevels + '_temp'] = pd.to_datetime(self._data_frame[col_for_timelevels], errors='ignore') unique_vals = self._data_frame[col_for_timelevels + '_temp'].head(15) try: date_format = self._metaHelperInstance.get_datetime_format_pandas(unique_vals) except: date_format = None self._data_frame[col_for_timelevels + '_t_level'] = self._data_frame[col_for_timelevels + '_temp'].apply( self.check_key_date_bins, dict=dict, date_format=date_format) self._data_frame = self._data_frame.drop(col_for_timelevels + '_temp', axis=1) return self._data_frame def create_equal_sized_measure_bins(self, column_name, number_of_bins): def create_dict_for_bin(): min_value = np.min(self._data_frame[column_name]) max_value = np.max(self._data_frame[column_name]) interval_size = (old_div((max_value - min_value) * 1.0, (number_of_bins - 1))) bin_dict = {} temp = min_value while temp <= max_value: bin_dict[str(round(temp, 3)) + "-" + str(round(temp + interval_size, 3))] = [temp, temp + interval_size] temp = temp + interval_size return bin_dict bin_dict = create_dict_for_bin() self._data_frame[column_name + "_bin"] = self._data_frame[column_name].apply(self.check_key, 
    def create_custom_measure_bins(self, column_name, list_of_intervals):
        def create_dict_for_bin():
            min_value = np.min(self._data_frame[column_name])
            max_value = np.max(self._data_frame[column_name])
            bin_dict = {}
            if list_of_intervals[0] > min_value:
                bin_dict[str(min_value) + "-" + str(list_of_intervals[0])] = [min_value, list_of_intervals[0]]
            for i in range(len(list_of_intervals)):
                if i + 2 <= len(list_of_intervals):
                    bin_dict[str(list_of_intervals[i]) + "-" + str(list_of_intervals[i + 1])] = [
                        list_of_intervals[i], list_of_intervals[i + 1]]
            if list_of_intervals[-1] < max_value:
                bin_dict[str(list_of_intervals[-1]) + "-" + str(max_value)] = [list_of_intervals[-1], max_value]
            return bin_dict

        bin_dict = create_dict_for_bin()
        self._data_frame[column_name + "_c_bin"] = self._data_frame[column_name].apply(
            self.check_key, bin_label=bin_dict)
        return self._data_frame

    def replace_values_in_column(self, column_name, value_range, value):
        # `value_range` was originally named `range`, shadowing the builtin.
        if value == "median":
            dp_helper_obj = DataPreprocessingHelperPandas(self._data_frame, self._dataframe_context)
            replace_value = dp_helper_obj.get_median(self._data_frame, column_name)
            self._data_frame[column_name + "_treated_" + str(value_range) + "_median"] = \
                self._data_frame[column_name].apply(lambda x: replace_value if x == value_range else x)
        elif value == "mode":
            dp_helper_obj = DataPreprocessingHelperPandas(self._data_frame, self._dataframe_context)
            replace_value = dp_helper_obj.get_mode(self._data_frame, column_name)
            self._data_frame[column_name + "_treated_" + str(value_range) + "_mode"] = \
                self._data_frame[column_name].apply(lambda x: replace_value if x == value_range else x)
        elif value == "mean":
            replace_value = np.mean(self._data_frame[column_name])
            self._data_frame[column_name + "_treated_" + str(value_range) + "_mean"] = \
                self._data_frame[column_name].apply(lambda x: replace_value if x == value_range else x)
        else:
            replace_value = value
            self._data_frame[column_name + "_treated_" + str(value_range) + "_" + str(replace_value)] = \
                self._data_frame[column_name].apply(lambda x: replace_value if x == value_range else x)
        return self._data_frame

    def standardize_column(self, column_name):
        # Series.apply expects a callable; the original wrapped the lambda in
        # a single-element list, which does not apply element-wise.
        # E.g. for values [10, 20, 30]: mean = 20, std = 10, so the result is
        # [-1.0, 0.0, 1.0].
        def standardize_column_helper(mean, sd):
            return lambda x: round(float((x - mean) * 1.0 / sd), 3) if x is not None else x

        mean, std_dev = self._data_frame[column_name].mean(), self._data_frame[column_name].std()
        self._data_frame[column_name + '_fs_standardized'] = self._data_frame[column_name].apply(
            standardize_column_helper(mean, std_dev))
        return self._data_frame

    def normalize_column(self, column_name):
        # E.g. for values [10, 20, 30]: min = 10, max = 30, so the result is
        # [0.0, 0.5, 1.0]. Same list-wrapping fix as standardize_column.
        def normalize_column_helper(col_min, col_max):
            return lambda x: round(float((x - col_min) * 1.0 / (col_max - col_min)), 3) if x is not None else x

        max_value, min_value = self._data_frame[column_name].max(), self._data_frame[column_name].min()
        self._data_frame[column_name + '_fs_normalized'] = self._data_frame[column_name].apply(
            normalize_column_helper(min_value, max_value))
        return self._data_frame

    def replacerUDF(self, value, operation):
        # Each branch returns a plain callable for Series.apply. The original
        # list-wrapped every lambda and used int(x, value) / int(1, x) for
        # "divide" and "Reciprocal", which are base-conversion calls rather
        # than arithmetic; both are corrected below.
        if operation == "prod":
            return lambda x: x * value if x is not None else x
        if operation == "add":
            return lambda x: x + value if x is not None else x
        if operation == "subs":
            return lambda x: x - value if x is not None else x
        if operation == "divide":
            return lambda x: x / value if x is not None else x
        if operation == "Reciprocal":
            return lambda x: 1.0 / x if x is not None else x
        if operation == "NthRoot":
            # May raise at call time for negative bases with fractional roots.
            return lambda x: x ** (1.0 / value) if x is not None else x
        if operation == "exponential":
            return lambda x: x ** value if x is not None else x
        if operation == "logTransform":
            return lambda x: math.log(x, 10) if x is not None else x
        if operation == "modulus":
            return lambda x: abs(x) if x is not None else x

    def logTransform_column(self, column_name):
        column_min = self._data_frame[column_name].min()
        value_to_be_added = abs(column_min) + 1
        if column_min > 0:
            self._data_frame[column_name + "_vt_log_transformed"] = self._data_frame[column_name].apply(
                self.replacerUDF(10, "logTransform"))
            self._data_frame[column_name + "_vt_log_transformed"] = self._data_frame[
                column_name + "_vt_log_transformed"].astype('float')
        else:
            # Shift the column so every value is positive before taking logs.
            self._data_frame[column_name + "_temp_transformed"] = self._data_frame[column_name].apply(
                self.replacerUDF(value_to_be_added, "add"))
            self._data_frame[column_name + "_vt_log_transformed"] = self._data_frame[
                column_name + "_temp_transformed"].apply(self.replacerUDF(10, "logTransform"))
            self._data_frame[column_name + "_vt_log_transformed"] = self._data_frame[
                column_name + "_vt_log_transformed"].astype('float')
            self._data_frame = self._data_frame.drop(column_name + "_temp_transformed", axis=1)
        return self._data_frame

    def modulus_transform_column(self, column_name):
        self._data_frame[column_name + "_vt_modulus_transformed"] = self._data_frame[column_name].apply(
            self.replacerUDF(10, "modulus"))
        self._data_frame[column_name + "_vt_modulus_transformed"] = self._data_frame[
            column_name + "_vt_modulus_transformed"].astype('float')
        return self._data_frame

    def cuberoot_transform_column(self, column_name):
        self._data_frame[column_name + "_vt_cuberoot_transformed"] = self._data_frame[column_name].apply(
            self.replacerUDF(3, "NthRoot"))
        self._data_frame[column_name + "_vt_cuberoot_transformed"] = self._data_frame[
            column_name + "_vt_cuberoot_transformed"].astype('float')
        return self._data_frame

    def squareroot_transform_column(self, column_name):
        column_min = self._data_frame[column_name].min()
        if column_min >= 0:
            self._data_frame[column_name + "_vt_squareroot_transformed"] = self._data_frame[column_name].apply(
                self.replacerUDF(2, "NthRoot"))
            self._data_frame[column_name + "_vt_squareroot_transformed"] = self._data_frame[
                column_name + "_vt_squareroot_transformed"].astype('float')
        else:
            self._data_frame[column_name + "_vt_squareroot_transformed"] = 0
        return self._data_frame

    def label_encoding_column(self, column_name):
        self._data_frame[column_name + '_ed_label_encoded'] = LabelEncoder().fit_transform(
            self._data_frame[column_name].astype(str))
        return self._data_frame

    def onehot_encoding_column(self, column_name):
        if self._data_frame[column_name].isnull().any():
            self._data_frame[column_name].fillna(self._data_frame[column_name].mode()[0], inplace=True)
        temp = self._data_frame[[column_name]]
        enc = OneHotEncoder_pandas(drop='first')
        k1 = enc.fit_transform(temp).toarray()
        temp = pd.DataFrame(k1, columns=list(enc.get_feature_names()))
        feature_names = list(enc.get_feature_names())
        temp.set_index(self._data_frame.index, inplace=True)
        for col_name in feature_names:
            self._data_frame[column_name + '_' + col_name.partition('_')[2] + '_one_hot'] = temp[col_name].astype('int')
        # X.drop(column_name,axis = 1,inplace = True)
        return self._data_frame

    def character_count_string(self, column_name):
        # len(x) is equivalent to the original x.count("") - 1 for strings.
        self._data_frame[column_name + "_character_count"] = self._data_frame[column_name].apply(
            lambda x: len(x) if x is not None else 0)
        self._data_frame[column_name + "_character_count"] = self._data_frame[
            column_name + "_character_count"].astype('float')
        return self._data_frame

    def contains_word(self, column_name, word):
        # word = word.lower()
        self._data_frame[column_name + "_contains_" + word] = self._data_frame[column_name].apply(
            lambda x: x is not None and x.lower().find(word) != -1)
        return self._data_frame

    def convert_to_timestamp(self, datetime_col, timeformat):
        pass

    def count_time_since(self, col_for_time_since, time_since_date):
        self._data_frame[col_for_time_since + '_temp'] = pd.to_datetime(
            self._data_frame[col_for_time_since], errors='coerce')
        unique_vals = self._data_frame[col_for_time_since].drop_duplicates().head(10)
        try:
            date_format = self._metaHelperInstance.get_datetime_format_pandas(unique_vals)
            self._data_frame['TIME_SINCE_DATE'] = time_since_date
            # Re-parse the raw column with the detected format; the original
            # re-parsed the already-converted '_temp' column, on which an
            # explicit format has no effect.
            self._data_frame[col_for_time_since + '_temp'] = pd.to_datetime(
                self._data_frame[col_for_time_since], format=date_format, errors='coerce')
            self._data_frame['TIME_SINCE_DATE_Timestamped'] = pd.to_datetime(
                self._data_frame['TIME_SINCE_DATE'], format='%d/%m/%Y')
            self._data_frame[col_for_time_since + "_time_since"] = \
                self._data_frame['TIME_SINCE_DATE_Timestamped'] - self._data_frame[col_for_time_since + '_temp']
            self._data_frame[col_for_time_since + "_time_since"] = \
                self._data_frame[col_for_time_since + "_time_since"] / np.timedelta64(1, 'D')
            self._data_frame[col_for_time_since + "_time_since"] = \
                self._data_frame[col_for_time_since + "_time_since"].apply(np.ceil)
        except:
            self._data_frame['TIME_SINCE_DATE'] = time_since_date
            # The original also passed infer_datetime_format=True alongside an
            # explicit format; the flag is deprecated and redundant here.
            self._data_frame['TIME_SINCE_DATE_Timestamped'] = pd.to_datetime(
                self._data_frame['TIME_SINCE_DATE'], format='%d/%m/%Y')
            self._data_frame[col_for_time_since + "_time_since"] = \
                self._data_frame['TIME_SINCE_DATE_Timestamped'] - self._data_frame[col_for_time_since + '_temp']
            self._data_frame[col_for_time_since + "_time_since"] = \
                self._data_frame[col_for_time_since + "_time_since"] / np.timedelta64(1, 'D')
            self._data_frame[col_for_time_since + "_time_since"] = \
                self._data_frame[col_for_time_since + "_time_since"].apply(np.ceil)
        self._data_frame = self._data_frame.drop(["TIME_SINCE_DATE", "TIME_SINCE_DATE_Timestamped"], axis=1)
        self._data_frame = self._data_frame.drop([col_for_time_since + '_temp'], axis=1)
        return self._data_frame

    def month_to_string(self, dict):
        pass

    def extract_datetime_info(self, datetime_col, info_to_extract):
        # The original wrapped identical extraction code in both the try and
        # except branches of a format-detection block whose result was never
        # used (and created a throwaway '_temp' column); a single pass is
        # equivalent.
        self._data_frame[datetime_col] = pd.to_datetime(self._data_frame[datetime_col], errors='ignore')
        if info_to_extract == "year":
            self._data_frame[datetime_col + '_year'] = self._data_frame[datetime_col].dt.year
        if info_to_extract == "month_of_year":
            self._data_frame[datetime_col + '_etf_month_of_year'] = self._data_frame[datetime_col].dt.month
        if info_to_extract == "day_of_month":
            self._data_frame[datetime_col + '_day_of_month'] = self._data_frame[datetime_col].dt.day
        if info_to_extract == "day_of_year":
            self._data_frame[datetime_col + '_day_of_year'] = self._data_frame[datetime_col].dt.dayofyear
        if info_to_extract == "day_of_week":
            self._data_frame[datetime_col + '_etf_day_of_week'] = self._data_frame[datetime_col].dt.dayofweek
        if info_to_extract == "week_of_year":
            self._data_frame[datetime_col + '_week_of_year'] = self._data_frame[datetime_col].dt.weekofyear
        if info_to_extract == "hour":
            self._data_frame[datetime_col + '_hour'] = self._data_frame[datetime_col].dt.hour
        if info_to_extract == "minute":
            self._data_frame[datetime_col + '_minute'] = self._data_frame[datetime_col].dt.minute
        if info_to_extract == "date":
            self._data_frame[datetime_col + '_date'] = self._data_frame[datetime_col].dt.date
        return self._data_frame

    def is_weekend_helper(self):
        pass

    def is_weekend(self, datetime_col):
        self._data_frame[datetime_col + '_temp'] = pd.to_datetime(self._data_frame[datetime_col], errors='coerce')
        unique_vals = self._data_frame[datetime_col].drop_duplicates().head(10)
        try:
            date_format = self._metaHelperInstance.get_datetime_format_pandas(unique_vals)
            self._data_frame[datetime_col + '_temp'] = pd.to_datetime(
                self._data_frame[datetime_col], format=date_format, errors='coerce')
            # The original read .dt.dayofweek from the raw column and wrote to
            # "<col>is_weekend"; use the parsed '_temp' column and the same
            # "_is_weekend" suffix as the Spark implementation.
            self._data_frame[datetime_col + '_is_weekend'] = np.where(
                self._data_frame[datetime_col + '_temp'].dt.dayofweek >= 5, 'True', 'False')
        except:
            self._data_frame[datetime_col + '_is_weekend'] = np.where(
                self._data_frame[datetime_col + '_temp'].dt.dayofweek >= 5, 'True', 'False')
        self._data_frame = self._data_frame.drop(datetime_col + '_temp', axis=1)
        return self._data_frame
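# Example usage of FeatureEngineeringHelperPandas (illustrative only; the file
# name and column names are placeholders, and dataframe_context is passed as
# None since the methods exercised here never touch it):
#
#   import pandas as pd
#   df = pd.read_csv("orders.csv")
#   fe = FeatureEngineeringHelperPandas(df, None)
#   df = fe.binning_all_measures(5, ["amount_bin"])   # equal-sized bins -> "amount_bin"
#   df = fe.standardize_column("amount")              # adds "amount_fs_standardized"
#   df = fe.is_weekend("order_date")                  # adds "order_date_is_weekend"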
def run(self):
    self._start_time = time.time()
    metaHelperInstance = MetaDataHelper(self._data_frame, self._total_rows)
    sampleData = metaHelperInstance.get_sample_data()
    if not self._pandas_flag:
        sampleData = sampleData.toPandas()
    time_taken_sampling = time.time() - self._start_time
    self._completionStatus += self._scriptStages["sampling"]["weight"]
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName, "sampling", "info",
        self._scriptStages["sampling"]["summary"],
        self._completionStatus, self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=self._ignoreMsgFlag)
    metaData = []
    metaData.append(MetaData(name="noOfRows", value=self._total_rows, display=True, displayName="Rows"))
    metaData.append(MetaData(name="noOfColumns", value=self._total_columns, display=True, displayName="Columns"))
    # self._percentage_columns = metaHelperInstance.get_percentage_columns(self._string_columns)
    separation_time = time.time()
    self._timestamp_string_columns = []
    uniqueVals = []
    dateTimeSuggestions = {}
    if not self._pandas_flag:
        for column in self._string_columns:
            if self._column_type_dict[column]["actual"] != "boolean":
                # uniqueVals = self._data_frame.select(column).na.drop().distinct().limit(10).collect()
                uniqueVals = sampleData[column].unique().tolist()
            else:
                uniqueVals = []
            ## TODO : remove pandas if not needed later
            # NOTE: the `self._pandas_flag` branches below are unreachable
            # under the enclosing `if not self._pandas_flag:` guard; kept
            # verbatim pending the TODO above.
            if self._pandas_flag:
                if len(uniqueVals) > 0 and metaHelperInstance.get_datetime_format_pandas(
                        [self._data_frame.sort_values(by=column, ascending=False)[column][0]]) != None:
                    dateColumnFormat = metaHelperInstance.get_datetime_format_pandas(uniqueVals)
                else:
                    dateColumnFormat = None
            else:
                if len(uniqueVals) > 0 and metaHelperInstance.get_datetime_format(
                        [self._data_frame.orderBy([column], ascending=[False]).select(column).first()[0]]) != None:
                    dateColumnFormat = metaHelperInstance.get_datetime_format(uniqueVals)
                else:
                    dateColumnFormat = None
            if dateColumnFormat:
                dateTimeSuggestions.update({column: dateColumnFormat})
                data = ColumnData()
                data.set_level_count_to_null()
                data.set_chart_data_to_null()
                data.set_date_suggestion_flag(True)
                data.set_abstract_datatype("datetime")
                data.set_actual_datatype("datetime")
                self._timestamp_string_columns.append(column)
                ## TODO : remove pandas if not needed later
                if self._pandas_flag:
                    self._data_frame[column] = pd.to_datetime(self._data_frame[column], format=dateColumnFormat)
                else:
                    self._data_frame = self._data_frame.withColumn(column, self.to_date_(column))
    sampleData = metaHelperInstance.format_sampledata_timestamp_columns(
        sampleData, self._timestamp_columns, self._stripTimestamp)
    print("sampling takes", time_taken_sampling)
    self._string_columns = list(set(self._string_columns) - set(self._timestamp_string_columns))
    self._timestamp_columns = self._timestamp_columns + self._timestamp_string_columns
    # self.update_column_type_dict()
    print("time taken for separating date columns from string is :", time.time() - separation_time)
    # if len(self._percentage_columns)>0:
    #     self._data_frame = CommonUtils.convert_percentage_columns(self._data_frame,self._percentage_columns)
    #     self._numeric_columns = self._numeric_columns + self._percentage_columns
    #     self._string_columns = list(set(self._string_columns)-set(self._percentage_columns))
    #     self.update_column_type_dict()
    # self._dollar_columns = metaHelperInstance.get_dollar_columns(self._string_columns)
    # if len(self._dollar_columns)>0:
    #     self._data_frame = CommonUtils.convert_dollar_columns(self._data_frame,self._dollar_columns)
    #     self._numeric_columns = self._numeric_columns + self._dollar_columns
    #     self._string_columns = list(set(self._string_columns)-set(self._dollar_columns))
    #     self.update_column_type_dict()
    columnData = []
    headers = []
    self._start_time = time.time()
    print("Count of Numeric columns", len(self._numeric_columns))
    try:
        measureColumnStat, measureCharts = metaHelperInstance.calculate_measure_column_stats(
            self._data_frame, self._numeric_columns,
            binColumn=self._binned_stat_flag, pandas_flag=self._pandas_flag)
    except Exception as e:
        raise Exception(e)
    time_taken_measurestats = time.time() - self._start_time
    self._completionStatus += self._scriptStages["measurestats"]["weight"]
    print("measure stats takes", time_taken_measurestats)
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName, "measurestats", "info",
        self._scriptStages["measurestats"]["summary"],
        self._completionStatus, self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=self._ignoreMsgFlag)
    print("Count of DateTime columns", len(self._timestamp_columns))
    self._start_time = time.time()
    # time_columns=self._timestamp_columns
    # time_string_columns=self._timestamp_string_columns
    # original_timestamp_columns=list(set(self._timestamp_columns)-set(self._timestamp_string_columns))
    timeDimensionColumnStat, timeDimensionCharts, unprocessed_columns = \
        metaHelperInstance.calculate_time_dimension_column_stats(
            self._data_frame, self._timestamp_columns,
            level_count_flag=self._level_count_flag, pandas_flag=self._pandas_flag)
    self._string_columns = self._string_columns + unprocessed_columns
    self._timestamp_columns = list(set(self._timestamp_columns) - set(unprocessed_columns))
    self.update_column_type_dict()
    if len(self._numeric_columns) > 1:
        # print "self._numeric_columns : ", self._numeric_columns
        metaData.append(MetaData(name="measures", value=len(self._numeric_columns), display=True, displayName="Measures"))
    else:
        metaData.append(MetaData(name="measures", value=len(self._numeric_columns), display=True, displayName="Measure"))
    if len(self._string_columns) > 1:
        metaData.append(MetaData(name="dimensions", value=len(self._string_columns + self._boolean_columns), display=True, displayName="Dimensions"))
    else:
        metaData.append(MetaData(name="dimensions", value=len(self._string_columns + self._boolean_columns), display=True, displayName="Dimension"))
    if len(self._timestamp_columns) > 1:
        metaData.append(MetaData(name="timeDimension", value=len(self._timestamp_columns), display=True, displayName="Time Dimensions"))
    else:
        metaData.append(MetaData(name="timeDimension", value=len(self._timestamp_columns), display=True, displayName="Time Dimension"))
    metaData.append(MetaData(name="measureColumns", value=self._numeric_columns, display=False))
    metaData.append(MetaData(name="dimensionColumns", value=self._string_columns + self._boolean_columns, display=False))
    metaData.append(MetaData(name="timeDimensionColumns", value=self._timestamp_columns, display=False))
    # metaData.append(MetaData(name="percentageColumns",value = self._percentage_columns,display=False))
    # metaData.append(MetaData(name="dollarColumns",value = self._dollar_columns,display=False))
    # timeDimensionColumnStat2,timeDimensionCharts2,unprocessed_columns = metaHelperInstance.calculate_time_dimension_column_stats_from_string(self._data_frame,self._timestamp_string_columns,level_count_flag=self._level_count_flag)
    # gc.collect()
    # timeDimensionColumnStat.update(timeDimensionColumnStat2)
    # timeDimensionCharts.update(timeDimensionCharts2)
    time_taken_tdstats = time.time() - self._start_time
    self._completionStatus += self._scriptStages["timedimensionstats"]["weight"]
    print("time dimension stats takes", time_taken_tdstats)
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName, "timedimensionstats", "info",
        self._scriptStages["timedimensionstats"]["summary"],
        self._completionStatus, self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=self._ignoreMsgFlag)
    self._start_time = time.time()
    try:
        dimensionColumnStat, dimensionCharts = metaHelperInstance.calculate_dimension_column_stats(
            self._data_frame, self._string_columns + self._boolean_columns,
            levelCount=self._level_count_flag, pandas_flag=self._pandas_flag)
    except Exception as e:
        raise Exception(e)
    self._dataSize["dimensionLevelCountDict"] = {
        k: [x for x in v if x["name"] == "numberOfUniqueValues"][0]["value"]
        for k, v in list(dimensionColumnStat.items())}
    self._dataSize["totalLevels"] = sum(self._dataSize["dimensionLevelCountDict"].values())
    time_taken_dimensionstats = time.time() - self._start_time
    self._completionStatus += self._scriptStages["dimensionstats"]["weight"]
    # print "dimension stats takes",time_taken_dimensionstats
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName, "dimensionstats", "info",
        self._scriptStages["dimensionstats"]["summary"],
        self._completionStatus, self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=self._ignoreMsgFlag)
    self._start_time = time.time()
    ignoreColumnSuggestions = []
    ignoreColumnReason = []
    utf8ColumnSuggestion = []
    dup_cols = []
    # columns = self._data_frame.columns
    measureDupCols = self.checkDupColName(measureColumnStat)
    dimensionDupCols = self.checkDupColName(dimensionColumnStat)
    timeDimensionDupCols = self.checkDupColName(timeDimensionColumnStat)
    # The original membership tests below looked for dict(name="Duplicate",
    # value=True), which never matches the dict(name="Duplicate", value=i[0])
    # entries actually appended; the tests now check for what is stored.
    if self._pandas_flag:
        for i in measureDupCols:
            if self.checkDuplicateCols_pandas(i[0], i[1]) == True:
                for j in i[1:]:
                    if dict(name="Duplicate", value=i[0]) not in measureColumnStat[j]:
                        measureColumnStat[j].append(dict(name="Duplicate", value=i[0]))
        for i in dimensionDupCols:
            if self.checkDuplicateCols_pandas(i[0], i[1]) == True:
                for j in i[1:]:
                    if dict(name="Duplicate", value=i[0]) not in dimensionColumnStat[j]:
                        dimensionColumnStat[j].append(dict(name="Duplicate", value=i[0]))
        for i in timeDimensionDupCols:
            if self.checkDuplicateCols_pandas(i[0], i[1]) == True:
                for j in i[1:]:
                    if dict(name="Duplicate", value=i[0]) not in timeDimensionColumnStat[j]:
                        timeDimensionColumnStat[j].append(dict(name="Duplicate", value=i[0]))
    else:
        for i in measureDupCols:
            if self.checkDuplicateCols(i[0], i[1]) == True:
                for j in i[1:]:
                    if dict(name="Duplicate", value=i[0]) not in measureColumnStat[j]:
                        measureColumnStat[j].append(dict(name="Duplicate", value=i[0]))
        for i in dimensionDupCols:
            if self.checkDuplicateCols(i[0], i[1], True) == True:
                for j in i[1:]:
                    if dict(name="Duplicate", value=i[0]) not in dimensionColumnStat[j]:
                        dimensionColumnStat[j].append(dict(name="Duplicate", value=i[0]))
        for i in timeDimensionDupCols:
            if self.checkDuplicateCols(i[0], i[1]) == True:
                for j in i[1:]:
                    if dict(name="Duplicate", value=i[0]) not in timeDimensionColumnStat[j]:
                        timeDimensionColumnStat[j].append(dict(name="Duplicate", value=i[0]))
    for column in self._data_frame.columns:
        random_slug = uuid.uuid4().hex
        headers.append(ColumnHeader(name=column, slug=random_slug))
        data = ColumnData()
        data.set_slug(random_slug)
        data.set_name(column)
        data.set_abstract_datatype(self._column_type_dict[column]["abstract"])
        data.set_checker(True)
        change_flag = False  # originally misspelled "changeflage"
        columnStat = []
        columnChartData = None
        check_datatype_change = self.actual_col_datatype_update
        if len(check_datatype_change) != 0:
            for i in check_datatype_change:
                if list(i.keys())[0] == column:
                    change_flag = True
                    changeType = i[column]
                    break
                else:
                    change_flag = False
        else:
            change_flag = False
        if self._column_type_dict[column]["abstract"] == "measure":
            data.set_column_stats(measureColumnStat[column])
            data.set_column_chart(measureCharts[column])
            if change_flag:
                data.set_actual_datatype("dimension")
            else:
                data.set_actual_datatype(self._column_type_dict[column]["actual"])
        elif self._column_type_dict[column]["abstract"] == "dimension":
            data.set_column_stats(dimensionColumnStat[column])
            data.set_column_chart(dimensionCharts[column])
            if change_flag:
                data.set_actual_datatype("measure")
            else:
                data.set_actual_datatype(self._column_type_dict[column]["actual"])
        elif self._column_type_dict[column]["abstract"] == "datetime":
            data.set_column_stats(timeDimensionColumnStat[column])
            data.set_column_chart(timeDimensionCharts[column])
            if change_flag:
                data.set_actual_datatype("dimension")
            else:
                data.set_actual_datatype(self._column_type_dict[column]["actual"])
        if self._column_type_dict[column]["abstract"] == "measure":
            # if column not in self._real_columns:
            ignoreSuggestion, ignoreReason = metaHelperInstance.get_ignore_column_suggestions(
                self._data_frame, self._total_rows, column, "measure",
                measureColumnStat[column], max_levels=self._max_levels)
            if ignoreSuggestion:
                ignoreColumnSuggestions.append(column)
                ignoreColumnReason.append(ignoreReason)
                # data.set_level_count_to_null()
                # data.set_chart_data_to_null()
                data.set_ignore_suggestion_flag(True)
                data.set_ignore_suggestion_message(ignoreReason)
        elif self._column_type_dict[column]["abstract"] == "dimension":
            ignoreSuggestion, ignoreReason = metaHelperInstance.get_ignore_column_suggestions(
                self._data_frame, self._total_rows, column, "dimension",
                dimensionColumnStat[column], max_levels=self._max_levels)
            if ignoreSuggestion:
                ignoreColumnSuggestions.append(column)
                ignoreColumnReason.append(ignoreReason)
                # sic: "thershold" must match the reason string emitted by
                # get_ignore_column_suggestions, so it is left untouched.
                if ignoreReason == "Number of Levels are more than the defined thershold":
                    data.set_ignore_suggestion_preview_flag(False)
                # data.set_level_count_to_null()
                # data.set_chart_data_to_null()
                data.set_ignore_suggestion_flag(True)
                data.set_ignore_suggestion_message(ignoreReason)
            if self._level_count_flag:
                utf8Suggestion = metaHelperInstance.get_utf8_suggestions(dimensionColumnStat[column])
            else:
                utf8Suggestion = False
            if utf8Suggestion:
                utf8ColumnSuggestion.append(column)
            # NOTE: this second get_ignore_column_suggestions call repeats the
            # one above and can append the same column twice; kept verbatim.
            ignoreSuggestion, ignoreReason = metaHelperInstance.get_ignore_column_suggestions(
                self._data_frame, self._total_rows, column, "dimension",
                dimensionColumnStat[column], max_levels=self._max_levels)
            if ignoreSuggestion:
                ignoreColumnSuggestions.append(column)
                ignoreColumnReason.append(ignoreReason)
                # data.set_level_count_to_null()
                # data.set_chart_data_to_null()
                data.set_ignore_suggestion_flag(True)
                data.set_ignore_suggestion_message(ignoreReason)
        elif self._column_type_dict[column]["abstract"] == "datetime":
            ignoreSuggestion, ignoreReason = metaHelperInstance.get_ignore_column_suggestions(
                self._data_frame, self._total_rows, column, "datetime",
                timeDimensionColumnStat[column], max_levels=self._max_levels)
            if ignoreSuggestion:
                ignoreColumnSuggestions.append(column)
                ignoreColumnReason.append(ignoreReason)
                # data.set_level_count_to_null()
                # data.set_chart_data_to_null()
                data.set_ignore_suggestion_flag(True)
                data.set_ignore_suggestion_message(ignoreReason)
        columnData.append(data)
        # NOTE: uniqueVals here is left over from the string-column scan
        # earlier in this method, so this re-check runs on stale values;
        # kept verbatim pending review.
        if len(uniqueVals) > 0:
            dateColumnFormat = metaHelperInstance.get_datetime_format(uniqueVals)
        else:
            dateColumnFormat = None
        if dateColumnFormat:
            dateTimeSuggestions.update({column: dateColumnFormat})
    for utfCol in utf8ColumnSuggestion:
        ignoreColumnSuggestions.append(utfCol)
        ignoreColumnReason.append("utf8 values present")
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName, "custom", "info", "Validating Metadata Information",
        self._completionStatus, self._completionStatus, display=True)
    CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=self._ignoreMsgFlag)
    metaData.append(MetaData(name="ignoreColumnSuggestions", value=ignoreColumnSuggestions, display=False))
    metaData.append(MetaData(name="ignoreColumnReason", value=ignoreColumnReason, display=False))
    metaData.append(MetaData(name="utf8ColumnSuggestion", value=utf8ColumnSuggestion, display=False))
    metaData.append(MetaData(name="dateTimeSuggestions", value=dateTimeSuggestions, display=False))
    metaData.append(MetaData(name="dataSizeSummary", value=self._dataSize, display=False))
    dfMetaData = DfMetaData()
    dfMetaData.set_column_data(columnData)
    dfMetaData.set_header(headers)
    dfMetaData.set_meta_data(metaData)
    dfMetaData.set_sample_data(sampleData)
    time_taken_suggestions = time.time() - self._start_time
    self._completionStatus += self._scriptStages["suggestions"]["weight"]
    # print "suggestions take",time_taken_suggestions
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName, "suggestions", "info",
        self._scriptStages["suggestions"]["summary"],
        self._completionStatus, self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=self._ignoreMsgFlag)
    self._dataframe_context.update_completion_status(self._completionStatus)
    return dfMetaData
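# Illustrative sketch of the datetime-format detection idea relied on above
# (assumed logic; the real implementation lives in
# MetaDataHelper.get_datetime_format / get_datetime_format_pandas): try a set
# of candidate strftime formats against the sampled values and keep the first
# one that parses them all.
#
#   from datetime import datetime
#   def guess_datetime_format(values, candidates=("%d/%m/%Y", "%Y-%m-%d", "%m/%d/%Y")):
#       for fmt in candidates:
#           try:
#               for v in values:
#                   datetime.strptime(str(v), fmt)
#               return fmt
#           except ValueError:
#               continue
#       return None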
def run(self):
    self._start_time = time.time()
    metaHelperInstance = MetaDataHelper(self._data_frame, self._total_rows)
    sampleData = metaHelperInstance.get_sample_data()
    sampleData = sampleData.toPandas()
    sampleData = metaHelperInstance.format_sampledata_timestamp_columns(
        sampleData, self._timestamp_columns, self._stripTimestamp)
    time_taken_sampling = time.time() - self._start_time
    self._completionStatus += self._scriptStages["sampling"]["weight"]
    # This variant still used Python 2 print statements; they are converted
    # to print() calls for consistency with the rest of the codebase.
    print("sampling takes", time_taken_sampling)
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName, "sampling", "info",
        self._scriptStages["sampling"]["summary"],
        self._completionStatus, self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=self._ignoreMsgFlag)
    metaData = []
    metaData.append(MetaData(name="noOfRows", value=self._total_rows, display=True, displayName="Rows"))
    metaData.append(MetaData(name="noOfColumns", value=self._total_columns, display=True, displayName="Columns"))
    self._percentage_columns = metaHelperInstance.get_percentage_columns(self._string_columns)
    if len(self._percentage_columns) > 0:
        self._data_frame = CommonUtils.convert_percentage_columns(self._data_frame, self._percentage_columns)
        self._numeric_columns = self._numeric_columns + self._percentage_columns
        self._string_columns = list(set(self._string_columns) - set(self._percentage_columns))
        self.update_column_type_dict()
    self._dollar_columns = metaHelperInstance.get_dollar_columns(self._string_columns)
    if len(self._dollar_columns) > 0:
        self._data_frame = CommonUtils.convert_dollar_columns(self._data_frame, self._dollar_columns)
        self._numeric_columns = self._numeric_columns + self._dollar_columns
        self._string_columns = list(set(self._string_columns) - set(self._dollar_columns))
        self.update_column_type_dict()
    if len(self._numeric_columns) > 1:
        # print "self._numeric_columns : ", self._numeric_columns
        metaData.append(MetaData(name="measures", value=len(self._numeric_columns), display=True, displayName="Measures"))
    else:
        metaData.append(MetaData(name="measures", value=len(self._numeric_columns), display=True, displayName="Measure"))
    if len(self._string_columns) > 1:
        metaData.append(MetaData(name="dimensions", value=len(self._string_columns + self._boolean_columns), display=True, displayName="Dimensions"))
    else:
        metaData.append(MetaData(name="dimensions", value=len(self._string_columns + self._boolean_columns), display=True, displayName="Dimension"))
    if len(self._timestamp_columns) > 1:
        metaData.append(MetaData(name="timeDimension", value=len(self._timestamp_columns), display=True, displayName="Time Dimensions"))
    else:
        metaData.append(MetaData(name="timeDimension", value=len(self._timestamp_columns), display=True, displayName="Time Dimension"))
    metaData.append(MetaData(name="measureColumns", value=self._numeric_columns, display=False))
    metaData.append(MetaData(name="dimensionColumns", value=self._string_columns + self._boolean_columns, display=False))
    metaData.append(MetaData(name="timeDimensionColumns", value=self._timestamp_columns, display=False))
    metaData.append(MetaData(name="percentageColumns", value=self._percentage_columns, display=False))
    metaData.append(MetaData(name="dollarColumns", value=self._dollar_columns, display=False))
    columnData = []
    headers = []
    self._start_time = time.time()
    print("Count of Numeric columns", len(self._numeric_columns))
    measureColumnStat, measureCharts = metaHelperInstance.calculate_measure_column_stats(
        self._data_frame, self._numeric_columns, binColumn=self._binned_stat_flag)
    time_taken_measurestats = time.time() - self._start_time
    self._completionStatus += self._scriptStages["measurestats"]["weight"]
    print("measure stats takes", time_taken_measurestats)
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName, "measurestats", "info",
        self._scriptStages["measurestats"]["summary"],
        self._completionStatus, self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=self._ignoreMsgFlag)
    self._start_time = time.time()
    dimensionColumnStat, dimensionCharts = metaHelperInstance.calculate_dimension_column_stats(
        self._data_frame, self._string_columns + self._boolean_columns, levelCount=self._level_count_flag)
    # print dimensionColumnStat
    # Python 3: filter() no longer returns a subscriptable list, so the
    # original filter(...)[0] lookup is written as a list comprehension.
    self._dataSize["dimensionLevelCountDict"] = {
        k: [x for x in v if x["name"] == "numberOfUniqueValues"][0]["value"]
        for k, v in list(dimensionColumnStat.items())}
    self._dataSize["totalLevels"] = sum(self._dataSize["dimensionLevelCountDict"].values())
    time_taken_dimensionstats = time.time() - self._start_time
    self._completionStatus += self._scriptStages["dimensionstats"]["weight"]
    # print "dimension stats takes",time_taken_dimensionstats
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName, "dimensionstats", "info",
        self._scriptStages["dimensionstats"]["summary"],
        self._completionStatus, self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=self._ignoreMsgFlag)
    self._start_time = time.time()
    timeDimensionColumnStat, timeDimensionCharts = metaHelperInstance.calculate_time_dimension_column_stats(
        self._data_frame, self._timestamp_columns, level_count_flag=self._level_count_flag)
    time_taken_tdstats = time.time() - self._start_time
    self._completionStatus += self._scriptStages["timedimensionstats"]["weight"]
    # print "time dimension stats takes",time_taken_tdstats
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName, "timedimensionstats", "info",
        self._scriptStages["timedimensionstats"]["summary"],
        self._completionStatus, self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=self._ignoreMsgFlag)
    self._start_time = time.time()
    ignoreColumnSuggestions = []
    ignoreColumnReason = []
    utf8ColumnSuggestion = []
    dateTimeSuggestions = {}
    for column in self._data_frame.columns:
        random_slug = uuid.uuid4().hex
        headers.append(ColumnHeader(name=column, slug=random_slug))
        data = ColumnData()
        data.set_slug(random_slug)
        data.set_name(column)
        data.set_abstract_datatype(self._column_type_dict[column]["abstract"])
        columnStat = []
        columnChartData = None
        if self._column_type_dict[column]["abstract"] == "measure":
            data.set_column_stats(measureColumnStat[column])
            data.set_column_chart(measureCharts[column])
            data.set_actual_datatype(self._column_type_dict[column]["actual"])
        elif self._column_type_dict[column]["abstract"] == "dimension":
            data.set_column_stats(dimensionColumnStat[column])
            data.set_column_chart(dimensionCharts[column])
            data.set_actual_datatype(self._column_type_dict[column]["actual"])
        elif self._column_type_dict[column]["abstract"] == "datetime":
            data.set_column_stats(timeDimensionColumnStat[column])
            data.set_column_chart(timeDimensionCharts[column])
            data.set_actual_datatype(self._column_type_dict[column]["actual"])
        if self._column_type_dict[column]["abstract"] == "measure":
            if column not in self._real_columns:
                ignoreSuggestion, ignoreReason = metaHelperInstance.get_ignore_column_suggestions(
                    self._data_frame, column, "measure", measureColumnStat[column], max_levels=self._max_levels)
                if ignoreSuggestion:
                    ignoreColumnSuggestions.append(column)
                    ignoreColumnReason.append(ignoreReason)
                    data.set_level_count_to_null()
                    data.set_chart_data_to_null()
                    data.set_ignore_suggestion_flag(True)
                    data.set_ignore_suggestion_message(ignoreReason)
        elif self._column_type_dict[column]["abstract"] == "dimension":
            if self._level_count_flag:
                utf8Suggestion = metaHelperInstance.get_utf8_suggestions(dimensionColumnStat[column])
            else:
                utf8Suggestion = False
            if self._column_type_dict[column]["actual"] != "boolean":
                uniqueVals = self._data_frame.select(column).distinct().na.drop().collect()
            else:
                uniqueVals = []
            if len(uniqueVals) > 0:
                dateColumnFormat = metaHelperInstance.get_datetime_format(uniqueVals)
            else:
                dateColumnFormat = None
            if dateColumnFormat:
                dateTimeSuggestions.update({column: dateColumnFormat})
                data.set_level_count_to_null()
                data.set_chart_data_to_null()
                data.set_date_suggestion_flag(True)
            if utf8Suggestion:
                utf8ColumnSuggestion.append(column)
            ignoreSuggestion, ignoreReason = metaHelperInstance.get_ignore_column_suggestions(
                self._data_frame, column, "dimension", dimensionColumnStat[column], max_levels=self._max_levels)
            if ignoreSuggestion:
                ignoreColumnSuggestions.append(column)
                ignoreColumnReason.append(ignoreReason)
                data.set_level_count_to_null()
                data.set_chart_data_to_null()
                data.set_ignore_suggestion_flag(True)
                data.set_ignore_suggestion_message(ignoreReason)
        columnData.append(data)
    for dateColumn in list(dateTimeSuggestions.keys()):
        if dateColumn in ignoreColumnSuggestions:
            ignoreColIdx = ignoreColumnSuggestions.index(dateColumn)
            ignoreColumnSuggestions.remove(dateColumn)
            del ignoreColumnReason[ignoreColIdx]
    for utfCol in utf8ColumnSuggestion:
        ignoreColumnSuggestions.append(utfCol)
        ignoreColumnReason.append("utf8 values present")
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName, "custom", "info", "Validating Metadata Information",
        self._completionStatus, self._completionStatus, display=True)
    CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=self._ignoreMsgFlag)
    metaData.append(MetaData(name="ignoreColumnSuggestions", value=ignoreColumnSuggestions, display=False))
    metaData.append(MetaData(name="ignoreColumnReason", value=ignoreColumnReason, display=False))
    metaData.append(MetaData(name="utf8ColumnSuggestion", value=utf8ColumnSuggestion, display=False))
    metaData.append(MetaData(name="dateTimeSuggestions", value=dateTimeSuggestions, display=False))
    metaData.append(MetaData(name="dataSizeSummary", value=self._dataSize, display=False))
    dfMetaData = DfMetaData()
    dfMetaData.set_column_data(columnData)
    dfMetaData.set_header(headers)
    dfMetaData.set_meta_data(metaData)
    dfMetaData.set_sample_data(sampleData)
    time_taken_suggestions = time.time() - self._start_time
    self._completionStatus += self._scriptStages["suggestions"]["weight"]
    # print "suggestions take",time_taken_suggestions
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName, "suggestions", "info",
        self._scriptStages["suggestions"]["summary"],
        self._completionStatus, self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=self._ignoreMsgFlag)
    self._dataframe_context.update_completion_status(self._completionStatus)
    return dfMetaData
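# Illustrative sketch of the percentage-column conversion used above (assumed
# behavior; the actual logic lives in CommonUtils.convert_percentage_columns):
# strip the "%" suffix and coerce to numeric so the column can be treated as
# a measure.
#
#   import pandas as pd
#   s = pd.Series(["12%", "7.5%", None])
#   numeric = pd.to_numeric(s.str.rstrip("%"), errors="coerce")  # 12.0, 7.5, NaN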