def submit_job_through_yarn():
    """YARN entry point: parse the job configuration passed as a JSON string
    in ``sys.argv[1]`` and run ``main()`` on it.

    If ``main()`` raises, the platform's kill URL is notified (retrying until
    the endpoint acknowledges) and a failure progress message is posted to
    the job's message URL.
    """
    json_config = json.loads(sys.argv[1])
    configJson = json_config["job_config"]
    jobConfig = configJson["job_config"]
    jobURL = jobConfig["job_url"]
    messageURL = jobConfig["message_url"]
    killURL = jobConfig["kill_url"]
    try:
        main(json_config["job_config"])
    except Exception as e:
        # Tell the platform the job is killed; retry until acknowledged.
        # NOTE(review): there is no delay or retry cap here — this loop can
        # spin indefinitely if the kill endpoint never returns success.
        data = {"status": "killed", "jobURL": jobURL}
        resp = send_kill_command(killURL, data)
        while str(resp.text) != '{"result": "success"}':
            data = {"status": "killed", "jobURL": jobURL}
            resp = send_kill_command(killURL, data)
        print('Main Method Did Not End ....., ', str(e))
        progressMessage = CommonUtils.create_progress_message_object(
            "Main Method Did Not End .....", "Main Method Did Not End .....",
            "Error", str(e), "Failed", 100)
        CommonUtils.save_progress_message(messageURL, progressMessage,
                                          emptyBin=True)
def __init__(self, data_frame, df_helper, df_context, meta_parser, spark):
    """Set up the regression script: cache context/helper objects, announce
    the 'regressionTrainingStart' stage, and impute missing values."""
    self._data_frame = data_frame
    self._dataframe_helper = df_helper
    self._dataframe_context = df_context
    self._metaParser = meta_parser
    self._spark = spark
    # Pull frequently-used settings out of the context once.
    ctx = self._dataframe_context
    self._ignoreRegressionElasticityMessages = ctx.get_ignore_msg_regression_elasticity()
    self._completionStatus = ctx.get_completion_status()
    self._analysisName = ctx.get_analysis_name()
    self._analysisDict = ctx.get_analysis_dict()
    self._messageURL = ctx.get_message_url()
    self._scriptWeightDict = ctx.get_measure_analysis_weight()
    # Progress-stage definitions: display summary plus relative weight.
    self._scriptStages = {
        "regressionTrainingStart": {
            "summary": "Started the Regression Script",
            "weight": 0,
        },
        "regressionTrainingEnd": {
            "summary": "Regression coefficients calculated",
            "weight": 10,
        },
    }
    start_message = CommonUtils.create_progress_message_object(
        self._analysisName,
        "regressionTrainingStart",
        "info",
        self._scriptStages["regressionTrainingStart"]["summary"],
        self._completionStatus,
        self._completionStatus)
    # Only publish the progress message when elasticity messages are enabled.
    if self._ignoreRegressionElasticityMessages != True:
        CommonUtils.save_progress_message(
            self._messageURL, start_message,
            ignore=self._ignoreRegressionElasticityMessages)
        self._dataframe_context.update_completion_status(self._completionStatus)
    self._data_frame = self._dataframe_helper.fill_missing_values(self._data_frame)
def __init__(self, data_frame, df_helper, df_context, scriptWeight=None, analysisName=None):
    """Initialize the descriptive-stats script and announce the
    'statCalculationStart' stage.

    ``scriptWeight`` and ``analysisName`` fall back to the values stored in
    the dataframe context when the caller does not supply them.
    """
    self._data_frame = data_frame
    self._dataframe_helper = df_helper
    self._dataframe_context = df_context
    self._completionStatus = df_context.get_completion_status()
    self._messageURL = df_context.get_message_url()
    # Use context-provided defaults when the overrides are absent.
    self._analysisName = (df_context.get_analysis_name()
                          if analysisName is None else analysisName)
    self._scriptWeightDict = (df_context.get_measure_analysis_weight()
                              if scriptWeight is None else scriptWeight)
    # Progress-stage definitions: display summary plus relative weight.
    self._scriptStages = {
        "statCalculationStart": {
            "summary": "Initialized the Descriptive Stats Scripts",
            "weight": 0,
        },
        "statCalculationEnd": {
            "summary": "Descriptive Stats Calculated",
            "weight": 10,
        },
    }
    start_message = CommonUtils.create_progress_message_object(
        self._analysisName,
        "statCalculationStart",
        "info",
        self._scriptStages["statCalculationStart"]["summary"],
        self._completionStatus,
        self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, start_message)
    self._dataframe_context.update_completion_status(self._completionStatus)
def Predict(self):
    """Score the data with a trained Random Forest model, persist the scored
    CSV, publish progress / UID tables, and run downstream decision-tree
    analysis (or a summary card when only one class was predicted).

    NOTE(review): the "spark" branch is a no-op, so this method effectively
    assumes ``self._mlEnv == "sklearn"``; the spark path would fail in the
    shared post-processing below (``df``/``score`` undefined) — confirm
    against callers.
    """
    self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight()
    # Stage definitions used for progress reporting.
    self._scriptStages = {
        "initialization": {
            "summary": "Initialized the Random Forest Scripts",
            "weight": 2
        },
        "prediction": {
            "summary": "Random Forest Model Prediction Finished",
            "weight": 2
        },
        "frequency": {
            "summary": "descriptive analysis finished",
            "weight": 2
        },
        "chisquare": {
            "summary": "chi Square analysis finished",
            "weight": 4
        },
        "completion": {
            "summary": "all analysis finished",
            "weight": 4
        },
    }
    self._completionStatus += old_div(
        self._scriptWeightDict[self._analysisName]["total"] *
        self._scriptStages["initialization"]["weight"], 10)
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName, "initialization", "info",
        self._scriptStages["initialization"]["summary"],
        self._completionStatus, self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage,
                                      ignore=self._ignoreMsg)
    self._dataframe_context.update_completion_status(self._completionStatus)
    # (A train/score level-count sanity check existed here as dead code and
    # was removed; reinstate from history if it is ever needed.)
    # Exclude the uid column (when it is an ignored suggestion) and all date
    # columns from the categorical set used to build dummy features.
    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns) - set(allDateCols))
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    result_column = self._dataframe_context.get_result_column()
    test_data_path = self._dataframe_context.get_input_file()
    if self._mlEnv == "spark":
        pass
    elif self._mlEnv == "sklearn":
        # Resolve local filesystem paths (strip the "file://" scheme).
        score_data_path = self._dataframe_context.get_score_path() + "/data.csv"
        if score_data_path.startswith("file"):
            score_data_path = score_data_path[7:]
        trained_model_path = self._dataframe_context.get_model_path()
        trained_model_path += "/" + self._dataframe_context.get_model_for_scoring() + ".pkl"
        if trained_model_path.startswith("file"):
            trained_model_path = trained_model_path[7:]
        score_summary_path = self._dataframe_context.get_score_path() + "/Summary/summary.json"
        if score_summary_path.startswith("file"):
            score_summary_path = score_summary_path[7:]
        trained_model = joblib.load(trained_model_path)
        df = self._data_frame.toPandas()
        model_columns = self._dataframe_context.get_model_features()
        # Recreate the training-time feature matrix: dummy-encode the
        # categoricals, then align columns with the model's features.
        pandas_df = MLUtils.create_dummy_columns(
            df, [x for x in categorical_columns if x != result_column])
        pandas_df = MLUtils.fill_missing_columns(pandas_df, model_columns,
                                                 result_column)
        if uid_col:
            pandas_df = pandas_df[[x for x in pandas_df.columns if x != uid_col]]
        y_score = trained_model.predict(pandas_df)
        y_prob = trained_model.predict_proba(pandas_df)
        y_prob = MLUtils.calculate_predicted_probability(y_prob)
        y_prob = list([round(x, 2) for x in y_prob])
        score = {"predicted_class": y_score, "predicted_probability": y_prob}
        df["predicted_class"] = score["predicted_class"]
        # Map numeric class labels back to their original values.
        labelMappingDict = self._dataframe_context.get_label_map()
        df["predicted_class"] = df["predicted_class"].apply(
            lambda x: labelMappingDict[x] if x != None else "NA")
        df["predicted_probability"] = score["predicted_probability"]
        self._score_summary["prediction_split"] = \
            MLUtils.calculate_scored_probability_stats(df)
        self._score_summary["result_column"] = result_column
        # Replace any pre-existing target column with the predictions.
        if result_column in df.columns:
            df.drop(result_column, axis=1, inplace=True)
        df = df.rename(index=str, columns={"predicted_class": result_column})
        df.to_csv(score_data_path, header=True, index=False)
    # --- shared post-processing on the scored pandas frame ---
    uidCol = self._dataframe_context.get_uid_column()
    if uidCol == None:
        uidCols = self._metaParser.get_suggested_uid_columns()
        if len(uidCols) > 0:
            uidCol = uidCols[0]
    uidTableData = []
    predictedClasses = list(df[result_column].unique())
    if uidCol:
        if uidCol in df.columns:
            # Build a table of the 5 highest-probability rows per class.
            for level in predictedClasses:
                levelDf = df[df[result_column] == level]
                levelDf = levelDf[[uidCol, "predicted_probability", result_column]]
                levelDf.sort_values(by="predicted_probability",
                                    ascending=False, inplace=True)
                levelDf["predicted_probability"] = levelDf["predicted_probability"].apply(
                    lambda x: humanize.apnumber(x * 100) + "%"
                    if x * 100 >= 10 else str(int(x * 100)) + "%")
                uidTableData.append(levelDf[:5])
            uidTableData = pd.concat(uidTableData)
            uidTableData = [list(arr) for arr in list(uidTableData.values)]
            uidTableData = [[uidCol, "Probability", result_column]] + uidTableData
            uidTable = TableData()
            uidTable.set_table_width(25)
            uidTable.set_table_data(uidTableData)
            uidTable.set_table_type("normalHideColumn")
            self._result_setter.set_unique_identifier_table(
                json.loads(CommonUtils.convert_python_object_to_json(uidTable)))
    self._completionStatus += old_div(
        self._scriptWeightDict[self._analysisName]["total"] *
        self._scriptStages["prediction"]["weight"], 10)
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName, "prediction", "info",
        self._scriptStages["prediction"]["summary"],
        self._completionStatus, self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage,
                                      ignore=self._ignoreMsg)
    self._dataframe_context.update_completion_status(self._completionStatus)
    print("STARTING DIMENSION ANALYSIS ...")
    # Decide which columns survive into the story-telling dataframe: keep the
    # configured "consider" columns, otherwise drop only the probability.
    columns_to_keep = self._dataframe_context.get_score_consider_columns()
    if len(columns_to_keep) > 0:
        columns_to_drop = list(set(df.columns) - set(columns_to_keep))
    else:
        columns_to_drop = ["predicted_probability"]
    columns_to_drop = [
        x for x in columns_to_drop if x in df.columns and x != result_column
    ]
    print("columns_to_drop", columns_to_drop)
    df.drop(columns_to_drop, axis=1, inplace=True)
    resultColLevelCount = dict(df[result_column].value_counts())
    self._metaParser.update_column_dict(
        result_column, {
            "LevelCount": resultColLevelCount,
            "numberOfUniqueValues": len(list(resultColLevelCount.keys()))
        })
    self._dataframe_context.set_story_on_scored_data(True)
    SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                        sparkSession=self._spark)
    spark_scored_df = SQLctx.createDataFrame(df)
    # TODO update metadata for the newly created dataframe
    self._dataframe_context.update_consider_columns(columns_to_keep)
    df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context,
                                self._metaParser)
    df_helper.set_params()
    spark_scored_df = df_helper.get_data_frame()
    # (Disabled frequency and chi-square analyses existed here as dead code
    # and were removed; reinstate from history if needed.)
    if len(predictedClasses) >= 2:
        # Enough distinct predictions to fit an explanatory decision tree.
        try:
            df_decision_tree_obj = DecisionTrees(
                spark_scored_df, df_helper, self._dataframe_context,
                self._spark, self._metaParser,
                scriptWeight=self._scriptWeightDict,
                analysisName=self._analysisName).test_all(
                    dimension_columns=[result_column])
            narratives_obj = CommonUtils.as_dict(
                DecisionTreeNarrative(result_column, df_decision_tree_obj,
                                      self._dataframe_helper,
                                      self._dataframe_context,
                                      self._metaParser, self._result_setter,
                                      story_narrative=None,
                                      analysisName=self._analysisName,
                                      scriptWeight=self._scriptWeightDict))
            print(narratives_obj)
        except Exception:  # was a bare except:, which also caught SystemExit
            print("DecisionTree Analysis Failed ")
    else:
        # Only one class predicted: build a summary card instead of a tree.
        data_dict = {
            "npred": len(predictedClasses),
            "nactual": len(list(labelMappingDict.values()))
        }
        # BUGFIX: levelCountDict was referenced before assignment in the
        # nactual > 2 branch (NameError); initialize it first.
        levelCountDict = {}
        if data_dict["nactual"] > 2:
            # Collapse everything but the predicted class into "Others".
            levelCountDict[predictedClasses[0]] = resultColLevelCount[predictedClasses[0]]
            levelCountDict["Others"] = sum([
                v for k, v in list(resultColLevelCount.items())
                if k != predictedClasses[0]
            ])
        else:
            levelCountDict = resultColLevelCount
            otherClass = list(set(labelMappingDict.values()) - set(predictedClasses))[0]
            levelCountDict[otherClass] = 0
        print(levelCountDict)
        total = float(sum([x for x in list(levelCountDict.values()) if x != None]))
        levelCountTuple = [{
            "name": k,
            "count": v,
            "percentage": humanize.apnumber(old_div(v * 100, total)) + "%"
            if old_div(v * 100, total) >= 10
            else str(int(old_div(v * 100, total))) + "%"
        } for k, v in list(levelCountDict.items()) if v != None]
        levelCountTuple = sorted(levelCountTuple, key=lambda x: x["count"],
                                 reverse=True)
        data_dict["blockSplitter"] = "|~NEWBLOCK~|"
        data_dict["targetcol"] = result_column
        data_dict["nlevel"] = len(list(levelCountDict.keys()))
        data_dict["topLevel"] = levelCountTuple[0]
        data_dict["secondLevel"] = levelCountTuple[1]
        maincardSummary = NarrativesUtils.get_template_output(
            "/apps/", 'scorewithoutdtree.html', data_dict)
        main_card = NormalCard()
        main_card_data = []
        main_card_narrative = NarrativesUtils.block_splitter(
            maincardSummary, "|~NEWBLOCK~|")
        main_card_data += main_card_narrative
        chartData = NormalChartData([levelCountDict]).get_data()
        chartJson = ChartJson(data=chartData)
        chartJson.set_title(result_column)
        chartJson.set_chart_type("donut")
        mainCardChart = C3ChartData(data=chartJson)
        mainCardChart.set_width_percent(33)
        main_card_data.append(mainCardChart)
        uidTable = self._result_setter.get_unique_identifier_table()
        if uidTable != None:
            main_card_data.append(uidTable)
        main_card.set_card_data(main_card_data)
        main_card.set_card_name(
            "Predicting Key Drivers of {}".format(result_column))
        self._result_setter.set_score_dtree_cards([main_card], {})
def __init__(self, df_helper, df_context, result_setter, spark, story_narrative, meta_parser): self._story_narrative = story_narrative self._result_setter = result_setter self._spark = spark self._dataframe_helper = df_helper self._dataframe_context = df_context self._pandas_flag = df_context._pandas_flag self._data_frame = df_helper.get_data_frame() self._num_significant_digits = NarrativesUtils.get_significant_digit_settings( "trend") self._metaParser = meta_parser self._result_column = self._dataframe_context.get_result_column() self._string_columns = self._dataframe_helper.get_string_columns() self._timestamp_columns = self._dataframe_helper.get_timestamp_columns( ) # self._selected_date_columns = None self._selected_date_columns = self._dataframe_context.get_selected_date_columns( ) self._all_date_columns = self._dataframe_context.get_date_columns() self._string_columns = list( set(self._string_columns) - set(self._all_date_columns)) self._dateFormatDetected = False self._existingDateFormat = None self._dateFormatConversionDict = NarrativesUtils.date_formats_mapping_dict( ) self._dateColumnFormatDict = df_context.get_date_format_dict() if self._dataframe_context.get_requested_date_format() != None: self._requestedDateFormat = df_context.get_requested_date_format() else: self._requestedDateFormat = None self._analysistype = self._dataframe_context.get_analysis_type() self._trendSettings = self._dataframe_context.get_trend_settings() self._trendSpecificMeasure = False if self._trendSettings != None: if self._analysistype == "dimension" and self._trendSettings[ "name"] != "Count": self._trendSpecificMeasure = True self._analysistype = "measure" self._result_column = self._trendSettings["selectedMeasure"] elif self._analysistype == "measure" and self._trendSettings[ "name"] != "Count": self._result_column = self._trendSettings["selectedMeasure"] self._trend_subsection = self._result_setter.get_trend_section_name() self._regression_trend_card = None 
self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER self._highlightFlag = "|~HIGHLIGHT~|" self._trend_on_td_column = False self._number_of_dimensions_to_consider = 10 self._completionStatus = self._dataframe_context.get_completion_status( ) self._analysisName = self._dataframe_context.get_analysis_name() self._messageURL = self._dataframe_context.get_message_url() if self._analysistype == "dimension": self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight( ) self._scriptStages = { "initialization": { "summary": "Initialized The Frequency Narratives", "weight": 0 }, "summarygeneration": { "summary": "Summary Generation Finished", "weight": 4 }, "completion": { "summary": "Frequency Stats Narratives Done", "weight": 0 }, } elif self._analysistype == "measure": if self._trendSpecificMeasure: self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight( ) else: self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight( ) self._scriptStages = { "trendNarrativeStart": { "summary": "Started The Descriptive Stats Narratives", "weight": 1 }, "trendNarrativeEnd": { "summary": "Narratives For Descriptive Stats Finished", "weight": 0 }, } self._base_dir = "/trend/" if self._pandas_flag and self._selected_date_columns and not self._dateColumnFormatDict and not self._timestamp_columns: for column in self._selected_date_columns: uniqueVals = self._data_frame[column].astype( str).unique().tolist() metaHelperInstance = MetaDataHelper(self._data_frame, self._data_frame.shape[0]) if len(uniqueVals ) > 0 and metaHelperInstance.get_datetime_format_pandas( [ self._data_frame.sort_values( by=column, ascending=False)[column][0] ]) != None: dateColumnFormat = metaHelperInstance.get_datetime_format_pandas( uniqueVals) self._dateColumnFormatDict.update( {column: dateColumnFormat}) dateColCheck = NarrativesUtils.check_date_column_formats(self._selected_date_columns,\ self._timestamp_columns,\ self._dateColumnFormatDict,\ 
self._dateFormatConversionDict, self._requestedDateFormat) print(dateColCheck) self._dateFormatDetected = dateColCheck["dateFormatDetected"] self._trend_on_td_column = dateColCheck["trendOnTdCol"] if self._dateFormatDetected: self._requestedDateFormat = dateColCheck["requestedDateFormat"] self._existingDateFormat = dateColCheck["existingDateFormat"] # self._date_column_suggested is the column used for trend self._date_column_suggested = dateColCheck["suggestedDateColumn"] if self._existingDateFormat: self._data_frame, dataRangeStats = NarrativesUtils.calculate_data_range_stats( self._data_frame, self._existingDateFormat, self._date_column_suggested, self._trend_on_td_column, self._pandas_flag) print(dataRangeStats) self._durationString = dataRangeStats["durationString"] self._duration = dataRangeStats["duration"] self._dataLevel = dataRangeStats["dataLevel"] first_date = dataRangeStats["firstDate"] last_date = dataRangeStats["lastDate"] if self._timestamp_columns != None: if self._selected_date_columns == None: self._selected_date_columns = self._timestamp_columns else: self._selected_date_columns += self._timestamp_columns if self._pandas_flag: pass else: if self._trend_subsection == "regression": if self._selected_date_columns != None: if self._dateFormatDetected: trend_subsection_data = self._result_setter.get_trend_section_data( ) measure_column = trend_subsection_data[ "measure_column"] result_column = trend_subsection_data["result_column"] base_dir = trend_subsection_data["base_dir"] card3heading = 'How ' + result_column + ' and ' + measure_column + ' changed over time' if self._dataLevel == "day": grouped_data = self._data_frame.groupBy( "suggestedDate").agg({ measure_column: 'sum', result_column: 'sum' }) grouped_data = grouped_data.withColumnRenamed( grouped_data.columns[-1], result_column) grouped_data = grouped_data.withColumnRenamed( grouped_data.columns[-2], measure_column) grouped_data = grouped_data.withColumn( "year_month", udf(lambda x: 
x.strftime("%b-%y"))( "suggestedDate")) grouped_data = grouped_data.orderBy( "suggestedDate", ascending=True) grouped_data = grouped_data.withColumnRenamed( grouped_data.columns[0], "key") grouped_data = grouped_data.toPandas() elif self._dataLevel == "month": grouped_data = self._data_frame.groupBy( "year_month").agg({ measure_column: 'sum', result_column: 'sum' }) grouped_data = grouped_data.withColumnRenamed( grouped_data.columns[-1], result_column) grouped_data = grouped_data.withColumnRenamed( grouped_data.columns[-2], measure_column) grouped_data = grouped_data.withColumn( "suggestedDate", udf(lambda x: datetime.strptime(x, "%b-%y"))( "year_month")) grouped_data = grouped_data.orderBy( "suggestedDate", ascending=True) grouped_data = grouped_data.withColumnRenamed( "suggestedDate", "key") grouped_data = grouped_data.select([ "key", measure_column, result_column, "year_month" ]).toPandas() grouped_data["key"] = grouped_data[ "year_month"].apply( lambda x: datetime.strptime(x, "%b-%y" ).date()) trend_narrative_obj = TrendNarrative( self._result_column, self._date_column_suggested, grouped_data, self._existingDateFormat, self._requestedDateFormat, self._base_dir, self._metaParser) card3data = trend_narrative_obj.generate_regression_trend_data( grouped_data, measure_column, result_column, self._dataLevel, self._durationString) card3narrative = NarrativesUtils.get_template_output(base_dir,\ 'regression_card3.html',card3data) card3chart = trend_narrative_obj.generate_regression_trend_chart( grouped_data, self._dataLevel) card3paragraphs = NarrativesUtils.paragraph_splitter( card3narrative) card2 = { 'charts': card3chart, 'paragraphs': card3paragraphs, 'heading': card3heading } self.set_regression_trend_card_data(card2) else: print("NO DATE FORMAT DETECTED") else: print("NO DATE COLUMNS PRESENT") if self._analysistype == "measure": self._completionStatus += old_div( self._scriptWeightDict[self._analysisName]["total"] * 
self._scriptStages["trendNarrativeStart"]["weight"], 10) progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\ "trendNarrativeStart",\ "info",\ self._scriptStages["trendNarrativeStart"]["summary"],\ self._completionStatus,\ self._completionStatus) CommonUtils.save_progress_message(self._messageURL, progressMessage) self._dataframe_context.update_completion_status( self._completionStatus) # self._startMeasureTrend = self._result_setter.get_trend_section_completion_status() self._startMeasureTrend = True if self._startMeasureTrend == True: self.narratives = { "SectionHeading": "", "card1": {}, "card2": {}, "card3": {} } if self._selected_date_columns != None: if self._dateFormatDetected: grouped_data = NarrativesUtils.get_grouped_data_for_trend( self._data_frame, self._dataLevel, self._result_column, self._analysistype, self._pandas_flag) if self._pandas_flag: self._data_frame = self._data_frame.drop( self._date_column_suggested, axis=1) else: self._data_frame = self._data_frame.drop( self._date_column_suggested) # self._data_frame = self._data_frame.withColumnRenamed("year_month", self._date_column_suggested) significant_dimensions = [] significant_dimension_dict = df_helper.get_significant_dimension( ) if significant_dimension_dict != {} and significant_dimension_dict != None: significant_dimension_tuple = tuple( significant_dimension_dict.items()) significant_dimension_tuple = sorted( significant_dimension_tuple, key=lambda x: x[1], reverse=True) significant_dimensions = [ x[0] for x in significant_dimension_tuple[:self. _number_of_dimensions_to_consider] ] else: significant_dimensions = self._string_columns[:self . 
_number_of_dimensions_to_consider] print("significant_dimensions", significant_dimensions) trend_narrative_obj = TrendNarrative( self._result_column, self._date_column_suggested, grouped_data, self._existingDateFormat, self._requestedDateFormat, self._base_dir, self._metaParser) # grouped_data.to_csv("/home/gulshan/marlabs/datasets/trend_grouped_pandas.csv",index=False) dataDict = trend_narrative_obj.generateDataDict( grouped_data, self._dataLevel, self._durationString) # # update reference time with max value reference_time = dataDict["reference_time"] dataDict["duration"] = self._duration dataDict["dataLevel"] = self._dataLevel dataDict["durationString"] = self._durationString dataDict[ "significant_dimensions"] = significant_dimensions if len(significant_dimensions) > 0: if self._dataLevel == "day": datetimeformat = self._existingDateFormat elif self._dataLevel == "month": datetimeformat = "%b-%y" # xtraData = trend_narrative_obj.get_xtra_calculations(self._data_frame,grouped_data,significant_dimensions,self._date_column_suggested,self._result_column,self._existingDateFormat,reference_time,self._dataLevel, self._pandas_flag) xtraData = trend_narrative_obj.get_xtra_calculations( self._data_frame, grouped_data, significant_dimensions, self._date_column_suggested, self._result_column, datetimeformat, reference_time, self._dataLevel, self._pandas_flag) if xtraData != None: dataDict.update(xtraData) # print 'Trend dataDict: %s' %(json.dumps(dataDict, indent=2)) self._result_setter.update_executive_summary_data( dataDict) dataDict.update({ "blockSplitter": self._blockSplitter, "highlightFlag": self._highlightFlag }) summary1 = NarrativesUtils.get_template_output(self._base_dir,\ 'measure_trend_card1.html',dataDict) summary2 = NarrativesUtils.get_template_output(self._base_dir,\ 'measure_trend_card2.html',dataDict) measureTrendCard = NormalCard() measureTrendcard1Data = NarrativesUtils.block_splitter( summary1, self._blockSplitter, highlightFlag=self._highlightFlag) 
measureTrendcard2Data = NarrativesUtils.block_splitter( summary2, self._blockSplitter) # print measureTrendcard1Data bubbledata = dataDict["bubbleData"] # print bubbledata card1BubbleData = "<div class='col-md-6 col-xs-12 xs-p-20'><h2 class='text-center'><span>{}</span><br/><small>{}</small></h2></div><div class='col-md-6 col-xs-12 xs-p-20'><h2 class='text-center'><span>{}</span><br/><small>{}</small></h2></div>".format( bubbledata[0]["value"], bubbledata[0]["text"], bubbledata[1]["value"], bubbledata[1]["text"]) # print card1BubbleData trend_chart_data = list( grouped_data[["key", "value"]].T.to_dict().values()) trend_chart_data = sorted(trend_chart_data, key=lambda x: x["key"]) card1chartdata = {"actual": [], "predicted": []} if self._dataLevel == "day": card1chartdata["actual"] = [{ "key": str(val["key"]), "value": val["value"] } for val in trend_chart_data] elif self._dataLevel == "month": card1chartdata["actual"] = [{ "key": val["key"].strftime("%b-%y"), "value": val["value"] } for val in trend_chart_data] if self._duration < 365: prediction_window = 3 else: prediction_window = 6 predicted_values = trend_narrative_obj.get_forecast_values( grouped_data["value"], prediction_window)[len(grouped_data["value"]):] predicted_values = [ round(x, self._num_significant_digits) for x in predicted_values ] forecasted_data = [] forecasted_data.append(card1chartdata["actual"][-1]) forecasted_dates = [] # forecast_start_time = datetime.strptime(card1chartdata["actual"][-1]["key"],"%b-%y") if self._dataLevel == "month": forecast_start_time = datetime.strptime( card1chartdata["actual"][-1]["key"], "%b-%y") elif self._dataLevel == "day": try: forecast_start_time = datetime.strptime( card1chartdata["actual"][-1]["key"], "%Y-%m-%d") except: forecast_start_time = datetime.strptime( card1chartdata["actual"][-1]["key"], '%Y-%m-%d %H:%M:%S') for val in range(prediction_window): if self._dataLevel == "month": key = forecast_start_time + relativedelta( months=1 + val) 
forecasted_dates.append(key) elif self._dataLevel == "day": key = forecast_start_time + relativedelta( days=1 + val) forecasted_dates.append(key) forecasted_list = list( zip(forecasted_dates, predicted_values)) if self._dataLevel == "month": forecasted_list = [{ "key": val[0].strftime("%b-%y"), "value": val[1] } for val in forecasted_list] elif self._dataLevel == "day": forecasted_list = [{ "key": val[0].strftime("%Y-%m-%d"), "value": val[1] } for val in forecasted_list] forecasted_data += forecasted_list card1chartdata["predicted"] = forecasted_data # print json.dumps(card1chartdata,indent=2) card1chartdata = ScatterChartData(data=card1chartdata) chartJson = ChartJson() chartJson.set_data(card1chartdata.get_data()) chartJson.set_label_text({ 'x': ' ', 'y': 'No. of Observations' }) chartJson.set_legend({ "actual": "Observed", "predicted": "Forecast" }) chartJson.set_chart_type("scatter_line") chartJson.set_axes({"x": "key", "y": "value"}) chartJson.set_yaxis_number_format(".2f") st_info = [ "Trend Analysis", "Forecast Method : Holt Winters Method" ] measureTrendcard1Data.insert( 1, C3ChartData(data=chartJson, info=st_info)) measureTrendcard1Data.append( HtmlData(data=card1BubbleData)) cardData = measureTrendcard1Data + measureTrendcard2Data measureTrendCard.set_card_data(cardData) measureTrendCard.set_card_name("Trend Analysis") trendStoryNode = NarrativesTree( "Trend", None, [], [measureTrendCard]) self._story_narrative.add_a_node(trendStoryNode) self._result_setter.set_trend_node(trendStoryNode) # prediction_data = [{"key":x["key"],"value":x["value"]} for x in trend_chart_data] # last_val = prediction_data[-1] # last_val.update({"predicted_value":last_val["value"]}) # prediction_data[-1] = last_val # # for val in range(prediction_window): # dataLevel = dataDict["dataLevel"] # if self._dataLevel == "month": # last_key = prediction_data[-1]["key"] # key = last_key+relativedelta(months=1) # prediction_data.append({"key":key,"predicted_value":predicted_values[val]}) 
# forecasted_data.append({"key":key,"value":predicted_values[val]}) # elif self._dataLevel == "day": # last_key = prediction_data[-1]["key"] # key = last_key+relativedelta(days=1) # prediction_data.append({"key":key,"predicted_value":predicted_values[val]}) # prediction_data_copy = prediction_data # prediction_data = [] # for val in prediction_data_copy: # val["key"] = val["key"].strftime("%b-%y") # prediction_data.append(val) # forecastDataDict = {"startForecast":predicted_values[0], # "endForecast":predicted_values[prediction_window-1], # "measure":dataDict["measure"], # "forecast":True, # "forecast_percentage": round((predicted_values[prediction_window-1]-predicted_values[0])/predicted_values[0],self._num_significant_digits), # "prediction_window_text": str(prediction_window) + " months" # } # # self._result_setter.update_executive_summary_data(forecastDataDict) # summary3 = NarrativesUtils.get_template_output(self._base_dir,\ # 'trend_narrative_card3.html',forecastDataDict) self._completionStatus += old_div( self._scriptWeightDict[self._analysisName]["total"] * self._scriptStages["trendNarrativeEnd"]["weight"], 10) progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\ "trendNarrativeEnd",\ "info",\ self._scriptStages["trendNarrativeEnd"]["summary"],\ self._completionStatus,\ self._completionStatus) CommonUtils.save_progress_message( self._messageURL, progressMessage) self._dataframe_context.update_completion_status( self._completionStatus) else: # self._result_setter.update_executive_summary_data({"trend_present":False}) print("Trend Analysis for Measure Failed") print("#" * 20 + "Trend Analysis Error" + "#" * 20) print( "No date format for the date column %s was detected." 
% (self._date_column_suggested)) print("#" * 60) self._completionStatus += self._scriptWeightDict[ self._analysisName]["total"] self._dataframe_context.update_completion_status( completionStatus) progressMessage = CommonUtils.create_progress_message_object("Trend","failedState","error",\ "Trend Failed As "+"No Date Format For The Date Column %s Was Detected !!!" %(self._date_column_suggested),\ completionStatus,completionStatus) CommonUtils.save_progress_message( messageURL, progressMessage) self._dataframe_context.update_completion_status( self._completionStatus) else: # self._result_setter.update_executive_summary_data({"trend_present":False}) print("Trend Analysis for Measure Failed") print("#" * 20 + "Trend Analysis Error" + "#" * 20) print("No date column present for Trend Analysis.") print("#" * 60) self._completionStatus += self._scriptWeightDict[ self._analysisName]["total"] self._dataframe_context.update_completion_status( completionStatus) progressMessage = CommonUtils.create_progress_message_object("Trend","failedState","error",\ "No Date Column Present",\ completionStatus,completionStatus) CommonUtils.save_progress_message(messageURL, progressMessage) self._dataframe_context.update_completion_status( self._completionStatus) else: print("overall Trend not Started YET") elif self._analysistype == "dimension": print("Dimension Trend Started") self._completionStatus += old_div( self._scriptWeightDict[self._analysisName]["total"] * self._scriptStages["initialization"]["weight"], 10) progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\ "initialization",\ "info",\ self._scriptStages["initialization"]["summary"],\ self._completionStatus,\ self._completionStatus) CommonUtils.save_progress_message(self._messageURL, progressMessage) self._dataframe_context.update_completion_status( self._completionStatus) self.narratives = {"card0": {}} if self._selected_date_columns != None: if self._dateFormatDetected: # result_column_levels = [x[0] 
for x in self._data_frame.select(self._result_column).distinct().collect()] try: result_column_levels = self._metaParser.get_unique_level_names( self._result_column) except: if self._pandas_flag: result_column_levels = list( self._data_frame[self._result_column].unique()) else: result_column_levels = [ x[0] for x in self._data_frame.select( self._result_column).distinct().collect() ] # result_column_levels = self._data_frame.agg((F.collect_set(self._result_column).alias(self._result_column))).first().asDict()[self._result_column] print("-" * 100) # TODO Implement meta parser getter here print(result_column_levels) if self._pandas_flag: level_count_df = self._data_frame[ self._result_column].value_counts()[0:2] top2levels = list(level_count_df.index) else: level_count_df = self._data_frame.groupBy( self._result_column).count().orderBy( "count", ascending=False) level_count_df_rows = level_count_df.collect() top2levels = [ level_count_df_rows[0][0], level_count_df_rows[1][0] ] cardData = [] chart_data = {} cardData1 = [] c3_chart = {"dataType": "c3Chart", "data": {}} print("#" * 40) overall_count = NarrativesUtils.get_grouped_count_data_for_dimension_trend( self._data_frame, self._dataLevel, self._result_column, self._pandas_flag) print("#" * 40) for idx, level in enumerate(top2levels): print("calculations in progress for the level :- ", level) if self._pandas_flag: leveldf = self._data_frame[self._data_frame[ self._result_column] == level] else: leveldf = self._data_frame.filter( col(self._result_column) == level) grouped_data = NarrativesUtils.get_grouped_data_for_trend( leveldf, self._dataLevel, self._result_column, self._analysistype, self._pandas_flag) grouped_data.rename(columns={"value": "value_count"}, inplace=True) grouped_data = pd.merge(grouped_data, overall_count, on='key', how='left') # grouped_data["value"] = grouped_data["value_count"].apply(lambda x:round(x*100/float(self._data_frame.count()),self._num_significant_digits)) grouped_data["value"] = 
old_div( grouped_data["value_count"], grouped_data["totalCount"]) grouped_data["value"] = grouped_data["value"].apply( lambda x: round(x * 100, self. _num_significant_digits)) if self._pandas_flag: leveldf = leveldf.drop(self._date_column_suggested, axis=1) leveldf = leveldf.rename( columns={ "year_month": self._date_column_suggested }) if "year_month" not in leveldf.columns: leveldf["year_month"] = leveldf[ self._date_column_suggested] leveldf["value_col"] = 1 else: leveldf = leveldf.drop(self._date_column_suggested) leveldf = leveldf.withColumnRenamed( "year_month", self._date_column_suggested) if "year_month" not in leveldf.columns: leveldf = leveldf.withColumn( "year_month", col(self._date_column_suggested)) leveldf = leveldf.withColumn('value_col', lit(1)) trend_narrative_obj = TrendNarrative( self._result_column, self._date_column_suggested, grouped_data, self._existingDateFormat, self._requestedDateFormat, self._base_dir, self._metaParser) dataDict = trend_narrative_obj.generateDataDict( grouped_data, self._dataLevel, self._durationString) dataDict["target_column"] = dataDict["measure"] dataDict["measure"] = level dataDict["duration"] = self._duration dataDict["dataLevel"] = self._dataLevel dataDict["durationString"] = self._durationString # grouped_data.to_csv("/home/gulshan/marlabs/datasets/grouped_data"+str(idx)) # print json.dumps(dataDict,indent=2) significant_dimensions = [] significant_dimension_dict = df_helper.get_chisquare_significant_dimension( ) if significant_dimension_dict != {} and significant_dimension_dict != None: significant_dimension_tuple = tuple( significant_dimension_dict.items()) significant_dimension_tuple = sorted( significant_dimension_tuple, key=lambda x: x[1], reverse=True) significant_dimensions = [ x[0] for x in significant_dimension_tuple[:self. _number_of_dimensions_to_consider] ] else: significant_dimensions = self._string_columns[:self . 
_number_of_dimensions_to_consider] print("significant_dimensions", significant_dimensions) reference_time = dataDict["reference_time"] dataDict[ "significant_dimensions"] = significant_dimensions if len(significant_dimensions) > 0: st = time.time() xtraData = trend_narrative_obj.get_xtra_calculations( leveldf, grouped_data, significant_dimensions, self._date_column_suggested, "value_col", self._existingDateFormat, reference_time, self._dataLevel, self._pandas_flag) print("time for get_xtra_calculations", time.time() - st) if xtraData != None: dataDict.update(xtraData) dimensionCount = trend_narrative_obj.generate_dimension_extra_narrative( grouped_data, dataDict, self._dataLevel) if dimensionCount != None: dataDict.update(dimensionCount) dataDict.update({ "level_index": idx, "blockSplitter": self._blockSplitter, "highlightFlag": self._highlightFlag }) self._result_setter.update_executive_summary_data( dataDict) trendStory = NarrativesUtils.get_template_output(self._base_dir,\ 'dimension_trend.html',dataDict) blocks = NarrativesUtils.block_splitter( trendStory, self._blockSplitter) if idx != 0: cardData1 += blocks[2:] else: cardData1 += blocks trend_chart_data = [ x for x in list(grouped_data[ ["key", "value"]].T.to_dict().values()) if x['key'] != None ] trend_chart_data = sorted(trend_chart_data, key=lambda x: x["key"]) card1chartdata = trend_chart_data if self._dataLevel == "day": card1chartdata = [{ "key": str(val["key"]), "value": val["value"] } for val in card1chartdata] elif self._dataLevel == "month": card1chartdata = [{ "key": val["key"].strftime("%b-%y"), "value": val["value"] } for val in card1chartdata] chart_data[level] = card1chartdata labels = { "x": "key", "y": list(chart_data.keys())[0], "y2": list(chart_data.keys())[1] } c3Chart = { "data": chart_data, "format": "%b-%y", "label": labels, "label_text": { "x": "Time", "y": "Percentage of " + labels["y"], "y2": "Percentage of " + labels["y2"] } } c3_chart["data"] = c3Chart multiLineData = [] for idx in 
range(len(chart_data[top2levels[0]])): key = chart_data[top2levels[0]][idx]["key"] value = chart_data[top2levels[0]][idx]["value"] try: value1 = chart_data[top2levels[1]][idx]["value"] except: value1 = 0 multiLineData.append({ "key": key, top2levels[0]: value, top2levels[1]: value1 }) chartData = NormalChartData(multiLineData) chartJson = ChartJson() chartJson.set_data(chartData.get_data()) chartJson.set_label_text(c3Chart["label_text"]) chartJson.set_legend(c3Chart["label"]) chartJson.set_chart_type("line") chartJson.set_yaxis_number_format(".2f") chartJson.set_axes(labels) st_info = [ "Trend Analysis", "Forecast Method : Holt Winters Method" ] cardData1.insert(1, C3ChartData(data=chartJson, info=st_info)) trendCard = NormalCard(name="Trend Analysis", slug=None, cardData=cardData1) trendStoryNode = NarrativesTree("Trend", None, [], [trendCard]) self._story_narrative.add_a_node(trendStoryNode) self._result_setter.set_trend_node(trendStoryNode) self._completionStatus += old_div( self._scriptWeightDict[self._analysisName]["total"] * self._scriptStages["summarygeneration"]["weight"], 10) progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\ "summarygeneration",\ "info",\ self._scriptStages["summarygeneration"]["summary"],\ self._completionStatus,\ self._completionStatus) CommonUtils.save_progress_message(self._messageURL, progressMessage) self._dataframe_context.update_completion_status( self._completionStatus) self._completionStatus += old_div( self._scriptWeightDict[self._analysisName]["total"] * self._scriptStages["completion"]["weight"], 10) progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\ "completion",\ "info",\ self._scriptStages["completion"]["summary"],\ self._completionStatus,\ self._completionStatus) CommonUtils.save_progress_message(self._messageURL, progressMessage) self._dataframe_context.update_completion_status( self._completionStatus) else: self._result_setter.update_executive_summary_data( 
{"trend_present": False}) print("Trend Analysis for Dimension Failed") print("#" * 20 + "Trend Analysis Error" + "#" * 20) if self._date_column_suggested: print( "No date format for the date column %s was detected." % (self._date_column_suggested)) print("#" * 60) self._completionStatus += self._scriptWeightDict[ self._analysisName]["total"] self._dataframe_context.update_completion_status( self._completionStatus) progressMessage = CommonUtils.create_progress_message_object("Trend","failedState","error",\ "Trend Failed As "+"No Date Format For The Date Column %s Was Detected !!!" %(self._date_column_suggested),\ self._completionStatus,self._completionStatus) CommonUtils.save_progress_message(messageURL, progressMessage) self._dataframe_context.update_completion_status( self._completionStatus) else: self._result_setter.update_executive_summary_data( {"trend_present": False}) print("Trend Analysis for Dimension Failed") print("#" * 20 + "Trend Analysis Error" + "#" * 20) print("No date column present for Trend Analysis.") print("#" * 60) self._completionStatus += self._scriptWeightDict[ self._analysisName]["total"] self._dataframe_context.update_completion_status( self._completionStatus) progressMessage = CommonUtils.create_progress_message_object("Trend","failedState","error",\ "No Date Column Present",\ self._completionStatus,self._completionStatus) CommonUtils.save_progress_message(messageURL, progressMessage) self._dataframe_context.update_completion_status( self._completionStatus)
def generate_narratives(self):
    """Build the regression narrative cards and attach them to the result tree.

    Renders the main "Key Influencers" card (template + bar chart of the
    regression coefficients), then one card per significant measure with its
    impact narrative and sensitivity (card4) chart. Optionally adds a
    dimension-level "Key Areas where it Matters" card per measure when
    ``self._run_dimension_level_regression`` is set.

    Side effects: mutates ``self.narratives``, appends cards/nodes to
    ``self._regressionNode`` and ``self._story_narrative``, pushes progress
    messages, and feeds the first measure's card4 data to the executive summary.
    """
    regression_narrative_obj = LinearRegressionNarrative(
        self._df_regression_result,
        self._correlations,
        self._dataframe_helper,
        self._dataframe_context,
        self._metaParser,
        self._spark
    )
    # ---- main card: overall coefficient summary -------------------------
    main_card_data = regression_narrative_obj.generate_main_card_data()
    main_card_narrative = NarrativesUtils.get_template_output(
        self._base_dir, 'regression_main_card.html', main_card_data)
    self.narratives['main_card'] = {}
    self.narratives["main_card"]['paragraphs'] = NarrativesUtils.paragraph_splitter(main_card_narrative)
    self.narratives["main_card"]['header'] = 'Key Measures that affect ' + self.result_column
    self.narratives["main_card"]['chart'] = {}
    self.narratives["main_card"]['chart']['heading'] = ''
    self.narratives["main_card"]['chart']['data'] = [
        [i for i, j in self._all_coeffs],
        [j['coefficient'] for i, j in self._all_coeffs]]
    self.narratives["main_card"]['chart']['label'] = {
        'x': 'Measure Name',
        'y': 'Change in ' + self.result_column + ' per unit increase'}

    main_card = NormalCard()
    main_card_header = HtmlData(data='<h3>Key Measures that affect ' + self.result_column + "</h3>")
    main_card_paragraphs = NarrativesUtils.block_splitter(main_card_narrative, self._blockSplitter)
    main_card_chart_data = [{"key": val[0], "value": val[1]}
                            for val in zip([i for i, j in self._all_coeffs],
                                           [j['coefficient'] for i, j in self._all_coeffs])]
    main_card_chart = NormalChartData(data=main_card_chart_data)
    mainCardChartJson = ChartJson()
    mainCardChartJson.set_data(main_card_chart.get_data())
    mainCardChartJson.set_label_text({'x': 'Influencing Factors',
                                      'y': 'Change in ' + self.result_column + ' per unit increase'})
    mainCardChartJson.set_chart_type("bar")
    mainCardChartJson.set_axes({"x": "key", "y": "value"})
    mainCardChartJson.set_yaxis_number_format(".2f")
    # chart_data sorted descending by coefficient => [0] is max, [-1] is min
    chart_data = sorted(main_card_chart_data, key=lambda x: x["value"], reverse=True)
    statistical_info_array = [
        ("Test Type", "Regression"),
        ("Effect Size", "Coefficients"),
        ("Max Effect Size", chart_data[0]["key"]),
        ("Min Effect Size", chart_data[-1]["key"]),
    ]
    # FIX: was a dead typo'd name `statistical_inferenc`; initialize the
    # variable the guard below actually reads.
    statistical_inference = ""
    if len(chart_data) == 1:
        statistical_inference = "{} is the only variable that have significant influence over {} (Target) having an Effect size of {}".format(
            chart_data[0]["key"], self._dataframe_context.get_result_column(), round(chart_data[0]["value"], 4))
    elif len(chart_data) == 2:
        statistical_inference = "There are two variables ({} and {}) that have significant influence over {} (Target) and the Effect size ranges are {} and {} respectively".format(
            chart_data[0]["key"], chart_data[1]["key"], self._dataframe_context.get_result_column(),
            round(chart_data[0]["value"], 4), round(chart_data[1]["value"], 4))
    else:
        statistical_inference = "There are {} variables that have significant influence over {} (Target) and the Effect size ranges from {} to {}".format(
            len(chart_data), self._dataframe_context.get_result_column(),
            round(chart_data[0]["value"], 4), round(chart_data[-1]["value"], 4))
    if statistical_inference != "":
        statistical_info_array.append(("Inference", statistical_inference))
    statistical_info_array = NarrativesUtils.statistical_info_array_formatter(statistical_info_array)
    main_card.set_card_data(data=[main_card_header] + main_card_paragraphs +
                            [C3ChartData(data=mainCardChartJson, info=statistical_info_array)])
    main_card.set_card_name("Key Influencers")
    self._regressionNode.add_a_card(main_card)

    # ---- one narrative node per significant measure ---------------------
    count = 0
    for measure_column in self.significant_measures:
        sigMeasureNode = NarrativesTree()
        sigMeasureNode.set_name(measure_column)
        measureCard1 = NormalCard()
        measureCard1.set_card_name("{}: Impact on {}".format(measure_column, self.result_column))
        measureCard1Data = []
        if self._run_dimension_level_regression:
            measureCard2 = NormalCard()
            measureCard2.set_card_name("Key Areas where it Matters")
            measureCard2Data = []
        measure_column_cards = {}
        card0 = {}
        card1data = regression_narrative_obj.generate_card1_data(measure_column)
        card1heading = "<h3>Impact of " + measure_column + " on " + self.result_column + "</h3>"
        measureCard1Header = HtmlData(data=card1heading)
        card1data.update({"blockSplitter": self._blockSplitter})
        card1narrative = NarrativesUtils.get_template_output(
            self._base_dir, 'regression_card1.html', card1data)
        card1paragraphs = NarrativesUtils.block_splitter(card1narrative, self._blockSplitter)
        card0 = {"paragraphs": card1paragraphs}
        card0["charts"] = {}
        card0['charts']['chart2'] = {}
        card0['charts']['chart1'] = {}
        card0["heading"] = card1heading
        measure_column_cards['card0'] = card0
        measureCard1Header = HtmlData(data=card1heading)
        measureCard1Data += [measureCard1Header]
        measureCard1para = card1paragraphs
        measureCard1Data += measureCard1para

        if self._run_dimension_level_regression:
            # Per-dimension-level regression: builds the "Key Areas" card with
            # optional heat-map tables (table1/table2) from card2 data.
            print("running narratives for key area dict")
            self._dim_regression = self.run_regression_for_dimension_levels()
            card2table, card2data = regression_narrative_obj.generate_card2_data(measure_column, self._dim_regression)
            card2data.update({"blockSplitter": self._blockSplitter})
            card2narrative = NarrativesUtils.get_template_output(
                self._base_dir, 'regression_card2.html', card2data)
            card2paragraphs = NarrativesUtils.block_splitter(card2narrative, self._blockSplitter)
            card1 = {'tables': card2table,
                     'paragraphs': card2paragraphs,
                     'heading': 'Key Areas where ' + measure_column + ' matters'}
            measure_column_cards['card1'] = card1
            measureCard2Data += card2paragraphs
            if "table1" in card2table:
                table1data = regression_narrative_obj.convert_table_data(card2table["table1"])
                card2Table1 = TableData()
                card2Table1.set_table_data(table1data)
                card2Table1.set_table_type("heatMap")
                card2Table1.set_table_top_header(card2table["table1"]["heading"])
                card2Table1Json = json.loads(CommonUtils.convert_python_object_to_json(card2Table1))
                measureCard2Data.insert(3, card2Table1Json)
            if "table2" in card2table:
                table2data = regression_narrative_obj.convert_table_data(card2table["table2"])
                card2Table2 = TableData()
                card2Table2.set_table_data(table2data)
                card2Table2.set_table_type("heatMap")
                card2Table2.set_table_top_header(card2table["table2"]["heading"])
                card2Table2Json = json.loads(CommonUtils.convert_python_object_to_json(card2Table2))
                measureCard2Data.append(card2Table2Json)

        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName, "custom", "info", "Analyzing Key Influencers",
            self._completionStatus, self._completionStatus, display=True)
        CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=False)
        # ---- card4: sensitivity analysis chart + inference ----------------
        card4data = regression_narrative_obj.generate_card4_data(self.result_column, measure_column)
        card4data.update({"blockSplitter": self._blockSplitter})
        card4narrative = NarrativesUtils.get_template_output(
            self._base_dir, 'regression_card4.html', card4data)
        card4paragraphs = NarrativesUtils.block_splitter(card4narrative, self._blockSplitter)
        card0['paragraphs'] = card1paragraphs + card4paragraphs
        card4Chart = card4data["charts"]
        statistical_info_array = [
            ("Test Type", "Regression"),
            ("Coefficient", str(round(self._df_regression_result.get_coeff(measure_column), 2))),
            ("P-Value", "<= 0.05"),
            ("Intercept", str(round(self._df_regression_result.get_intercept(), 2))),
            ("R Square ", str(round(self._df_regression_result.get_rsquare(), 2))),
        ]
        inferenceTuple = ()
        coeff = self._df_regression_result.get_coeff(measure_column)
        if coeff > 0:
            inferenceTuple = ("Inference", "For every additional unit of increase in {} there will be an increase of {} units in {} (target).".format(
                measure_column, str(round(coeff, 2)), self._dataframe_context.get_result_column()))
        else:
            inferenceTuple = ("Inference", "For every additional unit of decrease in {} there will be an decrease of {} units in {} (target).".format(
                measure_column, str(round(coeff, 2)), self._dataframe_context.get_result_column()))
        if len(inferenceTuple) > 0:
            statistical_info_array.append(inferenceTuple)
        statistical_info_array = NarrativesUtils.statistical_info_array_formatter(statistical_info_array)
        card4paragraphs.insert(2, C3ChartData(data=card4Chart, info=statistical_info_array))
        measureCard1Data += card4paragraphs
        self.narratives['cards'].append(measure_column_cards)
        if count == 0:
            # Only the first (strongest) measure feeds the executive summary;
            # drop the chart payload, the summary only needs the text data.
            card4data.pop("charts")
            self._result_setter.update_executive_summary_data(card4data)
        count += 1
        measureCard1.set_card_data(measureCard1Data)
        if self._run_dimension_level_regression:
            measureCard2.set_card_data(measureCard2Data)
            sigMeasureNode.add_cards([measureCard1, measureCard2])
        else:
            # FIX: this add was previously unconditional, so measureCard1 was
            # attached twice whenever the dimension-level branch ran.
            sigMeasureNode.add_cards([measureCard1])
        self._regressionNode.add_a_node(sigMeasureNode)
    self._story_narrative.add_a_node(self._regressionNode)
def __init__(self, df_helper, df_context, result_setter, spark, df_regression_result, correlations,story_narrative,meta_parser):
    """Orchestrate the regression-narratives generation for one analysis run.

    Note: this constructor is not passive — it reports progress, calls
    ``self.generate_narratives()`` and registers the resulting node on the
    result setter, so instantiating the class runs the whole narrative step.

    :param df_helper: dataframe helper (column metadata, result column)
    :param df_context: dataframe context (progress/completion bookkeeping)
    :param result_setter: sink for the finished narrative node
    :param spark: Spark session handle
    :param df_regression_result: fitted regression result (coefficients, p-values)
    :param correlations: correlation data passed through to the narrative object
    :param story_narrative: overall story tree this node attaches to
    :param meta_parser: metadata parser (ignored-column suggestions, etc.)
    """
    self._metaParser = meta_parser
    self._result_setter = result_setter
    self._story_narrative = story_narrative
    self._df_regression_result = df_regression_result
    self._correlations = correlations
    self._dataframe_helper = df_helper
    self._dataframe_context = df_context
    self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
    # self._result_setter.set_trend_section_name("regression")
    self._measure_columns = self._dataframe_helper.get_numeric_columns()
    self._dimension_columns = self._dataframe_helper.get_string_columns()
    self._date_columns = self._dataframe_context.get_date_columns()
    self._uid_col = self._dataframe_context.get_uid_column()
    # Drop the unique-id column and any date columns from the dimensions:
    # neither should be treated as a categorical predictor.
    if self._metaParser.check_column_isin_ignored_suggestion(self._uid_col):
        self._dimension_columns = list(set(self._dimension_columns) - {self._uid_col})
    if len(self._date_columns) >0 :
        self._dimension_columns = list(set(self._dimension_columns)-set(self._date_columns))
    self._spark = spark
    self.measures = []
    self.result_column = self._dataframe_helper.resultcolumn
    # Sort coefficients by absolute magnitude (largest effect first) and keep
    # only those with p-value <= 0.05 as the significant measures.
    self.all_coefficients = self._df_regression_result.get_all_coeff()
    all_coeff = [(x,self.all_coefficients[x]) for x in list(self.all_coefficients.keys())]
    all_coeff = sorted(all_coeff,key = lambda x:abs(x[1]["coefficient"]),reverse = True)
    self._all_coeffs = all_coeff
    self.significant_measures = [x[0] for x in all_coeff if x[1]['p_value']<=0.05]
    print(self.significant_measures)
    print("regression narratives started")
    self.narratives = {"heading": self.result_column + "Performance Report",
                       "main_card":{},
                       "cards":[]
                       }
    self._base_dir = "/regression/"
    # Dimension-level regression is disabled by default; generate_narratives
    # checks this flag before building the per-level cards.
    self._run_dimension_level_regression = False
    # self._dim_regression = self.run_regression_for_dimension_levels()
    self._regressionNode = NarrativesTree()
    self._completionStatus = self._dataframe_context.get_completion_status()
    self._analysisName = self._dataframe_context.get_analysis_name()
    self._messageURL = self._dataframe_context.get_message_url()
    self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight()
    self._scriptStages = {
        "regressionNarrativeStart":{
            "summary":"Started The Regression Narratives",
            "weight":1
            },
        "regressionNarrativeEnd":{
            "summary":"Narratives For Regression Finished",
            "weight":0
            },
        }
    # Report the "start" progress stage before doing the actual work.
    self._completionStatus += old_div(self._scriptWeightDict[self._analysisName]["narratives"]*self._scriptStages["regressionNarrativeStart"]["weight"],10)
    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                "regressionNarrativeStart",\
                                "info",\
                                self._scriptStages["regressionNarrativeStart"]["summary"],\
                                self._completionStatus,\
                                self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL,progressMessage)
    self._dataframe_context.update_completion_status(self._completionStatus)
    # Build all narrative cards and publish the finished node.
    self.generate_narratives()
    self._regressionNode.set_name("Influencers")
    self._result_setter.set_regression_node(self._regressionNode)
    # Report the "end" progress stage (weight 0 — status value is unchanged).
    self._completionStatus += old_div(self._scriptWeightDict[self._analysisName]["narratives"]*self._scriptStages["regressionNarrativeEnd"]["weight"],10)
    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                "regressionNarrativeEnd",\
                                "info",\
                                self._scriptStages["regressionNarrativeEnd"]["summary"],\
                                self._completionStatus,\
                                self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL,progressMessage)
    self._dataframe_context.update_completion_status(self._completionStatus)
def run(self):
    """Compute and return the dataset metadata (``DfMetaData``).

    Pipeline: sample the data, detect percentage/dollar columns and promote
    them to measures, compute per-column stats for measures / dimensions /
    time dimensions, derive ignore / utf8 / datetime suggestions per column,
    and assemble everything into a ``DfMetaData`` object. Progress messages
    are emitted after each stage.

    FIX: this method still used Python-2 ``print`` statements (a module-level
    SyntaxError under Python 3, while the rest of the file uses ``print()``)
    and subscripted ``filter(...)`` (not subscriptable in Python 3). Both are
    corrected below with no behavioral change under Python 2 semantics.
    """
    # ---- stage 1: sampling ---------------------------------------------
    self._start_time = time.time()
    metaHelperInstance = MetaDataHelper(self._data_frame, self._total_rows)
    sampleData = metaHelperInstance.get_sample_data()
    sampleData = sampleData.toPandas()
    sampleData = metaHelperInstance.format_sampledata_timestamp_columns(
        sampleData, self._timestamp_columns, self._stripTimestamp)
    time_taken_sampling = time.time() - self._start_time
    self._completionStatus += self._scriptStages["sampling"]["weight"]
    print("sampling takes", time_taken_sampling)
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName,
        "sampling",
        "info",
        self._scriptStages["sampling"]["summary"],
        self._completionStatus,
        self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage,
                                      ignore=self._ignoreMsgFlag)
    metaData = []
    metaData.append(MetaData(name="noOfRows", value=self._total_rows,
                             display=True, displayName="Rows"))
    metaData.append(MetaData(name="noOfColumns", value=self._total_columns,
                             display=True, displayName="Columns"))
    # Percentage-like string columns ("12%") are converted and promoted to
    # numeric measures; same for dollar-like columns below.
    self._percentage_columns = metaHelperInstance.get_percentage_columns(
        self._string_columns)
    if len(self._percentage_columns) > 0:
        self._data_frame = CommonUtils.convert_percentage_columns(
            self._data_frame, self._percentage_columns)
        self._numeric_columns = self._numeric_columns + self._percentage_columns
        self._string_columns = list(
            set(self._string_columns) - set(self._percentage_columns))
        self.update_column_type_dict()
    self._dollar_columns = metaHelperInstance.get_dollar_columns(
        self._string_columns)
    if len(self._dollar_columns) > 0:
        self._data_frame = CommonUtils.convert_dollar_columns(
            self._data_frame, self._dollar_columns)
        self._numeric_columns = self._numeric_columns + self._dollar_columns
        self._string_columns = list(
            set(self._string_columns) - set(self._dollar_columns))
        self.update_column_type_dict()
    # Singular/plural display names depend on the column counts.
    if len(self._numeric_columns) > 1:
        metaData.append(MetaData(name="measures",
                                 value=len(self._numeric_columns),
                                 display=True, displayName="Measures"))
    else:
        metaData.append(MetaData(name="measures",
                                 value=len(self._numeric_columns),
                                 display=True, displayName="Measure"))
    if len(self._string_columns) > 1:
        metaData.append(MetaData(name="dimensions",
                                 value=len(self._string_columns + self._boolean_columns),
                                 display=True, displayName="Dimensions"))
    else:
        metaData.append(MetaData(name="dimensions",
                                 value=len(self._string_columns + self._boolean_columns),
                                 display=True, displayName="Dimension"))
    if len(self._timestamp_columns) > 1:
        metaData.append(MetaData(name="timeDimension",
                                 value=len(self._timestamp_columns),
                                 display=True, displayName="Time Dimensions"))
    else:
        metaData.append(MetaData(name="timeDimension",
                                 value=len(self._timestamp_columns),
                                 display=True, displayName="Time Dimension"))
    metaData.append(MetaData(name="measureColumns",
                             value=self._numeric_columns, display=False))
    metaData.append(MetaData(name="dimensionColumns",
                             value=self._string_columns + self._boolean_columns,
                             display=False))
    metaData.append(MetaData(name="timeDimensionColumns",
                             value=self._timestamp_columns, display=False))
    metaData.append(MetaData(name="percentageColumns",
                             value=self._percentage_columns, display=False))
    metaData.append(MetaData(name="dollarColumns",
                             value=self._dollar_columns, display=False))
    columnData = []
    headers = []
    # ---- stage 2: measure column stats ---------------------------------
    self._start_time = time.time()
    print("Count of Numeric columns", len(self._numeric_columns))
    measureColumnStat, measureCharts = metaHelperInstance.calculate_measure_column_stats(
        self._data_frame, self._numeric_columns, binColumn=self._binned_stat_flag)
    time_taken_measurestats = time.time() - self._start_time
    self._completionStatus += self._scriptStages["measurestats"]["weight"]
    print("measure stats takes", time_taken_measurestats)
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName,
        "measurestats",
        "info",
        self._scriptStages["measurestats"]["summary"],
        self._completionStatus,
        self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage,
                                      ignore=self._ignoreMsgFlag)
    # ---- stage 3: dimension column stats -------------------------------
    self._start_time = time.time()
    dimensionColumnStat, dimensionCharts = metaHelperInstance.calculate_dimension_column_stats(
        self._data_frame, self._string_columns + self._boolean_columns,
        levelCount=self._level_count_flag)
    # Py3 fix: filter() returns an iterator (not subscriptable); pick the
    # "numberOfUniqueValues" stat for each dimension via a comprehension.
    self._dataSize["dimensionLevelCountDict"] = {
        k: [x for x in v if x["name"] == "numberOfUniqueValues"][0]["value"]
        for k, v in dimensionColumnStat.items()
    }
    self._dataSize["totalLevels"] = sum(
        self._dataSize["dimensionLevelCountDict"].values())
    time_taken_dimensionstats = time.time() - self._start_time
    self._completionStatus += self._scriptStages["dimensionstats"]["weight"]
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName,
        "dimensionstats",
        "info",
        self._scriptStages["dimensionstats"]["summary"],
        self._completionStatus,
        self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage,
                                      ignore=self._ignoreMsgFlag)
    # ---- stage 4: time-dimension column stats --------------------------
    self._start_time = time.time()
    timeDimensionColumnStat, timeDimensionCharts = metaHelperInstance.calculate_time_dimension_column_stats(
        self._data_frame, self._timestamp_columns,
        level_count_flag=self._level_count_flag)
    time_taken_tdstats = time.time() - self._start_time
    self._completionStatus += self._scriptStages["timedimensionstats"]["weight"]
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName,
        "timedimensionstats",
        "info",
        self._scriptStages["timedimensionstats"]["summary"],
        self._completionStatus,
        self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage,
                                      ignore=self._ignoreMsgFlag)
    # ---- stage 5: per-column suggestions -------------------------------
    self._start_time = time.time()
    ignoreColumnSuggestions = []
    ignoreColumnReason = []
    utf8ColumnSuggestion = []
    dateTimeSuggestions = {}
    for column in self._data_frame.columns:
        random_slug = uuid.uuid4().hex
        headers.append(ColumnHeader(name=column, slug=random_slug))
        data = ColumnData()
        data.set_slug(random_slug)
        data.set_name(column)
        data.set_abstract_datatype(self._column_type_dict[column]["abstract"])
        columnStat = []
        columnChartData = None
        # Attach the stats/chart computed above for the column's type.
        if self._column_type_dict[column]["abstract"] == "measure":
            data.set_column_stats(measureColumnStat[column])
            data.set_column_chart(measureCharts[column])
            data.set_actual_datatype(self._column_type_dict[column]["actual"])
        elif self._column_type_dict[column]["abstract"] == "dimension":
            data.set_column_stats(dimensionColumnStat[column])
            data.set_column_chart(dimensionCharts[column])
            data.set_actual_datatype(self._column_type_dict[column]["actual"])
        elif self._column_type_dict[column]["abstract"] == "datetime":
            data.set_column_stats(timeDimensionColumnStat[column])
            data.set_column_chart(timeDimensionCharts[column])
            data.set_actual_datatype(self._column_type_dict[column]["actual"])
        if self._column_type_dict[column]["abstract"] == "measure":
            # Columns flagged as "real" are never suggested for ignoring.
            if column not in self._real_columns:
                ignoreSuggestion, ignoreReason = metaHelperInstance.get_ignore_column_suggestions(
                    self._data_frame, column, "measure",
                    measureColumnStat[column], max_levels=self._max_levels)
                if ignoreSuggestion:
                    ignoreColumnSuggestions.append(column)
                    ignoreColumnReason.append(ignoreReason)
                    data.set_level_count_to_null()
                    data.set_chart_data_to_null()
                    data.set_ignore_suggestion_flag(True)
                    data.set_ignore_suggestion_message(ignoreReason)
        elif self._column_type_dict[column]["abstract"] == "dimension":
            if self._level_count_flag:
                utf8Suggestion = metaHelperInstance.get_utf8_suggestions(
                    dimensionColumnStat[column])
            else:
                utf8Suggestion = False
            # Try to detect a datetime format hiding in a string dimension.
            if self._column_type_dict[column]["actual"] != "boolean":
                uniqueVals = self._data_frame.select(
                    column).distinct().na.drop().collect()
            else:
                uniqueVals = []
            if len(uniqueVals) > 0:
                dateColumnFormat = metaHelperInstance.get_datetime_format(
                    uniqueVals)
            else:
                dateColumnFormat = None
            if dateColumnFormat:
                dateTimeSuggestions.update({column: dateColumnFormat})
                data.set_level_count_to_null()
                data.set_chart_data_to_null()
                data.set_date_suggestion_flag(True)
            if utf8Suggestion:
                utf8ColumnSuggestion.append(column)
            ignoreSuggestion, ignoreReason = metaHelperInstance.get_ignore_column_suggestions(
                self._data_frame, column, "dimension",
                dimensionColumnStat[column], max_levels=self._max_levels)
            if ignoreSuggestion:
                ignoreColumnSuggestions.append(column)
                ignoreColumnReason.append(ignoreReason)
                data.set_level_count_to_null()
                data.set_chart_data_to_null()
                data.set_ignore_suggestion_flag(True)
                data.set_ignore_suggestion_message(ignoreReason)
        columnData.append(data)
    # Date-suggested columns should not also be ignore-suggested.
    for dateColumn in dateTimeSuggestions.keys():
        if dateColumn in ignoreColumnSuggestions:
            ignoreColIdx = ignoreColumnSuggestions.index(dateColumn)
            ignoreColumnSuggestions.remove(dateColumn)
            del (ignoreColumnReason[ignoreColIdx])
    # utf8 columns are always added to the ignore suggestions.
    for utfCol in utf8ColumnSuggestion:
        ignoreColumnSuggestions.append(utfCol)
        ignoreColumnReason.append("utf8 values present")
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName, "custom", "info",
        "Validating Metadata Information",
        self._completionStatus, self._completionStatus, display=True)
    CommonUtils.save_progress_message(self._messageURL, progressMessage,
                                      ignore=self._ignoreMsgFlag)
    metaData.append(MetaData(name="ignoreColumnSuggestions",
                             value=ignoreColumnSuggestions, display=False))
    metaData.append(MetaData(name="ignoreColumnReason",
                             value=ignoreColumnReason, display=False))
    metaData.append(MetaData(name="utf8ColumnSuggestion",
                             value=utf8ColumnSuggestion, display=False))
    metaData.append(MetaData(name="dateTimeSuggestions",
                             value=dateTimeSuggestions, display=False))
    metaData.append(MetaData(name="dataSizeSummary",
                             value=self._dataSize, display=False))
    # ---- assemble the result -------------------------------------------
    dfMetaData = DfMetaData()
    dfMetaData.set_column_data(columnData)
    dfMetaData.set_header(headers)
    dfMetaData.set_meta_data(metaData)
    dfMetaData.set_sample_data(sampleData)
    time_taken_suggestions = time.time() - self._start_time
    self._completionStatus += self._scriptStages["suggestions"]["weight"]
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName,
        "suggestions",
        "info",
        self._scriptStages["suggestions"]["summary"],
        self._completionStatus,
        self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage,
                                      ignore=self._ignoreMsgFlag)
    self._dataframe_context.update_completion_status(self._completionStatus)
    return dfMetaData
def run(self):
    """Build and return the DfMetaData object describing the dataset.

    Pipeline (each numbered stage bumps ``self._completionStatus`` and emits a
    progress message through CommonUtils):
      1. sample the data frame,
      2. detect datetime-like string columns and move them to the timestamp bucket,
      3. compute measure / time-dimension / dimension column statistics,
      4. tag duplicate columns and collect ignore / utf8 suggestions per column,
      5. assemble ColumnData / ColumnHeader / MetaData into a DfMetaData.

    Returns:
        DfMetaData: column data, headers, metadata entries and the sample data.
    """
    self._start_time = time.time()
    metaHelperInstance = MetaDataHelper(self._data_frame, self._total_rows)
    sampleData = metaHelperInstance.get_sample_data()
    # Spark path: bring the sample to pandas so the same downstream code works.
    if not self._pandas_flag:
        sampleData = sampleData.toPandas()
    time_taken_sampling = time.time() - self._start_time
    self._completionStatus += self._scriptStages["sampling"]["weight"]
    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,
                                                                 "sampling",
                                                                 "info",
                                                                 self._scriptStages["sampling"]["summary"],
                                                                 self._completionStatus,
                                                                 self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=self._ignoreMsgFlag)
    metaData = []
    metaData.append(MetaData(name="noOfRows", value=self._total_rows, display=True, displayName="Rows"))
    metaData.append(MetaData(name="noOfColumns", value=self._total_columns, display=True, displayName="Columns"))
    # self._percentage_columns = metaHelperInstance.get_percentage_columns(self._string_columns)
    separation_time = time.time()
    # String columns that turn out to hold datetimes are collected here and
    # moved into the timestamp bucket after the loop.
    self._timestamp_string_columns = []
    uniqueVals = []
    dateTimeSuggestions = {}
    # NOTE(review): this detection loop only runs on the Spark path (guarded by
    # `not self._pandas_flag`), yet it contains an `if self._pandas_flag:` branch
    # that can therefore never be taken here — looks like leftover from a
    # pandas/Spark merge; confirm intent.
    if not self._pandas_flag:
        for column in self._string_columns:
            if self._column_type_dict[column]["actual"] != "boolean":
                # uniqueVals = self._data_frame.select(column).na.drop().distinct().limit(10).collect()
                uniqueVals = sampleData[column].unique().tolist()
            else:
                uniqueVals = []
            ## TODO : remove pandas if not needed later
            if self._pandas_flag:
                # Probe the max value of the column first; only if that parses as
                # a datetime is the format derived from the sampled unique values.
                if len(uniqueVals) > 0 and metaHelperInstance.get_datetime_format_pandas([self._data_frame.sort_values(by=column, ascending=False)[column][0]]) != None:
                    dateColumnFormat = metaHelperInstance.get_datetime_format_pandas(uniqueVals)
                else:
                    dateColumnFormat = None
            else:
                if len(uniqueVals) > 0 and metaHelperInstance.get_datetime_format([self._data_frame.orderBy([column], ascending=[False]).select(column).first()[0]]) != None:
                    dateColumnFormat = metaHelperInstance.get_datetime_format(uniqueVals)
                else:
                    dateColumnFormat = None
            if dateColumnFormat:
                dateTimeSuggestions.update({column: dateColumnFormat})
                # NOTE(review): this ColumnData instance is configured but never
                # appended anywhere (columnData is created later in this method)
                # — verify whether it is intentionally discarded.
                data = ColumnData()
                data.set_level_count_to_null()
                data.set_chart_data_to_null()
                data.set_date_suggestion_flag(True)
                data.set_abstract_datatype("datetime")
                data.set_actual_datatype("datetime")
                self._timestamp_string_columns.append(column)
                ## TO DO : remove pandas if not needed later
                if self._pandas_flag:
                    self._data_frame[column] = pd.to_datetime(self._data_frame[column], format=dateColumnFormat)
                else:
                    self._data_frame = self._data_frame.withColumn(column, self.to_date_(column))
    sampleData = metaHelperInstance.format_sampledata_timestamp_columns(sampleData, self._timestamp_columns, self._stripTimestamp)
    print("sampling takes", time_taken_sampling)
    # Reclassify: detected datetime strings leave the string bucket and join the
    # timestamp bucket.
    self._string_columns = list(set(self._string_columns) - set(self._timestamp_string_columns))
    self._timestamp_columns = self._timestamp_columns + self._timestamp_string_columns
    # self.update_column_type_dict()
    print("time taken for separating date columns from string is :", time.time() - separation_time)
    # if len(self._percentage_columns)>0:
    #     self._data_frame = CommonUtils.convert_percentage_columns(self._data_frame,self._percentage_columns)
    #     self._numeric_columns = self._numeric_columns + self._percentage_columns
    #     self._string_columns = list(set(self._string_columns)-set(self._percentage_columns))
    #     self.update_column_type_dict()
    # self._dollar_columns = metaHelperInstance.get_dollar_columns(self._string_columns)
    # if len(self._dollar_columns)>0:
    #     self._data_frame = CommonUtils.convert_dollar_columns(self._data_frame,self._dollar_columns)
    #     self._numeric_columns = self._numeric_columns + self._dollar_columns
    #     self._string_columns = list(set(self._string_columns)-set(self._dollar_columns))
    #     self.update_column_type_dict()
    columnData = []
    headers = []
    self._start_time = time.time()
    print("Count of Numeric columns", len(self._numeric_columns))
    try:
        measureColumnStat, measureCharts = metaHelperInstance.calculate_measure_column_stats(self._data_frame, self._numeric_columns, binColumn=self._binned_stat_flag, pandas_flag=self._pandas_flag)
    except Exception as e:
        raise Exception(e)
    time_taken_measurestats = time.time() - self._start_time
    self._completionStatus += self._scriptStages["measurestats"]["weight"]
    print("measure stats takes", time_taken_measurestats)
    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,
                                                                 "measurestats",
                                                                 "info",
                                                                 self._scriptStages["measurestats"]["summary"],
                                                                 self._completionStatus,
                                                                 self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=self._ignoreMsgFlag)
    print("Count of DateTime columns", len(self._timestamp_columns))
    self._start_time = time.time()
    # time_columns=self._timestamp_columns
    # time_string_columns=self._timestamp_string_columns
    # original_timestamp_columns=list(set(self._timestamp_columns)-set(self._timestamp_string_columns))
    timeDimensionColumnStat, timeDimensionCharts, unprocessed_columns = metaHelperInstance.calculate_time_dimension_column_stats(self._data_frame, self._timestamp_columns, level_count_flag=self._level_count_flag, pandas_flag=self._pandas_flag)
    # Columns the time-dimension pass could not handle fall back to strings.
    self._string_columns = self._string_columns + unprocessed_columns
    self._timestamp_columns = list(set(self._timestamp_columns) - set(unprocessed_columns))
    self.update_column_type_dict()
    # Singular/plural display names depending on the counts.
    if len(self._numeric_columns) > 1:
        # print "self._numeric_columns : ", self._numeric_columns
        metaData.append(MetaData(name="measures", value=len(self._numeric_columns), display=True, displayName="Measures"))
    else:
        metaData.append(MetaData(name="measures", value=len(self._numeric_columns), display=True, displayName="Measure"))
    if len(self._string_columns) > 1:
        metaData.append(MetaData(name="dimensions", value=len(self._string_columns + self._boolean_columns), display=True, displayName="Dimensions"))
    else:
        metaData.append(MetaData(name="dimensions", value=len(self._string_columns + self._boolean_columns), display=True, displayName="Dimension"))
    if len(self._timestamp_columns) > 1:
        metaData.append(MetaData(name="timeDimension", value=len(self._timestamp_columns), display=True, displayName="Time Dimensions"))
    else:
        metaData.append(MetaData(name="timeDimension", value=len(self._timestamp_columns), display=True, displayName="Time Dimension"))
    metaData.append(MetaData(name="measureColumns", value=self._numeric_columns, display=False))
    metaData.append(MetaData(name="dimensionColumns", value=self._string_columns + self._boolean_columns, display=False))
    metaData.append(MetaData(name="timeDimensionColumns", value=self._timestamp_columns, display=False))
    # metaData.append(MetaData(name="percentageColumns",value = self._percentage_columns,display=False))
    # metaData.append(MetaData(name="dollarColumns",value = self._dollar_columns,display=False))
    # timeDimensionColumnStat2,timeDimensionCharts2,unprocessed_columns = metaHelperInstance.calculate_time_dimension_column_stats_from_string(self._data_frame,self._timestamp_string_columns,level_count_flag=self._level_count_flag)
    # gc.collect()
    # timeDimensionColumnStat.update(timeDimensionColumnStat2)
    # timeDimensionCharts.update(timeDimensionCharts2)
    time_taken_tdstats = time.time() - self._start_time
    self._completionStatus += self._scriptStages["timedimensionstats"]["weight"]
    print("time dimension stats takes", time_taken_tdstats)
    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,
                                                                 "timedimensionstats",
                                                                 "info",
                                                                 self._scriptStages["timedimensionstats"]["summary"],
                                                                 self._completionStatus,
                                                                 self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=self._ignoreMsgFlag)
    self._start_time = time.time()
    try:
        dimensionColumnStat, dimensionCharts = metaHelperInstance.calculate_dimension_column_stats(self._data_frame, self._string_columns + self._boolean_columns, levelCount=self._level_count_flag, pandas_flag=self._pandas_flag)
    except Exception as e:
        raise Exception(e)
    # Unique-level count per dimension, extracted from the per-column stat lists.
    self._dataSize["dimensionLevelCountDict"] = {k: [x for x in v if x["name"] == "numberOfUniqueValues"][0]["value"] for k, v in list(dimensionColumnStat.items())}
    self._dataSize["totalLevels"] = sum(self._dataSize["dimensionLevelCountDict"].values())
    time_taken_dimensionstats = time.time() - self._start_time
    self._completionStatus += self._scriptStages["dimensionstats"]["weight"]
    # print "dimension stats takes",time_taken_dimensionstats
    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,
                                                                 "dimensionstats",
                                                                 "info",
                                                                 self._scriptStages["dimensionstats"]["summary"],
                                                                 self._completionStatus,
                                                                 self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=self._ignoreMsgFlag)
    self._start_time = time.time()
    ignoreColumnSuggestions = []
    ignoreColumnReason = []
    utf8ColumnSuggestion = []
    dup_cols = []
    #columns = self._data_frame.columns
    # Groups of columns with duplicated names per stat bucket; members after the
    # first get tagged with a "Duplicate" stat entry pointing at the original.
    measureDupCols = self.checkDupColName(measureColumnStat)
    dimensionDupCols = self.checkDupColName(dimensionColumnStat)
    timeDimensionDupCols = self.checkDupColName(timeDimensionColumnStat)
    if self._pandas_flag:
        for i in measureDupCols:
            if self.checkDuplicateCols_pandas(i[0], i[1]) == True:
                for j in i[1:]:
                    if dict(name="Duplicate", value=True) not in measureColumnStat[j]:
                        measureColumnStat[j].append(dict(name="Duplicate", value=i[0]))
        for i in dimensionDupCols:
            if self.checkDuplicateCols_pandas(i[0], i[1]) == True:
                for j in i[1:]:
                    if dict(name="Duplicate", value=True) not in dimensionColumnStat[j]:
                        dimensionColumnStat[j].append(dict(name="Duplicate", value=i[0]))
        for i in timeDimensionDupCols:
            if self.checkDuplicateCols_pandas(i[0], i[1]) == True:
                for j in i[1:]:
                    if dict(name="Duplicate", value=True) not in timeDimensionColumnStat[j]:
                        timeDimensionColumnStat[j].append(dict(name="Duplicate", value=i[0]))
    else:
        for i in measureDupCols:
            if self.checkDuplicateCols(i[0], i[1]) == True:
                for j in i[1:]:
                    if dict(name="Duplicate", value=True) not in measureColumnStat[j]:
                        measureColumnStat[j].append(dict(name="Duplicate", value=i[0]))
        for i in dimensionDupCols:
            if self.checkDuplicateCols(i[0], i[1], True) == True:
                for j in i[1:]:
                    if dict(name="Duplicate", value=True) not in dimensionColumnStat[j]:
                        dimensionColumnStat[j].append(dict(name="Duplicate", value=i[0]))
        for i in timeDimensionDupCols:
            if self.checkDuplicateCols(i[0], i[1]) == True:
                for j in i[1:]:
                    if dict(name="Duplicate", value=True) not in timeDimensionColumnStat[j]:
                        timeDimensionColumnStat[j].append(dict(name="Duplicate", value=i[0]))
    # Per-column assembly of ColumnData plus header entries.
    for column in self._data_frame.columns:
        random_slug = uuid.uuid4().hex
        headers.append(ColumnHeader(name=column, slug=random_slug))
        data = ColumnData()
        data.set_slug(random_slug)
        data.set_name(column)
        data.set_abstract_datatype(self._column_type_dict[column]["abstract"])
        data.set_checker(True)
        changeflage = False
        columnStat = []
        columnChartData = None
        # User-requested datatype overrides (e.g. treat a measure as a dimension).
        check_datatype_change = self.actual_col_datatype_update
        if len(check_datatype_change) != 0:
            for i in check_datatype_change:
                if list(i.keys())[0] == column:
                    changeflage = True
                    changeType = i[column]
                    break
                else:
                    changeflage = False
        else:
            changeflage = False
        if self._column_type_dict[column]["abstract"] == "measure":
            data.set_column_stats(measureColumnStat[column])
            data.set_column_chart(measureCharts[column])
            if changeflage:
                data.set_actual_datatype("dimension")
            else:
                data.set_actual_datatype(self._column_type_dict[column]["actual"])
        elif self._column_type_dict[column]["abstract"] == "dimension":
            data.set_column_stats(dimensionColumnStat[column])
            data.set_column_chart(dimensionCharts[column])
            if changeflage:
                data.set_actual_datatype("measure")
            else:
                data.set_actual_datatype(self._column_type_dict[column]["actual"])
        elif self._column_type_dict[column]["abstract"] == "datetime":
            data.set_column_stats(timeDimensionColumnStat[column])
            data.set_column_chart(timeDimensionCharts[column])
            if changeflage:
                data.set_actual_datatype("dimension")
            else:
                data.set_actual_datatype(self._column_type_dict[column]["actual"])
        # Ignore-column suggestions, branched on the abstract datatype.
        if self._column_type_dict[column]["abstract"] == "measure":
            #if column not in self._real_columns:
            ignoreSuggestion, ignoreReason = metaHelperInstance.get_ignore_column_suggestions(self._data_frame, self._total_rows, column, "measure", measureColumnStat[column], max_levels=self._max_levels)
            if ignoreSuggestion:
                ignoreColumnSuggestions.append(column)
                ignoreColumnReason.append(ignoreReason)
                #data.set_level_count_to_null()
                #data.set_chart_data_to_null()
                data.set_ignore_suggestion_flag(True)
                data.set_ignore_suggestion_message(ignoreReason)
        elif self._column_type_dict[column]["abstract"] == "dimension":
            ignoreSuggestion, ignoreReason = metaHelperInstance.get_ignore_column_suggestions(self._data_frame, self._total_rows, column, "dimension", dimensionColumnStat[column], max_levels=self._max_levels)
            if ignoreSuggestion:
                ignoreColumnSuggestions.append(column)
                ignoreColumnReason.append(ignoreReason)
                # (sic) "thershold" matches the reason string produced upstream.
                if ignoreReason == "Number of Levels are more than the defined thershold":
                    data.set_ignore_suggestion_preview_flag(False)
                #data.set_level_count_to_null()
                #data.set_chart_data_to_null()
                data.set_ignore_suggestion_flag(True)
                data.set_ignore_suggestion_message(ignoreReason)
            if self._level_count_flag:
                utf8Suggestion = metaHelperInstance.get_utf8_suggestions(dimensionColumnStat[column])
            else:
                utf8Suggestion = False
            if utf8Suggestion:
                utf8ColumnSuggestion.append(column)
                ignoreSuggestion, ignoreReason = metaHelperInstance.get_ignore_column_suggestions(self._data_frame, self._total_rows, column, "dimension", dimensionColumnStat[column], max_levels=self._max_levels)
                if ignoreSuggestion:
                    ignoreColumnSuggestions.append(column)
                    ignoreColumnReason.append(ignoreReason)
                    #data.set_level_count_to_null()
                    #data.set_chart_data_to_null()
                    data.set_ignore_suggestion_flag(True)
                    data.set_ignore_suggestion_message(ignoreReason)
        elif self._column_type_dict[column]["abstract"] == "datetime":
            ignoreSuggestion, ignoreReason = metaHelperInstance.get_ignore_column_suggestions(self._data_frame, self._total_rows, column, "datetime", timeDimensionColumnStat[column], max_levels=self._max_levels)
            if ignoreSuggestion:
                ignoreColumnSuggestions.append(column)
                ignoreColumnReason.append(ignoreReason)
                #data.set_level_count_to_null()
                #data.set_chart_data_to_null()
                data.set_ignore_suggestion_flag(True)
                data.set_ignore_suggestion_message(ignoreReason)
        columnData.append(data)
        # NOTE(review): uniqueVals here is whatever the earlier detection loop
        # left behind — it is not recomputed for this column, so these
        # suggestions look like stale leftover logic; confirm before relying on
        # them.
        if len(uniqueVals) > 0:
            dateColumnFormat = metaHelperInstance.get_datetime_format(uniqueVals)
        else:
            dateColumnFormat = None
        if dateColumnFormat:
            dateTimeSuggestions.update({column: dateColumnFormat})
    # Columns flagged for utf8 content are also added to the ignore suggestions.
    for utfCol in utf8ColumnSuggestion:
        ignoreColumnSuggestions.append(utfCol)
        ignoreColumnReason.append("utf8 values present")
    progressMessage = CommonUtils.create_progress_message_object(self._analysisName, "custom", "info", "Validating Metadata Information", self._completionStatus, self._completionStatus, display=True)
    CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=self._ignoreMsgFlag)
    metaData.append(MetaData(name="ignoreColumnSuggestions", value=ignoreColumnSuggestions, display=False))
    metaData.append(MetaData(name="ignoreColumnReason", value=ignoreColumnReason, display=False))
    metaData.append(MetaData(name="utf8ColumnSuggestion", value=utf8ColumnSuggestion, display=False))
    metaData.append(MetaData(name="dateTimeSuggestions", value=dateTimeSuggestions, display=False))
    metaData.append(MetaData(name="dataSizeSummary", value=self._dataSize, display=False))
    dfMetaData = DfMetaData()
    dfMetaData.set_column_data(columnData)
    dfMetaData.set_header(headers)
    dfMetaData.set_meta_data(metaData)
    dfMetaData.set_sample_data(sampleData)
    time_taken_suggestions = time.time() - self._start_time
    self._completionStatus += self._scriptStages["suggestions"]["weight"]
    # print "suggestions take",time_taken_suggestions
    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,
                                                                 "suggestions",
                                                                 "info",
                                                                 self._scriptStages["suggestions"]["summary"],
                                                                 self._completionStatus,
                                                                 self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=self._ignoreMsgFlag)
    self._dataframe_context.update_completion_status(self._completionStatus)
    return dfMetaData
def _generate_summary(self):
    """Build the decision-tree "Prediction" main card and attach it to the node.

    For every target level of the trained tree this assembles: dropdown
    entries, rule rows (crude rule, probability, prediction, frequency,
    strong/mixed group, rich rule) for the popup table, and a donut chart of
    either probability buckets (scored-data story) or per-level prediction
    totals. The rendered narrative, chart, dropdown and table are appended to
    ``self._decisionTreeNode`` as a NormalCard.

    Side effects only; returns None.

    Fixes vs. previous revision: the Python-2 ``print`` statements (syntax
    errors on Python 3, which the rest of this module targets) were converted
    to ``print()`` calls, the unused local ``groups`` was dropped, and the
    one-shot ``map(lambda ...)`` became an explicit list comprehension.
    """
    data_dict = {}
    rules_dict = self._table
    data_dict["blockSplitter"] = self._blockSplitter
    data_dict["targetcol"] = self._colname
    # Rules at or above the cutoff are "strong"; the two buckets below feed the
    # probabilityGroups summary.
    probabilityCutoff = 75
    probabilityGroups = [{
        "probability": probabilityCutoff,
        "count": 0,
        "range": [probabilityCutoff, 100]
    }, {
        "probability": probabilityCutoff - 1,
        "count": 0,
        "range": [0, probabilityCutoff - 1]
    }]
    tableArray = [[
        "Prediction Rule", "Probability", "Prediction", "Freq", "group",
        "richRules"
    ]]
    dropdownData = []
    chartDict = {}
    targetLevel = self._dataframe_context.get_target_level_for_model()
    probabilityArrayAll = []
    self._completionStatus = self._dataframe_context.get_completion_status()
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName,
        "custom",
        "info",
        "Generating Prediction rules",
        self._completionStatus,
        self._completionStatus,
        display=True)
    CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=False)
    self._dataframe_context.update_completion_status(self._completionStatus)
    # Put the modelled target level first so it is the pre-selected dropdown entry.
    targetValues = [x for x in rules_dict.keys() if x == targetLevel
                    ] + [x for x in rules_dict.keys() if x != targetLevel]
    for idx, target in enumerate(targetValues):
        if idx == 0:
            if self._dataframe_context.get_story_on_scored_data() != True:
                dropdownData.append({
                    "displayName": target,
                    "name": target,
                    "selected": True,
                    "id": idx + 1
                })
            else:
                dropdownData.append({
                    "displayName": "{} : {}".format(self._colname, target),
                    "name": target,
                    "selected": True,
                    "id": idx + 1
                })
        else:
            if self._dataframe_context.get_story_on_scored_data() != True:
                dropdownData.append({
                    "displayName": target,
                    "name": target,
                    "selected": False,
                    "id": idx + 1
                })
            else:
                dropdownData.append({
                    "displayName": "{} : {}".format(self._colname, target),
                    "name": target,
                    "selected": False,
                    "id": idx + 1
                })
        rulesArray = rules_dict[target]
        probabilityArray = [round(x, 2) for x in self.success_percent[target]]
        probabilityArrayAll += probabilityArray
        groupArray = [
            "strong" if x >= probabilityCutoff else "mixed"
            for x in probabilityArray
        ]
        for idx2, obj in enumerate(probabilityGroups):
            grpCount = len([
                x for x in probabilityArray
                if x >= obj["range"][0] and x <= obj["range"][1]
            ])
            obj["count"] += grpCount
            probabilityGroups[idx2] = obj
        predictionArray = [target] * len(rulesArray)
        freqArray = self.total_predictions[target]
        chartDict[target] = sum(freqArray)
        success = self.successful_predictions[target]
        success_percent = self.success_percent[target]
        richRulesArray = []
        crudeRuleArray = []
        analysisType = self._dataframe_context.get_analysis_type()
        targetCol = self._dataframe_context.get_result_column()
        # binFlag marks a target column the user binned via custom analysis.
        binFlag = False
        if self._dataframe_context.get_custom_analysis_details() != None:
            binnedColObj = [
                x["colName"]
                for x in self._dataframe_context.get_custom_analysis_details()
            ]
            if binnedColObj != None and targetCol in binnedColObj:
                binFlag = True
        for idx2, crudeRule in enumerate(rulesArray):
            richRule, crudeRule = NarrativesUtils.generate_rules(
                self._colname,
                target,
                crudeRule,
                freqArray[idx2],
                success[idx2],
                success_percent[idx2],
                analysisType,
                binFlag=binFlag)
            richRulesArray.append(richRule)
            crudeRuleArray.append(crudeRule)
        # Human-readable percentages ("five%" for >= 10 via humanize, "7%" below).
        probabilityArray = [
            humanize.apnumber(x) + "%" if x >= 10 else str(int(x)) + "%"
            for x in probabilityArray
        ]
        # targetArray = zip(richRulesArray,probabilityArray,predictionArray,freqArray,groupArray)
        targetArray = zip(crudeRuleArray, probabilityArray, predictionArray,
                          freqArray, groupArray, richRulesArray)
        targetArray = [list(x) for x in targetArray]
        tableArray += targetArray
    donutChartMaxLevel = 10
    if self._dataframe_context.get_story_on_scored_data() == True:
        # Scored-data story: donut shows counts of rules per probability bucket.
        chartDict = {}
        probabilityRangeForChart = GLOBALSETTINGS.PROBABILITY_RANGE_FOR_DONUT_CHART
        chartDict = dict(
            zip(probabilityRangeForChart.keys(),
                [0] * len(probabilityRangeForChart)))
        for val in probabilityArrayAll:
            for grps, grpRange in probabilityRangeForChart.items():
                if val > grpRange[0] and val <= grpRange[1]:
                    chartDict[grps] = chartDict[grps] + 1
        chartDict = {k: v for k, v in chartDict.items() if v != 0}
    else:
        # Training story: donut shows total predictions per target level.
        chartDict = dict([(k, sum(v)) for k, v in self.total_predictions.items()])
        chartDict = {k: v for k, v in chartDict.items() if v != 0}
    if len(chartDict) > donutChartMaxLevel:
        chartDict = NarrativesUtils.restructure_donut_chart_data(
            chartDict, nLevels=donutChartMaxLevel)
    chartData = NormalChartData([chartDict]).get_data()
    chartJson = ChartJson(data=chartData)
    chartJson.set_title(self._colname)
    chartJson.set_chart_type("donut")
    mainCardChart = C3ChartData(data=chartJson)
    mainCardChart.set_width_percent(45)
    # mainCardChart = {"dataType": "c3Chart","widthPercent":33 ,"data": {"data": [chartDict],"title":self._colname,"axes":{},"label_text":{},"legend":{},"yAxisNumberFormat": ".2s","types":None,"axisRotation":False, "chart_type": "donut"}}
    dropdownDict = {
        "dataType": "dropdown",
        "label": "Showing prediction rules for",
        "data": dropdownData
    }
    data_dict["probabilityGroups"] = probabilityGroups
    if self._dataframe_context.get_story_on_scored_data() != True:
        maincardSummary = NarrativesUtils.get_template_output(
            self._base_dir, 'decisiontreesummary.html', data_dict)
    else:
        # Re-derive per-level prediction counts from the table rows
        # (col 2 = prediction, col 3 = frequency; row 0 is the header).
        predictedLevelcountArray = [(x[2], x[3]) for x in tableArray[1:]]
        predictedLevelCountDict = {}
        # predictedLevelcountDict = defaultdict(predictedLevelcountArray)
        for val in predictedLevelcountArray:
            predictedLevelCountDict.setdefault(val[0], []).append(val[1])
        levelCountDict = {}
        for k, v in predictedLevelCountDict.items():
            levelCountDict[k] = sum(v)
        # levelCountDict = self._metaParser.get_unique_level_dict(self._colname)
        total = float(sum([x for x in levelCountDict.values() if x != None]))
        levelCountTuple = [{
            "name": k,
            "count": v,
            "percentage": round(v * 100 / total, 2)
        } for k, v in levelCountDict.items() if v != None]
        percentageArray = [x["percentage"] for x in levelCountTuple]
        percentageArray = NarrativesUtils.ret_smart_round(percentageArray)
        levelCountTuple = [{
            "name": obj["name"],
            "count": obj["count"],
            "percentage": str(percentageArray[idx]) + "%"
        } for idx, obj in enumerate(levelCountTuple)]
        data_dict["nlevel"] = len(levelCountDict)
        print("levelCountTuple", levelCountTuple)
        print("levelCountDict", levelCountDict)
        if targetLevel in levelCountDict:
            data_dict["topLevel"] = [
                x for x in levelCountTuple if x["name"] == targetLevel
            ][0]
            if len(levelCountTuple) > 1:
                data_dict["secondLevel"] = max([
                    x for x in levelCountTuple if x["name"] != targetLevel
                ], key=lambda x: x["count"])
            else:
                data_dict["secondLevel"] = None
        else:
            data_dict["topLevel"] = levelCountTuple[0]
            if len(levelCountTuple) > 1:
                data_dict["secondLevel"] = levelCountTuple[1]
            else:
                data_dict["secondLevel"] = None
        print(data_dict)
        maincardSummary = NarrativesUtils.get_template_output(
            self._base_dir, 'decisiontreescore.html', data_dict)
    main_card = NormalCard()
    main_card_data = []
    main_card_narrative = NarrativesUtils.block_splitter(
        maincardSummary, self._blockSplitter)
    main_card_data += main_card_narrative
    main_card_data.append(mainCardChart)
    main_card_data.append(dropdownDict)
    main_card_table = TableData()
    if self._dataframe_context.get_story_on_scored_data() == True:
        main_card_table.set_table_width(75)
    main_card_table.set_table_data(tableArray)
    main_card_table.set_table_type("popupDecisionTreeTable")
    main_card_data.append(main_card_table)
    uidTable = self._result_setter.get_unique_identifier_table()
    if uidTable != None:
        main_card_data.append(uidTable)
    else:
        main_card_table.set_table_width(100)
    main_card.set_card_data(main_card_data)
    main_card.set_card_name("Predicting Key Drivers of {}".format(self._colname))
    self._decisionTreeNode.add_a_card(main_card)
def __init__(self, column_name, decision_tree_rules, df_helper, df_context, meta_parser, result_setter, story_narrative=None, analysisName=None, scriptWeight=None):
    """Build decision-tree narratives for ``column_name`` and publish them.

    Runs the full narrative generation as a side effect of construction:
    emits start/end progress messages, calls ``self._generate_narratives()``,
    and stores the resulting NarrativesTree plus score cards on
    ``result_setter``.

    :param column_name: target column the decision tree predicts.
    :param decision_tree_rules: trained-tree result object exposing
        get_decision_rules() / get_table() / get_success() / get_total() /
        get_success_percent() / get_significant_vars() /
        get_target_contributions().
    :param df_helper: dataframe helper (kept on self).
    :param df_context: context object with job/run configuration.
    :param meta_parser: metadata parser for the dataset.
    :param result_setter: sink that collects the narrative output.
    :param story_narrative: optional story container (currently only stored;
        the add_a_node call is commented out below).
    :param analysisName: optional override; defaults to the context's
        analysis name.
    :param scriptWeight: optional override; defaults to the context's
        dimension-analysis weights.
    """
    self._story_narrative = story_narrative
    self._metaParser = meta_parser
    self._dataframe_context = df_context
    self._ignoreMsg = self._dataframe_context.get_message_ignore()
    self._result_setter = result_setter
    self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
    self._column_name = column_name.lower()
    self._colname = column_name
    # "salesAmount" -> "SalesAmount"; only the first character is upper-cased.
    self._capitalized_column_name = "%s%s" % (column_name[0].upper(), column_name[1:])
    self._decision_rules_dict = decision_tree_rules.get_decision_rules()
    self._decision_tree_json = CommonUtils.as_dict(decision_tree_rules)
    self._decision_tree_raw = self._decision_rules_dict
    # self._decision_tree_raw = {"tree":{"children":None}}
    # self._decision_tree_raw['tree']["children"] = self._decision_tree_json['tree']["children"]
    self._table = decision_tree_rules.get_table()
    self._new_table = {}
    self.successful_predictions = decision_tree_rules.get_success()
    self.total_predictions = decision_tree_rules.get_total()
    self.success_percent = decision_tree_rules.get_success_percent()
    self._important_vars = decision_tree_rules.get_significant_vars()
    self._target_distribution = decision_tree_rules.get_target_contributions()
    self._get_new_table()
    self._df_helper = df_helper
    self.subheader = None
    #self.table = {}
    self.dropdownComment = None
    self.dropdownValues = None
    self._base_dir = "/decisiontree/"
    self._completionStatus = self._dataframe_context.get_completion_status()
    if analysisName == None:
        self._analysisName = self._dataframe_context.get_analysis_name()
    else:
        self._analysisName = analysisName
    self._messageURL = self._dataframe_context.get_message_url()
    if scriptWeight == None:
        self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight()
    else:
        self._scriptWeightDict = scriptWeight
    # Progress-message stages for this script; weights scale the completion
    # increments applied below.
    self._scriptStages = {
        "dtreeNarrativeStart": {
            "summary": "Started the Decision Tree Narratives",
            "weight": 0
        },
        "dtreeNarrativeEnd": {
            "summary": "Narratives for Decision Tree Finished",
            "weight": 10
        },
    }
    self._completionStatus += self._scriptWeightDict[
        self._analysisName]["narratives"] * self._scriptStages[
            "dtreeNarrativeStart"]["weight"] / 10
    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,
                                                                 "dtreeNarrativeStart",
                                                                 "info",
                                                                 self._scriptStages["dtreeNarrativeStart"]["summary"],
                                                                 self._completionStatus,
                                                                 self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=self._ignoreMsg)
    self._dataframe_context.update_completion_status(self._completionStatus)
    self._decisionTreeNode = NarrativesTree()
    self._decisionTreeNode.set_name("Prediction")
    # Heavy lifting: populates self._decisionTreeNode with cards.
    self._generate_narratives()
    # self._story_narrative.add_a_node(self._decisionTreeNode)
    self._result_setter.set_decision_tree_node(self._decisionTreeNode)
    self._result_setter.set_score_dtree_cards(
        json.loads(
            CommonUtils.convert_python_object_to_json(
                self._decisionTreeNode.get_all_cards())))
    # Re-read completion status: _generate_narratives may have advanced it.
    self._completionStatus = self._dataframe_context.get_completion_status()
    self._completionStatus += self._scriptWeightDict[
        self._analysisName]["narratives"] * self._scriptStages[
            "dtreeNarrativeEnd"]["weight"] / 10
    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,
                                                                 "dtreeNarrativeEnd",
                                                                 "info",
                                                                 self._scriptStages["dtreeNarrativeEnd"]["summary"],
                                                                 self._completionStatus,
                                                                 self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=self._ignoreMsg)
    self._dataframe_context.update_completion_status(self._completionStatus)
def _generate_narratives(self):
    """Build the ANOVA "Performance" narrative tree.

    For every measure column in the ANOVA result: rank the significant
    dimensions by effect size (optionally capped by the analysis config's
    ``noOfColumnsToUse``), render the two narrative templates, attach a bar
    chart of effect sizes with a statistical-inference info panel, and hand
    off to ``_generate_dimension_narratives``. Measures with no significant
    dimension get a plain "no influence" card instead.

    Side effects only; returns None.

    Fixes vs. previous revision: the initialisation ``statistical_inferenc``
    (typo — the guard below tests ``statistical_inference``) is corrected, and
    the bare ``except:`` is narrowed to ``except Exception:``.
    """
    try:
        nColsToUse = self._analysisDict[self._analysisName]["noOfColumnsToUse"]
    except Exception:
        # Config entry absent or malformed: use all significant dimensions.
        nColsToUse = None
    self._anovaNodes = NarrativesTree()
    self._anovaNodes.set_name("Performance")
    for measure_column in self._df_anova_result.get_measure_columns():
        measure_anova_result = self._df_anova_result.get_measure_result(measure_column)
        significant_dimensions_dict, insignificant_dimensions = measure_anova_result.get_OneWayAnovaSignificantDimensions()
        num_dimensions = len(list(significant_dimensions_dict.items())) + len(insignificant_dimensions)
        # Significant dimensions sorted by descending effect size.
        significant_dimensions = [
            k for k, v in sorted(list(significant_dimensions_dict.items()),
                                 key=lambda x: -x[1])
        ]
        if nColsToUse != None:
            significant_dimensions = significant_dimensions[:nColsToUse]
        num_significant_dimensions = len(significant_dimensions)
        num_insignificant_dimensions = len(insignificant_dimensions)
        print("num_significant_dimensions", num_significant_dimensions)
        if num_significant_dimensions > 0:
            mainCard = NormalCard(name="Overview of Key Factors")
            data_c3 = []
            for sig_dim in significant_dimensions:
                data_c3.append({
                    'dimension': sig_dim,
                    'effect_size': float(significant_dimensions_dict[sig_dim])
                })
            self.narratives = {}
            self.narratives[AnovaNarratives.KEY_HEADING] = "%s Performance Analysis" % (measure_column,)
            self.narratives['main_card'] = {}
            self.narratives['cards'] = []
            self.narratives['main_card'][AnovaNarratives.KEY_SUBHEADING] = "Relationship between %s and other Dimensions" % (measure_column)
            self.narratives['main_card'][AnovaNarratives.KEY_PARAGRAPH] = []
            data_dict = {
                'significant_dimensions': significant_dimensions,
                'insignificant_dimensions': insignificant_dimensions,
                'num_significant_dimensions': num_significant_dimensions,
                'num_insignificant_dimensions': num_insignificant_dimensions,
                'num_dimensions': num_significant_dimensions + num_insignificant_dimensions,
                'target': measure_column
            }
            output = {'header': ''}
            output['content'] = NarrativesUtils.get_template_output(
                self._base_dir, 'anova_template_1.html', data_dict)
            self.narratives['main_card'][AnovaNarratives.KEY_PARAGRAPH].append(output)
            output1 = {'header': ''}
            output1['content'] = NarrativesUtils.get_template_output(
                self._base_dir, 'anova_template_2.html', data_dict)
            lines = []
            lines += NarrativesUtils.block_splitter(output['content'], self._blockSplitter)
            data_c3 = NormalChartData(data_c3)
            chart_data = data_c3.get_data()
            chartDataValues = []
            effect_size_values = []
            for obj in chart_data:
                effect_size_values.append(obj["effect_size"])
            chart_data_min = min(effect_size_values)
            # Very small effect sizes are passed as strings so the y-axis
            # number format selection can keep their precision.
            if chart_data_min < 0.00001:
                for obj in chart_data:
                    chartDataValues.append(str(obj["effect_size"]))
            else:
                for obj in chart_data:
                    chartDataValues.append(obj["effect_size"])
            chart_json = ChartJson(data=chart_data,
                                   axes={
                                       'x': 'dimension',
                                       'y': 'effect_size'
                                   },
                                   label_text={
                                       'x': '',
                                       'y': 'Effect Size (scaled exp values)'
                                   },
                                   chart_type='bar')
            chart_json.set_axis_rotation(True)
            # chart_json.set_yaxis_number_format(".4f")
            chart_json.set_yaxis_number_format(
                NarrativesUtils.select_y_axis_format(chartDataValues))
            # st_info = ["Test : ANOVA", "Threshold for p-value : 0.05", "Effect Size : Tukey's HSD"]
            # chart_data is sorted by descending effect size, so first/last give
            # the max/min dimensions.
            statistical_info_array = [
                ("Test Type", "ANOVA"),
                ("Effect Size", "ETA squared"),
                ("Max Effect Size", chart_data[0]["dimension"]),
                ("Min Effect Size", chart_data[-1]["dimension"]),
            ]
            # FIX: was `statistical_inferenc = ""` (typo) — the guard below
            # checks `statistical_inference`, which the typo never initialised.
            statistical_inference = ""
            if len(chart_data) == 1:
                statistical_inference = "{} is the only variable that have significant association with the {} (Target) having an Effect size of {}".format(
                    chart_data[0]["dimension"],
                    self._dataframe_context.get_result_column(),
                    round(chart_data[0]["effect_size"], 4))
            elif len(chart_data) == 2:
                statistical_inference = "There are two variables ({} and {}) that have significant association with the {} (Target) and the Effect size ranges are {} and {} respectively".format(
                    chart_data[0]["dimension"], chart_data[1]["dimension"],
                    self._dataframe_context.get_result_column(),
                    round(chart_data[0]["effect_size"], 4),
                    round(chart_data[1]["effect_size"], 4))
            else:
                statistical_inference = "There are {} variables that have significant association with the {} (Target) and the Effect size ranges from {} to {}".format(
                    len(chart_data),
                    self._dataframe_context.get_result_column(),
                    round(chart_data[0]["effect_size"], 4),
                    round(chart_data[-1]["effect_size"], 4))
            if statistical_inference != "":
                statistical_info_array.append(("Inference", statistical_inference))
            statistical_info_array = NarrativesUtils.statistical_info_array_formatter(statistical_info_array)
            lines += [C3ChartData(data=chart_json, info=statistical_info_array)]
            lines += NarrativesUtils.block_splitter(output1['content'], self._blockSplitter)
            mainCard.set_card_data(lines)
            self._anovaNodes.add_a_card(mainCard)
            self.narratives['main_card'][AnovaNarratives.KEY_PARAGRAPH].append(output1)
            self.narratives['main_card'][AnovaNarratives.KEY_CHART] = {}
            effect_size_chart = {
                'heading': '',
                'labels': {
                    'Dimension': 'Effect Size'
                },
                'data': significant_dimensions_dict
            }
            print(significant_dimensions_dict)
            self.narratives['main_card'][AnovaNarratives.KEY_CHART]['effect_size'] = effect_size_chart
            progressMessage = CommonUtils.create_progress_message_object(
                self._analysisName,
                "custom",
                "info",
                "Analyzing Key Drivers",
                self._completionStatus,
                self._completionStatus,
                display=True)
            CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=False)
            self._generate_dimension_narratives(significant_dimensions,
                                                measure_anova_result,
                                                measure_column)
        else:
            mainCard = NormalCard(name="Overview of Key Factors")
            cardText = HtmlData(
                "There are no dimensions in the dataset that have significant influence on {}"
                .format(measure_column))
            mainCard.set_card_data([cardText])
            self._anovaNodes.add_a_card(mainCard)
def main(configJson):
    """Entry point for a single mAdvisor job (Python-2 variant of this script).

    configJson is either a parsed config (ConfigTree/dict, job-server
    deployment) or a string: a ``*.cfg`` path (cfgMode) or anything else
    (debugMode, which loads a local test config).  Builds a Spark session,
    validates the config, then dispatches on job_type: metaData, subSetting,
    story, training, prediction, testCase or stockAdvisor.
    """
    LOGGER = {}
    deployEnv = False  # running the scripts from job-server env
    debugMode = True  # runnning the scripts for local testing and development
    cfgMode = False  # runnning the scripts by passing config.cfg path
    scriptStartTime = time.time()
    # Decide the run mode from the *type* of configJson.
    if isinstance(configJson, pyhocon.config_tree.ConfigTree) or isinstance(
            configJson, dict):
        deployEnv = True
        debugMode = False
        ignoreMsg = False
    elif isinstance(configJson, basestring):  # Python 2 only
        if configJson.endswith(".cfg"):
            print "######################## Running in cfgMode ########################"
            cfgMode = True
            debugMode = False
            ignoreMsg = False
        else:
            print "######################## Running in debugMode ######################"
            cfgMode = False
            debugMode = True
            ignoreMsg = True
            # Test Configs are defined in bi/settings/configs/localConfigs
            # NOTE(review): jobType is hard-coded here for local testing;
            # the string argument itself is discarded.
            jobType = "stockAdvisor"
            if jobType == "testCase":
                configJson = get_test_configs(jobType, testFor="chisquare")
            else:
                configJson = get_test_configs(jobType)
    print "######################## Creating Spark Session ###########################"
    if debugMode:
        APP_NAME = "mAdvisor_running_in_debug_mode"
    else:
        if "job_config" in configJson.keys(
        ) and "job_name" in configJson["job_config"]:
            APP_NAME = configJson["job_config"]["job_name"]
        else:
            APP_NAME = "--missing--"
    if debugMode:
        # Debug runs skip the Hive-enabled session.
        spark = CommonUtils.get_spark_session(app_name=APP_NAME,
                                              hive_environment=False)
    else:
        spark = CommonUtils.get_spark_session(app_name=APP_NAME)
    spark.sparkContext.setLogLevel("ERROR")
    # applicationIDspark = spark.sparkContext.applicationId
    # spark.conf.set("spark.sql.execution.arrow.enabled", "true")
    print "######################### Parsing the configs #############################"
    config = configJson["config"]
    jobConfig = configJson["job_config"]
    jobType = jobConfig["job_type"]
    jobName = jobConfig["job_name"]
    jobURL = jobConfig["job_url"]
    messageURL = jobConfig["message_url"]
    # error_reporting_url is optional in older job configs.
    try:
        errorURL = jobConfig["error_reporting_url"]
    except:
        errorURL = None
    if "app_id" in jobConfig:
        appid = jobConfig["app_id"]
    else:
        appid = None
    # Populate the shared ContextSetter that every downstream helper reads.
    configJsonObj = configparser.ParserConfig(config)
    configJsonObj.set_json_params()
    dataframe_context = ContextSetter(configJsonObj)
    dataframe_context.set_job_type(
        jobType
    )  #jobType should be set before set_params call of dataframe_context
    dataframe_context.set_params()
    dataframe_context.set_message_url(messageURL)
    dataframe_context.set_app_id(appid)
    dataframe_context.set_debug_mode(debugMode)
    dataframe_context.set_job_url(jobURL)
    dataframe_context.set_app_name(APP_NAME)
    dataframe_context.set_error_url(errorURL)
    dataframe_context.set_logger(LOGGER)
    dataframe_context.set_xml_url(jobConfig["xml_url"])
    dataframe_context.set_job_name(jobName)
    if debugMode == True:
        dataframe_context.set_environment("debugMode")
        dataframe_context.set_message_ignore(True)
    analysistype = dataframe_context.get_analysis_type()
    result_setter = ResultSetter(dataframe_context)
    # scripts_to_run = dataframe_context.get_scripts_to_run()
    appid = dataframe_context.get_app_id()
    completionStatus = 0
    print "########################## Validate the Config ###############################"
    configValidator = ConfigValidator(dataframe_context)
    configValid = configValidator.get_sanity_check()
    if not configValid:
        # Invalid config: report failure and write an empty result payload.
        progressMessage = CommonUtils.create_progress_message_object(
            "mAdvisor Job",
            "custom",
            "info",
            "Please Provide a Valid Configuration",
            completionStatus,
            completionStatus,
            display=True)
        CommonUtils.save_progress_message(messageURL,
                                          progressMessage,
                                          ignore=ignoreMsg)
        response = CommonUtils.save_result_json(
            dataframe_context.get_job_url(), json.dumps({}))
        CommonUtils.save_error_messages(errorURL,
                                        APP_NAME,
                                        "Invalid Config Provided",
                                        ignore=ignoreMsg)
    else:
        ########################## Initializing messages ##############################
        if jobType == "story":
            if analysistype == "measure":
                progressMessage = CommonUtils.create_progress_message_object(
                    "Measure analysis",
                    "custom",
                    "info",
                    "Analyzing Target Variable",
                    completionStatus,
                    completionStatus,
                    display=True)
            else:
                progressMessage = CommonUtils.create_progress_message_object(
                    "Dimension analysis",
                    "custom",
                    "info",
                    "Analyzing Target Variable",
                    completionStatus,
                    completionStatus,
                    display=True)
            CommonUtils.save_progress_message(messageURL,
                                              progressMessage,
                                              ignore=ignoreMsg,
                                              emptyBin=True)
            dataframe_context.update_completion_status(completionStatus)
        elif jobType == "metaData":
            # Three staged messages so the UI shows loading progress.
            progressMessage = CommonUtils.create_progress_message_object(
                "metaData",
                "custom",
                "info",
                "Preparing data for loading",
                completionStatus,
                completionStatus,
                display=True)
            CommonUtils.save_progress_message(messageURL,
                                              progressMessage,
                                              ignore=ignoreMsg,
                                              emptyBin=True)
            progressMessage = CommonUtils.create_progress_message_object(
                "metaData",
                "custom",
                "info",
                "Initializing the loading process",
                completionStatus,
                completionStatus,
                display=True)
            CommonUtils.save_progress_message(messageURL,
                                              progressMessage,
                                              ignore=ignoreMsg)
            progressMessage = CommonUtils.create_progress_message_object(
                "metaData",
                "custom",
                "info",
                "Data Upload in progress",
                completionStatus,
                completionStatus,
                display=True)
            CommonUtils.save_progress_message(messageURL,
                                              progressMessage,
                                              ignore=ignoreMsg)
            dataframe_context.update_completion_status(completionStatus)
        if jobType != "stockAdvisor":
            df = None
            data_loading_st = time.time()
            progressMessage = CommonUtils.create_progress_message_object(
                "scriptInitialization", "scriptInitialization", "info",
                "Loading the Dataset", completionStatus, completionStatus)
            if jobType != "story" and jobType != "metaData":
                CommonUtils.save_progress_message(messageURL,
                                                  progressMessage,
                                                  ignore=ignoreMsg,
                                                  emptyBin=True)
                dataframe_context.update_completion_status(completionStatus)
            ########################## Load the dataframe ##############################
            df = MasterHelper.load_dataset(spark, dataframe_context)
            df = df.persist()
            if jobType != "metaData":
                metaParserInstance = MasterHelper.get_metadata(
                    df, spark, dataframe_context)
                df, df_helper = MasterHelper.set_dataframe_helper(
                    df, dataframe_context, metaParserInstance)
                # updating metaData for binned Cols
                colsToBin = df_helper.get_cols_to_bin()
                levelCountDict = df_helper.get_level_counts(colsToBin)
                metaParserInstance.update_level_counts(colsToBin,
                                                       levelCountDict)
            ############################ MetaData Calculation ##########################
            if jobType == "metaData":
                MasterHelper.run_metadata(spark, df, dataframe_context)
            ################################ Data Sub Setting ##########################
            if jobType == "subSetting":
                MasterHelper.run_subsetting(spark, df, dataframe_context,
                                            df_helper, metaParserInstance)
            ################################ Story Creation ############################
            if jobType == "story":
                if analysistype == "dimension":
                    MasterHelper.run_dimension_analysis(
                        spark, df, dataframe_context, df_helper,
                        metaParserInstance)
                elif analysistype == "measure":
                    MasterHelper.run_measure_analysis(spark, df,
                                                      dataframe_context,
                                                      df_helper,
                                                      metaParserInstance)
                progressMessage = CommonUtils.create_progress_message_object(
                    "final",
                    "final",
                    "info",
                    "Job Finished",
                    100,
                    100,
                    display=True)
                CommonUtils.save_progress_message(messageURL,
                                                  progressMessage,
                                                  ignore=ignoreMsg)
            ################################ Model Training ############################
            elif jobType == 'training':
                dataframe_context.set_ml_environment("sklearn")
                MasterHelper.train_models(spark, df, dataframe_context,
                                          df_helper, metaParserInstance)
            ############################## Model Prediction ############################
            elif jobType == 'prediction':
                dataframe_context.set_ml_environment("sklearn")
                MasterHelper.score_model(spark, df, dataframe_context,
                                         df_helper, metaParserInstance)
        ################################### Test Cases #############################
        if jobType == "testCase":
            print "Running Test Case for Chi-square Analysis---------------"
            # TestChiSquare().setUp()
            unittest.TextTestRunner(verbosity=2).run(
                unittest.TestLoader().loadTestsFromTestCase(TestChiSquare))
        ################################### Stock ADVISOR ##########################
        if jobType == 'stockAdvisor':
            # spark.conf.set("spark.sql.execution.arrow.enabled", "false")
            file_names = dataframe_context.get_stock_symbol_list()
            stockObj = StockAdvisor(spark, file_names, dataframe_context,
                                    result_setter)
            stockAdvisorData = stockObj.Run()
            stockAdvisorDataJson = CommonUtils.convert_python_object_to_json(
                stockAdvisorData)
            # stockAdvisorDataJson["name"] = jobName
            print "*" * 100
            print "Result : ", stockAdvisorDataJson
            response = CommonUtils.save_result_json(jobURL,
                                                    stockAdvisorDataJson)
        # Runtime is reported through the error-message channel on purpose
        # (same endpoint used for job telemetry) — TODO confirm.
        scriptEndTime = time.time()
        runtimeDict = {"startTime": scriptStartTime, "endTime": scriptEndTime}
        print runtimeDict
        CommonUtils.save_error_messages(errorURL,
                                        "jobRuntime",
                                        runtimeDict,
                                        ignore=ignoreMsg)
        print "Scripts Time : ", scriptEndTime - scriptStartTime, " seconds."
def _generate_summary(self):
    """Build the decision-tree summary card and attach it to the result node.

    Walks ``self._table`` (rules per target level) and assembles:
    a probability-bucket summary, a popup table of prediction rules,
    a donut chart of prediction frequencies per target level, and a
    dropdown to switch between target levels.  The finished NormalCard
    is appended to ``self._decisionTreeNode``.  Returns None.
    """
    data_dict = {}
    rules_dict = self._table  # {target level: [crude rule strings]} — TODO confirm shape
    data_dict["blockSplitter"] = self._blockSplitter
    data_dict["targetcol"] = self._colname
    groups = rules_dict.keys()
    # Rules with probability >= cutoff are labelled "strong", else "mixed".
    probabilityCutoff = 75
    probabilityGroups = [{
        "probability": probabilityCutoff,
        "count": 0,
        "range": [probabilityCutoff, 100]
    }, {
        "probability": probabilityCutoff - 1,
        "count": 0,
        "range": [0, probabilityCutoff - 1]
    }]
    tableArray = [[
        "Prediction Rule", "Probability", "Prediction", "Freq", "group",
        "richRules"
    ]]
    dropdownData = []
    chartDict = {}  # target level -> total prediction count (donut chart)
    self._completionStatus = self._dataframe_context.get_completion_status(
    )
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName,
        "custom",
        "info",
        "Generating Prediction rules",
        self._completionStatus,
        self._completionStatus,
        display=True)
    CommonUtils.save_progress_message(self._messageURL,
                                      progressMessage,
                                      ignore=False)
    for idx, target in enumerate(rules_dict.keys()):
        # Display name strips any ":" suffix from the target level label.
        targetToDisplayInTable = target.split(":")[0].strip()
        # Only the first dropdown entry starts selected.
        if idx == 0:
            dropdownData.append({
                "displayName": target,
                "name": targetToDisplayInTable,
                "searchTerm": targetToDisplayInTable,
                "selected": True,
                "id": idx + 1
            })
        else:
            dropdownData.append({
                "displayName": target,
                "name": targetToDisplayInTable,
                "searchTerm": targetToDisplayInTable,
                "selected": False,
                "id": idx + 1
            })
        rulesArray = rules_dict[target]
        probabilityArray = [
            round(x, 2) for x in self.success_percent[target]
        ]
        groupArray = [
            "strong" if x >= probabilityCutoff else "mixed"
            for x in probabilityArray
        ]
        # Accumulate how many rules fall into each probability bucket.
        for idx2, obj in enumerate(probabilityGroups):
            grpCount = len([
                x for x in probabilityArray
                if x >= obj["range"][0] and x <= obj["range"][1]
            ])
            obj["count"] += grpCount
            probabilityGroups[idx2] = obj
        predictionArray = [targetToDisplayInTable] * len(rulesArray)
        freqArray = self.total_predictions[target]
        chartDict[target] = sum(freqArray)
        success = self.successful_predictions[target]
        success_percent = self.success_percent[target]
        richRulesArray = []
        crudeRuleArray = []
        analysisType = self._dataframe_context.get_analysis_type()
        targetCol = self._dataframe_context.get_result_column()
        # binFlag marks whether the target column was user-binned.
        binFlag = False
        if self._dataframe_context.get_custom_analysis_details() != None:
            binnedColObj = [
                x["colName"] for x in
                self._dataframe_context.get_custom_analysis_details()
            ]
            if binnedColObj != None and targetCol in binnedColObj:
                binFlag = True
        # Render each crude rule into a rich (HTML) and cleaned crude form.
        for idx2, crudeRule in enumerate(rulesArray):
            richRule, crudeRule = NarrativesUtils.generate_rules(
                self._colname,
                target,
                crudeRule,
                freqArray[idx2],
                success[idx2],
                success_percent[idx2],
                analysisType,
                binFlag=binFlag)
            richRulesArray.append(richRule)
            crudeRuleArray.append(crudeRule)
        # Humanize probabilities, e.g. 80 -> "80%", single digits spelled out.
        probabilityArray = map(
            lambda x: humanize.apnumber(x) + "%"
            if x >= 10 else str(int(x)) + "%", probabilityArray)
        # targetArray = zip(rulesArray,probabilityArray,predictionArray,freqArray,groupArray)
        targetArray = zip(crudeRuleArray, probabilityArray, predictionArray,
                          freqArray, groupArray, richRulesArray)
        targetArray = [list(x) for x in targetArray]
        tableArray += targetArray
    # Collapse the donut chart to at most 10 slices.
    donutChartMaxLevel = 10
    if len(chartDict) > donutChartMaxLevel:
        chartDict = NarrativesUtils.restructure_donut_chart_data(
            chartDict, nLevels=donutChartMaxLevel)
    chartData = NormalChartData([chartDict]).get_data()
    chartJson = ChartJson(data=chartData)
    chartJson.set_title(self._colname)
    chartJson.set_chart_type("donut")
    mainCardChart = C3ChartData(data=chartJson)
    mainCardChart.set_width_percent(45)
    # mainCardChart = {"dataType": "c3Chart","widthPercent":33 ,"data": {"data": [chartDict],"title":self._colname,"axes":{},"label_text":{},"legend":{},"yAxisNumberFormat": ".2s","types":None,"axisRotation":False, "chart_type": "donut"}}
    dropdownDict = {
        "dataType": "dropdown",
        "label": "Showing prediction rules for",
        "data": dropdownData
    }
    data_dict["probabilityGroups"] = probabilityGroups
    # Narrative text is produced from an HTML template fed with data_dict.
    maincardSummary = NarrativesUtils.get_template_output(self._base_dir,\
        'decisiontreesummary.html',data_dict)
    main_card = NormalCard()
    main_card_data = []
    main_card_narrative = NarrativesUtils.block_splitter(
        maincardSummary, self._blockSplitter)
    main_card_data += main_card_narrative
    main_card_data.append(mainCardChart)
    main_card_data.append(dropdownDict)
    main_card_table = TableData()
    main_card_table.set_table_data(tableArray)
    main_card_table.set_table_type("popupDecisionTreeTable")
    main_card_data.append(main_card_table)
    main_card.set_card_data(main_card_data)
    main_card.set_card_name("Predicting Key Drivers of {}".format(
        self._colname))
    self._decisionTreeNode.add_a_card(main_card)
# print resp.text print('Main Method Did Not End ....., ', str(e)) progressMessage = CommonUtils.create_progress_message_object( "Main Method Did Not End .....", "Main Method Did Not End .....", "Error", str(e), "Failed", 100) CommonUtils.save_progress_message(messageURL, progressMessage, emptyBin=True) if __name__ == '__main__': jobURL, killURL, messageURL = killer_setting(sys.argv[1]) try: main(sys.argv[1]) print('Main Method End .....') except Exception as e: print(jobURL, killURL) data = {"status": "killed", "jobURL": jobURL} resp = send_kill_command(killURL, data) while str(resp.text) != '{"result": "success"}': data = {"status": "killed", "jobURL": jobURL} resp = send_kill_command(killURL, data) progressMessage = CommonUtils.create_progress_message_object( "Main Method Did Not End .....", "Main Method Did Not End .....", "Error", str(e), "Failed", 100) CommonUtils.save_progress_message(messageURL, progressMessage, emptyBin=True) print('Main Method Did Not End ....., ', str(e))
def __init__(self,
             data_frame,
             df_helper,
             df_context,
             spark,
             meta_parser,
             max_depth=5,
             scriptWeight=None,
             analysisName=None):
    """Initialize the decision-tree analysis state.

    Caches the Spark session, helper/context/meta-parser objects and the
    tree depth, filters UID and date columns out of the dimension list,
    buckets measures (Spark path only), initializes all per-run
    accumulators, and posts the "initialization" progress message.

    Args:
        data_frame: input data (Spark or pandas DataFrame, per
            ``df_context._pandas_flag``).
        df_helper: DataFrameHelper with column-type accessors.
        df_context: ContextSetter holding job configuration.
        spark: active Spark session.
        meta_parser: metadata parser for the dataset.
        max_depth: maximum depth of the decision tree.
        scriptWeight: optional override for the analysis weight dict.
        analysisName: optional override for the analysis name.
    """
    self._spark = spark
    self._maxDepth = max_depth
    self._metaParser = meta_parser
    self._dataframe_helper = df_helper
    self._dataframe_context = df_context
    self._pandas_flag = df_context._pandas_flag  # True => pandas path
    self._ignoreMsg = self._dataframe_context.get_message_ignore()
    self._analysisDict = self._dataframe_context.get_analysis_dict()
    self._measure_columns = self._dataframe_helper.get_numeric_columns()
    # if self._analysisDict:
    #     for m in self._measure_columns:
    #         if data_frame.select(F.countDistinct(m)).collect()[0][0]<self._analysisDict['Dimension vs. Dimension']['binSetting']['binCardinality']:
    #             self._measure_columns.remove(m)
    self._dimension_columns = self._dataframe_helper.get_string_columns()
    self._date_columns = self._dataframe_context.get_date_columns()
    self._uid_col = self._dataframe_context.get_uid_column()
    # Drop the UID column from dimensions if metadata flags it as ignorable.
    if self._metaParser.check_column_isin_ignored_suggestion(
            self._uid_col):
        self._dimension_columns = list(
            set(self._dimension_columns) - {self._uid_col})
    # Date columns never participate as dimensions.
    if len(self._date_columns) > 0:
        self._dimension_columns = list(
            set(self._dimension_columns) - set(self._date_columns))
    if not self._pandas_flag:
        # Spark path: discretize measures before tree building.
        self._data_frame = MLUtils.bucket_all_measures(
            data_frame,
            self._measure_columns,
            self._dimension_columns,
            pandas_flag=self._pandas_flag)
    else:
        self._data_frame = data_frame
    # Keep an untouched copy when the frame supports .copy() (pandas);
    # Spark frames fall through to an alias of the same object.
    try:
        self._data_frame1 = self._data_frame.copy()
    except:
        self._data_frame1 = self._data_frame
    # Per-run accumulators populated during tree generation.
    self._mapping_dict = {}
    self._new_rules = {}
    self._total = {}
    self._success = {}
    self._fail = {}
    self._probability = {}
    self._alias_dict = {}
    self._important_vars = {}
    self._total_list = []
    self._row_count = []
    self._targetlevels = []
    self._new_list = []
    self._count_list = []
    self._rule_id = 0
    self._path_dict = {}
    self._completionStatus = self._dataframe_context.get_completion_status(
    )
    # Explicit arguments win over context-derived defaults.
    if analysisName == None:
        self._analysisName = self._dataframe_context.get_analysis_name()
    else:
        self._analysisName = analysisName
    self._messageURL = self._dataframe_context.get_message_url()
    if scriptWeight == None:
        self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight(
        )
    else:
        self._scriptWeightDict = scriptWeight
    self._scriptStages = {
        "initialization": {
            "summary": "Initialized The Decision Tree Script",
            "weight": 0
        },
        "treegeneration": {
            "summary": "Decision Tree Generation Finished",
            "weight": 10
        }
    }
    # Advance completion by this script's share of the initialization stage.
    self._completionStatus += old_div(
        self._scriptWeightDict[self._analysisName]["script"] *
        self._scriptStages["initialization"]["weight"], 10)
    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
        "initialization",\
        "info",\
        self._scriptStages["initialization"]["summary"],\
        self._completionStatus,\
        self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL,
                                      progressMessage,
                                      ignore=self._ignoreMsg)
    self._dataframe_context.update_completion_status(
        self._completionStatus)
    print("I AM HERE")
def main(configJson):
    """Entry point for a single mAdvisor job (Python-3 variant of this script).

    configJson is either a parsed config (ConfigTree/dict, job-server
    deployment) or a string: a ``*.cfg`` path (cfgMode) or anything else
    (debugMode, which loads a local test config).  Builds a Spark session,
    validates the config, optionally runs data cleansing / feature
    engineering or the autoML pipeline, then dispatches on job_type:
    metaData, subSetting, story, training, prediction, testCase or
    stockAdvisor.

    Fix: the prediction-branch level-count loop previously called
    ``self.dataframe_context.get_result_column()`` — ``self`` does not
    exist in this module-level function, so the surrounding bare
    ``except: pass`` silently aborted the loop on the first datatype
    change.  It now uses the local ``dataframe_context``.
    """
    LOGGER = {}
    deployEnv = False  # running the scripts from job-server env
    debugMode = True  # runnning the scripts for local testing and development
    cfgMode = False  # runnning the scripts by passing config.cfg path
    scriptStartTime = time.time()
    # Decide the run mode from the *type* of configJson.
    if isinstance(configJson, pyhocon.config_tree.ConfigTree) or isinstance(
            configJson, dict):
        deployEnv = True
        debugMode = False
        ignoreMsg = False
    elif isinstance(configJson, basestring):
        if configJson.endswith(".cfg"):
            print(
                "||############################## Running in cfgMode ##############################||"
            )
            cfgMode = True
            debugMode = False
            ignoreMsg = False
        else:
            print(
                "||############################## Running in debugMode ##############################||"
            )
            cfgMode = False
            debugMode = True
            ignoreMsg = True
            # Test Configs are defined in bi/settings/configs/localConfigs
            jobType = "training"
            if jobType == "testCase":
                configJson = get_test_configs(jobType, testFor="chisquare")
            else:
                configJson = get_test_configs(jobType)
    print(
        "||############################## Creating Spark Session ##############################||"
    )
    if debugMode:
        APP_NAME = "mAdvisor_running_in_debug_mode"
    else:
        # Deployed configs may ship a null "config" and a URL to fetch it from.
        config = configJson["config"]
        if config is None:
            configJson = requests.get(configJson["job_config"]["config_url"])
            configJson = configJson.json()
        if "job_config" in list(
                configJson.keys()) and "job_name" in configJson["job_config"]:
            APP_NAME = configJson["job_config"]["job_name"]
        else:
            APP_NAME = "--missing--"
    if debugMode:
        spark = CommonUtils.get_spark_session(app_name=APP_NAME,
                                              hive_environment=False)
    else:
        spark = CommonUtils.get_spark_session(app_name=APP_NAME)
    spark.sparkContext.setLogLevel("ERROR")
    # applicationIDspark = spark.sparkContext.applicationId
    # spark.conf.set("spark.sql.execution.arrow.enabled", "true")
    print(
        "||############################## Parsing Config file ##############################||"
    )
    config = configJson["config"]
    # autoML classification jobs select algorithm settings by input location:
    # https inputs run the pandas settings, everything else pyspark.
    if "TRAINER_MODE" in config and config["TRAINER_MODE"] == "autoML":
        if "app_type" in config["FILE_SETTINGS"] and config["FILE_SETTINGS"][
                "app_type"] == "classification":
            if config['FILE_SETTINGS']['inputfile'][0].startswith("https:"):
                config[
                    'ALGORITHM_SETTING'] = GLOBALSETTINGS.algorithm_settings_pandas
            else:
                config[
                    'ALGORITHM_SETTING'] = GLOBALSETTINGS.algorithm_settings_pyspark
    jobConfig = configJson["job_config"]
    jobType = jobConfig["job_type"]
    if jobType == "prediction":
        # one_click is only present (and only needed) for prediction jobs.
        one_click = config["one_click"]
    jobName = jobConfig["job_name"]
    jobURL = jobConfig["job_url"]
    messageURL = jobConfig["message_url"]
    initialMessageURL = jobConfig["initial_messages"]
    # Push the full list of stage messages to the API up front.
    messages = scriptStages.messages_list(config, jobConfig, jobType, jobName)
    messages_for_API = messages.send_messages()
    messages_for_API = json.dumps(messages_for_API)
    res = requests.put(url=initialMessageURL, data=messages_for_API)
    print(
        "---------------------Pipeline changes in SPARK container------------------"
    )
    # error_reporting_url is optional in older job configs.
    try:
        errorURL = jobConfig["error_reporting_url"]
    except:
        errorURL = None
    if "app_id" in jobConfig:
        appid = jobConfig["app_id"]
    else:
        appid = None
    # Populate the shared ContextSetter that every downstream helper reads.
    configJsonObj = configparser.ParserConfig(config)
    configJsonObj.set_json_params()
    dataframe_context = ContextSetter(configJsonObj)
    dataframe_context.set_job_type(
        jobType
    )  #jobType should be set before set_params call of dataframe_context
    dataframe_context.set_params()
    dataframe_context.set_message_url(messageURL)
    dataframe_context.set_app_id(appid)
    dataframe_context.set_debug_mode(debugMode)
    dataframe_context.set_job_url(jobURL)
    dataframe_context.set_app_name(APP_NAME)
    dataframe_context.set_error_url(errorURL)
    dataframe_context.set_logger(LOGGER)
    dataframe_context.set_xml_url(jobConfig["xml_url"])
    dataframe_context.set_job_name(jobName)
    if debugMode == True:
        dataframe_context.set_environment("debugMode")
        dataframe_context.set_message_ignore(True)
    analysistype = dataframe_context.get_analysis_type()
    result_setter = ResultSetter(dataframe_context)
    appid = dataframe_context.get_app_id()
    completionStatus = 0
    print(
        "||############################## Validating the Config ##############################||"
    )
    configValidator = ConfigValidator(dataframe_context)
    configValid = configValidator.get_sanity_check()
    if not configValid:
        # Invalid config: report failure and write an empty result payload.
        progressMessage = CommonUtils.create_progress_message_object(
            "mAdvisor Job",
            "custom",
            "info",
            "Please Provide A Valid Configuration",
            completionStatus,
            completionStatus,
            display=True)
        CommonUtils.save_progress_message(messageURL,
                                          progressMessage,
                                          ignore=ignoreMsg)
        response = CommonUtils.save_result_json(
            dataframe_context.get_job_url(), json.dumps({}))
        CommonUtils.save_error_messages(errorURL,
                                        APP_NAME,
                                        "Invalid Config Provided",
                                        ignore=ignoreMsg)
    else:
        ########################## Initializing messages ##############################
        if jobType == "story":
            if analysistype == "measure":
                progressMessage = CommonUtils.create_progress_message_object(
                    "Measure analysis",
                    "custom",
                    "info",
                    "Analyzing Target Variable",
                    completionStatus,
                    completionStatus,
                    display=True)
            else:
                progressMessage = CommonUtils.create_progress_message_object(
                    "Dimension analysis",
                    "custom",
                    "info",
                    "Analyzing Target Variable",
                    completionStatus,
                    completionStatus,
                    display=True)
            CommonUtils.save_progress_message(messageURL,
                                              progressMessage,
                                              ignore=ignoreMsg,
                                              emptyBin=True)
            dataframe_context.update_completion_status(completionStatus)
        elif jobType == "metaData":
            # Three staged messages so the UI shows loading progress.
            progressMessage = CommonUtils.create_progress_message_object(
                "metaData",
                "custom",
                "info",
                "Preparing Data For Loading",
                completionStatus,
                completionStatus,
                display=True)
            CommonUtils.save_progress_message(messageURL,
                                              progressMessage,
                                              ignore=ignoreMsg,
                                              emptyBin=True)
            progressMessage = CommonUtils.create_progress_message_object(
                "metaData",
                "custom",
                "info",
                "Initializing The Loading Process",
                completionStatus,
                completionStatus,
                display=True)
            CommonUtils.save_progress_message(messageURL,
                                              progressMessage,
                                              ignore=ignoreMsg)
            progressMessage = CommonUtils.create_progress_message_object(
                "metaData",
                "custom",
                "info",
                "Uploading Data",
                completionStatus,
                completionStatus,
                display=True)
            CommonUtils.save_progress_message(messageURL,
                                              progressMessage,
                                              ignore=ignoreMsg)
            dataframe_context.update_completion_status(completionStatus)
        if jobType != "stockAdvisor":
            df = None
            data_loading_st = time.time()
            progressMessage = CommonUtils.create_progress_message_object(
                "scriptInitialization", "scriptInitialization", "info",
                "Loading The Dataset", completionStatus, completionStatus)
            if jobType != "story" and jobType != "metaData":
                CommonUtils.save_progress_message(messageURL,
                                                  progressMessage,
                                                  ignore=ignoreMsg,
                                                  emptyBin=True)
                dataframe_context.update_completion_status(completionStatus)
            ########################## Load the dataframe ##############################
            df = MasterHelper.load_dataset(spark, dataframe_context)
            ###### pandas Flag ################
            #dataframe_context._pandas_flag = False
            # persist() only exists on the Spark path; pandas frames skip it.
            try:
                df = df.persist()
            except:
                pass
            rowscols = (df.count(), len(df.columns))
            removed_col = []
            new_cols_added = None
            if jobType != "metaData":
                if jobType == "training" or jobType == "prediction":
                    automl_enable = False
                    if dataframe_context.get_trainerMode() == "autoML":
                        automl_enable = True
                    one_click_json = {}
                    if dataframe_context.get_trainerMode() == "autoML":
                        ################ autoML path: produce linear & tree frames ################
                        if jobType == "training":
                            fs = time.time()
                            autoML_obj = autoML.AutoMl(
                                df, dataframe_context,
                                GLOBALSETTINGS.APPS_ID_MAP[appid]["type"])
                            one_click_json, linear_df, tree_df = autoML_obj.run(
                            )
                            print("Automl Done in ",
                                  time.time() - fs, " seconds.")
                        elif jobType == "prediction":
                            score_obj = autoMLScore.Scoring(
                                df, one_click, dataframe_context._pandas_flag)
                            linear_df, tree_df = score_obj.run()
                        # linear
                        print('No. of columns in Linear data :',
                              len(list(linear_df.columns)))
                        metaParserInstance_linear_df = MasterHelper.get_metadata(
                            linear_df, spark, dataframe_context,
                            new_cols_added)
                        linear_df, df_helper_linear_df = MasterHelper.set_dataframe_helper(
                            linear_df, dataframe_context,
                            metaParserInstance_linear_df)
                        dataTypeChangeCols_linear_df = dataframe_context.get_change_datatype_details(
                        )
                        colsToBin_linear_df = df_helper_linear_df.get_cols_to_bin(
                        )
                        updateLevelCountCols_linear_df = colsToBin_linear_df
                        # Add user-retyped dimension columns that survived into linear_df.
                        try:
                            for i in dataTypeChangeCols_linear_df:
                                if i["columnType"] == "dimension" and i[
                                        'colName'] in list(linear_df.columns):
                                    updateLevelCountCols_linear_df.append(
                                        i["colName"])
                        except:
                            pass
                        levelCountDict_linear_df = df_helper_linear_df.get_level_counts(
                            updateLevelCountCols_linear_df)
                        metaParserInstance_linear_df.update_level_counts(
                            updateLevelCountCols_linear_df,
                            levelCountDict_linear_df)
                        # Tree
                        print('No. of columns in Tree data :',
                              len(list(tree_df.columns)))
                        metaParserInstance_tree_df = MasterHelper.get_metadata(
                            tree_df, spark, dataframe_context, new_cols_added)
                        tree_df, df_helper_tree_df = MasterHelper.set_dataframe_helper(
                            tree_df, dataframe_context,
                            metaParserInstance_tree_df)
                        dataTypeChangeCols_tree_df = dataframe_context.get_change_datatype_details(
                        )
                        colsToBin_tree_df = df_helper_tree_df.get_cols_to_bin()
                        updateLevelCountCols_tree_df = colsToBin_tree_df
                        try:
                            for i in dataTypeChangeCols_tree_df:
                                if i["columnType"] == "dimension" and i[
                                        'colName'] in list(tree_df.columns):
                                    updateLevelCountCols_tree_df.append(
                                        i["colName"])
                        except:
                            pass
                        levelCountDict_tree_df = df_helper_tree_df.get_level_counts(
                            updateLevelCountCols_tree_df)
                        metaParserInstance_tree_df.update_level_counts(
                            updateLevelCountCols_tree_df,
                            levelCountDict_tree_df)
                    else:
                        ############ manual training/prediction: cleansing + FE ############
                        dataCleansingDict = dataframe_context.get_dataCleansing_info(
                        )
                        featureEngineeringDict = dataframe_context.get_featureEngginerring_info(
                        )
                        if dataCleansingDict[
                                'selected'] or featureEngineeringDict[
                                    'selected']:
                            old_cols_list = df.columns
                            completionStatus = 10
                            progressMessage = CommonUtils.create_progress_message_object(
                                "scriptInitialization", "scriptInitialization",
                                "info",
                                "Performing Required Data Preprocessing And Feature Transformation Tasks",
                                completionStatus, completionStatus)
                            CommonUtils.save_progress_message(messageURL,
                                                              progressMessage,
                                                              ignore=ignoreMsg,
                                                              emptyBin=True)
                            dataframe_context.update_completion_status(
                                completionStatus)
                            ## TO DO : Change flag later this is only for testing
                            pandas_flag = dataframe_context._pandas_flag
                            if pandas_flag:
                                try:
                                    df = df.toPandas()
                                except:
                                    pass
                            if dataCleansingDict['selected']:
                                data_preprocessing_obj = data_preprocessing.DataPreprocessing(
                                    spark, df, dataCleansingDict,
                                    dataframe_context)
                                df = data_preprocessing_obj.data_cleansing()
                                removed_col = data_preprocessing_obj.removed_col
                                dataframe_context.set_ignore_column_suggestions(
                                    removed_col)
                            if featureEngineeringDict['selected']:
                                feature_engineering_obj = feature_engineering.FeatureEngineering(
                                    spark, df, featureEngineeringDict,
                                    dataframe_context)
                                feature_engineering_obj.consider_columns = dataframe_context.get_consider_columns(
                                )
                                df = feature_engineering_obj.feature_engineering(
                                )
                            # Record any columns FE added (beyond those removed).
                            new_cols_list = df.columns
                            old_cols_list = list(
                                set(old_cols_list) - set(removed_col))
                            if len(old_cols_list) < len(new_cols_list):
                                new_cols_added = list(
                                    set(new_cols_list) - set(old_cols_list))
                            else:
                                new_cols_added = None
                        # if pandas_flag:
                        #     ## TODO: has to be removed now that metadata and DFhelper are in pandas
                        #     df=spark.createDataFrame(df)
                        # printSchema() is Spark-only; pandas falls back to dtypes.
                        try:
                            print(df.printSchema())
                        except:
                            print(df.dtypes)
                        metaParserInstance = MasterHelper.get_metadata(
                            df, spark, dataframe_context, new_cols_added)
                        df, df_helper = MasterHelper.set_dataframe_helper(
                            df, dataframe_context, metaParserInstance)
                        # updating metaData for binned Cols
                        dataTypeChangeCols = dataframe_context.get_change_datatype_details(
                        )
                        colsToBin = df_helper.get_cols_to_bin()
                        updateLevelCountCols = colsToBin
                        try:
                            for i in dataTypeChangeCols:
                                if i["columnType"] == "dimension":
                                    if jobType != "prediction":
                                        updateLevelCountCols.append(
                                            i["colName"])
                                    # BUGFIX: was self.dataframe_context — NameError
                                    # in this module-level function, silently
                                    # swallowed by the except below.
                                    elif i["colName"] != dataframe_context.get_result_column(
                                    ) and jobType == "prediction":
                                        #in prediction we should not add target
                                        updateLevelCountCols.append(
                                            i["colName"])
                        except:
                            pass
                        levelCountDict = df_helper.get_level_counts(
                            updateLevelCountCols)
                        metaParserInstance.update_level_counts(
                            updateLevelCountCols, levelCountDict)
                else:
                    # Non-training/prediction jobs: plain metadata + helper setup.
                    metaParserInstance = MasterHelper.get_metadata(
                        df, spark, dataframe_context, new_cols_added)
                    df, df_helper = MasterHelper.set_dataframe_helper(
                        df, dataframe_context, metaParserInstance)
                    # updating metaData for binned Cols
                    dataTypeChangeCols = dataframe_context.get_change_datatype_details(
                    )
                    colsToBin = df_helper.get_cols_to_bin()
                    updateLevelCountCols = colsToBin
                    try:
                        for i in dataTypeChangeCols:
                            if i["columnType"] == "dimension":
                                updateLevelCountCols.append(i["colName"])
                    except:
                        pass
                    levelCountDict = df_helper.get_level_counts(
                        updateLevelCountCols)
                    metaParserInstance.update_level_counts(
                        updateLevelCountCols, levelCountDict)
            ############################ MetaData Calculation ##########################
            if jobType == "metaData":
                MasterHelper.run_metadata(spark, df, dataframe_context)
            ################################ Data Sub Setting ##########################
            if jobType == "subSetting":
                MasterHelper.run_subsetting(spark, df, dataframe_context,
                                            df_helper, metaParserInstance)
            ################################ Story Creation ############################
            if jobType == "story":
                if analysistype == "dimension":
                    MasterHelper.run_dimension_analysis(
                        spark, df, dataframe_context, df_helper,
                        metaParserInstance)
                elif analysistype == "measure":
                    MasterHelper.run_measure_analysis(spark, df,
                                                      dataframe_context,
                                                      df_helper,
                                                      metaParserInstance)
                progressMessage = CommonUtils.create_progress_message_object(
                    "final",
                    "final",
                    "info",
                    "Job Finished",
                    100,
                    100,
                    display=True)
                CommonUtils.save_progress_message(messageURL,
                                                  progressMessage,
                                                  ignore=ignoreMsg)
            ################################ Model Training ############################
            elif jobType == 'training':
                # dataframe_context.set_ml_environment("sklearn")
                if automl_enable is True:
                    MasterHelper.train_models_automl(
                        spark, linear_df, tree_df, dataframe_context,
                        df_helper_linear_df, df_helper_tree_df,
                        metaParserInstance_linear_df,
                        metaParserInstance_tree_df, one_click_json)
                else:
                    MasterHelper.train_models(spark, df, dataframe_context,
                                              df_helper, metaParserInstance,
                                              one_click_json)
            ############################## Model Prediction ############################
            elif jobType == 'prediction':
                if automl_enable is True:
                    MasterHelper.score_model_autoML(
                        spark, linear_df, tree_df, dataframe_context,
                        df_helper_linear_df, df_helper_tree_df,
                        metaParserInstance_linear_df,
                        metaParserInstance_tree_df)
                else:
                    # dataframe_context.set_ml_environment("sklearn")
                    MasterHelper.score_model(spark, df, dataframe_context,
                                             df_helper, metaParserInstance)
        ################################### Test Cases #############################
        if jobType == "testCase":
            print("Running Test Case for Chi-square Analysis---------------")
            unittest.TextTestRunner(verbosity=2).run(
                unittest.TestLoader().loadTestsFromTestCase(TestChiSquare))
        ################################### Stock ADVISOR ##########################
        if jobType == 'stockAdvisor':
            # spark.conf.set("spark.sql.execution.arrow.enabled", "false")
            file_names = dataframe_context.get_stock_symbol_list()
            stockObj = StockAdvisor(spark, file_names, dataframe_context,
                                    result_setter)
            stockAdvisorData = stockObj.Run()
            stockAdvisorDataJson = CommonUtils.convert_python_object_to_json(
                stockAdvisorData)
            # stockAdvisorDataJson["name"] = jobName
            print("*" * 100)
            print("Result : ", stockAdvisorDataJson)
            response = CommonUtils.save_result_json(jobURL,
                                                    stockAdvisorDataJson)
        # Runtime is reported through the error-message channel on purpose
        # (same endpoint used for job telemetry) — TODO confirm.
        scriptEndTime = time.time()
        runtimeDict = {"startTime": scriptStartTime, "endTime": scriptEndTime}
        print(runtimeDict)
        CommonUtils.save_error_messages(errorURL,
                                        "jobRuntime",
                                        runtimeDict,
                                        ignore=ignoreMsg)
        print("Scripts Time : ", scriptEndTime - scriptStartTime,
              " seconds.")
def test_all(self, measure_columns=None, dimension_columns=None):
    """Train a decision tree on the target dimension and return a DecisionTreeResult.

    Builds the training frame from the helper's dimension/measure columns,
    string-indexes categoricals, fits either an sklearn DecisionTreeClassifier
    (pandas path) or Spark MLlib DecisionTree (Spark path), then converts the
    fitted tree into the project's narrative tree structures.

    :param measure_columns: optional list of measure columns (falls back to
        ``self._measure_columns`` when None).
    :param dimension_columns: list whose first element is the target dimension
        to predict; must be non-empty (no guard here — raises on None/empty).
    :return: populated ``DecisionTreeResult``.
    """
    measures = measure_columns
    if measure_columns is None:
        measures = self._measure_columns
    # First entry of dimension_columns is the classification target.
    self._target_dimension = dimension_columns[0]
    dimension = self._target_dimension
    #####Look into it for Issue 947#################
    # Only keep "other" dimensions whose level count stays under the global cap.
    max_num_levels = GLOBALSETTINGS.DTREE_OTHER_DIMENSION_MAX_LEVEL
    # max_num_levels = min(max_num_levels, round(self._dataframe_helper.get_num_rows()**0.5))
    # all_dimensions = [dim for dim in self._dimension_columns if self._dataframe_helper.get_num_unique_values(dim) <= max_num_levels]
    all_dimensions = [
        dim for dim in self._dimension_columns
        if self._metaParser.get_num_unique_values(dim) <= max_num_levels
    ]
    all_measures = self._measure_columns
    if self._pandas_flag:
        # pandas path works on a trimmed copy of the frame.
        self._data_frame = self._data_frame[all_dimensions + all_measures]
    cat_feature_info = []
    columns_without_dimension = [
        x for x in all_dimensions if x != dimension
    ]
    mapping_dict = {}
    masterMappingDict = {}
    decision_tree_result = DecisionTreeResult()
    decision_tree_result.set_freq_distribution(
        self._metaParser.get_unique_level_dict(self._target_dimension),
        self._important_vars)
    if self._pandas_flag:
        # One-hot encode every non-target dimension; remember which dummy
        # columns were created so scoring can reproduce them.
        try:
            all_dimensions.remove(dimension)
        except:
            # NOTE(review): bare except hides errors other than ValueError
            # (target not in list) — presumably intentional best-effort.
            pass
        actual_cols = list(self._data_frame.columns)
        print(actual_cols)
        self._data_frame = pd.get_dummies(self._data_frame,
                                          columns=all_dimensions)
        after_dummy_cols = list(self._data_frame.columns)

        def Diff(li1, li2):
            # Symmetric difference of two lists, order not guaranteed.
            return (list(
                list(set(li1) - set(li2)) + list(set(li2) - set(li1))))

        decision_tree_result.dummy_cols = [
            Diff(after_dummy_cols, Diff(actual_cols, all_dimensions)),
            all_dimensions
        ]
        all_dimensions.append(dimension)
    #this has been done for scoring error
    if self._pandas_flag:
        # Only the target needs string indexing — predictors are dummies now.
        self._data_frame, mapping_dict = MLUtils.add_string_index(
            self._data_frame, [dimension], self._pandas_flag)
    else:
        self._data_frame, mapping_dict = MLUtils.add_string_index(
            self._data_frame, all_dimensions, self._pandas_flag)
    if self._pandas_flag:
        print(self._data_frame.head(1))
    else:
        print(self._data_frame.show(1))
    # standard_measure_index = {0.0:'Low',1.0:'Medium',2.0:'High'}
    # Fixed 5-level binning labels applied to every measure column.
    standard_measure_index = {
        0.0: 'Low',
        1.0: 'Below Average',
        2.0: 'Average',
        3.0: 'Above Average',
        4.0: 'High'
    }
    for measure in all_measures:
        mapping_dict[measure] = standard_measure_index
    # Strip commas from level names; keep an alias map back to the originals.
    for k, v in list(mapping_dict.items()):
        temp = {}
        for k1, v1 in list(v.items()):
            self._alias_dict[v1.replace(",", "")] = v1
            temp[k1] = v1.replace(",", "")
        mapping_dict[k] = temp
    self._mapping_dict = mapping_dict
    if not self._pandas_flag:
        # Spark path: collect per-column category counts for MLlib's
        # categoricalFeaturesInfo; measures are treated as 5-level categoricals.
        for c in columns_without_dimension:
            if self._pandas_flag:
                cat_feature_info.append(len(self._data_frame[c].unique()))
            else:
                cat_feature_info.append(
                    self._data_frame.select(c).distinct().count())
        for c in all_measures:
            cat_feature_info.append(5)
        columns_without_dimension = columns_without_dimension + all_measures
        all_measures = []
        if len(cat_feature_info) > 0:
            max_length = max(cat_feature_info)
        else:
            max_length = 32
    else:
        decision_tree_result.mappingdict = mapping_dict[dimension]
        max_length = 32
    # MLlib expects {feature_index: num_categories}.
    cat_feature_info = dict(enumerate(cat_feature_info))
    if self._pandas_flag:
        dimension_classes = len(self._data_frame[dimension].unique())
    else:
        dimension_classes = self._data_frame.select(
            dimension).distinct().count()
    if not self._pandas_flag:
        # Reorder so the label is column 0 for LabeledPoint construction below.
        self._data_frame = self._data_frame[[dimension] +
                                            columns_without_dimension +
                                            all_measures]
    print("=" * 200)
    # print self._data_frame.rdd.first()
    print("numClasses", dimension_classes)
    print("maxDepth", self._maxDepth)
    decision_tree_result._maxDepth = self._maxDepth
    print("maxBins", max_length)
    print("=" * 200)
    if self._pandas_flag:
        # Sanitize column names to identifiers (tree_to_code emits rules text).
        self._data_frame.columns = [
            re.sub('\W+', '_', col.strip())
            for col in self._data_frame.columns
        ]
        x = self._data_frame.drop(dimension, axis=1)
        y = self._data_frame[dimension]
        #tle = LabelEncoder()
        #y = tle.fit_transform(y)
        # Impute remaining NaNs with each column's mode before fitting.
        for i in x.columns:
            x[i] = x[i].fillna(x[i].mode()[0])
        model = DecisionTreeClassifier(criterion='gini',
                                       max_depth=self._maxDepth,
                                       random_state=42)
        model = model.fit(x, y)
        output_result = self.tree_to_code(model, list(x.columns))
        output_result = list(map(lambda x: x.strip(), output_result))
    else:
        data = self._data_frame.rdd.map(
            lambda x: LabeledPoint(x[0], x[1:]))
        # [1.0, 0.0] split: all data goes to training, none held out.
        (trainingData, testData) = data.randomSplit([1.0, 0.0])
        # TO DO : set maxBins at least equal to the max level of categories in dimension column
        # model = DecisionTree.trainClassifier(trainingData, numClasses=dimension_classes, categoricalFeaturesInfo=cat_feature_info, impurity='gini', maxDepth=self._maxDepth, maxBins=max_length)
        # Removed categoricalFeaturesInfo to be passed to DecisionTree to get all levels and consider all feature as continuous variables
        #But that results in wrong result in Prediction Rule eg: columns containing "yes" or "no" as its value is considered as float value(0.5) so removing categoricalFeaturesInfo={} with categoricalFeaturesInfo=cat_feature_info
        model = DecisionTree.trainClassifier(
            trainingData,
            numClasses=dimension_classes,
            categoricalFeaturesInfo=cat_feature_info,
            impurity='gini',
            maxDepth=self._maxDepth,
            maxBins=max_length)
        output_result = model.toDebugString()
    # Convert the raw tree dump into the narrative tree + per-node counts.
    decision_tree = self.tree_json(output_result, self._data_frame,
                                   self._pandas_flag)
    self._new_tree = self.generate_new_tree(decision_tree)
    node_list = self.node_name_extractor(self._new_tree)
    node_list = list(self.flatten(node_list))
    correct_count_list = [i[0] for i in self._count_list]
    tree_dict = dict(list(zip(node_list, correct_count_list)))
    self._new_tree = self.wrap_tree(self._new_tree, tree_dict)
    self._path_dict = self.path_dict_creator(node_list, self._new_tree)
    print("===" * 40)
    decision_tree_result.set_params(self._new_tree, self._new_rules,
                                    self._total, self._success,
                                    self._probability, self._path_dict)
    # Advance the job's completion percentage and publish progress.
    self._completionStatus += old_div(
        self._scriptWeightDict[self._analysisName]["script"] *
        self._scriptStages["treegeneration"]["weight"], 10)
    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                "treegeneration",\
                                "info",\
                                self._scriptStages["treegeneration"]["summary"],\
                                self._completionStatus,\
                                self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL,
                                      progressMessage,
                                      ignore=self._ignoreMsg)
    self._dataframe_context.update_completion_status(
        self._completionStatus)
    return decision_tree_result
def applyFilter(self):
    """Apply dimension, measure and time-dimension filters to the data frame.

    All filter settings come from the df_context. Each filter family advances
    the completion status and publishes a progress message.

    :return: the filtered data frame (``self._data_frame``).
    """
    dimension_filters = self._dataframe_context.get_dimension_filters()
    measure_filters = self._dataframe_context.get_measure_filters()
    time_dimension_filters = self._dataframe_context.get_time_dimension_filters(
    )
    self._completionStatus += self._scriptStages["initialization"][
        "weight"]
    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                "initialization",\
                                "info",\
                                self._scriptStages["initialization"]["summary"],\
                                self._completionStatus,\
                                self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage)
    # Dimension filters: keep rows whose column value is in the given set.
    if len(dimension_filters) > 0:
        for filter_dict in dimension_filters:
            if filter_dict["filterType"] == "valueIn":
                self.values_in(filter_dict["colname"],
                               filter_dict["values"])
    time_taken_dimensionfilters = time.time() - self._start_time
    self._completionStatus += self._scriptStages["dimensionfilters"][
        "weight"]
    # FIX: was a Python-2 print statement (syntax error under Python 3).
    print("dimensionfilters takes", time_taken_dimensionfilters)
    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                "dimensionfilters",\
                                "info",\
                                self._scriptStages["dimensionfilters"]["summary"],\
                                self._completionStatus,\
                                self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage)
    # Measure filters: inclusive numeric range on the column.
    if len(measure_filters) > 0:
        for filter_dict in measure_filters:
            if filter_dict["filterType"] == "valueRange":
                self.values_between(filter_dict["colname"],\
                                    filter_dict["lowerBound"],\
                                    filter_dict["upperBound"],\
                                    greater_than_equal=1,\
                                    less_than_equal =1)
    time_taken_measurefilters = time.time() - self._start_time
    self._completionStatus += self._scriptStages["measurefilters"][
        "weight"]
    print("measurefilters takes", time_taken_measurefilters)
    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                "measurefilters",\
                                "info",\
                                self._scriptStages["measurefilters"]["summary"],\
                                self._completionStatus,\
                                self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage)
    # Time-dimension filters: inclusive range on the timestamp column.
    if len(time_dimension_filters) > 0:
        for filter_dict in time_dimension_filters:
            if filter_dict["filterType"] == "valueRange":
                self.values_between(filter_dict["colname"],\
                                    filter_dict["lowerBound"],\
                                    filter_dict["upperBound"],\
                                    greater_than_equal=1,\
                                    less_than_equal =1)
    time_taken_datetimefilters = time.time() - self._start_time
    self._completionStatus += self._scriptStages["datetimefilters"][
        "weight"]
    print("datetimefilters takes", time_taken_datetimefilters)
    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                "datetimefilters",\
                                "info",\
                                self._scriptStages["datetimefilters"]["summary"],\
                                self._completionStatus,\
                                self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage)
    return self._data_frame
def Predict(self):
    """Score the data frame with a saved Spark ML (Naive Bayes) pipeline.

    Loads the trained pipeline from the model path, transforms the input
    frame, maps numeric predictions back to labels, writes the scored CSV,
    builds the top-5 unique-identifier table per predicted class, and then
    either runs a follow-up decision-tree analysis (>= 2 predicted classes)
    or assembles a fallback narrative card. Progress messages are published
    along the way. No return value; results go to ``self._result_setter``.
    """
    self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight(
    )
    self._scriptStages = {
        "initialization": {
            "summary": "Initialized the Naive Bayes Scripts",
            "weight": 2
        },
        "prediction": {
            "summary": "Spark ML Naive Bayes Model Prediction Finished",
            "weight": 2
        },
        "frequency": {
            "summary": "descriptive analysis finished",
            "weight": 2
        },
        "chisquare": {
            "summary": "chi Square analysis finished",
            "weight": 4
        },
        "completion": {
            "summary": "all analysis finished",
            "weight": 4
        },
    }
    self._completionStatus += self._scriptWeightDict[self._analysisName][
        "total"] * self._scriptStages["initialization"]["weight"] / 10
    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                "initialization",\
                                "info",\
                                self._scriptStages["initialization"]["summary"],\
                                self._completionStatus,\
                                self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage)
    self._dataframe_context.update_completion_status(
        self._completionStatus)
    SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                        sparkSession=self._spark)
    dataSanity = True
    level_counts_train = self._dataframe_context.get_level_count_dict()
    categorical_columns = self._dataframe_helper.get_string_columns()
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    time_dimension_columns = self._dataframe_helper.get_timestamp_columns()
    result_column = self._dataframe_context.get_result_column()
    categorical_columns = [
        x for x in categorical_columns if x != result_column
    ]
    # Compare train-time vs score-time level counts; dataSanity flags drift
    # (computed but not acted upon below).
    level_counts_score = CommonUtils.get_level_count_dict(
        self._data_frame,
        categorical_columns,
        self._dataframe_context.get_column_separator(),
        output_type="dict",
        dataType="spark")
    for key in level_counts_train:
        if key in level_counts_score:
            if level_counts_train[key] != level_counts_score[key]:
                dataSanity = False
        else:
            dataSanity = False
    test_data_path = self._dataframe_context.get_input_file()
    score_data_path = self._dataframe_context.get_score_path(
    ) + "/data.csv"
    # Model dir layout: <base>/<slug>/<model-for-scoring>.
    trained_model_path = self._dataframe_context.get_model_path()
    trained_model_path = "/".join(
        trained_model_path.split("/")[:-1]
    ) + "/" + self._slug + "/" + self._dataframe_context.get_model_for_scoring(
    )
    # score_summary_path = self._dataframe_context.get_score_path()+"/Summary/summary.json"
    pipelineModel = MLUtils.load_pipeline(trained_model_path)
    df = self._data_frame
    transformed = pipelineModel.transform(df)
    # Map numeric prediction indices back to original string labels.
    label_indexer_dict = MLUtils.read_string_indexer_mapping(
        trained_model_path, SQLctx)
    prediction_to_levels = udf(lambda x: label_indexer_dict[x],
                               StringType())
    transformed = transformed.withColumn(
        result_column, prediction_to_levels(transformed.prediction))
    if "probability" in transformed.columns:
        # Max class probability becomes "predicted_probability".
        probability_dataframe = transformed.select(
            [result_column, "probability"]).toPandas()
        probability_dataframe = probability_dataframe.rename(
            index=str, columns={result_column: "predicted_class"})
        probability_dataframe[
            "predicted_probability"] = probability_dataframe[
                "probability"].apply(lambda x: max(x))
        self._score_summary[
            "prediction_split"] = MLUtils.calculate_scored_probability_stats(
                probability_dataframe)
        self._score_summary["result_column"] = result_column
        scored_dataframe = transformed.select(
            categorical_columns + time_dimension_columns +
            numerical_columns + [result_column, "probability"]).toPandas()
        scored_dataframe['predicted_probability'] = probability_dataframe[
            "predicted_probability"].values
        # scored_dataframe = scored_dataframe.rename(index=str, columns={"predicted_probability": "probability"})
    else:
        self._score_summary["prediction_split"] = []
        self._score_summary["result_column"] = result_column
        scored_dataframe = transformed.select(categorical_columns +
                                              time_dimension_columns +
                                              numerical_columns +
                                              [result_column]).toPandas()
    labelMappingDict = self._dataframe_context.get_label_map()
    # Strip a "file://" scheme prefix before writing locally.
    if score_data_path.startswith("file"):
        score_data_path = score_data_path[7:]
    scored_dataframe.to_csv(score_data_path, header=True, index=False)
    uidCol = self._dataframe_context.get_uid_column()
    if uidCol == None:
        uidCols = self._metaParser.get_suggested_uid_columns()
        if len(uidCols) > 0:
            uidCol = uidCols[0]
    uidTableData = []
    predictedClasses = list(scored_dataframe[result_column].unique())
    if uidCol:
        if uidCol in df.columns:
            # Top-5 highest-probability rows per predicted class.
            for level in predictedClasses:
                levelDf = scored_dataframe[scored_dataframe[result_column]
                                           == level]
                levelDf = levelDf[[
                    uidCol, "predicted_probability", result_column
                ]]
                levelDf.sort_values(by="predicted_probability",
                                    ascending=False,
                                    inplace=True)
                levelDf["predicted_probability"] = levelDf[
                    "predicted_probability"].apply(
                        lambda x: humanize.apnumber(x * 100) + "%"
                        if x * 100 >= 10 else str(int(x * 100)) + "%")
                uidTableData.append(levelDf[:5])
            uidTableData = pd.concat(uidTableData)
            uidTableData = [list(arr) for arr in list(uidTableData.values)]
            uidTableData = [[uidCol, "Probability", result_column]
                            ] + uidTableData
            uidTable = TableData()
            uidTable.set_table_width(25)
            uidTable.set_table_data(uidTableData)
            uidTable.set_table_type("normalHideColumn")
            self._result_setter.set_unique_identifier_table(
                json.loads(
                    CommonUtils.convert_python_object_to_json(uidTable)))
    self._completionStatus += self._scriptWeightDict[self._analysisName][
        "total"] * self._scriptStages["prediction"]["weight"] / 10
    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                "prediction",\
                                "info",\
                                self._scriptStages["prediction"]["summary"],\
                                self._completionStatus,\
                                self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage)
    self._dataframe_context.update_completion_status(
        self._completionStatus)
    print("STARTING DIMENSION ANALYSIS ...")
    columns_to_keep = []
    columns_to_drop = []
    columns_to_keep = self._dataframe_context.get_score_consider_columns()
    if len(columns_to_keep) > 0:
        columns_to_drop = list(set(df.columns) - set(columns_to_keep))
    else:
        columns_to_drop += ["predicted_probability"]
    scored_df = transformed.select(categorical_columns +
                                   time_dimension_columns +
                                   numerical_columns + [result_column])
    columns_to_drop = [
        x for x in columns_to_drop if x in scored_df.columns
    ]
    modified_df = scored_df.select(
        [x for x in scored_df.columns if x not in columns_to_drop])
    resultColLevelCount = dict(
        modified_df.groupby(result_column).count().collect())
    self._metaParser.update_column_dict(
        result_column, {
            "LevelCount": resultColLevelCount,
            "numberOfUniqueValues": len(resultColLevelCount.keys())
        })
    self._dataframe_context.set_story_on_scored_data(True)
    self._dataframe_context.update_consider_columns(columns_to_keep)
    df_helper = DataFrameHelper(modified_df, self._dataframe_context,
                                self._metaParser)
    df_helper.set_params()
    spark_scored_df = df_helper.get_data_frame()
    if len(predictedClasses) >= 2:
        # Enough classes to run a decision-tree driver analysis; failures
        # are logged but not fatal.
        try:
            fs = time.time()
            df_decision_tree_obj = DecisionTrees(
                spark_scored_df,
                df_helper,
                self._dataframe_context,
                self._spark,
                self._metaParser,
                scriptWeight=self._scriptWeightDict,
                analysisName=self._analysisName).test_all(
                    dimension_columns=[result_column])
            narratives_obj = CommonUtils.as_dict(
                DecisionTreeNarrative(result_column,
                                      df_decision_tree_obj,
                                      self._dataframe_helper,
                                      self._dataframe_context,
                                      self._metaParser,
                                      self._result_setter,
                                      story_narrative=None,
                                      analysisName=self._analysisName,
                                      scriptWeight=self._scriptWeightDict))
            print(narratives_obj)
        except Exception as e:
            print("DecisionTree Analysis Failed ", str(e))
    else:
        # Fallback: single predicted class — build a donut-chart summary card.
        data_dict = {
            "npred": len(predictedClasses),
            "nactual": len(labelMappingDict.values())
        }
        if data_dict["nactual"] > 2:
            # NOTE(review): levelCountDict is assigned to by key here before
            # being initialized in this branch — looks like a NameError when
            # nactual > 2; presumably it should start as {}. Confirm.
            levelCountDict[predictedClasses[0]] = resultColLevelCount[
                predictedClasses[0]]
            levelCountDict["Others"] = sum([
                v for k, v in resultColLevelCount.items()
                if k != predictedClasses[0]
            ])
        else:
            levelCountDict = resultColLevelCount
            # The one actual class that was never predicted gets count 0.
            otherClass = list(
                set(labelMappingDict.values()) - set(predictedClasses))[0]
            levelCountDict[otherClass] = 0
            print(levelCountDict)
        total = float(
            sum([x for x in levelCountDict.values() if x != None]))
        levelCountTuple = [({
            "name": k,
            "count": v,
            "percentage": humanize.apnumber(v * 100 / total) + "%"
        }) for k, v in levelCountDict.items() if v != None]
        levelCountTuple = sorted(levelCountTuple,
                                 key=lambda x: x["count"],
                                 reverse=True)
        data_dict["blockSplitter"] = "|~NEWBLOCK~|"
        data_dict["targetcol"] = result_column
        data_dict["nlevel"] = len(levelCountDict.keys())
        data_dict["topLevel"] = levelCountTuple[0]
        data_dict["secondLevel"] = levelCountTuple[1]
        maincardSummary = NarrativesUtils.get_template_output(
            "/apps/", 'scorewithoutdtree.html', data_dict)
        main_card = NormalCard()
        main_card_data = []
        main_card_narrative = NarrativesUtils.block_splitter(
            maincardSummary, "|~NEWBLOCK~|")
        main_card_data += main_card_narrative
        chartData = NormalChartData([levelCountDict]).get_data()
        chartJson = ChartJson(data=chartData)
        chartJson.set_title(result_column)
        chartJson.set_chart_type("donut")
        mainCardChart = C3ChartData(data=chartJson)
        mainCardChart.set_width_percent(33)
        main_card_data.append(mainCardChart)
        uidTable = self._result_setter.get_unique_identifier_table()
        if uidTable != None:
            main_card_data.append(uidTable)
        main_card.set_card_data(main_card_data)
        main_card.set_card_name(
            "Predicting Key Drivers of {}".format(result_column))
        self._result_setter.set_score_dtree_cards([main_card], {})
def fit(self, output_column, input_columns=None):
    """Fit a Lasso (elasticNetParam=1.0) linear regression on the measures.

    :param output_column: measure column to predict (label).
    :param input_columns: measure columns to use as features; defaults to all
        numeric columns except ``output_column``, truncated to the configured
        ``noOfColumnsToUse``.
    :raises BIException: if output or any input column is not a measure column.
    :return: populated ``RegressionResult``.
    """
    # FIX: the Python-2 print statements in this method were syntax errors
    # under Python 3; converted to print() calls. Also use `is None` checks.
    print("linear regression fit started")
    if output_column not in self._dataframe_helper.get_numeric_columns():
        raise BIException('Output column: %s is not a measure column' %
                          (output_column, ))
    if input_columns is None:
        input_columns = list(
            set(self._dataframe_helper.get_numeric_columns()) -
            {output_column})
    nColsToUse = self._analysisDict[self._analysisName]["noOfColumnsToUse"]
    if nColsToUse is not None:
        input_columns = input_columns[:nColsToUse]
    if len(
            set(input_columns) -
            set(self._dataframe_helper.get_numeric_columns())) != 0:
        raise BIException(
            'At least one of the input columns %r is not a measure column'
            % (input_columns, ))
    all_measures = input_columns + [output_column]
    print(all_measures)
    measureDf = self._data_frame.select(all_measures)

    lr = LR(maxIter=LinearRegression.MAX_ITERATIONS,
            regParam=LinearRegression.REGULARIZATION_PARAM,
            elasticNetParam=1.0,
            labelCol=LinearRegression.LABEL_COLUMN_NAME,
            featuresCol=LinearRegression.FEATURES_COLUMN_NAME)
    st = time.time()
    pipeline = MLUtils.create_pyspark_ml_pipeline(input_columns, [],
                                                  output_column)
    pipelineModel = pipeline.fit(measureDf)
    training_df = pipelineModel.transform(measureDf)
    # The estimator reads the label from LABEL_COLUMN_NAME ("label").
    training_df = training_df.withColumn("label",
                                         training_df[output_column])
    print("time taken to create training_df", time.time() - st)
    # st = time.time()
    # training_df.cache()
    # print "caching in ",time.time()-st
    st = time.time()
    lr_model = lr.fit(training_df)
    lr_summary = lr_model.evaluate(training_df)
    print("lr model summary", time.time() - st)
    sample_data_dict = {}
    for input_col in input_columns:
        sample_data_dict[input_col] = None

    coefficients = [
        float(val) if val is not None else None
        for val in lr_model.coefficients.values
    ]
    # pValues are unavailable for some solver/regularization combinations.
    try:
        p_values = [
            float(val) if val is not None else None
            for val in lr_model.summary.pValues
        ]
    except Exception:
        p_values = [None] * len(coefficients)
    # print p_values
    # print coefficients
    regression_result = RegressionResult(output_column,
                                         list(set(input_columns)))
    regression_result.set_params(intercept=float(lr_model.intercept),\
                                 coefficients=coefficients,\
                                 p_values = p_values,\
                                 rmse=float(lr_summary.rootMeanSquaredError), \
                                 r2=float(lr_summary.r2),\
                                 sample_data_dict=sample_data_dict)

    self._completionStatus = self._dataframe_context.get_completion_status(
    )
    self._completionStatus += self._scriptWeightDict[
        self._analysisName]["script"]
    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                "regressionTrainingEnd",\
                                "info",\
                                self._scriptStages["regressionTrainingEnd"]["summary"],\
                                self._completionStatus,\
                                self._completionStatus)
    if self._ignoreRegressionElasticityMessages != True:
        CommonUtils.save_progress_message(
            self._messageURL,
            progressMessage,
            ignore=self._ignoreRegressionElasticityMessages)
        self._dataframe_context.update_completion_status(
            self._completionStatus)
    return regression_result
def __init__(self, data_frame, spark, dataframe_context):
    """Initialize metadata computation over a Spark data frame.

    Classifies columns by abstract type (measure/dimension/time-dimension/
    boolean, plus real-valued), records data-size stats, and publishes the
    initial "schema" progress message.

    :param data_frame: Spark DataFrame to profile.
    :param spark: active SparkSession.
    :param dataframe_context: project context supplying URLs, flags and
        completion status.
    """
    self._dataframe_context = dataframe_context
    self._completionStatus = self._dataframe_context.get_completion_status(
    )
    self._start_time = time.time()
    self._analysisName = "metadata"
    self._messageURL = self._dataframe_context.get_message_url()
    self._ignoreMsgFlag = self._dataframe_context.get_metadata_ignore_msg_flag(
    )
    # Stage weights sum to ~97; used to advance the completion percentage.
    self._scriptStages = {
        "schema": {
            "summary": "Loaded the data and Schema is Run",
            "weight": 12
        },
        "sampling": {
            "summary": "Sampling the dataframe",
            "weight": 5
        },
        "measurestats": {
            "summary": "calculating stats for measure columns",
            "weight": 25
        },
        "dimensionstats": {
            "summary": "calculating stats for dimension columns",
            "weight": 25
        },
        "timedimensionstats": {
            "summary": "calculating stats for time dimension columns",
            "weight": 5
        },
        "suggestions": {
            "summary": "Ignore and Date Suggestions",
            "weight": 25
        },
    }
    self._binned_stat_flag = True
    self._level_count_flag = True
    self._stripTimestamp = True
    self._data_frame = data_frame
    self._spark = spark
    self._total_columns = len(
        [field.name for field in self._data_frame.schema.fields])
    self._total_rows = self._data_frame.count()
    # Level-count cap: sqrt(nRows), bounded above by 200.
    self._max_levels = min(200, round(self._total_rows**0.5))
    self._percentage_columns = []
    # Partition schema fields by their abstract column type.
    self._numeric_columns = [
        field.name for field in self._data_frame.schema.fields
        if ColumnType(type(
            field.dataType)).get_abstract_data_type() == ColumnType.MEASURE
    ]
    self._string_columns = [
        field.name for field in self._data_frame.schema.fields
        if ColumnType(type(field.dataType)).get_abstract_data_type() ==
        ColumnType.DIMENSION
    ]
    self._timestamp_columns = [
        field.name for field in self._data_frame.schema.fields
        if ColumnType(type(field.dataType)).get_abstract_data_type() ==
        ColumnType.TIME_DIMENSION
    ]
    self._boolean_columns = [
        field.name for field in self._data_frame.schema.fields
        if ColumnType(type(field.dataType)).get_abstract_data_type() ==
        ColumnType.BOOLEAN
    ]
    # Real (floating-point) columns by actual (not abstract) data type.
    self._real_columns = [
        field.name for field in self._data_frame.schema.fields
        if ColumnType(type(
            field.dataType)).get_actual_data_type() == ColumnType.REAL
    ]
    self._column_type_dict = {}
    self._dataSize = {
        "nRows": self._total_rows,
        "nCols": self._total_columns,
        "nBooleans": None,
        "nMeasures": None,
        "nDimensions": None,
        "nTimeDimensions": None,
        "dimensionLevelCountDict": {},
        "totalLevels": None
    }
    self.update_column_type_dict()
    time_taken_schema = time.time() - self._start_time
    # FIX: was a Python-2 print statement (syntax error under Python 3).
    print("schema rendering takes", time_taken_schema)
    self._completionStatus += self._scriptStages["schema"]["weight"]
    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                "schema",\
                                "info",\
                                self._scriptStages["schema"]["summary"],\
                                self._completionStatus,\
                                self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL,
                                      progressMessage,
                                      ignore=self._ignoreMsgFlag)
    self._dataframe_context.update_completion_status(
        self._completionStatus)