def Predict(self):
    """Score ``self._data_frame`` with the trained Spark ML Naive Bayes pipeline.

    Steps, in order:

    1. Publish the "initialization" progress message.
    2. Load the persisted pipeline, predict ``result_column`` and write the
       scored rows to ``<score path>/data.csv``.
    3. When a unique-identifier column is available, register a table with
       the five highest-probability rows of every predicted class.
    4. Publish the "prediction" progress message.
    5. Run the decision-tree analysis on the scored data when at least two
       classes were predicted; otherwise build a single summary card from
       the ``scorewithoutdtree.html`` template.

    Nothing is returned; results are stored through ``self._score_summary``,
    ``self._result_setter``, ``self._metaParser`` and
    ``self._dataframe_context``.
    """

    def _format_percent(value):
        # humanize.apnumber spells out single digits ("five"), so values
        # below 10 are rendered as plain integers instead.
        if value >= 10:
            return humanize.apnumber(value) + "%"
        return str(int(value)) + "%"

    def _publish_stage(stage):
        # Advance the completion status by the stage weight and report it.
        self._completionStatus += (
            self._scriptWeightDict[self._analysisName]["total"]
            * self._scriptStages[stage]["weight"] / 10)
        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName, stage, "info",
            self._scriptStages[stage]["summary"],
            self._completionStatus, self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL, progressMessage)
        self._dataframe_context.update_completion_status(
            self._completionStatus)

    self._scriptWeightDict = (
        self._dataframe_context.get_ml_model_prediction_weight())
    self._scriptStages = {
        "initialization": {
            "summary": "Initialized the Naive Bayes Scripts",
            "weight": 2
        },
        "prediction": {
            "summary": "Spark ML Naive Bayes Model Prediction Finished",
            "weight": 2
        },
        "frequency": {
            "summary": "descriptive analysis finished",
            "weight": 2
        },
        "chisquare": {
            "summary": "chi Square analysis finished",
            "weight": 4
        },
        "completion": {
            "summary": "all analysis finished",
            "weight": 4
        },
    }
    _publish_stage("initialization")

    SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                        sparkSession=self._spark)
    level_counts_train = self._dataframe_context.get_level_count_dict()
    categorical_columns = self._dataframe_helper.get_string_columns()
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    time_dimension_columns = self._dataframe_helper.get_timestamp_columns()
    result_column = self._dataframe_context.get_result_column()
    categorical_columns = [
        x for x in categorical_columns if x != result_column
    ]
    feature_columns = (categorical_columns + time_dimension_columns +
                       numerical_columns)

    level_counts_score = CommonUtils.get_level_count_dict(
        self._data_frame,
        categorical_columns,
        self._dataframe_context.get_column_separator(),
        output_type="dict",
        dataType="spark")
    # NOTE(review): the outcome of this train/score level comparison is not
    # read anywhere in this method -- confirm whether it should gate scoring.
    dataSanity = all(
        key in level_counts_score
        and level_counts_train[key] == level_counts_score[key]
        for key in level_counts_train)

    score_data_path = self._dataframe_context.get_score_path() + "/data.csv"
    trained_model_path = self._dataframe_context.get_model_path()
    trained_model_path = (
        "/".join(trained_model_path.split("/")[:-1]) + "/" + self._slug +
        "/" + self._dataframe_context.get_model_for_scoring())

    pipelineModel = MLUtils.load_pipeline(trained_model_path)
    df = self._data_frame
    transformed = pipelineModel.transform(df)
    label_indexer_dict = MLUtils.read_string_indexer_mapping(
        trained_model_path, SQLctx)
    # Map the numeric prediction back to the original label.
    prediction_to_levels = udf(lambda x: label_indexer_dict[x], StringType())
    transformed = transformed.withColumn(
        result_column, prediction_to_levels(transformed.prediction))

    self._score_summary["result_column"] = result_column
    if "probability" in transformed.columns:
        probability_dataframe = transformed.select(
            [result_column, "probability"]).toPandas()
        probability_dataframe = probability_dataframe.rename(
            index=str, columns={result_column: "predicted_class"})
        # The predicted probability is the largest entry of the
        # per-class probability vector.
        probability_dataframe["predicted_probability"] = (
            probability_dataframe["probability"].apply(lambda x: max(x)))
        self._score_summary["prediction_split"] = (
            MLUtils.calculate_scored_probability_stats(
                probability_dataframe))
        scored_dataframe = transformed.select(
            feature_columns + [result_column, "probability"]).toPandas()
        scored_dataframe["predicted_probability"] = (
            probability_dataframe["predicted_probability"].values)
    else:
        self._score_summary["prediction_split"] = []
        scored_dataframe = transformed.select(
            feature_columns + [result_column]).toPandas()

    labelMappingDict = self._dataframe_context.get_label_map()
    if score_data_path.startswith("file"):
        # Strip the leading "file://" scheme before writing locally.
        score_data_path = score_data_path[7:]
    scored_dataframe.to_csv(score_data_path, header=True, index=False)

    uidCol = self._dataframe_context.get_uid_column()
    if uidCol is None:
        uidCols = self._metaParser.get_suggested_uid_columns()
        if len(uidCols) > 0:
            uidCol = uidCols[0]

    predictedClasses = list(scored_dataframe[result_column].unique())
    # The table needs the probability column, which only exists when the
    # model emitted one, and at least one predicted class to concatenate.
    if (uidCol and uidCol in df.columns and predictedClasses
            and "predicted_probability" in scored_dataframe.columns):
        topRows = []
        for level in predictedClasses:
            levelDf = scored_dataframe[
                scored_dataframe[result_column] == level]
            levelDf = levelDf[[uidCol, "predicted_probability",
                               result_column]]
            # Work on a copy so the slice of scored_dataframe is not
            # modified in place.
            levelDf = levelDf.sort_values(by="predicted_probability",
                                          ascending=False)[:5].copy()
            levelDf["predicted_probability"] = levelDf[
                "predicted_probability"].apply(
                    lambda x: _format_percent(x * 100))
            topRows.append(levelDf)
        uidTableData = [list(arr) for arr in pd.concat(topRows).values]
        uidTableData = [[uidCol, "Probability", result_column]
                        ] + uidTableData
        uidTable = TableData()
        uidTable.set_table_width(25)
        uidTable.set_table_data(uidTableData)
        uidTable.set_table_type("normalHideColumn")
        self._result_setter.set_unique_identifier_table(
            json.loads(CommonUtils.convert_python_object_to_json(uidTable)))

    _publish_stage("prediction")

    print("STARTING DIMENSION ANALYSIS ...")
    columns_to_keep = self._dataframe_context.get_score_consider_columns()
    if len(columns_to_keep) > 0:
        columns_to_drop = list(set(df.columns) - set(columns_to_keep))
    else:
        columns_to_drop = ["predicted_probability"]
    scored_df = transformed.select(feature_columns + [result_column])
    # Never drop the result column: it is grouped on right below.
    columns_to_drop = [
        x for x in columns_to_drop
        if x in scored_df.columns and x != result_column
    ]
    modified_df = scored_df.select(
        [x for x in scored_df.columns if x not in columns_to_drop])
    resultColLevelCount = dict(
        modified_df.groupby(result_column).count().collect())
    self._metaParser.update_column_dict(
        result_column, {
            "LevelCount": resultColLevelCount,
            "numberOfUniqueValues": len(resultColLevelCount.keys())
        })
    self._dataframe_context.set_story_on_scored_data(True)
    self._dataframe_context.update_consider_columns(columns_to_keep)
    df_helper = DataFrameHelper(modified_df, self._dataframe_context,
                                self._metaParser)
    df_helper.set_params()
    spark_scored_df = df_helper.get_data_frame()

    if len(predictedClasses) >= 2:
        try:
            df_decision_tree_obj = DecisionTrees(
                spark_scored_df,
                df_helper,
                self._dataframe_context,
                self._spark,
                self._metaParser,
                scriptWeight=self._scriptWeightDict,
                analysisName=self._analysisName).test_all(
                    dimension_columns=[result_column])
            narratives_obj = CommonUtils.as_dict(
                DecisionTreeNarrative(result_column,
                                      df_decision_tree_obj,
                                      self._dataframe_helper,
                                      self._dataframe_context,
                                      self._metaParser,
                                      self._result_setter,
                                      story_narrative=None,
                                      analysisName=self._analysisName,
                                      scriptWeight=self._scriptWeightDict))
            print(narratives_obj)
        except Exception as e:
            print("DecisionTree Analysis Failed ", str(e))
        return

    # Fewer than two predicted classes: a decision tree is not possible, so
    # summarise the level split in a single card instead.
    data_dict = {
        "npred": len(predictedClasses),
        "nactual": len(labelMappingDict.values())
    }
    levelCountDict = {}
    if data_dict["nactual"] > 2:
        if predictedClasses:
            levelCountDict[predictedClasses[0]] = resultColLevelCount[
                predictedClasses[0]]
            levelCountDict["Others"] = sum(
                v for k, v in resultColLevelCount.items()
                if k != predictedClasses[0])
    else:
        # Copy so the dict already handed to the meta parser is not mutated.
        levelCountDict = dict(resultColLevelCount)
        missingClasses = list(
            set(labelMappingDict.values()) - set(predictedClasses))
        if missingClasses:
            levelCountDict[missingClasses[0]] = 0
    print(levelCountDict)

    total = float(sum(x for x in levelCountDict.values() if x is not None))
    levelCountTuple = [{
        "name": k,
        "count": v,
        "percentage": _format_percent(v * 100 / total if total else 0)
    } for k, v in levelCountDict.items() if v is not None]
    levelCountTuple = sorted(levelCountTuple,
                             key=lambda x: x["count"],
                             reverse=True)
    data_dict["blockSplitter"] = "|~NEWBLOCK~|"
    data_dict["targetcol"] = result_column
    data_dict["nlevel"] = len(levelCountDict.keys())
    data_dict["topLevel"] = levelCountTuple[0] if levelCountTuple else None
    data_dict["secondLevel"] = (levelCountTuple[1]
                                if len(levelCountTuple) > 1 else None)
    maincardSummary = NarrativesUtils.get_template_output(
        "/apps/", 'scorewithoutdtree.html', data_dict)

    main_card = NormalCard()
    main_card_data = []
    main_card_data += NarrativesUtils.block_splitter(maincardSummary,
                                                     "|~NEWBLOCK~|")
    chartData = NormalChartData([levelCountDict]).get_data()
    chartJson = ChartJson(data=chartData)
    chartJson.set_title(result_column)
    chartJson.set_chart_type("donut")
    mainCardChart = C3ChartData(data=chartJson)
    mainCardChart.set_width_percent(33)
    main_card_data.append(mainCardChart)
    uidTable = self._result_setter.get_unique_identifier_table()
    if uidTable is not None:
        main_card_data.append(uidTable)
    main_card.set_card_data(main_card_data)
    main_card.set_card_name(
        "Predicting Key Drivers of {}".format(result_column))
    self._result_setter.set_score_dtree_cards([main_card], {})
def Predict(self):
    """Score ``self._data_frame`` with the trained sklearn Random Forest model.

    Steps, in order:

    1. Publish the "initialization" progress message.
    2. Load the pickled model, predict class and probability for every row
       and write the scored rows to ``<score path>/data.csv``.
    3. When a unique-identifier column is available, register a table with
       the five highest-probability rows of every predicted class.
    4. Publish the "prediction" progress message.
    5. Run the decision-tree analysis on the scored data when at least two
       classes were predicted; otherwise build a single summary card from
       the ``scorewithoutdtree.html`` template.

    Only ``self._mlEnv == "sklearn"`` is implemented; for any other
    environment the method returns after the initialization message.
    The "frequency" and "chisquare" stages are declared in
    ``self._scriptStages`` but no analysis is run for them here.

    Nothing is returned; results are stored through ``self._score_summary``,
    ``self._result_setter``, ``self._metaParser`` and
    ``self._dataframe_context``.
    """

    def _format_percent(value):
        # humanize.apnumber spells out single digits ("five"), so values
        # below 10 are rendered as plain integers instead.
        if value >= 10:
            return humanize.apnumber(value) + "%"
        return str(int(value)) + "%"

    def _publish_stage(stage):
        # Advance the completion status by the stage weight and report it.
        self._completionStatus += old_div(
            self._scriptWeightDict[self._analysisName]["total"] *
            self._scriptStages[stage]["weight"], 10)
        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName, stage, "info",
            self._scriptStages[stage]["summary"],
            self._completionStatus, self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsg)
        self._dataframe_context.update_completion_status(
            self._completionStatus)

    self._scriptWeightDict = (
        self._dataframe_context.get_ml_model_prediction_weight())
    self._scriptStages = {
        "initialization": {
            "summary": "Initialized the Random Forest Scripts",
            "weight": 2
        },
        "prediction": {
            "summary": "Random Forest Model Prediction Finished",
            "weight": 2
        },
        "frequency": {
            "summary": "descriptive analysis finished",
            "weight": 2
        },
        "chisquare": {
            "summary": "chi Square analysis finished",
            "weight": 4
        },
        "completion": {
            "summary": "all analysis finished",
            "weight": 4
        },
    }
    _publish_stage("initialization")

    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns) - set(allDateCols))
    result_column = self._dataframe_context.get_result_column()

    if self._mlEnv != "sklearn":
        # The spark branch was an empty placeholder; everything below
        # depends on values produced by the sklearn scoring path.
        return

    score_data_path = self._dataframe_context.get_score_path() + "/data.csv"
    if score_data_path.startswith("file"):
        # Strip the leading "file://" scheme before writing locally.
        score_data_path = score_data_path[7:]
    trained_model_path = self._dataframe_context.get_model_path()
    trained_model_path += (
        "/" + self._dataframe_context.get_model_for_scoring() + ".pkl")
    if trained_model_path.startswith("file"):
        trained_model_path = trained_model_path[7:]
    # NOTE: joblib.load unpickles the file, so the model path must point to
    # a trusted artifact.
    trained_model = joblib.load(trained_model_path)

    df = self._data_frame.toPandas()
    model_columns = self._dataframe_context.get_model_features()
    pandas_df = MLUtils.create_dummy_columns(
        df, [x for x in categorical_columns if x != result_column])
    pandas_df = MLUtils.fill_missing_columns(pandas_df, model_columns,
                                             result_column)
    if uid_col:
        pandas_df = pandas_df[[
            x for x in pandas_df.columns if x != uid_col
        ]]
    y_score = trained_model.predict(pandas_df)
    y_prob = trained_model.predict_proba(pandas_df)
    y_prob = MLUtils.calculate_predicted_probability(y_prob)
    y_prob = [round(x, 2) for x in y_prob]

    labelMappingDict = self._dataframe_context.get_label_map()
    df["predicted_class"] = y_score
    # Translate the encoded prediction back to the original label.
    df["predicted_class"] = df["predicted_class"].apply(
        lambda x: labelMappingDict[x] if x is not None else "NA")
    df["predicted_probability"] = y_prob
    self._score_summary["prediction_split"] = (
        MLUtils.calculate_scored_probability_stats(df))
    self._score_summary["result_column"] = result_column
    # The prediction replaces any result column present in the input.
    if result_column in df.columns:
        df.drop(result_column, axis=1, inplace=True)
    df = df.rename(index=str, columns={"predicted_class": result_column})
    df.to_csv(score_data_path, header=True, index=False)

    uidCol = self._dataframe_context.get_uid_column()
    if uidCol is None:
        uidCols = self._metaParser.get_suggested_uid_columns()
        if len(uidCols) > 0:
            uidCol = uidCols[0]

    predictedClasses = list(df[result_column].unique())
    # pd.concat needs at least one frame, hence the predictedClasses check.
    if uidCol and uidCol in df.columns and predictedClasses:
        topRows = []
        for level in predictedClasses:
            levelDf = df[df[result_column] == level]
            levelDf = levelDf[[uidCol, "predicted_probability",
                               result_column]]
            # Work on a copy so the slice of df is not modified in place.
            levelDf = levelDf.sort_values(by="predicted_probability",
                                          ascending=False)[:5].copy()
            levelDf["predicted_probability"] = levelDf[
                "predicted_probability"].apply(
                    lambda x: _format_percent(x * 100))
            topRows.append(levelDf)
        uidTableData = [list(arr) for arr in pd.concat(topRows).values]
        uidTableData = [[uidCol, "Probability", result_column]
                        ] + uidTableData
        uidTable = TableData()
        uidTable.set_table_width(25)
        uidTable.set_table_data(uidTableData)
        uidTable.set_table_type("normalHideColumn")
        self._result_setter.set_unique_identifier_table(
            json.loads(CommonUtils.convert_python_object_to_json(uidTable)))

    _publish_stage("prediction")

    print("STARTING DIMENSION ANALYSIS ...")
    columns_to_keep = self._dataframe_context.get_score_consider_columns()
    if len(columns_to_keep) > 0:
        columns_to_drop = list(set(df.columns) - set(columns_to_keep))
    else:
        columns_to_drop = ["predicted_probability"]
    # Never drop the result column: the level counts below are built on it.
    columns_to_drop = [
        x for x in columns_to_drop
        if x in df.columns and x != result_column
    ]
    print("columns_to_drop", columns_to_drop)
    df.drop(columns_to_drop, axis=1, inplace=True)

    resultColLevelCount = dict(df[result_column].value_counts())
    self._metaParser.update_column_dict(
        result_column, {
            "LevelCount": resultColLevelCount,
            "numberOfUniqueValues": len(list(resultColLevelCount.keys()))
        })
    self._dataframe_context.set_story_on_scored_data(True)
    SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                        sparkSession=self._spark)
    spark_scored_df = SQLctx.createDataFrame(df)
    # TODO update metadata for the newly created dataframe
    self._dataframe_context.update_consider_columns(columns_to_keep)
    df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context,
                                self._metaParser)
    df_helper.set_params()
    spark_scored_df = df_helper.get_data_frame()

    if len(predictedClasses) >= 2:
        try:
            df_decision_tree_obj = DecisionTrees(
                spark_scored_df,
                df_helper,
                self._dataframe_context,
                self._spark,
                self._metaParser,
                scriptWeight=self._scriptWeightDict,
                analysisName=self._analysisName).test_all(
                    dimension_columns=[result_column])
            narratives_obj = CommonUtils.as_dict(
                DecisionTreeNarrative(result_column,
                                      df_decision_tree_obj,
                                      self._dataframe_helper,
                                      self._dataframe_context,
                                      self._metaParser,
                                      self._result_setter,
                                      story_narrative=None,
                                      analysisName=self._analysisName,
                                      scriptWeight=self._scriptWeightDict))
            print(narratives_obj)
        except Exception as e:
            # Best effort: scoring already succeeded, so report the reason
            # instead of failing the whole run.
            print("DecisionTree Analysis Failed ", str(e))
        return

    # Fewer than two predicted classes: a decision tree is not possible, so
    # summarise the level split in a single card instead.
    data_dict = {
        "npred": len(predictedClasses),
        "nactual": len(list(labelMappingDict.values()))
    }
    levelCountDict = {}
    if data_dict["nactual"] > 2:
        if predictedClasses:
            levelCountDict[predictedClasses[0]] = resultColLevelCount[
                predictedClasses[0]]
            levelCountDict["Others"] = sum(
                v for k, v in resultColLevelCount.items()
                if k != predictedClasses[0])
    else:
        # Copy so the dict already handed to the meta parser is not mutated.
        levelCountDict = dict(resultColLevelCount)
        missingClasses = list(
            set(labelMappingDict.values()) - set(predictedClasses))
        if missingClasses:
            levelCountDict[missingClasses[0]] = 0
    print(levelCountDict)

    total = float(sum(x for x in levelCountDict.values() if x is not None))
    levelCountTuple = [{
        "name": k,
        "count": v,
        "percentage": _format_percent(
            old_div(v * 100, total) if total else 0)
    } for k, v in levelCountDict.items() if v is not None]
    levelCountTuple = sorted(levelCountTuple,
                             key=lambda x: x["count"],
                             reverse=True)
    data_dict["blockSplitter"] = "|~NEWBLOCK~|"
    data_dict["targetcol"] = result_column
    data_dict["nlevel"] = len(list(levelCountDict.keys()))
    data_dict["topLevel"] = levelCountTuple[0] if levelCountTuple else None
    data_dict["secondLevel"] = (levelCountTuple[1]
                                if len(levelCountTuple) > 1 else None)
    maincardSummary = NarrativesUtils.get_template_output(
        "/apps/", 'scorewithoutdtree.html', data_dict)

    main_card = NormalCard()
    main_card_data = []
    main_card_data += NarrativesUtils.block_splitter(maincardSummary,
                                                     "|~NEWBLOCK~|")
    chartData = NormalChartData([levelCountDict]).get_data()
    chartJson = ChartJson(data=chartData)
    chartJson.set_title(result_column)
    chartJson.set_chart_type("donut")
    mainCardChart = C3ChartData(data=chartJson)
    mainCardChart.set_width_percent(33)
    main_card_data.append(mainCardChart)
    uidTable = self._result_setter.get_unique_identifier_table()
    if uidTable is not None:
        main_card_data.append(uidTable)
    main_card.set_card_data(main_card_data)
    main_card.set_card_name(
        "Predicting Key Drivers of {}".format(result_column))
    self._result_setter.set_score_dtree_cards([main_card], {})
def _generate_summary(self):
    """Build the main decision-tree card and add it to the narrative node.

    The card holds, in order: the narrative rendered from
    ``decisiontreesummary.html`` (regular story) or
    ``decisiontreescore.html`` (story on scored data), a donut chart, the
    target-level dropdown, the prediction-rule table and, when the result
    setter has one, the unique-identifier table.

    Reads the rules from ``self._table`` and the per-target statistics from
    ``self.success_percent``, ``self.total_predictions`` and
    ``self.successful_predictions``. Publishes a progress message, refreshes
    ``self._completionStatus`` from the dataframe context, and returns
    nothing.
    """
    data_dict = {
        "blockSplitter": self._blockSplitter,
        "targetcol": self._colname,
    }
    rules_dict = self._table
    probabilityCutoff = 75
    # Index 0 counts the "strong" rules (>= cutoff), index 1 the "mixed"
    # rules (< cutoff). "range" is passed to the template unchanged.
    probabilityGroups = [{
        "probability": probabilityCutoff,
        "count": 0,
        "range": [probabilityCutoff, 100]
    }, {
        "probability": probabilityCutoff - 1,
        "count": 0,
        "range": [0, probabilityCutoff - 1]
    }]
    tableArray = [[
        "Prediction Rule", "Probability", "Prediction", "Freq", "group",
        "richRules"
    ]]
    dropdownData = []
    probabilityArrayAll = []
    targetLevel = self._dataframe_context.get_target_level_for_model()

    self._completionStatus = self._dataframe_context.get_completion_status()
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName,
        "custom",
        "info",
        "Generating Prediction rules",
        self._completionStatus,
        self._completionStatus,
        display=True)
    CommonUtils.save_progress_message(self._messageURL,
                                      progressMessage,
                                      ignore=False)
    self._dataframe_context.update_completion_status(self._completionStatus)

    # Context lookups that do not change per target are read once.
    # "== True" is kept deliberately: the flag may be None.
    storyOnScoredData = (
        self._dataframe_context.get_story_on_scored_data() == True)
    analysisType = self._dataframe_context.get_analysis_type()
    targetCol = self._dataframe_context.get_result_column()
    customDetails = self._dataframe_context.get_custom_analysis_details()
    binFlag = (customDetails is not None
               and targetCol in [x["colName"] for x in customDetails])

    # The modelled target level, when present, is listed (and selected) first.
    targetValues = ([x for x in rules_dict if x == targetLevel] +
                    [x for x in rules_dict if x != targetLevel])
    for idx, target in enumerate(targetValues):
        if storyOnScoredData:
            displayName = "{} : {}".format(self._colname, target)
        else:
            displayName = target
        dropdownData.append({
            "displayName": displayName,
            "name": target,
            "selected": idx == 0,
            "id": idx + 1
        })

        rulesArray = rules_dict[target]
        probabilityArray = [
            round(x, 2) for x in self.success_percent[target]
        ]
        probabilityArrayAll += probabilityArray
        groupArray = [
            "strong" if x >= probabilityCutoff else "mixed"
            for x in probabilityArray
        ]
        # Count from the labels so that fractional values between
        # cutoff - 1 and cutoff (e.g. 74.5) are not lost between the two
        # closed ranges.
        probabilityGroups[0]["count"] += groupArray.count("strong")
        probabilityGroups[1]["count"] += groupArray.count("mixed")

        predictionArray = [target] * len(rulesArray)
        freqArray = self.total_predictions[target]
        success = self.successful_predictions[target]
        success_percent = self.success_percent[target]
        richRulesArray = []
        crudeRuleArray = []
        for idx2, crudeRule in enumerate(rulesArray):
            richRule, crudeRule = NarrativesUtils.generate_rules(
                self._colname,
                target,
                crudeRule,
                freqArray[idx2],
                success[idx2],
                success_percent[idx2],
                analysisType,
                binFlag=binFlag)
            richRulesArray.append(richRule)
            crudeRuleArray.append(crudeRule)

        probabilityLabels = [
            humanize.apnumber(x) + "%" if x >= 10 else str(int(x)) + "%"
            for x in probabilityArray
        ]
        tableArray += [
            list(row)
            for row in zip(crudeRuleArray, probabilityLabels,
                           predictionArray, freqArray, groupArray,
                           richRulesArray)
        ]

    donutChartMaxLevel = 10
    if storyOnScoredData:
        # Scored story: chart the number of rules per probability range.
        probabilityRangeForChart = (
            GLOBALSETTINGS.PROBABILITY_RANGE_FOR_DONUT_CHART)
        chartDict = dict(
            zip(probabilityRangeForChart.keys(),
                [0] * len(probabilityRangeForChart)))
        for val in probabilityArrayAll:
            for grps, grpRange in probabilityRangeForChart.items():
                if val > grpRange[0] and val <= grpRange[1]:
                    chartDict[grps] = chartDict[grps] + 1
    else:
        # Regular story: chart the total predictions per target level.
        chartDict = {k: sum(v) for k, v in self.total_predictions.items()}
    chartDict = {k: v for k, v in chartDict.items() if v != 0}
    if len(chartDict) > donutChartMaxLevel:
        chartDict = NarrativesUtils.restructure_donut_chart_data(
            chartDict, nLevels=donutChartMaxLevel)

    chartData = NormalChartData([chartDict]).get_data()
    chartJson = ChartJson(data=chartData)
    chartJson.set_title(self._colname)
    chartJson.set_chart_type("donut")
    mainCardChart = C3ChartData(data=chartJson)
    mainCardChart.set_width_percent(45)
    dropdownDict = {
        "dataType": "dropdown",
        "label": "Showing prediction rules for",
        "data": dropdownData
    }
    data_dict["probabilityGroups"] = probabilityGroups

    if not storyOnScoredData:
        maincardSummary = NarrativesUtils.get_template_output(
            self._base_dir, 'decisiontreesummary.html', data_dict)
    else:
        # Total frequency per predicted level, taken from the table's
        # "Prediction" (index 2) and "Freq" (index 3) columns.
        levelCountDict = {}
        for row in tableArray[1:]:
            levelCountDict[row[2]] = levelCountDict.get(row[2], 0) + row[3]
        total = float(sum(levelCountDict.values()))
        levelCountTuple = [{
            "name": k,
            "count": v,
            "percentage": round(v * 100 / total, 2) if total else 0.0
        } for k, v in levelCountDict.items()]
        percentageArray = [x["percentage"] for x in levelCountTuple]
        percentageArray = NarrativesUtils.ret_smart_round(percentageArray)
        levelCountTuple = [{
            "name": obj["name"],
            "count": obj["count"],
            "percentage": str(percentageArray[idx]) + "%"
        } for idx, obj in enumerate(levelCountTuple)]
        data_dict["nlevel"] = len(levelCountDict)
        # Single-argument print gives the same output on Python 2 and 3.
        print("levelCountTuple {}".format(levelCountTuple))
        print("levelCountDict {}".format(levelCountDict))
        if targetLevel in levelCountDict:
            data_dict["topLevel"] = [
                x for x in levelCountTuple if x["name"] == targetLevel
            ][0]
            if len(levelCountTuple) > 1:
                data_dict["secondLevel"] = max(
                    [x for x in levelCountTuple if x["name"] != targetLevel],
                    key=lambda x: x["count"])
            else:
                data_dict["secondLevel"] = None
        else:
            data_dict["topLevel"] = (levelCountTuple[0]
                                     if levelCountTuple else None)
            data_dict["secondLevel"] = (levelCountTuple[1] if
                                        len(levelCountTuple) > 1 else None)
        print(data_dict)
        maincardSummary = NarrativesUtils.get_template_output(
            self._base_dir, 'decisiontreescore.html', data_dict)

    main_card = NormalCard()
    main_card_data = []
    main_card_data += NarrativesUtils.block_splitter(maincardSummary,
                                                     self._blockSplitter)
    main_card_data.append(mainCardChart)
    main_card_data.append(dropdownDict)

    main_card_table = TableData()
    if storyOnScoredData:
        main_card_table.set_table_width(75)
    main_card_table.set_table_data(tableArray)
    main_card_table.set_table_type("popupDecisionTreeTable")
    main_card_data.append(main_card_table)
    uidTable = self._result_setter.get_unique_identifier_table()
    if uidTable is not None:
        main_card_data.append(uidTable)
    else:
        # Without a unique-identifier table the rule table takes the
        # full width.
        main_card_table.set_table_width(100)

    main_card.set_card_data(main_card_data)
    main_card.set_card_name("Predicting Key Drivers of {}".format(
        self._colname))
    self._decisionTreeNode.add_a_card(main_card)