def Predict(self):
    self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight()
    self._scriptStages = {
        "initialization": {
            "summary": "Initialized the Decision Tree Regression Scripts",
            "weight": 2
        },
        "predictionStart": {
            "summary": "Decision Tree Regression Model Prediction Started",
            "weight": 2
        },
        "predictionFinished": {
            "summary": "Decision Tree Regression Model Prediction Finished",
            "weight": 6
        }
    }
    CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
    SQLctx = SQLContext(sparkContext=self._spark.sparkContext, sparkSession=self._spark)
    dataSanity = True
    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns) - set(allDateCols))
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    result_column = self._dataframe_context.get_result_column()
    test_data_path = self._dataframe_context.get_input_file()
    if self._mlEnv == "spark":
        score_data_path = self._dataframe_context.get_score_path() + "/data.csv"
        trained_model_path = "file://" + self._dataframe_context.get_model_path()
        trained_model_path += "/model"
        pipeline_path = "/".join(trained_model_path.split("/")[:-1]) + "/pipeline"
        print "trained_model_path", trained_model_path
        print "pipeline_path", pipeline_path
        print "score_data_path", score_data_path
        pipelineModel = MLUtils.load_pipeline(pipeline_path)
        trained_model = MLUtils.load_dtree_regresssion_pyspark_model(trained_model_path)
        df = self._data_frame
        indexed = pipelineModel.transform(df)
        transformed = trained_model.transform(indexed)
        if result_column in transformed.columns:
            transformed = transformed.withColumnRenamed(result_column, "originalLabel")
        transformed = transformed.withColumnRenamed("prediction", result_column)
        pandas_scored_df = transformed.select(list(set(self._data_frame.columns + [result_column]))).toPandas()
        if score_data_path.startswith("file"):
            score_data_path = score_data_path[7:]
        pandas_scored_df.to_csv(score_data_path, header=True, index=False)
        print "STARTING Measure ANALYSIS ..."
        columns_to_keep = []
        columns_to_drop = []
        columns_to_keep = self._dataframe_context.get_score_consider_columns()
        if len(columns_to_keep) > 0:
            columns_to_drop = list(set(df.columns) - set(columns_to_keep))
        else:
            columns_to_drop += ["predicted_probability"]
        columns_to_drop = [x for x in columns_to_drop if x in df.columns and x != result_column]
        print "columns_to_drop", columns_to_drop
        spark_scored_df = transformed.select(list(set(columns_to_keep + [result_column])))
    elif self._mlEnv == "sklearn":
        CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "predictionStart", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
        score_data_path = self._dataframe_context.get_score_path() + "/data.csv"
        trained_model_path = "file://" + self._dataframe_context.get_model_path()
        trained_model_path += "/" + self._dataframe_context.get_model_for_scoring() + ".pkl"
        print "trained_model_path", trained_model_path
        print "score_data_path", score_data_path
        if trained_model_path.startswith("file"):
            trained_model_path = trained_model_path[7:]
        trained_model = joblib.load(trained_model_path)
        model_columns = self._dataframe_context.get_model_features()
        print "model_columns", model_columns
        df = self._data_frame.toPandas()
        # pandas_df = MLUtils.factorize_columns(df,[x for x in categorical_columns if x != result_column])
        pandas_df = MLUtils.create_dummy_columns(df, [x for x in categorical_columns if x != result_column])
        pandas_df = MLUtils.fill_missing_columns(pandas_df, model_columns, result_column)
        if uid_col:
            pandas_df = pandas_df[[x for x in pandas_df.columns if x != uid_col]]
        y_score = trained_model.predict(pandas_df)
        scoreKpiArray = MLUtils.get_scored_data_summary(y_score)
        kpiCard = NormalCard()
        kpiCardData = [KpiData(data=x) for x in scoreKpiArray]
        kpiCard.set_card_data(kpiCardData)
        kpiCard.set_cente_alignment(True)
        print CommonUtils.convert_python_object_to_json(kpiCard)
        self._result_setter.set_kpi_card_regression_score(kpiCard)
        pandas_df[result_column] = y_score
        df[result_column] = y_score
        df.to_csv(score_data_path, header=True, index=False)
        CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "predictionFinished", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
        print "STARTING Measure ANALYSIS ..."
        columns_to_keep = []
        columns_to_drop = []
        columns_to_keep = self._dataframe_context.get_score_consider_columns()
        if len(columns_to_keep) > 0:
            columns_to_drop = list(set(df.columns) - set(columns_to_keep))
        else:
            columns_to_drop += ["predicted_probability"]
        columns_to_drop = [x for x in columns_to_drop if x in df.columns and x != result_column]
        print "columns_to_drop", columns_to_drop
        pandas_scored_df = df[list(set(columns_to_keep + [result_column]))]
        spark_scored_df = SQLctx.createDataFrame(pandas_scored_df)
        # spark_scored_df.write.csv(score_data_path+"/data",mode="overwrite",header=True)
        # TODO update metadata for the newly created dataframe
        self._dataframe_context.update_consider_columns(columns_to_keep)
        print spark_scored_df.printSchema()
    df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context, self._metaParser)
    df_helper.set_params()
    df = df_helper.get_data_frame()
    # self._dataframe_context.set_dont_send_message(True)
    try:
        fs = time.time()
        descr_stats_obj = DescriptiveStatsScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark, self._prediction_narrative, scriptWeight=self._scriptWeightDict, analysisName="Descriptive analysis")
        descr_stats_obj.Run()
        print "DescriptiveStats Analysis Done in ", time.time() - fs, " seconds."
    except:
        print "Frequency Analysis Failed "
    # try:
    #     fs = time.time()
    #     df_helper.fill_na_dimension_nulls()
    #     df = df_helper.get_data_frame()
    #     dt_reg = DecisionTreeRegressionScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark,self._prediction_narrative,self._metaParser,scriptWeight=self._scriptWeightDict,analysisName="Predictive modeling")
    #     dt_reg.Run()
    #     print "DecisionTrees Analysis Done in ", time.time() - fs, " seconds."
    # except:
    #     print "DTREE FAILED"
    try:
        fs = time.time()
        two_way_obj = TwoWayAnovaScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark, self._prediction_narrative, self._metaParser, scriptWeight=self._scriptWeightDict, analysisName="Measure vs. Dimension")
        two_way_obj.Run()
        print "OneWayAnova Analysis Done in ", time.time() - fs, " seconds."
    except:
        print "Anova Analysis Failed"
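# --- Illustrative sketch (not part of the original script) ---
# MLUtils.create_dummy_columns and MLUtils.fill_missing_columns are used above to
# one-hot encode the scoring frame and re-align it to the feature set the model was
# trained on. A minimal pandas-only equivalent, assuming that is all the helpers do,
# could look like this (align_scoring_frame is a hypothetical name):
import pandas as pd

def align_scoring_frame(df, categorical_columns, model_columns, result_column):
    # One-hot encode the categorical columns of the scoring data.
    dummied = pd.get_dummies(df, columns=categorical_columns)
    # Re-align to the trained feature columns: unseen dummy levels are dropped,
    # missing levels come back as all-zero columns, column order matches training.
    features = [col for col in model_columns if col != result_column]
    return dummied.reindex(columns=features, fill_value=0)

# Usage: features_df = align_scoring_frame(df, categorical_columns, model_columns, result_column)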
def Predict(self):
    self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight()
    self._scriptStages = {
        "initialization": {
            "summary": "Initialized the Generalized Linear Regression Scripts",
            "weight": 2
        },
        "predictionStart": {
            "summary": "Generalized Linear Regression Model Prediction Started",
            "weight": 2
        },
        "predictionFinished": {
            "summary": "Generalized Linear Regression Model Prediction Finished",
            "weight": 6
        }
    }
    CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
    SQLctx = SQLContext(sparkContext=self._spark.sparkContext, sparkSession=self._spark)
    dataSanity = True
    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns) - set(allDateCols))
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    result_column = self._dataframe_context.get_result_column()
    test_data_path = self._dataframe_context.get_input_file()
    CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "predictionStart", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
    test_data_path = self._dataframe_context.get_input_file()
    score_data_path = self._dataframe_context.get_score_path() + "/data.csv"
    trained_model_path = "file://" + self._dataframe_context.get_model_path()
    trained_model_path += "/model"
    pipeline_path = "/".join(trained_model_path.split("/")[:-1]) + "/pipeline"
    print "trained_model_path", trained_model_path
    print "pipeline_path", pipeline_path
    print "score_data_path", score_data_path
    pipelineModel = MLUtils.load_pipeline(pipeline_path)
    trained_model = MLUtils.load_generalized_linear_regresssion_pyspark_model(trained_model_path)
    df = self._data_frame
    indexed = pipelineModel.transform(df)
    transformed = trained_model.transform(indexed)
    if result_column in transformed.columns:
        transformed = transformed.withColumnRenamed(result_column, "originalLabel")
    transformed = transformed.withColumnRenamed("prediction", result_column)
    pandas_scored_df = transformed.select(list(set(self._data_frame.columns + [result_column]))).toPandas()
    if score_data_path.startswith("file"):
        score_data_path = score_data_path[7:]
    pandas_scored_df.to_csv(score_data_path, header=True, index=False)
    CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "predictionFinished", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
    print "STARTING Measure ANALYSIS ..."
    columns_to_keep = []
    columns_to_drop = []
    columns_to_keep = self._dataframe_context.get_score_consider_columns()
    if len(columns_to_keep) > 0:
        columns_to_drop = list(set(df.columns) - set(columns_to_keep))
    else:
        columns_to_drop += ["predicted_probability"]
    columns_to_drop = [x for x in columns_to_drop if x in df.columns and x != result_column]
    print "columns_to_drop", columns_to_drop
    spark_scored_df = transformed.select(list(set(columns_to_keep + [result_column])))
    df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context, self._metaParser)
    df_helper.set_params()
    df = df_helper.get_data_frame()
    # self._dataframe_context.set_dont_send_message(True)
    try:
        fs = time.time()
        descr_stats_obj = DescriptiveStatsScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark, self._prediction_narrative, scriptWeight=self._scriptWeightDict, analysisName="Descriptive analysis")
        descr_stats_obj.Run()
        print "DescriptiveStats Analysis Done in ", time.time() - fs, " seconds."
    except:
        print "Frequency Analysis Failed "
    try:
        fs = time.time()
        df_helper.fill_na_dimension_nulls()
        df = df_helper.get_data_frame()
        dt_reg = DecisionTreeRegressionScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark, self._prediction_narrative, self._metaParser, scriptWeight=self._scriptWeightDict, analysisName="Predictive modeling")
        dt_reg.Run()
        print "DecisionTrees Analysis Done in ", time.time() - fs, " seconds."
    except:
        print "DTREE FAILED"
    try:
        fs = time.time()
        two_way_obj = TwoWayAnovaScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark, self._prediction_narrative, self._metaParser, scriptWeight=self._scriptWeightDict, analysisName="Measure vs. Dimension")
        two_way_obj.Run()
        print "OneWayAnova Analysis Done in ", time.time() - fs, " seconds."
    except:
        print "Anova Analysis Failed"
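# --- Illustrative sketch (not part of the original script) ---
# The Spark scoring path above loads a persisted preprocessing pipeline and a
# regression model through MLUtils wrappers. A bare pyspark.ml version of the same
# idea, assuming the artifacts were saved with the stock .save() methods, is:
from pyspark.ml import PipelineModel
from pyspark.ml.regression import GeneralizedLinearRegressionModel

def score_with_spark_pipeline(spark_df, pipeline_path, model_path, result_column):
    pipeline_model = PipelineModel.load(pipeline_path)
    glr_model = GeneralizedLinearRegressionModel.load(model_path)
    transformed = glr_model.transform(pipeline_model.transform(spark_df))
    # Preserve the original target (if the scoring data carried one) and expose the
    # model's "prediction" column under the result column name, as done above.
    if result_column in transformed.columns:
        transformed = transformed.withColumnRenamed(result_column, "originalLabel")
    return transformed.withColumnRenamed("prediction", result_column)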
def Predict(self):
    self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight()
    self._scriptStages = {
        "initialization": {
            "summary": "Initialized the Random Forest Scripts",
            "weight": 2
        },
        "prediction": {
            "summary": "Random Forest Model Prediction Finished",
            "weight": 2
        },
        "frequency": {
            "summary": "descriptive analysis finished",
            "weight": 2
        },
        "chisquare": {
            "summary": "chi Square analysis finished",
            "weight": 4
        },
        "completion": {
            "summary": "all analysis finished",
            "weight": 4
        },
    }
    self._completionStatus += old_div(self._scriptWeightDict[self._analysisName]["total"] * self._scriptStages["initialization"]["weight"], 10)
    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,
                                                                 "initialization",
                                                                 "info",
                                                                 self._scriptStages["initialization"]["summary"],
                                                                 self._completionStatus,
                                                                 self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=self._ignoreMsg)
    self._dataframe_context.update_completion_status(self._completionStatus)
    # Match with the level_counts and then clean the data
    dataSanity = True
    level_counts_train = self._dataframe_context.get_level_count_dict()
    cat_cols = self._dataframe_helper.get_string_columns()
    # level_counts_score = CommonUtils.get_level_count_dict(self._data_frame,cat_cols,self._dataframe_context.get_column_separator(),output_type="dict")
    # if level_counts_train != {}:
    #     for key in level_counts_train:
    #         if key in level_counts_score:
    #             if level_counts_train[key] != level_counts_score[key]:
    #                 dataSanity = False
    #         else:
    #             dataSanity = False
    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns) - set(allDateCols))
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    result_column = self._dataframe_context.get_result_column()
    test_data_path = self._dataframe_context.get_input_file()
    if self._mlEnv == "spark":
        pass
    elif self._mlEnv == "sklearn":
        score_data_path = self._dataframe_context.get_score_path() + "/data.csv"
        if score_data_path.startswith("file"):
            score_data_path = score_data_path[7:]
        trained_model_path = self._dataframe_context.get_model_path()
        trained_model_path += "/" + self._dataframe_context.get_model_for_scoring() + ".pkl"
        if trained_model_path.startswith("file"):
            trained_model_path = trained_model_path[7:]
        score_summary_path = self._dataframe_context.get_score_path() + "/Summary/summary.json"
        if score_summary_path.startswith("file"):
            score_summary_path = score_summary_path[7:]
        trained_model = joblib.load(trained_model_path)
        # pandas_df = self._data_frame.toPandas()
        df = self._data_frame.toPandas()
        model_columns = self._dataframe_context.get_model_features()
        pandas_df = MLUtils.create_dummy_columns(df, [x for x in categorical_columns if x != result_column])
        pandas_df = MLUtils.fill_missing_columns(pandas_df, model_columns, result_column)
        if uid_col:
            pandas_df = pandas_df[[x for x in pandas_df.columns if x != uid_col]]
        y_score = trained_model.predict(pandas_df)
        y_prob = trained_model.predict_proba(pandas_df)
        y_prob = MLUtils.calculate_predicted_probability(y_prob)
        y_prob = list([round(x, 2) for x in y_prob])
        score = {"predicted_class": y_score, "predicted_probability": y_prob}
        df["predicted_class"] = score["predicted_class"]
        labelMappingDict = self._dataframe_context.get_label_map()
        df["predicted_class"] = df["predicted_class"].apply(lambda x: labelMappingDict[x] if x != None else "NA")
        df["predicted_probability"] = score["predicted_probability"]
        self._score_summary["prediction_split"] = MLUtils.calculate_scored_probability_stats(df)
        self._score_summary["result_column"] = result_column
        if result_column in df.columns:
            df.drop(result_column, axis=1, inplace=True)
        df = df.rename(index=str, columns={"predicted_class": result_column})
        df.to_csv(score_data_path, header=True, index=False)
        uidCol = self._dataframe_context.get_uid_column()
        if uidCol == None:
            uidCols = self._metaParser.get_suggested_uid_columns()
            if len(uidCols) > 0:
                uidCol = uidCols[0]
        uidTableData = []
        predictedClasses = list(df[result_column].unique())
        if uidCol:
            if uidCol in df.columns:
                for level in predictedClasses:
                    levelDf = df[df[result_column] == level]
                    levelDf = levelDf[[uidCol, "predicted_probability", result_column]]
                    levelDf.sort_values(by="predicted_probability", ascending=False, inplace=True)
                    levelDf["predicted_probability"] = levelDf["predicted_probability"].apply(lambda x: humanize.apnumber(x * 100) + "%" if x * 100 >= 10 else str(int(x * 100)) + "%")
                    uidTableData.append(levelDf[:5])
                uidTableData = pd.concat(uidTableData)
                uidTableData = [list(arr) for arr in list(uidTableData.values)]
                uidTableData = [[uidCol, "Probability", result_column]] + uidTableData
                uidTable = TableData()
                uidTable.set_table_width(25)
                uidTable.set_table_data(uidTableData)
                uidTable.set_table_type("normalHideColumn")
                self._result_setter.set_unique_identifier_table(json.loads(CommonUtils.convert_python_object_to_json(uidTable)))
        self._completionStatus += old_div(self._scriptWeightDict[self._analysisName]["total"] * self._scriptStages["prediction"]["weight"], 10)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,
                                                                     "prediction",
                                                                     "info",
                                                                     self._scriptStages["prediction"]["summary"],
                                                                     self._completionStatus,
                                                                     self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=self._ignoreMsg)
        self._dataframe_context.update_completion_status(self._completionStatus)
        # CommonUtils.write_to_file(score_summary_path,json.dumps({"scoreSummary":self._score_summary}))
        print("STARTING DIMENSION ANALYSIS ...")
        columns_to_keep = []
        columns_to_drop = []
        # considercolumnstype = self._dataframe_context.get_score_consider_columns_type()
        # considercolumns = self._dataframe_context.get_score_consider_columns()
        # if considercolumnstype != None:
        #     if considercolumns != None:
        #         if considercolumnstype == ["excluding"]:
        #             columns_to_drop = considercolumns
        #         elif considercolumnstype == ["including"]:
        #             columns_to_keep = considercolumns
        columns_to_keep = self._dataframe_context.get_score_consider_columns()
        if len(columns_to_keep) > 0:
            columns_to_drop = list(set(df.columns) - set(columns_to_keep))
        else:
            columns_to_drop += ["predicted_probability"]
        columns_to_drop = [x for x in columns_to_drop if x in df.columns and x != result_column]
        print("columns_to_drop", columns_to_drop)
        df.drop(columns_to_drop, axis=1, inplace=True)
        resultColLevelCount = dict(df[result_column].value_counts())
        # self._metaParser.update_level_counts(result_column,resultColLevelCount)
        self._metaParser.update_column_dict(result_column, {"LevelCount": resultColLevelCount, "numberOfUniqueValues": len(list(resultColLevelCount.keys()))})
        self._dataframe_context.set_story_on_scored_data(True)
        SQLctx = SQLContext(sparkContext=self._spark.sparkContext, sparkSession=self._spark)
        spark_scored_df = SQLctx.createDataFrame(df)
        # spark_scored_df.write.csv(score_data_path+"/data",mode="overwrite",header=True)
        # TODO update metadata for the newly created dataframe
        self._dataframe_context.update_consider_columns(columns_to_keep)
        df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context, self._metaParser)
        df_helper.set_params()
        spark_scored_df = df_helper.get_data_frame()
        # try:
        #     fs = time.time()
        #     narratives_file = self._dataframe_context.get_score_path()+"/narratives/FreqDimension/data.json"
        #     if narratives_file.startswith("file"):
        #         narratives_file = narratives_file[7:]
        #     result_file = self._dataframe_context.get_score_path()+"/results/FreqDimension/data.json"
        #     if result_file.startswith("file"):
        #         result_file = result_file[7:]
        #     init_freq_dim = FreqDimensions(df, df_helper, self._dataframe_context,scriptWeight=self._scriptWeightDict,analysisName=self._analysisName)
        #     df_freq_dimension_obj = init_freq_dim.test_all(dimension_columns=[result_column])
        #     df_freq_dimension_result = CommonUtils.as_dict(df_freq_dimension_obj)
        #     narratives_obj = DimensionColumnNarrative(result_column, df_helper, self._dataframe_context, df_freq_dimension_obj,self._result_setter,self._prediction_narrative,scriptWeight=self._scriptWeightDict,analysisName=self._analysisName)
        #     narratives = CommonUtils.as_dict(narratives_obj)
        #     # print "Frequency Analysis Done in ", time.time() - fs, " seconds."
        #     self._completionStatus += self._scriptWeightDict[self._analysisName]["total"]*self._scriptStages["frequency"]["weight"]/10
        #     progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
        #         "frequency",\
        #         "info",\
        #         self._scriptStages["frequency"]["summary"],\
        #         self._completionStatus,\
        #         self._completionStatus)
        #     CommonUtils.save_progress_message(self._messageURL,progressMessage,ignore=self._ignoreMsg)
        #     self._dataframe_context.update_completion_status(self._completionStatus)
        #     print "Frequency ",self._completionStatus
        # except:
        #     print "Frequency Analysis Failed "
        #
        # try:
        #     fs = time.time()
        #     narratives_file = self._dataframe_context.get_score_path()+"/narratives/ChiSquare/data.json"
        #     if narratives_file.startswith("file"):
        #         narratives_file = narratives_file[7:]
        #     result_file = self._dataframe_context.get_score_path()+"/results/ChiSquare/data.json"
        #     if result_file.startswith("file"):
        #         result_file = result_file[7:]
        #     init_chisquare_obj = ChiSquare(df, df_helper, self._dataframe_context,scriptWeight=self._scriptWeightDict,analysisName=self._analysisName)
        #     df_chisquare_obj = init_chisquare_obj.test_all(dimension_columns= [result_column])
        #     df_chisquare_result = CommonUtils.as_dict(df_chisquare_obj)
        #     chisquare_narratives = CommonUtils.as_dict(ChiSquareNarratives(df_helper, df_chisquare_obj, self._dataframe_context,df,self._prediction_narrative,self._result_setter,scriptWeight=self._scriptWeightDict,analysisName=self._analysisName))
        # except:
        #     print "ChiSquare Analysis Failed "
        if len(predictedClasses) >= 2:
            try:
                fs = time.time()
                df_decision_tree_obj = DecisionTrees(spark_scored_df, df_helper, self._dataframe_context, self._spark, self._metaParser, scriptWeight=self._scriptWeightDict, analysisName=self._analysisName).test_all(dimension_columns=[result_column])
                narratives_obj = CommonUtils.as_dict(DecisionTreeNarrative(result_column, df_decision_tree_obj, self._dataframe_helper, self._dataframe_context, self._metaParser, self._result_setter, story_narrative=None, analysisName=self._analysisName, scriptWeight=self._scriptWeightDict))
                print(narratives_obj)
            except:
                print("DecisionTree Analysis Failed ")
        else:
            data_dict = {
                "npred": len(predictedClasses),
                "nactual": len(list(labelMappingDict.values()))
            }
            # Assumed initializer: the original references levelCountDict below without defining it first.
            levelCountDict = {}
            if data_dict["nactual"] > 2:
                levelCountDict[predictedClasses[0]] = resultColLevelCount[predictedClasses[0]]
                levelCountDict["Others"] = sum([v for k, v in list(resultColLevelCount.items()) if k != predictedClasses[0]])
            else:
                levelCountDict = resultColLevelCount
                otherClass = list(set(labelMappingDict.values()) - set(predictedClasses))[0]
                levelCountDict[otherClass] = 0
            print(levelCountDict)
            total = float(sum([x for x in list(levelCountDict.values()) if x != None]))
            levelCountTuple = [({"name": k, "count": v, "percentage": humanize.apnumber(old_div(v * 100, total)) + "%" if old_div(v * 100, total) >= 10 else str(int(old_div(v * 100, total))) + "%"}) for k, v in list(levelCountDict.items()) if v != None]
            levelCountTuple = sorted(levelCountTuple, key=lambda x: x["count"], reverse=True)
            data_dict["blockSplitter"] = "|~NEWBLOCK~|"
            data_dict["targetcol"] = result_column
            data_dict["nlevel"] = len(list(levelCountDict.keys()))
            data_dict["topLevel"] = levelCountTuple[0]
            data_dict["secondLevel"] = levelCountTuple[1]
            maincardSummary = NarrativesUtils.get_template_output("/apps/", 'scorewithoutdtree.html', data_dict)
            main_card = NormalCard()
            main_card_data = []
            main_card_narrative = NarrativesUtils.block_splitter(maincardSummary, "|~NEWBLOCK~|")
            main_card_data += main_card_narrative
            chartData = NormalChartData([levelCountDict]).get_data()
            chartJson = ChartJson(data=chartData)
            chartJson.set_title(result_column)
            chartJson.set_chart_type("donut")
            mainCardChart = C3ChartData(data=chartJson)
            mainCardChart.set_width_percent(33)
            main_card_data.append(mainCardChart)
            uidTable = self._result_setter.get_unique_identifier_table()
            if uidTable != None:
                main_card_data.append(uidTable)
            main_card.set_card_data(main_card_data)
            main_card.set_card_name("Predicting Key Drivers of {}".format(result_column))
            self._result_setter.set_score_dtree_cards([main_card], {})
def Predict(self):
    self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight()
    self._scriptStages = {
        "initialization": {
            "summary": "Initialized the Naive Bayes Scripts",
            "weight": 2
        },
        "prediction": {
            "summary": "Spark ML Naive Bayes Model Prediction Finished",
            "weight": 2
        },
        "frequency": {
            "summary": "descriptive analysis finished",
            "weight": 2
        },
        "chisquare": {
            "summary": "chi Square analysis finished",
            "weight": 4
        },
        "completion": {
            "summary": "all analysis finished",
            "weight": 4
        },
    }
    self._completionStatus += self._scriptWeightDict[self._analysisName]["total"] * self._scriptStages["initialization"]["weight"] / 10
    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,
                                                                 "initialization",
                                                                 "info",
                                                                 self._scriptStages["initialization"]["summary"],
                                                                 self._completionStatus,
                                                                 self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage)
    self._dataframe_context.update_completion_status(self._completionStatus)
    SQLctx = SQLContext(sparkContext=self._spark.sparkContext, sparkSession=self._spark)
    dataSanity = True
    level_counts_train = self._dataframe_context.get_level_count_dict()
    categorical_columns = self._dataframe_helper.get_string_columns()
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    time_dimension_columns = self._dataframe_helper.get_timestamp_columns()
    result_column = self._dataframe_context.get_result_column()
    categorical_columns = [x for x in categorical_columns if x != result_column]
    level_counts_score = CommonUtils.get_level_count_dict(self._data_frame, categorical_columns, self._dataframe_context.get_column_separator(), output_type="dict", dataType="spark")
    for key in level_counts_train:
        if key in level_counts_score:
            if level_counts_train[key] != level_counts_score[key]:
                dataSanity = False
        else:
            dataSanity = False
    test_data_path = self._dataframe_context.get_input_file()
    score_data_path = self._dataframe_context.get_score_path() + "/data.csv"
    trained_model_path = self._dataframe_context.get_model_path()
    trained_model_path = "/".join(trained_model_path.split("/")[:-1]) + "/" + self._slug + "/" + self._dataframe_context.get_model_for_scoring()
    # score_summary_path = self._dataframe_context.get_score_path()+"/Summary/summary.json"
    pipelineModel = MLUtils.load_pipeline(trained_model_path)
    df = self._data_frame
    transformed = pipelineModel.transform(df)
    label_indexer_dict = MLUtils.read_string_indexer_mapping(trained_model_path, SQLctx)
    prediction_to_levels = udf(lambda x: label_indexer_dict[x], StringType())
    transformed = transformed.withColumn(result_column, prediction_to_levels(transformed.prediction))
    if "probability" in transformed.columns:
        probability_dataframe = transformed.select([result_column, "probability"]).toPandas()
        probability_dataframe = probability_dataframe.rename(index=str, columns={result_column: "predicted_class"})
        probability_dataframe["predicted_probability"] = probability_dataframe["probability"].apply(lambda x: max(x))
        self._score_summary["prediction_split"] = MLUtils.calculate_scored_probability_stats(probability_dataframe)
        self._score_summary["result_column"] = result_column
        scored_dataframe = transformed.select(categorical_columns + time_dimension_columns + numerical_columns + [result_column, "probability"]).toPandas()
        scored_dataframe['predicted_probability'] = probability_dataframe["predicted_probability"].values
        # scored_dataframe = scored_dataframe.rename(index=str, columns={"predicted_probability": "probability"})
    else:
        self._score_summary["prediction_split"] = []
        self._score_summary["result_column"] = result_column
        scored_dataframe = transformed.select(categorical_columns + time_dimension_columns + numerical_columns + [result_column]).toPandas()
    labelMappingDict = self._dataframe_context.get_label_map()
    if score_data_path.startswith("file"):
        score_data_path = score_data_path[7:]
    scored_dataframe.to_csv(score_data_path, header=True, index=False)
    uidCol = self._dataframe_context.get_uid_column()
    if uidCol == None:
        uidCols = self._metaParser.get_suggested_uid_columns()
        if len(uidCols) > 0:
            uidCol = uidCols[0]
    uidTableData = []
    predictedClasses = list(scored_dataframe[result_column].unique())
    if uidCol:
        if uidCol in df.columns:
            for level in predictedClasses:
                levelDf = scored_dataframe[scored_dataframe[result_column] == level]
                levelDf = levelDf[[uidCol, "predicted_probability", result_column]]
                levelDf.sort_values(by="predicted_probability", ascending=False, inplace=True)
                levelDf["predicted_probability"] = levelDf["predicted_probability"].apply(lambda x: humanize.apnumber(x * 100) + "%" if x * 100 >= 10 else str(int(x * 100)) + "%")
                uidTableData.append(levelDf[:5])
            uidTableData = pd.concat(uidTableData)
            uidTableData = [list(arr) for arr in list(uidTableData.values)]
            uidTableData = [[uidCol, "Probability", result_column]] + uidTableData
            uidTable = TableData()
            uidTable.set_table_width(25)
            uidTable.set_table_data(uidTableData)
            uidTable.set_table_type("normalHideColumn")
            self._result_setter.set_unique_identifier_table(json.loads(CommonUtils.convert_python_object_to_json(uidTable)))
    self._completionStatus += self._scriptWeightDict[self._analysisName]["total"] * self._scriptStages["prediction"]["weight"] / 10
    progressMessage = CommonUtils.create_progress_message_object(self._analysisName,
                                                                 "prediction",
                                                                 "info",
                                                                 self._scriptStages["prediction"]["summary"],
                                                                 self._completionStatus,
                                                                 self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage)
    self._dataframe_context.update_completion_status(self._completionStatus)
    print("STARTING DIMENSION ANALYSIS ...")
    columns_to_keep = []
    columns_to_drop = []
    columns_to_keep = self._dataframe_context.get_score_consider_columns()
    if len(columns_to_keep) > 0:
        columns_to_drop = list(set(df.columns) - set(columns_to_keep))
    else:
        columns_to_drop += ["predicted_probability"]
    scored_df = transformed.select(categorical_columns + time_dimension_columns + numerical_columns + [result_column])
    columns_to_drop = [x for x in columns_to_drop if x in scored_df.columns]
    modified_df = scored_df.select([x for x in scored_df.columns if x not in columns_to_drop])
    resultColLevelCount = dict(modified_df.groupby(result_column).count().collect())
    self._metaParser.update_column_dict(result_column, {"LevelCount": resultColLevelCount, "numberOfUniqueValues": len(resultColLevelCount.keys())})
    self._dataframe_context.set_story_on_scored_data(True)
    self._dataframe_context.update_consider_columns(columns_to_keep)
    df_helper = DataFrameHelper(modified_df, self._dataframe_context, self._metaParser)
    df_helper.set_params()
    spark_scored_df = df_helper.get_data_frame()
    if len(predictedClasses) >= 2:
        try:
            fs = time.time()
            df_decision_tree_obj = DecisionTrees(spark_scored_df, df_helper, self._dataframe_context, self._spark, self._metaParser, scriptWeight=self._scriptWeightDict, analysisName=self._analysisName).test_all(dimension_columns=[result_column])
            narratives_obj = CommonUtils.as_dict(DecisionTreeNarrative(result_column, df_decision_tree_obj, self._dataframe_helper, self._dataframe_context, self._metaParser, self._result_setter, story_narrative=None, analysisName=self._analysisName, scriptWeight=self._scriptWeightDict))
            print(narratives_obj)
        except Exception as e:
            print("DecisionTree Analysis Failed ", str(e))
    else:
        data_dict = {
            "npred": len(predictedClasses),
            "nactual": len(labelMappingDict.values())
        }
        # Assumed initializer: the original references levelCountDict below without defining it first.
        levelCountDict = {}
        if data_dict["nactual"] > 2:
            levelCountDict[predictedClasses[0]] = resultColLevelCount[predictedClasses[0]]
            levelCountDict["Others"] = sum([v for k, v in resultColLevelCount.items() if k != predictedClasses[0]])
        else:
            levelCountDict = resultColLevelCount
            otherClass = list(set(labelMappingDict.values()) - set(predictedClasses))[0]
            levelCountDict[otherClass] = 0
        print(levelCountDict)
        total = float(sum([x for x in levelCountDict.values() if x != None]))
        levelCountTuple = [({"name": k, "count": v, "percentage": humanize.apnumber(v * 100 / total) + "%"}) for k, v in levelCountDict.items() if v != None]
        levelCountTuple = sorted(levelCountTuple, key=lambda x: x["count"], reverse=True)
        data_dict["blockSplitter"] = "|~NEWBLOCK~|"
        data_dict["targetcol"] = result_column
        data_dict["nlevel"] = len(levelCountDict.keys())
        data_dict["topLevel"] = levelCountTuple[0]
        data_dict["secondLevel"] = levelCountTuple[1]
        maincardSummary = NarrativesUtils.get_template_output("/apps/", 'scorewithoutdtree.html', data_dict)
        main_card = NormalCard()
        main_card_data = []
        main_card_narrative = NarrativesUtils.block_splitter(maincardSummary, "|~NEWBLOCK~|")
        main_card_data += main_card_narrative
        chartData = NormalChartData([levelCountDict]).get_data()
        chartJson = ChartJson(data=chartData)
        chartJson.set_title(result_column)
        chartJson.set_chart_type("donut")
        mainCardChart = C3ChartData(data=chartJson)
        mainCardChart.set_width_percent(33)
        main_card_data.append(mainCardChart)
        uidTable = self._result_setter.get_unique_identifier_table()
        if uidTable != None:
            main_card_data.append(uidTable)
        main_card.set_card_data(main_card_data)
        main_card.set_card_name("Predicting Key Drivers of {}".format(result_column))
        self._result_setter.set_score_dtree_cards([main_card], {})
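# --- Illustrative sketch (not part of the original script) ---
# The "probability" column emitted by Spark ML classifiers holds one vector per row;
# the scripts above take its maximum as the confidence of the predicted class. In
# isolation that step looks like this (extract_prediction_confidence is hypothetical):
def extract_prediction_confidence(transformed, result_column):
    prob_df = transformed.select([result_column, "probability"]).toPandas()
    prob_df = prob_df.rename(index=str, columns={result_column: "predicted_class"})
    # Each cell is a pyspark.ml.linalg.DenseVector; max() over it gives the winning
    # class probability, which later feeds the UID table and prediction_split stats.
    prob_df["predicted_probability"] = prob_df["probability"].apply(lambda v: max(v))
    return prob_df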
def Predict(self):
    SQLctx = SQLContext(sparkContext=self._spark.sparkContext, sparkSession=self._spark)
    dataSanity = True
    level_counts_train = self._dataframe_context.get_level_count_dict()
    categorical_columns = self._dataframe_helper.get_string_columns()
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    time_dimension_columns = self._dataframe_helper.get_timestamp_columns()
    result_column = self._dataframe_context.get_result_column()
    categorical_columns = [x for x in categorical_columns if x != result_column]
    level_counts_score = CommonUtils.get_level_count_dict(self._data_frame, categorical_columns, self._dataframe_context.get_column_separator(), output_type="dict", dataType="spark")
    for key in level_counts_train:
        if key in level_counts_score:
            if level_counts_train[key] != level_counts_score[key]:
                dataSanity = False
        else:
            dataSanity = False
    test_data_path = self._dataframe_context.get_input_file()
    score_data_path = self._dataframe_context.get_score_path() + "/ScoredData/data.csv"
    trained_model_path = self._dataframe_context.get_model_path()
    if trained_model_path.endswith(".pkl"):
        trained_model_path = "/".join(trained_model_path.split("/")[:-1]) + "/model"
    pipeline_path = "/".join(trained_model_path.split("/")[:-1]) + "/pipeline"
    score_summary_path = self._dataframe_context.get_score_path() + "/Summary/summary.json"
    pipelineModel = MLUtils.load_pipeline(pipeline_path)
    if self._classifier == "OneVsRest":
        trained_model = MLUtils.load_one_vs_rest_model(trained_model_path)
    elif self._classifier == "lr":
        trained_model = MLUtils.load_logistic_model(trained_model_path)
    df = self._data_frame
    indexed = pipelineModel.transform(df)
    transformed = trained_model.transform(indexed)
    label_indexer_dict = MLUtils.read_string_indexer_mapping(pipeline_path, SQLctx)
    prediction_to_levels = udf(lambda x: label_indexer_dict[x], StringType())
    transformed = transformed.withColumn(result_column, prediction_to_levels(transformed.prediction))
    # udf_to_calculate_probability = udf(lambda x:max(x[0]))
    # transformed = transformed.withColumn("predicted_probability",udf_to_calculate_probability(transformed.probability))
    # print transformed.select("predicted_probability").show(5)
    if "probability" in transformed.columns:
        probability_dataframe = transformed.select([result_column, "probability"]).toPandas()
        probability_dataframe = probability_dataframe.rename(index=str, columns={result_column: "predicted_class"})
        probability_dataframe["predicted_probability"] = probability_dataframe["probability"].apply(lambda x: max(x))
        self._score_summary["prediction_split"] = MLUtils.calculate_scored_probability_stats(probability_dataframe)
        self._score_summary["result_column"] = result_column
        scored_dataframe = transformed.select(categorical_columns + time_dimension_columns + numerical_columns + [result_column, "probability"]).toPandas()
        # scored_dataframe = scored_dataframe.rename(index=str, columns={"predicted_probability": "probability"})
    else:
        self._score_summary["prediction_split"] = []
        self._score_summary["result_column"] = result_column
        scored_dataframe = transformed.select(categorical_columns + time_dimension_columns + numerical_columns + [result_column]).toPandas()
    if score_data_path.startswith("file"):
        score_data_path = score_data_path[7:]
    scored_dataframe.to_csv(score_data_path, header=True, index=False)
    # print json.dumps({"scoreSummary":self._score_summary},indent=2)
    CommonUtils.write_to_file(score_summary_path, json.dumps({"scoreSummary": self._score_summary}))
    print "STARTING DIMENSION ANALYSIS ..."
    columns_to_keep = []
    columns_to_drop = []
    considercolumnstype = self._dataframe_context.get_score_consider_columns_type()
    considercolumns = self._dataframe_context.get_score_consider_columns()
    if considercolumnstype != None:
        if considercolumns != None:
            if considercolumnstype == ["excluding"]:
                columns_to_drop = considercolumns
            elif considercolumnstype == ["including"]:
                columns_to_keep = considercolumns
    if len(columns_to_keep) > 0:
        columns_to_drop = list(set(df.columns) - set(columns_to_keep))
    # spark_scored_df = transformed.select(categorical_columns+time_dimension_columns+numerical_columns+[result_column])
    scored_df = transformed.select(categorical_columns + time_dimension_columns + numerical_columns + [result_column])
    SQLctx = SQLContext(sparkContext=self._spark.sparkContext, sparkSession=self._spark)
    spark_scored_df = SQLctx.createDataFrame(scored_df.toPandas())
    columns_to_drop = [x for x in columns_to_drop if x in spark_scored_df.columns]
    modified_df = spark_scored_df.select([x for x in spark_scored_df.columns if x not in columns_to_drop])
    df_helper = DataFrameHelper(modified_df, self._dataframe_context)
    df_helper.set_params()
    df = df_helper.get_data_frame()
    try:
        fs = time.time()
        narratives_file = self._dataframe_context.get_score_path() + "/narratives/FreqDimension/data.json"
        result_file = self._dataframe_context.get_score_path() + "/results/FreqDimension/data.json"
        df_freq_dimension_obj = FreqDimensions(spark_scored_df, df_helper, self._dataframe_context).test_all(dimension_columns=[result_column])
        df_freq_dimension_result = CommonUtils.as_dict(df_freq_dimension_obj)
        CommonUtils.write_to_file(result_file, json.dumps(df_freq_dimension_result))
        narratives_obj = DimensionColumnNarrative(result_column, df_helper, self._dataframe_context, df_freq_dimension_obj)
        narratives = CommonUtils.as_dict(narratives_obj)
        CommonUtils.write_to_file(narratives_file, json.dumps(narratives))
        print "Frequency Analysis Done in ", time.time() - fs, " seconds."
    except:
        print "Frequency Analysis Failed "
    try:
        fs = time.time()
        narratives_file = self._dataframe_context.get_score_path() + "/narratives/ChiSquare/data.json"
        result_file = self._dataframe_context.get_score_path() + "/results/ChiSquare/data.json"
        df_chisquare_obj = ChiSquare(df, df_helper, self._dataframe_context).test_all(dimension_columns=[result_column])
        df_chisquare_result = CommonUtils.as_dict(df_chisquare_obj)
        # print 'RESULT: %s' % (json.dumps(df_chisquare_result, indent=2))
        CommonUtils.write_to_file(result_file, json.dumps(df_chisquare_result))
        chisquare_narratives = CommonUtils.as_dict(ChiSquareNarratives(df_helper, df_chisquare_obj, self._dataframe_context, df))
        # print 'Narrarives: %s' %(json.dumps(chisquare_narratives, indent=2))
        CommonUtils.write_to_file(narratives_file, json.dumps(chisquare_narratives))
        print "ChiSquare Analysis Done in ", time.time() - fs, " seconds."
    except:
        print "ChiSquare Analysis Failed "
def Predict(self):
    self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight()
    self._scriptStages = {
        "initialization": {
            "summary": "Initialized The Neural Network (PyTorch) Scripts",
            "weight": 2
        },
        "predictionStart": {
            "summary": "Neural Network (PyTorch) Prediction Started",
            "weight": 2
        },
        "predictionFinished": {
            "summary": "Neural Network (PyTorch) Prediction Finished",
            "weight": 6
        }
    }
    CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
    SQLctx = SQLContext(sparkContext=self._spark.sparkContext, sparkSession=self._spark)
    dataSanity = True
    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns) - set(allDateCols))
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    result_column = self._dataframe_context.get_result_column()
    test_data_path = self._dataframe_context.get_input_file()
    if self._mlEnv == "spark":
        pass
    elif self._mlEnv == "sklearn":
        CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "predictionStart", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
        score_data_path = self._dataframe_context.get_score_path() + "/data.csv"
        trained_model_path = "file://" + self._dataframe_context.get_model_path()
        trained_model_path += "/" + self._dataframe_context.get_model_for_scoring() + ".pt"
        print("trained_model_path", trained_model_path)
        print("score_data_path", score_data_path)
        if trained_model_path.startswith("file"):
            trained_model_path = trained_model_path[7:]
        # trained_model = joblib.load(trained_model_path)
        trained_model = torch.load(trained_model_path, map_location=torch.device('cpu'))
        model_columns = self._dataframe_context.get_model_features()
        print("model_columns", model_columns)
        try:
            df = self._data_frame.toPandas()
        except:
            df = self._data_frame
        # pandas_df = MLUtils.factorize_columns(df,[x for x in categorical_columns if x != result_column])
        pandas_df = MLUtils.create_dummy_columns(df, [x for x in categorical_columns if x != result_column])
        pandas_df = MLUtils.fill_missing_columns(pandas_df, model_columns, result_column)
        if uid_col:
            pandas_df = pandas_df[[x for x in pandas_df.columns if x != uid_col]]
        test_df = np.stack([pandas_df[col].values for col in pandas_df.columns], 1)
        tensored_test_df = torch.tensor(test_df, dtype=torch.float)
        outputs_test_df_tensored = trained_model(tensored_test_df.float())
        y_score_mid = outputs_test_df_tensored.tolist()
        y_score = [x[0] for x in y_score_mid]
        scoreKpiArray = MLUtils.get_scored_data_summary(y_score)
        kpiCard = NormalCard()
        kpiCardData = [KpiData(data=x) for x in scoreKpiArray]
        kpiCard.set_card_data(kpiCardData)
        kpiCard.set_cente_alignment(True)
        print(CommonUtils.convert_python_object_to_json(kpiCard))
        self._result_setter.set_kpi_card_regression_score(kpiCard)
        pandas_df[result_column] = y_score
        df[result_column] = y_score
        df.to_csv(score_data_path, header=True, index=False)
        CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "predictionFinished", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
        print("STARTING Measure ANALYSIS ...")
        columns_to_keep = []
        columns_to_drop = []
        columns_to_keep = self._dataframe_context.get_score_consider_columns()
        if len(columns_to_keep) > 0:
            columns_to_drop = list(set(df.columns) - set(columns_to_keep))
        else:
            columns_to_drop += ["predicted_probability"]
        columns_to_drop = [x for x in columns_to_drop if x in df.columns and x != result_column]
        print("columns_to_drop", columns_to_drop)
        pandas_scored_df = df[list(set(columns_to_keep + [result_column]))]
        spark_scored_df = SQLctx.createDataFrame(pandas_scored_df)
        # spark_scored_df.write.csv(score_data_path+"/data",mode="overwrite",header=True)
        # TODO update metadata for the newly created dataframe
        self._dataframe_context.update_consider_columns(columns_to_keep)
        print(spark_scored_df.printSchema())
        df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context, self._metaParser)
        df_helper.set_params()
        df = df_helper.get_data_frame()
        # self._dataframe_context.set_dont_send_message(True)
        try:
            fs = time.time()
            descr_stats_obj = DescriptiveStatsScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark, self._prediction_narrative, scriptWeight=self._scriptWeightDict, analysisName="Descriptive analysis")
            descr_stats_obj.Run()
            print("DescriptiveStats Analysis Done in ", time.time() - fs, " seconds.")
        except:
            print("Frequency Analysis Failed ")
        # try:
        #     fs = time.time()
        #     df_helper.fill_na_dimension_nulls()
        #     df = df_helper.get_data_frame()
        #     dt_reg = DecisionTreeRegressionScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark,self._prediction_narrative,self._metaParser,scriptWeight=self._scriptWeightDict,analysisName="Predictive modeling")
        #     dt_reg.Run()
        #     print "DecisionTrees Analysis Done in ", time.time() - fs, " seconds."
        # except:
        #     print "DTREE FAILED"
        try:
            fs = time.time()
            two_way_obj = TwoWayAnovaScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark, self._prediction_narrative, self._metaParser, scriptWeight=self._scriptWeightDict, analysisName="Measure vs. Dimension")
            two_way_obj.Run()
            print("OneWayAnova Analysis Done in ", time.time() - fs, " seconds.")
        except:
            print("Anova Analysis Failed")