def __init__(self, data_frame, df_helper, df_context, spark, prediction_narrative, result_setter, meta_parser, mlEnvironment="pyspark"):
    self._metaParser = meta_parser
    self._prediction_narrative = prediction_narrative
    self._result_setter = result_setter
    self._data_frame = data_frame
    self._dataframe_helper = df_helper
    self._dataframe_context = df_context
    self._ignoreMsg = self._dataframe_context.get_message_ignore()
    self._spark = spark
    self._model_summary = MLModelSummary()
    self._score_summary = {}
    self._slug = GLOBALSETTINGS.MODEL_SLUG_MAPPING["sparkmlpclassifier"]
    self._targetLevel = self._dataframe_context.get_target_level_for_model()
    self._completionStatus = self._dataframe_context.get_completion_status()
    self._analysisName = self._slug
    self._messageURL = self._dataframe_context.get_message_url()
    self._scriptWeightDict = self._dataframe_context.get_ml_model_training_weight()
    self._mlEnv = mlEnvironment
    self._scriptStages = {
        "initialization": {
            "summary": "Initialized the Multilayer Perceptron Scripts",
            "weight": 4
        },
        "training": {
            "summary": "Multilayer Perceptron Model Training Started",
            "weight": 2
        },
        "completion": {
            "summary": "Multilayer Perceptron Model Training Finished",
            "weight": 4
        },
    }
def __init__(self, data_frame, df_helper, df_context, spark, prediction_narrative, result_setter, meta_parser, mlEnvironment="sklearn"):
    self._metaParser = meta_parser
    self._prediction_narrative = prediction_narrative
    self._result_setter = result_setter
    self._data_frame = data_frame
    self._dataframe_helper = df_helper
    self._dataframe_context = df_context
    self._spark = spark
    self._model_summary = MLModelSummary()
    self._score_summary = {}
    self._slug = GLOBALSETTINGS.MODEL_SLUG_MAPPING["Neural Network (TensorFlow)"]
    self._analysisName = "Neural Network (TensorFlow)"
    self._dataframe_context.set_analysis_name(self._analysisName)
    self._mlEnv = mlEnvironment
    self._datasetName = CommonUtils.get_dataset_name(self._dataframe_context.CSV_FILE)
    self._completionStatus = self._dataframe_context.get_completion_status()
    print(self._completionStatus, "initial completion status")
    self._messageURL = self._dataframe_context.get_message_url()
    self._scriptWeightDict = self._dataframe_context.get_ml_model_training_weight()
    self._ignoreMsg = self._dataframe_context.get_message_ignore()
    self._scriptStages = {
        "initialization": {
            "summary": "Initialized the Neural Network (TensorFlow) Regression Scripts",
            "weight": 1
        },
        "training": {
            "summary": "Neural Network (TensorFlow) Regression Model Training Started",
            "weight": 2
        },
        "completion": {
            "summary": "Neural Network (TensorFlow) Regression Model Training Finished",
            "weight": 1
        },
    }
def __init__(self, data_frame, df_helper, df_context, spark, prediction_narrative, result_setter, meta_parser, mLEnvironment="sklearn"):
    self._metaParser = meta_parser
    self._prediction_narrative = prediction_narrative
    self._result_setter = result_setter
    self._data_frame = data_frame
    self._dataframe_helper = df_helper
    self._dataframe_context = df_context
    self._spark = spark
    self._model_summary = MLModelSummary()
    self._score_summary = {}
    self._slug = GLOBALSETTINGS.MODEL_SLUG_MAPPING["gbtregression"]
    self._analysisName = "gbtRegression"
    self._dataframe_context.set_analysis_name(self._analysisName)
    self._mlEnv = mLEnvironment
    self._completionStatus = self._dataframe_context.get_completion_status()
    print(self._completionStatus, "initial completion status")
    self._messageURL = self._dataframe_context.get_message_url()
    self._scriptWeightDict = self._dataframe_context.get_ml_model_training_weight()
    self._ignoreMsg = self._dataframe_context.get_message_ignore()
    self._scriptStages = {
        "initialization": {
            "summary": "Initialized the Gradient Boosted Tree Regression Scripts",
            "weight": 4
        },
        "training": {
            "summary": "Gradient Boosted Tree Regression Model Training Started",
            "weight": 2
        },
        "completion": {
            "summary": "Gradient Boosted Tree Regression Model Training Finished",
            "weight": 4
        },
    }
def __init__(self, data_frame, df_helper, df_context, spark, prediction_narrative, result_setter, meta_parser, mlEnvironment="pyspark"):
    self._metaParser = meta_parser
    self._prediction_narrative = prediction_narrative
    self._result_setter = result_setter
    self._data_frame = data_frame
    self._dataframe_helper = df_helper
    self._dataframe_context = df_context
    self._ignoreMsg = self._dataframe_context.get_message_ignore()
    self._spark = spark
    self._model_summary = MLModelSummary()
    self._score_summary = {}
    self._slug = GLOBALSETTINGS.MODEL_SLUG_MAPPING["xgboost"]
    self._targetLevel = self._dataframe_context.get_target_level_for_model()
    self._completionStatus = self._dataframe_context.get_completion_status()
    print(self._completionStatus, "initial completion status")
    self._analysisName = self._slug
    self._messageURL = self._dataframe_context.get_message_url()
    self._scriptWeightDict = self._dataframe_context.get_ml_model_training_weight()
    self._datasetName = CommonUtils.get_dataset_name(self._dataframe_context.CSV_FILE)
    self._mlEnv = mlEnvironment
    # os.environ['PYSPARK_SUBMIT_ARGS'] = "--jars xgboost4j-spark-0.72.jar,xgboost4j-0.72.jar pyspark-shell"
    self._scriptStages = {
        "initialization": {
            "summary": "Initialized the Xgboost Scripts",
            "weight": 4
        },
        "training": {
            "summary": "Xgboost Model Training Started",
            "weight": 2
        },
        "completion": {
            "summary": "Xgboost Model Training Finished",
            "weight": 4
        },
    }
def __init__(self, data_frame, df_helper, df_context, spark, prediction_narrative, result_setter, meta_parser, mlEnvironment="pyspark"):
    self._metaParser = meta_parser
    self._prediction_narrative = prediction_narrative
    self._result_setter = result_setter
    self._data_frame = data_frame
    self._dataframe_helper = df_helper
    self._dataframe_context = df_context
    self._ignoreMsg = self._dataframe_context.get_message_ignore()
    self._spark = spark
    self._model_summary = MLModelSummary()
    self._score_summary = {}
    self._slug = GLOBALSETTINGS.MODEL_SLUG_MAPPING["logisticregression"]
    self._targetLevel = self._dataframe_context.get_target_level_for_model()
    # self._model_summary = {"confusion_matrix":{},"precision_recall_stats":{},"FrequencySummary":{},"ChiSquare":{}}
    self._datasetName = CommonUtils.get_dataset_name(self._dataframe_context.CSV_FILE)
    self._completionStatus = self._dataframe_context.get_completion_status()
    self._analysisName = self._slug
    self._messageURL = self._dataframe_context.get_message_url()
    self._scriptWeightDict = self._dataframe_context.get_ml_model_training_weight()
    self._mlEnv = mlEnvironment
    self._classifier = "lr"
    self._scriptStages = {
        "initialization": {
            "summary": "Initialized the Logistic Regression Scripts",
            "weight": 4
        },
        "training": {
            "summary": "Logistic Regression Model Training Started",
            "weight": 2
        },
        "completion": {
            "summary": "Logistic Regression Model Training Finished",
            "weight": 4
        },
    }
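# --- Illustrative sketch (not part of the source) -------------------------------
# The __init__ blocks above all declare per-stage weights in _scriptStages and a
# per-script total in _scriptWeightDict. A minimal sketch of how those weights feed
# the progress updates, mirroring the old_div(total * stage_weight, 10) increments
# used in the Predict methods later in this section; names here are hypothetical.
def stage_completion_increment(script_weight_dict, script_stages, analysis_name, stage):
    # completion status grows by script_total_weight * stage_weight / 10 per stage
    total_weight = script_weight_dict[analysis_name]["total"]
    return total_weight * script_stages[stage]["weight"] // 10

# Example (assumed numbers):
# stage_completion_increment({"xgboost": {"total": 10}},
#                            {"training": {"summary": "...", "weight": 2}},
#                            "xgboost", "training")  -> 2
# ---------------------------------------------------------------------------------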
class DTREERegressionModelScript:
    def __init__(self, data_frame, df_helper, df_context, spark, prediction_narrative, result_setter, meta_parser, mlEnvironment="sklearn"):
        self._metaParser = meta_parser
        self._prediction_narrative = prediction_narrative
        self._result_setter = result_setter
        self._data_frame = data_frame
        self._dataframe_helper = df_helper
        self._dataframe_context = df_context
        self._spark = spark
        self._model_summary = MLModelSummary()
        self._score_summary = {}
        self._slug = GLOBALSETTINGS.MODEL_SLUG_MAPPING["dtreeregression"]
        self._analysisName = "dtreeRegression"
        self._dataframe_context.set_analysis_name(self._analysisName)
        self._mlEnv = mlEnvironment
        self._completionStatus = self._dataframe_context.get_completion_status()
        print(self._completionStatus, "initial completion status")
        self._messageURL = self._dataframe_context.get_message_url()
        self._scriptWeightDict = self._dataframe_context.get_ml_model_training_weight()
        self._ignoreMsg = self._dataframe_context.get_message_ignore()
        self._scriptStages = {
            "initialization": {
                "summary": "Initialized the Decision Tree Regression Scripts",
                "weight": 4
            },
            "training": {
                "summary": "Decision Tree Regression Model Training Started",
                "weight": 2
            },
            "completion": {
                "summary": "Decision Tree Regression Model Training Finished",
                "weight": 4
            },
        }

    def Train(self):
        st_global = time.time()
        CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
        appType = self._dataframe_context.get_app_type()
        algosToRun = self._dataframe_context.get_algorithms_to_run()
        algoSetting = [x for x in algosToRun if x.get_algorithm_slug() == self._slug][0]
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()
        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})
        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns) - set(allDateCols))
        print(categorical_columns)
        result_column = self._dataframe_context.get_result_column()
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        numerical_columns = [x for x in numerical_columns if x != result_column]
        model_path = self._dataframe_context.get_model_path()
        if model_path.startswith("file"):
            model_path = model_path[7:]
        validationDict = self._dataframe_context.get_validation_dict()
        print("model_path", model_path)
        pipeline_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/pipeline/"
        model_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/model"
        pmml_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/modelPmml"
        df = self._data_frame
        if self._mlEnv == "spark":
            pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns, categorical_columns, result_column, algoType="regression")
            pipelineModel = pipeline.fit(df)
            indexed = pipelineModel.transform(df)
            featureMapping = sorted((attr["idx"], attr["name"]) for attr in (chain(*indexed.schema["features"].metadata["ml_attr"]["attrs"].values())))
            # print(indexed.select([result_column, "features"]).show(5))
            MLUtils.save_pipeline_or_model(pipelineModel, pipeline_filepath)
            # OriginalTargetconverter = IndexToString(inputCol="label", outputCol="originalTargetColumn")
            dtreer = DecisionTreeRegressor(labelCol=result_column, featuresCol='features', predictionCol="prediction")
            if validationDict["name"] == "kFold":
                defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT["value"]
                numFold = int(validationDict["value"])
                if numFold == 0:
                    numFold = 3
                trainingData, validationData = indexed.randomSplit([defaultSplit, 1 - defaultSplit], seed=12345)
                # NOTE: the original grid tuned regParam/fitIntercept/elasticNetParam, which are
                # linear-model params that DecisionTreeRegressor does not expose; tuning tree
                # params instead (assumed substitution).
                paramGrid = ParamGridBuilder() \
                    .addGrid(dtreer.maxDepth, [3, 5, 7]) \
                    .addGrid(dtreer.minInstancesPerNode, [1, 10]) \
                    .build()
                crossval = CrossValidator(estimator=dtreer,
                                          estimatorParamMaps=paramGrid,
                                          evaluator=RegressionEvaluator(predictionCol="prediction", labelCol=result_column),
                                          numFolds=numFold)
                st = time.time()
                cvModel = crossval.fit(indexed)
                trainingTime = time.time() - st
                print("cvModel training takes", trainingTime)
                bestModel = cvModel.bestModel
            elif validationDict["name"] == "trainAndtest":
                trainingData, validationData = indexed.randomSplit([float(validationDict["value"]), 1 - float(validationDict["value"])], seed=12345)
                st = time.time()
                fit = dtreer.fit(trainingData)
                trainingTime = time.time() - st
                print("time to train", trainingTime)
                bestModel = fit
            featureImportance = bestModel.featureImportances
            print(featureImportance, type(featureImportance))
            print(len(featureMapping))
            featuresArray = [(name, featureImportance[idx]) for idx, name in featureMapping]
            print(featuresArray)
            MLUtils.save_pipeline_or_model(bestModel, model_filepath)
            transformed = bestModel.transform(validationData)
            transformed = transformed.withColumn(result_column, transformed[result_column].cast(DoubleType()))
            transformed = transformed.select([result_column, "prediction", transformed[result_column] - transformed["prediction"]])
            transformed = transformed.withColumnRenamed(transformed.columns[-1], "difference")
            transformed = transformed.select([result_column, "prediction", "difference", FN.abs(transformed["difference"]) * 100 / transformed[result_column]])
            transformed = transformed.withColumnRenamed(transformed.columns[-1], "mape")
            sampleData = None
            nrows = transformed.count()
            if nrows > 100:
                sampleData = transformed.sample(False, float(100) / nrows, seed=420)
            else:
                sampleData = transformed
            print(sampleData.show())
            evaluator = RegressionEvaluator(predictionCol="prediction", labelCol=result_column)
            metrics = {}
            metrics["r2"] = evaluator.evaluate(transformed, {evaluator.metricName: "r2"})
            metrics["rmse"] = evaluator.evaluate(transformed, {evaluator.metricName: "rmse"})
            metrics["mse"] = evaluator.evaluate(transformed, {evaluator.metricName: "mse"})
            metrics["mae"] = evaluator.evaluate(transformed, {evaluator.metricName: "mae"})
            runtime = round((time.time() - st_global), 2)
            mapeDf = transformed.select("mape")
            mapeStats = MLUtils.get_mape_stats(mapeDf, "mape")
            mapeStatsArr = list(mapeStats.items())
            mapeStatsArr = sorted(mapeStatsArr, key=lambda x: int(x[0]))
            quantileDf = transformed.select("prediction")
            quantileSummaryDict = MLUtils.get_quantile_summary(quantileDf, "prediction")
            quantileSummaryArr = list(quantileSummaryDict.items())
            quantileSummaryArr = sorted(quantileSummaryArr, key=lambda x: int(x[0]))
            self._model_summary.set_model_type("regression")
            self._model_summary.set_algorithm_name("dtree Regression")
            self._model_summary.set_algorithm_display_name("Decision Tree Regression")
            self._model_summary.set_slug(self._slug)
            self._model_summary.set_training_time(runtime)
            self._model_summary.set_training_time(trainingTime)
            self._model_summary.set_target_variable(result_column)
            self._model_summary.set_validation_method(validationDict["displayName"])
            self._model_summary.set_model_evaluation_metrics(metrics)
            # NOTE: the original passed bestEstimator.get_params(), but bestEstimator is not
            # defined in the Spark branch; reading the params off bestModel instead.
            self._model_summary.set_model_params({param.name: value for param, value in bestModel.extractParamMap().items()})
            self._model_summary.set_quantile_summary(quantileSummaryArr)
            self._model_summary.set_mape_stats(mapeStatsArr)
            self._model_summary.set_sample_data(sampleData.toPandas().to_dict())
            self._model_summary.set_feature_importance(featureImportance)
            # print(CommonUtils.convert_python_object_to_json(self._model_summary))
        elif self._mlEnv == "sklearn":
            model_filepath = model_path + "/" + self._slug + "/model.pkl"
            x_train, x_test, y_train, y_test = self._dataframe_helper.get_train_test_data()
            x_train = MLUtils.create_dummy_columns(x_train, [x for x in categorical_columns if x != result_column])
            x_test = MLUtils.create_dummy_columns(x_test, [x for x in categorical_columns if x != result_column])
            x_test = MLUtils.fill_missing_columns(x_test, x_train.columns, result_column)
            st = time.time()
            est = DecisionTreeRegressor()
            CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "training", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
            if algoSetting.is_hyperparameter_tuning_enabled():
                hyperParamInitParam = algoSetting.get_hyperparameter_params()
                evaluationMetricDict = {"name": hyperParamInitParam["evaluationMetric"]}
                evaluationMetricDict["displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[evaluationMetricDict["name"]]
                hyperParamAlgoName = algoSetting.get_hyperparameter_algo_name()
                params_grid = algoSetting.get_params_dict_hyperparameter()
                params_grid = {k: v for k, v in params_grid.items() if k in est.get_params()}
                print(params_grid)
                if hyperParamAlgoName == "gridsearchcv":
                    estGrid = GridSearchCV(est, params_grid)
                    gridParams = estGrid.get_params()
                    hyperParamInitParam = {k: v for k, v in hyperParamInitParam.items() if k in gridParams}
                    estGrid.set_params(**hyperParamInitParam)
                    estGrid.fit(x_train, y_train)
                    bestEstimator = estGrid.best_estimator_
                    modelFilepath = "/".join(model_filepath.split("/")[:-1])
                    sklearnHyperParameterResultObj = SklearnGridSearchResult(estGrid.cv_results_, est, x_train, x_test, y_train, y_test, appType, modelFilepath, evaluationMetricDict=evaluationMetricDict)
                    resultArray = sklearnHyperParameterResultObj.train_and_save_models()
                    self._result_setter.set_hyper_parameter_results(self._slug, resultArray)
                    self._result_setter.set_metadata_parallel_coordinates(self._slug, {
                        "ignoreList": sklearnHyperParameterResultObj.get_ignore_list(),
                        "hideColumns": sklearnHyperParameterResultObj.get_hide_columns(),
                        "metricColName": sklearnHyperParameterResultObj.get_comparison_metric_colname(),
                        "columnOrder": sklearnHyperParameterResultObj.get_keep_columns()
                    })
                elif hyperParamAlgoName == "randomsearchcv":
                    estRand = RandomizedSearchCV(est, params_grid)
                    estRand.set_params(**hyperParamInitParam)
                    bestEstimator = None
            else:
                evaluationMetricDict = {"name": GLOBALSETTINGS.REGRESSION_MODEL_EVALUATION_METRIC}
                evaluationMetricDict["displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[evaluationMetricDict["name"]]
                algoParams = algoSetting.get_params_dict()
                algoParams = {k: v for k, v in algoParams.items() if k in est.get_params().keys()}
                est.set_params(**algoParams)
                self._result_setter.set_hyper_parameter_results(self._slug, None)
                if validationDict["name"] == "kFold":
                    defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT["value"]
                    numFold = int(validationDict["value"])
                    if numFold == 0:
                        numFold = 3
                    kFoldClass = SkleanrKFoldResult(numFold, est, x_train, x_test, y_train, y_test, appType, evaluationMetricDict=evaluationMetricDict)
                    kFoldClass.train_and_save_result()
                    kFoldOutput = kFoldClass.get_kfold_result()
                    bestEstimator = kFoldClass.get_best_estimator()
                elif validationDict["name"] == "trainAndtest":
                    est.fit(x_train, y_train)
                    bestEstimator = est
            trainingTime = time.time() - st
            y_score = bestEstimator.predict(x_test)
            try:
                y_prob = bestEstimator.predict_proba(x_test)
            except Exception:
                y_prob = [0] * len(y_score)
            featureImportance = {}
            objs = {"trained_model": bestEstimator, "actual": y_test, "predicted": y_score, "probability": y_prob, "feature_importance": featureImportance, "featureList": list(x_train.columns), "labelMapping": {}}
            featureImportance = objs["trained_model"].feature_importances_
            featuresArray = [(col_name, featureImportance[idx]) for idx, col_name in enumerate(x_train.columns)]
            if not algoSetting.is_hyperparameter_tuning_enabled():
                modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1"
                modelFilepathArr = model_filepath.split("/")[:-1]
                modelFilepathArr.append(modelName + ".pkl")
                joblib.dump(objs["trained_model"], "/".join(modelFilepathArr))
            metrics = {}
            metrics["r2"] = r2_score(y_test, y_score)
            metrics["mse"] = mean_squared_error(y_test, y_score)
            metrics["mae"] = mean_absolute_error(y_test, y_score)
            metrics["rmse"] = sqrt(metrics["mse"])
            transformed = pd.DataFrame({"prediction": y_score, result_column: y_test})
            transformed["difference"] = transformed[result_column] - transformed["prediction"]
            transformed["mape"] = np.abs(transformed["difference"]) * 100 / transformed[result_column]
            sampleData = None
            nrows = transformed.shape[0]
            if nrows > 100:
                sampleData = transformed.sample(n=100, random_state=420)
            else:
                sampleData = transformed
            print(sampleData.head())
            mapeCountArr = pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS).value_counts().to_dict().items()
            mapeStatsArr = [(str(idx), dictObj) for idx, dictObj in enumerate(sorted([{"count": x[1], "splitRange": (x[0].left, x[0].right)} for x in mapeCountArr], key=lambda x: x["splitRange"][0]))]
            predictionColSummary = transformed["prediction"].describe().to_dict()
            quantileBins = [predictionColSummary["min"], predictionColSummary["25%"], predictionColSummary["50%"], predictionColSummary["75%"], predictionColSummary["max"]]
            print(quantileBins)
            quantileBins = sorted(list(set(quantileBins)))
            transformed["quantileBinId"] = pd.cut(transformed["prediction"], quantileBins)
            quantileDf = transformed.groupby("quantileBinId").agg({"prediction": [np.sum, np.mean, np.size]}).reset_index()
            quantileDf.columns = ["prediction", "sum", "mean", "count"]
            print(quantileDf)
            quantileArr = quantileDf.T.to_dict().items()
            quantileSummaryArr = [(obj[0], {"splitRange": (obj[1]["prediction"].left, obj[1]["prediction"].right), "count": obj[1]["count"], "mean": obj[1]["mean"], "sum": obj[1]["sum"]}) for obj in quantileArr]
            print(quantileSummaryArr)
            runtime = round((time.time() - st_global), 2)
            self._model_summary.set_model_type("regression")
            self._model_summary.set_algorithm_name("DTREE Regression")
            self._model_summary.set_algorithm_display_name("Decision Tree Regression")
            self._model_summary.set_slug(self._slug)
            self._model_summary.set_training_time(runtime)
            self._model_summary.set_training_time(trainingTime)
            self._model_summary.set_target_variable(result_column)
            self._model_summary.set_validation_method(validationDict["displayName"])
            self._model_summary.set_model_evaluation_metrics(metrics)
            self._model_summary.set_model_params(bestEstimator.get_params())
            self._model_summary.set_quantile_summary(quantileSummaryArr)
            self._model_summary.set_mape_stats(mapeStatsArr)
            self._model_summary.set_sample_data(sampleData.to_dict())
            self._model_summary.set_feature_importance(featuresArray)
            self._model_summary.set_feature_list(list(x_train.columns))
            try:
                pmml_filepath = str(model_path) + "/" + str(self._slug) + "/traindeModel.pmml"
                modelPmmlPipeline = PMMLPipeline([
                    ("pretrained-estimator", objs["trained_model"])
                ])
                modelPmmlPipeline.target_field = result_column
                modelPmmlPipeline.active_fields = np.array([col for col in x_train.columns if col != result_column])
                sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True)
                pmmlfile = open(pmml_filepath, "r")
                pmmlText = pmmlfile.read()
                pmmlfile.close()
                self._result_setter.update_pmml_object({self._slug: pmmlText})
            except Exception:
                pass
        if not algoSetting.is_hyperparameter_tuning_enabled():
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": self._model_summary.get_model_accuracy(),
                "evaluationMetricName": "r2",
                "slug": self._model_summary.get_slug(),
                "Model Id": modelName
            }
            modelSummaryJson = {
                "dropdown": modelDropDownObj,
                "levelcount": self._model_summary.get_level_counts(),
                "modelFeatureList": self._model_summary.get_feature_list(),
                "levelMapping": self._model_summary.get_level_map_dict(),
                "slug": self._model_summary.get_slug(),
                "name": self._model_summary.get_algorithm_name()
            }
        else:
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": resultArray[0]["R-Squared"],
                "evaluationMetricName": "r2",
                "slug": self._model_summary.get_slug(),
                "Model Id": resultArray[0]["Model Id"]
            }
            modelSummaryJson = {
                "dropdown": modelDropDownObj,
                "levelcount": self._model_summary.get_level_counts(),
                "modelFeatureList": self._model_summary.get_feature_list(),
                "levelMapping": self._model_summary.get_level_map_dict(),
                "slug": self._model_summary.get_slug(),
                "name": self._model_summary.get_algorithm_name()
            }
        dtreerCards = [json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_summary_cards(self._model_summary)]
        for card in dtreerCards:
            self._prediction_narrative.add_a_card(card)
        self._result_setter.set_model_summary({"dtreeregression": json.loads(CommonUtils.convert_python_object_to_json(self._model_summary))})
        self._result_setter.set_dtree_regression_model_summart(modelSummaryJson)
        self._result_setter.set_dtreer_cards(dtreerCards)
        CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "completion", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")

    def Predict(self):
        self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight()
        self._scriptStages = {
            "initialization": {
                "summary": "Initialized the Decision Tree Regression Scripts",
                "weight": 2
            },
            "predictionStart": {
                "summary": "Decision Tree Regression Model Prediction Started",
                "weight": 2
            },
            "predictionFinished": {
                "summary": "Decision Tree Regression Model Prediction Finished",
                "weight": 6
            }
        }
        CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
        SQLctx = SQLContext(sparkContext=self._spark.sparkContext, sparkSession=self._spark)
        dataSanity = True
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()
        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})
        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns) - set(allDateCols))
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        result_column = self._dataframe_context.get_result_column()
        test_data_path = self._dataframe_context.get_input_file()
        if self._mlEnv == "spark":
            score_data_path = self._dataframe_context.get_score_path() + "/data.csv"
            trained_model_path = "file://" + self._dataframe_context.get_model_path()
            trained_model_path += "/model"
            pipeline_path = "/".join(trained_model_path.split("/")[:-1]) + "/pipeline"
            print("trained_model_path", trained_model_path)
            print("pipeline_path", pipeline_path)
            print("score_data_path", score_data_path)
            pipelineModel = MLUtils.load_pipeline(pipeline_path)
            trained_model = MLUtils.load_dtree_regresssion_pyspark_model(trained_model_path)
            df = self._data_frame
            indexed = pipelineModel.transform(df)
            transformed = trained_model.transform(indexed)
            if result_column in transformed.columns:
                transformed = transformed.withColumnRenamed(result_column, "originalLabel")
            transformed = transformed.withColumnRenamed("prediction", result_column)
            pandas_scored_df = transformed.select(list(set(self._data_frame.columns + [result_column]))).toPandas()
            if score_data_path.startswith("file"):
                score_data_path = score_data_path[7:]
            pandas_scored_df.to_csv(score_data_path, header=True, index=False)
            print("STARTING Measure ANALYSIS ...")
            columns_to_keep = []
            columns_to_drop = []
            columns_to_keep = self._dataframe_context.get_score_consider_columns()
            if len(columns_to_keep) > 0:
                columns_to_drop = list(set(df.columns) - set(columns_to_keep))
            else:
                columns_to_drop += ["predicted_probability"]
            columns_to_drop = [x for x in columns_to_drop if x in df.columns and x != result_column]
            print("columns_to_drop", columns_to_drop)
            spark_scored_df = transformed.select(list(set(columns_to_keep + [result_column])))
        elif self._mlEnv == "sklearn":
            CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "predictionStart", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
            score_data_path = self._dataframe_context.get_score_path() + "/data.csv"
            trained_model_path = "file://" + self._dataframe_context.get_model_path()
            trained_model_path += "/" + self._dataframe_context.get_model_for_scoring() + ".pkl"
            print("trained_model_path", trained_model_path)
            print("score_data_path", score_data_path)
            if trained_model_path.startswith("file"):
                trained_model_path = trained_model_path[7:]
            trained_model = joblib.load(trained_model_path)
            model_columns = self._dataframe_context.get_model_features()
            print("model_columns", model_columns)
            df = self._data_frame.toPandas()
            # pandas_df = MLUtils.factorize_columns(df, [x for x in categorical_columns if x != result_column])
            pandas_df = MLUtils.create_dummy_columns(df, [x for x in categorical_columns if x != result_column])
            pandas_df = MLUtils.fill_missing_columns(pandas_df, model_columns, result_column)
            if uid_col:
                pandas_df = pandas_df[[x for x in pandas_df.columns if x != uid_col]]
            y_score = trained_model.predict(pandas_df)
            scoreKpiArray = MLUtils.get_scored_data_summary(y_score)
            kpiCard = NormalCard()
            kpiCardData = [KpiData(data=x) for x in scoreKpiArray]
            kpiCard.set_card_data(kpiCardData)
            kpiCard.set_cente_alignment(True)
            print(CommonUtils.convert_python_object_to_json(kpiCard))
            self._result_setter.set_kpi_card_regression_score(kpiCard)
            pandas_df[result_column] = y_score
            df[result_column] = y_score
            df.to_csv(score_data_path, header=True, index=False)
            CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "predictionFinished", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
            print("STARTING Measure ANALYSIS ...")
            columns_to_keep = []
            columns_to_drop = []
            columns_to_keep = self._dataframe_context.get_score_consider_columns()
            if len(columns_to_keep) > 0:
                columns_to_drop = list(set(df.columns) - set(columns_to_keep))
            else:
                columns_to_drop += ["predicted_probability"]
            columns_to_drop = [x for x in columns_to_drop if x in df.columns and x != result_column]
            print("columns_to_drop", columns_to_drop)
            pandas_scored_df = df[list(set(columns_to_keep + [result_column]))]
            spark_scored_df = SQLctx.createDataFrame(pandas_scored_df)
            # spark_scored_df.write.csv(score_data_path+"/data",mode="overwrite",header=True)
        # TODO update metadata for the newly created dataframe
        self._dataframe_context.update_consider_columns(columns_to_keep)
        print(spark_scored_df.printSchema())
        df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context, self._metaParser)
        df_helper.set_params()
        df = df_helper.get_data_frame()
        # self._dataframe_context.set_dont_send_message(True)
        try:
            fs = time.time()
            descr_stats_obj = DescriptiveStatsScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark, self._prediction_narrative, scriptWeight=self._scriptWeightDict, analysisName="Descriptive analysis")
            descr_stats_obj.Run()
            print("DescriptiveStats Analysis Done in ", time.time() - fs, " seconds.")
        except Exception:
            print("Frequency Analysis Failed ")
        # try:
        #     fs = time.time()
        #     df_helper.fill_na_dimension_nulls()
        #     df = df_helper.get_data_frame()
        #     dt_reg = DecisionTreeRegressionScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark, self._prediction_narrative, self._metaParser, scriptWeight=self._scriptWeightDict, analysisName="Predictive modeling")
        #     dt_reg.Run()
        #     print("DecisionTrees Analysis Done in ", time.time() - fs, " seconds.")
        # except:
        #     print("DTREE FAILED")
        try:
            fs = time.time()
            two_way_obj = TwoWayAnovaScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark, self._prediction_narrative, self._metaParser, scriptWeight=self._scriptWeightDict, analysisName="Measure vs. Dimension")
            two_way_obj.Run()
            print("OneWayAnova Analysis Done in ", time.time() - fs, " seconds.")
        except Exception:
            print("Anova Analysis Failed")
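# --- Illustrative sketch (not part of the source) -------------------------------
# A minimal, self-contained sketch of how the sklearn Train branch above buckets
# MAPE values into range counts via pd.cut; GLOBALSETTINGS.MAPEBINS is unknown here
# and is replaced by an assumed example bin list.
import numpy as np
import pandas as pd

ASSUMED_MAPE_BINS = [0, 5, 15, 25, 50, 100]  # assumption: stand-in for GLOBALSETTINGS.MAPEBINS

def mape_range_counts(y_true, y_pred, bins=ASSUMED_MAPE_BINS):
    frame = pd.DataFrame({"actual": y_true, "prediction": y_pred})
    frame["mape"] = np.abs(frame["actual"] - frame["prediction"]) * 100 / frame["actual"]
    counts = pd.cut(frame["mape"], bins).value_counts().to_dict().items()
    stats = sorted(
        [{"count": cnt, "splitRange": (interval.left, interval.right)} for interval, cnt in counts],
        key=lambda d: d["splitRange"][0],
    )
    # same (index, {"count", "splitRange"}) shape the script stores as mapeStatsArr
    return [(str(idx), obj) for idx, obj in enumerate(stats)]

# Example: mape_range_counts([100, 200, 300], [95, 260, 310])
# ---------------------------------------------------------------------------------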
class GeneralizedLinearRegressionModelScript:
    def __init__(self, data_frame, df_helper, df_context, spark, prediction_narrative, result_setter, meta_parser):
        self._metaParser = meta_parser
        self._prediction_narrative = prediction_narrative
        self._result_setter = result_setter
        self._data_frame = data_frame
        self._dataframe_helper = df_helper
        self._dataframe_context = df_context
        self._spark = spark
        self._model_summary = MLModelSummary()
        self._score_summary = {}
        self._slug = GLOBALSETTINGS.MODEL_SLUG_MAPPING["generalizedlinearregression"]

    def Train(self):
        st_global = time.time()
        algosToRun = self._dataframe_context.get_algorithms_to_run()
        algoSetting = [x for x in algosToRun if x["algorithmSlug"] == GLOBALSETTINGS.MODEL_SLUG_MAPPING["generalizedlinearregression"]][0]
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()
        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})
        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns) - set(allDateCols))
        print(categorical_columns)
        result_column = self._dataframe_context.get_result_column()
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        numerical_columns = [x for x in numerical_columns if x != result_column]
        model_path = self._dataframe_context.get_model_path()
        if model_path.startswith("file"):
            model_path = model_path[7:]
        validationDict = self._dataframe_context.get_validation_dict()
        print("model_path", model_path)
        pipeline_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/pipeline/"
        model_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/model"
        pmml_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/modelPmml"
        df = self._data_frame
        pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns, categorical_columns, result_column, algoType="regression")
        pipelineModel = pipeline.fit(df)
        indexed = pipelineModel.transform(df)
        featureMapping = sorted((attr["idx"], attr["name"]) for attr in (chain(*indexed.schema["features"].metadata["ml_attr"]["attrs"].values())))
        # print(indexed.select([result_column, "features"]).show(5))
        MLUtils.save_pipeline_or_model(pipelineModel, pipeline_filepath)
        glinr = GeneralizedLinearRegression(labelCol=result_column, featuresCol='features', predictionCol="prediction")
        if validationDict["name"] == "kFold":
            defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT["value"]
            numFold = int(validationDict["value"])
            if numFold == 0:
                numFold = 3
            trainingData, validationData = indexed.randomSplit([defaultSplit, 1 - defaultSplit], seed=12345)
            paramGrid = ParamGridBuilder() \
                .addGrid(glinr.regParam, [0.1, 0.01]) \
                .addGrid(glinr.fitIntercept, [False, True]) \
                .build()
            crossval = CrossValidator(estimator=glinr,
                                      estimatorParamMaps=paramGrid,
                                      evaluator=RegressionEvaluator(predictionCol="prediction", labelCol=result_column),
                                      numFolds=numFold)
            st = time.time()
            cvModel = crossval.fit(indexed)
            trainingTime = time.time() - st
            print("cvModel training takes", trainingTime)
            bestModel = cvModel.bestModel
        elif validationDict["name"] == "trainAndtest":
            trainingData, validationData = indexed.randomSplit([float(validationDict["value"]), 1 - float(validationDict["value"])], seed=12345)
            st = time.time()
            fit = glinr.fit(trainingData)
            trainingTime = time.time() - st
            print("time to train", trainingTime)
            bestModel = fit
        print(bestModel.explainParams())
        print(bestModel.extractParamMap())
        print(bestModel.params)
        print('Best Param (regParam): ', bestModel._java_obj.getRegParam())
        print('Best Param (MaxIter): ', bestModel._java_obj.getMaxIter())
        # modelPmmlPipeline = PMMLPipeline([
        #     ("pretrained-estimator", objs["trained_model"])
        # ])
        # try:
        #     modelPmmlPipeline.target_field = result_column
        #     modelPmmlPipeline.active_fields = np.array([col for col in x_train.columns if col != result_column])
        #     sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True)
        #     pmmlfile = open(pmml_filepath, "r")
        #     pmmlText = pmmlfile.read()
        #     pmmlfile.close()
        #     self._result_setter.update_pmml_object({self._slug: pmmlText})
        # except:
        #     pass
        coefficientsArray = [(name, bestModel.coefficients[idx]) for idx, name in featureMapping]
        MLUtils.save_pipeline_or_model(bestModel, model_filepath)
        transformed = bestModel.transform(validationData)
        transformed = transformed.withColumn(result_column, transformed[result_column].cast(DoubleType()))
        transformed = transformed.select([result_column, "prediction", transformed[result_column] - transformed["prediction"]])
        transformed = transformed.withColumnRenamed(transformed.columns[-1], "difference")
        transformed = transformed.select([result_column, "prediction", "difference", FN.abs(transformed["difference"]) * 100 / transformed[result_column]])
        transformed = transformed.withColumnRenamed(transformed.columns[-1], "mape")
        sampleData = None
        nrows = transformed.count()
        if nrows > 100:
            sampleData = transformed.sample(False, float(100) / nrows, seed=420)
        else:
            sampleData = transformed
        print(sampleData.show())
        evaluator = RegressionEvaluator(predictionCol="prediction", labelCol=result_column)
        metrics = {}
        metrics["r2"] = evaluator.evaluate(transformed, {evaluator.metricName: "r2"})
        metrics["rmse"] = evaluator.evaluate(transformed, {evaluator.metricName: "rmse"})
        metrics["mse"] = evaluator.evaluate(transformed, {evaluator.metricName: "mse"})
        metrics["mae"] = evaluator.evaluate(transformed, {evaluator.metricName: "mae"})
        runtime = round((time.time() - st_global), 2)
        mapeDf = transformed.select("mape")
        mapeStats = MLUtils.get_mape_stats(mapeDf, "mape")
        mapeStatsArr = list(mapeStats.items())
        mapeStatsArr = sorted(mapeStatsArr, key=lambda x: int(x[0]))
        quantileDf = transformed.select("prediction")
        quantileSummaryDict = MLUtils.get_quantile_summary(quantileDf, "prediction")
        quantileSummaryArr = list(quantileSummaryDict.items())
        quantileSummaryArr = sorted(quantileSummaryArr, key=lambda x: int(x[0]))
        self._model_summary.set_model_type("regression")
        self._model_summary.set_algorithm_name("Generalized Linear Regression")
        self._model_summary.set_algorithm_display_name("Generalized Linear Regression")
        self._model_summary.set_slug(self._slug)
        self._model_summary.set_training_time(runtime)
        self._model_summary.set_training_time(trainingTime)
        self._model_summary.set_target_variable(result_column)
        self._model_summary.set_validation_method(validationDict["displayName"])
        self._model_summary.set_model_evaluation_metrics(metrics)
        # NOTE: the original passed bestEstimator.get_params(), but bestEstimator is never
        # defined in this method; reading the params off bestModel instead.
        self._model_summary.set_model_params({param.name: value for param, value in bestModel.extractParamMap().items()})
        self._model_summary.set_quantile_summary(quantileSummaryArr)
        self._model_summary.set_mape_stats(mapeStatsArr)
        self._model_summary.set_sample_data(sampleData.toPandas().to_dict())
        self._model_summary.set_coefficinets_array(coefficientsArray)
        # NOTE: the original used list(x_train.columns), but x_train does not exist in this
        # PySpark-only method; deriving the feature list from featureMapping instead.
        self._model_summary.set_feature_list([name for idx, name in featureMapping])
        # print(CommonUtils.convert_python_object_to_json(self._model_summary))
        modelSummaryJson = {
            "dropdown": {
                "name": self._model_summary.get_algorithm_name(),
                "accuracy": CommonUtils.round_sig(self._model_summary.get_model_evaluation_metrics()["r2"]),
                "slug": self._model_summary.get_slug()
            },
            "levelcount": self._model_summary.get_level_counts(),
            "modelFeatureList": self._model_summary.get_feature_list(),
            "levelMapping": self._model_summary.get_level_map_dict()
        }
        glinrCards = [json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_summary_cards(self._model_summary)]
        for card in glinrCards:
            self._prediction_narrative.add_a_card(card)
        self._result_setter.set_model_summary({"generalizedlinearregression": json.loads(CommonUtils.convert_python_object_to_json(self._model_summary))})
        self._result_setter.set_generalized_linear_regression_model_summary(modelSummaryJson)
        self._result_setter.set_glinr_cards(glinrCards)

    def Predict(self):
        self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight()
        self._scriptStages = {
            "initialization": {
                "summary": "Initialized the Generalized Linear Regression Scripts",
                "weight": 2
            },
            "predictionStart": {
                "summary": "Generalized Linear Regression Model Prediction Started",
                "weight": 2
            },
            "predictionFinished": {
                "summary": "Generalized Linear Regression Model Prediction Finished",
                "weight": 6
            }
        }
        CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
        SQLctx = SQLContext(sparkContext=self._spark.sparkContext, sparkSession=self._spark)
        dataSanity = True
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()
        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})
        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns) - set(allDateCols))
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        result_column = self._dataframe_context.get_result_column()
        test_data_path = self._dataframe_context.get_input_file()
        CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "predictionStart", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
        test_data_path = self._dataframe_context.get_input_file()
        score_data_path = self._dataframe_context.get_score_path() + "/data.csv"
        trained_model_path = "file://" + self._dataframe_context.get_model_path()
        trained_model_path += "/model"
        pipeline_path = "/".join(trained_model_path.split("/")[:-1]) + "/pipeline"
        print("trained_model_path", trained_model_path)
        print("pipeline_path", pipeline_path)
        print("score_data_path", score_data_path)
        pipelineModel = MLUtils.load_pipeline(pipeline_path)
        trained_model = MLUtils.load_generalized_linear_regresssion_pyspark_model(trained_model_path)
        df = self._data_frame
        indexed = pipelineModel.transform(df)
        transformed = trained_model.transform(indexed)
        if result_column in transformed.columns:
            transformed = transformed.withColumnRenamed(result_column, "originalLabel")
        transformed = transformed.withColumnRenamed("prediction", result_column)
        pandas_scored_df = transformed.select(list(set(self._data_frame.columns + [result_column]))).toPandas()
        if score_data_path.startswith("file"):
            score_data_path = score_data_path[7:]
        pandas_scored_df.to_csv(score_data_path, header=True, index=False)
        CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "predictionFinished", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
        print("STARTING Measure ANALYSIS ...")
        columns_to_keep = []
        columns_to_drop = []
        columns_to_keep = self._dataframe_context.get_score_consider_columns()
        if len(columns_to_keep) > 0:
            columns_to_drop = list(set(df.columns) - set(columns_to_keep))
        else:
            columns_to_drop += ["predicted_probability"]
        columns_to_drop = [x for x in columns_to_drop if x in df.columns and x != result_column]
        print("columns_to_drop", columns_to_drop)
        spark_scored_df = transformed.select(list(set(columns_to_keep + [result_column])))
        df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context, self._metaParser)
        df_helper.set_params()
        df = df_helper.get_data_frame()
        # self._dataframe_context.set_dont_send_message(True)
        try:
            fs = time.time()
            descr_stats_obj = DescriptiveStatsScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark, self._prediction_narrative, scriptWeight=self._scriptWeightDict, analysisName="Descriptive analysis")
            descr_stats_obj.Run()
            print("DescriptiveStats Analysis Done in ", time.time() - fs, " seconds.")
        except Exception:
            print("Frequency Analysis Failed ")
        try:
            fs = time.time()
            df_helper.fill_na_dimension_nulls()
            df = df_helper.get_data_frame()
            dt_reg = DecisionTreeRegressionScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark, self._prediction_narrative, self._metaParser, scriptWeight=self._scriptWeightDict, analysisName="Predictive modeling")
            dt_reg.Run()
            print("DecisionTrees Analysis Done in ", time.time() - fs, " seconds.")
        except Exception:
            print("DTREE FAILED")
        try:
            fs = time.time()
            two_way_obj = TwoWayAnovaScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark, self._prediction_narrative, self._metaParser, scriptWeight=self._scriptWeightDict, analysisName="Measure vs. Dimension")
            two_way_obj.Run()
            print("OneWayAnova Analysis Done in ", time.time() - fs, " seconds.")
        except Exception:
            print("Anova Analysis Failed")
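# --- Illustrative sketch (not part of the source) -------------------------------
# A minimal sketch of the k-fold pattern the PySpark branches above rely on
# (ParamGridBuilder + CrossValidator + RegressionEvaluator). Assumptions: a running
# SparkSession and a DataFrame that already carries an assembled "features" vector
# column plus a numeric label column; function and argument names are hypothetical.
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

def fit_glr_with_kfold(indexed_df, label_col, num_folds=3):
    glr = GeneralizedLinearRegression(labelCol=label_col, featuresCol="features", predictionCol="prediction")
    # same grid the GLR Train method builds: regularization strength and intercept flag
    param_grid = (ParamGridBuilder()
                  .addGrid(glr.regParam, [0.1, 0.01])
                  .addGrid(glr.fitIntercept, [False, True])
                  .build())
    evaluator = RegressionEvaluator(predictionCol="prediction", labelCol=label_col)
    crossval = CrossValidator(estimator=glr, estimatorParamMaps=param_grid,
                              evaluator=evaluator, numFolds=num_folds)
    # CrossValidator refits the best parameter combination on the full input
    return crossval.fit(indexed_df).bestModel
# ---------------------------------------------------------------------------------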
class SupportVectorMachineScript(object): def __init__(self, data_frame, df_helper, df_context, spark, prediction_narrative, result_setter, meta_parser, mlEnvironment="sklearn"): self._metaParser = meta_parser self._prediction_narrative = prediction_narrative self._result_setter = result_setter self._data_frame = data_frame self._dataframe_helper = df_helper self._dataframe_context = df_context self._ignoreMsg = self._dataframe_context.get_message_ignore() self._spark = spark self._model_summary = { "confusion_matrix": {}, "precision_recall_stats": {}, "FrequencySummary": {}, "ChiSquare": {} } self._score_summary = {} self._model_slug_map = GLOBALSETTINGS.MODEL_SLUG_MAPPING self._slug = self._model_slug_map["svm"] self._targetLevel = self._dataframe_context.get_target_level_for_model( ) self._completionStatus = self._dataframe_context.get_completion_status( ) print(self._completionStatus, "initial completion status") self._analysisName = self._slug self._messageURL = self._dataframe_context.get_message_url() self._scriptWeightDict = self._dataframe_context.get_ml_model_training_weight( ) self._mlEnv = mlEnvironment self._scriptStages = { "initialization": { "summary": "Initialized the SVM Scripts", "weight": 4 }, "training": { "summary": "SVM Model Training Started", "weight": 2 }, "completion": { "summary": "SVM Model Training Finished", "weight": 4 }, } def Train(self): st_global = time.time() CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") algosToRun = self._dataframe_context.get_algorithms_to_run() algoSetting = [ x for x in algosToRun if x.get_algorithm_slug() == self._slug ][0] categorical_columns = self._dataframe_helper.get_string_columns() uid_col = self._dataframe_context.get_uid_column() if self._metaParser.check_column_isin_ignored_suggestion(uid_col): categorical_columns = list(set(categorical_columns) - {uid_col}) allDateCols = self._dataframe_context.get_date_columns() categorical_columns = list(set(categorical_columns) - set(allDateCols)) print(categorical_columns) numerical_columns = self._dataframe_helper.get_numeric_columns() result_column = self._dataframe_context.get_result_column() model_path = self._dataframe_context.get_model_path() if model_path.startswith("file"): model_path = model_path[7:] validationDict = self._dataframe_context.get_validation_dict() print("model_path", model_path) pipeline_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/pipeline/" model_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/model" pmml_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/modelPmml" df = self._data_frame if self._mlEnv == "spark": pass elif self._mlEnv == "sklearn": model_filepath = model_path + "/" + self._slug + "/model.pkl" pmml_filepath = str(model_path) + "/" + str( self._slug) + "/traindeModel.pmml" x_train, x_test, y_train, y_test = self._dataframe_helper.get_train_test_data( ) x_train = MLUtils.create_dummy_columns( x_train, [x for x in categorical_columns if x != result_column]) x_test = MLUtils.create_dummy_columns( x_test, [x for x in categorical_columns if x != result_column]) x_test = MLUtils.fill_missing_columns(x_test, x_train.columns, result_column) CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "training", "info", display=True, 
emptyBin=False, customMsg=None, weightKey="total") st = time.time() levels = df[result_column].unique() clf = SVC(kernel='linear', probability=True) labelEncoder = preprocessing.LabelEncoder() labelEncoder.fit(np.concatenate([y_train, y_test])) y_train = pd.Series(labelEncoder.transform(y_train)) y_test = labelEncoder.transform(y_test) classes = labelEncoder.classes_ transformed = labelEncoder.transform(classes) labelMapping = dict(list(zip(transformed, classes))) inverseLabelMapping = dict(list(zip(classes, transformed))) posLabel = inverseLabelMapping[self._targetLevel] appType = self._dataframe_context.get_app_type() print(appType, labelMapping, inverseLabelMapping, posLabel, self._targetLevel) if algoSetting.is_hyperparameter_tuning_enabled(): hyperParamInitParam = algoSetting.get_hyperparameter_params() evaluationMetricDict = { "name": hyperParamInitParam["evaluationMetric"] } evaluationMetricDict[ "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] hyperParamAlgoName = algoSetting.get_hyperparameter_algo_name() params_grid = algoSetting.get_params_dict_hyperparameter() params_grid = { k: v for k, v in list(params_grid.items()) if k in clf.get_params() } print(params_grid) if hyperParamAlgoName == "gridsearchcv": clfGrid = GridSearchCV(clf, params_grid) gridParams = clfGrid.get_params() hyperParamInitParam = { k: v for k, v in list(hyperParamInitParam.items()) if k in gridParams } clfGrid.set_params(**hyperParamInitParam) #clfGrid.fit(x_train,y_train) grid_param = {} grid_param['params'] = ParameterGrid(params_grid) #bestEstimator = clfGrid.best_estimator_ modelFilepath = "/".join(model_filepath.split("/")[:-1]) sklearnHyperParameterResultObj = SklearnGridSearchResult( grid_param, clf, x_train, x_test, y_train, y_test, appType, modelFilepath, levels, posLabel, evaluationMetricDict) resultArray = sklearnHyperParameterResultObj.train_and_save_models( ) self._result_setter.set_hyper_parameter_results( self._slug, resultArray) self._result_setter.set_metadata_parallel_coordinates( self._slug, { "ignoreList": sklearnHyperParameterResultObj.get_ignore_list(), "hideColumns": sklearnHyperParameterResultObj.get_hide_columns(), "metricColName": sklearnHyperParameterResultObj. get_comparison_metric_colname(), "columnOrder": sklearnHyperParameterResultObj.get_keep_columns() }) elif hyperParamAlgoName == "randomsearchcv": clfRand = RandomizedSearchCV(clf, params_grid) clfRand.set_params(**hyperParamInitParam) bestEstimator = None else: evaluationMetricDict = { "name": GLOBALSETTINGS.CLASSIFICATION_MODEL_EVALUATION_METRIC } evaluationMetricDict[ "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] self._result_setter.set_hyper_parameter_results( self._slug, None) algoParams = algoSetting.get_params_dict() algoParams = { k: v for k, v in list(algoParams.items()) if k in list(clf.get_params().keys()) } clf.set_params(**algoParams) print("!" * 50) print(clf.get_params()) print("!" 
* 50) if validationDict["name"] == "kFold": defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT[ "value"] numFold = int(validationDict["value"]) if numFold == 0: numFold = 3 kFoldClass = SkleanrKFoldResult( numFold, clf, x_train, x_test, y_train, y_test, appType, levels, posLabel, evaluationMetricDict=evaluationMetricDict) kFoldClass.train_and_save_result() kFoldOutput = kFoldClass.get_kfold_result() bestEstimator = kFoldClass.get_best_estimator() elif validationDict["name"] == "trainAndtest": clf.fit(x_train, y_train) bestEstimator = clf # clf.fit(x_train, y_train) # bestEstimator = clf trainingTime = time.time() - st y_score = bestEstimator.predict(x_test) try: y_prob = bestEstimator.predict_proba(x_test) except: y_prob = [0] * len(y_score) # overall_precision_recall = MLUtils.calculate_overall_precision_recall(y_test,y_score,targetLevel = self._targetLevel) # print overall_precision_recall accuracy = metrics.accuracy_score(y_test, y_score) if len(levels) <= 2: precision = metrics.precision_score(y_test, y_score, pos_label=posLabel, average="binary") recall = metrics.recall_score(y_test, y_score, pos_label=posLabel, average="binary") auc = metrics.roc_auc_score(y_test, y_score) elif len(levels) > 2: precision = metrics.precision_score(y_test, y_score, pos_label=posLabel, average="macro") recall = metrics.recall_score(y_test, y_score, pos_label=posLabel, average="macro") # auc = metrics.roc_auc_score(y_test,y_score,average="weighted") auc = None y_score = labelEncoder.inverse_transform(y_score) y_test = labelEncoder.inverse_transform(y_test) featureImportance = {} feature_importance = dict( sorted(zip(x_train.columns, bestEstimator.feature_importances_), key=lambda x: x[1], reverse=True)) for k, v in feature_importance.items(): feature_importance[k] = CommonUtils.round_sig(v) objs = { "trained_model": bestEstimator, "actual": y_test, "predicted": y_score, "probability": y_prob, "feature_importance": feature_importance, "featureList": list(x_train.columns), "labelMapping": labelMapping } if not algoSetting.is_hyperparameter_tuning_enabled(): modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1" modelFilepathArr = model_filepath.split("/")[:-1] modelFilepathArr.append(modelName + ".pkl") joblib.dump(objs["trained_model"], "/".join(modelFilepathArr)) runtime = round((time.time() - st_global), 2) try: modelPmmlPipeline = PMMLPipeline([("pretrained-estimator", objs["trained_model"])]) modelPmmlPipeline.target_field = result_column modelPmmlPipeline.active_fields = np.array( [col for col in x_train.columns if col != result_column]) sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True) pmmlfile = open(pmml_filepath, "r") pmmlText = pmmlfile.read() pmmlfile.close() self._result_setter.update_pmml_object({self._slug: pmmlText}) except: pass cat_cols = list(set(categorical_columns) - {result_column}) overall_precision_recall = MLUtils.calculate_overall_precision_recall( objs["actual"], objs["predicted"], targetLevel=self._targetLevel) self._model_summary = MLModelSummary() self._model_summary.set_algorithm_name("Svm") self._model_summary.set_algorithm_display_name( "Support Vector Machine") self._model_summary.set_slug(self._slug) self._model_summary.set_training_time(runtime) self._model_summary.set_confusion_matrix( MLUtils.calculate_confusion_matrix(objs["actual"], objs["predicted"])) self._model_summary.set_feature_importance(objs["feature_importance"]) self._model_summary.set_feature_list(objs["featureList"]) self._model_summary.set_model_accuracy( 
round(metrics.accuracy_score(objs["actual"], objs["predicted"]), 2)) self._model_summary.set_training_time(round((time.time() - st), 2)) self._model_summary.set_precision_recall_stats( overall_precision_recall["classwise_stats"]) self._model_summary.set_model_precision( overall_precision_recall["precision"]) self._model_summary.set_model_recall( overall_precision_recall["recall"]) self._model_summary.set_target_variable(result_column) self._model_summary.set_prediction_split( overall_precision_recall["prediction_split"]) self._model_summary.set_validation_method("Train and Test") self._model_summary.set_level_map_dict(objs["labelMapping"]) # self._model_summary.set_model_features(list(set(x_train.columns)-set([result_column]))) self._model_summary.set_model_features( [col for col in x_train.columns if col != result_column]) self._model_summary.set_level_counts( self._metaParser.get_unique_level_dict( list(set(categorical_columns)))) self._model_summary.set_num_trees(100) self._model_summary.set_num_rules(300) if not algoSetting.is_hyperparameter_tuning_enabled(): modelDropDownObj = { "name": self._model_summary.get_algorithm_name(), "evaluationMetricValue": self._model_summary.get_model_accuracy(), "evaluationMetricName": "accuracy", "slug": self._model_summary.get_slug(), "Model Id": modelName } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } else: modelDropDownObj = { "name": self._model_summary.get_algorithm_name(), "evaluationMetricValue": resultArray[0]["Accuracy"], "evaluationMetricName": "accuracy", "slug": self._model_summary.get_slug(), "Model Id": resultArray[0]["Model Id"] } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } svmCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_summary_cards(self._model_summary) ] for card in svmCards: self._prediction_narrative.add_a_card(card) self._result_setter.set_model_summary({ "svm": json.loads( CommonUtils.convert_python_object_to_json(self._model_summary)) }) self._result_setter.set_svm_model_summary(modelSummaryJson) self._result_setter.set_rf_cards(svmCards) CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "completion", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") # DataWriter.write_dict_as_json(self._spark, {"modelSummary":json.dumps(self._model_summary)}, summary_filepath) # print self._model_summary # CommonUtils.write_to_file(summary_filepath,json.dumps({"modelSummary":self._model_summary})) def Predict(self): self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight( ) self._scriptStages = { "initialization": { "summary": "Initialized the Random Forest Scripts", "weight": 2 }, "prediction": { "summary": "Random Forest Model Prediction Finished", "weight": 2 }, "frequency": { "summary": "descriptive analysis finished", "weight": 2 }, "chisquare": { "summary": "chi Square analysis finished", "weight": 4 }, "completion": { 
"summary": "all analysis finished", "weight": 4 }, } self._completionStatus += old_div( self._scriptWeightDict[self._analysisName]["total"] * self._scriptStages["initialization"]["weight"], 10) progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\ "initialization",\ "info",\ self._scriptStages["initialization"]["summary"],\ self._completionStatus,\ self._completionStatus) CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=self._ignoreMsg) self._dataframe_context.update_completion_status( self._completionStatus) # Match with the level_counts and then clean the data dataSanity = True level_counts_train = self._dataframe_context.get_level_count_dict() cat_cols = self._dataframe_helper.get_string_columns() # level_counts_score = CommonUtils.get_level_count_dict(self._data_frame,cat_cols,self._dataframe_context.get_column_separator(),output_type="dict") # if level_counts_train != {}: # for key in level_counts_train: # if key in level_counts_score: # if level_counts_train[key] != level_counts_score[key]: # dataSanity = False # else: # dataSanity = False categorical_columns = self._dataframe_helper.get_string_columns() uid_col = self._dataframe_context.get_uid_column() if self._metaParser.check_column_isin_ignored_suggestion(uid_col): categorical_columns = list(set(categorical_columns) - {uid_col}) allDateCols = self._dataframe_context.get_date_columns() categorical_columns = list(set(categorical_columns) - set(allDateCols)) numerical_columns = self._dataframe_helper.get_numeric_columns() result_column = self._dataframe_context.get_result_column() test_data_path = self._dataframe_context.get_input_file() if self._mlEnv == "spark": pass elif self._mlEnv == "sklearn": score_data_path = self._dataframe_context.get_score_path( ) + "/data.csv" if score_data_path.startswith("file"): score_data_path = score_data_path[7:] trained_model_path = self._dataframe_context.get_model_path() trained_model_path += "/" + self._dataframe_context.get_model_for_scoring( ) + ".pkl" if trained_model_path.startswith("file"): trained_model_path = trained_model_path[7:] score_summary_path = self._dataframe_context.get_score_path( ) + "/Summary/summary.json" if score_summary_path.startswith("file"): score_summary_path = score_summary_path[7:] trained_model = joblib.load(trained_model_path) # pandas_df = self._data_frame.toPandas() df = self._data_frame.toPandas() model_columns = self._dataframe_context.get_model_features() pandas_df = MLUtils.create_dummy_columns( df, [x for x in categorical_columns if x != result_column]) pandas_df = MLUtils.fill_missing_columns(pandas_df, model_columns, result_column) if uid_col: pandas_df = pandas_df[[ x for x in pandas_df.columns if x != uid_col ]] y_score = trained_model.predict(pandas_df) y_prob = trained_model.predict_proba(pandas_df) y_prob = MLUtils.calculate_predicted_probability(y_prob) y_prob = list([round(x, 2) for x in y_prob]) score = { "predicted_class": y_score, "predicted_probability": y_prob } df["predicted_class"] = score["predicted_class"] labelMappingDict = self._dataframe_context.get_label_map() df["predicted_class"] = df["predicted_class"].apply( lambda x: labelMappingDict[x] if x != None else "NA") df["predicted_probability"] = score["predicted_probability"] self._score_summary[ "prediction_split"] = MLUtils.calculate_scored_probability_stats( df) self._score_summary["result_column"] = result_column if result_column in df.columns: df.drop(result_column, axis=1, inplace=True) df = df.rename(index=str, 
columns={"predicted_class": result_column}) df.to_csv(score_data_path, header=True, index=False) uidCol = self._dataframe_context.get_uid_column() if uidCol == None: uidCols = self._metaParser.get_suggested_uid_columns() if len(uidCols) > 0: uidCol = uidCols[0] uidTableData = [] predictedClasses = list(df[result_column].unique()) if uidCol: if uidCol in df.columns: for level in predictedClasses: levelDf = df[df[result_column] == level] levelDf = levelDf[[ uidCol, "predicted_probability", result_column ]] levelDf.sort_values(by="predicted_probability", ascending=False, inplace=True) levelDf["predicted_probability"] = levelDf[ "predicted_probability"].apply( lambda x: humanize.apnumber(x * 100) + "%" if x * 100 >= 10 else str(int(x * 100)) + "%") uidTableData.append(levelDf[:5]) uidTableData = pd.concat(uidTableData) uidTableData = [list(arr) for arr in list(uidTableData.values)] uidTableData = [[uidCol, "Probability", result_column] ] + uidTableData uidTable = TableData() uidTable.set_table_width(25) uidTable.set_table_data(uidTableData) uidTable.set_table_type("normalHideColumn") self._result_setter.set_unique_identifier_table( json.loads( CommonUtils.convert_python_object_to_json(uidTable))) self._completionStatus += old_div( self._scriptWeightDict[self._analysisName]["total"] * self._scriptStages["prediction"]["weight"], 10) progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\ "prediction",\ "info",\ self._scriptStages["prediction"]["summary"],\ self._completionStatus,\ self._completionStatus) CommonUtils.save_progress_message(self._messageURL, progressMessage, ignore=self._ignoreMsg) self._dataframe_context.update_completion_status( self._completionStatus) # CommonUtils.write_to_file(score_summary_path,json.dumps({"scoreSummary":self._score_summary})) print("STARTING DIMENSION ANALYSIS ...") columns_to_keep = [] columns_to_drop = [] # considercolumnstype = self._dataframe_context.get_score_consider_columns_type() # considercolumns = self._dataframe_context.get_score_consider_columns() # if considercolumnstype != None: # if considercolumns != None: # if considercolumnstype == ["excluding"]: # columns_to_drop = considercolumns # elif considercolumnstype == ["including"]: # columns_to_keep = considercolumns columns_to_keep = self._dataframe_context.get_score_consider_columns() if len(columns_to_keep) > 0: columns_to_drop = list(set(df.columns) - set(columns_to_keep)) else: columns_to_drop += ["predicted_probability"] columns_to_drop = [ x for x in columns_to_drop if x in df.columns and x != result_column ] print("columns_to_drop", columns_to_drop) df.drop(columns_to_drop, axis=1, inplace=True) resultColLevelCount = dict(df[result_column].value_counts()) # self._metaParser.update_level_counts(result_column,resultColLevelCount) self._metaParser.update_column_dict( result_column, { "LevelCount": resultColLevelCount, "numberOfUniqueValues": len(list(resultColLevelCount.keys())) }) self._dataframe_context.set_story_on_scored_data(True) SQLctx = SQLContext(sparkContext=self._spark.sparkContext, sparkSession=self._spark) spark_scored_df = SQLctx.createDataFrame(df) # spark_scored_df.write.csv(score_data_path+"/data",mode="overwrite",header=True) # TODO update metadata for the newly created dataframe self._dataframe_context.update_consider_columns(columns_to_keep) df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context, self._metaParser) df_helper.set_params() spark_scored_df = df_helper.get_data_frame() # try: # fs = time.time() # narratives_file = 
self._dataframe_context.get_score_path()+"/narratives/FreqDimension/data.json" # if narratives_file.startswith("file"): # narratives_file = narratives_file[7:] # result_file = self._dataframe_context.get_score_path()+"/results/FreqDimension/data.json" # if result_file.startswith("file"): # result_file = result_file[7:] # init_freq_dim = FreqDimensions(df, df_helper, self._dataframe_context,scriptWeight=self._scriptWeightDict,analysisName=self._analysisName) # df_freq_dimension_obj = init_freq_dim.test_all(dimension_columns=[result_column]) # df_freq_dimension_result = CommonUtils.as_dict(df_freq_dimension_obj) # narratives_obj = DimensionColumnNarrative(result_column, df_helper, self._dataframe_context, df_freq_dimension_obj,self._result_setter,self._prediction_narrative,scriptWeight=self._scriptWeightDict,analysisName=self._analysisName) # narratives = CommonUtils.as_dict(narratives_obj) # # print "Frequency Analysis Done in ", time.time() - fs, " seconds." # self._completionStatus += self._scriptWeightDict[self._analysisName]["total"]*self._scriptStages["frequency"]["weight"]/10 # progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\ # "frequency",\ # "info",\ # self._scriptStages["frequency"]["summary"],\ # self._completionStatus,\ # self._completionStatus) # CommonUtils.save_progress_message(self._messageURL,progressMessage,ignore=self._ignoreMsg) # self._dataframe_context.update_completion_status(self._completionStatus) # print "Frequency ",self._completionStatus # except: # print "Frequency Analysis Failed " # # try: # fs = time.time() # narratives_file = self._dataframe_context.get_score_path()+"/narratives/ChiSquare/data.json" # if narratives_file.startswith("file"): # narratives_file = narratives_file[7:] # result_file = self._dataframe_context.get_score_path()+"/results/ChiSquare/data.json" # if result_file.startswith("file"): # result_file = result_file[7:] # init_chisquare_obj = ChiSquare(df, df_helper, self._dataframe_context,scriptWeight=self._scriptWeightDict,analysisName=self._analysisName) # df_chisquare_obj = init_chisquare_obj.test_all(dimension_columns= [result_column]) # df_chisquare_result = CommonUtils.as_dict(df_chisquare_obj) # chisquare_narratives = CommonUtils.as_dict(ChiSquareNarratives(df_helper, df_chisquare_obj, self._dataframe_context,df,self._prediction_narrative,self._result_setter,scriptWeight=self._scriptWeightDict,analysisName=self._analysisName)) # except: # print "ChiSquare Analysis Failed " if len(predictedClasses) >= 2: try: fs = time.time() df_decision_tree_obj = DecisionTrees( spark_scored_df, df_helper, self._dataframe_context, self._spark, self._metaParser, scriptWeight=self._scriptWeightDict, analysisName=self._analysisName).test_all( dimension_columns=[result_column]) narratives_obj = CommonUtils.as_dict( DecisionTreeNarrative(result_column, df_decision_tree_obj, self._dataframe_helper, self._dataframe_context, self._metaParser, self._result_setter, story_narrative=None, analysisName=self._analysisName, scriptWeight=self._scriptWeightDict)) print(narratives_obj) except: print("DecisionTree Analysis Failed ") else: data_dict = { "npred": len(predictedClasses), "nactual": len(list(labelMappingDict.values())) } levelCountDict = {} if data_dict["nactual"] > 2: levelCountDict[predictedClasses[0]] = resultColLevelCount[ predictedClasses[0]] levelCountDict["Others"] = sum([ v for k, v in list(resultColLevelCount.items()) if k != predictedClasses[0] ]) else: levelCountDict = resultColLevelCount otherClass = list( set(labelMappingDict.values())
- set(predictedClasses))[0] levelCountDict[otherClass] = 0 print(levelCountDict) total = float( sum([x for x in list(levelCountDict.values()) if x != None])) levelCountTuple = [({ "name": k, "count": v, "percentage": humanize.apnumber(old_div(v * 100, total)) + "%" if old_div(v * 100, total) >= 10 else str(int(old_div(v * 100, total))) + "%" }) for k, v in list(levelCountDict.items()) if v != None] levelCountTuple = sorted(levelCountTuple, key=lambda x: x["count"], reverse=True) data_dict["blockSplitter"] = "|~NEWBLOCK~|" data_dict["targetcol"] = result_column data_dict["nlevel"] = len(list(levelCountDict.keys())) data_dict["topLevel"] = levelCountTuple[0] data_dict["secondLevel"] = levelCountTuple[1] maincardSummary = NarrativesUtils.get_template_output( "/apps/", 'scorewithoutdtree.html', data_dict) main_card = NormalCard() main_card_data = [] main_card_narrative = NarrativesUtils.block_splitter( maincardSummary, "|~NEWBLOCK~|") main_card_data += main_card_narrative chartData = NormalChartData([levelCountDict]).get_data() chartJson = ChartJson(data=chartData) chartJson.set_title(result_column) chartJson.set_chart_type("donut") mainCardChart = C3ChartData(data=chartJson) mainCardChart.set_width_percent(33) main_card_data.append(mainCardChart) uidTable = self._result_setter.get_unique_identifier_table() if uidTable != None: main_card_data.append(uidTable) main_card.set_card_data(main_card_data) main_card.set_card_name( "Predicting Key Drivers of {}".format(result_column)) self._result_setter.set_score_dtree_cards([main_card], {})
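# Illustrative sketch: Predict() above relies on MLUtils.create_dummy_columns and
# MLUtils.fill_missing_columns to make the scoring frame line up with the feature
# columns the pickled model was trained on. The helper below is a hypothetical,
# self-contained sketch of that alignment idea using plain pandas; it is not the
# MLUtils implementation.
import pandas as pd

def align_for_scoring(score_df, categorical_cols, model_columns):
    # One-hot encode the categorical columns, then reindex to the training
    # feature list so every expected dummy column exists: columns the model
    # expects but the scoring data lacks are filled with 0, and unseen extra
    # columns are dropped.
    encoded = pd.get_dummies(score_df, columns=categorical_cols)
    return encoded.reindex(columns=model_columns, fill_value=0)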
def Train(self): st_global = time.time() CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") algosToRun = self._dataframe_context.get_algorithms_to_run() algoSetting = [ x for x in algosToRun if x.get_algorithm_slug() == self._slug ][0] categorical_columns = self._dataframe_helper.get_string_columns() uid_col = self._dataframe_context.get_uid_column() if self._metaParser.check_column_isin_ignored_suggestion(uid_col): categorical_columns = list(set(categorical_columns) - {uid_col}) allDateCols = self._dataframe_context.get_date_columns() categorical_columns = list(set(categorical_columns) - set(allDateCols)) print(categorical_columns) numerical_columns = self._dataframe_helper.get_numeric_columns() result_column = self._dataframe_context.get_result_column() model_path = self._dataframe_context.get_model_path() if model_path.startswith("file"): model_path = model_path[7:] validationDict = self._dataframe_context.get_validation_dict() print("model_path", model_path) pipeline_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/pipeline/" model_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/model" pmml_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/modelPmml" df = self._data_frame if self._mlEnv == "spark": pass elif self._mlEnv == "sklearn": model_filepath = model_path + "/" + self._slug + "/model.pkl" pmml_filepath = str(model_path) + "/" + str( self._slug) + "/traindeModel.pmml" x_train, x_test, y_train, y_test = self._dataframe_helper.get_train_test_data( ) x_train = MLUtils.create_dummy_columns( x_train, [x for x in categorical_columns if x != result_column]) x_test = MLUtils.create_dummy_columns( x_test, [x for x in categorical_columns if x != result_column]) x_test = MLUtils.fill_missing_columns(x_test, x_train.columns, result_column) CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "training", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") st = time.time() levels = df[result_column].unique() clf = SVC(kernel='linear', probability=True) labelEncoder = preprocessing.LabelEncoder() labelEncoder.fit(np.concatenate([y_train, y_test])) y_train = pd.Series(labelEncoder.transform(y_train)) y_test = labelEncoder.transform(y_test) classes = labelEncoder.classes_ transformed = labelEncoder.transform(classes) labelMapping = dict(list(zip(transformed, classes))) inverseLabelMapping = dict(list(zip(classes, transformed))) posLabel = inverseLabelMapping[self._targetLevel] appType = self._dataframe_context.get_app_type() print(appType, labelMapping, inverseLabelMapping, posLabel, self._targetLevel) if algoSetting.is_hyperparameter_tuning_enabled(): hyperParamInitParam = algoSetting.get_hyperparameter_params() evaluationMetricDict = { "name": hyperParamInitParam["evaluationMetric"] } evaluationMetricDict[ "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] hyperParamAlgoName = algoSetting.get_hyperparameter_algo_name() params_grid = algoSetting.get_params_dict_hyperparameter() params_grid = { k: v for k, v in list(params_grid.items()) if k in clf.get_params() } print(params_grid) if hyperParamAlgoName == "gridsearchcv": clfGrid = GridSearchCV(clf, params_grid) gridParams = clfGrid.get_params() hyperParamInitParam = { k: 
v for k, v in list(hyperParamInitParam.items()) if k in gridParams } clfGrid.set_params(**hyperParamInitParam) #clfGrid.fit(x_train,y_train) grid_param = {} grid_param['params'] = ParameterGrid(params_grid) #bestEstimator = clfGrid.best_estimator_ modelFilepath = "/".join(model_filepath.split("/")[:-1]) sklearnHyperParameterResultObj = SklearnGridSearchResult( grid_param, clf, x_train, x_test, y_train, y_test, appType, modelFilepath, levels, posLabel, evaluationMetricDict) resultArray = sklearnHyperParameterResultObj.train_and_save_models( ) self._result_setter.set_hyper_parameter_results( self._slug, resultArray) self._result_setter.set_metadata_parallel_coordinates( self._slug, { "ignoreList": sklearnHyperParameterResultObj.get_ignore_list(), "hideColumns": sklearnHyperParameterResultObj.get_hide_columns(), "metricColName": sklearnHyperParameterResultObj. get_comparison_metric_colname(), "columnOrder": sklearnHyperParameterResultObj.get_keep_columns() }) elif hyperParamAlgoName == "randomsearchcv": clfRand = RandomizedSearchCV(clf, params_grid) clfRand.set_params(**hyperParamInitParam) bestEstimator = None else: evaluationMetricDict = { "name": GLOBALSETTINGS.CLASSIFICATION_MODEL_EVALUATION_METRIC } evaluationMetricDict[ "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] self._result_setter.set_hyper_parameter_results( self._slug, None) algoParams = algoSetting.get_params_dict() algoParams = { k: v for k, v in list(algoParams.items()) if k in list(clf.get_params().keys()) } clf.set_params(**algoParams) print("!" * 50) print(clf.get_params()) print("!" * 50) if validationDict["name"] == "kFold": defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT[ "value"] numFold = int(validationDict["value"]) if numFold == 0: numFold = 3 kFoldClass = SkleanrKFoldResult( numFold, clf, x_train, x_test, y_train, y_test, appType, levels, posLabel, evaluationMetricDict=evaluationMetricDict) kFoldClass.train_and_save_result() kFoldOutput = kFoldClass.get_kfold_result() bestEstimator = kFoldClass.get_best_estimator() elif validationDict["name"] == "trainAndtest": clf.fit(x_train, y_train) bestEstimator = clf # clf.fit(x_train, y_train) # bestEstimator = clf trainingTime = time.time() - st y_score = bestEstimator.predict(x_test) try: y_prob = bestEstimator.predict_proba(x_test) except: y_prob = [0] * len(y_score) # overall_precision_recall = MLUtils.calculate_overall_precision_recall(y_test,y_score,targetLevel = self._targetLevel) # print overall_precision_recall accuracy = metrics.accuracy_score(y_test, y_score) if len(levels) <= 2: precision = metrics.precision_score(y_test, y_score, pos_label=posLabel, average="binary") recall = metrics.recall_score(y_test, y_score, pos_label=posLabel, average="binary") auc = metrics.roc_auc_score(y_test, y_score) elif len(levels) > 2: precision = metrics.precision_score(y_test, y_score, pos_label=posLabel, average="macro") recall = metrics.recall_score(y_test, y_score, pos_label=posLabel, average="macro") # auc = metrics.roc_auc_score(y_test,y_score,average="weighted") auc = None y_score = labelEncoder.inverse_transform(y_score) y_test = labelEncoder.inverse_transform(y_test) featureImportance = {} feature_importance = dict( sorted(zip(x_train.columns, bestEstimator.feature_importances_), key=lambda x: x[1], reverse=True)) for k, v in feature_importance.items(): feature_importance[k] = CommonUtils.round_sig(v) objs = { "trained_model": bestEstimator, "actual": y_test, "predicted": y_score, "probability": y_prob, 
"feature_importance": feature_importance, "featureList": list(x_train.columns), "labelMapping": labelMapping } if not algoSetting.is_hyperparameter_tuning_enabled(): modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1" modelFilepathArr = model_filepath.split("/")[:-1] modelFilepathArr.append(modelName + ".pkl") joblib.dump(objs["trained_model"], "/".join(modelFilepathArr)) runtime = round((time.time() - st_global), 2) try: modelPmmlPipeline = PMMLPipeline([("pretrained-estimator", objs["trained_model"])]) modelPmmlPipeline.target_field = result_column modelPmmlPipeline.active_fields = np.array( [col for col in x_train.columns if col != result_column]) sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True) pmmlfile = open(pmml_filepath, "r") pmmlText = pmmlfile.read() pmmlfile.close() self._result_setter.update_pmml_object({self._slug: pmmlText}) except: pass cat_cols = list(set(categorical_columns) - {result_column}) overall_precision_recall = MLUtils.calculate_overall_precision_recall( objs["actual"], objs["predicted"], targetLevel=self._targetLevel) self._model_summary = MLModelSummary() self._model_summary.set_algorithm_name("Svm") self._model_summary.set_algorithm_display_name( "Support Vector Machine") self._model_summary.set_slug(self._slug) self._model_summary.set_training_time(runtime) self._model_summary.set_confusion_matrix( MLUtils.calculate_confusion_matrix(objs["actual"], objs["predicted"])) self._model_summary.set_feature_importance(objs["feature_importance"]) self._model_summary.set_feature_list(objs["featureList"]) self._model_summary.set_model_accuracy( round(metrics.accuracy_score(objs["actual"], objs["predicted"]), 2)) self._model_summary.set_training_time(round((time.time() - st), 2)) self._model_summary.set_precision_recall_stats( overall_precision_recall["classwise_stats"]) self._model_summary.set_model_precision( overall_precision_recall["precision"]) self._model_summary.set_model_recall( overall_precision_recall["recall"]) self._model_summary.set_target_variable(result_column) self._model_summary.set_prediction_split( overall_precision_recall["prediction_split"]) self._model_summary.set_validation_method("Train and Test") self._model_summary.set_level_map_dict(objs["labelMapping"]) # self._model_summary.set_model_features(list(set(x_train.columns)-set([result_column]))) self._model_summary.set_model_features( [col for col in x_train.columns if col != result_column]) self._model_summary.set_level_counts( self._metaParser.get_unique_level_dict( list(set(categorical_columns)))) self._model_summary.set_num_trees(100) self._model_summary.set_num_rules(300) if not algoSetting.is_hyperparameter_tuning_enabled(): modelDropDownObj = { "name": self._model_summary.get_algorithm_name(), "evaluationMetricValue": self._model_summary.get_model_accuracy(), "evaluationMetricName": "accuracy", "slug": self._model_summary.get_slug(), "Model Id": modelName } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } else: modelDropDownObj = { "name": self._model_summary.get_algorithm_name(), "evaluationMetricValue": resultArray[0]["Accuracy"], "evaluationMetricName": "accuracy", "slug": self._model_summary.get_slug(), "Model Id": resultArray[0]["Model Id"] } modelSummaryJson = { "dropdown": modelDropDownObj, 
"levelcount": self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } svmCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_summary_cards(self._model_summary) ] for card in svmCards: self._prediction_narrative.add_a_card(card) self._result_setter.set_model_summary({ "svm": json.loads( CommonUtils.convert_python_object_to_json(self._model_summary)) }) self._result_setter.set_svm_model_summary(modelSummaryJson) self._result_setter.set_rf_cards(svmCards) CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "completion", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
class NaiveBayesPysparkScript(object): def __init__(self, data_frame, df_helper, df_context, spark, prediction_narrative, result_setter, meta_parser, mlEnvironment="pyspark"): self._metaParser = meta_parser self._prediction_narrative = prediction_narrative self._result_setter = result_setter self._data_frame = data_frame self._dataframe_helper = df_helper self._dataframe_context = df_context self._ignoreMsg = self._dataframe_context.get_message_ignore() self._spark = spark self._model_summary = MLModelSummary() self._score_summary = {} self._slug = GLOBALSETTINGS.MODEL_SLUG_MAPPING["naive bayes"] self._datasetName = CommonUtils.get_dataset_name( self._dataframe_context.CSV_FILE) self._targetLevel = self._dataframe_context.get_target_level_for_model( ) self._targetLevel = self._dataframe_context.get_target_level_for_model( ) self._completionStatus = self._dataframe_context.get_completion_status( ) print(self._completionStatus, "initial completion status") self._analysisName = self._slug self._messageURL = self._dataframe_context.get_message_url() self._scriptWeightDict = self._dataframe_context.get_ml_model_training_weight( ) self._mlEnv = mlEnvironment # self._classifier = "nb" self._scriptStages = { "initialization": { "summary": "Initialized the Naive Bayes Scripts", "weight": 4 }, "training": { "summary": "Naive Bayes Model Training Started", "weight": 2 }, "completion": { "summary": "Naive Bayes Model Training Finished", "weight": 4 }, } def Train(self): st_global = time.time() CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") algosToRun = self._dataframe_context.get_algorithms_to_run() algoSetting = [ x for x in algosToRun if x.get_algorithm_slug() == self._slug ][0] categorical_columns = self._dataframe_helper.get_string_columns() uid_col = self._dataframe_context.get_uid_column() if self._metaParser.check_column_isin_ignored_suggestion(uid_col): categorical_columns = list(set(categorical_columns) - {uid_col}) allDateCols = self._dataframe_context.get_date_columns() categorical_columns = list(set(categorical_columns) - set(allDateCols)) numerical_columns = self._dataframe_helper.get_numeric_columns() result_column = self._dataframe_context.get_result_column() categorical_columns = [ x for x in categorical_columns if x != result_column ] appType = self._dataframe_context.get_app_type() model_path = self._dataframe_context.get_model_path() if model_path.startswith("file"): model_path = model_path[7:] validationDict = self._dataframe_context.get_validation_dict() print("model_path", model_path) pipeline_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/pipeline/" model_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/model" pmml_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/modelPmml" df = self._data_frame levels = df.select(result_column).distinct().count() appType = self._dataframe_context.get_app_type() model_filepath = model_path + "/" + self._slug + "/model" pmml_filepath = str(model_path) + "/" + str( self._slug) + "/traindeModel.pmml" CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "training", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") st = time.time() pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns, categorical_columns, 
result_column) trainingData, validationData = MLUtils.get_training_and_validation_data( df, result_column, 0.8) # indexed labelIndexer = StringIndexer(inputCol=result_column, outputCol="label") # OriginalTargetconverter = IndexToString(inputCol="label", outputCol="originalTargetColumn") # Label Mapping and Inverse labelIdx = labelIndexer.fit(trainingData) labelMapping = {k: v for k, v in enumerate(labelIdx.labels)} inverseLabelMapping = { v: float(k) for k, v in enumerate(labelIdx.labels) } if self._dataframe_context.get_trainerMode() == "autoML": automl_enable = True else: automl_enable = False clf = NaiveBayes() if not algoSetting.is_hyperparameter_tuning_enabled(): algoParams = algoSetting.get_params_dict() else: algoParams = algoSetting.get_params_dict_hyperparameter() print("=" * 100) print(algoParams) print("=" * 100) clfParams = [prm.name for prm in clf.params] algoParams = { getattr(clf, k): v if isinstance(v, list) else [v] for k, v in algoParams.items() if k in clfParams } #print("="*100) #print("ALGOPARAMS - ",algoParams) #print("="*100) paramGrid = ParamGridBuilder() # if not algoSetting.is_hyperparameter_tuning_enabled(): # for k,v in algoParams.items(): # if v == [None] * len(v): # continue # if k.name == 'thresholds': # paramGrid = paramGrid.addGrid(k,v[0]) # else: # paramGrid = paramGrid.addGrid(k,v) # paramGrid = paramGrid.build() # if not algoSetting.is_hyperparameter_tuning_enabled(): for k, v in algoParams.items(): print(k, v) if v == [None] * len(v): continue paramGrid = paramGrid.addGrid(k, v) paramGrid = paramGrid.build() # else: # for k,v in algoParams.items(): # print k.name, v # if v[0] == [None] * len(v[0]): # continue # paramGrid = paramGrid.addGrid(k,v[0]) # paramGrid = paramGrid.build() #print("="*143) #print("PARAMGRID - ", paramGrid) #print("="*143) if len(paramGrid) > 1: hyperParamInitParam = algoSetting.get_hyperparameter_params() evaluationMetricDict = { "name": hyperParamInitParam["evaluationMetric"] } evaluationMetricDict[ "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] else: evaluationMetricDict = { "name": GLOBALSETTINGS.CLASSIFICATION_MODEL_EVALUATION_METRIC } evaluationMetricDict[ "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] self._result_setter.set_hyper_parameter_results(self._slug, None) if validationDict["name"] == "kFold": numFold = int(validationDict["value"]) estimator = Pipeline(stages=[pipeline, labelIndexer, clf]) if algoSetting.is_hyperparameter_tuning_enabled(): modelFilepath = "/".join(model_filepath.split("/")[:-1]) pySparkHyperParameterResultObj = PySparkGridSearchResult( estimator, paramGrid, appType, modelFilepath, levels, evaluationMetricDict, trainingData, validationData, numFold, self._targetLevel, labelMapping, inverseLabelMapping, df) resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models( ) self._result_setter.set_hyper_parameter_results( self._slug, resultArray) self._result_setter.set_metadata_parallel_coordinates( self._slug, { "ignoreList": pySparkHyperParameterResultObj.get_ignore_list(), "hideColumns": pySparkHyperParameterResultObj.get_hide_columns(), "metricColName": pySparkHyperParameterResultObj. 
get_comparison_metric_colname(), "columnOrder": pySparkHyperParameterResultObj.get_keep_columns() }) bestModel = pySparkHyperParameterResultObj.getBestModel() prediction = pySparkHyperParameterResultObj.getBestPrediction() else: if automl_enable: paramGrid = (ParamGridBuilder().addGrid( clf.smoothing, [1.0, 0.2]).build()) crossval = CrossValidator( estimator=estimator, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator() if levels == 2 else MulticlassClassificationEvaluator(), numFolds=3 if numFold is None else numFold) # use 3+ folds in practice cvnb = crossval.fit(trainingData) prediction = cvnb.transform(validationData) bestModel = cvnb.bestModel else: train_test_ratio = float( self._dataframe_context.get_train_test_split()) estimator = Pipeline(stages=[pipeline, labelIndexer, clf]) if algoSetting.is_hyperparameter_tuning_enabled(): modelFilepath = "/".join(model_filepath.split("/")[:-1]) pySparkHyperParameterResultObj = PySparkTrainTestResult( estimator, paramGrid, appType, modelFilepath, levels, evaluationMetricDict, trainingData, validationData, train_test_ratio, self._targetLevel, labelMapping, inverseLabelMapping, df) resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models( ) self._result_setter.set_hyper_parameter_results( self._slug, resultArray) self._result_setter.set_metadata_parallel_coordinates( self._slug, { "ignoreList": pySparkHyperParameterResultObj.get_ignore_list(), "hideColumns": pySparkHyperParameterResultObj.get_hide_columns(), "metricColName": pySparkHyperParameterResultObj. get_comparison_metric_colname(), "columnOrder": pySparkHyperParameterResultObj.get_keep_columns() }) bestModel = pySparkHyperParameterResultObj.getBestModel() prediction = pySparkHyperParameterResultObj.getBestPrediction() else: tvs = TrainValidationSplit( estimator=estimator, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator() if levels == 2 else MulticlassClassificationEvaluator(), trainRatio=train_test_ratio) tvspnb = tvs.fit(trainingData) prediction = tvspnb.transform(validationData) bestModel = tvspnb.bestModel modelmanagement_ = { param[0].name: param[1] for param in bestModel.stages[2].extractParamMap().items() } MLUtils.save_pipeline_or_model(bestModel, model_filepath) predsAndLabels = prediction.select(['prediction', 'label']).rdd.map(tuple) # label_classes = prediction.select("label").distinct().collect() # label_classes = prediction.agg((F.collect_set('label').alias('label'))).first().asDict()['label'] #results = transformed.select(["prediction","label"]) # if len(label_classes) > 2: # metrics = MulticlassMetrics(predsAndLabels) # accuracy of the model # else: # metrics = BinaryClassificationMetrics(predsAndLabels) posLabel = inverseLabelMapping[self._targetLevel] metrics = MulticlassMetrics(predsAndLabels) trainingTime = time.time() - st f1_score = metrics.fMeasure(inverseLabelMapping[self._targetLevel], 1.0) precision = metrics.precision(inverseLabelMapping[self._targetLevel]) recall = metrics.recall(inverseLabelMapping[self._targetLevel]) accuracy = metrics.accuracy print(f1_score, precision, recall, accuracy) #gain chart implementation def cal_prob_eval(x): if len(x) == 1: if x == posLabel: return (float(x[1])) else: return (float(1 - x[1])) else: return (float(x[int(posLabel)])) column_name = 'probability' def y_prob_for_eval_udf(): return udf(lambda x: cal_prob_eval(x)) prediction = prediction.withColumn( "y_prob_for_eval", y_prob_for_eval_udf()(col(column_name))) try: pys_df = prediction.select( 
['y_prob_for_eval', 'prediction', 'label']) gain_lift_ks_obj = GainLiftKS(pys_df, 'y_prob_for_eval', 'prediction', 'label', posLabel, self._spark) gain_lift_KS_dataframe = gain_lift_ks_obj.Run().toPandas() except: try: temp_df = pys_df.toPandas() gain_lift_ks_obj = GainLiftKS(temp_df, 'y_prob_for_eval', 'prediction', 'label', posLabel, self._spark) gain_lift_KS_dataframe = gain_lift_ks_obj.Rank_Ordering() except: print("gain chant failed") gain_lift_KS_dataframe = None #feature_importance = MLUtils.calculate_sparkml_feature_importance(df, bestModel.stages[-1], categorical_columns, numerical_columns) act_list = prediction.select('label').collect() actual = [int(row.label) for row in act_list] pred_list = prediction.select('prediction').collect() predicted = [int(row.prediction) for row in pred_list] prob_list = prediction.select('probability').collect() probability = [list(row.probability) for row in prob_list] # objs = {"trained_model":bestModel,"actual":prediction.select('label'),"predicted":prediction.select('prediction'), # "probability":prediction.select('probability'),"feature_importance":None, # "featureList":list(categorical_columns) + list(numerical_columns),"labelMapping":labelMapping} objs = { "trained_model": bestModel, "actual": actual, "predicted": predicted, "probability": probability, "feature_importance": None, "featureList": list(categorical_columns) + list(numerical_columns), "labelMapping": labelMapping } conf_mat_ar = metrics.confusionMatrix().toArray() print(conf_mat_ar) confusion_matrix = {} for i in range(len(conf_mat_ar)): confusion_matrix[labelMapping[i]] = {} for j, val in enumerate(conf_mat_ar[i]): confusion_matrix[labelMapping[i]][labelMapping[j]] = val print(confusion_matrix) # accuracy of the model '''ROC CURVE IMPLEMENTATION''' y_prob = probability y_score = predicted y_test = actual logLoss = log_loss(y_test, y_prob) if levels <= 2: positive_label_probs = [] for val in y_prob: positive_label_probs.append(val[int(posLabel)]) roc_auc = roc_auc_score(y_test, y_score) roc_data_dict = { "y_score": y_score, "y_test": y_test, "positive_label_probs": positive_label_probs, "y_prob": y_prob, "positive_label": posLabel } roc_dataframe = pd.DataFrame({ "y_score": y_score, "y_test": y_test, "positive_label_probs": positive_label_probs }) #roc_dataframe.to_csv("binary_roc_data.csv") fpr, tpr, thresholds = roc_curve(y_test, positive_label_probs, pos_label=posLabel) roc_df = pd.DataFrame({ "FPR": fpr, "TPR": tpr, "thresholds": thresholds }) roc_df["tpr-fpr"] = roc_df["TPR"] - roc_df["FPR"] optimal_index = np.argmax(np.array(roc_df["tpr-fpr"])) fpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "FPR"] tpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "TPR"] rounded_roc_df = roc_df.round({'FPR': 2, 'TPR': 4}) unique_fpr = rounded_roc_df["FPR"].unique() final_roc_df = rounded_roc_df.groupby("FPR", as_index=False)[["TPR" ]].mean() endgame_roc_df = final_roc_df.round({'FPR': 2, 'TPR': 3}) elif levels > 2: positive_label_probs = [] for val in y_prob: positive_label_probs.append(val[int(posLabel)]) y_test_roc_multi = [] for val in y_test: if val != posLabel: val = posLabel + 1 y_test_roc_multi.append(val) else: y_test_roc_multi.append(val) y_score_roc_multi = [] for val in y_score: if val != posLabel: val = posLabel + 1 y_score_roc_multi.append(val) else: y_score_roc_multi.append(val) roc_auc = roc_auc_score(y_test_roc_multi, y_score_roc_multi) fpr, tpr, thresholds = roc_curve(y_test_roc_multi, positive_label_probs, pos_label=posLabel) roc_df = pd.DataFrame({ 
"FPR": fpr, "TPR": tpr, "thresholds": thresholds }) roc_df["tpr-fpr"] = roc_df["TPR"] - roc_df["FPR"] optimal_index = np.argmax(np.array(roc_df["tpr-fpr"])) fpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "FPR"] tpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "TPR"] rounded_roc_df = roc_df.round({'FPR': 2, 'TPR': 4}) unique_fpr = rounded_roc_df["FPR"].unique() final_roc_df = rounded_roc_df.groupby("FPR", as_index=False)[["TPR" ]].mean() endgame_roc_df = final_roc_df.round({'FPR': 2, 'TPR': 3}) # Calculating prediction_split val_cnts = prediction.groupBy('label').count() val_cnts = map(lambda row: row.asDict(), val_cnts.collect()) prediction_split = {} total_nos = prediction.select('label').count() for item in val_cnts: print(labelMapping) classname = labelMapping[item['label']] prediction_split[classname] = round( item['count'] * 100 / float(total_nos), 2) if not algoSetting.is_hyperparameter_tuning_enabled(): modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1" modelFilepathArr = model_filepath.split("/")[:-1] modelFilepathArr.append(modelName) bestModel.save("/".join(modelFilepathArr)) runtime = round((time.time() - st_global), 2) try: print(pmml_filepath) pmmlBuilder = PMMLBuilder(self._spark, trainingData, bestModel).putOption( clf, 'compact', True) pmmlBuilder.buildFile(pmml_filepath) pmmlfile = open(pmml_filepath, "r") pmmlText = pmmlfile.read() pmmlfile.close() self._result_setter.update_pmml_object({self._slug: pmmlText}) except Exception as e: print("PMML failed...", str(e)) pass cat_cols = list(set(categorical_columns) - {result_column}) self._model_summary = MLModelSummary() self._model_summary.set_algorithm_name("Naive Bayes") self._model_summary.set_algorithm_display_name("Naive Bayes") self._model_summary.set_slug(self._slug) self._model_summary.set_training_time(runtime) self._model_summary.set_confusion_matrix(confusion_matrix) # self._model_summary.set_feature_importance(objs["feature_importance"]) self._model_summary.set_feature_list(objs["featureList"]) self._model_summary.set_model_accuracy(accuracy) self._model_summary.set_training_time(round((time.time() - st), 2)) self._model_summary.set_precision_recall_stats([precision, recall]) self._model_summary.set_model_precision(precision) self._model_summary.set_model_recall(recall) self._model_summary.set_model_F1_score(f1_score) self._model_summary.set_model_log_loss(logLoss) self._model_summary.set_gain_lift_KS_data(gain_lift_KS_dataframe) self._model_summary.set_AUC_score(roc_auc) self._model_summary.set_target_variable(result_column) self._model_summary.set_prediction_split(prediction_split) self._model_summary.set_validation_method("KFold") self._model_summary.set_level_map_dict(objs["labelMapping"]) # self._model_summary.set_model_features(list(set(x_train.columns)-set([result_column]))) self._model_summary.set_model_features(objs["featureList"]) self._model_summary.set_level_counts( self._metaParser.get_unique_level_dict( list(set(categorical_columns)) + [result_column])) #self._model_summary.set_num_trees(objs['trained_model'].getNumTrees) self._model_summary.set_num_rules(300) self._model_summary.set_target_level(self._targetLevel) if not algoSetting.is_hyperparameter_tuning_enabled(): modelDropDownObj = { "name": self._model_summary.get_algorithm_name(), "evaluationMetricValue": accuracy, "evaluationMetricName": "accuracy", "slug": self._model_summary.get_slug(), "Model Id": modelName } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": 
self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } else: modelDropDownObj = { "name": self._model_summary.get_algorithm_name(), "evaluationMetricValue": accuracy, "evaluationMetricName": "accuracy", "slug": self._model_summary.get_slug(), "Model Id": resultArray[0]["Model Id"] } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } self._model_management = MLModelSummary() print(modelmanagement_) self._model_management.set_job_type( self._dataframe_context.get_job_name()) #Project name self._model_management.set_training_status( data="completed") # training status self._model_management.set_target_level( self._targetLevel) # target column value self._model_management.set_training_time(runtime) # run time self._model_management.set_model_accuracy(round(metrics.accuracy, 2)) # self._model_management.set_model_accuracy(round(metrics.accuracy_score(objs["actual"], objs["predicted"]),2))#accuracy self._model_management.set_algorithm_name( "NaiveBayes") #algorithm name self._model_management.set_validation_method( str(validationDict["displayName"]) + "(" + str(validationDict["value"]) + ")") #validation method self._model_management.set_target_variable( result_column) #target column name self._model_management.set_creation_date(data=str( datetime.now().strftime('%b %d ,%Y %H:%M '))) #creation date self._model_management.set_datasetName(self._datasetName) self._model_management.set_model_type(data='classification') self._model_management.set_var_smoothing( data=int(modelmanagement_['smoothing'])) # self._model_management.set_no_of_independent_variables(df) #no of independent varables modelManagementSummaryJson = [ ["Project Name", self._model_management.get_job_type()], ["Algorithm", self._model_management.get_algorithm_name()], ["Training Status", self._model_management.get_training_status()], ["Accuracy", self._model_management.get_model_accuracy()], ["RunTime", self._model_management.get_training_time()], #["Owner",None], ["Created On", self._model_management.get_creation_date()] ] modelManagementModelSettingsJson = [ ["Training Dataset", self._model_management.get_datasetName()], ["Target Column", self._model_management.get_target_variable()], ["Target Column Value", self._model_management.get_target_level()], ["Algorithm", self._model_management.get_algorithm_name()], [ "Model Validation", self._model_management.get_validation_method() ], ["Model Type", self._model_management.get_model_type()], ["Smoothing", self._model_management.get_var_smoothing()], #,["priors",self._model_management.get_priors()] #,["var_smoothing",self._model_management.get_var_smoothing()] ] nbOverviewCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_card_overview( self._model_management, modelManagementSummaryJson, modelManagementModelSettingsJson) ] nbPerformanceCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_cards( self._model_summary, endgame_roc_df) ] nbDeploymentCards = [ 
json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_deploy_empty_card() ] nbCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_summary_cards(self._model_summary) ] NB_Overview_Node = NarrativesTree() NB_Overview_Node.set_name("Overview") NB_Performance_Node = NarrativesTree() NB_Performance_Node.set_name("Performance") NB_Deployment_Node = NarrativesTree() NB_Deployment_Node.set_name("Deployment") for card in nbOverviewCards: NB_Overview_Node.add_a_card(card) for card in nbPerformanceCards: NB_Performance_Node.add_a_card(card) for card in nbDeploymentCards: NB_Deployment_Node.add_a_card(card) for card in nbCards: self._prediction_narrative.add_a_card(card) self._result_setter.set_model_summary({ "naivebayes": json.loads( CommonUtils.convert_python_object_to_json(self._model_summary)) }) self._result_setter.set_naive_bayes_model_summary(modelSummaryJson) self._result_setter.set_nb_cards(nbCards) self._result_setter.set_nb_nodes( [NB_Overview_Node, NB_Performance_Node, NB_Deployment_Node]) self._result_setter.set_nb_fail_card({ "Algorithm_Name": "Naive Bayes", "success": "True" }) CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "completion", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") print("\n\n") def Predict(self): self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight( ) self._scriptStages = { "initialization": { "summary": "Initialized the Naive Bayes Scripts", "weight": 2 }, "prediction": { "summary": "Spark ML Naive Bayes Model Prediction Finished", "weight": 2 }, "frequency": { "summary": "descriptive analysis finished", "weight": 2 }, "chisquare": { "summary": "chi Square analysis finished", "weight": 4 }, "completion": { "summary": "all analysis finished", "weight": 4 }, } self._completionStatus += self._scriptWeightDict[self._analysisName][ "total"] * self._scriptStages["initialization"]["weight"] / 10 progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\ "initialization",\ "info",\ self._scriptStages["initialization"]["summary"],\ self._completionStatus,\ self._completionStatus) CommonUtils.save_progress_message(self._messageURL, progressMessage) self._dataframe_context.update_completion_status( self._completionStatus) SQLctx = SQLContext(sparkContext=self._spark.sparkContext, sparkSession=self._spark) dataSanity = True level_counts_train = self._dataframe_context.get_level_count_dict() categorical_columns = self._dataframe_helper.get_string_columns() numerical_columns = self._dataframe_helper.get_numeric_columns() time_dimension_columns = self._dataframe_helper.get_timestamp_columns() result_column = self._dataframe_context.get_result_column() categorical_columns = [ x for x in categorical_columns if x != result_column ] level_counts_score = CommonUtils.get_level_count_dict( self._data_frame, categorical_columns, self._dataframe_context.get_column_separator(), output_type="dict", dataType="spark") for key in level_counts_train: if key in level_counts_score: if level_counts_train[key] != level_counts_score[key]: dataSanity = False else: dataSanity = False test_data_path = self._dataframe_context.get_input_file() score_data_path = self._dataframe_context.get_score_path( ) + "/data.csv" trained_model_path = self._dataframe_context.get_model_path() trained_model_path = "/".join( 
trained_model_path.split("/")[:-1] ) + "/" + self._slug + "/" + self._dataframe_context.get_model_for_scoring( ) # score_summary_path = self._dataframe_context.get_score_path()+"/Summary/summary.json" pipelineModel = MLUtils.load_pipeline(trained_model_path) df = self._data_frame transformed = pipelineModel.transform(df) label_indexer_dict = MLUtils.read_string_indexer_mapping( trained_model_path, SQLctx) prediction_to_levels = udf(lambda x: label_indexer_dict[x], StringType()) transformed = transformed.withColumn( result_column, prediction_to_levels(transformed.prediction)) if "probability" in transformed.columns: probability_dataframe = transformed.select( [result_column, "probability"]).toPandas() probability_dataframe = probability_dataframe.rename( index=str, columns={result_column: "predicted_class"}) probability_dataframe[ "predicted_probability"] = probability_dataframe[ "probability"].apply(lambda x: max(x)) self._score_summary[ "prediction_split"] = MLUtils.calculate_scored_probability_stats( probability_dataframe) self._score_summary["result_column"] = result_column scored_dataframe = transformed.select( categorical_columns + time_dimension_columns + numerical_columns + [result_column, "probability"]).toPandas() scored_dataframe['predicted_probability'] = probability_dataframe[ "predicted_probability"].values # scored_dataframe = scored_dataframe.rename(index=str, columns={"predicted_probability": "probability"}) else: self._score_summary["prediction_split"] = [] self._score_summary["result_column"] = result_column scored_dataframe = transformed.select(categorical_columns + time_dimension_columns + numerical_columns + [result_column]).toPandas() labelMappingDict = self._dataframe_context.get_label_map() if score_data_path.startswith("file"): score_data_path = score_data_path[7:] scored_dataframe.to_csv(score_data_path, header=True, index=False) uidCol = self._dataframe_context.get_uid_column() if uidCol == None: uidCols = self._metaParser.get_suggested_uid_columns() if len(uidCols) > 0: uidCol = uidCols[0] uidTableData = [] predictedClasses = list(scored_dataframe[result_column].unique()) if uidCol: if uidCol in df.columns: for level in predictedClasses: levelDf = scored_dataframe[scored_dataframe[result_column] == level] levelDf = levelDf[[ uidCol, "predicted_probability", result_column ]] levelDf.sort_values(by="predicted_probability", ascending=False, inplace=True) levelDf["predicted_probability"] = levelDf[ "predicted_probability"].apply( lambda x: humanize.apnumber(x * 100) + "%" if x * 100 >= 10 else str(int(x * 100)) + "%") uidTableData.append(levelDf[:5]) uidTableData = pd.concat(uidTableData) uidTableData = [list(arr) for arr in list(uidTableData.values)] uidTableData = [[uidCol, "Probability", result_column] ] + uidTableData uidTable = TableData() uidTable.set_table_width(25) uidTable.set_table_data(uidTableData) uidTable.set_table_type("normalHideColumn") self._result_setter.set_unique_identifier_table( json.loads( CommonUtils.convert_python_object_to_json(uidTable))) self._completionStatus += self._scriptWeightDict[self._analysisName][ "total"] * self._scriptStages["prediction"]["weight"] / 10 progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\ "prediction",\ "info",\ self._scriptStages["prediction"]["summary"],\ self._completionStatus,\ self._completionStatus) CommonUtils.save_progress_message(self._messageURL, progressMessage) self._dataframe_context.update_completion_status( self._completionStatus) print("STARTING DIMENSION ANALYSIS 
...") columns_to_keep = [] columns_to_drop = [] columns_to_keep = self._dataframe_context.get_score_consider_columns() if len(columns_to_keep) > 0: columns_to_drop = list(set(df.columns) - set(columns_to_keep)) else: columns_to_drop += ["predicted_probability"] scored_df = transformed.select(categorical_columns + time_dimension_columns + numerical_columns + [result_column]) columns_to_drop = [ x for x in columns_to_drop if x in scored_df.columns ] modified_df = scored_df.select( [x for x in scored_df.columns if x not in columns_to_drop]) resultColLevelCount = dict( modified_df.groupby(result_column).count().collect()) self._metaParser.update_column_dict( result_column, { "LevelCount": resultColLevelCount, "numberOfUniqueValues": len(resultColLevelCount.keys()) }) self._dataframe_context.set_story_on_scored_data(True) self._dataframe_context.update_consider_columns(columns_to_keep) df_helper = DataFrameHelper(modified_df, self._dataframe_context, self._metaParser) df_helper.set_params() spark_scored_df = df_helper.get_data_frame() if len(predictedClasses) >= 2: try: fs = time.time() df_decision_tree_obj = DecisionTrees( spark_scored_df, df_helper, self._dataframe_context, self._spark, self._metaParser, scriptWeight=self._scriptWeightDict, analysisName=self._analysisName).test_all( dimension_columns=[result_column]) narratives_obj = CommonUtils.as_dict( DecisionTreeNarrative(result_column, df_decision_tree_obj, self._dataframe_helper, self._dataframe_context, self._metaParser, self._result_setter, story_narrative=None, analysisName=self._analysisName, scriptWeight=self._scriptWeightDict)) print(narratives_obj) except Exception as e: print("DecisionTree Analysis Failed ", str(e)) else: data_dict = { "npred": len(predictedClasses), "nactual": len(labelMappingDict.values()) } if data_dict["nactual"] > 2: levelCountDict[predictedClasses[0]] = resultColLevelCount[ predictedClasses[0]] levelCountDict["Others"] = sum([ v for k, v in resultColLevelCount.items() if k != predictedClasses[0] ]) else: levelCountDict = resultColLevelCount otherClass = list( set(labelMappingDict.values()) - set(predictedClasses))[0] levelCountDict[otherClass] = 0 print(levelCountDict) total = float( sum([x for x in levelCountDict.values() if x != None])) levelCountTuple = [({ "name": k, "count": v, "percentage": humanize.apnumber(v * 100 / total) + "%" }) for k, v in levelCountDict.items() if v != None] levelCountTuple = sorted(levelCountTuple, key=lambda x: x["count"], reverse=True) data_dict["blockSplitter"] = "|~NEWBLOCK~|" data_dict["targetcol"] = result_column data_dict["nlevel"] = len(levelCountDict.keys()) data_dict["topLevel"] = levelCountTuple[0] data_dict["secondLevel"] = levelCountTuple[1] maincardSummary = NarrativesUtils.get_template_output( "/apps/", 'scorewithoutdtree.html', data_dict) main_card = NormalCard() main_card_data = [] main_card_narrative = NarrativesUtils.block_splitter( maincardSummary, "|~NEWBLOCK~|") main_card_data += main_card_narrative chartData = NormalChartData([levelCountDict]).get_data() chartJson = ChartJson(data=chartData) chartJson.set_title(result_column) chartJson.set_chart_type("donut") mainCardChart = C3ChartData(data=chartJson) mainCardChart.set_width_percent(33) main_card_data.append(mainCardChart) uidTable = self._result_setter.get_unique_identifier_table() if uidTable != None: main_card_data.append(uidTable) main_card.set_card_data(main_card_data) main_card.set_card_name( "Predicting Key Drivers of {}".format(result_column)) 
self._result_setter.set_score_dtree_cards([main_card], {})
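# Illustrative sketch: NaiveBayesPysparkScript.Train() earlier in this class assembles
# features, indexes the label, fits NaiveBayes inside a tuning wrapper, and reads
# metrics off the best model. build_nb_tvs below is a hypothetical, condensed version
# of that flow; the VectorAssembler stands in for MLUtils.create_pyspark_ml_pipeline,
# the column names are assumptions, and the smoothing grid simply reuses the values
# seen in the autoML branch above.
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

def build_nb_tvs(numeric_feature_cols, result_column, train_ratio=0.8):
    # Assemble numeric features into a vector, index the string target into
    # "label", and tune NaiveBayes smoothing with a train/validation split,
    # mirroring the trainAndtest branch of Train().
    assembler = VectorAssembler(inputCols=numeric_feature_cols, outputCol="features")
    label_indexer = StringIndexer(inputCol=result_column, outputCol="label")
    clf = NaiveBayes(featuresCol="features", labelCol="label")
    estimator = Pipeline(stages=[assembler, label_indexer, clf])
    param_grid = ParamGridBuilder().addGrid(clf.smoothing, [1.0, 0.2]).build()
    return TrainValidationSplit(
        estimator=estimator,
        estimatorParamMaps=param_grid,
        evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
        trainRatio=train_ratio,
    )
# Usage sketch: bestModel = build_nb_tvs(numeric_cols, result_column).fit(trainingData).bestModel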
def Train(self): st_global = time.time() CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") algosToRun = self._dataframe_context.get_algorithms_to_run() algoSetting = [ x for x in algosToRun if x.get_algorithm_slug() == self._slug ][0] categorical_columns = self._dataframe_helper.get_string_columns() uid_col = self._dataframe_context.get_uid_column() if self._metaParser.check_column_isin_ignored_suggestion(uid_col): categorical_columns = list(set(categorical_columns) - {uid_col}) allDateCols = self._dataframe_context.get_date_columns() categorical_columns = list(set(categorical_columns) - set(allDateCols)) numerical_columns = self._dataframe_helper.get_numeric_columns() result_column = self._dataframe_context.get_result_column() categorical_columns = [ x for x in categorical_columns if x != result_column ] appType = self._dataframe_context.get_app_type() model_path = self._dataframe_context.get_model_path() if model_path.startswith("file"): model_path = model_path[7:] validationDict = self._dataframe_context.get_validation_dict() print("model_path", model_path) pipeline_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/pipeline/" model_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/model" pmml_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/modelPmml" df = self._data_frame levels = df.select(result_column).distinct().count() appType = self._dataframe_context.get_app_type() model_filepath = model_path + "/" + self._slug + "/model" pmml_filepath = str(model_path) + "/" + str( self._slug) + "/traindeModel.pmml" CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "training", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") st = time.time() pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns, categorical_columns, result_column) trainingData, validationData = MLUtils.get_training_and_validation_data( df, result_column, 0.8) # indexed labelIndexer = StringIndexer(inputCol=result_column, outputCol="label") # OriginalTargetconverter = IndexToString(inputCol="label", outputCol="originalTargetColumn") # Label Mapping and Inverse labelIdx = labelIndexer.fit(trainingData) labelMapping = {k: v for k, v in enumerate(labelIdx.labels)} inverseLabelMapping = { v: float(k) for k, v in enumerate(labelIdx.labels) } if self._dataframe_context.get_trainerMode() == "autoML": automl_enable = True else: automl_enable = False clf = NaiveBayes() if not algoSetting.is_hyperparameter_tuning_enabled(): algoParams = algoSetting.get_params_dict() else: algoParams = algoSetting.get_params_dict_hyperparameter() print("=" * 100) print(algoParams) print("=" * 100) clfParams = [prm.name for prm in clf.params] algoParams = { getattr(clf, k): v if isinstance(v, list) else [v] for k, v in algoParams.items() if k in clfParams } #print("="*100) #print("ALGOPARAMS - ",algoParams) #print("="*100) paramGrid = ParamGridBuilder() # if not algoSetting.is_hyperparameter_tuning_enabled(): # for k,v in algoParams.items(): # if v == [None] * len(v): # continue # if k.name == 'thresholds': # paramGrid = paramGrid.addGrid(k,v[0]) # else: # paramGrid = paramGrid.addGrid(k,v) # paramGrid = paramGrid.build() # if not algoSetting.is_hyperparameter_tuning_enabled(): for k, v in algoParams.items(): 
print(k, v) if v == [None] * len(v): continue paramGrid = paramGrid.addGrid(k, v) paramGrid = paramGrid.build() # else: # for k,v in algoParams.items(): # print k.name, v # if v[0] == [None] * len(v[0]): # continue # paramGrid = paramGrid.addGrid(k,v[0]) # paramGrid = paramGrid.build() #print("="*143) #print("PARAMGRID - ", paramGrid) #print("="*143) if len(paramGrid) > 1: hyperParamInitParam = algoSetting.get_hyperparameter_params() evaluationMetricDict = { "name": hyperParamInitParam["evaluationMetric"] } evaluationMetricDict[ "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] else: evaluationMetricDict = { "name": GLOBALSETTINGS.CLASSIFICATION_MODEL_EVALUATION_METRIC } evaluationMetricDict[ "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] self._result_setter.set_hyper_parameter_results(self._slug, None) if validationDict["name"] == "kFold": numFold = int(validationDict["value"]) estimator = Pipeline(stages=[pipeline, labelIndexer, clf]) if algoSetting.is_hyperparameter_tuning_enabled(): modelFilepath = "/".join(model_filepath.split("/")[:-1]) pySparkHyperParameterResultObj = PySparkGridSearchResult( estimator, paramGrid, appType, modelFilepath, levels, evaluationMetricDict, trainingData, validationData, numFold, self._targetLevel, labelMapping, inverseLabelMapping, df) resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models( ) self._result_setter.set_hyper_parameter_results( self._slug, resultArray) self._result_setter.set_metadata_parallel_coordinates( self._slug, { "ignoreList": pySparkHyperParameterResultObj.get_ignore_list(), "hideColumns": pySparkHyperParameterResultObj.get_hide_columns(), "metricColName": pySparkHyperParameterResultObj. get_comparison_metric_colname(), "columnOrder": pySparkHyperParameterResultObj.get_keep_columns() }) bestModel = pySparkHyperParameterResultObj.getBestModel() prediction = pySparkHyperParameterResultObj.getBestPrediction() else: if automl_enable: paramGrid = (ParamGridBuilder().addGrid( clf.smoothing, [1.0, 0.2]).build()) crossval = CrossValidator( estimator=estimator, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator() if levels == 2 else MulticlassClassificationEvaluator(), numFolds=3 if numFold is None else numFold) # use 3+ folds in practice cvnb = crossval.fit(trainingData) prediction = cvnb.transform(validationData) bestModel = cvnb.bestModel else: train_test_ratio = float( self._dataframe_context.get_train_test_split()) estimator = Pipeline(stages=[pipeline, labelIndexer, clf]) if algoSetting.is_hyperparameter_tuning_enabled(): modelFilepath = "/".join(model_filepath.split("/")[:-1]) pySparkHyperParameterResultObj = PySparkTrainTestResult( estimator, paramGrid, appType, modelFilepath, levels, evaluationMetricDict, trainingData, validationData, train_test_ratio, self._targetLevel, labelMapping, inverseLabelMapping, df) resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models( ) self._result_setter.set_hyper_parameter_results( self._slug, resultArray) self._result_setter.set_metadata_parallel_coordinates( self._slug, { "ignoreList": pySparkHyperParameterResultObj.get_ignore_list(), "hideColumns": pySparkHyperParameterResultObj.get_hide_columns(), "metricColName": pySparkHyperParameterResultObj. 
get_comparison_metric_colname(), "columnOrder": pySparkHyperParameterResultObj.get_keep_columns() }) bestModel = pySparkHyperParameterResultObj.getBestModel() prediction = pySparkHyperParameterResultObj.getBestPrediction() else: tvs = TrainValidationSplit( estimator=estimator, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator() if levels == 2 else MulticlassClassificationEvaluator(), trainRatio=train_test_ratio) tvspnb = tvs.fit(trainingData) prediction = tvspnb.transform(validationData) bestModel = tvspnb.bestModel modelmanagement_ = { param[0].name: param[1] for param in bestModel.stages[2].extractParamMap().items() } MLUtils.save_pipeline_or_model(bestModel, model_filepath) predsAndLabels = prediction.select(['prediction', 'label']).rdd.map(tuple) # label_classes = prediction.select("label").distinct().collect() # label_classes = prediction.agg((F.collect_set('label').alias('label'))).first().asDict()['label'] #results = transformed.select(["prediction","label"]) # if len(label_classes) > 2: # metrics = MulticlassMetrics(predsAndLabels) # accuracy of the model # else: # metrics = BinaryClassificationMetrics(predsAndLabels) posLabel = inverseLabelMapping[self._targetLevel] metrics = MulticlassMetrics(predsAndLabels) trainingTime = time.time() - st f1_score = metrics.fMeasure(inverseLabelMapping[self._targetLevel], 1.0) precision = metrics.precision(inverseLabelMapping[self._targetLevel]) recall = metrics.recall(inverseLabelMapping[self._targetLevel]) accuracy = metrics.accuracy print(f1_score, precision, recall, accuracy) #gain chart implementation def cal_prob_eval(x): if len(x) == 1: if x == posLabel: return (float(x[1])) else: return (float(1 - x[1])) else: return (float(x[int(posLabel)])) column_name = 'probability' def y_prob_for_eval_udf(): return udf(lambda x: cal_prob_eval(x)) prediction = prediction.withColumn( "y_prob_for_eval", y_prob_for_eval_udf()(col(column_name))) try: pys_df = prediction.select( ['y_prob_for_eval', 'prediction', 'label']) gain_lift_ks_obj = GainLiftKS(pys_df, 'y_prob_for_eval', 'prediction', 'label', posLabel, self._spark) gain_lift_KS_dataframe = gain_lift_ks_obj.Run().toPandas() except: try: temp_df = pys_df.toPandas() gain_lift_ks_obj = GainLiftKS(temp_df, 'y_prob_for_eval', 'prediction', 'label', posLabel, self._spark) gain_lift_KS_dataframe = gain_lift_ks_obj.Rank_Ordering() except: print("gain chart failed") gain_lift_KS_dataframe = None #feature_importance = MLUtils.calculate_sparkml_feature_importance(df, bestModel.stages[-1], categorical_columns, numerical_columns) act_list = prediction.select('label').collect() actual = [int(row.label) for row in act_list] pred_list = prediction.select('prediction').collect() predicted = [int(row.prediction) for row in pred_list] prob_list = prediction.select('probability').collect() probability = [list(row.probability) for row in prob_list] # objs = {"trained_model":bestModel,"actual":prediction.select('label'),"predicted":prediction.select('prediction'), # "probability":prediction.select('probability'),"feature_importance":None, # "featureList":list(categorical_columns) + list(numerical_columns),"labelMapping":labelMapping} objs = { "trained_model": bestModel, "actual": actual, "predicted": predicted, "probability": probability, "feature_importance": None, "featureList": list(categorical_columns) + list(numerical_columns), "labelMapping": labelMapping } conf_mat_ar = metrics.confusionMatrix().toArray() print(conf_mat_ar) confusion_matrix = {} for i in range(len(conf_mat_ar)):
confusion_matrix[labelMapping[i]] = {} for j, val in enumerate(conf_mat_ar[i]): confusion_matrix[labelMapping[i]][labelMapping[j]] = val print(confusion_matrix) # accuracy of the model '''ROC CURVE IMPLEMENTATION''' y_prob = probability y_score = predicted y_test = actual logLoss = log_loss(y_test, y_prob) if levels <= 2: positive_label_probs = [] for val in y_prob: positive_label_probs.append(val[int(posLabel)]) roc_auc = roc_auc_score(y_test, y_score) roc_data_dict = { "y_score": y_score, "y_test": y_test, "positive_label_probs": positive_label_probs, "y_prob": y_prob, "positive_label": posLabel } roc_dataframe = pd.DataFrame({ "y_score": y_score, "y_test": y_test, "positive_label_probs": positive_label_probs }) #roc_dataframe.to_csv("binary_roc_data.csv") fpr, tpr, thresholds = roc_curve(y_test, positive_label_probs, pos_label=posLabel) roc_df = pd.DataFrame({ "FPR": fpr, "TPR": tpr, "thresholds": thresholds }) roc_df["tpr-fpr"] = roc_df["TPR"] - roc_df["FPR"] optimal_index = np.argmax(np.array(roc_df["tpr-fpr"])) fpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "FPR"] tpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "TPR"] rounded_roc_df = roc_df.round({'FPR': 2, 'TPR': 4}) unique_fpr = rounded_roc_df["FPR"].unique() final_roc_df = rounded_roc_df.groupby("FPR", as_index=False)[["TPR" ]].mean() endgame_roc_df = final_roc_df.round({'FPR': 2, 'TPR': 3}) elif levels > 2: positive_label_probs = [] for val in y_prob: positive_label_probs.append(val[int(posLabel)]) y_test_roc_multi = [] for val in y_test: if val != posLabel: val = posLabel + 1 y_test_roc_multi.append(val) else: y_test_roc_multi.append(val) y_score_roc_multi = [] for val in y_score: if val != posLabel: val = posLabel + 1 y_score_roc_multi.append(val) else: y_score_roc_multi.append(val) roc_auc = roc_auc_score(y_test_roc_multi, y_score_roc_multi) fpr, tpr, thresholds = roc_curve(y_test_roc_multi, positive_label_probs, pos_label=posLabel) roc_df = pd.DataFrame({ "FPR": fpr, "TPR": tpr, "thresholds": thresholds }) roc_df["tpr-fpr"] = roc_df["TPR"] - roc_df["FPR"] optimal_index = np.argmax(np.array(roc_df["tpr-fpr"])) fpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "FPR"] tpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "TPR"] rounded_roc_df = roc_df.round({'FPR': 2, 'TPR': 4}) unique_fpr = rounded_roc_df["FPR"].unique() final_roc_df = rounded_roc_df.groupby("FPR", as_index=False)[["TPR" ]].mean() endgame_roc_df = final_roc_df.round({'FPR': 2, 'TPR': 3}) # Calculating prediction_split val_cnts = prediction.groupBy('label').count() val_cnts = map(lambda row: row.asDict(), val_cnts.collect()) prediction_split = {} total_nos = prediction.select('label').count() for item in val_cnts: print(labelMapping) classname = labelMapping[item['label']] prediction_split[classname] = round( item['count'] * 100 / float(total_nos), 2) if not algoSetting.is_hyperparameter_tuning_enabled(): modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1" modelFilepathArr = model_filepath.split("/")[:-1] modelFilepathArr.append(modelName) bestModel.save("/".join(modelFilepathArr)) runtime = round((time.time() - st_global), 2) try: print(pmml_filepath) pmmlBuilder = PMMLBuilder(self._spark, trainingData, bestModel).putOption( clf, 'compact', True) pmmlBuilder.buildFile(pmml_filepath) pmmlfile = open(pmml_filepath, "r") pmmlText = pmmlfile.read() pmmlfile.close() self._result_setter.update_pmml_object({self._slug: pmmlText}) except Exception as e: print("PMML failed...", str(e)) pass cat_cols = 
list(set(categorical_columns) - {result_column}) self._model_summary = MLModelSummary() self._model_summary.set_algorithm_name("Naive Bayes") self._model_summary.set_algorithm_display_name("Naive Bayes") self._model_summary.set_slug(self._slug) self._model_summary.set_training_time(runtime) self._model_summary.set_confusion_matrix(confusion_matrix) # self._model_summary.set_feature_importance(objs["feature_importance"]) self._model_summary.set_feature_list(objs["featureList"]) self._model_summary.set_model_accuracy(accuracy) self._model_summary.set_training_time(round((time.time() - st), 2)) self._model_summary.set_precision_recall_stats([precision, recall]) self._model_summary.set_model_precision(precision) self._model_summary.set_model_recall(recall) self._model_summary.set_model_F1_score(f1_score) self._model_summary.set_model_log_loss(logLoss) self._model_summary.set_gain_lift_KS_data(gain_lift_KS_dataframe) self._model_summary.set_AUC_score(roc_auc) self._model_summary.set_target_variable(result_column) self._model_summary.set_prediction_split(prediction_split) self._model_summary.set_validation_method("KFold") self._model_summary.set_level_map_dict(objs["labelMapping"]) # self._model_summary.set_model_features(list(set(x_train.columns)-set([result_column]))) self._model_summary.set_model_features(objs["featureList"]) self._model_summary.set_level_counts( self._metaParser.get_unique_level_dict( list(set(categorical_columns)) + [result_column])) #self._model_summary.set_num_trees(objs['trained_model'].getNumTrees) self._model_summary.set_num_rules(300) self._model_summary.set_target_level(self._targetLevel) if not algoSetting.is_hyperparameter_tuning_enabled(): modelDropDownObj = { "name": self._model_summary.get_algorithm_name(), "evaluationMetricValue": accuracy, "evaluationMetricName": "accuracy", "slug": self._model_summary.get_slug(), "Model Id": modelName } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } else: modelDropDownObj = { "name": self._model_summary.get_algorithm_name(), "evaluationMetricValue": accuracy, "evaluationMetricName": "accuracy", "slug": self._model_summary.get_slug(), "Model Id": resultArray[0]["Model Id"] } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } self._model_management = MLModelSummary() print(modelmanagement_) self._model_management.set_job_type( self._dataframe_context.get_job_name()) #Project name self._model_management.set_training_status( data="completed") # training status self._model_management.set_target_level( self._targetLevel) # target column value self._model_management.set_training_time(runtime) # run time self._model_management.set_model_accuracy(round(metrics.accuracy, 2)) # self._model_management.set_model_accuracy(round(metrics.accuracy_score(objs["actual"], objs["predicted"]),2))#accuracy self._model_management.set_algorithm_name( "NaiveBayes") #algorithm name self._model_management.set_validation_method( str(validationDict["displayName"]) + "(" + str(validationDict["value"]) + ")") #validation method 
self._model_management.set_target_variable( result_column) #target column name self._model_management.set_creation_date(data=str( datetime.now().strftime('%b %d ,%Y %H:%M '))) #creation date self._model_management.set_datasetName(self._datasetName) self._model_management.set_model_type(data='classification') self._model_management.set_var_smoothing( data=int(modelmanagement_['smoothing'])) # self._model_management.set_no_of_independent_variables(df) #no of independent varables modelManagementSummaryJson = [ ["Project Name", self._model_management.get_job_type()], ["Algorithm", self._model_management.get_algorithm_name()], ["Training Status", self._model_management.get_training_status()], ["Accuracy", self._model_management.get_model_accuracy()], ["RunTime", self._model_management.get_training_time()], #["Owner",None], ["Created On", self._model_management.get_creation_date()] ] modelManagementModelSettingsJson = [ ["Training Dataset", self._model_management.get_datasetName()], ["Target Column", self._model_management.get_target_variable()], ["Target Column Value", self._model_management.get_target_level()], ["Algorithm", self._model_management.get_algorithm_name()], [ "Model Validation", self._model_management.get_validation_method() ], ["Model Type", self._model_management.get_model_type()], ["Smoothing", self._model_management.get_var_smoothing()], #,["priors",self._model_management.get_priors()] #,["var_smoothing",self._model_management.get_var_smoothing()] ] nbOverviewCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_card_overview( self._model_management, modelManagementSummaryJson, modelManagementModelSettingsJson) ] nbPerformanceCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_cards( self._model_summary, endgame_roc_df) ] nbDeploymentCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_deploy_empty_card() ] nbCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_summary_cards(self._model_summary) ] NB_Overview_Node = NarrativesTree() NB_Overview_Node.set_name("Overview") NB_Performance_Node = NarrativesTree() NB_Performance_Node.set_name("Performance") NB_Deployment_Node = NarrativesTree() NB_Deployment_Node.set_name("Deployment") for card in nbOverviewCards: NB_Overview_Node.add_a_card(card) for card in nbPerformanceCards: NB_Performance_Node.add_a_card(card) for card in nbDeploymentCards: NB_Deployment_Node.add_a_card(card) for card in nbCards: self._prediction_narrative.add_a_card(card) self._result_setter.set_model_summary({ "naivebayes": json.loads( CommonUtils.convert_python_object_to_json(self._model_summary)) }) self._result_setter.set_naive_bayes_model_summary(modelSummaryJson) self._result_setter.set_nb_cards(nbCards) self._result_setter.set_nb_nodes( [NB_Overview_Node, NB_Performance_Node, NB_Deployment_Node]) self._result_setter.set_nb_fail_card({ "Algorithm_Name": "Naive Bayes", "success": "True" }) CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "completion", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") print("\n\n")
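# NNPTRegressionScript trains a feed-forward regression network with PyTorch on the
# pandas train/test split prepared by the dataframe helper (dummy-encoded
# categoricals), reports MAPE bins, quantile summaries and the usual regression
# metrics, and saves the network as a .pt file for scoring. A minimal sketch of the
# TensorDataset / DataLoader training-loop pattern it follows; the layer sizes,
# random tensors, loss and optimizer below are illustrative stand-ins for the values
# taken from the training config:
#
#   import torch
#   import torch.nn as nn
#   import torch.utils.data as data_utils
#
#   net = nn.Sequential(nn.Linear(10, 32), nn.ReLU(), nn.Linear(32, 1))
#   loader = data_utils.DataLoader(
#       data_utils.TensorDataset(torch.randn(100, 10), torch.randn(100, 1)),
#       batch_size=16, shuffle=True)
#   optimizer = torch.optim.Adam(net.parameters())
#   criterion = nn.MSELoss()
#   for epoch in range(5):
#       for inputs, labels in loader:
#           optimizer.zero_grad()
#           loss = criterion(net(inputs), labels)
#           loss.backward()
#           optimizer.step()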
class NNPTRegressionScript(object): def __init__(self, data_frame, df_helper, df_context, spark, prediction_narrative, result_setter, meta_parser, mlEnvironment="sklearn"): self._metaParser = meta_parser self._prediction_narrative = prediction_narrative self._result_setter = result_setter self._data_frame = data_frame self._dataframe_helper = df_helper self._dataframe_context = df_context self._spark = spark self._model_summary = MLModelSummary() self._score_summary = {} self._slug = GLOBALSETTINGS.MODEL_SLUG_MAPPING[ "Neural Network (PyTorch)"] self._analysisName = self._slug self._dataframe_context.set_analysis_name(self._analysisName) self._mlEnv = mlEnvironment self._datasetName = CommonUtils.get_dataset_name( self._dataframe_context.CSV_FILE) self._completionStatus = self._dataframe_context.get_completion_status( ) print(self._completionStatus, "initial completion status") self._messageURL = self._dataframe_context.get_message_url() self._scriptWeightDict = self._dataframe_context.get_ml_model_training_weight( ) self._ignoreMsg = self._dataframe_context.get_message_ignore() self._scriptStages = { "initialization": { "summary": "Initialized The Neural Network (PyTorch) Scripts", "weight": 1 }, "training": { "summary": "Neural Network (PyTorch) Training Started", "weight": 2 }, "completion": { "summary": "Neural Network (PyTorch) Training Finished", "weight": 1 }, } def Train(self): st_global = time.time() CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") appType = self._dataframe_context.get_app_type() algosToRun = self._dataframe_context.get_algorithms_to_run() algoSetting = [ x for x in algosToRun if x.get_algorithm_slug() == self._slug ][0] categorical_columns = self._dataframe_helper.get_string_columns() uid_col = self._dataframe_context.get_uid_column() if self._metaParser.check_column_isin_ignored_suggestion(uid_col): categorical_columns = list(set(categorical_columns) - {uid_col}) allDateCols = self._dataframe_context.get_date_columns() categorical_columns = list(set(categorical_columns) - set(allDateCols)) print("CATEGORICAL COLS - ", categorical_columns) result_column = self._dataframe_context.get_result_column() numerical_columns = self._dataframe_helper.get_numeric_columns() numerical_columns = [ x for x in numerical_columns if x != result_column ] model_path = self._dataframe_context.get_model_path() if model_path.startswith("file"): model_path = model_path[7:] validationDict = self._dataframe_context.get_validation_dict() print("model_path", model_path) pipeline_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/pipeline/" model_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/model" pmml_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/modelPmml" df = self._data_frame if self._mlEnv == "spark": pass elif self._mlEnv == "sklearn": model_filepath = model_path + "/" + self._slug + "/model.pkl" x_train, x_test, y_train, y_test = self._dataframe_helper.get_train_test_data( ) x_train = MLUtils.create_dummy_columns( x_train, [x for x in categorical_columns if x != result_column]) x_test = MLUtils.create_dummy_columns( x_test, [x for x in categorical_columns if x != result_column]) x_test = MLUtils.fill_missing_columns(x_test, x_train.columns, result_column) print("=" * 150) print("X-Train Shape - ", x_train.shape) print("Y-Train Shape - ", 
y_train.shape) print("X-Test Shape - ", x_test.shape) print("Y-Test Shape - ", y_test.shape) print("~" * 50) print("X-Train dtype - ", type(x_train)) print("Y-Train dtype - ", type(y_train)) print("X-Test dtype - ", type(x_test)) print("Y-Test dtype - ", type(y_test)) print("~" * 50) CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "training", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") st = time.time() self._result_setter.set_hyper_parameter_results(self._slug, None) evaluationMetricDict = algoSetting.get_evaluvation_metric( Type="REGRESSION") evaluationMetricDict = { "name": GLOBALSETTINGS.REGRESSION_MODEL_EVALUATION_METRIC } evaluationMetricDict[ "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] x_train_tensored, y_train_tensored, x_test_tensored, y_test_tensored = PYTORCHUTILS.get_tensored_data( x_train, y_train, x_test, y_test) trainset = torch_data_utils.TensorDataset(x_train_tensored, y_train_tensored) testset = torch_data_utils.TensorDataset(x_test_tensored, y_test_tensored) nnptr_params = algoSetting.get_nnptr_params_dict()[0] layers_for_network = PYTORCHUTILS.get_layers_for_network_module( nnptr_params, task_type="REGRESSION", first_layer_units=x_train.shape[1]) # Use GPU if available device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") network = PyTorchNetwork(layers_for_network).to(device) network.eval() other_params_dict = PYTORCHUTILS.get_other_pytorch_params( nnptr_params, task_type="REGRESSION", network_params=network.parameters()) print("~" * 50) print("NNPTR-PARAMS - ", nnptr_params) print("~" * 50) print("OTHER-PARAMS-DICT - ", other_params_dict) print("~" * 50) print("NEURAL-NETWORK - ", network) print("~" * 50) criterion = other_params_dict["loss_criterion"] n_epochs = other_params_dict["number_of_epochs"] batch_size = other_params_dict["batch_size"] optimizer = other_params_dict["optimizer"] dataloader_params = { "batch_size": batch_size, "shuffle": True # "num_workers": } train_loader = torch_data_utils.DataLoader(trainset, **dataloader_params) test_loader = torch_data_utils.DataLoader(testset, **dataloader_params) ''' Training the network; Batchnormalization(num_features) should be equal to units_op for that layer in training config; else --> RuntimeError('running_mean should contain 100 elements not 200',) ''' for epoch in range(n_epochs): batchwise_losses = [] average_loss = 0.0 for i, (inputs, labels) in enumerate(train_loader): inputs = inputs.to(device) labels = labels.to(device) # Zero the parameter gradients optimizer.zero_grad() # Forward + backward + optimize outputs = network(inputs.float()) loss = criterion(outputs, labels.float()) loss.backward() optimizer.step() average_loss += loss.item() batchwise_losses.append(loss.item()) average_loss_per_epoch = old_div(average_loss, (i + 1)) print("+" * 80) print("EPOCH - ", epoch) print("BATCHWISE_LOSSES shape - ", len(batchwise_losses)) print("AVERAGE LOSS PER EPOCH - ", average_loss_per_epoch) print("+" * 80) trainingTime = time.time() - st bestEstimator = network outputs_x_test_tensored = network(x_test_tensored.float()) y_score_mid = outputs_x_test_tensored.tolist() y_score = [x[0] for x in y_score_mid] print("Y-SCORE - ", y_score) print("Y-SCORE length - ", len(y_score)) y_prob = None featureImportance = {} objs = { "trained_model": bestEstimator, "actual": y_test, "predicted": y_score, "probability": y_prob, 
"feature_importance": featureImportance, "featureList": list(x_train.columns), "labelMapping": {} } #featureImportance = objs["trained_model"].feature_importances_ #featuresArray = [(col_name, featureImportance[idx]) for idx, col_name in enumerate(x_train.columns)] featuresArray = [] if not algoSetting.is_hyperparameter_tuning_enabled(): modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1" modelFilepathArr = model_filepath.split("/")[:-1] modelFilepathArr.append(modelName + ".pt") torch.save(objs["trained_model"], "/".join(modelFilepathArr)) #joblib.dump(objs["trained_model"],"/".join(modelFilepathArr)) runtime = round((time.time() - st), 2) else: runtime = round((time.time() - hyper_st), 2) try: modelPmmlPipeline = PMMLPipeline([("pretrained-estimator", objs["trained_model"])]) modelPmmlPipeline.target_field = result_column modelPmmlPipeline.active_fields = np.array( [col for col in x_train.columns if col != result_column]) sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True) pmmlfile = open(pmml_filepath, "r") pmmlText = pmmlfile.read() pmmlfile.close() self._result_setter.update_pmml_object({self._slug: pmmlText}) except: pass metrics = {} metrics["r2"] = r2_score(y_test, y_score) metrics["neg_mean_squared_error"] = mean_squared_error( y_test, y_score) metrics["neg_mean_absolute_error"] = mean_absolute_error( y_test, y_score) metrics["RMSE"] = sqrt(metrics["neg_mean_squared_error"]) metrics["explained_variance_score"] = explained_variance_score( y_test, y_score) transformed = pd.DataFrame({ "prediction": y_score, result_column: y_test }) print("TRANSFORMED PREDICTION TYPE - ", type(transformed["prediction"])) print(transformed["prediction"]) print("TRANSFORMED RESULT COL TYPE - ", type(transformed[result_column])) print(transformed[result_column]) transformed["difference"] = transformed[ result_column] - transformed["prediction"] transformed["mape"] = old_div( np.abs(transformed["difference"]) * 100, transformed[result_column]) sampleData = None nrows = transformed.shape[0] if nrows > 100: sampleData = transformed.sample(n=100, random_state=420) else: sampleData = transformed print(sampleData.head()) if transformed["mape"].max() > 100: GLOBALSETTINGS.MAPEBINS.append(transformed["mape"].max()) mapeCountArr = list( pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS). value_counts().to_dict().items()) GLOBALSETTINGS.MAPEBINS.pop(5) else: mapeCountArr = list( pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS). 
value_counts().to_dict().items()) mapeStatsArr = [(str(idx), dictObj) for idx, dictObj in enumerate( sorted([{ "count": x[1], "splitRange": (x[0].left, x[0].right) } for x in mapeCountArr], key=lambda x: x["splitRange"][0]))] print(mapeStatsArr) print(mapeCountArr) predictionColSummary = transformed["prediction"].describe( ).to_dict() quantileBins = [ predictionColSummary["min"], predictionColSummary["25%"], predictionColSummary["50%"], predictionColSummary["75%"], predictionColSummary["max"] ] print(quantileBins) quantileBins = sorted(list(set(quantileBins))) transformed["quantileBinId"] = pd.cut(transformed["prediction"], quantileBins) quantileDf = transformed.groupby("quantileBinId").agg({ "prediction": [np.sum, np.mean, np.size] }).reset_index() quantileDf.columns = ["prediction", "sum", "mean", "count"] print(quantileDf) quantileArr = list(quantileDf.T.to_dict().items()) quantileSummaryArr = [(obj[0], { "splitRange": (obj[1]["prediction"].left, obj[1]["prediction"].right), "count": obj[1]["count"], "mean": obj[1]["mean"], "sum": obj[1]["sum"] }) for obj in quantileArr] print(quantileSummaryArr) runtime = round((time.time() - st_global), 2) self._model_summary.set_model_type("regression") self._model_summary.set_algorithm_name("Neural Network (PyTorch)") self._model_summary.set_algorithm_display_name( "Neural Network (PyTorch)") self._model_summary.set_slug(self._slug) self._model_summary.set_training_time(runtime) self._model_summary.set_training_time(trainingTime) self._model_summary.set_target_variable(result_column) self._model_summary.set_validation_method( validationDict["displayName"]) self._model_summary.set_model_evaluation_metrics(metrics) self._model_summary.set_model_params(nnptr_params) self._model_summary.set_quantile_summary(quantileSummaryArr) self._model_summary.set_mape_stats(mapeStatsArr) self._model_summary.set_sample_data(sampleData.to_dict()) self._model_summary.set_feature_importance(featuresArray) self._model_summary.set_feature_list(list(x_train.columns)) self._model_summary.set_model_mse( metrics["neg_mean_squared_error"]) self._model_summary.set_model_mae( metrics["neg_mean_absolute_error"]) self._model_summary.set_rmse(metrics["RMSE"]) self._model_summary.set_model_rsquared(metrics["r2"]) self._model_summary.set_model_exp_variance_score( metrics["explained_variance_score"]) try: pmml_filepath = str(model_path) + "/" + str( self._slug) + "/traindeModel.pmml" modelPmmlPipeline = PMMLPipeline([("pretrained-estimator", objs["trained_model"])]) modelPmmlPipeline.target_field = result_column modelPmmlPipeline.active_fields = np.array( [col for col in x_train.columns if col != result_column]) sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True) pmmlfile = open(pmml_filepath, "r") pmmlText = pmmlfile.read() pmmlfile.close() self._result_setter.update_pmml_object({self._slug: pmmlText}) except: pass if algoSetting.is_hyperparameter_tuning_enabled(): modelDropDownObj = { "name": self._model_summary.get_algorithm_name(), "evaluationMetricValue": metrics[evaluationMetricDict["name"]], "evaluationMetricName": evaluationMetricDict["name"], "slug": self._model_summary.get_slug(), "Model Id": modelName } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } else: modelDropDownObj = { "name": 
self._model_summary.get_algorithm_name(), "evaluationMetricValue": metrics[evaluationMetricDict["name"]], "evaluationMetricName": evaluationMetricDict["name"], "slug": self._model_summary.get_slug(), "Model Id": modelName } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } modelmanagement_ = nnptr_params self._model_management = MLModelSummary() if algoSetting.is_hyperparameter_tuning_enabled(): pass else: self._model_management.set_layer_info( data=modelmanagement_['hidden_layer_info']) self._model_management.set_loss_function( data=modelmanagement_['loss']) self._model_management.set_optimizer( data=modelmanagement_['optimizer']) self._model_management.set_batch_size( data=modelmanagement_['batch_size']) self._model_management.set_no_epochs( data=modelmanagement_['number_of_epochs']) # self._model_management.set_model_evaluation_metrics(data=modelmanagement_['metrics']) self._model_management.set_job_type( self._dataframe_context.get_job_name()) #Project name self._model_management.set_training_status( data="completed") # training status self._model_management.set_no_of_independent_variables( data=x_train) #no of independent variables self._model_management.set_training_time(runtime) # run time self._model_management.set_rmse(metrics["RMSE"]) self._model_management.set_algorithm_name( "Neural Network (PyTorch)") #algorithm name self._model_management.set_validation_method( str(validationDict["displayName"]) + "(" + str(validationDict["value"]) + ")") #validation method self._model_management.set_target_variable( result_column) #target column name self._model_management.set_creation_date(data=str( datetime.now().strftime('%b %d ,%Y %H:%M '))) #creation date self._model_management.set_datasetName(self._datasetName) modelManagementSummaryJson = [ ["Project Name", self._model_management.get_job_type()], ["Algorithm", self._model_management.get_algorithm_name()], ["Training Status", self._model_management.get_training_status()], ["RMSE", self._model_management.get_rmse()], ["RunTime", self._model_management.get_training_time()], #["Owner",None], ["Created On", self._model_management.get_creation_date()] ] if algoSetting.is_hyperparameter_tuning_enabled(): modelManagementModelSettingsJson = [] else: modelManagementModelSettingsJson = [ ["Training Dataset", self._model_management.get_datasetName()], [ "Target Column", self._model_management.get_target_variable() ], [ "Number Of Independent Variables", self._model_management.get_no_of_independent_variables() ], ["Algorithm", self._model_management.get_algorithm_name()], [ "Model Validation", self._model_management.get_validation_method() ], ["batch_size", str(self._model_management.get_batch_size())], ["Loss", self._model_management.get_loss_function()], ["Optimizer", self._model_management.get_optimizer()], ["Epochs", self._model_management.get_no_epochs()], [ "Metrics", self._model_management.get_model_evaluation_metrics() ] ] for i in modelmanagement_["hidden_layer_info"]: string = "" key = str(modelmanagement_["hidden_layer_info"][i] ["layer"]) + " " + str(i) + ":" for j in modelmanagement_["hidden_layer_info"][i]: string = string + str(j) + ":" + str( modelmanagement_["hidden_layer_info"][i][j]) + ", " modelManagementModelSettingsJson.append([key, string])
print(modelManagementModelSettingsJson) nnptrCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_summary_cards(self._model_summary) ] nnptrPerformanceCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_cards_regression( self._model_summary) ] nnptrOverviewCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_card_overview( self._model_management, modelManagementSummaryJson, modelManagementModelSettingsJson) ] nnptrDeploymentCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_deploy_empty_card() ] nnptr_Overview_Node = NarrativesTree() nnptr_Overview_Node.set_name("Overview") nnptr_Performance_Node = NarrativesTree() nnptr_Performance_Node.set_name("Performance") nnptr_Deployment_Node = NarrativesTree() nnptr_Deployment_Node.set_name("Deployment") for card in nnptrOverviewCards: nnptr_Overview_Node.add_a_card(card) for card in nnptrPerformanceCards: nnptr_Performance_Node.add_a_card(card) for card in nnptrDeploymentCards: nnptr_Deployment_Node.add_a_card(card) for card in nnptrCards: self._prediction_narrative.add_a_card(card) self._result_setter.set_model_summary({ "Neural Network (PyTorch)": json.loads( CommonUtils.convert_python_object_to_json(self._model_summary)) }) self._result_setter.set_nnptr_regression_model_summary( modelSummaryJson) self._result_setter.set_nnptr_cards(nnptrCards) self._result_setter.set_nnptr_nodes([ nnptr_Overview_Node, nnptr_Performance_Node, nnptr_Deployment_Node ]) self._result_setter.set_nnptr_fail_card({ "Algorithm_Name": "Neural Network (PyTorch)", "Success": "True" }) CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "completion", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") def Predict(self): self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight( ) self._scriptStages = { "initialization": { "summary": "Initialized The Neural Network (PyTorch) Scripts", "weight": 2 }, "predictionStart": { "summary": "Neural Network (PyTorch) Prediction Started", "weight": 2 }, "predictionFinished": { "summary": "Neural Network (PyTorch) Prediction Finished", "weight": 6 } } CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") SQLctx = SQLContext(sparkContext=self._spark.sparkContext, sparkSession=self._spark) dataSanity = True categorical_columns = self._dataframe_helper.get_string_columns() uid_col = self._dataframe_context.get_uid_column() if self._metaParser.check_column_isin_ignored_suggestion(uid_col): categorical_columns = list(set(categorical_columns) - {uid_col}) allDateCols = self._dataframe_context.get_date_columns() categorical_columns = list(set(categorical_columns) - set(allDateCols)) numerical_columns = self._dataframe_helper.get_numeric_columns() result_column = self._dataframe_context.get_result_column() test_data_path = self._dataframe_context.get_input_file() if self._mlEnv == "spark": pass elif self._mlEnv == "sklearn": CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "predictionStart", "info", display=True, 
emptyBin=False, customMsg=None, weightKey="total") score_data_path = self._dataframe_context.get_score_path( ) + "/data.csv" trained_model_path = "file://" + self._dataframe_context.get_model_path( ) trained_model_path += "/" + self._dataframe_context.get_model_for_scoring( ) + ".pt" print("trained_model_path", trained_model_path) print("score_data_path", score_data_path) if trained_model_path.startswith("file"): trained_model_path = trained_model_path[7:] #trained_model = joblib.load(trained_model_path) trained_model = torch.load(trained_model_path, map_location=torch.device('cpu')) model_columns = self._dataframe_context.get_model_features() print("model_columns", model_columns) try: df = self._data_frame.toPandas() except: df = self._data_frame # pandas_df = MLUtils.factorize_columns(df,[x for x in categorical_columns if x != result_column]) pandas_df = MLUtils.create_dummy_columns( df, [x for x in categorical_columns if x != result_column]) pandas_df = MLUtils.fill_missing_columns(pandas_df, model_columns, result_column) if uid_col: pandas_df = pandas_df[[ x for x in pandas_df.columns if x != uid_col ]] test_df = np.stack( [pandas_df[col].values for col in pandas_df.columns], 1) tensored_test_df = torch.tensor(test_df, dtype=torch.float) outputs_test_df_tensored = trained_model(tensored_test_df.float()) y_score_mid = outputs_test_df_tensored.tolist() y_score = [x[0] for x in y_score_mid] scoreKpiArray = MLUtils.get_scored_data_summary(y_score) kpiCard = NormalCard() kpiCardData = [KpiData(data=x) for x in scoreKpiArray] kpiCard.set_card_data(kpiCardData) kpiCard.set_cente_alignment(True) print(CommonUtils.convert_python_object_to_json(kpiCard)) self._result_setter.set_kpi_card_regression_score(kpiCard) pandas_df[result_column] = y_score df[result_column] = y_score df.to_csv(score_data_path, header=True, index=False) CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "predictionFinished", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") print("STARTING Measure ANALYSIS ...") columns_to_keep = [] columns_to_drop = [] columns_to_keep = self._dataframe_context.get_score_consider_columns( ) if len(columns_to_keep) > 0: columns_to_drop = list(set(df.columns) - set(columns_to_keep)) else: columns_to_drop += ["predicted_probability"] columns_to_drop = [ x for x in columns_to_drop if x in df.columns and x != result_column ] print("columns_to_drop", columns_to_drop) pandas_scored_df = df[list(set(columns_to_keep + [result_column]))] spark_scored_df = SQLctx.createDataFrame(pandas_scored_df) # spark_scored_df.write.csv(score_data_path+"/data",mode="overwrite",header=True) # TODO update metadata for the newly created dataframe self._dataframe_context.update_consider_columns(columns_to_keep) print(spark_scored_df.printSchema()) df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context, self._metaParser) df_helper.set_params() df = df_helper.get_data_frame() # self._dataframe_context.set_dont_send_message(True) try: fs = time.time() descr_stats_obj = DescriptiveStatsScript( df, df_helper, self._dataframe_context, self._result_setter, self._spark, self._prediction_narrative, scriptWeight=self._scriptWeightDict, analysisName="Descriptive analysis") descr_stats_obj.Run() print("DescriptiveStats Analysis Done in ", time.time() - fs, " seconds.") except: print("Frequency Analysis Failed ") # try: # fs = time.time() # df_helper.fill_na_dimension_nulls() # df = df_helper.get_data_frame() # 
dt_reg = DecisionTreeRegressionScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark,self._prediction_narrative,self._metaParser,scriptWeight=self._scriptWeightDict,analysisName="Predictive modeling") # dt_reg.Run() # print "DecisionTrees Analysis Done in ", time.time() - fs, " seconds." # except: # print "DTREE FAILED" try: fs = time.time() two_way_obj = TwoWayAnovaScript( df, df_helper, self._dataframe_context, self._result_setter, self._spark, self._prediction_narrative, self._metaParser, scriptWeight=self._scriptWeightDict, analysisName="Measure vs. Dimension") two_way_obj.Run() print("TwoWayAnova Analysis Done in ", time.time() - fs, " seconds.") except: print("Anova Analysis Failed")
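# TensorFlowRegScript is the tf.keras counterpart of the PyTorch regression script
# above: it builds a Sequential network from the "hidden_layer_info" training config
# (Dense, Dropout and arithmetic Lambda layers), compiles it with the configured
# optimizer/loss/metrics, saves it as an .h5 file and scores with model.predict().
# A minimal sketch of that config-driven construction, using a made-up two-layer
# config purely for illustration:
#
#   import tensorflow as tf
#
#   layer_cfg = {
#       "0": {"layer": "Dense", "units": 32, "activation": "relu"},
#       "1": {"layer": "Dense", "units": 1, "activation": "linear"},
#   }
#   model = tf.keras.models.Sequential()
#   model.add(tf.keras.layers.Dense(layer_cfg["0"]["units"],
#                                   activation=layer_cfg["0"]["activation"],
#                                   input_shape=(10,)))
#   model.add(tf.keras.layers.Dense(layer_cfg["1"]["units"],
#                                   activation=layer_cfg["1"]["activation"]))
#   model.compile(optimizer="adam", loss="mse", metrics=["mae"])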
class TensorFlowRegScript(object): def __init__(self, data_frame, df_helper, df_context, spark, prediction_narrative, result_setter, meta_parser, mlEnvironment="sklearn"): self._metaParser = meta_parser self._prediction_narrative = prediction_narrative self._result_setter = result_setter self._data_frame = data_frame self._dataframe_helper = df_helper self._dataframe_context = df_context self._spark = spark self._model_summary = MLModelSummary() self._score_summary = {} self._slug = GLOBALSETTINGS.MODEL_SLUG_MAPPING[ "Neural Network (TensorFlow)"] self._analysisName = "Neural Network (TensorFlow)" self._dataframe_context.set_analysis_name(self._analysisName) self._mlEnv = mlEnvironment self._datasetName = CommonUtils.get_dataset_name( self._dataframe_context.CSV_FILE) self._completionStatus = self._dataframe_context.get_completion_status( ) print(self._completionStatus, "initial completion status") self._messageURL = self._dataframe_context.get_message_url() self._scriptWeightDict = self._dataframe_context.get_ml_model_training_weight( ) self._ignoreMsg = self._dataframe_context.get_message_ignore() self._scriptStages = { "initialization": { "summary": "Initialized The Neural Network (TensorFlow) Regression Scripts", "weight": 1 }, "training": { "summary": "Neural Network (TensorFlow) Regression Model Training Started", "weight": 2 }, "completion": { "summary": "Neural Network (TensorFlow) Regression Model Training Finished", "weight": 1 }, } def Train(self): st_global = time.time() CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") appType = self._dataframe_context.get_app_type() algosToRun = self._dataframe_context.get_algorithms_to_run() algoSetting = [ x for x in algosToRun if x.get_algorithm_slug() == self._slug ][0] categorical_columns = self._dataframe_helper.get_string_columns() uid_col = self._dataframe_context.get_uid_column() if self._metaParser.check_column_isin_ignored_suggestion(uid_col): categorical_columns = list(set(categorical_columns) - {uid_col}) allDateCols = self._dataframe_context.get_date_columns() categorical_columns = list(set(categorical_columns) - set(allDateCols)) print(categorical_columns) result_column = self._dataframe_context.get_result_column() numerical_columns = self._dataframe_helper.get_numeric_columns() numerical_columns = [ x for x in numerical_columns if x != result_column ] model_path = self._dataframe_context.get_model_path() if model_path.startswith("file"): model_path = model_path[7:] validationDict = self._dataframe_context.get_validation_dict() print("model_path", model_path) pipeline_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/pipeline/" model_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/model" pmml_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/modelPmml" df = self._data_frame if self._mlEnv == "spark": pass elif self._mlEnv == "sklearn": model_filepath = model_path + "/" + self._slug + "/model.pkl" x_train, x_test, y_train, y_test = self._dataframe_helper.get_train_test_data( ) x_train = MLUtils.create_dummy_columns( x_train, [x for x in categorical_columns if x != result_column]) x_test = MLUtils.create_dummy_columns( x_test, [x for x in categorical_columns if x != result_column]) x_test = MLUtils.fill_missing_columns(x_test, x_train.columns, result_column) st = time.time() 
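# The code that follows first logs training progress and reads the evaluation-metric
# settings, then walks the params_tf["hidden_layer_info"] config in layer order and
# assembles the tf.keras Sequential model from it: Dense layers (with their
# initializers, regularizers and constraints, plus optional BatchNormalization),
# Dropout layers, and simple add/multiply/subtract/divide Lambda layers. The model
# is then compiled with the optimizer, loss and metrics taken from algoParams and
# fitted for the configured number of epochs and batch size.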
CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "training", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") if algoSetting.is_hyperparameter_tuning_enabled(): pass else: self._result_setter.set_hyper_parameter_results( self._slug, None) evaluationMetricDict = algoSetting.get_evaluvation_metric( Type="Regression") evaluationMetricDict[ "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] params_tf = algoSetting.get_tf_params_dict() algoParams = algoSetting.get_params_dict() algoParams = {k: v for k, v in list(algoParams.items())} model = tf.keras.models.Sequential() first_layer_flag = True for i in range(len(list( params_tf['hidden_layer_info'].keys()))): if params_tf['hidden_layer_info'][str( i)]["layer"] == "Dense": if first_layer_flag: model.add( tf.keras.layers.Dense( params_tf['hidden_layer_info'][str( i)]["units"], activation=params_tf['hidden_layer_info'][ str(i)]["activation"], input_shape=(len(x_train.columns), ), use_bias=params_tf['hidden_layer_info'][ str(i)]["use_bias"], kernel_initializer=params_tf[ 'hidden_layer_info'][str( i)]["kernel_initializer"], bias_initializer=params_tf[ 'hidden_layer_info'][str( i)]["bias_initializer"], kernel_regularizer=params_tf[ 'hidden_layer_info'][str( i)]["kernel_regularizer"], bias_regularizer=params_tf[ 'hidden_layer_info'][str( i)]["bias_regularizer"], activity_regularizer=params_tf[ 'hidden_layer_info'][str( i)]["activity_regularizer"], kernel_constraint=params_tf[ 'hidden_layer_info'][str( i)]["kernel_constraint"], bias_constraint=params_tf[ 'hidden_layer_info'][str( i)]["bias_constraint"])) try: if params_tf['hidden_layer_info'][str( i)]["batch_normalization"] == "True": model.add( tf.keras.layers.BatchNormalization()) except: print( "BATCH_NORM_FAILED ##########################" ) pass first_layer_flag = False else: model.add( tf.keras.layers.Dense( params_tf['hidden_layer_info'][str( i)]["units"], activation=params_tf['hidden_layer_info'][ str(i)]["activation"], use_bias=params_tf['hidden_layer_info'][ str(i)]["use_bias"], kernel_initializer=params_tf[ 'hidden_layer_info'][str( i)]["kernel_initializer"], bias_initializer=params_tf[ 'hidden_layer_info'][str( i)]["bias_initializer"], kernel_regularizer=params_tf[ 'hidden_layer_info'][str( i)]["kernel_regularizer"], bias_regularizer=params_tf[ 'hidden_layer_info'][str( i)]["bias_regularizer"], activity_regularizer=params_tf[ 'hidden_layer_info'][str( i)]["activity_regularizer"], kernel_constraint=params_tf[ 'hidden_layer_info'][str( i)]["kernel_constraint"], bias_constraint=params_tf[ 'hidden_layer_info'][str( i)]["bias_constraint"])) try: if params_tf['hidden_layer_info'][str( i)]["batch_normalization"] == "True": model.add( tf.keras.layers.BatchNormalization()) except: print( "BATCH_NORM_FAILED ##########################" ) pass elif params_tf['hidden_layer_info'][str( i)]["layer"] == "Dropout": model.add( tf.keras.layers.Dropout( float(params_tf['hidden_layer_info'][str(i)] ["rate"]))) elif params_tf['hidden_layer_info'][str( i)]["layer"] == "Lambda": if params_tf['hidden_layer_info'][str( i)]["lambda"] == "Addition": model.add( tf.keras.layers.Lambda(lambda x: x + int( params_tf['hidden_layer_info'][str(i)][ "units"]))) if params_tf['hidden_layer_info'][str( i)]["lambda"] == "Multiplication": model.add( tf.keras.layers.Lambda(lambda x: x * int( params_tf['hidden_layer_info'][str(i)][ "units"]))) if 
params_tf['hidden_layer_info'][str( i)]["lambda"] == "Subtraction": model.add( tf.keras.layers.Lambda(lambda x: x - int( params_tf['hidden_layer_info'][str(i)][ "units"]))) if params_tf['hidden_layer_info'][str( i)]["lambda"] == "Division": model.add( tf.keras.layers.Lambda(lambda x: old_div( x, int(params_tf['hidden_layer_info'][str(i)][ "units"])))) model.compile(optimizer=algoParams["optimizer"], loss=algoParams["loss"], metrics=[algoParams['metrics']]) model.fit(x_train, y_train, epochs=algoParams["number_of_epochs"], verbose=1, batch_size=algoParams["batch_size"]) bestEstimator = model print(model.summary()) trainingTime = time.time() - st y_score = bestEstimator.predict(x_test) y_score = list(y_score.flatten()) try: y_prob = bestEstimator.predict_proba(x_test) except: y_prob = [0] * len(y_score) featureImportance = {} objs = { "trained_model": bestEstimator, "actual": y_test, "predicted": y_score, "probability": y_prob, "feature_importance": featureImportance, "featureList": list(x_train.columns), "labelMapping": {} } #featureImportance = objs["trained_model"].feature_importances_ #featuresArray = [(col_name, featureImportance[idx]) for idx, col_name in enumerate(x_train.columns)] featuresArray = [] if not algoSetting.is_hyperparameter_tuning_enabled(): modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1" modelFilepathArr = model_filepath.split("/")[:-1] modelFilepathArr.append(modelName + ".h5") objs["trained_model"].save("/".join(modelFilepathArr)) #joblib.dump(objs["trained_model"],"/".join(modelFilepathArr)) metrics = {} metrics["r2"] = r2_score(y_test, y_score) metrics["neg_mean_squared_error"] = mean_squared_error( y_test, y_score) metrics["neg_mean_absolute_error"] = mean_absolute_error( y_test, y_score) metrics["RMSE"] = sqrt(metrics["neg_mean_squared_error"]) metrics["explained_variance_score"] = explained_variance_score( y_test, y_score) transformed = pd.DataFrame({ "prediction": y_score, result_column: y_test }) transformed["difference"] = transformed[ result_column] - transformed["prediction"] transformed["mape"] = old_div( np.abs(transformed["difference"]) * 100, transformed[result_column]) sampleData = None nrows = transformed.shape[0] if nrows > 100: sampleData = transformed.sample(n=100, random_state=420) else: sampleData = transformed print(sampleData.head()) if transformed["mape"].max() > 100: GLOBALSETTINGS.MAPEBINS.append(transformed["mape"].max()) mapeCountArr = list( pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS). value_counts().to_dict().items()) GLOBALSETTINGS.MAPEBINS.pop(5) else: mapeCountArr = list( pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS). 
value_counts().to_dict().items()) mapeStatsArr = [(str(idx), dictObj) for idx, dictObj in enumerate( sorted([{ "count": x[1], "splitRange": (x[0].left, x[0].right) } for x in mapeCountArr], key=lambda x: x["splitRange"][0]))] print(mapeStatsArr) print(mapeCountArr) predictionColSummary = transformed["prediction"].describe( ).to_dict() quantileBins = [ predictionColSummary["min"], predictionColSummary["25%"], predictionColSummary["50%"], predictionColSummary["75%"], predictionColSummary["max"] ] print(quantileBins) quantileBins = sorted(list(set(quantileBins))) transformed["quantileBinId"] = pd.cut(transformed["prediction"], quantileBins) quantileDf = transformed.groupby("quantileBinId").agg({ "prediction": [np.sum, np.mean, np.size] }).reset_index() quantileDf.columns = ["prediction", "sum", "mean", "count"] print(quantileDf) quantileArr = list(quantileDf.T.to_dict().items()) quantileSummaryArr = [(obj[0], { "splitRange": (obj[1]["prediction"].left, obj[1]["prediction"].right), "count": obj[1]["count"], "mean": obj[1]["mean"], "sum": obj[1]["sum"] }) for obj in quantileArr] print(quantileSummaryArr) runtime = round((time.time() - st_global), 2) self._model_summary.set_model_type("regression") self._model_summary.set_algorithm_name( "Neural Network (TensorFlow)") self._model_summary.set_algorithm_display_name( "Neural Network (TensorFlow)") self._model_summary.set_slug(self._slug) self._model_summary.set_training_time(runtime) self._model_summary.set_training_time(trainingTime) self._model_summary.set_target_variable(result_column) self._model_summary.set_validation_method( validationDict["displayName"]) self._model_summary.set_model_evaluation_metrics(metrics) self._model_summary.set_model_params(params_tf) self._model_summary.set_quantile_summary(quantileSummaryArr) self._model_summary.set_mape_stats(mapeStatsArr) self._model_summary.set_sample_data(sampleData.to_dict()) self._model_summary.set_feature_importance(featuresArray) self._model_summary.set_feature_list(list(x_train.columns)) self._model_summary.set_model_mse( metrics["neg_mean_squared_error"]) self._model_summary.set_model_mae( metrics["neg_mean_absolute_error"]) self._model_summary.set_rmse(metrics["RMSE"]) self._model_summary.set_model_rsquared(metrics["r2"]) self._model_summary.set_model_exp_variance_score( metrics["explained_variance_score"]) try: pmml_filepath = str(model_path) + "/" + str( self._slug) + "/traindeModel.pmml" modelPmmlPipeline = PMMLPipeline([("pretrained-estimator", objs["trained_model"])]) modelPmmlPipeline.target_field = result_column modelPmmlPipeline.active_fields = np.array( [col for col in x_train.columns if col != result_column]) sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True) pmmlfile = open(pmml_filepath, "r") pmmlText = pmmlfile.read() pmmlfile.close() self._result_setter.update_pmml_object({self._slug: pmmlText}) except: pass if algoSetting.is_hyperparameter_tuning_enabled(): modelDropDownObj = { "name": self._model_summary.get_algorithm_name(), "evaluationMetricValue": metrics[evaluationMetricDict["name"]], "evaluationMetricName": evaluationMetricDict["name"], "slug": self._model_summary.get_slug(), "Model Id": modelName } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } else: modelDropDownObj = { "name": 
self._model_summary.get_algorithm_name(), "evaluationMetricValue": metrics[evaluationMetricDict["name"]], "evaluationMetricName": evaluationMetricDict["name"], "slug": self._model_summary.get_slug(), "Model Id": modelName } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } modelmanagement_ = params_tf modelmanagement_.update(algoParams) self._model_management = MLModelSummary() if algoSetting.is_hyperparameter_tuning_enabled(): pass else: self._model_management.set_layer_info( data=modelmanagement_['hidden_layer_info']) self._model_management.set_loss_function( data=modelmanagement_['loss']) self._model_management.set_optimizer( data=modelmanagement_['optimizer']) self._model_management.set_batch_size( data=modelmanagement_['batch_size']) self._model_management.set_no_epochs( data=modelmanagement_['number_of_epochs']) self._model_management.set_model_evaluation_metrics( data=modelmanagement_['metrics']) self._model_management.set_job_type( self._dataframe_context.get_job_name()) #Project name self._model_management.set_training_status( data="completed") # training status self._model_management.set_no_of_independent_variables( data=x_train) #no of independent varables self._model_management.set_training_time(runtime) # run time self._model_management.set_rmse(metrics["RMSE"]) self._model_management.set_algorithm_name( "Neural Network (TensorFlow)") #algorithm name self._model_management.set_validation_method( str(validationDict["displayName"]) + "(" + str(validationDict["value"]) + ")") #validation method self._model_management.set_target_variable( result_column) #target column name self._model_management.set_creation_date(data=str( datetime.now().strftime('%b %d ,%Y %H:%M '))) #creation date self._model_management.set_datasetName(self._datasetName) modelManagementSummaryJson = [ ["Project Name", self._model_management.get_job_type()], ["Algorithm", self._model_management.get_algorithm_name()], ["Training Status", self._model_management.get_training_status()], ["RMSE", self._model_management.get_rmse()], ["RunTime", self._model_management.get_training_time()], #["Owner",None], ["Created On", self._model_management.get_creation_date()] ] if algoSetting.is_hyperparameter_tuning_enabled(): modelManagementModelSettingsJson = [] else: modelManagementModelSettingsJson = [ ["Training Dataset", self._model_management.get_datasetName()], [ "Target Column", self._model_management.get_target_variable() ], [ "Number Of Independent Variables", self._model_management.get_no_of_independent_variables() ], ["Algorithm", self._model_management.get_algorithm_name()], [ "Model Validation", self._model_management.get_validation_method() ], ["batch_size", str(self._model_management.get_batch_size())], ["Loss", self._model_management.get_loss_function()], ["Optimizer", self._model_management.get_optimizer()], ["Epochs", self._model_management.get_no_epochs()], [ "Metrics", self._model_management.get_model_evaluation_metrics() ] ] for i in range( len(list(modelmanagement_['hidden_layer_info'].keys()))): string = "" key = "layer No-" + str(i) + "-" + str( modelmanagement_["hidden_layer_info"][str(i)]["layer"] + "-") for j in modelmanagement_["hidden_layer_info"][str(i)]: modelManagementModelSettingsJson.append([ key + j + ":", 
modelmanagement_["hidden_layer_info"][str(i)][j] ]) print(modelManagementModelSettingsJson) tfregCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_summary_cards(self._model_summary) ] tfregPerformanceCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_cards_regression( self._model_summary) ] tfregOverviewCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_card_overview( self._model_management, modelManagementSummaryJson, modelManagementModelSettingsJson) ] tfregDeploymentCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_deploy_empty_card() ] TFReg_Overview_Node = NarrativesTree() TFReg_Overview_Node.set_name("Overview") TFReg_Performance_Node = NarrativesTree() TFReg_Performance_Node.set_name("Performance") TFReg_Deployment_Node = NarrativesTree() TFReg_Deployment_Node.set_name("Deployment") for card in tfregOverviewCards: TFReg_Overview_Node.add_a_card(card) for card in tfregPerformanceCards: TFReg_Performance_Node.add_a_card(card) for card in tfregDeploymentCards: TFReg_Deployment_Node.add_a_card(card) for card in tfregCards: self._prediction_narrative.add_a_card(card) self._result_setter.set_model_summary({ "Neural Network (TensorFlow)": json.loads( CommonUtils.convert_python_object_to_json(self._model_summary)) }) self._result_setter.set_tfreg_regression_model_summart( modelSummaryJson) self._result_setter.set_tfreg_cards(tfregCards) self._result_setter.set_tfreg_nodes([ TFReg_Overview_Node, TFReg_Performance_Node, TFReg_Deployment_Node ]) self._result_setter.set_tfreg_fail_card({ "Algorithm_Name": "Neural Network (TensorFlow)", "Success": "True" }) CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "completion", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") def Predict(self): self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight( ) self._scriptStages = { "initialization": { "summary": "Initialized The Neural Network (TensorFlow) Regression Scripts", "weight": 2 }, "predictionStart": { "summary": "Neural Network (TensorFlow) Regression Model Prediction Started", "weight": 2 }, "predictionFinished": { "summary": "Neural Network (TensorFlow) Regression Model Prediction Finished", "weight": 6 } } CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") SQLctx = SQLContext(sparkContext=self._spark.sparkContext, sparkSession=self._spark) dataSanity = True categorical_columns = self._dataframe_helper.get_string_columns() uid_col = self._dataframe_context.get_uid_column() if self._metaParser.check_column_isin_ignored_suggestion(uid_col): categorical_columns = list(set(categorical_columns) - {uid_col}) allDateCols = self._dataframe_context.get_date_columns() categorical_columns = list(set(categorical_columns) - set(allDateCols)) numerical_columns = self._dataframe_helper.get_numeric_columns() result_column = self._dataframe_context.get_result_column() test_data_path = self._dataframe_context.get_input_file() if self._mlEnv == "spark": pass elif self._mlEnv == "sklearn": CommonUtils.create_update_and_save_progress_message( 
                self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug,
                "predictionStart", "info", display=True, emptyBin=False, customMsg=None,
                weightKey="total")
            score_data_path = self._dataframe_context.get_score_path() + "/data.csv"
            trained_model_path = "file://" + self._dataframe_context.get_model_path()
            trained_model_path += "/" + self._dataframe_context.get_model_for_scoring() + ".h5"
            print("trained_model_path", trained_model_path)
            print("score_data_path", score_data_path)
            if trained_model_path.startswith("file"):
                trained_model_path = trained_model_path[7:]
            # trained_model = joblib.load(trained_model_path)
            trained_model = tf.keras.models.load_model(trained_model_path)
            model_columns = self._dataframe_context.get_model_features()
            print("model_columns", model_columns)
            df = self._data_frame.toPandas()
            # pandas_df = MLUtils.factorize_columns(df, [x for x in categorical_columns if x != result_column])
            pandas_df = MLUtils.create_dummy_columns(
                df, [x for x in categorical_columns if x != result_column])
            pandas_df = MLUtils.fill_missing_columns(pandas_df, model_columns, result_column)
            if uid_col:
                pandas_df = pandas_df[[x for x in pandas_df.columns if x != uid_col]]
            y_score = trained_model.predict(pandas_df)
            y_score = list(y_score.flatten())
            scoreKpiArray = MLUtils.get_scored_data_summary(y_score)
            kpiCard = NormalCard()
            kpiCardData = [KpiData(data=x) for x in scoreKpiArray]
            kpiCard.set_card_data(kpiCardData)
            kpiCard.set_cente_alignment(True)
            print(CommonUtils.convert_python_object_to_json(kpiCard))
            self._result_setter.set_kpi_card_regression_score(kpiCard)
            pandas_df[result_column] = y_score
            df[result_column] = y_score
            df.to_csv(score_data_path, header=True, index=False)
            CommonUtils.create_update_and_save_progress_message(
                self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug,
                "predictionFinished", "info", display=True, emptyBin=False, customMsg=None,
                weightKey="total")
            print("STARTING Measure ANALYSIS ...")
            columns_to_keep = []
            columns_to_drop = []
            columns_to_keep = self._dataframe_context.get_score_consider_columns()
            if len(columns_to_keep) > 0:
                columns_to_drop = list(set(df.columns) - set(columns_to_keep))
            else:
                columns_to_drop += ["predicted_probability"]
            columns_to_drop = [x for x in columns_to_drop
                               if x in df.columns and x != result_column]
            print("columns_to_drop", columns_to_drop)
            pandas_scored_df = df[list(set(columns_to_keep + [result_column]))]
            spark_scored_df = SQLctx.createDataFrame(pandas_scored_df)
            # spark_scored_df.write.csv(score_data_path+"/data", mode="overwrite", header=True)
            # TODO update metadata for the newly created dataframe
            self._dataframe_context.update_consider_columns(columns_to_keep)
            print(spark_scored_df.printSchema())
            df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context, self._metaParser)
            df_helper.set_params()
            df = df_helper.get_data_frame()
            # self._dataframe_context.set_dont_send_message(True)
            try:
                fs = time.time()
                descr_stats_obj = DescriptiveStatsScript(df, df_helper, self._dataframe_context,
                                                         self._result_setter, self._spark,
                                                         self._prediction_narrative,
                                                         scriptWeight=self._scriptWeightDict,
                                                         analysisName="Descriptive analysis")
                descr_stats_obj.Run()
                print("DescriptiveStats Analysis Done in ", time.time() - fs, " seconds.")
            except:
                print("Descriptive Stats Analysis Failed")
            # try:
            #     fs = time.time()
            #     df_helper.fill_na_dimension_nulls()
            #     df = df_helper.get_data_frame()
            #     dt_reg = DecisionTreeRegressionScript(df, df_helper, self._dataframe_context, self._result_setter,
            #                                           self._spark, self._prediction_narrative, self._metaParser,
            #                                           scriptWeight=self._scriptWeightDict,
            #                                           analysisName="Predictive modeling")
            #     dt_reg.Run()
            #     print("DecisionTrees Analysis Done in ", time.time() - fs, " seconds.")
            # except:
            #     print("DTREE FAILED")
            try:
                fs = time.time()
                two_way_obj = TwoWayAnovaScript(df, df_helper, self._dataframe_context,
                                                self._result_setter, self._spark,
                                                self._prediction_narrative, self._metaParser,
                                                scriptWeight=self._scriptWeightDict,
                                                analysisName="Measure vs. Dimension")
                two_way_obj.Run()
                print("TwoWayAnova Analysis Done in ", time.time() - fs, " seconds.")
            except:
                print("Anova Analysis Failed")
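# ---------------------------------------------------------------------------
# The sklearn scoring path above relies on MLUtils.create_dummy_columns and
# MLUtils.fill_missing_columns to make the scoring frame line up with the
# feature columns the Keras model saw at training time. A minimal sketch of
# that idea, assuming those helpers behave roughly like pandas.get_dummies
# followed by a reindex against the saved model_columns list; the helper
# below is illustrative only and is not part of MLUtils.
# ---------------------------------------------------------------------------
def _align_score_frame_sketch(score_df, categorical_columns, model_columns, result_column):
    """One-hot encode categoricals and force the train-time column set and order."""
    import pandas as pd

    dummy_cols = [c for c in categorical_columns if c != result_column]
    encoded = pd.get_dummies(score_df, columns=dummy_cols)
    feature_cols = [c for c in model_columns if c != result_column]
    # levels missing from the scoring data become all-zero columns; columns
    # that never existed at training time are dropped by the reindex
    return encoded.reindex(columns=feature_cols, fill_value=0)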
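# ---------------------------------------------------------------------------
# In MultilayerPerceptronPysparkScript.Train below, the user-supplied
# "layers" candidates only describe the hidden layers; the script prepends
# the assembled feature-vector size and appends the number of target levels
# before the ParamGridBuilder grid is built. A standalone sketch of that
# transformation (an illustrative helper, not part of the class below):
# ---------------------------------------------------------------------------
def _build_mlp_layer_specs_sketch(hidden_layer_options, input_feats, n_classes):
    """Return full layer specs [input, *hidden, output] for each candidate."""
    return [[input_feats] + list(hidden) + [n_classes] for hidden in hidden_layer_options]

# e.g. _build_mlp_layer_specs_sketch([[16], [32, 16]], input_feats=40, n_classes=3)
# -> [[40, 16, 3], [40, 32, 16, 3]]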
class MultilayerPerceptronPysparkScript(object): def __init__(self, data_frame, df_helper, df_context, spark, prediction_narrative, result_setter, meta_parser, mlEnvironment="pyspark"): self._metaParser = meta_parser self._prediction_narrative = prediction_narrative self._result_setter = result_setter self._data_frame = data_frame self._dataframe_helper = df_helper self._dataframe_context = df_context self._ignoreMsg = self._dataframe_context.get_message_ignore() self._spark = spark self._model_summary = MLModelSummary() self._score_summary = {} self._slug = GLOBALSETTINGS.MODEL_SLUG_MAPPING["sparkmlpclassifier"] self._targetLevel = self._dataframe_context.get_target_level_for_model() self._completionStatus = self._dataframe_context.get_completion_status() self._analysisName = self._slug self._messageURL = self._dataframe_context.get_message_url() self._scriptWeightDict = self._dataframe_context.get_ml_model_training_weight() self._mlEnv = mlEnvironment self._scriptStages = { "initialization": { "summary": "Initialized the Multilayer Perceptron Scripts", "weight": 4 }, "training": { "summary": "Multilayer Perceptron Model Training Started", "weight": 2 }, "completion": { "summary": "Multilayer Perceptron Model Training Finished", "weight": 4 }, } def Train(self): st_global = time.time() CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") algosToRun = self._dataframe_context.get_algorithms_to_run() algoSetting = [x for x in algosToRun if x.get_algorithm_slug()==self._slug][0] categorical_columns = self._dataframe_helper.get_string_columns() uid_col = self._dataframe_context.get_uid_column() if self._metaParser.check_column_isin_ignored_suggestion(uid_col): categorical_columns = list(set(categorical_columns) - {uid_col}) allDateCols = self._dataframe_context.get_date_columns() categorical_columns = list(set(categorical_columns) - set(allDateCols)) numerical_columns = self._dataframe_helper.get_numeric_columns() result_column = self._dataframe_context.get_result_column() categorical_columns = [x for x in categorical_columns if x != result_column] appType = self._dataframe_context.get_app_type() model_path = self._dataframe_context.get_model_path() if model_path.startswith("file"): model_path = model_path[7:] validationDict = self._dataframe_context.get_validation_dict() # pipeline_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/pipeline/" # model_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/model" # pmml_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/modelPmml" df = self._data_frame levels = df.select(result_column).distinct().count() appType = self._dataframe_context.get_app_type() model_filepath = model_path + "/" + self._slug + "/model" pmml_filepath = str(model_path) + "/" + str(self._slug) + "/trainedModel.pmml" CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "training", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") st = time.time() pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns, categorical_columns, result_column) vectorFeats = pipeline.getStages()[-1].transform(df) input_feats = len(vectorFeats.select('features').take(1)[0][0]) trainingData, validationData = MLUtils.get_training_and_validation_data(df, result_column, 0.8) # indexed labelIndexer = 
StringIndexer(inputCol=result_column, outputCol="label") # OriginalTargetconverter = IndexToString(inputCol="label", outputCol="originalTargetColumn") # Label Mapping and Inverse labelIdx = labelIndexer.fit(trainingData) labelMapping = {k: v for k, v in enumerate(labelIdx.labels)} inverseLabelMapping = {v: float(k) for k, v in enumerate(labelIdx.labels)} clf = MultilayerPerceptronClassifier() if not algoSetting.is_hyperparameter_tuning_enabled(): algoParams = algoSetting.get_params_dict() else: algoParams = algoSetting.get_params_dict_hyperparameter() clfParams = [prm.name for prm in clf.params] algoParams = {getattr(clf, k): v if isinstance(v, list) else [v] for k, v in algoParams.items() if k in clfParams} paramGrid = ParamGridBuilder() layer_param_val = algoParams[getattr(clf, 'layers')] for layer in layer_param_val: layer.insert(0, input_feats) layer.append(levels) print('layer_param_val =', layer_param_val) # if not algoSetting.is_hyperparameter_tuning_enabled(): # for k,v in algoParams.items(): # if k.name == 'layers': # paramGrid = paramGrid.addGrid(k,layer_param_val) # else: # paramGrid = paramGrid.addGrid(k,v) # paramGrid = paramGrid.build() # else: for k, v in algoParams.items(): if v == [None] * len(v): continue if k.name == 'layers': paramGrid = paramGrid.addGrid(k, layer_param_val) else: paramGrid = paramGrid.addGrid(k, v) paramGrid = paramGrid.build() if len(paramGrid) > 1: hyperParamInitParam = algoSetting.get_hyperparameter_params() evaluationMetricDict = {"name": hyperParamInitParam["evaluationMetric"]} evaluationMetricDict["displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] else: evaluationMetricDict = {"name": GLOBALSETTINGS.CLASSIFICATION_MODEL_EVALUATION_METRIC} evaluationMetricDict["displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] self._result_setter.set_hyper_parameter_results(self._slug, None) if validationDict["name"] == "kFold": numFold = int(validationDict["value"]) estimator = Pipeline(stages=[pipeline, labelIndexer, clf]) if algoSetting.is_hyperparameter_tuning_enabled(): modelFilepath = "/".join(model_filepath.split("/")[:-1]) pySparkHyperParameterResultObj = PySparkGridSearchResult(estimator, paramGrid, appType, modelFilepath, levels, evaluationMetricDict, trainingData, validationData, numFold, self._targetLevel, labelMapping, inverseLabelMapping, df) resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models() self._result_setter.set_hyper_parameter_results(self._slug, resultArray) self._result_setter.set_metadata_parallel_coordinates(self._slug, { "ignoreList": pySparkHyperParameterResultObj.get_ignore_list(), "hideColumns": pySparkHyperParameterResultObj.get_hide_columns(), "metricColName": pySparkHyperParameterResultObj.get_comparison_metric_colname(), "columnOrder": pySparkHyperParameterResultObj.get_keep_columns()}) bestModel = pySparkHyperParameterResultObj.getBestModel() prediction = pySparkHyperParameterResultObj.getBestPrediction() bestModelName = resultArray[0]["Model Id"] else: crossval = CrossValidator(estimator=estimator, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator() if levels == 2 else MulticlassClassificationEvaluator(), numFolds=3 if numFold is None else numFold) # use 3+ folds in practice cvrf = crossval.fit(trainingData) prediction = cvrf.transform(validationData) bestModel = cvrf.bestModel bestModelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1" else: train_test_ratio = 
float(self._dataframe_context.get_train_test_split()) estimator = Pipeline(stages=[pipeline, labelIndexer, clf]) if algoSetting.is_hyperparameter_tuning_enabled(): modelFilepath = "/".join(model_filepath.split("/")[:-1]) pySparkHyperParameterResultObj = PySparkTrainTestResult(estimator, paramGrid, appType, modelFilepath, levels, evaluationMetricDict, trainingData, validationData, train_test_ratio, self._targetLevel, labelMapping, inverseLabelMapping, df) resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models() self._result_setter.set_hyper_parameter_results(self._slug, resultArray) self._result_setter.set_metadata_parallel_coordinates(self._slug, { "ignoreList": pySparkHyperParameterResultObj.get_ignore_list(), "hideColumns": pySparkHyperParameterResultObj.get_hide_columns(), "metricColName": pySparkHyperParameterResultObj.get_comparison_metric_colname(), "columnOrder": pySparkHyperParameterResultObj.get_keep_columns()}) bestModel = pySparkHyperParameterResultObj.getBestModel() prediction = pySparkHyperParameterResultObj.getBestPrediction() bestModelName = resultArray[0]["Model Id"] else: tvs = TrainValidationSplit(estimator=estimator, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator() if levels == 2 else MulticlassClassificationEvaluator(), trainRatio=train_test_ratio) tvrf = tvs.fit(trainingData) prediction = tvrf.transform(validationData) bestModel = tvrf.bestModel bestModelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1" MLUtils.save_pipeline_or_model(bestModel,model_filepath) predsAndLabels = prediction.select(['prediction', 'label']).rdd.map(tuple) metrics = MulticlassMetrics(predsAndLabels) posLabel = inverseLabelMapping[self._targetLevel] conf_mat_ar = metrics.confusionMatrix().toArray() print(conf_mat_ar) confusion_matrix = {} for i in range(len(conf_mat_ar)): confusion_matrix[labelMapping[i]] = {} for j, val in enumerate(conf_mat_ar[i]): confusion_matrix[labelMapping[i]][labelMapping[j]] = val print(confusion_matrix) trainingTime = time.time() - st f1_score = metrics.fMeasure(inverseLabelMapping[self._targetLevel], 1.0) precision = metrics.precision(inverseLabelMapping[self._targetLevel]) recall = metrics.recall(inverseLabelMapping[self._targetLevel]) accuracy = metrics.accuracy roc_auc = 'Undefined' if levels == 2: bin_metrics = BinaryClassificationMetrics(predsAndLabels) roc_auc = bin_metrics.areaUnderROC precision = metrics.precision(inverseLabelMapping[self._targetLevel]) recall = metrics.recall(inverseLabelMapping[self._targetLevel]) print(f1_score,precision,recall,accuracy) #gain chart implementation def cal_prob_eval(x): if len(x) == 1: if x == posLabel: return(float(x[1])) else: return(float(1 - x[1])) else: return(float(x[int(posLabel)])) column_name= 'probability' def y_prob_for_eval_udf(): return udf(lambda x:cal_prob_eval(x)) prediction = prediction.withColumn("y_prob_for_eval", y_prob_for_eval_udf()(col(column_name))) try: pys_df = prediction.select(['y_prob_for_eval','prediction','label']) gain_lift_ks_obj = GainLiftKS(pys_df, 'y_prob_for_eval', 'prediction', 'label', posLabel, self._spark) gain_lift_KS_dataframe = gain_lift_ks_obj.Run().toPandas() except: try: temp_df = pys_df.toPandas() gain_lift_ks_obj = GainLiftKS(temp_df, 'y_prob_for_eval', 'prediction', 'label', posLabel, self._spark) gain_lift_KS_dataframe = gain_lift_ks_obj.Rank_Ordering() except: print("gain chant failed") gain_lift_KS_dataframe = None objs = {"trained_model": bestModel, "actual": prediction.select('label'), "predicted": 
prediction.select('prediction'), "probability": prediction.select('probability'), "feature_importance": None, "featureList": list(categorical_columns) + list(numerical_columns), "labelMapping": labelMapping} # Calculating prediction_split val_cnts = prediction.groupBy('label').count() val_cnts = map(lambda row: row.asDict(), val_cnts.collect()) prediction_split = {} total_nos = objs['actual'].count() for item in val_cnts: classname = labelMapping[item['label']] prediction_split[classname] = round(item['count'] * 100 / float(total_nos), 2) if not algoSetting.is_hyperparameter_tuning_enabled(): # modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1" modelFilepathArr = model_filepath.split("/")[:-1] modelFilepathArr.append(bestModelName) bestModel.save("/".join(modelFilepathArr)) runtime = round((time.time() - st_global), 2) try: print(pmml_filepath) pmmlBuilder = PMMLBuilder(self._spark, trainingData, bestModel).putOption(clf, 'compact', True) pmmlBuilder.buildFile(pmml_filepath) pmmlfile = open(pmml_filepath, "r") pmmlText = pmmlfile.read() pmmlfile.close() self._result_setter.update_pmml_object({self._slug: pmmlText}) except Exception as e: print("PMML failed...", str(e)) pass cat_cols = list(set(categorical_columns) - {result_column}) self._model_summary = MLModelSummary() self._model_summary.set_algorithm_name("Spark ML Multilayer Perceptron") self._model_summary.set_algorithm_display_name("Spark ML Multilayer Perceptron") self._model_summary.set_slug(self._slug) self._model_summary.set_training_time(runtime) self._model_summary.set_confusion_matrix(confusion_matrix) self._model_summary.set_feature_importance(objs["feature_importance"]) self._model_summary.set_feature_list(objs["featureList"]) self._model_summary.set_model_accuracy(accuracy) self._model_summary.set_training_time(round((time.time() - st), 2)) self._model_summary.set_precision_recall_stats([precision, recall]) self._model_summary.set_model_precision(precision) self._model_summary.set_model_recall(recall) self._model_summary.set_target_variable(result_column) self._model_summary.set_prediction_split(prediction_split) self._model_summary.set_validation_method("KFold") self._model_summary.set_level_map_dict(objs["labelMapping"]) self._model_summary.set_model_features(objs["featureList"]) self._model_summary.set_level_counts( self._metaParser.get_unique_level_dict(list(set(categorical_columns)) + [result_column])) self._model_summary.set_num_trees(None) self._model_summary.set_num_rules(300) self._model_summary.set_target_level(self._targetLevel) modelManagementJson = { "Model ID": "SPMLP-" + bestModelName, "Project Name": self._dataframe_context.get_job_name(), "Algorithm": self._model_summary.get_algorithm_name(), "Status": 'Completed', "Accuracy": accuracy, "Runtime": runtime, "Created On": "", "Owner": "", "Deployment": 0, "Action": '' } # if not algoSetting.is_hyperparameter_tuning_enabled(): # modelDropDownObj = { # "name": self._model_summary.get_algorithm_name(), # "evaluationMetricValue": locals()[evaluationMetricDict["name"]], # accuracy # "evaluationMetricName": evaluationMetricDict["displayName"], # accuracy # "slug": self._model_summary.get_slug(), # "Model Id": bestModelName # } # modelSummaryJson = { # "dropdown": modelDropDownObj, # "levelcount": self._model_summary.get_level_counts(), # "modelFeatureList": self._model_summary.get_feature_list(), # "levelMapping": self._model_summary.get_level_map_dict(), # "slug": self._model_summary.get_slug(), # "name": 
self._model_summary.get_algorithm_name() # } # else: modelDropDownObj = { "name": self._model_summary.get_algorithm_name(), "evaluationMetricValue": accuracy, #locals()[evaluationMetricDict["name"]], "evaluationMetricName": "accuracy", # evaluationMetricDict["name"], "slug": self._model_summary.get_slug(), "Model Id": bestModelName } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } mlpcCards = [json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_summary_cards(self._model_summary)] for card in mlpcCards: self._prediction_narrative.add_a_card(card) self._result_setter.set_model_summary( {"sparkperceptron": json.loads(CommonUtils.convert_python_object_to_json(self._model_summary))}) self._result_setter.set_spark_multilayer_perceptron_model_summary(modelSummaryJson) self._result_setter.set_spark_multilayer_perceptron_management_summary(modelManagementJson) self._result_setter.set_mlpc_cards(mlpcCards) CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "completion", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") def Predict(self): self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight() self._scriptStages = { "initialization": { "summary": "Initialized the Multilayer Perceptron Scripts", "weight": 2 }, "prediction": { "summary": "Spark ML Multilayer Perceptron Model Prediction Finished", "weight": 2 }, "frequency": { "summary": "descriptive analysis finished", "weight": 2 }, "chisquare": { "summary": "chi Square analysis finished", "weight": 4 }, "completion": { "summary": "all analysis finished", "weight": 4 }, } self._completionStatus += self._scriptWeightDict[self._analysisName]["total"] * \ self._scriptStages["initialization"]["weight"] / 10 progressMessage = CommonUtils.create_progress_message_object(self._analysisName, "initialization", "info", self._scriptStages["initialization"]["summary"], self._completionStatus, self._completionStatus) CommonUtils.save_progress_message(self._messageURL, progressMessage) self._dataframe_context.update_completion_status(self._completionStatus) SQLctx = SQLContext(sparkContext=self._spark.sparkContext, sparkSession=self._spark) dataSanity = True level_counts_train = self._dataframe_context.get_level_count_dict() categorical_columns = self._dataframe_helper.get_string_columns() numerical_columns = self._dataframe_helper.get_numeric_columns() time_dimension_columns = self._dataframe_helper.get_timestamp_columns() result_column = self._dataframe_context.get_result_column() categorical_columns = [x for x in categorical_columns if x != result_column] level_counts_score = CommonUtils.get_level_count_dict(self._data_frame, categorical_columns, self._dataframe_context.get_column_separator(), output_type="dict", dataType="spark") for key in level_counts_train: if key in level_counts_score: if level_counts_train[key] != level_counts_score[key]: dataSanity = False else: dataSanity = False test_data_path = self._dataframe_context.get_input_file() score_data_path = self._dataframe_context.get_score_path() + "/data.csv" trained_model_path = self._dataframe_context.get_model_path() trained_model_path = 
"/".join(trained_model_path.split("/")[ :-1]) + "/" + self._slug + "/" + self._dataframe_context.get_model_for_scoring() # score_summary_path = self._dataframe_context.get_score_path()+"/Summary/summary.json" pipelineModel = MLUtils.load_pipeline(trained_model_path) df = self._data_frame transformed = pipelineModel.transform(df) label_indexer_dict = MLUtils.read_string_indexer_mapping(trained_model_path, SQLctx) prediction_to_levels = udf(lambda x: label_indexer_dict[x], StringType()) transformed = transformed.withColumn(result_column, prediction_to_levels(transformed.prediction)) # transformed.show() if "probability" in transformed.columns: probability_dataframe = transformed.select([result_column, "probability"]).toPandas() probability_dataframe = probability_dataframe.rename(index=str, columns={result_column: "predicted_class"}) probability_dataframe["predicted_probability"] = probability_dataframe["probability"].apply( lambda x: max(x)) self._score_summary["prediction_split"] = MLUtils.calculate_scored_probability_stats(probability_dataframe) self._score_summary["result_column"] = result_column scored_dataframe = transformed.select( categorical_columns + time_dimension_columns + numerical_columns + [result_column, "probability"]).toPandas() # scored_dataframe = scored_dataframe.rename(index=str, columns={"predicted_probability": "probability"}) else: self._score_summary["prediction_split"] = [] self._score_summary["result_column"] = result_column scored_dataframe = transformed.select( categorical_columns + time_dimension_columns + numerical_columns + [result_column]).toPandas() labelMappingDict = self._dataframe_context.get_label_map() if score_data_path.startswith("file"): score_data_path = score_data_path[7:] scored_dataframe.to_csv(score_data_path, header=True, index=False) uidCol = self._dataframe_context.get_uid_column() if uidCol == None: uidCols = self._metaParser.get_suggested_uid_columns() if len(uidCols) > 0: uidCol = uidCols[0] uidTableData = [] predictedClasses = list(scored_dataframe[result_column].unique()) if uidCol: if uidCol in df.columns: for level in predictedClasses: levelDf = scored_dataframe[scored_dataframe[result_column] == level] levelDf = levelDf[[uidCol, "predicted_probability", result_column]] levelDf.sort_values(by="predicted_probability", ascending=False, inplace=True) levelDf["predicted_probability"] = levelDf["predicted_probability"].apply( lambda x: humanize.apnumber(x * 100) + "%" if x * 100 >= 10 else str(int(x * 100)) + "%") uidTableData.append(levelDf[:5]) uidTableData = pd.concat(uidTableData) uidTableData = [list(arr) for arr in list(uidTableData.values)] uidTableData = [[uidCol, "Probability", result_column]] + uidTableData uidTable = TableData() uidTable.set_table_width(25) uidTable.set_table_data(uidTableData) uidTable.set_table_type("normalHideColumn") self._result_setter.set_unique_identifier_table( json.loads(CommonUtils.convert_python_object_to_json(uidTable))) self._completionStatus += self._scriptWeightDict[self._analysisName]["total"] * \ self._scriptStages["prediction"]["weight"] / 10 progressMessage = CommonUtils.create_progress_message_object(self._analysisName, \ "prediction", \ "info", \ self._scriptStages["prediction"]["summary"], \ self._completionStatus, \ self._completionStatus) CommonUtils.save_progress_message(self._messageURL, progressMessage) self._dataframe_context.update_completion_status(self._completionStatus) print("STARTING DIMENSION ANALYSIS ...") columns_to_keep = [] columns_to_drop = [] columns_to_keep = 
            self._dataframe_context.get_score_consider_columns()
        if len(columns_to_keep) > 0:
            columns_to_drop = list(set(df.columns) - set(columns_to_keep))
        else:
            columns_to_drop += ["predicted_probability"]
        scored_df = transformed.select(categorical_columns + time_dimension_columns +
                                       numerical_columns + [result_column])
        columns_to_drop = [x for x in columns_to_drop if x in scored_df.columns]
        modified_df = scored_df.select([x for x in scored_df.columns if x not in columns_to_drop])
        resultColLevelCount = dict(modified_df.groupby(result_column).count().collect())
        self._metaParser.update_column_dict(result_column,
                                            {"LevelCount": resultColLevelCount,
                                             "numberOfUniqueValues": len(resultColLevelCount.keys())})
        self._dataframe_context.set_story_on_scored_data(True)
        self._dataframe_context.update_consider_columns(columns_to_keep)
        df_helper = DataFrameHelper(modified_df, self._dataframe_context, self._metaParser)
        df_helper.set_params()
        spark_scored_df = df_helper.get_data_frame()
        if len(predictedClasses) >= 2:
            try:
                fs = time.time()
                df_decision_tree_obj = DecisionTrees(spark_scored_df, df_helper, self._dataframe_context,
                                                     self._spark, self._metaParser,
                                                     scriptWeight=self._scriptWeightDict,
                                                     analysisName=self._analysisName).test_all(
                                                         dimension_columns=[result_column])
                narratives_obj = CommonUtils.as_dict(
                    DecisionTreeNarrative(result_column, df_decision_tree_obj, self._dataframe_helper,
                                          self._dataframe_context, self._metaParser, self._result_setter,
                                          story_narrative=None, analysisName=self._analysisName,
                                          scriptWeight=self._scriptWeightDict))
                print(narratives_obj)
            except Exception as e:
                print("DecisionTree Analysis Failed ", str(e))
        else:
            data_dict = {"npred": len(predictedClasses), "nactual": len(labelMappingDict.values())}
            levelCountDict = {}
            if data_dict["nactual"] > 2:
                levelCountDict[predictedClasses[0]] = resultColLevelCount[predictedClasses[0]]
                levelCountDict["Others"] = sum([v for k, v in resultColLevelCount.items()
                                                if k != predictedClasses[0]])
            else:
                levelCountDict = resultColLevelCount
                otherClass = list(set(labelMappingDict.values()) - set(predictedClasses))[0]
                levelCountDict[otherClass] = 0
            print(levelCountDict)
            total = float(sum([x for x in levelCountDict.values() if x is not None]))
            levelCountTuple = [{"name": k, "count": v,
                                "percentage": humanize.apnumber(v * 100 / total) + "%"}
                               for k, v in levelCountDict.items() if v is not None]
            levelCountTuple = sorted(levelCountTuple, key=lambda x: x["count"], reverse=True)
            data_dict["blockSplitter"] = "|~NEWBLOCK~|"
            data_dict["targetcol"] = result_column
            data_dict["nlevel"] = len(levelCountDict.keys())
            data_dict["topLevel"] = levelCountTuple[0]
            data_dict["secondLevel"] = levelCountTuple[1]
            maincardSummary = NarrativesUtils.get_template_output("/apps/", 'scorewithoutdtree.html', data_dict)
            main_card = NormalCard()
            main_card_data = []
            main_card_narrative = NarrativesUtils.block_splitter(maincardSummary, "|~NEWBLOCK~|")
            main_card_data += main_card_narrative
            chartData = NormalChartData([levelCountDict]).get_data()
            chartJson = ChartJson(data=chartData)
            chartJson.set_title(result_column)
            chartJson.set_chart_type("donut")
            mainCardChart = C3ChartData(data=chartJson)
            mainCardChart.set_width_percent(33)
            main_card_data.append(mainCardChart)
            uidTable = self._result_setter.get_unique_identifier_table()
            if uidTable is not None:
                main_card_data.append(uidTable)
            main_card.set_card_data(main_card_data)
            main_card.set_card_name("Predicting Key Drivers of {}".format(result_column))
            self._result_setter.set_score_dtree_cards([main_card])
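# ---------------------------------------------------------------------------
# The fallback card built above turns the scored-class counts into a donut
# chart plus "topLevel"/"secondLevel" call-outs. A small self-contained
# sketch of that aggregation step (a hypothetical helper shown only to
# document the intent of the levelCountTuple construction):
# ---------------------------------------------------------------------------
def _summarize_level_counts_sketch(level_count_dict):
    """Return levels sorted by count, with humanized percentage strings."""
    import humanize

    total = float(sum(v for v in level_count_dict.values() if v is not None))
    summary = [{"name": k,
                "count": v,
                "percentage": humanize.apnumber(v * 100 / total) + "%"}
               for k, v in level_count_dict.items() if v is not None]
    return sorted(summary, key=lambda x: x["count"], reverse=True)

# e.g. _summarize_level_counts_sketch({"yes": 60, "no": 40})
# -> [{"name": "yes", "count": 60, "percentage": "60%"},
#     {"name": "no", "count": 40, "percentage": "40%"}]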