# Decision Tree Regression trainer. Depending on self._mlEnv the model is
# trained either with Spark ML or scikit-learn; both paths record the same
# regression metrics (r2, rmse, mse, mae), MAPE bins and prediction-quantile
# summaries on the model summary object and register summary cards.
def Train(self):
    st_global = time.time()
    CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug,
                                                        "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
    appType = self._dataframe_context.get_app_type()
    algosToRun = self._dataframe_context.get_algorithms_to_run()
    algoSetting = [x for x in algosToRun if x.get_algorithm_slug() == self._slug][0]
    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns) - set(allDateCols))
    print(categorical_columns)
    result_column = self._dataframe_context.get_result_column()
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    numerical_columns = [x for x in numerical_columns if x != result_column]

    model_path = self._dataframe_context.get_model_path()
    if model_path.startswith("file"):
        model_path = model_path[7:]
    validationDict = self._dataframe_context.get_validation_dict()
    print("model_path", model_path)
    pipeline_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/pipeline/"
    model_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/model"
    pmml_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/modelPmml"

    df = self._data_frame
    if self._mlEnv == "spark":
        pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns, categorical_columns, result_column, algoType="regression")
        pipelineModel = pipeline.fit(df)
        indexed = pipelineModel.transform(df)
        featureMapping = sorted((attr["idx"], attr["name"]) for attr in
                                (chain(*indexed.schema["features"].metadata["ml_attr"]["attrs"].values())))
        # print(indexed.select([result_column, "features"]).show(5))
        MLUtils.save_pipeline_or_model(pipelineModel, pipeline_filepath)
        # OriginalTargetconverter = IndexToString(inputCol="label", outputCol="originalTargetColumn")
        dtreer = DecisionTreeRegressor(labelCol=result_column, featuresCol='features', predictionCol="prediction")
        if validationDict["name"] == "kFold":
            defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT["value"]
            numFold = int(validationDict["value"])
            if numFold == 0:
                numFold = 3
            trainingData, validationData = indexed.randomSplit([defaultSplit, 1 - defaultSplit], seed=12345)
            # grid over tree-specific hyperparameters of DecisionTreeRegressor
            paramGrid = ParamGridBuilder()\
                .addGrid(dtreer.maxDepth, [3, 5, 10])\
                .addGrid(dtreer.minInstancesPerNode, [1, 10])\
                .build()
            crossval = CrossValidator(estimator=dtreer,
                                      estimatorParamMaps=paramGrid,
                                      evaluator=RegressionEvaluator(predictionCol="prediction", labelCol=result_column),
                                      numFolds=numFold)
            st = time.time()
            cvModel = crossval.fit(indexed)
            trainingTime = time.time() - st
            print("cvModel training takes", trainingTime)
            bestModel = cvModel.bestModel
        elif validationDict["name"] == "trainAndtest":
            trainingData, validationData = indexed.randomSplit([float(validationDict["value"]), 1 - float(validationDict["value"])], seed=12345)
            st = time.time()
            fit = dtreer.fit(trainingData)
            trainingTime = time.time() - st
            print("time to train", trainingTime)
            bestModel = fit
        featureImportance = bestModel.featureImportances
        print(featureImportance, type(featureImportance))
        print(len(featureMapping))
        # map importances from vector positions back to column names
        featuresArray = [(name, featureImportance[idx]) for idx, name in featureMapping]
        print(featuresArray)
        MLUtils.save_pipeline_or_model(bestModel, model_filepath)
        transformed = bestModel.transform(validationData)
        transformed = transformed.withColumn(result_column, transformed[result_column].cast(DoubleType()))
        transformed = transformed.select([result_column, "prediction", transformed[result_column] - transformed["prediction"]])
        transformed = transformed.withColumnRenamed(transformed.columns[-1], "difference")
        transformed = transformed.select([result_column, "prediction", "difference",
                                          FN.abs(transformed["difference"]) * 100 / transformed[result_column]])
        transformed = transformed.withColumnRenamed(transformed.columns[-1], "mape")
        sampleData = None
        nrows = transformed.count()
        if nrows > 100:
            sampleData = transformed.sample(False, float(100) / nrows, seed=420)
        else:
            sampleData = transformed
        print(sampleData.show())
        evaluator = RegressionEvaluator(predictionCol="prediction", labelCol=result_column)
        metrics = {}
        metrics["r2"] = evaluator.evaluate(transformed, {evaluator.metricName: "r2"})
        metrics["rmse"] = evaluator.evaluate(transformed, {evaluator.metricName: "rmse"})
        metrics["mse"] = evaluator.evaluate(transformed, {evaluator.metricName: "mse"})
        metrics["mae"] = evaluator.evaluate(transformed, {evaluator.metricName: "mae"})
        runtime = round((time.time() - st_global), 2)
        mapeDf = transformed.select("mape")
        mapeStats = MLUtils.get_mape_stats(mapeDf, "mape")
        mapeStatsArr = sorted(mapeStats.items(), key=lambda x: int(x[0]))
        quantileDf = transformed.select("prediction")
        quantileSummaryDict = MLUtils.get_quantile_summary(quantileDf, "prediction")
        quantileSummaryArr = sorted(quantileSummaryDict.items(), key=lambda x: int(x[0]))
        self._model_summary.set_model_type("regression")
        self._model_summary.set_algorithm_name("dtree Regression")
        self._model_summary.set_algorithm_display_name("Decision Tree Regression")
        self._model_summary.set_slug(self._slug)
        self._model_summary.set_training_time(trainingTime)
        self._model_summary.set_target_variable(result_column)
        self._model_summary.set_validation_method(validationDict["displayName"])
        self._model_summary.set_model_evaluation_metrics(metrics)
        self._model_summary.set_model_params({param.name: value for param, value in bestModel.extractParamMap().items()})
        self._model_summary.set_quantile_summary(quantileSummaryArr)
        self._model_summary.set_mape_stats(mapeStatsArr)
        self._model_summary.set_sample_data(sampleData.toPandas().to_dict())
        self._model_summary.set_feature_importance(featuresArray)
        # print(CommonUtils.convert_python_object_to_json(self._model_summary))
    elif self._mlEnv == "sklearn":
        model_filepath = model_path + "/" + self._slug + "/model.pkl"
        x_train, x_test, y_train, y_test = self._dataframe_helper.get_train_test_data()
        x_train = MLUtils.create_dummy_columns(x_train, [x for x in categorical_columns if x != result_column])
        x_test = MLUtils.create_dummy_columns(x_test, [x for x in categorical_columns if x != result_column])
        x_test = MLUtils.fill_missing_columns(x_test, x_train.columns, result_column)
        st = time.time()
        est = DecisionTreeRegressor()
        CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug,
                                                            "training", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
        if algoSetting.is_hyperparameter_tuning_enabled():
            hyperParamInitParam = algoSetting.get_hyperparameter_params()
            evaluationMetricDict = {"name": hyperParamInitParam["evaluationMetric"]}
            evaluationMetricDict["displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[evaluationMetricDict["name"]]
            hyperParamAlgoName = algoSetting.get_hyperparameter_algo_name()
            params_grid = algoSetting.get_params_dict_hyperparameter()
            params_grid = {k: v for k, v in params_grid.items() if k in est.get_params()}
            print(params_grid)
            if hyperParamAlgoName == "gridsearchcv":
                estGrid = GridSearchCV(est, params_grid)
                gridParams = estGrid.get_params()
                hyperParamInitParam = {k: v for k, v in hyperParamInitParam.items() if k in gridParams}
                estGrid.set_params(**hyperParamInitParam)
                estGrid.fit(x_train, y_train)
                bestEstimator = estGrid.best_estimator_
                modelFilepath = "/".join(model_filepath.split("/")[:-1])
                sklearnHyperParameterResultObj = SklearnGridSearchResult(estGrid.cv_results_, est, x_train, x_test, y_train, y_test,
                                                                         appType, modelFilepath, evaluationMetricDict=evaluationMetricDict)
                resultArray = sklearnHyperParameterResultObj.train_and_save_models()
                self._result_setter.set_hyper_parameter_results(self._slug, resultArray)
                self._result_setter.set_metadata_parallel_coordinates(self._slug, {
                    "ignoreList": sklearnHyperParameterResultObj.get_ignore_list(),
                    "hideColumns": sklearnHyperParameterResultObj.get_hide_columns(),
                    "metricColName": sklearnHyperParameterResultObj.get_comparison_metric_colname(),
                    "columnOrder": sklearnHyperParameterResultObj.get_keep_columns()
                })
            elif hyperParamAlgoName == "randomsearchcv":
                estRand = RandomizedSearchCV(est, params_grid)
                randParams = estRand.get_params()
                hyperParamInitParam = {k: v for k, v in hyperParamInitParam.items() if k in randParams}
                estRand.set_params(**hyperParamInitParam)
                # fit the randomized search and keep its best estimator
                estRand.fit(x_train, y_train)
                bestEstimator = estRand.best_estimator_
        else:
            evaluationMetricDict = {"name": GLOBALSETTINGS.REGRESSION_MODEL_EVALUATION_METRIC}
            evaluationMetricDict["displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[evaluationMetricDict["name"]]
            algoParams = algoSetting.get_params_dict()
            algoParams = {k: v for k, v in algoParams.items() if k in est.get_params().keys()}
            est.set_params(**algoParams)
            self._result_setter.set_hyper_parameter_results(self._slug, None)
            if validationDict["name"] == "kFold":
                defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT["value"]
                numFold = int(validationDict["value"])
                if numFold == 0:
                    numFold = 3
                kFoldClass = SkleanrKFoldResult(numFold, est, x_train, x_test, y_train, y_test, appType,
                                                evaluationMetricDict=evaluationMetricDict)
                kFoldClass.train_and_save_result()
                kFoldOutput = kFoldClass.get_kfold_result()
                bestEstimator = kFoldClass.get_best_estimator()
            elif validationDict["name"] == "trainAndtest":
                est.fit(x_train, y_train)
                bestEstimator = est
        trainingTime = time.time() - st
        y_score = bestEstimator.predict(x_test)
        try:
            y_prob = bestEstimator.predict_proba(x_test)
        except:
            y_prob = [0] * len(y_score)
        featureImportance = bestEstimator.feature_importances_
        objs = {"trained_model": bestEstimator, "actual": y_test, "predicted": y_score, "probability": y_prob,
                "feature_importance": featureImportance, "featureList": list(x_train.columns), "labelMapping": {}}
        featuresArray = [(col_name, featureImportance[idx]) for idx, col_name in enumerate(x_train.columns)]
        if not algoSetting.is_hyperparameter_tuning_enabled():
            modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1"
            modelFilepathArr = model_filepath.split("/")[:-1]
            modelFilepathArr.append(modelName + ".pkl")
            joblib.dump(objs["trained_model"], "/".join(modelFilepathArr))
        metrics = {}
        metrics["r2"] = r2_score(y_test, y_score)
        metrics["mse"] = mean_squared_error(y_test, y_score)
        metrics["mae"] = mean_absolute_error(y_test, y_score)
        metrics["rmse"] = sqrt(metrics["mse"])
        transformed = pd.DataFrame({"prediction": y_score, result_column: y_test})
        transformed["difference"] = transformed[result_column] - transformed["prediction"]
        transformed["mape"] = np.abs(transformed["difference"]) * 100 / transformed[result_column]
        sampleData = None
        nrows = transformed.shape[0]
        if nrows > 100:
            sampleData = transformed.sample(n=100, random_state=420)
        else:
            sampleData = transformed
        print(sampleData.head())
        mapeCountArr = pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS).value_counts().to_dict().items()
        mapeStatsArr = [(str(idx), dictObj) for idx, dictObj in
                        enumerate(sorted([{"count": x[1], "splitRange": (x[0].left, x[0].right)} for x in mapeCountArr],
                                         key=lambda x: x["splitRange"][0]))]
        predictionColSummary = transformed["prediction"].describe().to_dict()
        quantileBins = [predictionColSummary["min"], predictionColSummary["25%"], predictionColSummary["50%"],
                        predictionColSummary["75%"], predictionColSummary["max"]]
        print(quantileBins)
        quantileBins = sorted(list(set(quantileBins)))
        transformed["quantileBinId"] = pd.cut(transformed["prediction"], quantileBins)
        quantileDf = transformed.groupby("quantileBinId").agg({"prediction": [np.sum, np.mean, np.size]}).reset_index()
        quantileDf.columns = ["prediction", "sum", "mean", "count"]
        print(quantileDf)
        quantileArr = quantileDf.T.to_dict().items()
        quantileSummaryArr = [(obj[0], {"splitRange": (obj[1]["prediction"].left, obj[1]["prediction"].right),
                                        "count": obj[1]["count"], "mean": obj[1]["mean"], "sum": obj[1]["sum"]})
                              for obj in quantileArr]
        print(quantileSummaryArr)
        runtime = round((time.time() - st_global), 2)
        self._model_summary.set_model_type("regression")
        self._model_summary.set_algorithm_name("DTREE Regression")
        self._model_summary.set_algorithm_display_name("Decision Tree Regression")
        self._model_summary.set_slug(self._slug)
        self._model_summary.set_training_time(trainingTime)
        self._model_summary.set_target_variable(result_column)
        self._model_summary.set_validation_method(validationDict["displayName"])
        self._model_summary.set_model_evaluation_metrics(metrics)
        self._model_summary.set_model_params(bestEstimator.get_params())
        self._model_summary.set_quantile_summary(quantileSummaryArr)
        self._model_summary.set_mape_stats(mapeStatsArr)
        self._model_summary.set_sample_data(sampleData.to_dict())
        self._model_summary.set_feature_importance(featuresArray)
        self._model_summary.set_feature_list(list(x_train.columns))
        try:
            pmml_filepath = str(model_path) + "/" + str(self._slug) + "/trainedModel.pmml"
            modelPmmlPipeline = PMMLPipeline([
                ("pretrained-estimator", objs["trained_model"])
            ])
            modelPmmlPipeline.target_field = result_column
            modelPmmlPipeline.active_fields = np.array([col for col in x_train.columns if col != result_column])
            sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True)
            pmmlfile = open(pmml_filepath, "r")
            pmmlText = pmmlfile.read()
            pmmlfile.close()
            self._result_setter.update_pmml_object({self._slug: pmmlText})
        except:
            pass

    if not algoSetting.is_hyperparameter_tuning_enabled():
        modelDropDownObj = {
            "name": self._model_summary.get_algorithm_name(),
            "evaluationMetricValue": self._model_summary.get_model_accuracy(),
            "evaluationMetricName": "r2",
            "slug": self._model_summary.get_slug(),
            "Model Id": modelName
        }
        modelSummaryJson = {
            "dropdown": modelDropDownObj,
            "levelcount": self._model_summary.get_level_counts(),
            "modelFeatureList": self._model_summary.get_feature_list(),
            "levelMapping": self._model_summary.get_level_map_dict(),
            "slug": self._model_summary.get_slug(),
            "name": self._model_summary.get_algorithm_name()
        }
    else:
        modelDropDownObj = {
            "name": self._model_summary.get_algorithm_name(),
            "evaluationMetricValue": resultArray[0]["R-Squared"],
            "evaluationMetricName": "r2",
            "slug": self._model_summary.get_slug(),
            "Model Id": resultArray[0]["Model Id"]
        }
        modelSummaryJson = {
            "dropdown": modelDropDownObj,
            "levelcount": self._model_summary.get_level_counts(),
            "modelFeatureList": self._model_summary.get_feature_list(),
            "levelMapping": self._model_summary.get_level_map_dict(),
            "slug": self._model_summary.get_slug(),
            "name": self._model_summary.get_algorithm_name()
        }

    dtreerCards = [json.loads(CommonUtils.convert_python_object_to_json(cardObj))
                   for cardObj in MLUtils.create_model_summary_cards(self._model_summary)]
    for card in dtreerCards:
        self._prediction_narrative.add_a_card(card)
    self._result_setter.set_model_summary({"dtreeregression": json.loads(CommonUtils.convert_python_object_to_json(self._model_summary))})
    self._result_setter.set_dtree_regression_model_summart(modelSummaryJson)
    self._result_setter.set_dtreer_cards(dtreerCards)
    CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug,
                                                        "completion", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
def Train(self):
    st_global = time.time()
    algosToRun = self._dataframe_context.get_algorithms_to_run()
    algoSetting = [x for x in algosToRun
                   if x["algorithmSlug"] == GLOBALSETTINGS.MODEL_SLUG_MAPPING["generalizedlinearregression"]][0]
    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns) - set(allDateCols))
    print(categorical_columns)
    result_column = self._dataframe_context.get_result_column()
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    numerical_columns = [x for x in numerical_columns if x != result_column]

    model_path = self._dataframe_context.get_model_path()
    if model_path.startswith("file"):
        model_path = model_path[7:]
    validationDict = self._dataframe_context.get_validation_dict()
    print("model_path", model_path)
    pipeline_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/pipeline/"
    model_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/model"
    pmml_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/modelPmml"

    df = self._data_frame
    pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns, categorical_columns, result_column, algoType="regression")
    pipelineModel = pipeline.fit(df)
    indexed = pipelineModel.transform(df)
    featureMapping = sorted((attr["idx"], attr["name"]) for attr in
                            (chain(*indexed.schema["features"].metadata["ml_attr"]["attrs"].values())))
    # print(indexed.select([result_column, "features"]).show(5))
    MLUtils.save_pipeline_or_model(pipelineModel, pipeline_filepath)
    glinr = GeneralizedLinearRegression(labelCol=result_column, featuresCol='features', predictionCol="prediction")
    if validationDict["name"] == "kFold":
        defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT["value"]
        numFold = int(validationDict["value"])
        if numFold == 0:
            numFold = 3
        trainingData, validationData = indexed.randomSplit([defaultSplit, 1 - defaultSplit], seed=12345)
        paramGrid = ParamGridBuilder()\
            .addGrid(glinr.regParam, [0.1, 0.01])\
            .addGrid(glinr.fitIntercept, [False, True])\
            .build()
        crossval = CrossValidator(estimator=glinr,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=RegressionEvaluator(predictionCol="prediction", labelCol=result_column),
                                  numFolds=numFold)
        st = time.time()
        cvModel = crossval.fit(indexed)
        trainingTime = time.time() - st
        print("cvModel training takes", trainingTime)
        bestModel = cvModel.bestModel
    elif validationDict["name"] == "trainAndtest":
        trainingData, validationData = indexed.randomSplit([float(validationDict["value"]), 1 - float(validationDict["value"])], seed=12345)
        st = time.time()
        fit = glinr.fit(trainingData)
        trainingTime = time.time() - st
        print("time to train", trainingTime)
        bestModel = fit
    print(bestModel.explainParams())
    print(bestModel.extractParamMap())
    print(bestModel.params)
    print('Best Param (regParam): ', bestModel._java_obj.getRegParam())
    print('Best Param (MaxIter): ', bestModel._java_obj.getMaxIter())
    # modelPmmlPipeline = PMMLPipeline([
    #     ("pretrained-estimator", objs["trained_model"])
    # ])
    # try:
    #     modelPmmlPipeline.target_field = result_column
    #     modelPmmlPipeline.active_fields = np.array([col for col in x_train.columns if col != result_column])
    #     sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True)
    #     pmmlfile = open(pmml_filepath, "r")
    #     pmmlText = pmmlfile.read()
    #     pmmlfile.close()
    #     self._result_setter.update_pmml_object({self._slug: pmmlText})
    # except:
    #     pass
    # map coefficients from vector positions back to column names
    coefficientsArray = [(name, bestModel.coefficients[idx]) for idx, name in featureMapping]
    MLUtils.save_pipeline_or_model(bestModel, model_filepath)
    transformed = bestModel.transform(validationData)
    transformed = transformed.withColumn(result_column, transformed[result_column].cast(DoubleType()))
    transformed = transformed.select([result_column, "prediction", transformed[result_column] - transformed["prediction"]])
    transformed = transformed.withColumnRenamed(transformed.columns[-1], "difference")
    transformed = transformed.select([result_column, "prediction", "difference",
                                      FN.abs(transformed["difference"]) * 100 / transformed[result_column]])
    transformed = transformed.withColumnRenamed(transformed.columns[-1], "mape")
    sampleData = None
    nrows = transformed.count()
    if nrows > 100:
        sampleData = transformed.sample(False, float(100) / nrows, seed=420)
    else:
        sampleData = transformed
    print(sampleData.show())
    evaluator = RegressionEvaluator(predictionCol="prediction", labelCol=result_column)
    metrics = {}
    metrics["r2"] = evaluator.evaluate(transformed, {evaluator.metricName: "r2"})
    metrics["rmse"] = evaluator.evaluate(transformed, {evaluator.metricName: "rmse"})
    metrics["mse"] = evaluator.evaluate(transformed, {evaluator.metricName: "mse"})
    metrics["mae"] = evaluator.evaluate(transformed, {evaluator.metricName: "mae"})
    runtime = round((time.time() - st_global), 2)
    mapeDf = transformed.select("mape")
    mapeStats = MLUtils.get_mape_stats(mapeDf, "mape")
    mapeStatsArr = sorted(mapeStats.items(), key=lambda x: int(x[0]))
    quantileDf = transformed.select("prediction")
    quantileSummaryDict = MLUtils.get_quantile_summary(quantileDf, "prediction")
    quantileSummaryArr = sorted(quantileSummaryDict.items(), key=lambda x: int(x[0]))
    self._model_summary.set_model_type("regression")
    self._model_summary.set_algorithm_name("Generalized Linear Regression")
    self._model_summary.set_algorithm_display_name("Generalized Linear Regression")
    self._model_summary.set_slug(self._slug)
    self._model_summary.set_training_time(trainingTime)
    self._model_summary.set_target_variable(result_column)
    self._model_summary.set_validation_method(validationDict["displayName"])
    self._model_summary.set_model_evaluation_metrics(metrics)
    self._model_summary.set_model_params({param.name: value for param, value in bestModel.extractParamMap().items()})
    self._model_summary.set_quantile_summary(quantileSummaryArr)
    self._model_summary.set_mape_stats(mapeStatsArr)
    self._model_summary.set_sample_data(sampleData.toPandas().to_dict())
    self._model_summary.set_coefficinets_array(coefficientsArray)
    self._model_summary.set_feature_list([name for idx, name in featureMapping])
    # print(CommonUtils.convert_python_object_to_json(self._model_summary))
    modelSummaryJson = {
        "dropdown": {
            "name": self._model_summary.get_algorithm_name(),
            "accuracy": CommonUtils.round_sig(self._model_summary.get_model_evaluation_metrics()["r2"]),
            "slug": self._model_summary.get_slug()
        },
        "levelcount": self._model_summary.get_level_counts(),
        "modelFeatureList": self._model_summary.get_feature_list(),
        "levelMapping": self._model_summary.get_level_map_dict()
    }
    glinrCards = [json.loads(CommonUtils.convert_python_object_to_json(cardObj))
                  for cardObj in MLUtils.create_model_summary_cards(self._model_summary)]
    for card in glinrCards:
        self._prediction_narrative.add_a_card(card)
    self._result_setter.set_model_summary({"generalizedlinearregression": json.loads(CommonUtils.convert_python_object_to_json(self._model_summary))})
    self._result_setter.set_generalized_linear_regression_model_summary(modelSummaryJson)
    self._result_setter.set_glinr_cards(glinrCards)
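
# Logistic Regression trainer (Spark ML). Fits either a plain (binomial or
# multinomial) LogisticRegression or a OneVsRest wrapper depending on
# self._classifier, evaluates on a held-out split, and writes the model
# summary (accuracy / areaUnderPR, confusion matrix, precision and recall)
# to summary.json.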
def Train(self):
    st = time.time()
    categorical_columns = self._dataframe_helper.get_string_columns()
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    result_column = self._dataframe_context.get_result_column()
    categorical_columns = [x for x in categorical_columns if x != result_column]
    model_path = self._dataframe_context.get_model_path()
    pipeline_filepath = model_path + "/LogisticRegression/TrainedModels/pipeline"
    model_filepath = model_path + "/LogisticRegression/TrainedModels/model"
    summary_filepath = model_path + "/LogisticRegression/ModelSummary/summary.json"

    df = self._data_frame
    pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns, categorical_columns, result_column)
    pipelineModel = pipeline.fit(df)
    indexed = pipelineModel.transform(df)
    MLUtils.save_pipeline_or_model(pipelineModel, pipeline_filepath)
    trainingData, validationData = MLUtils.get_training_and_validation_data(indexed, result_column, 0.8)
    OriginalTargetconverter = IndexToString(inputCol="label", outputCol="originalTargetColumn")
    levels = trainingData.select("label").distinct().collect()

    if self._classifier == "lr":
        if len(levels) == 2:
            lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
        elif len(levels) > 2:
            lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, family="multinomial")
        fit = lr.fit(trainingData)
    elif self._classifier == "OneVsRest":
        lr = LogisticRegression()
        ovr = OneVsRest(classifier=lr)
        fit = ovr.fit(trainingData)
    transformed = fit.transform(validationData)
    MLUtils.save_pipeline_or_model(fit, model_filepath)
    print(fit.coefficientMatrix)
    print(fit.interceptVector)
    # feature_importance = MLUtils.calculate_sparkml_feature_importance(indexed, fit, categorical_columns, numerical_columns)
    label_classes = transformed.select("label").distinct().collect()
    results = transformed.select(["prediction", "label"])
    if len(label_classes) > 2:
        evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
        evaluator.evaluate(results)
        # accuracy of the model
        self._model_summary["model_accuracy"] = evaluator.evaluate(results, {evaluator.metricName: "accuracy"})
    else:
        evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
        evaluator.evaluate(results)
        # print(evaluator.evaluate(results, {evaluator.metricName: "areaUnderROC"}))
        # print(evaluator.evaluate(results, {evaluator.metricName: "areaUnderPR"}))
        # areaUnderPR stands in for accuracy in the binary case
        self._model_summary["model_accuracy"] = evaluator.evaluate(results, {evaluator.metricName: "areaUnderPR"})

    # self._model_summary["feature_importance"] = MLUtils.transform_feature_importance(feature_importance)
    self._model_summary["runtime_in_seconds"] = round((time.time() - st), 2)

    transformed = OriginalTargetconverter.transform(transformed)
    label_indexer_dict = [dict(enumerate(field.metadata["ml_attr"]["vals"]))
                          for field in transformed.schema.fields if field.name == "label"][0]
    prediction_to_levels = udf(lambda x: label_indexer_dict[x], StringType())
    transformed = transformed.withColumn("predictedClass", prediction_to_levels(transformed.prediction))

    prediction_df = transformed.select(["originalTargetColumn", "predictedClass"]).toPandas()
    objs = {
        "actual": prediction_df["originalTargetColumn"],
        "predicted": prediction_df["predictedClass"]
    }
    self._model_summary["confusion_matrix"] = MLUtils.calculate_confusion_matrix(objs["actual"], objs["predicted"])
    overall_precision_recall = MLUtils.calculate_overall_precision_recall(objs["actual"], objs["predicted"])
    self._model_summary["precision_recall_stats"] = overall_precision_recall["classwise_stats"]
    self._model_summary["model_precision"] = overall_precision_recall["precision"]
    self._model_summary["model_recall"] = overall_precision_recall["recall"]
    self._model_summary["target_variable"] = result_column
    self._model_summary["test_sample_prediction"] = overall_precision_recall["prediction_split"]
    self._model_summary["algorithm_name"] = "Logistic Regression"
    self._model_summary["validation_method"] = "Train and Test"
    self._model_summary["independent_variables"] = len(categorical_columns) + len(numerical_columns)
    self._model_summary["level_counts"] = CommonUtils.get_level_count_dict(trainingData, categorical_columns,
                                                                           self._dataframe_context.get_column_separator(),
                                                                           dataType="spark")
    # print(json.dumps(self._model_summary, indent=2))
    self._model_summary["total_trees"] = 100
    self._model_summary["total_rules"] = 300
    CommonUtils.write_to_file(summary_filepath, json.dumps({"modelSummary": self._model_summary}))
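
# Naive Bayes trainer (PySpark). Supports k-fold and train/test validation,
# optional hyperparameter search via PySparkGridSearchResult /
# PySparkTrainTestResult, and emits gain/lift/KS data, ROC data, a PMML
# export and model-management cards alongside the model summary.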
def Train(self):
    st_global = time.time()
    CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug,
                                                        "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
    algosToRun = self._dataframe_context.get_algorithms_to_run()
    algoSetting = [x for x in algosToRun if x.get_algorithm_slug() == self._slug][0]
    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns) - set(allDateCols))
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    result_column = self._dataframe_context.get_result_column()
    categorical_columns = [x for x in categorical_columns if x != result_column]
    appType = self._dataframe_context.get_app_type()

    model_path = self._dataframe_context.get_model_path()
    if model_path.startswith("file"):
        model_path = model_path[7:]
    validationDict = self._dataframe_context.get_validation_dict()
    print("model_path", model_path)
    # pipeline_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/pipeline/"
    # model_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/model"
    # pmml_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/modelPmml"
    df = self._data_frame
    levels = df.select(result_column).distinct().count()
    model_filepath = model_path + "/" + self._slug + "/model"
    pmml_filepath = str(model_path) + "/" + str(self._slug) + "/trainedModel.pmml"
    CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug,
                                                        "training", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
    st = time.time()
    pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns, categorical_columns, result_column)
    trainingData, validationData = MLUtils.get_training_and_validation_data(df, result_column, 0.8)  # indexed
    labelIndexer = StringIndexer(inputCol=result_column, outputCol="label")
    # OriginalTargetconverter = IndexToString(inputCol="label", outputCol="originalTargetColumn")
    # label mapping and its inverse
    labelIdx = labelIndexer.fit(trainingData)
    labelMapping = {k: v for k, v in enumerate(labelIdx.labels)}
    inverseLabelMapping = {v: float(k) for k, v in enumerate(labelIdx.labels)}
    if self._dataframe_context.get_trainerMode() == "autoML":
        automl_enable = True
    else:
        automl_enable = False
    clf = NaiveBayes()
    if not algoSetting.is_hyperparameter_tuning_enabled():
        algoParams = algoSetting.get_params_dict()
    else:
        algoParams = algoSetting.get_params_dict_hyperparameter()
    print("=" * 100)
    print(algoParams)
    print("=" * 100)
    clfParams = [prm.name for prm in clf.params]
    algoParams = {getattr(clf, k): v if isinstance(v, list) else [v]
                  for k, v in algoParams.items() if k in clfParams}
    paramGrid = ParamGridBuilder()
    for k, v in algoParams.items():
        print(k, v)
        if v == [None] * len(v):
            continue
        paramGrid = paramGrid.addGrid(k, v)
    paramGrid = paramGrid.build()

    if len(paramGrid) > 1:
        hyperParamInitParam = algoSetting.get_hyperparameter_params()
        evaluationMetricDict = {"name": hyperParamInitParam["evaluationMetric"]}
        evaluationMetricDict["displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[evaluationMetricDict["name"]]
    else:
        evaluationMetricDict = {"name": GLOBALSETTINGS.CLASSIFICATION_MODEL_EVALUATION_METRIC}
        evaluationMetricDict["displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[evaluationMetricDict["name"]]

    self._result_setter.set_hyper_parameter_results(self._slug, None)
    if validationDict["name"] == "kFold":
        numFold = int(validationDict["value"])
        estimator = Pipeline(stages=[pipeline, labelIndexer, clf])
        if algoSetting.is_hyperparameter_tuning_enabled():
            modelFilepath = "/".join(model_filepath.split("/")[:-1])
            pySparkHyperParameterResultObj = PySparkGridSearchResult(estimator, paramGrid, appType, modelFilepath, levels,
                                                                     evaluationMetricDict, trainingData, validationData, numFold,
                                                                     self._targetLevel, labelMapping, inverseLabelMapping, df)
            resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models()
            self._result_setter.set_hyper_parameter_results(self._slug, resultArray)
            self._result_setter.set_metadata_parallel_coordinates(self._slug, {
                "ignoreList": pySparkHyperParameterResultObj.get_ignore_list(),
                "hideColumns": pySparkHyperParameterResultObj.get_hide_columns(),
                "metricColName": pySparkHyperParameterResultObj.get_comparison_metric_colname(),
                "columnOrder": pySparkHyperParameterResultObj.get_keep_columns()
            })
            bestModel = pySparkHyperParameterResultObj.getBestModel()
            prediction = pySparkHyperParameterResultObj.getBestPrediction()
        else:
            if automl_enable:
                paramGrid = (ParamGridBuilder()
                             .addGrid(clf.smoothing, [1.0, 0.2])
                             .build())
            crossval = CrossValidator(estimator=estimator,
                                      estimatorParamMaps=paramGrid,
                                      evaluator=BinaryClassificationEvaluator() if levels == 2 else MulticlassClassificationEvaluator(),
                                      numFolds=3 if numFold is None else numFold)  # use 3+ folds in practice
            cvnb = crossval.fit(trainingData)
            prediction = cvnb.transform(validationData)
            bestModel = cvnb.bestModel
    else:
        train_test_ratio = float(self._dataframe_context.get_train_test_split())
        estimator = Pipeline(stages=[pipeline, labelIndexer, clf])
        if algoSetting.is_hyperparameter_tuning_enabled():
            modelFilepath = "/".join(model_filepath.split("/")[:-1])
            pySparkHyperParameterResultObj = PySparkTrainTestResult(estimator, paramGrid, appType, modelFilepath, levels,
                                                                    evaluationMetricDict, trainingData, validationData,
                                                                    train_test_ratio, self._targetLevel, labelMapping,
                                                                    inverseLabelMapping, df)
            resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models()
            self._result_setter.set_hyper_parameter_results(self._slug, resultArray)
            self._result_setter.set_metadata_parallel_coordinates(self._slug, {
                "ignoreList": pySparkHyperParameterResultObj.get_ignore_list(),
                "hideColumns": pySparkHyperParameterResultObj.get_hide_columns(),
                "metricColName": pySparkHyperParameterResultObj.get_comparison_metric_colname(),
                "columnOrder": pySparkHyperParameterResultObj.get_keep_columns()
            })
            bestModel = pySparkHyperParameterResultObj.getBestModel()
            prediction = pySparkHyperParameterResultObj.getBestPrediction()
        else:
            tvs = TrainValidationSplit(estimator=estimator,
                                       estimatorParamMaps=paramGrid,
                                       evaluator=BinaryClassificationEvaluator() if levels == 2 else MulticlassClassificationEvaluator(),
                                       trainRatio=train_test_ratio)
            tvspnb = tvs.fit(trainingData)
            prediction = tvspnb.transform(validationData)
            bestModel = tvspnb.bestModel

    modelmanagement_ = {param.name: value for param, value in bestModel.stages[2].extractParamMap().items()}
    MLUtils.save_pipeline_or_model(bestModel, model_filepath)
    predsAndLabels = prediction.select(['prediction', 'label']).rdd.map(tuple)
    posLabel = inverseLabelMapping[self._targetLevel]
    metrics = MulticlassMetrics(predsAndLabels)
    trainingTime = time.time() - st

    f1_score = metrics.fMeasure(posLabel, 1.0)
    precision = metrics.precision(posLabel)
    recall = metrics.recall(posLabel)
    accuracy = metrics.accuracy
    print(f1_score, precision, recall, accuracy)

    # gain chart implementation
    def cal_prob_eval(x):
        if len(x) == 1:
            if x == posLabel:
                return float(x[1])
            else:
                return float(1 - x[1])
        else:
            return float(x[int(posLabel)])

    column_name = 'probability'

    def y_prob_for_eval_udf():
        return udf(lambda x: cal_prob_eval(x))

    prediction = prediction.withColumn("y_prob_for_eval", y_prob_for_eval_udf()(col(column_name)))
    try:
        pys_df = prediction.select(['y_prob_for_eval', 'prediction', 'label'])
        gain_lift_ks_obj = GainLiftKS(pys_df, 'y_prob_for_eval', 'prediction', 'label', posLabel, self._spark)
        gain_lift_KS_dataframe = gain_lift_ks_obj.Run().toPandas()
    except:
        try:
            temp_df = pys_df.toPandas()
            gain_lift_ks_obj = GainLiftKS(temp_df, 'y_prob_for_eval', 'prediction', 'label', posLabel, self._spark)
            gain_lift_KS_dataframe = gain_lift_ks_obj.Rank_Ordering()
        except:
            print("gain chart failed")
            gain_lift_KS_dataframe = None

    # feature_importance = MLUtils.calculate_sparkml_feature_importance(df, bestModel.stages[-1], categorical_columns, numerical_columns)
    act_list = prediction.select('label').collect()
    actual = [int(row.label) for row in act_list]
    pred_list = prediction.select('prediction').collect()
    predicted = [int(row.prediction) for row in pred_list]
    prob_list = prediction.select('probability').collect()
    probability = [list(row.probability) for row in prob_list]
    objs = {
        "trained_model": bestModel,
        "actual": actual,
        "predicted": predicted,
        "probability": probability,
        "feature_importance": None,
        "featureList": list(categorical_columns) + list(numerical_columns),
        "labelMapping": labelMapping
    }

    conf_mat_ar = metrics.confusionMatrix().toArray()
    print(conf_mat_ar)
    confusion_matrix = {}
    for i in range(len(conf_mat_ar)):
        confusion_matrix[labelMapping[i]] = {}
        for j, val in enumerate(conf_mat_ar[i]):
            confusion_matrix[labelMapping[i]][labelMapping[j]] = val
    print(confusion_matrix)

    # ROC curve implementation
    y_prob = probability
    y_score = predicted
    y_test = actual
    logLoss = log_loss(y_test, y_prob)
    if levels <= 2:
        positive_label_probs = [val[int(posLabel)] for val in y_prob]
        roc_auc = roc_auc_score(y_test, y_score)
        roc_data_dict = {
            "y_score": y_score,
            "y_test": y_test,
            "positive_label_probs": positive_label_probs,
            "y_prob": y_prob,
            "positive_label": posLabel
        }
        roc_dataframe = pd.DataFrame({
            "y_score": y_score,
            "y_test": y_test,
            "positive_label_probs": positive_label_probs
        })
        # roc_dataframe.to_csv("binary_roc_data.csv")
        fpr, tpr, thresholds = roc_curve(y_test, positive_label_probs, pos_label=posLabel)
        roc_df = pd.DataFrame({"FPR": fpr, "TPR": tpr, "thresholds": thresholds})
        roc_df["tpr-fpr"] = roc_df["TPR"] - roc_df["FPR"]
        optimal_index = np.argmax(np.array(roc_df["tpr-fpr"]))
        fpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "FPR"]
        tpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "TPR"]
        rounded_roc_df = roc_df.round({'FPR': 2, 'TPR': 4})
        unique_fpr = rounded_roc_df["FPR"].unique()
        final_roc_df = rounded_roc_df.groupby("FPR", as_index=False)[["TPR"]].mean()
        endgame_roc_df = final_roc_df.round({'FPR': 2, 'TPR': 3})
    elif levels > 2:
        positive_label_probs = [val[int(posLabel)] for val in y_prob]
        # collapse everything that is not the target level into a single rest class
        y_test_roc_multi = [val if val == posLabel else posLabel + 1 for val in y_test]
        y_score_roc_multi = [val if val == posLabel else posLabel + 1 for val in y_score]
        roc_auc = roc_auc_score(y_test_roc_multi, y_score_roc_multi)
        fpr, tpr, thresholds = roc_curve(y_test_roc_multi, positive_label_probs, pos_label=posLabel)
        roc_df = pd.DataFrame({"FPR": fpr, "TPR": tpr, "thresholds": thresholds})
        roc_df["tpr-fpr"] = roc_df["TPR"] - roc_df["FPR"]
        optimal_index = np.argmax(np.array(roc_df["tpr-fpr"]))
        fpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "FPR"]
        tpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "TPR"]
        rounded_roc_df = roc_df.round({'FPR': 2, 'TPR': 4})
        unique_fpr = rounded_roc_df["FPR"].unique()
        final_roc_df = rounded_roc_df.groupby("FPR", as_index=False)[["TPR"]].mean()
        endgame_roc_df = final_roc_df.round({'FPR': 2, 'TPR': 3})

    # calculating prediction_split
    val_cnts = prediction.groupBy('label').count()
    val_cnts = map(lambda row: row.asDict(), val_cnts.collect())
    prediction_split = {}
    total_nos = prediction.select('label').count()
    for item in val_cnts:
        print(labelMapping)
        classname = labelMapping[item['label']]
        prediction_split[classname] = round(item['count'] * 100 / float(total_nos), 2)

    if not algoSetting.is_hyperparameter_tuning_enabled():
        modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1"
        modelFilepathArr = model_filepath.split("/")[:-1]
        modelFilepathArr.append(modelName)
        bestModel.save("/".join(modelFilepathArr))
    runtime = round((time.time() - st_global), 2)

    try:
        print(pmml_filepath)
        pmmlBuilder = PMMLBuilder(self._spark, trainingData, bestModel).putOption(clf, 'compact', True)
        pmmlBuilder.buildFile(pmml_filepath)
        pmmlfile = open(pmml_filepath, "r")
        pmmlText = pmmlfile.read()
        pmmlfile.close()
        self._result_setter.update_pmml_object({self._slug: pmmlText})
    except Exception as e:
        print("PMML failed...", str(e))

    cat_cols = list(set(categorical_columns) - {result_column})
    self._model_summary = MLModelSummary()
    self._model_summary.set_algorithm_name("Naive Bayes")
    self._model_summary.set_algorithm_display_name("Naive Bayes")
    self._model_summary.set_slug(self._slug)
    self._model_summary.set_confusion_matrix(confusion_matrix)
    # self._model_summary.set_feature_importance(objs["feature_importance"])
    self._model_summary.set_feature_list(objs["featureList"])
    self._model_summary.set_model_accuracy(accuracy)
    self._model_summary.set_training_time(round((time.time() - st), 2))
    self._model_summary.set_precision_recall_stats([precision, recall])
    self._model_summary.set_model_precision(precision)
    self._model_summary.set_model_recall(recall)
    self._model_summary.set_model_F1_score(f1_score)
    self._model_summary.set_model_log_loss(logLoss)
    self._model_summary.set_gain_lift_KS_data(gain_lift_KS_dataframe)
    self._model_summary.set_AUC_score(roc_auc)
    self._model_summary.set_target_variable(result_column)
    self._model_summary.set_prediction_split(prediction_split)
    self._model_summary.set_validation_method("KFold")
    self._model_summary.set_level_map_dict(objs["labelMapping"])
    self._model_summary.set_model_features(objs["featureList"])
    self._model_summary.set_level_counts(self._metaParser.get_unique_level_dict(list(set(categorical_columns)) + [result_column]))
    # self._model_summary.set_num_trees(objs['trained_model'].getNumTrees)
    self._model_summary.set_num_rules(300)
    self._model_summary.set_target_level(self._targetLevel)

    if not algoSetting.is_hyperparameter_tuning_enabled():
        modelDropDownObj = {
            "name": self._model_summary.get_algorithm_name(),
            "evaluationMetricValue": accuracy,
            "evaluationMetricName": "accuracy",
            "slug": self._model_summary.get_slug(),
            "Model Id": modelName
        }
        modelSummaryJson = {
            "dropdown": modelDropDownObj,
            "levelcount": self._model_summary.get_level_counts(),
            "modelFeatureList": self._model_summary.get_feature_list(),
            "levelMapping": self._model_summary.get_level_map_dict(),
            "slug": self._model_summary.get_slug(),
            "name": self._model_summary.get_algorithm_name()
        }
    else:
        modelDropDownObj = {
            "name": self._model_summary.get_algorithm_name(),
            "evaluationMetricValue": accuracy,
            "evaluationMetricName": "accuracy",
            "slug": self._model_summary.get_slug(),
            "Model Id": resultArray[0]["Model Id"]
        }
        modelSummaryJson = {
            "dropdown": modelDropDownObj,
            "levelcount": self._model_summary.get_level_counts(),
            "modelFeatureList": self._model_summary.get_feature_list(),
            "levelMapping": self._model_summary.get_level_map_dict(),
            "slug": self._model_summary.get_slug(),
            "name": self._model_summary.get_algorithm_name()
        }

    self._model_management = MLModelSummary()
    print(modelmanagement_)
    self._model_management.set_job_type(self._dataframe_context.get_job_name())  # project name
    self._model_management.set_training_status(data="completed")  # training status
    self._model_management.set_target_level(self._targetLevel)  # target column value
    self._model_management.set_training_time(runtime)  # run time
    self._model_management.set_model_accuracy(round(metrics.accuracy, 2))
    # self._model_management.set_model_accuracy(round(metrics.accuracy_score(objs["actual"], objs["predicted"]), 2))  # accuracy
    self._model_management.set_algorithm_name("NaiveBayes")  # algorithm name
    self._model_management.set_validation_method(str(validationDict["displayName"]) + "(" + str(validationDict["value"]) + ")")  # validation method
    self._model_management.set_target_variable(result_column)  # target column name
    self._model_management.set_creation_date(data=str(datetime.now().strftime('%b %d ,%Y %H:%M ')))  # creation date
    self._model_management.set_datasetName(self._datasetName)
    self._model_management.set_model_type(data='classification')
    self._model_management.set_var_smoothing(data=int(modelmanagement_['smoothing']))
    # self._model_management.set_no_of_independent_variables(df)  # no of independent variables

    modelManagementSummaryJson = [
        ["Project Name", self._model_management.get_job_type()],
        ["Algorithm", self._model_management.get_algorithm_name()],
        ["Training Status", self._model_management.get_training_status()],
        ["Accuracy", self._model_management.get_model_accuracy()],
        ["RunTime", self._model_management.get_training_time()],
        # ["Owner", None],
        ["Created On", self._model_management.get_creation_date()]
    ]
    modelManagementModelSettingsJson = [
        ["Training Dataset", self._model_management.get_datasetName()],
        ["Target Column", self._model_management.get_target_variable()],
        ["Target Column Value", self._model_management.get_target_level()],
        ["Algorithm", self._model_management.get_algorithm_name()],
        ["Model Validation", self._model_management.get_validation_method()],
        ["Model Type", self._model_management.get_model_type()],
        ["Smoothing", self._model_management.get_var_smoothing()],
        # ["priors", self._model_management.get_priors()]
        # ["var_smoothing", self._model_management.get_var_smoothing()]
    ]

    nbOverviewCards = [json.loads(CommonUtils.convert_python_object_to_json(cardObj))
                       for cardObj in MLUtils.create_model_management_card_overview(self._model_management,
                                                                                    modelManagementSummaryJson,
                                                                                    modelManagementModelSettingsJson)]
    nbPerformanceCards = [json.loads(CommonUtils.convert_python_object_to_json(cardObj))
                          for cardObj in MLUtils.create_model_management_cards(self._model_summary, endgame_roc_df)]
    nbDeploymentCards = [json.loads(CommonUtils.convert_python_object_to_json(cardObj))
                         for cardObj in MLUtils.create_model_management_deploy_empty_card()]
    nbCards = [json.loads(CommonUtils.convert_python_object_to_json(cardObj))
               for cardObj in MLUtils.create_model_summary_cards(self._model_summary)]

    NB_Overview_Node = NarrativesTree()
    NB_Overview_Node.set_name("Overview")
    NB_Performance_Node = NarrativesTree()
    NB_Performance_Node.set_name("Performance")
    NB_Deployment_Node = NarrativesTree()
    NB_Deployment_Node.set_name("Deployment")
    for card in nbOverviewCards:
        NB_Overview_Node.add_a_card(card)
    for card in nbPerformanceCards:
        NB_Performance_Node.add_a_card(card)
    for card in nbDeploymentCards:
        NB_Deployment_Node.add_a_card(card)
    for card in nbCards:
        self._prediction_narrative.add_a_card(card)

    self._result_setter.set_model_summary({
        "naivebayes": json.loads(CommonUtils.convert_python_object_to_json(self._model_summary))
    })
    self._result_setter.set_naive_bayes_model_summary(modelSummaryJson)
    self._result_setter.set_nb_cards(nbCards)
    self._result_setter.set_nb_nodes([NB_Overview_Node, NB_Performance_Node, NB_Deployment_Node])
    self._result_setter.set_nb_fail_card({"Algorithm_Name": "Naive Bayes", "success": "True"})
    CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug,
                                                        "completion", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
    print("\n\n")
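
# Multilayer Perceptron trainer (PySpark). Mirrors the Naive Bayes flow above;
# the 'layers' hyperparameter is completed at runtime by prepending the input
# feature count and appending the number of target levels to each layer spec.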
def Train(self):
    st_global = time.time()
    CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug,
                                                        "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
    algosToRun = self._dataframe_context.get_algorithms_to_run()
    algoSetting = [x for x in algosToRun if x.get_algorithm_slug() == self._slug][0]
    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns) - set(allDateCols))
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    result_column = self._dataframe_context.get_result_column()
    categorical_columns = [x for x in categorical_columns if x != result_column]
    appType = self._dataframe_context.get_app_type()

    model_path = self._dataframe_context.get_model_path()
    if model_path.startswith("file"):
        model_path = model_path[7:]
    validationDict = self._dataframe_context.get_validation_dict()
    # pipeline_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/pipeline/"
    # model_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/model"
    # pmml_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/modelPmml"
    df = self._data_frame
    levels = df.select(result_column).distinct().count()
    model_filepath = model_path + "/" + self._slug + "/model"
    pmml_filepath = str(model_path) + "/" + str(self._slug) + "/trainedModel.pmml"
    CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug,
                                                        "training", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
    st = time.time()
    pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns, categorical_columns, result_column)
    vectorFeats = pipeline.getStages()[-1].transform(df)
    input_feats = len(vectorFeats.select('features').take(1)[0][0])
    trainingData, validationData = MLUtils.get_training_and_validation_data(df, result_column, 0.8)  # indexed
    labelIndexer = StringIndexer(inputCol=result_column, outputCol="label")
    # OriginalTargetconverter = IndexToString(inputCol="label", outputCol="originalTargetColumn")
    # label mapping and its inverse
    labelIdx = labelIndexer.fit(trainingData)
    labelMapping = {k: v for k, v in enumerate(labelIdx.labels)}
    inverseLabelMapping = {v: float(k) for k, v in enumerate(labelIdx.labels)}
    clf = MultilayerPerceptronClassifier()
    if not algoSetting.is_hyperparameter_tuning_enabled():
        algoParams = algoSetting.get_params_dict()
    else:
        algoParams = algoSetting.get_params_dict_hyperparameter()
    clfParams = [prm.name for prm in clf.params]
    algoParams = {getattr(clf, k): v if isinstance(v, list) else [v]
                  for k, v in algoParams.items() if k in clfParams}
    paramGrid = ParamGridBuilder()
    # complete each layer spec with the input feature count and output level count
    layer_param_val = algoParams[getattr(clf, 'layers')]
    for layer in layer_param_val:
        layer.insert(0, input_feats)
        layer.append(levels)
    print('layer_param_val =', layer_param_val)
    for k, v in algoParams.items():
        if v == [None] * len(v):
            continue
        if k.name == 'layers':
            paramGrid = paramGrid.addGrid(k, layer_param_val)
        else:
            paramGrid = paramGrid.addGrid(k, v)
    paramGrid = paramGrid.build()

    if len(paramGrid) > 1:
        hyperParamInitParam = algoSetting.get_hyperparameter_params()
        evaluationMetricDict = {"name": hyperParamInitParam["evaluationMetric"]}
        evaluationMetricDict["displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[evaluationMetricDict["name"]]
    else:
        evaluationMetricDict = {"name": GLOBALSETTINGS.CLASSIFICATION_MODEL_EVALUATION_METRIC}
        evaluationMetricDict["displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[evaluationMetricDict["name"]]

    self._result_setter.set_hyper_parameter_results(self._slug, None)
    if validationDict["name"] == "kFold":
        numFold = int(validationDict["value"])
        estimator = Pipeline(stages=[pipeline, labelIndexer, clf])
        if algoSetting.is_hyperparameter_tuning_enabled():
            modelFilepath = "/".join(model_filepath.split("/")[:-1])
            pySparkHyperParameterResultObj = PySparkGridSearchResult(estimator, paramGrid, appType, modelFilepath, levels,
                                                                     evaluationMetricDict, trainingData, validationData, numFold,
                                                                     self._targetLevel, labelMapping, inverseLabelMapping, df)
            resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models()
            self._result_setter.set_hyper_parameter_results(self._slug, resultArray)
            self._result_setter.set_metadata_parallel_coordinates(self._slug, {
                "ignoreList": pySparkHyperParameterResultObj.get_ignore_list(),
                "hideColumns": pySparkHyperParameterResultObj.get_hide_columns(),
                "metricColName": pySparkHyperParameterResultObj.get_comparison_metric_colname(),
                "columnOrder": pySparkHyperParameterResultObj.get_keep_columns()
            })
            bestModel = pySparkHyperParameterResultObj.getBestModel()
            prediction = pySparkHyperParameterResultObj.getBestPrediction()
            bestModelName = resultArray[0]["Model Id"]
        else:
            crossval = CrossValidator(estimator=estimator,
                                      estimatorParamMaps=paramGrid,
                                      evaluator=BinaryClassificationEvaluator() if levels == 2 else MulticlassClassificationEvaluator(),
                                      numFolds=3 if numFold is None else numFold)  # use 3+ folds in practice
            cvrf = crossval.fit(trainingData)
            prediction = cvrf.transform(validationData)
            bestModel = cvrf.bestModel
            bestModelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1"
    else:
        train_test_ratio = float(self._dataframe_context.get_train_test_split())
        estimator = Pipeline(stages=[pipeline, labelIndexer, clf])
        if algoSetting.is_hyperparameter_tuning_enabled():
            modelFilepath = "/".join(model_filepath.split("/")[:-1])
            pySparkHyperParameterResultObj = PySparkTrainTestResult(estimator, paramGrid, appType, modelFilepath, levels,
                                                                    evaluationMetricDict, trainingData, validationData,
                                                                    train_test_ratio, self._targetLevel, labelMapping,
                                                                    inverseLabelMapping, df)
            resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models()
            self._result_setter.set_hyper_parameter_results(self._slug, resultArray)
            self._result_setter.set_metadata_parallel_coordinates(self._slug, {
                "ignoreList": pySparkHyperParameterResultObj.get_ignore_list(),
                "hideColumns": pySparkHyperParameterResultObj.get_hide_columns(),
                "metricColName": pySparkHyperParameterResultObj.get_comparison_metric_colname(),
                "columnOrder": pySparkHyperParameterResultObj.get_keep_columns()
            })
            bestModel = pySparkHyperParameterResultObj.getBestModel()
            prediction = pySparkHyperParameterResultObj.getBestPrediction()
            bestModelName = resultArray[0]["Model Id"]
        else:
            tvs = TrainValidationSplit(estimator=estimator,
                                       estimatorParamMaps=paramGrid,
                                       evaluator=BinaryClassificationEvaluator() if levels == 2 else MulticlassClassificationEvaluator(),
                                       trainRatio=train_test_ratio)
            tvrf = tvs.fit(trainingData)
            prediction = tvrf.transform(validationData)
            bestModel = tvrf.bestModel
            bestModelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1"

    MLUtils.save_pipeline_or_model(bestModel, model_filepath)
    predsAndLabels = prediction.select(['prediction', 'label']).rdd.map(tuple)
    metrics = MulticlassMetrics(predsAndLabels)
    posLabel = inverseLabelMapping[self._targetLevel]

    conf_mat_ar = metrics.confusionMatrix().toArray()
    print(conf_mat_ar)
    confusion_matrix = {}
    for i in range(len(conf_mat_ar)):
        confusion_matrix[labelMapping[i]] = {}
        for j, val in enumerate(conf_mat_ar[i]):
            confusion_matrix[labelMapping[i]][labelMapping[j]] = val
    print(confusion_matrix)

    trainingTime = time.time() - st
    f1_score = metrics.fMeasure(posLabel, 1.0)
    precision = metrics.precision(posLabel)
    recall = metrics.recall(posLabel)
    accuracy = metrics.accuracy
    roc_auc = 'Undefined'
    if levels == 2:
        bin_metrics = BinaryClassificationMetrics(predsAndLabels)
        roc_auc = bin_metrics.areaUnderROC
    print(f1_score, precision, recall, accuracy)

    # gain chart implementation
    def cal_prob_eval(x):
        if len(x) == 1:
            if x == posLabel:
                return float(x[1])
            else:
                return float(1 - x[1])
        else:
            return float(x[int(posLabel)])

    column_name = 'probability'

    def y_prob_for_eval_udf():
        return udf(lambda x: cal_prob_eval(x))

    prediction = prediction.withColumn("y_prob_for_eval", y_prob_for_eval_udf()(col(column_name)))
    try:
        pys_df = prediction.select(['y_prob_for_eval', 'prediction', 'label'])
        gain_lift_ks_obj = GainLiftKS(pys_df, 'y_prob_for_eval', 'prediction', 'label', posLabel, self._spark)
        gain_lift_KS_dataframe = gain_lift_ks_obj.Run().toPandas()
    except:
        try:
            temp_df = pys_df.toPandas()
            gain_lift_ks_obj = GainLiftKS(temp_df, 'y_prob_for_eval', 'prediction', 'label', posLabel, self._spark)
            gain_lift_KS_dataframe = gain_lift_ks_obj.Rank_Ordering()
        except:
            print("gain chart failed")
            gain_lift_KS_dataframe = None

    objs = {
        "trained_model": bestModel,
        "actual": prediction.select('label'),
        "predicted": prediction.select('prediction'),
        "probability": prediction.select('probability'),
        "feature_importance": None,
        "featureList": list(categorical_columns) + list(numerical_columns),
        "labelMapping": labelMapping
    }

    # calculating prediction_split
    val_cnts = prediction.groupBy('label').count()
    val_cnts = map(lambda row: row.asDict(), val_cnts.collect())
    prediction_split = {}
    total_nos = objs['actual'].count()
    for item in val_cnts:
        classname = labelMapping[item['label']]
        prediction_split[classname] = round(item['count'] * 100 / float(total_nos), 2)

    if not algoSetting.is_hyperparameter_tuning_enabled():
        modelFilepathArr = model_filepath.split("/")[:-1]
        modelFilepathArr.append(bestModelName)
        bestModel.save("/".join(modelFilepathArr))
    runtime = round((time.time() - st_global), 2)

    try:
        print(pmml_filepath)
        pmmlBuilder = PMMLBuilder(self._spark, trainingData, bestModel).putOption(clf, 'compact', True)
        pmmlBuilder.buildFile(pmml_filepath)
        pmmlfile = open(pmml_filepath, "r")
        pmmlText = pmmlfile.read()
        pmmlfile.close()
        self._result_setter.update_pmml_object({self._slug: pmmlText})
    except Exception as e:
        print("PMML failed...", str(e))

    cat_cols = list(set(categorical_columns) - {result_column})
    self._model_summary = MLModelSummary()
    self._model_summary.set_algorithm_name("Spark ML Multilayer Perceptron")
    self._model_summary.set_algorithm_display_name("Spark ML Multilayer Perceptron")
    self._model_summary.set_slug(self._slug)
    self._model_summary.set_confusion_matrix(confusion_matrix)
    self._model_summary.set_feature_importance(objs["feature_importance"])
    self._model_summary.set_feature_list(objs["featureList"])
    self._model_summary.set_model_accuracy(accuracy)
    self._model_summary.set_training_time(round((time.time() - st), 2))
    self._model_summary.set_precision_recall_stats([precision, recall])
    self._model_summary.set_model_precision(precision)
    self._model_summary.set_model_recall(recall)
    self._model_summary.set_target_variable(result_column)
    self._model_summary.set_prediction_split(prediction_split)
    self._model_summary.set_validation_method("KFold")
    self._model_summary.set_level_map_dict(objs["labelMapping"])
    self._model_summary.set_model_features(objs["featureList"])
    self._model_summary.set_level_counts(self._metaParser.get_unique_level_dict(list(set(categorical_columns)) + [result_column]))
    self._model_summary.set_num_trees(None)
    self._model_summary.set_num_rules(300)
    self._model_summary.set_target_level(self._targetLevel)

    modelManagementJson = {
        "Model ID": "SPMLP-" + bestModelName,
        "Project Name": self._dataframe_context.get_job_name(),
        "Algorithm": self._model_summary.get_algorithm_name(),
        "Status": 'Completed',
        "Accuracy": accuracy,
        "Runtime": runtime,
        "Created On": "",
        "Owner": "",
        "Deployment": 0,
        "Action": ''
    }

    modelDropDownObj = {
        "name": self._model_summary.get_algorithm_name(),
        "evaluationMetricValue": accuracy,
        "evaluationMetricName": "accuracy",
        "slug": self._model_summary.get_slug(),
        "Model Id": bestModelName
    }
    modelSummaryJson = {
        "dropdown": modelDropDownObj,
        "levelcount": self._model_summary.get_level_counts(),
        "modelFeatureList": self._model_summary.get_feature_list(),
        "levelMapping": self._model_summary.get_level_map_dict(),
        "slug": self._model_summary.get_slug(),
        "name": self._model_summary.get_algorithm_name()
    }

    mlpcCards = [json.loads(CommonUtils.convert_python_object_to_json(cardObj))
                 for cardObj in MLUtils.create_model_summary_cards(self._model_summary)]
    for card in mlpcCards:
        self._prediction_narrative.add_a_card(card)
    self._result_setter.set_model_summary({"sparkperceptron": json.loads(CommonUtils.convert_python_object_to_json(self._model_summary))})
    self._result_setter.set_spark_multilayer_perceptron_model_summary(modelSummaryJson)
    self._result_setter.set_spark_multilayer_perceptron_management_summary(modelManagementJson)
    self._result_setter.set_mlpc_cards(mlpcCards)
    CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug,
                                                        "completion", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")