class NNPTRegressionScript(object): def __init__(self, data_frame, df_helper, df_context, spark, prediction_narrative, result_setter, meta_parser, mlEnvironment="sklearn"): self._metaParser = meta_parser self._prediction_narrative = prediction_narrative self._result_setter = result_setter self._data_frame = data_frame self._dataframe_helper = df_helper self._dataframe_context = df_context self._spark = spark self._model_summary = MLModelSummary() self._score_summary = {} self._slug = GLOBALSETTINGS.MODEL_SLUG_MAPPING[ "Neural Network (PyTorch)"] self._analysisName = self._slug self._dataframe_context.set_analysis_name(self._analysisName) self._mlEnv = mlEnvironment self._datasetName = CommonUtils.get_dataset_name( self._dataframe_context.CSV_FILE) self._completionStatus = self._dataframe_context.get_completion_status( ) print(self._completionStatus, "initial completion status") self._messageURL = self._dataframe_context.get_message_url() self._scriptWeightDict = self._dataframe_context.get_ml_model_training_weight( ) self._ignoreMsg = self._dataframe_context.get_message_ignore() self._scriptStages = { "initialization": { "summary": "Initialized The Neural Network (PyTorch) Scripts", "weight": 1 }, "training": { "summary": "Neural Network (PyTorch) Training Started", "weight": 2 }, "completion": { "summary": "Neural Network (PyTorch) Training Finished", "weight": 1 }, } def Train(self): st_global = time.time() CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") appType = self._dataframe_context.get_app_type() algosToRun = self._dataframe_context.get_algorithms_to_run() algoSetting = [ x for x in algosToRun if x.get_algorithm_slug() == self._slug ][0] categorical_columns = self._dataframe_helper.get_string_columns() uid_col = self._dataframe_context.get_uid_column() if self._metaParser.check_column_isin_ignored_suggestion(uid_col): categorical_columns = list(set(categorical_columns) - {uid_col}) allDateCols = self._dataframe_context.get_date_columns() categorical_columns = list(set(categorical_columns) - set(allDateCols)) print("CATEGORICAL COLS - ", categorical_columns) result_column = self._dataframe_context.get_result_column() numerical_columns = self._dataframe_helper.get_numeric_columns() numerical_columns = [ x for x in numerical_columns if x != result_column ] model_path = self._dataframe_context.get_model_path() if model_path.startswith("file"): model_path = model_path[7:] validationDict = self._dataframe_context.get_validation_dict() print("model_path", model_path) pipeline_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/pipeline/" model_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/model" pmml_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/modelPmml" df = self._data_frame if self._mlEnv == "spark": pass elif self._mlEnv == "sklearn": model_filepath = model_path + "/" + self._slug + "/model.pkl" x_train, x_test, y_train, y_test = self._dataframe_helper.get_train_test_data( ) x_train = MLUtils.create_dummy_columns( x_train, [x for x in categorical_columns if x != result_column]) x_test = MLUtils.create_dummy_columns( x_test, [x for x in categorical_columns if x != result_column]) x_test = MLUtils.fill_missing_columns(x_test, x_train.columns, result_column) print("=" * 150) print("X-Train Shape - ", x_train.shape) print("Y-Train Shape - ", 
y_train.shape) print("X-Test Shape - ", x_test.shape) print("Y-Test Shape - ", y_test.shape) print("~" * 50) print("X-Train dtype - ", type(x_train)) print("Y-Train dtype - ", type(y_train)) print("X-Test dtype - ", type(x_test)) print("Y-Test dtype - ", type(y_test)) print("~" * 50) CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "training", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") st = time.time() self._result_setter.set_hyper_parameter_results(self._slug, None) evaluationMetricDict = algoSetting.get_evaluvation_metric( Type="REGRESSION") evaluationMetricDict = { "name": GLOBALSETTINGS.REGRESSION_MODEL_EVALUATION_METRIC } evaluationMetricDict[ "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] x_train_tensored, y_train_tensored, x_test_tensored, y_test_tensored = PYTORCHUTILS.get_tensored_data( x_train, y_train, x_test, y_test) trainset = torch_data_utils.TensorDataset(x_train_tensored, y_train_tensored) testset = torch_data_utils.TensorDataset(x_test_tensored, y_test_tensored) nnptr_params = algoSetting.get_nnptr_params_dict()[0] layers_for_network = PYTORCHUTILS.get_layers_for_network_module( nnptr_params, task_type="REGRESSION", first_layer_units=x_train.shape[1]) # Use GPU if available device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") network = PyTorchNetwork(layers_for_network).to(device) network.eval() other_params_dict = PYTORCHUTILS.get_other_pytorch_params( nnptr_params, task_type="REGRESSION", network_params=network.parameters()) print("~" * 50) print("NNPTR-PARAMS - ", nnptr_params) print("~" * 50) print("OTHER-PARAMS-DICT - ", other_params_dict) print("~" * 50) print("NEURAL-NETWORK - ", network) print("~" * 50) criterion = other_params_dict["loss_criterion"] n_epochs = other_params_dict["number_of_epochs"] batch_size = other_params_dict["batch_size"] optimizer = other_params_dict["optimizer"] dataloader_params = { "batch_size": batch_size, "shuffle": True # "num_workers": } train_loader = torch_data_utils.DataLoader(trainset, **dataloader_params) test_loader = torch_data_utils.DataLoader(testset, **dataloader_params) ''' Training the network; Batchnormalization(num_features) should be equal to units_op for that layer in training config; else --> RuntimeError('running_mean should contain 100 elements not 200',) ''' for epoch in range(n_epochs): batchwise_losses = [] average_loss = 0.0 for i, (inputs, labels) in enumerate(train_loader): inputs = inputs.to(device) labels = labels.to(device) # Zero the parameter gradients optimizer.zero_grad() # Forward + backward + optimize outputs = network(inputs.float()) loss = criterion(outputs, labels.float()) loss.backward() optimizer.step() average_loss += loss.item() batchwise_losses.append(loss.item()) average_loss_per_epoch = old_div(average_loss, (i + 1)) print("+" * 80) print("EPOCH - ", epoch) print("BATCHWISE_LOSSES shape - ", len(batchwise_losses)) print("AVERAGE LOSS PER EPOCH - ", average_loss_per_epoch) print("+" * 80) trainingTime = time.time() - st bestEstimator = network outputs_x_test_tensored = network(x_test_tensored.float()) y_score_mid = outputs_x_test_tensored.tolist() y_score = [x[0] for x in y_score_mid] print("Y-SCORE - ", y_score) print("Y-SCORE length - ", len(y_score)) y_prob = None featureImportance = {} objs = { "trained_model": bestEstimator, "actual": y_test, "predicted": y_score, "probability": y_prob, 
"feature_importance": featureImportance, "featureList": list(x_train.columns), "labelMapping": {} } #featureImportance = objs["trained_model"].feature_importances_ #featuresArray = [(col_name, featureImportance[idx]) for idx, col_name in enumerate(x_train.columns)] featuresArray = [] if not algoSetting.is_hyperparameter_tuning_enabled(): modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1" modelFilepathArr = model_filepath.split("/")[:-1] modelFilepathArr.append(modelName + ".pt") torch.save(objs["trained_model"], "/".join(modelFilepathArr)) #joblib.dump(objs["trained_model"],"/".join(modelFilepathArr)) runtime = round((time.time() - st), 2) else: runtime = round((time.time() - hyper_st), 2) try: modelPmmlPipeline = PMMLPipeline([("pretrained-estimator", objs["trained_model"])]) modelPmmlPipeline.target_field = result_column modelPmmlPipeline.active_fields = np.array( [col for col in x_train.columns if col != result_column]) sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True) pmmlfile = open(pmml_filepath, "r") pmmlText = pmmlfile.read() pmmlfile.close() self._result_setter.update_pmml_object({self._slug: pmmlText}) except: pass metrics = {} metrics["r2"] = r2_score(y_test, y_score) metrics["neg_mean_squared_error"] = mean_squared_error( y_test, y_score) metrics["neg_mean_absolute_error"] = mean_absolute_error( y_test, y_score) metrics["RMSE"] = sqrt(metrics["neg_mean_squared_error"]) metrics["explained_variance_score"] = explained_variance_score( y_test, y_score) transformed = pd.DataFrame({ "prediction": y_score, result_column: y_test }) print("TRANSFORMED PREDICTION TYPE - ", type(transformed["prediction"])) print(transformed["prediction"]) print("TRANSFORMED RESULT COL TYPE - ", type(transformed[result_column])) print(transformed[result_column]) transformed["difference"] = transformed[ result_column] - transformed["prediction"] transformed["mape"] = old_div( np.abs(transformed["difference"]) * 100, transformed[result_column]) sampleData = None nrows = transformed.shape[0] if nrows > 100: sampleData = transformed.sample(n=100, random_state=420) else: sampleData = transformed print(sampleData.head()) if transformed["mape"].max() > 100: GLOBALSETTINGS.MAPEBINS.append(transformed["mape"].max()) mapeCountArr = list( pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS). value_counts().to_dict().items()) GLOBALSETTINGS.MAPEBINS.pop(5) else: mapeCountArr = list( pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS). 
value_counts().to_dict().items()) mapeStatsArr = [(str(idx), dictObj) for idx, dictObj in enumerate( sorted([{ "count": x[1], "splitRange": (x[0].left, x[0].right) } for x in mapeCountArr], key=lambda x: x["splitRange"][0]))] print(mapeStatsArr) print(mapeCountArr) predictionColSummary = transformed["prediction"].describe( ).to_dict() quantileBins = [ predictionColSummary["min"], predictionColSummary["25%"], predictionColSummary["50%"], predictionColSummary["75%"], predictionColSummary["max"] ] print(quantileBins) quantileBins = sorted(list(set(quantileBins))) transformed["quantileBinId"] = pd.cut(transformed["prediction"], quantileBins) quantileDf = transformed.groupby("quantileBinId").agg({ "prediction": [np.sum, np.mean, np.size] }).reset_index() quantileDf.columns = ["prediction", "sum", "mean", "count"] print(quantileDf) quantileArr = list(quantileDf.T.to_dict().items()) quantileSummaryArr = [(obj[0], { "splitRange": (obj[1]["prediction"].left, obj[1]["prediction"].right), "count": obj[1]["count"], "mean": obj[1]["mean"], "sum": obj[1]["sum"] }) for obj in quantileArr] print(quantileSummaryArr) runtime = round((time.time() - st_global), 2) self._model_summary.set_model_type("regression") self._model_summary.set_algorithm_name("Neural Network (PyTorch)") self._model_summary.set_algorithm_display_name( "Neural Network (PyTorch)") self._model_summary.set_slug(self._slug) self._model_summary.set_training_time(runtime) self._model_summary.set_training_time(trainingTime) self._model_summary.set_target_variable(result_column) self._model_summary.set_validation_method( validationDict["displayName"]) self._model_summary.set_model_evaluation_metrics(metrics) self._model_summary.set_model_params(nnptr_params) self._model_summary.set_quantile_summary(quantileSummaryArr) self._model_summary.set_mape_stats(mapeStatsArr) self._model_summary.set_sample_data(sampleData.to_dict()) self._model_summary.set_feature_importance(featuresArray) self._model_summary.set_feature_list(list(x_train.columns)) self._model_summary.set_model_mse( metrics["neg_mean_squared_error"]) self._model_summary.set_model_mae( metrics["neg_mean_absolute_error"]) self._model_summary.set_rmse(metrics["RMSE"]) self._model_summary.set_model_rsquared(metrics["r2"]) self._model_summary.set_model_exp_variance_score( metrics["explained_variance_score"]) try: pmml_filepath = str(model_path) + "/" + str( self._slug) + "/traindeModel.pmml" modelPmmlPipeline = PMMLPipeline([("pretrained-estimator", objs["trained_model"])]) modelPmmlPipeline.target_field = result_column modelPmmlPipeline.active_fields = np.array( [col for col in x_train.columns if col != result_column]) sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True) pmmlfile = open(pmml_filepath, "r") pmmlText = pmmlfile.read() pmmlfile.close() self._result_setter.update_pmml_object({self._slug: pmmlText}) except: pass if algoSetting.is_hyperparameter_tuning_enabled(): modelDropDownObj = { "name": self._model_summary.get_algorithm_name(), "evaluationMetricValue": metrics[evaluationMetricDict["name"]], "evaluationMetricName": evaluationMetricDict["name"], "slug": self._model_summary.get_slug(), "Model Id": modelName } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } else: modelDropDownObj = { "name": 
self._model_summary.get_algorithm_name(), "evaluationMetricValue": metrics[evaluationMetricDict["name"]], "evaluationMetricName": evaluationMetricDict["name"], "slug": self._model_summary.get_slug(), "Model Id": modelName } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } modelmanagement_ = nnptr_params self._model_management = MLModelSummary() if algoSetting.is_hyperparameter_tuning_enabled(): pass else: self._model_management.set_layer_info( data=modelmanagement_['hidden_layer_info']) self._model_management.set_loss_function( data=modelmanagement_['loss']) self._model_management.set_optimizer( data=modelmanagement_['optimizer']) self._model_management.set_batch_size( data=modelmanagement_['batch_size']) self._model_management.set_no_epochs( data=modelmanagement_['number_of_epochs']) # self._model_management.set_model_evaluation_metrics(data=modelmanagement_['metrics']) self._model_management.set_job_type( self._dataframe_context.get_job_name()) #Project name self._model_management.set_training_status( data="completed") # training status self._model_management.set_no_of_independent_variables( data=x_train) #no of independent varables self._model_management.set_training_time(runtime) # run time self._model_management.set_rmse(metrics["RMSE"]) self._model_management.set_algorithm_name( "Neural Network (TensorFlow)") #algorithm name self._model_management.set_validation_method( str(validationDict["displayName"]) + "(" + str(validationDict["value"]) + ")") #validation method self._model_management.set_target_variable( result_column) #target column name self._model_management.set_creation_date(data=str( datetime.now().strftime('%b %d ,%Y %H:%M '))) #creation date self._model_management.set_datasetName(self._datasetName) modelManagementSummaryJson = [ ["Project Name", self._model_management.get_job_type()], ["Algorithm", self._model_management.get_algorithm_name()], ["Training Status", self._model_management.get_training_status()], ["RMSE", self._model_management.get_rmse()], ["RunTime", self._model_management.get_training_time()], #["Owner",None], ["Created On", self._model_management.get_creation_date()] ] if algoSetting.is_hyperparameter_tuning_enabled(): modelManagementModelSettingsJson = [] else: modelManagementModelSettingsJson = [ ["Training Dataset", self._model_management.get_datasetName()], [ "Target Column", self._model_management.get_target_variable() ], [ "Number Of Independent Variables", self._model_management.get_no_of_independent_variables() ], ["Algorithm", self._model_management.get_algorithm_name()], [ "Model Validation", self._model_management.get_validation_method() ], ["batch_size", str(self._model_management.get_batch_size())], ["Loss", self._model_management.get_loss_function()], ["Optimizer", self._model_management.get_optimizer()], ["Epochs", self._model_management.get_no_epochs()], [ "Metrics", self._model_management.get_model_evaluation_metrics() ] ] for i in modelmanagement_["hidden_layer_info"]: string = "" key = str(modelmanagement_["hidden_layer_info"][i] ["layer"]) + " " + str(i) + ":" for j in modelmanagement_["hidden_layer_info"][i]: string = string + str(j) + ":" + str( modelmanagement_["hidden_layer_info"][i][j]) + ", " modelManagementModelSettingsJson.append([key, string]) 
print(modelManagementModelSettingsJson) nnptrCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_summary_cards(self._model_summary) ] nnptrPerformanceCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_cards_regression( self._model_summary) ] nnptrOverviewCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_card_overview( self._model_management, modelManagementSummaryJson, modelManagementModelSettingsJson) ] nnptrDeploymentCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_deploy_empty_card() ] nnptr_Overview_Node = NarrativesTree() nnptr_Overview_Node.set_name("Overview") nnptr_Performance_Node = NarrativesTree() nnptr_Performance_Node.set_name("Performance") nnptr_Deployment_Node = NarrativesTree() nnptr_Deployment_Node.set_name("Deployment") for card in nnptrOverviewCards: nnptr_Overview_Node.add_a_card(card) for card in nnptrPerformanceCards: nnptr_Performance_Node.add_a_card(card) for card in nnptrDeploymentCards: nnptr_Deployment_Node.add_a_card(card) for card in nnptrCards: self._prediction_narrative.add_a_card(card) self._result_setter.set_model_summary({ "Neural Network (PyTorch)": json.loads( CommonUtils.convert_python_object_to_json(self._model_summary)) }) self._result_setter.set_nnptr_regression_model_summary( modelSummaryJson) self._result_setter.set_nnptr_cards(nnptrCards) self._result_setter.set_nnptr_nodes([ nnptr_Overview_Node, nnptr_Performance_Node, nnptr_Deployment_Node ]) self._result_setter.set_nnptr_fail_card({ "Algorithm_Name": "Neural Network (PyTorch)", "Success": "True" }) CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "completion", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") def Predict(self): self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight( ) self._scriptStages = { "initialization": { "summary": "Initialized The Neural Network (PyTorch) Scripts", "weight": 2 }, "predictionStart": { "summary": "Neural Network (PyTorch) Prediction Started", "weight": 2 }, "predictionFinished": { "summary": "Neural Network (PyTorch) Prediction Finished", "weight": 6 } } CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") SQLctx = SQLContext(sparkContext=self._spark.sparkContext, sparkSession=self._spark) dataSanity = True categorical_columns = self._dataframe_helper.get_string_columns() uid_col = self._dataframe_context.get_uid_column() if self._metaParser.check_column_isin_ignored_suggestion(uid_col): categorical_columns = list(set(categorical_columns) - {uid_col}) allDateCols = self._dataframe_context.get_date_columns() categorical_columns = list(set(categorical_columns) - set(allDateCols)) numerical_columns = self._dataframe_helper.get_numeric_columns() result_column = self._dataframe_context.get_result_column() test_data_path = self._dataframe_context.get_input_file() if self._mlEnv == "spark": pass elif self._mlEnv == "sklearn": CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "predictionStart", "info", display=True, 
emptyBin=False, customMsg=None, weightKey="total") score_data_path = self._dataframe_context.get_score_path( ) + "/data.csv" trained_model_path = "file://" + self._dataframe_context.get_model_path( ) trained_model_path += "/" + self._dataframe_context.get_model_for_scoring( ) + ".pt" print("trained_model_path", trained_model_path) print("score_data_path", score_data_path) if trained_model_path.startswith("file"): trained_model_path = trained_model_path[7:] #trained_model = joblib.load(trained_model_path) trained_model = torch.load(trained_model_path, map_location=torch.device('cpu')) model_columns = self._dataframe_context.get_model_features() print("model_columns", model_columns) try: df = self._data_frame.toPandas() except: df = self._data_frame # pandas_df = MLUtils.factorize_columns(df,[x for x in categorical_columns if x != result_column]) pandas_df = MLUtils.create_dummy_columns( df, [x for x in categorical_columns if x != result_column]) pandas_df = MLUtils.fill_missing_columns(pandas_df, model_columns, result_column) if uid_col: pandas_df = pandas_df[[ x for x in pandas_df.columns if x != uid_col ]] test_df = np.stack( [pandas_df[col].values for col in pandas_df.columns], 1) tensored_test_df = torch.tensor(test_df, dtype=torch.float) outputs_test_df_tensored = trained_model(tensored_test_df.float()) y_score_mid = outputs_test_df_tensored.tolist() y_score = [x[0] for x in y_score_mid] scoreKpiArray = MLUtils.get_scored_data_summary(y_score) kpiCard = NormalCard() kpiCardData = [KpiData(data=x) for x in scoreKpiArray] kpiCard.set_card_data(kpiCardData) kpiCard.set_cente_alignment(True) print(CommonUtils.convert_python_object_to_json(kpiCard)) self._result_setter.set_kpi_card_regression_score(kpiCard) pandas_df[result_column] = y_score df[result_column] = y_score df.to_csv(score_data_path, header=True, index=False) CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "predictionFinished", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") print("STARTING Measure ANALYSIS ...") columns_to_keep = [] columns_to_drop = [] columns_to_keep = self._dataframe_context.get_score_consider_columns( ) if len(columns_to_keep) > 0: columns_to_drop = list(set(df.columns) - set(columns_to_keep)) else: columns_to_drop += ["predicted_probability"] columns_to_drop = [ x for x in columns_to_drop if x in df.columns and x != result_column ] print("columns_to_drop", columns_to_drop) pandas_scored_df = df[list(set(columns_to_keep + [result_column]))] spark_scored_df = SQLctx.createDataFrame(pandas_scored_df) # spark_scored_df.write.csv(score_data_path+"/data",mode="overwrite",header=True) # TODO update metadata for the newly created dataframe self._dataframe_context.update_consider_columns(columns_to_keep) print(spark_scored_df.printSchema()) df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context, self._metaParser) df_helper.set_params() df = df_helper.get_data_frame() # self._dataframe_context.set_dont_send_message(True) try: fs = time.time() descr_stats_obj = DescriptiveStatsScript( df, df_helper, self._dataframe_context, self._result_setter, self._spark, self._prediction_narrative, scriptWeight=self._scriptWeightDict, analysisName="Descriptive analysis") descr_stats_obj.Run() print("DescriptiveStats Analysis Done in ", time.time() - fs, " seconds.") except: print("Frequency Analysis Failed ") # try: # fs = time.time() # df_helper.fill_na_dimension_nulls() # df = df_helper.get_data_frame() # 
dt_reg = DecisionTreeRegressionScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark,self._prediction_narrative,self._metaParser,scriptWeight=self._scriptWeightDict,analysisName="Predictive modeling") # dt_reg.Run() # print "DecisionTrees Analysis Done in ", time.time() - fs, " seconds." # except: # print "DTREE FAILED" try: fs = time.time() two_way_obj = TwoWayAnovaScript( df, df_helper, self._dataframe_context, self._result_setter, self._spark, self._prediction_narrative, self._metaParser, scriptWeight=self._scriptWeightDict, analysisName="Measure vs. Dimension") two_way_obj.Run() print("OneWayAnova Analysis Done in ", time.time() - fs, " seconds.") except: print("Anova Analysis Failed")
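# ----------------------------------------------------------------------------
# Illustrative sketch (not called anywhere): a minimal version of the training
# pattern used in NNPTRegressionScript.Train() above -- wrap arrays in a
# TensorDataset/DataLoader, run an epoch loop that tracks batch-wise and average
# loss, then score a held-out tensor and flatten the (n, 1) output. The fixed
# architecture, Adam optimizer and MSELoss below are assumptions for the sketch;
# the real script builds the network, loss and optimizer from nnptr_params via
# PYTORCHUTILS. Expects x_train/x_test/y_train as numeric numpy arrays.
def _nnptr_training_loop_sketch(x_train, y_train, x_test, n_epochs=10, batch_size=32):
    import torch
    import torch.nn as nn
    import torch.utils.data as torch_data_utils

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    x_train_t = torch.tensor(x_train, dtype=torch.float)
    y_train_t = torch.tensor(y_train, dtype=torch.float).reshape(-1, 1)
    trainset = torch_data_utils.TensorDataset(x_train_t, y_train_t)
    train_loader = torch_data_utils.DataLoader(trainset, batch_size=batch_size, shuffle=True)

    # Hypothetical fixed architecture; Train() derives layers from hidden_layer_info.
    network = nn.Sequential(nn.Linear(x_train_t.shape[1], 64), nn.ReLU(), nn.Linear(64, 1)).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(network.parameters(), lr=1e-3)

    for epoch in range(n_epochs):
        batchwise_losses = []
        average_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()                      # zero the parameter gradients
            loss = criterion(network(inputs), labels)  # forward
            loss.backward()                            # backward
            optimizer.step()                           # optimize
            average_loss += loss.item()
            batchwise_losses.append(loss.item())
        print("EPOCH", epoch, "AVERAGE LOSS PER EPOCH", average_loss / len(batchwise_losses))

    # Score the test set the same way Train() does: take the first element of each row.
    network.eval()
    with torch.no_grad():
        x_test_t = torch.tensor(x_test, dtype=torch.float).to(device)
        y_score = [row[0] for row in network(x_test_t).tolist()]
    return network, y_score
# ----------------------------------------------------------------------------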
class NaiveBayesPysparkScript(object): def __init__(self, data_frame, df_helper, df_context, spark, prediction_narrative, result_setter, meta_parser, mlEnvironment="pyspark"): self._metaParser = meta_parser self._prediction_narrative = prediction_narrative self._result_setter = result_setter self._data_frame = data_frame self._dataframe_helper = df_helper self._dataframe_context = df_context self._ignoreMsg = self._dataframe_context.get_message_ignore() self._spark = spark self._model_summary = MLModelSummary() self._score_summary = {} self._slug = GLOBALSETTINGS.MODEL_SLUG_MAPPING["naive bayes"] self._datasetName = CommonUtils.get_dataset_name( self._dataframe_context.CSV_FILE) self._targetLevel = self._dataframe_context.get_target_level_for_model( ) self._targetLevel = self._dataframe_context.get_target_level_for_model( ) self._completionStatus = self._dataframe_context.get_completion_status( ) print(self._completionStatus, "initial completion status") self._analysisName = self._slug self._messageURL = self._dataframe_context.get_message_url() self._scriptWeightDict = self._dataframe_context.get_ml_model_training_weight( ) self._mlEnv = mlEnvironment # self._classifier = "nb" self._scriptStages = { "initialization": { "summary": "Initialized the Naive Bayes Scripts", "weight": 4 }, "training": { "summary": "Naive Bayes Model Training Started", "weight": 2 }, "completion": { "summary": "Naive Bayes Model Training Finished", "weight": 4 }, } def Train(self): st_global = time.time() CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") algosToRun = self._dataframe_context.get_algorithms_to_run() algoSetting = [ x for x in algosToRun if x.get_algorithm_slug() == self._slug ][0] categorical_columns = self._dataframe_helper.get_string_columns() uid_col = self._dataframe_context.get_uid_column() if self._metaParser.check_column_isin_ignored_suggestion(uid_col): categorical_columns = list(set(categorical_columns) - {uid_col}) allDateCols = self._dataframe_context.get_date_columns() categorical_columns = list(set(categorical_columns) - set(allDateCols)) numerical_columns = self._dataframe_helper.get_numeric_columns() result_column = self._dataframe_context.get_result_column() categorical_columns = [ x for x in categorical_columns if x != result_column ] appType = self._dataframe_context.get_app_type() model_path = self._dataframe_context.get_model_path() if model_path.startswith("file"): model_path = model_path[7:] validationDict = self._dataframe_context.get_validation_dict() print("model_path", model_path) pipeline_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/pipeline/" model_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/model" pmml_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/modelPmml" df = self._data_frame levels = df.select(result_column).distinct().count() appType = self._dataframe_context.get_app_type() model_filepath = model_path + "/" + self._slug + "/model" pmml_filepath = str(model_path) + "/" + str( self._slug) + "/traindeModel.pmml" CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "training", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") st = time.time() pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns, categorical_columns, 
result_column) trainingData, validationData = MLUtils.get_training_and_validation_data( df, result_column, 0.8) # indexed labelIndexer = StringIndexer(inputCol=result_column, outputCol="label") # OriginalTargetconverter = IndexToString(inputCol="label", outputCol="originalTargetColumn") # Label Mapping and Inverse labelIdx = labelIndexer.fit(trainingData) labelMapping = {k: v for k, v in enumerate(labelIdx.labels)} inverseLabelMapping = { v: float(k) for k, v in enumerate(labelIdx.labels) } if self._dataframe_context.get_trainerMode() == "autoML": automl_enable = True else: automl_enable = False clf = NaiveBayes() if not algoSetting.is_hyperparameter_tuning_enabled(): algoParams = algoSetting.get_params_dict() else: algoParams = algoSetting.get_params_dict_hyperparameter() print("=" * 100) print(algoParams) print("=" * 100) clfParams = [prm.name for prm in clf.params] algoParams = { getattr(clf, k): v if isinstance(v, list) else [v] for k, v in algoParams.items() if k in clfParams } #print("="*100) #print("ALGOPARAMS - ",algoParams) #print("="*100) paramGrid = ParamGridBuilder() # if not algoSetting.is_hyperparameter_tuning_enabled(): # for k,v in algoParams.items(): # if v == [None] * len(v): # continue # if k.name == 'thresholds': # paramGrid = paramGrid.addGrid(k,v[0]) # else: # paramGrid = paramGrid.addGrid(k,v) # paramGrid = paramGrid.build() # if not algoSetting.is_hyperparameter_tuning_enabled(): for k, v in algoParams.items(): print(k, v) if v == [None] * len(v): continue paramGrid = paramGrid.addGrid(k, v) paramGrid = paramGrid.build() # else: # for k,v in algoParams.items(): # print k.name, v # if v[0] == [None] * len(v[0]): # continue # paramGrid = paramGrid.addGrid(k,v[0]) # paramGrid = paramGrid.build() #print("="*143) #print("PARAMGRID - ", paramGrid) #print("="*143) if len(paramGrid) > 1: hyperParamInitParam = algoSetting.get_hyperparameter_params() evaluationMetricDict = { "name": hyperParamInitParam["evaluationMetric"] } evaluationMetricDict[ "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] else: evaluationMetricDict = { "name": GLOBALSETTINGS.CLASSIFICATION_MODEL_EVALUATION_METRIC } evaluationMetricDict[ "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] self._result_setter.set_hyper_parameter_results(self._slug, None) if validationDict["name"] == "kFold": numFold = int(validationDict["value"]) estimator = Pipeline(stages=[pipeline, labelIndexer, clf]) if algoSetting.is_hyperparameter_tuning_enabled(): modelFilepath = "/".join(model_filepath.split("/")[:-1]) pySparkHyperParameterResultObj = PySparkGridSearchResult( estimator, paramGrid, appType, modelFilepath, levels, evaluationMetricDict, trainingData, validationData, numFold, self._targetLevel, labelMapping, inverseLabelMapping, df) resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models( ) self._result_setter.set_hyper_parameter_results( self._slug, resultArray) self._result_setter.set_metadata_parallel_coordinates( self._slug, { "ignoreList": pySparkHyperParameterResultObj.get_ignore_list(), "hideColumns": pySparkHyperParameterResultObj.get_hide_columns(), "metricColName": pySparkHyperParameterResultObj. 
get_comparison_metric_colname(), "columnOrder": pySparkHyperParameterResultObj.get_keep_columns() }) bestModel = pySparkHyperParameterResultObj.getBestModel() prediction = pySparkHyperParameterResultObj.getBestPrediction() else: if automl_enable: paramGrid = (ParamGridBuilder().addGrid( clf.smoothing, [1.0, 0.2]).build()) crossval = CrossValidator( estimator=estimator, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator() if levels == 2 else MulticlassClassificationEvaluator(), numFolds=3 if numFold is None else numFold) # use 3+ folds in practice cvnb = crossval.fit(trainingData) prediction = cvnb.transform(validationData) bestModel = cvnb.bestModel else: train_test_ratio = float( self._dataframe_context.get_train_test_split()) estimator = Pipeline(stages=[pipeline, labelIndexer, clf]) if algoSetting.is_hyperparameter_tuning_enabled(): modelFilepath = "/".join(model_filepath.split("/")[:-1]) pySparkHyperParameterResultObj = PySparkTrainTestResult( estimator, paramGrid, appType, modelFilepath, levels, evaluationMetricDict, trainingData, validationData, train_test_ratio, self._targetLevel, labelMapping, inverseLabelMapping, df) resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models( ) self._result_setter.set_hyper_parameter_results( self._slug, resultArray) self._result_setter.set_metadata_parallel_coordinates( self._slug, { "ignoreList": pySparkHyperParameterResultObj.get_ignore_list(), "hideColumns": pySparkHyperParameterResultObj.get_hide_columns(), "metricColName": pySparkHyperParameterResultObj. get_comparison_metric_colname(), "columnOrder": pySparkHyperParameterResultObj.get_keep_columns() }) bestModel = pySparkHyperParameterResultObj.getBestModel() prediction = pySparkHyperParameterResultObj.getBestPrediction() else: tvs = TrainValidationSplit( estimator=estimator, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator() if levels == 2 else MulticlassClassificationEvaluator(), trainRatio=train_test_ratio) tvspnb = tvs.fit(trainingData) prediction = tvspnb.transform(validationData) bestModel = tvspnb.bestModel modelmanagement_ = { param[0].name: param[1] for param in bestModel.stages[2].extractParamMap().items() } MLUtils.save_pipeline_or_model(bestModel, model_filepath) predsAndLabels = prediction.select(['prediction', 'label']).rdd.map(tuple) # label_classes = prediction.select("label").distinct().collect() # label_classes = prediction.agg((F.collect_set('label').alias('label'))).first().asDict()['label'] #results = transformed.select(["prediction","label"]) # if len(label_classes) > 2: # metrics = MulticlassMetrics(predsAndLabels) # accuracy of the model # else: # metrics = BinaryClassificationMetrics(predsAndLabels) posLabel = inverseLabelMapping[self._targetLevel] metrics = MulticlassMetrics(predsAndLabels) trainingTime = time.time() - st f1_score = metrics.fMeasure(inverseLabelMapping[self._targetLevel], 1.0) precision = metrics.precision(inverseLabelMapping[self._targetLevel]) recall = metrics.recall(inverseLabelMapping[self._targetLevel]) accuracy = metrics.accuracy print(f1_score, precision, recall, accuracy) #gain chart implementation def cal_prob_eval(x): if len(x) == 1: if x == posLabel: return (float(x[1])) else: return (float(1 - x[1])) else: return (float(x[int(posLabel)])) column_name = 'probability' def y_prob_for_eval_udf(): return udf(lambda x: cal_prob_eval(x)) prediction = prediction.withColumn( "y_prob_for_eval", y_prob_for_eval_udf()(col(column_name))) try: pys_df = prediction.select( 
['y_prob_for_eval', 'prediction', 'label']) gain_lift_ks_obj = GainLiftKS(pys_df, 'y_prob_for_eval', 'prediction', 'label', posLabel, self._spark) gain_lift_KS_dataframe = gain_lift_ks_obj.Run().toPandas() except: try: temp_df = pys_df.toPandas() gain_lift_ks_obj = GainLiftKS(temp_df, 'y_prob_for_eval', 'prediction', 'label', posLabel, self._spark) gain_lift_KS_dataframe = gain_lift_ks_obj.Rank_Ordering() except: print("gain chant failed") gain_lift_KS_dataframe = None #feature_importance = MLUtils.calculate_sparkml_feature_importance(df, bestModel.stages[-1], categorical_columns, numerical_columns) act_list = prediction.select('label').collect() actual = [int(row.label) for row in act_list] pred_list = prediction.select('prediction').collect() predicted = [int(row.prediction) for row in pred_list] prob_list = prediction.select('probability').collect() probability = [list(row.probability) for row in prob_list] # objs = {"trained_model":bestModel,"actual":prediction.select('label'),"predicted":prediction.select('prediction'), # "probability":prediction.select('probability'),"feature_importance":None, # "featureList":list(categorical_columns) + list(numerical_columns),"labelMapping":labelMapping} objs = { "trained_model": bestModel, "actual": actual, "predicted": predicted, "probability": probability, "feature_importance": None, "featureList": list(categorical_columns) + list(numerical_columns), "labelMapping": labelMapping } conf_mat_ar = metrics.confusionMatrix().toArray() print(conf_mat_ar) confusion_matrix = {} for i in range(len(conf_mat_ar)): confusion_matrix[labelMapping[i]] = {} for j, val in enumerate(conf_mat_ar[i]): confusion_matrix[labelMapping[i]][labelMapping[j]] = val print(confusion_matrix) # accuracy of the model '''ROC CURVE IMPLEMENTATION''' y_prob = probability y_score = predicted y_test = actual logLoss = log_loss(y_test, y_prob) if levels <= 2: positive_label_probs = [] for val in y_prob: positive_label_probs.append(val[int(posLabel)]) roc_auc = roc_auc_score(y_test, y_score) roc_data_dict = { "y_score": y_score, "y_test": y_test, "positive_label_probs": positive_label_probs, "y_prob": y_prob, "positive_label": posLabel } roc_dataframe = pd.DataFrame({ "y_score": y_score, "y_test": y_test, "positive_label_probs": positive_label_probs }) #roc_dataframe.to_csv("binary_roc_data.csv") fpr, tpr, thresholds = roc_curve(y_test, positive_label_probs, pos_label=posLabel) roc_df = pd.DataFrame({ "FPR": fpr, "TPR": tpr, "thresholds": thresholds }) roc_df["tpr-fpr"] = roc_df["TPR"] - roc_df["FPR"] optimal_index = np.argmax(np.array(roc_df["tpr-fpr"])) fpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "FPR"] tpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "TPR"] rounded_roc_df = roc_df.round({'FPR': 2, 'TPR': 4}) unique_fpr = rounded_roc_df["FPR"].unique() final_roc_df = rounded_roc_df.groupby("FPR", as_index=False)[["TPR" ]].mean() endgame_roc_df = final_roc_df.round({'FPR': 2, 'TPR': 3}) elif levels > 2: positive_label_probs = [] for val in y_prob: positive_label_probs.append(val[int(posLabel)]) y_test_roc_multi = [] for val in y_test: if val != posLabel: val = posLabel + 1 y_test_roc_multi.append(val) else: y_test_roc_multi.append(val) y_score_roc_multi = [] for val in y_score: if val != posLabel: val = posLabel + 1 y_score_roc_multi.append(val) else: y_score_roc_multi.append(val) roc_auc = roc_auc_score(y_test_roc_multi, y_score_roc_multi) fpr, tpr, thresholds = roc_curve(y_test_roc_multi, positive_label_probs, pos_label=posLabel) roc_df = pd.DataFrame({ 
"FPR": fpr, "TPR": tpr, "thresholds": thresholds }) roc_df["tpr-fpr"] = roc_df["TPR"] - roc_df["FPR"] optimal_index = np.argmax(np.array(roc_df["tpr-fpr"])) fpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "FPR"] tpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "TPR"] rounded_roc_df = roc_df.round({'FPR': 2, 'TPR': 4}) unique_fpr = rounded_roc_df["FPR"].unique() final_roc_df = rounded_roc_df.groupby("FPR", as_index=False)[["TPR" ]].mean() endgame_roc_df = final_roc_df.round({'FPR': 2, 'TPR': 3}) # Calculating prediction_split val_cnts = prediction.groupBy('label').count() val_cnts = map(lambda row: row.asDict(), val_cnts.collect()) prediction_split = {} total_nos = prediction.select('label').count() for item in val_cnts: print(labelMapping) classname = labelMapping[item['label']] prediction_split[classname] = round( item['count'] * 100 / float(total_nos), 2) if not algoSetting.is_hyperparameter_tuning_enabled(): modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1" modelFilepathArr = model_filepath.split("/")[:-1] modelFilepathArr.append(modelName) bestModel.save("/".join(modelFilepathArr)) runtime = round((time.time() - st_global), 2) try: print(pmml_filepath) pmmlBuilder = PMMLBuilder(self._spark, trainingData, bestModel).putOption( clf, 'compact', True) pmmlBuilder.buildFile(pmml_filepath) pmmlfile = open(pmml_filepath, "r") pmmlText = pmmlfile.read() pmmlfile.close() self._result_setter.update_pmml_object({self._slug: pmmlText}) except Exception as e: print("PMML failed...", str(e)) pass cat_cols = list(set(categorical_columns) - {result_column}) self._model_summary = MLModelSummary() self._model_summary.set_algorithm_name("Naive Bayes") self._model_summary.set_algorithm_display_name("Naive Bayes") self._model_summary.set_slug(self._slug) self._model_summary.set_training_time(runtime) self._model_summary.set_confusion_matrix(confusion_matrix) # self._model_summary.set_feature_importance(objs["feature_importance"]) self._model_summary.set_feature_list(objs["featureList"]) self._model_summary.set_model_accuracy(accuracy) self._model_summary.set_training_time(round((time.time() - st), 2)) self._model_summary.set_precision_recall_stats([precision, recall]) self._model_summary.set_model_precision(precision) self._model_summary.set_model_recall(recall) self._model_summary.set_model_F1_score(f1_score) self._model_summary.set_model_log_loss(logLoss) self._model_summary.set_gain_lift_KS_data(gain_lift_KS_dataframe) self._model_summary.set_AUC_score(roc_auc) self._model_summary.set_target_variable(result_column) self._model_summary.set_prediction_split(prediction_split) self._model_summary.set_validation_method("KFold") self._model_summary.set_level_map_dict(objs["labelMapping"]) # self._model_summary.set_model_features(list(set(x_train.columns)-set([result_column]))) self._model_summary.set_model_features(objs["featureList"]) self._model_summary.set_level_counts( self._metaParser.get_unique_level_dict( list(set(categorical_columns)) + [result_column])) #self._model_summary.set_num_trees(objs['trained_model'].getNumTrees) self._model_summary.set_num_rules(300) self._model_summary.set_target_level(self._targetLevel) if not algoSetting.is_hyperparameter_tuning_enabled(): modelDropDownObj = { "name": self._model_summary.get_algorithm_name(), "evaluationMetricValue": accuracy, "evaluationMetricName": "accuracy", "slug": self._model_summary.get_slug(), "Model Id": modelName } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": 
self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } else: modelDropDownObj = { "name": self._model_summary.get_algorithm_name(), "evaluationMetricValue": accuracy, "evaluationMetricName": "accuracy", "slug": self._model_summary.get_slug(), "Model Id": resultArray[0]["Model Id"] } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } self._model_management = MLModelSummary() print(modelmanagement_) self._model_management.set_job_type( self._dataframe_context.get_job_name()) #Project name self._model_management.set_training_status( data="completed") # training status self._model_management.set_target_level( self._targetLevel) # target column value self._model_management.set_training_time(runtime) # run time self._model_management.set_model_accuracy(round(metrics.accuracy, 2)) # self._model_management.set_model_accuracy(round(metrics.accuracy_score(objs["actual"], objs["predicted"]),2))#accuracy self._model_management.set_algorithm_name( "NaiveBayes") #algorithm name self._model_management.set_validation_method( str(validationDict["displayName"]) + "(" + str(validationDict["value"]) + ")") #validation method self._model_management.set_target_variable( result_column) #target column name self._model_management.set_creation_date(data=str( datetime.now().strftime('%b %d ,%Y %H:%M '))) #creation date self._model_management.set_datasetName(self._datasetName) self._model_management.set_model_type(data='classification') self._model_management.set_var_smoothing( data=int(modelmanagement_['smoothing'])) # self._model_management.set_no_of_independent_variables(df) #no of independent varables modelManagementSummaryJson = [ ["Project Name", self._model_management.get_job_type()], ["Algorithm", self._model_management.get_algorithm_name()], ["Training Status", self._model_management.get_training_status()], ["Accuracy", self._model_management.get_model_accuracy()], ["RunTime", self._model_management.get_training_time()], #["Owner",None], ["Created On", self._model_management.get_creation_date()] ] modelManagementModelSettingsJson = [ ["Training Dataset", self._model_management.get_datasetName()], ["Target Column", self._model_management.get_target_variable()], ["Target Column Value", self._model_management.get_target_level()], ["Algorithm", self._model_management.get_algorithm_name()], [ "Model Validation", self._model_management.get_validation_method() ], ["Model Type", self._model_management.get_model_type()], ["Smoothing", self._model_management.get_var_smoothing()], #,["priors",self._model_management.get_priors()] #,["var_smoothing",self._model_management.get_var_smoothing()] ] nbOverviewCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_card_overview( self._model_management, modelManagementSummaryJson, modelManagementModelSettingsJson) ] nbPerformanceCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_cards( self._model_summary, endgame_roc_df) ] nbDeploymentCards = [ 
json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_deploy_empty_card() ] nbCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_summary_cards(self._model_summary) ] NB_Overview_Node = NarrativesTree() NB_Overview_Node.set_name("Overview") NB_Performance_Node = NarrativesTree() NB_Performance_Node.set_name("Performance") NB_Deployment_Node = NarrativesTree() NB_Deployment_Node.set_name("Deployment") for card in nbOverviewCards: NB_Overview_Node.add_a_card(card) for card in nbPerformanceCards: NB_Performance_Node.add_a_card(card) for card in nbDeploymentCards: NB_Deployment_Node.add_a_card(card) for card in nbCards: self._prediction_narrative.add_a_card(card) self._result_setter.set_model_summary({ "naivebayes": json.loads( CommonUtils.convert_python_object_to_json(self._model_summary)) }) self._result_setter.set_naive_bayes_model_summary(modelSummaryJson) self._result_setter.set_nb_cards(nbCards) self._result_setter.set_nb_nodes( [NB_Overview_Node, NB_Performance_Node, NB_Deployment_Node]) self._result_setter.set_nb_fail_card({ "Algorithm_Name": "Naive Bayes", "success": "True" }) CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "completion", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") print("\n\n") def Predict(self): self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight( ) self._scriptStages = { "initialization": { "summary": "Initialized the Naive Bayes Scripts", "weight": 2 }, "prediction": { "summary": "Spark ML Naive Bayes Model Prediction Finished", "weight": 2 }, "frequency": { "summary": "descriptive analysis finished", "weight": 2 }, "chisquare": { "summary": "chi Square analysis finished", "weight": 4 }, "completion": { "summary": "all analysis finished", "weight": 4 }, } self._completionStatus += self._scriptWeightDict[self._analysisName][ "total"] * self._scriptStages["initialization"]["weight"] / 10 progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\ "initialization",\ "info",\ self._scriptStages["initialization"]["summary"],\ self._completionStatus,\ self._completionStatus) CommonUtils.save_progress_message(self._messageURL, progressMessage) self._dataframe_context.update_completion_status( self._completionStatus) SQLctx = SQLContext(sparkContext=self._spark.sparkContext, sparkSession=self._spark) dataSanity = True level_counts_train = self._dataframe_context.get_level_count_dict() categorical_columns = self._dataframe_helper.get_string_columns() numerical_columns = self._dataframe_helper.get_numeric_columns() time_dimension_columns = self._dataframe_helper.get_timestamp_columns() result_column = self._dataframe_context.get_result_column() categorical_columns = [ x for x in categorical_columns if x != result_column ] level_counts_score = CommonUtils.get_level_count_dict( self._data_frame, categorical_columns, self._dataframe_context.get_column_separator(), output_type="dict", dataType="spark") for key in level_counts_train: if key in level_counts_score: if level_counts_train[key] != level_counts_score[key]: dataSanity = False else: dataSanity = False test_data_path = self._dataframe_context.get_input_file() score_data_path = self._dataframe_context.get_score_path( ) + "/data.csv" trained_model_path = self._dataframe_context.get_model_path() trained_model_path = "/".join( 
trained_model_path.split("/")[:-1] ) + "/" + self._slug + "/" + self._dataframe_context.get_model_for_scoring( ) # score_summary_path = self._dataframe_context.get_score_path()+"/Summary/summary.json" pipelineModel = MLUtils.load_pipeline(trained_model_path) df = self._data_frame transformed = pipelineModel.transform(df) label_indexer_dict = MLUtils.read_string_indexer_mapping( trained_model_path, SQLctx) prediction_to_levels = udf(lambda x: label_indexer_dict[x], StringType()) transformed = transformed.withColumn( result_column, prediction_to_levels(transformed.prediction)) if "probability" in transformed.columns: probability_dataframe = transformed.select( [result_column, "probability"]).toPandas() probability_dataframe = probability_dataframe.rename( index=str, columns={result_column: "predicted_class"}) probability_dataframe[ "predicted_probability"] = probability_dataframe[ "probability"].apply(lambda x: max(x)) self._score_summary[ "prediction_split"] = MLUtils.calculate_scored_probability_stats( probability_dataframe) self._score_summary["result_column"] = result_column scored_dataframe = transformed.select( categorical_columns + time_dimension_columns + numerical_columns + [result_column, "probability"]).toPandas() scored_dataframe['predicted_probability'] = probability_dataframe[ "predicted_probability"].values # scored_dataframe = scored_dataframe.rename(index=str, columns={"predicted_probability": "probability"}) else: self._score_summary["prediction_split"] = [] self._score_summary["result_column"] = result_column scored_dataframe = transformed.select(categorical_columns + time_dimension_columns + numerical_columns + [result_column]).toPandas() labelMappingDict = self._dataframe_context.get_label_map() if score_data_path.startswith("file"): score_data_path = score_data_path[7:] scored_dataframe.to_csv(score_data_path, header=True, index=False) uidCol = self._dataframe_context.get_uid_column() if uidCol == None: uidCols = self._metaParser.get_suggested_uid_columns() if len(uidCols) > 0: uidCol = uidCols[0] uidTableData = [] predictedClasses = list(scored_dataframe[result_column].unique()) if uidCol: if uidCol in df.columns: for level in predictedClasses: levelDf = scored_dataframe[scored_dataframe[result_column] == level] levelDf = levelDf[[ uidCol, "predicted_probability", result_column ]] levelDf.sort_values(by="predicted_probability", ascending=False, inplace=True) levelDf["predicted_probability"] = levelDf[ "predicted_probability"].apply( lambda x: humanize.apnumber(x * 100) + "%" if x * 100 >= 10 else str(int(x * 100)) + "%") uidTableData.append(levelDf[:5]) uidTableData = pd.concat(uidTableData) uidTableData = [list(arr) for arr in list(uidTableData.values)] uidTableData = [[uidCol, "Probability", result_column] ] + uidTableData uidTable = TableData() uidTable.set_table_width(25) uidTable.set_table_data(uidTableData) uidTable.set_table_type("normalHideColumn") self._result_setter.set_unique_identifier_table( json.loads( CommonUtils.convert_python_object_to_json(uidTable))) self._completionStatus += self._scriptWeightDict[self._analysisName][ "total"] * self._scriptStages["prediction"]["weight"] / 10 progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\ "prediction",\ "info",\ self._scriptStages["prediction"]["summary"],\ self._completionStatus,\ self._completionStatus) CommonUtils.save_progress_message(self._messageURL, progressMessage) self._dataframe_context.update_completion_status( self._completionStatus) print("STARTING DIMENSION ANALYSIS 
...") columns_to_keep = [] columns_to_drop = [] columns_to_keep = self._dataframe_context.get_score_consider_columns() if len(columns_to_keep) > 0: columns_to_drop = list(set(df.columns) - set(columns_to_keep)) else: columns_to_drop += ["predicted_probability"] scored_df = transformed.select(categorical_columns + time_dimension_columns + numerical_columns + [result_column]) columns_to_drop = [ x for x in columns_to_drop if x in scored_df.columns ] modified_df = scored_df.select( [x for x in scored_df.columns if x not in columns_to_drop]) resultColLevelCount = dict( modified_df.groupby(result_column).count().collect()) self._metaParser.update_column_dict( result_column, { "LevelCount": resultColLevelCount, "numberOfUniqueValues": len(resultColLevelCount.keys()) }) self._dataframe_context.set_story_on_scored_data(True) self._dataframe_context.update_consider_columns(columns_to_keep) df_helper = DataFrameHelper(modified_df, self._dataframe_context, self._metaParser) df_helper.set_params() spark_scored_df = df_helper.get_data_frame() if len(predictedClasses) >= 2: try: fs = time.time() df_decision_tree_obj = DecisionTrees( spark_scored_df, df_helper, self._dataframe_context, self._spark, self._metaParser, scriptWeight=self._scriptWeightDict, analysisName=self._analysisName).test_all( dimension_columns=[result_column]) narratives_obj = CommonUtils.as_dict( DecisionTreeNarrative(result_column, df_decision_tree_obj, self._dataframe_helper, self._dataframe_context, self._metaParser, self._result_setter, story_narrative=None, analysisName=self._analysisName, scriptWeight=self._scriptWeightDict)) print(narratives_obj) except Exception as e: print("DecisionTree Analysis Failed ", str(e)) else: data_dict = { "npred": len(predictedClasses), "nactual": len(labelMappingDict.values()) } if data_dict["nactual"] > 2: levelCountDict[predictedClasses[0]] = resultColLevelCount[ predictedClasses[0]] levelCountDict["Others"] = sum([ v for k, v in resultColLevelCount.items() if k != predictedClasses[0] ]) else: levelCountDict = resultColLevelCount otherClass = list( set(labelMappingDict.values()) - set(predictedClasses))[0] levelCountDict[otherClass] = 0 print(levelCountDict) total = float( sum([x for x in levelCountDict.values() if x != None])) levelCountTuple = [({ "name": k, "count": v, "percentage": humanize.apnumber(v * 100 / total) + "%" }) for k, v in levelCountDict.items() if v != None] levelCountTuple = sorted(levelCountTuple, key=lambda x: x["count"], reverse=True) data_dict["blockSplitter"] = "|~NEWBLOCK~|" data_dict["targetcol"] = result_column data_dict["nlevel"] = len(levelCountDict.keys()) data_dict["topLevel"] = levelCountTuple[0] data_dict["secondLevel"] = levelCountTuple[1] maincardSummary = NarrativesUtils.get_template_output( "/apps/", 'scorewithoutdtree.html', data_dict) main_card = NormalCard() main_card_data = [] main_card_narrative = NarrativesUtils.block_splitter( maincardSummary, "|~NEWBLOCK~|") main_card_data += main_card_narrative chartData = NormalChartData([levelCountDict]).get_data() chartJson = ChartJson(data=chartData) chartJson.set_title(result_column) chartJson.set_chart_type("donut") mainCardChart = C3ChartData(data=chartJson) mainCardChart.set_width_percent(33) main_card_data.append(mainCardChart) uidTable = self._result_setter.get_unique_identifier_table() if uidTable != None: main_card_data.append(uidTable) main_card.set_card_data(main_card_data) main_card.set_card_name( "Predicting Key Drivers of {}".format(result_column)) 
self._result_setter.set_score_dtree_cards([main_card], {})
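# ----------------------------------------------------------------------------
# Illustrative sketch (not called anywhere): the core Spark ML flow used in
# NaiveBayesPysparkScript.Train() above -- a feature pipeline, StringIndexer and
# NaiveBayes wrapped in one Pipeline estimator, tuned over a smoothing grid with
# TrainValidationSplit, then evaluated with MulticlassMetrics. The grid values
# are assumptions for the sketch; the real script builds the feature pipeline via
# MLUtils.create_pyspark_ml_pipeline (assumed to output a "features" vector
# column) and reads the parameter grid from the algorithm settings.
def _naive_bayes_tvs_sketch(training_df, validation_df, feature_pipeline, result_column, train_test_ratio=0.8):
    from pyspark.ml import Pipeline
    from pyspark.ml.classification import NaiveBayes
    from pyspark.ml.feature import StringIndexer
    from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    from pyspark.mllib.evaluation import MulticlassMetrics

    label_indexer = StringIndexer(inputCol=result_column, outputCol="label")
    clf = NaiveBayes()
    estimator = Pipeline(stages=[feature_pipeline, label_indexer, clf])

    param_grid = ParamGridBuilder().addGrid(clf.smoothing, [0.5, 1.0, 2.0]).build()
    tvs = TrainValidationSplit(estimator=estimator,
                               estimatorParamMaps=param_grid,
                               evaluator=MulticlassClassificationEvaluator(),
                               trainRatio=train_test_ratio)
    tvs_model = tvs.fit(training_df)
    best_model = tvs_model.bestModel

    prediction = best_model.transform(validation_df)
    preds_and_labels = prediction.select(["prediction", "label"]).rdd.map(tuple)
    metrics = MulticlassMetrics(preds_and_labels)
    return best_model, metrics.accuracy
# ----------------------------------------------------------------------------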
class TensorFlowRegScript(object): def __init__(self, data_frame, df_helper, df_context, spark, prediction_narrative, result_setter, meta_parser, mlEnvironment="sklearn"): self._metaParser = meta_parser self._prediction_narrative = prediction_narrative self._result_setter = result_setter self._data_frame = data_frame self._dataframe_helper = df_helper self._dataframe_context = df_context self._spark = spark self._model_summary = MLModelSummary() self._score_summary = {} self._slug = GLOBALSETTINGS.MODEL_SLUG_MAPPING[ "Neural Network (TensorFlow)"] self._analysisName = "Neural Network (TensorFlow)" self._dataframe_context.set_analysis_name(self._analysisName) self._mlEnv = mlEnvironment self._datasetName = CommonUtils.get_dataset_name( self._dataframe_context.CSV_FILE) self._completionStatus = self._dataframe_context.get_completion_status( ) print(self._completionStatus, "initial completion status") self._messageURL = self._dataframe_context.get_message_url() self._scriptWeightDict = self._dataframe_context.get_ml_model_training_weight( ) self._ignoreMsg = self._dataframe_context.get_message_ignore() self._scriptStages = { "initialization": { "summary": "Initialized The Neural Network (TensorFlow) Regression Scripts", "weight": 1 }, "training": { "summary": "Neural Network (TensorFlow) Regression Model Training Started", "weight": 2 }, "completion": { "summary": "Neural Network (TensorFlow) Regression Model Training Finished", "weight": 1 }, } def Train(self): st_global = time.time() CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") appType = self._dataframe_context.get_app_type() algosToRun = self._dataframe_context.get_algorithms_to_run() algoSetting = [ x for x in algosToRun if x.get_algorithm_slug() == self._slug ][0] categorical_columns = self._dataframe_helper.get_string_columns() uid_col = self._dataframe_context.get_uid_column() if self._metaParser.check_column_isin_ignored_suggestion(uid_col): categorical_columns = list(set(categorical_columns) - {uid_col}) allDateCols = self._dataframe_context.get_date_columns() categorical_columns = list(set(categorical_columns) - set(allDateCols)) print(categorical_columns) result_column = self._dataframe_context.get_result_column() numerical_columns = self._dataframe_helper.get_numeric_columns() numerical_columns = [ x for x in numerical_columns if x != result_column ] model_path = self._dataframe_context.get_model_path() if model_path.startswith("file"): model_path = model_path[7:] validationDict = self._dataframe_context.get_validation_dict() print("model_path", model_path) pipeline_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/pipeline/" model_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/model" pmml_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/modelPmml" df = self._data_frame if self._mlEnv == "spark": pass elif self._mlEnv == "sklearn": model_filepath = model_path + "/" + self._slug + "/model.pkl" x_train, x_test, y_train, y_test = self._dataframe_helper.get_train_test_data( ) x_train = MLUtils.create_dummy_columns( x_train, [x for x in categorical_columns if x != result_column]) x_test = MLUtils.create_dummy_columns( x_test, [x for x in categorical_columns if x != result_column]) x_test = MLUtils.fill_missing_columns(x_test, x_train.columns, result_column) st = time.time() 
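# Illustrative only: the train/test encoding contract assumed above.
# MLUtils.create_dummy_columns and MLUtils.fill_missing_columns are the project's
# own helpers; a minimal pandas sketch of the same idea (one-hot encode train and
# test independently, then re-align the test frame to the training columns so
# unseen levels become all-zero columns) would be:
#
#   x_train = pd.get_dummies(x_train, columns=categorical_columns)
#   x_test = pd.get_dummies(x_test, columns=categorical_columns)
#   x_test = x_test.reindex(columns=x_train.columns, fill_value=0)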
CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "training", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") if algoSetting.is_hyperparameter_tuning_enabled(): pass else: self._result_setter.set_hyper_parameter_results( self._slug, None) evaluationMetricDict = algoSetting.get_evaluvation_metric( Type="Regression") evaluationMetricDict[ "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] params_tf = algoSetting.get_tf_params_dict() algoParams = algoSetting.get_params_dict() algoParams = {k: v for k, v in list(algoParams.items())} model = tf.keras.models.Sequential() first_layer_flag = True for i in range(len(list( params_tf['hidden_layer_info'].keys()))): if params_tf['hidden_layer_info'][str( i)]["layer"] == "Dense": if first_layer_flag: model.add( tf.keras.layers.Dense( params_tf['hidden_layer_info'][str( i)]["units"], activation=params_tf['hidden_layer_info'][ str(i)]["activation"], input_shape=(len(x_train.columns), ), use_bias=params_tf['hidden_layer_info'][ str(i)]["use_bias"], kernel_initializer=params_tf[ 'hidden_layer_info'][str( i)]["kernel_initializer"], bias_initializer=params_tf[ 'hidden_layer_info'][str( i)]["bias_initializer"], kernel_regularizer=params_tf[ 'hidden_layer_info'][str( i)]["kernel_regularizer"], bias_regularizer=params_tf[ 'hidden_layer_info'][str( i)]["bias_regularizer"], activity_regularizer=params_tf[ 'hidden_layer_info'][str( i)]["activity_regularizer"], kernel_constraint=params_tf[ 'hidden_layer_info'][str( i)]["kernel_constraint"], bias_constraint=params_tf[ 'hidden_layer_info'][str( i)]["bias_constraint"])) try: if params_tf['hidden_layer_info'][str( i)]["batch_normalization"] == "True": model.add( tf.keras.layers.BatchNormalization()) except: print( "BATCH_NORM_FAILED ##########################" ) pass first_layer_flag = False else: model.add( tf.keras.layers.Dense( params_tf['hidden_layer_info'][str( i)]["units"], activation=params_tf['hidden_layer_info'][ str(i)]["activation"], use_bias=params_tf['hidden_layer_info'][ str(i)]["use_bias"], kernel_initializer=params_tf[ 'hidden_layer_info'][str( i)]["kernel_initializer"], bias_initializer=params_tf[ 'hidden_layer_info'][str( i)]["bias_initializer"], kernel_regularizer=params_tf[ 'hidden_layer_info'][str( i)]["kernel_regularizer"], bias_regularizer=params_tf[ 'hidden_layer_info'][str( i)]["bias_regularizer"], activity_regularizer=params_tf[ 'hidden_layer_info'][str( i)]["activity_regularizer"], kernel_constraint=params_tf[ 'hidden_layer_info'][str( i)]["kernel_constraint"], bias_constraint=params_tf[ 'hidden_layer_info'][str( i)]["bias_constraint"])) try: if params_tf['hidden_layer_info'][str( i)]["batch_normalization"] == "True": model.add( tf.keras.layers.BatchNormalization()) except: print( "BATCH_NORM_FAILED ##########################" ) pass elif params_tf['hidden_layer_info'][str( i)]["layer"] == "Dropout": model.add( tf.keras.layers.Dropout( float(params_tf['hidden_layer_info'][str(i)] ["rate"]))) elif params_tf['hidden_layer_info'][str( i)]["layer"] == "Lambda": if params_tf['hidden_layer_info'][str( i)]["lambda"] == "Addition": model.add( tf.keras.layers.Lambda(lambda x: x + int( params_tf['hidden_layer_info'][str(i)][ "units"]))) if params_tf['hidden_layer_info'][str( i)]["lambda"] == "Multiplication": model.add( tf.keras.layers.Lambda(lambda x: x * int( params_tf['hidden_layer_info'][str(i)][ "units"]))) if 
params_tf['hidden_layer_info'][str( i)]["lambda"] == "Subtraction": model.add( tf.keras.layers.Lambda(lambda x: x - int( params_tf['hidden_layer_info'][str(i)][ "units"]))) if params_tf['hidden_layer_info'][str( i)]["lambda"] == "Division": model.add( tf.keras.layers.Lambda(lambda x: old_div( x, int(params_tf['hidden_layer_info'][str(i)][ "units"])))) model.compile(optimizer=algoParams["optimizer"], loss=algoParams["loss"], metrics=[algoParams['metrics']]) model.fit(x_train, y_train, epochs=algoParams["number_of_epochs"], verbose=1, batch_size=algoParams["batch_size"]) bestEstimator = model print(model.summary()) trainingTime = time.time() - st y_score = bestEstimator.predict(x_test) y_score = list(y_score.flatten()) try: y_prob = bestEstimator.predict_proba(x_test) except: y_prob = [0] * len(y_score) featureImportance = {} objs = { "trained_model": bestEstimator, "actual": y_test, "predicted": y_score, "probability": y_prob, "feature_importance": featureImportance, "featureList": list(x_train.columns), "labelMapping": {} } #featureImportance = objs["trained_model"].feature_importances_ #featuresArray = [(col_name, featureImportance[idx]) for idx, col_name in enumerate(x_train.columns)] featuresArray = [] if not algoSetting.is_hyperparameter_tuning_enabled(): modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1" modelFilepathArr = model_filepath.split("/")[:-1] modelFilepathArr.append(modelName + ".h5") objs["trained_model"].save("/".join(modelFilepathArr)) #joblib.dump(objs["trained_model"],"/".join(modelFilepathArr)) metrics = {} metrics["r2"] = r2_score(y_test, y_score) metrics["neg_mean_squared_error"] = mean_squared_error( y_test, y_score) metrics["neg_mean_absolute_error"] = mean_absolute_error( y_test, y_score) metrics["RMSE"] = sqrt(metrics["neg_mean_squared_error"]) metrics["explained_variance_score"] = explained_variance_score( y_test, y_score) transformed = pd.DataFrame({ "prediction": y_score, result_column: y_test }) transformed["difference"] = transformed[ result_column] - transformed["prediction"] transformed["mape"] = old_div( np.abs(transformed["difference"]) * 100, transformed[result_column]) sampleData = None nrows = transformed.shape[0] if nrows > 100: sampleData = transformed.sample(n=100, random_state=420) else: sampleData = transformed print(sampleData.head()) if transformed["mape"].max() > 100: GLOBALSETTINGS.MAPEBINS.append(transformed["mape"].max()) mapeCountArr = list( pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS). value_counts().to_dict().items()) GLOBALSETTINGS.MAPEBINS.pop(5) else: mapeCountArr = list( pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS). 
value_counts().to_dict().items()) mapeStatsArr = [(str(idx), dictObj) for idx, dictObj in enumerate( sorted([{ "count": x[1], "splitRange": (x[0].left, x[0].right) } for x in mapeCountArr], key=lambda x: x["splitRange"][0]))] print(mapeStatsArr) print(mapeCountArr) predictionColSummary = transformed["prediction"].describe( ).to_dict() quantileBins = [ predictionColSummary["min"], predictionColSummary["25%"], predictionColSummary["50%"], predictionColSummary["75%"], predictionColSummary["max"] ] print(quantileBins) quantileBins = sorted(list(set(quantileBins))) transformed["quantileBinId"] = pd.cut(transformed["prediction"], quantileBins) quantileDf = transformed.groupby("quantileBinId").agg({ "prediction": [np.sum, np.mean, np.size] }).reset_index() quantileDf.columns = ["prediction", "sum", "mean", "count"] print(quantileDf) quantileArr = list(quantileDf.T.to_dict().items()) quantileSummaryArr = [(obj[0], { "splitRange": (obj[1]["prediction"].left, obj[1]["prediction"].right), "count": obj[1]["count"], "mean": obj[1]["mean"], "sum": obj[1]["sum"] }) for obj in quantileArr] print(quantileSummaryArr) runtime = round((time.time() - st_global), 2) self._model_summary.set_model_type("regression") self._model_summary.set_algorithm_name( "Neural Network (TensorFlow)") self._model_summary.set_algorithm_display_name( "Neural Network (TensorFlow)") self._model_summary.set_slug(self._slug) self._model_summary.set_training_time(runtime) self._model_summary.set_training_time(trainingTime) self._model_summary.set_target_variable(result_column) self._model_summary.set_validation_method( validationDict["displayName"]) self._model_summary.set_model_evaluation_metrics(metrics) self._model_summary.set_model_params(params_tf) self._model_summary.set_quantile_summary(quantileSummaryArr) self._model_summary.set_mape_stats(mapeStatsArr) self._model_summary.set_sample_data(sampleData.to_dict()) self._model_summary.set_feature_importance(featuresArray) self._model_summary.set_feature_list(list(x_train.columns)) self._model_summary.set_model_mse( metrics["neg_mean_squared_error"]) self._model_summary.set_model_mae( metrics["neg_mean_absolute_error"]) self._model_summary.set_rmse(metrics["RMSE"]) self._model_summary.set_model_rsquared(metrics["r2"]) self._model_summary.set_model_exp_variance_score( metrics["explained_variance_score"]) try: pmml_filepath = str(model_path) + "/" + str( self._slug) + "/traindeModel.pmml" modelPmmlPipeline = PMMLPipeline([("pretrained-estimator", objs["trained_model"])]) modelPmmlPipeline.target_field = result_column modelPmmlPipeline.active_fields = np.array( [col for col in x_train.columns if col != result_column]) sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True) pmmlfile = open(pmml_filepath, "r") pmmlText = pmmlfile.read() pmmlfile.close() self._result_setter.update_pmml_object({self._slug: pmmlText}) except: pass if algoSetting.is_hyperparameter_tuning_enabled(): modelDropDownObj = { "name": self._model_summary.get_algorithm_name(), "evaluationMetricValue": metrics[evaluationMetricDict["name"]], "evaluationMetricName": evaluationMetricDict["name"], "slug": self._model_summary.get_slug(), "Model Id": modelName } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } else: modelDropDownObj = { "name": 
self._model_summary.get_algorithm_name(), "evaluationMetricValue": metrics[evaluationMetricDict["name"]], "evaluationMetricName": evaluationMetricDict["name"], "slug": self._model_summary.get_slug(), "Model Id": modelName } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } modelmanagement_ = params_tf modelmanagement_.update(algoParams) self._model_management = MLModelSummary() if algoSetting.is_hyperparameter_tuning_enabled(): pass else: self._model_management.set_layer_info( data=modelmanagement_['hidden_layer_info']) self._model_management.set_loss_function( data=modelmanagement_['loss']) self._model_management.set_optimizer( data=modelmanagement_['optimizer']) self._model_management.set_batch_size( data=modelmanagement_['batch_size']) self._model_management.set_no_epochs( data=modelmanagement_['number_of_epochs']) self._model_management.set_model_evaluation_metrics( data=modelmanagement_['metrics']) self._model_management.set_job_type( self._dataframe_context.get_job_name()) #Project name self._model_management.set_training_status( data="completed") # training status self._model_management.set_no_of_independent_variables( data=x_train) #no of independent varables self._model_management.set_training_time(runtime) # run time self._model_management.set_rmse(metrics["RMSE"]) self._model_management.set_algorithm_name( "Neural Network (TensorFlow)") #algorithm name self._model_management.set_validation_method( str(validationDict["displayName"]) + "(" + str(validationDict["value"]) + ")") #validation method self._model_management.set_target_variable( result_column) #target column name self._model_management.set_creation_date(data=str( datetime.now().strftime('%b %d ,%Y %H:%M '))) #creation date self._model_management.set_datasetName(self._datasetName) modelManagementSummaryJson = [ ["Project Name", self._model_management.get_job_type()], ["Algorithm", self._model_management.get_algorithm_name()], ["Training Status", self._model_management.get_training_status()], ["RMSE", self._model_management.get_rmse()], ["RunTime", self._model_management.get_training_time()], #["Owner",None], ["Created On", self._model_management.get_creation_date()] ] if algoSetting.is_hyperparameter_tuning_enabled(): modelManagementModelSettingsJson = [] else: modelManagementModelSettingsJson = [ ["Training Dataset", self._model_management.get_datasetName()], [ "Target Column", self._model_management.get_target_variable() ], [ "Number Of Independent Variables", self._model_management.get_no_of_independent_variables() ], ["Algorithm", self._model_management.get_algorithm_name()], [ "Model Validation", self._model_management.get_validation_method() ], ["batch_size", str(self._model_management.get_batch_size())], ["Loss", self._model_management.get_loss_function()], ["Optimizer", self._model_management.get_optimizer()], ["Epochs", self._model_management.get_no_epochs()], [ "Metrics", self._model_management.get_model_evaluation_metrics() ] ] for i in range( len(list(modelmanagement_['hidden_layer_info'].keys()))): string = "" key = "layer No-" + str(i) + "-" + str( modelmanagement_["hidden_layer_info"][str(i)]["layer"] + "-") for j in modelmanagement_["hidden_layer_info"][str(i)]: modelManagementModelSettingsJson.append([ key + j + ":", 
modelmanagement_["hidden_layer_info"][str(i)][j] ]) print(modelManagementModelSettingsJson) tfregCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_summary_cards(self._model_summary) ] tfregPerformanceCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_cards_regression( self._model_summary) ] tfregOverviewCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_card_overview( self._model_management, modelManagementSummaryJson, modelManagementModelSettingsJson) ] tfregDeploymentCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_deploy_empty_card() ] TFReg_Overview_Node = NarrativesTree() TFReg_Overview_Node.set_name("Overview") TFReg_Performance_Node = NarrativesTree() TFReg_Performance_Node.set_name("Performance") TFReg_Deployment_Node = NarrativesTree() TFReg_Deployment_Node.set_name("Deployment") for card in tfregOverviewCards: TFReg_Overview_Node.add_a_card(card) for card in tfregPerformanceCards: TFReg_Performance_Node.add_a_card(card) for card in tfregDeploymentCards: TFReg_Deployment_Node.add_a_card(card) for card in tfregCards: self._prediction_narrative.add_a_card(card) self._result_setter.set_model_summary({ "Neural Network (TensorFlow)": json.loads( CommonUtils.convert_python_object_to_json(self._model_summary)) }) self._result_setter.set_tfreg_regression_model_summart( modelSummaryJson) self._result_setter.set_tfreg_cards(tfregCards) self._result_setter.set_tfreg_nodes([ TFReg_Overview_Node, TFReg_Performance_Node, TFReg_Deployment_Node ]) self._result_setter.set_tfreg_fail_card({ "Algorithm_Name": "Neural Network (TensorFlow)", "Success": "True" }) CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "completion", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") def Predict(self): self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight( ) self._scriptStages = { "initialization": { "summary": "Initialized The Neural Network (TensorFlow) Regression Scripts", "weight": 2 }, "predictionStart": { "summary": "Neural Network (TensorFlow) Regression Model Prediction Started", "weight": 2 }, "predictionFinished": { "summary": "Neural Network (TensorFlow) Regression Model Prediction Finished", "weight": 6 } } CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") SQLctx = SQLContext(sparkContext=self._spark.sparkContext, sparkSession=self._spark) dataSanity = True categorical_columns = self._dataframe_helper.get_string_columns() uid_col = self._dataframe_context.get_uid_column() if self._metaParser.check_column_isin_ignored_suggestion(uid_col): categorical_columns = list(set(categorical_columns) - {uid_col}) allDateCols = self._dataframe_context.get_date_columns() categorical_columns = list(set(categorical_columns) - set(allDateCols)) numerical_columns = self._dataframe_helper.get_numeric_columns() result_column = self._dataframe_context.get_result_column() test_data_path = self._dataframe_context.get_input_file() if self._mlEnv == "spark": pass elif self._mlEnv == "sklearn": CommonUtils.create_update_and_save_progress_message( 
self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "predictionStart", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") score_data_path = self._dataframe_context.get_score_path( ) + "/data.csv" trained_model_path = "file://" + self._dataframe_context.get_model_path( ) trained_model_path += "/" + self._dataframe_context.get_model_for_scoring( ) + ".h5" print("trained_model_path", trained_model_path) print("score_data_path", score_data_path) if trained_model_path.startswith("file"): trained_model_path = trained_model_path[7:] #trained_model = joblib.load(trained_model_path) trained_model = tf.keras.models.load_model(trained_model_path) model_columns = self._dataframe_context.get_model_features() print("model_columns", model_columns) df = self._data_frame.toPandas() # pandas_df = MLUtils.factorize_columns(df,[x for x in categorical_columns if x != result_column]) pandas_df = MLUtils.create_dummy_columns( df, [x for x in categorical_columns if x != result_column]) pandas_df = MLUtils.fill_missing_columns(pandas_df, model_columns, result_column) if uid_col: pandas_df = pandas_df[[ x for x in pandas_df.columns if x != uid_col ]] y_score = trained_model.predict(pandas_df) y_score = list(y_score.flatten()) scoreKpiArray = MLUtils.get_scored_data_summary(y_score) kpiCard = NormalCard() kpiCardData = [KpiData(data=x) for x in scoreKpiArray] kpiCard.set_card_data(kpiCardData) kpiCard.set_cente_alignment(True) print(CommonUtils.convert_python_object_to_json(kpiCard)) self._result_setter.set_kpi_card_regression_score(kpiCard) pandas_df[result_column] = y_score df[result_column] = y_score df.to_csv(score_data_path, header=True, index=False) CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "predictionFinished", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") print("STARTING Measure ANALYSIS ...") columns_to_keep = [] columns_to_drop = [] columns_to_keep = self._dataframe_context.get_score_consider_columns( ) if len(columns_to_keep) > 0: columns_to_drop = list(set(df.columns) - set(columns_to_keep)) else: columns_to_drop += ["predicted_probability"] columns_to_drop = [ x for x in columns_to_drop if x in df.columns and x != result_column ] print("columns_to_drop", columns_to_drop) pandas_scored_df = df[list(set(columns_to_keep + [result_column]))] spark_scored_df = SQLctx.createDataFrame(pandas_scored_df) # spark_scored_df.write.csv(score_data_path+"/data",mode="overwrite",header=True) # TODO update metadata for the newly created dataframe self._dataframe_context.update_consider_columns(columns_to_keep) print(spark_scored_df.printSchema()) df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context, self._metaParser) df_helper.set_params() df = df_helper.get_data_frame() # self._dataframe_context.set_dont_send_message(True) try: fs = time.time() descr_stats_obj = DescriptiveStatsScript( df, df_helper, self._dataframe_context, self._result_setter, self._spark, self._prediction_narrative, scriptWeight=self._scriptWeightDict, analysisName="Descriptive analysis") descr_stats_obj.Run() print("DescriptiveStats Analysis Done in ", time.time() - fs, " seconds.") except: print("Frequency Analysis Failed ") # try: # fs = time.time() # df_helper.fill_na_dimension_nulls() # df = df_helper.get_data_frame() # dt_reg = DecisionTreeRegressionScript(df, df_helper, self._dataframe_context, self._result_setter, 
self._spark,self._prediction_narrative,self._metaParser,scriptWeight=self._scriptWeightDict,analysisName="Predictive modeling") # dt_reg.Run() # print "DecisionTrees Analysis Done in ", time.time() - fs, " seconds." # except: # print "DTREE FAILED" try: fs = time.time() two_way_obj = TwoWayAnovaScript( df, df_helper, self._dataframe_context, self._result_setter, self._spark, self._prediction_narrative, self._metaParser, scriptWeight=self._scriptWeightDict, analysisName="Measure vs. Dimension") two_way_obj.Run() print("TwoWayAnova Analysis Done in ", time.time() - fs, " seconds.") except: print("Anova Analysis Failed")
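# --- Illustrative sketch (not part of the pipeline) ---------------------------------
# Train() above assembles the Keras network layer by layer from
# params_tf['hidden_layer_info'], a dict keyed by stringified layer index
# ("0", "1", ...). A trimmed-down version of that config-driven construction is
# sketched below; it only handles Dense / Dropout / BatchNormalization and a few of
# the keyword arguments, so it is an assumption about the shape of the config, not
# the full builder used above (the Lambda variants are omitted).
import tensorflow as tf

def _build_sequential_from_config(hidden_layer_info, n_features):
    """Build a tf.keras Sequential model from an index-keyed layer config dict."""
    model = tf.keras.models.Sequential()
    first = True
    for i in range(len(hidden_layer_info)):
        cfg = hidden_layer_info[str(i)]
        if cfg["layer"] == "Dense":
            kwargs = {"units": cfg["units"], "activation": cfg.get("activation")}
            if first:
                kwargs["input_shape"] = (n_features,)  # only the first layer needs this
                first = False
            model.add(tf.keras.layers.Dense(**kwargs))
            if cfg.get("batch_normalization") == "True":
                model.add(tf.keras.layers.BatchNormalization())
        elif cfg["layer"] == "Dropout":
            model.add(tf.keras.layers.Dropout(float(cfg["rate"])))
    return model

# Example config with the same shape as params_tf['hidden_layer_info'] (hypothetical):
# {"0": {"layer": "Dense", "units": 64, "activation": "relu", "batch_normalization": "True"},
#  "1": {"layer": "Dropout", "rate": "0.2"},
#  "2": {"layer": "Dense", "units": 1, "activation": "linear"}}
# -------------------------------------------------------------------------------------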
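# --- Illustrative sketch (not part of the pipeline) ---------------------------------
# Train() above derives the sklearn regression metrics from (y_test, y_score) and then
# bins the per-row MAPE with pd.cut against GLOBALSETTINGS.MAPEBINS, temporarily
# extending the bin edges when the worst error exceeds 100%. The same computations,
# stripped of the GLOBALSETTINGS plumbing, look roughly like this; `mape_bins` is a
# stand-in default and an assumption, not the platform's actual bin edges. The
# quantile summary above follows the same pd.cut / groupby pattern on the predictions.
import numpy as np
import pandas as pd
from math import sqrt
from sklearn.metrics import (r2_score, mean_squared_error,
                             mean_absolute_error, explained_variance_score)

def _regression_report(y_true, y_pred, mape_bins=(0, 5, 15, 25, 50, 100)):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mse = mean_squared_error(y_true, y_pred)
    metrics = {
        "r2": r2_score(y_true, y_pred),
        "neg_mean_squared_error": mse,
        "neg_mean_absolute_error": mean_absolute_error(y_true, y_pred),
        "RMSE": sqrt(mse),
        "explained_variance_score": explained_variance_score(y_true, y_pred),
    }
    mape = np.abs(y_true - y_pred) * 100.0 / y_true  # inf where the actual is zero
    bins = list(mape_bins)
    if np.nanmax(mape) > bins[-1]:
        bins.append(np.nanmax(mape))                 # catch-all top bin, as above
    mape_counts = pd.cut(pd.Series(mape), bins).value_counts().to_dict()
    return metrics, mape_counts
# -------------------------------------------------------------------------------------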
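# --- Illustrative sketch (not part of the pipeline) ---------------------------------
# Predict() above reloads the trained Keras model from its .h5 file, re-applies the
# dummy-column encoding, pads the scoring frame to the feature list recorded at
# training time, and writes the predictions back onto the original data. A condensed
# version of that flow is sketched below; the paths, the `model_columns` list and the
# helper name are placeholders, not the platform's real configuration.
import pandas as pd
import tensorflow as tf

def _score_with_saved_model(raw_df, model_path, model_columns,
                            categorical_columns, result_column):
    """Score a pandas frame with a saved tf.keras regression model."""
    model = tf.keras.models.load_model(model_path)  # e.g. a ".h5" file saved after Train()
    scoring = pd.get_dummies(
        raw_df,
        columns=[c for c in categorical_columns
                 if c != result_column and c in raw_df.columns])
    # Align to the exact feature set the model was trained on.
    scoring = scoring.reindex(columns=model_columns, fill_value=0)
    raw_df[result_column] = model.predict(scoring).flatten()
    return raw_df
# -------------------------------------------------------------------------------------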