def test_all(self, measure_columns=None, dimension_columns=None):
    """Count rows per level of the first dimension column.

    The data frame is grouped by ``dimension_columns[0]``, counted, pulled
    into pandas and serialized to JSON on a ``FreqDimensionResult``.

    Args:
        measure_columns: Not used by this method.
        dimension_columns: Sequence whose first element names the column to
            group by. It is indexed unconditionally, so passing ``None``
            (the default) or an empty sequence raises.

    Returns:
        FreqDimensionResult: its params are set to the JSON string of
        ``{dimension: <grouped counts as a dict>}``.
    """
    freq_dimension_result = FreqDimensionResult()
    dimension = dimension_columns[0]

    grouped_dataframe = (
        self._data_frame.groupby(dimension).count().toPandas())
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context,
        self._scriptWeightDict,
        self._scriptStages,
        self._analysisName,
        "groupby",
        "info",
        weightKey="script")

    # The serialized dict is built from the complete grouped frame. A
    # dropna() performed after this point was never read by anything, so it
    # has been removed rather than kept as dead work.
    frequency_dict = json.dumps({dimension: grouped_dataframe.to_dict()})
    freq_dimension_result.set_params(frequency_dict)

    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context,
        self._scriptWeightDict,
        self._scriptStages,
        self._analysisName,
        "completion",
        "info",
        weightKey="script")
    return freq_dimension_result
def stats_for_measure_column(self, measure_column):
    """Build the descriptive statistics object for one numeric column.

    Args:
        measure_column: Name of the column to describe.

    Returns:
        MeasureDescriptiveStats populated with summary statistics, the
        five-point summary and a histogram.

    Raises:
        Whatever ``BIException.non_numeric_column`` produces, when the data
        frame helper reports the column as non-numeric.
    """
    if not self._dataframe_helper.is_numeric_column(measure_column):
        raise BIException.non_numeric_column(measure_column)

    summary = MeasureDescriptiveStats()
    frame = self._data_frame

    row_count = frame.select(measure_column).count()
    lowest = Stats.min(frame, measure_column)
    highest = Stats.max(frame, measure_column)
    column_total = Stats.total(frame, measure_column)
    average = Stats.mean(frame, measure_column)
    spread = Stats.variance(frame, measure_column)
    deviation = Stats.std_dev(frame, measure_column)

    # When min equals max the shape statistics are reported as 0 and the
    # Stats helpers are not invoked at all.
    single_valued = lowest == highest
    skew_value = 0 if single_valued else Stats.skew(frame, measure_column)
    kurtosis_value = (
        0 if single_valued else Stats.kurtosis(frame, measure_column))

    summary.set_summary_stats(
        num_values=row_count,
        min_value=lowest,
        max_value=highest,
        total=column_total,
        mean=average,
        variance=spread,
        std_dev=deviation,
        skew=skew_value,
        kurtosis=kurtosis_value)

    five_point = self.five_point_summary(measure_column)
    summary.set_five_point_summary_stats(five_point)

    histogram_bins = Binner(frame, self._dataframe_helper).get_bins(
        measure_column)
    summary.set_histogram(histogram_bins)

    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context,
        self._scriptWeightDict,
        self._scriptStages,
        self._analysisName,
        "statCalculationEnd",
        "info",
        display=False,
        emptyBin=False,
        customMsg=None,
        weightKey="script")
    return summary
def test_all(self, measure_columns=None, dimension_columns=None,
             max_num_levels=40):
    """Run chi-square tests of the target dimension against other columns.

    The target is ``dimension_columns[0]``. It is tested against every other
    entry of ``self._dimension_columns`` and against each measure whose
    distinct-value count exceeds the configured ``binCardinality``.

    A failure on one column is printed and that column is skipped; the
    remaining columns are still processed.

    Args:
        measure_columns: Not used by this method.
        dimension_columns: Sequence whose first element is the target
            dimension. It is indexed unconditionally.
        max_num_levels: Not used by this method.

    Returns:
        DFChiSquareResult holding every result that was computed.
    """
    target = dimension_columns[0]
    other_dimensions = [
        name for name in self._dimension_columns if name != target
    ]
    measures = self._measure_columns
    results = DFChiSquareResult()

    for dim_name in other_dimensions:
        try:
            outcome = self.test_dimension(target, dim_name)
            results.add_chisquare_result(target, dim_name, outcome)
        except Exception as err:
            print(repr(err), dim_name)

    for measure_name in measures:
        try:
            # The distinct count comes from pandas or Spark depending on the
            # backing frame; the comparison afterwards is the same.
            if self._pandas_flag:
                distinct_count = len(self._data_frame[measure_name].unique())
            else:
                distinct_row = self._data_frame.select(
                    F.countDistinct(measure_name)).collect()[0]
                distinct_count = distinct_row[0]

            bin_setting = self._analysisDict['Dimension vs. Dimension'][
                'binSetting']
            if distinct_count > bin_setting['binCardinality']:
                outcome = self.test_measures(target, measure_name)
                results.add_chisquare_result(target, measure_name, outcome)
        except Exception as err:
            print(str(err), measure_name)

    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context,
        self._scriptWeightDict,
        self._scriptStages,
        self._analysisName,
        "completion",
        "info",
        display=False,
        weightKey="script")
    return results
def __init__(self, data_frame, df_context, df_helper, spark, meta_parser,
             scriptWeight=None, analysisName=None):
    """Set up state for the decision-tree script and report its start.

    Args:
        data_frame: Input data frame; stored twice (``_data_frame`` and
            ``_data_frame1``), each passed through
            ``df_helper.fill_missing_values``.
        df_context: Context object supplying date columns, the uid column,
            completion status, message URL and default name/weights.
        df_helper: Helper supplying numeric/string column lists and
            missing-value filling.
        spark: Spark handle, stored as ``self._spark``.
        meta_parser: Metadata parser, used here to decide whether the uid
            column is dropped from the dimension list.
        scriptWeight: Optional weight dict; when ``None`` the context's
            measure-analysis weights are used.
        analysisName: Optional analysis name; when ``None`` the context's
            analysis name is used.
    """
    self._spark = spark
    self._metaParser = meta_parser
    self._data_frame = data_frame
    self._data_frame1 = data_frame
    self._dataframe_helper = df_helper
    self._dataframe_context = df_context

    self._measure_columns = self._dataframe_helper.get_numeric_columns()
    self._dimension_columns = self._dataframe_helper.get_string_columns()
    self._date_columns = self._dataframe_context.get_date_columns()
    self._uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(self._uid_col):
        self._dimension_columns = list(
            set(self._dimension_columns) - {self._uid_col})
    if len(self._date_columns) > 0:
        self._dimension_columns = list(
            set(self._dimension_columns) - set(self._date_columns))

    # Containers read or filled by other methods of the class.
    self._mapping_dict = {}
    self._new_rules = {}
    self._total = {}
    self._success = {}
    self._probability = {}
    self._alias_dict = {}
    self._important_vars = {}
    self._numCluster = None

    self._data_frame = self._dataframe_helper.fill_missing_values(
        self._data_frame)
    self._data_frame1 = self._dataframe_helper.fill_missing_values(
        self._data_frame1)

    self._completionStatus = self._dataframe_context.get_completion_status()
    self._messageURL = self._dataframe_context.get_message_url()

    # Identity checks against None (PEP 8) instead of ``== None``.
    if analysisName is None:
        self._analysisName = self._dataframe_context.get_analysis_name()
    else:
        self._analysisName = analysisName
    if scriptWeight is None:
        self._scriptWeightDict = (
            self._dataframe_context.get_measure_analysis_weight())
    else:
        self._scriptWeightDict = scriptWeight

    self._scriptStages = {
        "dtreeTrainingStart": {
            "summary": "Started the Decision Tree Regression Script",
            "weight": 0
        },
        "dtreeTrainingEnd": {
            "summary": "Decision Tree Regression Learning Finished",
            "weight": 10
        },
    }
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context,
        self._scriptWeightDict,
        self._scriptStages,
        self._analysisName,
        "dtreeTrainingStart",
        "info",
        weightKey="script")
def test_all(self, measure_columns=None, dimension_columns=None):
    """Train a depth-3 decision tree on the target and package the rules.

    The target is ``measure_columns[0]``. Features are the dimension columns
    whose unique-value count (per the metadata parser) is within the level
    limit, plus every measure column other than the target.

    Side effects: sets ``self._target_dimension``, narrows
    ``self._data_frame`` to the target plus feature columns, and sets
    ``self._new_tree``.

    Args:
        measure_columns: Sequence whose first element is the target column.
            It is indexed unconditionally.
        dimension_columns: Not used by this method.

    Returns:
        DecisionTreeResult carrying the tree, rule tables and target map.
    """
    self._target_dimension = measure_columns[0]
    dimension = self._target_dimension

    # Level limit: the global setting, capped at sqrt(number of rows).
    max_num_levels = GLOBALSETTINGS.DTREE_TARGET_DIMENSION_MAX_LEVEL
    max_num_levels = min(
        max_num_levels,
        round(self._dataframe_helper.get_num_rows()**0.5))
    all_dimensions = [
        dim for dim in self._dimension_columns
        if self._metaParser.get_num_unique_values(dim) <= max_num_levels
    ]
    all_measures = [
        x for x in self._measure_columns if x != self._target_dimension
    ]

    self.transform_data_frames()
    decision_tree_result = DecisionTreeResult()

    # Number of levels per categorical feature, keyed by feature position.
    cat_feature_info = [len(self._mapping_dict[c]) for c in all_dimensions]
    if len(cat_feature_info) > 0:
        max_length = max(cat_feature_info)
    else:
        max_length = 32
    cat_feature_info = dict(enumerate(cat_feature_info))

    dimension_classes = self._data_frame.select(dimension).distinct().count()
    # Target first, so that x[0] is the label and x[1:] the features below.
    self._data_frame = self._data_frame[[dimension] + all_dimensions +
                                        all_measures]
    data = self._data_frame.rdd.map(lambda x: LabeledPoint(x[0], x[1:]))
    # The split weights are [1.0, 0.0]; only the first part is used.
    trainingData, _ = data.randomSplit([1.0, 0.0])

    # TO DO : set maxBins at least equal to the max level of categories in
    # dimension column
    model = DecisionTree.trainClassifier(
        trainingData,
        numClasses=dimension_classes,
        categoricalFeaturesInfo=cat_feature_info,
        impurity='gini',
        maxDepth=3,
        maxBins=max_length)
    output_result = model.toDebugString()
    # Was a Python 2 print statement, which does not parse on Python 3.
    print("output_result", output_result)

    decision_tree = self.tree_json(output_result, self._data_frame)
    self._new_tree = self.generate_new_tree(decision_tree)
    self._new_tree = self.wrap_tree(self._new_tree)

    decision_tree_result.set_params(self._new_tree, self._new_rules,
                                    self._total, self._success,
                                    self._probability)
    decision_tree_result.set_target_map(
        self._mapping_dict[self._target_dimension], self._aggr_data,
        self._important_vars)

    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context,
        self._scriptWeightDict,
        self._scriptStages,
        self._analysisName,
        "dtreeTrainingEnd",
        "info",
        weightKey="script")
    return decision_tree_result
def __init__(self, data_frame, df_helper, df_context, scriptWeight=None,
             analysisName=None):
    """Set up state for the frequency script and report initialization.

    Args:
        data_frame: Data frame the frequency counts are computed on.
        df_helper: Data frame helper, stored for use by other methods.
        df_context: Context object supplying completion status, message URL
            and the default analysis name / weights.
        scriptWeight: Optional weight dict; when ``None`` the context's
            dimension-analysis weights are used.
        analysisName: Optional analysis name; when ``None`` the context's
            analysis name is used.
    """
    self._data_frame = data_frame
    self._dataframe_helper = df_helper
    self._dataframe_context = df_context
    self._completionStatus = self._dataframe_context.get_completion_status()

    # Identity checks against None (PEP 8) instead of ``== None``.
    if analysisName is None:
        self._analysisName = self._dataframe_context.get_analysis_name()
    else:
        self._analysisName = analysisName
    self._messageURL = self._dataframe_context.get_message_url()
    if scriptWeight is None:
        self._scriptWeightDict = (
            self._dataframe_context.get_dimension_analysis_weight())
    else:
        self._scriptWeightDict = scriptWeight

    self._scriptStages = {
        "freqinitialization": {
            "summary": "Initialized the Frequency Scripts",
            "weight": 4
        },
        "groupby": {
            "summary": "running groupby operations",
            "weight": 6
        },
        "completion": {
            "summary": "Frequency Stats Calculated",
            "weight": 0
        },
    }
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context,
        self._scriptWeightDict,
        self._scriptStages,
        self._analysisName,
        "freqinitialization",
        "info",
        weightKey="script")
def __init__(self, data_frame, df_helper, df_context, meta_parser,
             scriptWeight=None, analysisName=None):
    """Set up state for the chi-square script and report initialization.

    Args:
        data_frame: Data frame the tests run on.
        df_helper: Helper supplying numeric, string and timestamp column
            lists.
        df_context: Context object supplying the pandas flag, date columns,
            uid column, analysis dict, completion status, message URL and
            default name/weights.
        meta_parser: Metadata parser, used here to decide whether the uid
            column is dropped from the dimension list.
        scriptWeight: Optional weight dict; when ``None`` the context's
            dimension-analysis weights are used.
        analysisName: Optional analysis name; when ``None`` the context's
            analysis name is used.
    """
    self._data_frame = data_frame
    self._dataframe_helper = df_helper
    self._dataframe_context = df_context
    self._pandas_flag = df_context._pandas_flag
    self._metaParser = meta_parser

    self._measure_columns = self._dataframe_helper.get_numeric_columns()
    self._dimension_columns = self._dataframe_helper.get_string_columns()
    self._timestamp_columns = self._dataframe_helper.get_timestamp_columns()
    self._date_columns = self._dataframe_context.get_date_columns()
    self._uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(self._uid_col):
        self._dimension_columns = list(
            set(self._dimension_columns) - {self._uid_col})
    if len(self._date_columns) > 0:
        self._dimension_columns = list(
            set(self._dimension_columns) - set(self._date_columns))

    self._completionStatus = self._dataframe_context.get_completion_status()
    # Identity checks against None (PEP 8) instead of ``== None``.
    if analysisName is None:
        self._analysisName = self._dataframe_context.get_analysis_name()
    else:
        self._analysisName = analysisName
    self._analysisDict = self._dataframe_context.get_analysis_dict()
    self._messageURL = self._dataframe_context.get_message_url()
    if scriptWeight is None:
        self._scriptWeightDict = (
            self._dataframe_context.get_dimension_analysis_weight())
    else:
        self._scriptWeightDict = scriptWeight

    self._scriptStages = {
        "initialization": {
            "summary": "Initialized the Chisquare Scripts",
            "weight": 1
        },
        "chisquareStats": {
            "summary": "Running Chisquare For Relevant Dimension Columns",
            "weight": 2
        },
        "completion": {
            "summary": "Chisquare Stats Calculated",
            "weight": 2
        },
    }
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context,
        self._scriptWeightDict,
        self._scriptStages,
        self._analysisName,
        "initialization",
        "info",
        display=False,
        weightKey="script")
def Train(self):
    """Train, evaluate and publish a PySpark Naive Bayes classifier.

    Steps performed here:
      1. Resolve categorical/numerical feature columns and output paths.
      2. Build the feature pipeline and label indexer; derive label maps.
      3. Turn the algorithm settings into a parameter grid.
      4. Fit with k-fold cross validation or a train/validation split. When
         hyperparameter tuning is enabled the project's grid-search helpers
         do the fitting instead.
      5. Compute metrics, gain/lift/KS data, the confusion matrix, log loss
         and ROC data.
      6. Save the model, attempt a PMML export (best effort).
      7. Fill the model summary / model management objects, build cards and
         nodes, and hand everything to ``self._result_setter``.

    Returns:
        None. Results are published through ``self._result_setter`` and
        ``self._prediction_narrative``; ``self._model_summary`` and
        ``self._model_management`` are set on the instance.
    """
    st_global = time.time()
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context,
        self._scriptWeightDict,
        self._scriptStages,
        self._slug,
        "initialization",
        "info",
        display=True,
        emptyBin=False,
        customMsg=None,
        weightKey="total")

    # ---- columns, settings and paths ------------------------------------
    algosToRun = self._dataframe_context.get_algorithms_to_run()
    algoSetting = [
        x for x in algosToRun if x.get_algorithm_slug() == self._slug
    ][0]
    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns) - set(allDateCols))
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    result_column = self._dataframe_context.get_result_column()
    categorical_columns = [
        x for x in categorical_columns if x != result_column
    ]
    appType = self._dataframe_context.get_app_type()

    model_path = self._dataframe_context.get_model_path()
    if model_path.startswith("file"):
        # Drops the first seven characters, i.e. a "file://" prefix.
        model_path = model_path[7:]
    validationDict = self._dataframe_context.get_validation_dict()
    print("model_path", model_path)

    df = self._data_frame
    levels = df.select(result_column).distinct().count()
    model_filepath = model_path + "/" + self._slug + "/model"
    pmml_filepath = str(model_path) + "/" + str(
        self._slug) + "/traindeModel.pmml"

    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context,
        self._scriptWeightDict,
        self._scriptStages,
        self._slug,
        "training",
        "info",
        display=True,
        emptyBin=False,
        customMsg=None,
        weightKey="total")

    # ---- pipeline and label mapping -------------------------------------
    st = time.time()
    pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns,
                                                  categorical_columns,
                                                  result_column)
    trainingData, validationData = MLUtils.get_training_and_validation_data(
        df, result_column, 0.8)
    labelIndexer = StringIndexer(inputCol=result_column, outputCol="label")

    # index -> original label, and original label -> index (as float)
    labelIdx = labelIndexer.fit(trainingData)
    labelMapping = {k: v for k, v in enumerate(labelIdx.labels)}
    inverseLabelMapping = {
        v: float(k) for k, v in enumerate(labelIdx.labels)
    }
    automl_enable = self._dataframe_context.get_trainerMode() == "autoML"

    # ---- parameter grid -------------------------------------------------
    clf = NaiveBayes()
    if not algoSetting.is_hyperparameter_tuning_enabled():
        algoParams = algoSetting.get_params_dict()
    else:
        algoParams = algoSetting.get_params_dict_hyperparameter()
    print("=" * 100)
    print(algoParams)
    print("=" * 100)

    # Keep only settings that are real NaiveBayes params; wrap scalars in a
    # list so every value can be passed to addGrid.
    clfParams = [prm.name for prm in clf.params]
    algoParams = {
        getattr(clf, k): v if isinstance(v, list) else [v]
        for k, v in algoParams.items() if k in clfParams
    }
    paramGrid = ParamGridBuilder()
    for k, v in algoParams.items():
        print(k, v)
        if v == [None] * len(v):
            # Every candidate is None: leave this param out of the grid.
            continue
        paramGrid = paramGrid.addGrid(k, v)
    paramGrid = paramGrid.build()

    if len(paramGrid) > 1:
        hyperParamInitParam = algoSetting.get_hyperparameter_params()
        evaluationMetricDict = {
            "name": hyperParamInitParam["evaluationMetric"]
        }
        evaluationMetricDict["displayName"] = (
            GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[
                evaluationMetricDict["name"]])
    else:
        evaluationMetricDict = {
            "name": GLOBALSETTINGS.CLASSIFICATION_MODEL_EVALUATION_METRIC
        }
        evaluationMetricDict["displayName"] = (
            GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[
                evaluationMetricDict["name"]])
        self._result_setter.set_hyper_parameter_results(self._slug, None)

    # ---- fitting --------------------------------------------------------
    if validationDict["name"] == "kFold":
        numFold = int(validationDict["value"])
        estimator = Pipeline(stages=[pipeline, labelIndexer, clf])
        if algoSetting.is_hyperparameter_tuning_enabled():
            modelFilepath = "/".join(model_filepath.split("/")[:-1])
            pySparkHyperParameterResultObj = PySparkGridSearchResult(
                estimator, paramGrid, appType, modelFilepath, levels,
                evaluationMetricDict, trainingData, validationData, numFold,
                self._targetLevel, labelMapping, inverseLabelMapping, df)
            resultArray = (pySparkHyperParameterResultObj.
                           train_and_save_classification_models())
            self._result_setter.set_hyper_parameter_results(
                self._slug, resultArray)
            self._result_setter.set_metadata_parallel_coordinates(
                self._slug, {
                    "ignoreList":
                    pySparkHyperParameterResultObj.get_ignore_list(),
                    "hideColumns":
                    pySparkHyperParameterResultObj.get_hide_columns(),
                    "metricColName":
                    pySparkHyperParameterResultObj.
                    get_comparison_metric_colname(),
                    "columnOrder":
                    pySparkHyperParameterResultObj.get_keep_columns()
                })
            bestModel = pySparkHyperParameterResultObj.getBestModel()
            prediction = pySparkHyperParameterResultObj.getBestPrediction()
        else:
            if automl_enable:
                paramGrid = (ParamGridBuilder().addGrid(
                    clf.smoothing, [1.0, 0.2]).build())
            crossval = CrossValidator(
                estimator=estimator,
                estimatorParamMaps=paramGrid,
                evaluator=BinaryClassificationEvaluator()
                if levels == 2 else MulticlassClassificationEvaluator(),
                numFolds=3 if numFold is None else numFold)
            # use 3+ folds in practice
            cvnb = crossval.fit(trainingData)
            prediction = cvnb.transform(validationData)
            bestModel = cvnb.bestModel
    else:
        train_test_ratio = float(
            self._dataframe_context.get_train_test_split())
        estimator = Pipeline(stages=[pipeline, labelIndexer, clf])
        if algoSetting.is_hyperparameter_tuning_enabled():
            modelFilepath = "/".join(model_filepath.split("/")[:-1])
            pySparkHyperParameterResultObj = PySparkTrainTestResult(
                estimator, paramGrid, appType, modelFilepath, levels,
                evaluationMetricDict, trainingData, validationData,
                train_test_ratio, self._targetLevel, labelMapping,
                inverseLabelMapping, df)
            resultArray = (pySparkHyperParameterResultObj.
                           train_and_save_classification_models())
            self._result_setter.set_hyper_parameter_results(
                self._slug, resultArray)
            self._result_setter.set_metadata_parallel_coordinates(
                self._slug, {
                    "ignoreList":
                    pySparkHyperParameterResultObj.get_ignore_list(),
                    "hideColumns":
                    pySparkHyperParameterResultObj.get_hide_columns(),
                    "metricColName":
                    pySparkHyperParameterResultObj.
                    get_comparison_metric_colname(),
                    "columnOrder":
                    pySparkHyperParameterResultObj.get_keep_columns()
                })
            bestModel = pySparkHyperParameterResultObj.getBestModel()
            prediction = pySparkHyperParameterResultObj.getBestPrediction()
        else:
            tvs = TrainValidationSplit(
                estimator=estimator,
                estimatorParamMaps=paramGrid,
                evaluator=BinaryClassificationEvaluator()
                if levels == 2 else MulticlassClassificationEvaluator(),
                trainRatio=train_test_ratio)
            tvspnb = tvs.fit(trainingData)
            prediction = tvspnb.transform(validationData)
            bestModel = tvspnb.bestModel

    # stages[2] is the classifier stage of [pipeline, labelIndexer, clf].
    modelmanagement_ = {
        param[0].name: param[1]
        for param in bestModel.stages[2].extractParamMap().items()
    }
    MLUtils.save_pipeline_or_model(bestModel, model_filepath)

    # ---- metrics --------------------------------------------------------
    predsAndLabels = prediction.select(['prediction',
                                        'label']).rdd.map(tuple)
    posLabel = inverseLabelMapping[self._targetLevel]
    metrics = MulticlassMetrics(predsAndLabels)
    f1_score = metrics.fMeasure(posLabel, 1.0)
    precision = metrics.precision(posLabel)
    recall = metrics.recall(posLabel)
    accuracy = metrics.accuracy
    print(f1_score, precision, recall, accuracy)

    # ---- gain chart -----------------------------------------------------
    def cal_prob_eval(x):
        # Picks the probability entry at the positive label's index.
        if len(x) == 1:
            if x == posLabel:
                return (float(x[1]))
            else:
                return (float(1 - x[1]))
        else:
            return (float(x[int(posLabel)]))

    def y_prob_for_eval_udf():
        return udf(lambda x: cal_prob_eval(x))

    prediction = prediction.withColumn(
        "y_prob_for_eval", y_prob_for_eval_udf()(col('probability')))
    # Best effort: try the Spark path, then the pandas path, then give up
    # with None. ``except Exception`` (not a bare except) so that
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    try:
        pys_df = prediction.select(
            ['y_prob_for_eval', 'prediction', 'label'])
        gain_lift_ks_obj = GainLiftKS(pys_df, 'y_prob_for_eval',
                                      'prediction', 'label', posLabel,
                                      self._spark)
        gain_lift_KS_dataframe = gain_lift_ks_obj.Run().toPandas()
    except Exception:
        try:
            temp_df = pys_df.toPandas()
            gain_lift_ks_obj = GainLiftKS(temp_df, 'y_prob_for_eval',
                                          'prediction', 'label', posLabel,
                                          self._spark)
            gain_lift_KS_dataframe = gain_lift_ks_obj.Rank_Ordering()
        except Exception:
            print("gain chant failed")
            gain_lift_KS_dataframe = None

    # Pull labels, predictions and probabilities to the driver.
    act_list = prediction.select('label').collect()
    actual = [int(row.label) for row in act_list]
    pred_list = prediction.select('prediction').collect()
    predicted = [int(row.prediction) for row in pred_list]
    prob_list = prediction.select('probability').collect()
    probability = [list(row.probability) for row in prob_list]
    objs = {
        "trained_model": bestModel,
        "actual": actual,
        "predicted": predicted,
        "probability": probability,
        "feature_importance": None,
        "featureList": list(categorical_columns) + list(numerical_columns),
        "labelMapping": labelMapping
    }

    # Confusion matrix re-keyed by original label names.
    conf_mat_ar = metrics.confusionMatrix().toArray()
    print(conf_mat_ar)
    confusion_matrix = {}
    for i, matrix_row in enumerate(conf_mat_ar):
        confusion_matrix[labelMapping[i]] = {}
        for j, val in enumerate(matrix_row):
            confusion_matrix[labelMapping[i]][labelMapping[j]] = val
    print(confusion_matrix)

    # ---- ROC curve ------------------------------------------------------
    y_prob = probability
    y_score = predicted
    y_test = actual
    logLoss = log_loss(y_test, y_prob)
    positive_label_probs = [val[int(posLabel)] for val in y_prob]
    if levels <= 2:
        y_test_roc = y_test
        y_score_roc = y_score
    else:
        # More than two classes: collapse every non-positive label into a
        # single "other" value so the problem becomes positive-vs-rest.
        other_label = posLabel + 1
        y_test_roc = [
            other_label if val != posLabel else val for val in y_test
        ]
        y_score_roc = [
            other_label if val != posLabel else val for val in y_score
        ]
    roc_auc = roc_auc_score(y_test_roc, y_score_roc)
    fpr, tpr, thresholds = roc_curve(y_test_roc,
                                     positive_label_probs,
                                     pos_label=posLabel)
    roc_df = pd.DataFrame({"FPR": fpr, "TPR": tpr, "thresholds": thresholds})
    # Round, average TPR per rounded FPR, then round again for display.
    rounded_roc_df = roc_df.round({'FPR': 2, 'TPR': 4})
    final_roc_df = rounded_roc_df.groupby("FPR",
                                          as_index=False)[["TPR"]].mean()
    endgame_roc_df = final_roc_df.round({'FPR': 2, 'TPR': 3})

    # ---- prediction split (percentage of rows per label) ----------------
    val_cnts = [
        row.asDict()
        for row in prediction.groupBy('label').count().collect()
    ]
    prediction_split = {}
    total_nos = prediction.select('label').count()
    for item in val_cnts:
        print(labelMapping)
        classname = labelMapping[item['label']]
        prediction_split[classname] = round(
            item['count'] * 100 / float(total_nos), 2)

    if not algoSetting.is_hyperparameter_tuning_enabled():
        modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH -
                                 1) + "1"
        modelFilepathArr = model_filepath.split("/")[:-1]
        modelFilepathArr.append(modelName)
        bestModel.save("/".join(modelFilepathArr))
    runtime = round((time.time() - st_global), 2)

    # ---- PMML export (best effort) --------------------------------------
    try:
        print(pmml_filepath)
        pmmlBuilder = PMMLBuilder(self._spark, trainingData,
                                  bestModel).putOption(clf, 'compact', True)
        pmmlBuilder.buildFile(pmml_filepath)
        # Context manager so the handle is closed even if read() raises.
        with open(pmml_filepath, "r") as pmmlfile:
            pmmlText = pmmlfile.read()
        self._result_setter.update_pmml_object({self._slug: pmmlText})
    except Exception as e:
        print("PMML failed...", str(e))

    # ---- model summary --------------------------------------------------
    self._model_summary = MLModelSummary()
    self._model_summary.set_algorithm_name("Naive Bayes")
    self._model_summary.set_algorithm_display_name("Naive Bayes")
    self._model_summary.set_slug(self._slug)
    self._model_summary.set_training_time(runtime)
    self._model_summary.set_confusion_matrix(confusion_matrix)
    self._model_summary.set_feature_list(objs["featureList"])
    self._model_summary.set_model_accuracy(accuracy)
    # Overrides the value set above with the time measured from ``st``.
    self._model_summary.set_training_time(round((time.time() - st), 2))
    self._model_summary.set_precision_recall_stats([precision, recall])
    self._model_summary.set_model_precision(precision)
    self._model_summary.set_model_recall(recall)
    self._model_summary.set_model_F1_score(f1_score)
    self._model_summary.set_model_log_loss(logLoss)
    self._model_summary.set_gain_lift_KS_data(gain_lift_KS_dataframe)
    self._model_summary.set_AUC_score(roc_auc)
    self._model_summary.set_target_variable(result_column)
    self._model_summary.set_prediction_split(prediction_split)
    # NOTE(review): "KFold" is recorded even when the train/validation
    # split path ran above -- confirm whether that is intended.
    self._model_summary.set_validation_method("KFold")
    self._model_summary.set_level_map_dict(objs["labelMapping"])
    self._model_summary.set_model_features(objs["featureList"])
    self._model_summary.set_level_counts(
        self._metaParser.get_unique_level_dict(
            list(set(categorical_columns)) + [result_column]))
    self._model_summary.set_num_rules(300)
    self._model_summary.set_target_level(self._targetLevel)

    # Only the "Model Id" differs between tuned and untuned runs.
    if not algoSetting.is_hyperparameter_tuning_enabled():
        model_id = modelName
    else:
        model_id = resultArray[0]["Model Id"]
    modelDropDownObj = {
        "name": self._model_summary.get_algorithm_name(),
        "evaluationMetricValue": accuracy,
        "evaluationMetricName": "accuracy",
        "slug": self._model_summary.get_slug(),
        "Model Id": model_id
    }
    modelSummaryJson = {
        "dropdown": modelDropDownObj,
        "levelcount": self._model_summary.get_level_counts(),
        "modelFeatureList": self._model_summary.get_feature_list(),
        "levelMapping": self._model_summary.get_level_map_dict(),
        "slug": self._model_summary.get_slug(),
        "name": self._model_summary.get_algorithm_name()
    }

    # ---- model management -----------------------------------------------
    self._model_management = MLModelSummary()
    print(modelmanagement_)
    self._model_management.set_job_type(
        self._dataframe_context.get_job_name())  # Project name
    self._model_management.set_training_status(
        data="completed")  # training status
    self._model_management.set_target_level(
        self._targetLevel)  # target column value
    self._model_management.set_training_time(runtime)  # run time
    self._model_management.set_model_accuracy(round(metrics.accuracy, 2))
    self._model_management.set_algorithm_name(
        "NaiveBayes")  # algorithm name
    self._model_management.set_validation_method(
        str(validationDict["displayName"]) + "(" +
        str(validationDict["value"]) + ")")  # validation method
    self._model_management.set_target_variable(
        result_column)  # target column name
    self._model_management.set_creation_date(data=str(
        datetime.now().strftime('%b %d ,%Y %H:%M ')))  # creation date
    self._model_management.set_datasetName(self._datasetName)
    self._model_management.set_model_type(data='classification')
    # NOTE(review): int() truncates a fractional smoothing value (0.2
    # becomes 0) -- confirm whether the reported value should be a float.
    self._model_management.set_var_smoothing(
        data=int(modelmanagement_['smoothing']))

    modelManagementSummaryJson = [
        ["Project Name", self._model_management.get_job_type()],
        ["Algorithm", self._model_management.get_algorithm_name()],
        ["Training Status", self._model_management.get_training_status()],
        ["Accuracy", self._model_management.get_model_accuracy()],
        ["RunTime", self._model_management.get_training_time()],
        ["Created On", self._model_management.get_creation_date()]
    ]
    modelManagementModelSettingsJson = [
        ["Training Dataset", self._model_management.get_datasetName()],
        ["Target Column", self._model_management.get_target_variable()],
        ["Target Column Value", self._model_management.get_target_level()],
        ["Algorithm", self._model_management.get_algorithm_name()],
        ["Model Validation",
         self._model_management.get_validation_method()],
        ["Model Type", self._model_management.get_model_type()],
        ["Smoothing", self._model_management.get_var_smoothing()],
    ]

    # ---- cards and narrative nodes --------------------------------------
    nbOverviewCards = [
        json.loads(CommonUtils.convert_python_object_to_json(cardObj))
        for cardObj in MLUtils.create_model_management_card_overview(
            self._model_management, modelManagementSummaryJson,
            modelManagementModelSettingsJson)
    ]
    nbPerformanceCards = [
        json.loads(CommonUtils.convert_python_object_to_json(cardObj))
        for cardObj in MLUtils.create_model_management_cards(
            self._model_summary, endgame_roc_df)
    ]
    nbDeploymentCards = [
        json.loads(CommonUtils.convert_python_object_to_json(cardObj))
        for cardObj in MLUtils.create_model_management_deploy_empty_card()
    ]
    nbCards = [
        json.loads(CommonUtils.convert_python_object_to_json(cardObj))
        for cardObj in MLUtils.create_model_summary_cards(
            self._model_summary)
    ]

    NB_Overview_Node = NarrativesTree()
    NB_Overview_Node.set_name("Overview")
    NB_Performance_Node = NarrativesTree()
    NB_Performance_Node.set_name("Performance")
    NB_Deployment_Node = NarrativesTree()
    NB_Deployment_Node.set_name("Deployment")
    for card in nbOverviewCards:
        NB_Overview_Node.add_a_card(card)
    for card in nbPerformanceCards:
        NB_Performance_Node.add_a_card(card)
    for card in nbDeploymentCards:
        NB_Deployment_Node.add_a_card(card)
    for card in nbCards:
        self._prediction_narrative.add_a_card(card)

    # ---- publish --------------------------------------------------------
    self._result_setter.set_model_summary({
        "naivebayes":
        json.loads(
            CommonUtils.convert_python_object_to_json(self._model_summary))
    })
    self._result_setter.set_naive_bayes_model_summary(modelSummaryJson)
    self._result_setter.set_nb_cards(nbCards)
    self._result_setter.set_nb_nodes(
        [NB_Overview_Node, NB_Performance_Node, NB_Deployment_Node])
    self._result_setter.set_nb_fail_card({
        "Algorithm_Name": "Naive Bayes",
        "success": "True"
    })
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context,
        self._scriptWeightDict,
        self._scriptStages,
        self._slug,
        "completion",
        "info",
        display=True,
        emptyBin=False,
        customMsg=None,
        weightKey="total")
    print("\n\n")
def __init__(self, df_helper, df_chisquare_result, spark, df_context,
             data_frame, story_narrative, result_setter,
             scriptWeight=None, analysisName=None):
    """Bin the measure columns, set up progress tracking and build the narratives.

    The constructor does all the work of this object:

    1. stores the collaborators passed in,
    2. replaces the values of every numeric column by the bin label
       (taken from the chi-square contingency table) they fall into and
       rebuilds ``self._data_frame`` as an all-string Spark DataFrame,
    3. resolves the analysis name, script weights and stage table used for
       progress messages,
    4. calls ``self._generate_narratives()`` between progress messages.

    Args:
        df_helper: helper exposing ``get_numeric_columns()``.
        df_chisquare_result: chi-square result object exposing
            ``get_result()`` and ``get_chisquare_result(dimension, column)``.
        spark: Spark session used to rebuild the binned DataFrame.
        df_context: dataframe context providing app id, analysis name,
            message url, weights and the analysis dict.
        data_frame: Spark DataFrame to analyse.
        story_narrative: narrative container kept on the instance.
        result_setter: result sink kept on the instance.
        scriptWeight: optional weight dict; when ``None`` the dimension
            analysis weight from ``df_context`` is used.
        analysisName: optional analysis name; when ``None`` the name from
            ``df_context`` is used.
    """
    self._story_narrative = story_narrative
    self._result_setter = result_setter
    self._data_frame = data_frame
    self._dataframe_context = df_context
    self._dataframe_helper = df_helper
    self._storyOnScoredData = self._dataframe_context.get_story_on_scored_data()
    self._measure_columns = df_helper.get_numeric_columns()
    self._df_chisquare = df_chisquare_result
    self._df_chisquare_result = df_chisquare_result.get_result()
    self.narratives = {}
    self._appid = df_context.get_app_id()
    self._chiSquareNode = NarrativesTree()
    self._chiSquareNode.set_name("Association")
    self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
    self._noOfSigDimsToShow = GLOBALSETTINGS.CHISQUARESIGNIFICANTDIMENSIONTOSHOW
    self._base_dir = "/chisquare/"
    self._spark = spark

    # ---------------- DataFrame measure -> dimension columns ----------------
    pandas_df = self._data_frame.toPandas()
    # list() keeps the [0] lookup below working on Python 3, where
    # dict.keys() returns a non-subscriptable view.
    target_dimension = list(self._df_chisquare_result.keys())
    bin_data = {}
    for measure_col in self._measure_columns:
        chisquare_result = self._df_chisquare.get_chisquare_result(
            target_dimension[0], measure_col)
        bin_data[measure_col] = (
            chisquare_result.get_contingency_table().get_column_two_levels())

    for bin_col, bin_labels in bin_data.items():
        # Compare against the untouched numeric values: once a label has
        # been written the column holds strings, and comparing those with
        # floats raises on Python 3.
        numeric_values = pandas_df[bin_col]
        binned = numeric_values.astype(object)
        for bin_label in bin_labels:
            # Labels look like "<lower>to<upper>", possibly with thousands
            # separators; the upper bound is exclusive.
            bounds = bin_label.split('to')
            lower = float(bounds[0].replace(',', ''))
            upper = float(bounds[1].replace(',', ''))
            binned[(numeric_values >= lower) & (numeric_values < upper)] = bin_label
        # Single whole-column assignment instead of chained indexing, which
        # pandas does not guarantee to write back to the frame.
        pandas_df[bin_col] = binned

    fields = [
        StructField(field_name, StringType(), True)
        for field_name in pandas_df.columns
    ]
    schema = StructType(fields)
    SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                        sparkSession=self._spark)
    self._data_frame = SQLctx.createDataFrame(pandas_df, schema)
    # ---------------- DataFrame measure -> dimension columns ----------------

    # Only app ids "1" and "2" get a dedicated sub-directory.
    if self._appid == "1":
        self._base_dir += "appid1/"
    elif self._appid == "2":
        self._base_dir += "appid2/"

    self._completionStatus = self._dataframe_context.get_completion_status()
    if analysisName is None:
        self._analysisName = self._dataframe_context.get_analysis_name()
    else:
        self._analysisName = analysisName
    self._messageURL = self._dataframe_context.get_message_url()
    if scriptWeight is None:
        self._scriptWeightDict = (
            self._dataframe_context.get_dimension_analysis_weight())
    else:
        self._scriptWeightDict = scriptWeight
    self._analysisDict = self._dataframe_context.get_analysis_dict()
    if self._analysisDict != {}:
        self._nColsToUse = self._analysisDict[
            self._analysisName]["noOfColumnsToUse"]
    else:
        self._nColsToUse = None

    self._scriptStages = {
        "initialization": {
            "summary": "Initialized the Frequency Narratives",
            "weight": 0
        },
        "summarygeneration": {
            "summary": "summary generation finished",
            "weight": 10
        },
        "completion": {
            "summary": "Frequency Stats Narratives done",
            "weight": 0
        },
    }

    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context, self._scriptWeightDict, self._scriptStages,
        self._analysisName, "initialization", "info", display=False,
        weightKey="narratives")

    self._generate_narratives()

    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context, self._scriptWeightDict, self._scriptStages,
        self._analysisName, "summarygeneration", "info", display=False,
        weightKey="narratives")

    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context, self._scriptWeightDict, self._scriptStages,
        self._analysisName, "completion", "info", display=False,
        weightKey="narratives")
def Train(self):
    """Train a Spark ML multilayer perceptron classifier and publish its summary.

    Steps performed, in order:

    1. Resolve feature columns (string columns minus an ignored uid column,
       date columns and the target; plus the numeric columns).
    2. Build the feature pipeline, label indexer and parameter grid. Every
       ``layers`` candidate is wrapped with the input feature count in
       front and the number of target levels at the end.
    3. Fit either through the project's hyper-parameter result classes
       (when tuning is enabled) or through ``CrossValidator`` /
       ``TrainValidationSplit``, depending on the validation dict.
    4. Save the best model, compute confusion matrix / precision / recall /
       accuracy, attempt the gain-lift-KS computation and PMML export
       (both best-effort), and push the model summary, management summary
       and cards to ``self._result_setter``.

    Returns:
        None. Results are delivered through ``self._result_setter``,
        ``self._prediction_narrative`` and ``self._model_summary``.
    """
    st_global = time.time()
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context, self._scriptWeightDict, self._scriptStages,
        self._slug, "initialization", "info", display=True, emptyBin=False,
        customMsg=None, weightKey="total")

    algosToRun = self._dataframe_context.get_algorithms_to_run()
    algoSetting = [x for x in algosToRun
                   if x.get_algorithm_slug() == self._slug][0]
    tuning_enabled = algoSetting.is_hyperparameter_tuning_enabled()

    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns) - set(allDateCols))
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    result_column = self._dataframe_context.get_result_column()
    categorical_columns = [x for x in categorical_columns
                           if x != result_column]

    appType = self._dataframe_context.get_app_type()
    model_path = self._dataframe_context.get_model_path()
    if model_path.startswith("file"):
        # Drop the leading "file://" scheme.
        model_path = model_path[7:]
    validationDict = self._dataframe_context.get_validation_dict()

    df = self._data_frame
    levels = df.select(result_column).distinct().count()
    model_filepath = model_path + "/" + self._slug + "/model"
    pmml_filepath = str(model_path) + "/" + str(self._slug) + "/trainedModel.pmml"

    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context, self._scriptWeightDict, self._scriptStages,
        self._slug, "training", "info", display=True, emptyBin=False,
        customMsg=None, weightKey="total")

    st = time.time()
    pipeline = MLUtils.create_pyspark_ml_pipeline(
        numerical_columns, categorical_columns, result_column)
    # The width of the assembled feature vector is the size of the input layer.
    vectorFeats = pipeline.getStages()[-1].transform(df)
    input_feats = len(vectorFeats.select('features').take(1)[0][0])

    trainingData, validationData = MLUtils.get_training_and_validation_data(
        df, result_column, 0.8)
    labelIndexer = StringIndexer(inputCol=result_column, outputCol="label")

    # Label mapping (index -> level) and its inverse (level -> float index).
    labelIdx = labelIndexer.fit(trainingData)
    labelMapping = {k: v for k, v in enumerate(labelIdx.labels)}
    inverseLabelMapping = {v: float(k) for k, v in enumerate(labelIdx.labels)}

    clf = MultilayerPerceptronClassifier()
    if not tuning_enabled:
        algoParams = algoSetting.get_params_dict()
    else:
        algoParams = algoSetting.get_params_dict_hyperparameter()
    clfParams = [prm.name for prm in clf.params]
    algoParams = {getattr(clf, k): v if isinstance(v, list) else [v]
                  for k, v in algoParams.items() if k in clfParams}

    # Build fresh layer lists rather than insert/append on the configured
    # ones, so the lists owned by the algorithm settings are not mutated.
    layer_param_val = [[input_feats] + list(layer) + [levels]
                       for layer in algoParams[clf.layers]]
    print('layer_param_val =', layer_param_val)

    paramGrid = ParamGridBuilder()
    for k, v in algoParams.items():
        if v == [None] * len(v):
            # Nothing but None candidates: leave the estimator default.
            continue
        if k.name == 'layers':
            paramGrid = paramGrid.addGrid(k, layer_param_val)
        else:
            paramGrid = paramGrid.addGrid(k, v)
    paramGrid = paramGrid.build()

    if len(paramGrid) > 1:
        hyperParamInitParam = algoSetting.get_hyperparameter_params()
        metric_name = hyperParamInitParam["evaluationMetric"]
    else:
        metric_name = GLOBALSETTINGS.CLASSIFICATION_MODEL_EVALUATION_METRIC
    evaluationMetricDict = {
        "name": metric_name,
        "displayName":
            GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[metric_name],
    }
    # Clear any earlier tuning results; the tuning path below overwrites this.
    self._result_setter.set_hyper_parameter_results(self._slug, None)

    estimator = Pipeline(stages=[pipeline, labelIndexer, clf])
    use_kfold = validationDict["name"] == "kFold"
    if use_kfold:
        numFold = int(validationDict["value"])
    else:
        train_test_ratio = float(
            self._dataframe_context.get_train_test_split())

    if tuning_enabled:
        modelFilepath = "/".join(model_filepath.split("/")[:-1])
        if use_kfold:
            pySparkHyperParameterResultObj = PySparkGridSearchResult(
                estimator, paramGrid, appType, modelFilepath, levels,
                evaluationMetricDict, trainingData, validationData, numFold,
                self._targetLevel, labelMapping, inverseLabelMapping, df)
        else:
            pySparkHyperParameterResultObj = PySparkTrainTestResult(
                estimator, paramGrid, appType, modelFilepath, levels,
                evaluationMetricDict, trainingData, validationData,
                train_test_ratio, self._targetLevel, labelMapping,
                inverseLabelMapping, df)
        resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models()
        self._result_setter.set_hyper_parameter_results(self._slug, resultArray)
        self._result_setter.set_metadata_parallel_coordinates(self._slug, {
            "ignoreList": pySparkHyperParameterResultObj.get_ignore_list(),
            "hideColumns": pySparkHyperParameterResultObj.get_hide_columns(),
            "metricColName": pySparkHyperParameterResultObj.get_comparison_metric_colname(),
            "columnOrder": pySparkHyperParameterResultObj.get_keep_columns()})
        bestModel = pySparkHyperParameterResultObj.getBestModel()
        prediction = pySparkHyperParameterResultObj.getBestPrediction()
        bestModelName = resultArray[0]["Model Id"]
    else:
        if levels == 2:
            evaluator = BinaryClassificationEvaluator()
        else:
            evaluator = MulticlassClassificationEvaluator()
        if use_kfold:
            validator = CrossValidator(estimator=estimator,
                                       estimatorParamMaps=paramGrid,
                                       evaluator=evaluator,
                                       numFolds=numFold)
        else:
            validator = TrainValidationSplit(estimator=estimator,
                                             estimatorParamMaps=paramGrid,
                                             evaluator=evaluator,
                                             trainRatio=train_test_ratio)
        fitted = validator.fit(trainingData)
        prediction = fitted.transform(validationData)
        bestModel = fitted.bestModel
        bestModelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1"

    MLUtils.save_pipeline_or_model(bestModel, model_filepath)

    predsAndLabels = prediction.select(['prediction', 'label']).rdd.map(tuple)
    metrics = MulticlassMetrics(predsAndLabels)
    posLabel = inverseLabelMapping[self._targetLevel]

    conf_mat_ar = metrics.confusionMatrix().toArray()
    print(conf_mat_ar)
    confusion_matrix = {}
    for i, row in enumerate(conf_mat_ar):
        confusion_matrix[labelMapping[i]] = {}
        for j, val in enumerate(row):
            confusion_matrix[labelMapping[i]][labelMapping[j]] = val
    print(confusion_matrix)

    f1_score = metrics.fMeasure(posLabel, 1.0)
    precision = metrics.precision(posLabel)
    recall = metrics.recall(posLabel)
    accuracy = metrics.accuracy
    roc_auc = 'Undefined'
    if levels == 2:
        bin_metrics = BinaryClassificationMetrics(predsAndLabels)
        roc_auc = bin_metrics.areaUnderROC
    print(f1_score, precision, recall, accuracy)

    # Gain chart implementation.
    def cal_prob_eval(x):
        # NOTE(review): the len(x) == 1 branch indexes x[1] and compares the
        # whole vector to posLabel; it looks unreachable/incorrect for real
        # probability vectors - confirm before relying on it.
        if len(x) == 1:
            if x == posLabel:
                return float(x[1])
            return float(1 - x[1])
        return float(x[int(posLabel)])

    prediction = prediction.withColumn(
        "y_prob_for_eval", udf(cal_prob_eval)(col('probability')))

    # Best effort: try the Spark implementation first, then the pandas one.
    try:
        pys_df = prediction.select(['y_prob_for_eval', 'prediction', 'label'])
        gain_lift_ks_obj = GainLiftKS(pys_df, 'y_prob_for_eval', 'prediction',
                                      'label', posLabel, self._spark)
        gain_lift_KS_dataframe = gain_lift_ks_obj.Run().toPandas()
    except Exception:
        try:
            temp_df = pys_df.toPandas()
            gain_lift_ks_obj = GainLiftKS(temp_df, 'y_prob_for_eval',
                                          'prediction', 'label', posLabel,
                                          self._spark)
            gain_lift_KS_dataframe = gain_lift_ks_obj.Rank_Ordering()
        except Exception:
            print("gain chant failed")
            gain_lift_KS_dataframe = None

    objs = {"trained_model": bestModel,
            "actual": prediction.select('label'),
            "predicted": prediction.select('prediction'),
            "probability": prediction.select('probability'),
            "feature_importance": None,
            "featureList": list(categorical_columns) + list(numerical_columns),
            "labelMapping": labelMapping}

    # Percentage of validation rows per actual class.
    val_cnts = [row.asDict()
                for row in prediction.groupBy('label').count().collect()]
    prediction_split = {}
    total_nos = objs['actual'].count()
    for item in val_cnts:
        classname = labelMapping[item['label']]
        prediction_split[classname] = round(
            item['count'] * 100 / float(total_nos), 2)

    if not tuning_enabled:
        modelFilepathArr = model_filepath.split("/")[:-1]
        modelFilepathArr.append(bestModelName)
        bestModel.save("/".join(modelFilepathArr))
    runtime = round((time.time() - st_global), 2)

    # PMML export is best effort: a failure is logged and training continues.
    try:
        print(pmml_filepath)
        pmmlBuilder = PMMLBuilder(self._spark, trainingData, bestModel).putOption(
            clf, 'compact', True)
        pmmlBuilder.buildFile(pmml_filepath)
        # Context manager closes the file even if the read fails.
        with open(pmml_filepath, "r") as pmmlfile:
            pmmlText = pmmlfile.read()
        self._result_setter.update_pmml_object({self._slug: pmmlText})
    except Exception as e:
        print("PMML failed...", str(e))

    self._model_summary = MLModelSummary()
    self._model_summary.set_algorithm_name("Spark ML Multilayer Perceptron")
    self._model_summary.set_algorithm_display_name("Spark ML Multilayer Perceptron")
    self._model_summary.set_slug(self._slug)
    self._model_summary.set_training_time(runtime)
    self._model_summary.set_confusion_matrix(confusion_matrix)
    self._model_summary.set_feature_importance(objs["feature_importance"])
    self._model_summary.set_feature_list(objs["featureList"])
    self._model_summary.set_model_accuracy(accuracy)
    self._model_summary.set_training_time(round((time.time() - st), 2))
    self._model_summary.set_precision_recall_stats([precision, recall])
    self._model_summary.set_model_precision(precision)
    self._model_summary.set_model_recall(recall)
    self._model_summary.set_target_variable(result_column)
    self._model_summary.set_prediction_split(prediction_split)
    self._model_summary.set_validation_method("KFold")
    self._model_summary.set_level_map_dict(objs["labelMapping"])
    self._model_summary.set_model_features(objs["featureList"])
    self._model_summary.set_level_counts(
        self._metaParser.get_unique_level_dict(
            list(set(categorical_columns)) + [result_column]))
    self._model_summary.set_num_trees(None)
    self._model_summary.set_num_rules(300)
    self._model_summary.set_target_level(self._targetLevel)

    modelManagementJson = {
        "Model ID": "SPMLP-" + bestModelName,
        "Project Name": self._dataframe_context.get_job_name(),
        "Algorithm": self._model_summary.get_algorithm_name(),
        "Status": 'Completed',
        "Accuracy": accuracy,
        "Runtime": runtime,
        "Created On": "",
        "Owner": "",
        "Deployment": 0,
        "Action": ''
    }

    modelDropDownObj = {
        "name": self._model_summary.get_algorithm_name(),
        "evaluationMetricValue": accuracy,
        "evaluationMetricName": "accuracy",
        "slug": self._model_summary.get_slug(),
        "Model Id": bestModelName
    }
    modelSummaryJson = {
        "dropdown": modelDropDownObj,
        "levelcount": self._model_summary.get_level_counts(),
        "modelFeatureList": self._model_summary.get_feature_list(),
        "levelMapping": self._model_summary.get_level_map_dict(),
        "slug": self._model_summary.get_slug(),
        "name": self._model_summary.get_algorithm_name()
    }

    mlpcCards = [
        json.loads(CommonUtils.convert_python_object_to_json(cardObj))
        for cardObj in MLUtils.create_model_summary_cards(self._model_summary)
    ]
    for card in mlpcCards:
        self._prediction_narrative.add_a_card(card)

    self._result_setter.set_model_summary(
        {"sparkperceptron": json.loads(
            CommonUtils.convert_python_object_to_json(self._model_summary))})
    self._result_setter.set_spark_multilayer_perceptron_model_summary(modelSummaryJson)
    self._result_setter.set_spark_multilayer_perceptron_management_summary(modelManagementJson)
    self._result_setter.set_mlpc_cards(mlpcCards)

    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context, self._scriptWeightDict, self._scriptStages,
        self._slug, "completion", "info", display=True, emptyBin=False,
        customMsg=None, weightKey="total")
def Predict(self):
    """Score ``self._data_frame`` with a trained model and analyse the result.

    Depending on ``self._mlEnv`` the model is either a saved PySpark
    pipeline + model ("spark") or a joblib-pickled estimator ("sklearn").
    The scored data is written as CSV to ``<score_path>/data.csv``, the
    considered columns are updated on the dataframe context, and the
    descriptive-stats and two-way ANOVA scripts are run on the scored data
    (each best-effort: a failure is logged and does not stop the method).

    Raises:
        ValueError: if ``self._mlEnv`` is neither "spark" nor "sklearn".
            (Previously this surfaced later as a ``NameError``.)
    """
    self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight()
    self._scriptStages = {
        "initialization": {
            "summary": "Initialized the Decision Tree Regression Scripts",
            "weight": 2
        },
        "predictionStart": {
            "summary": "Decision Tree Regression Model Prediction Started",
            "weight": 2
        },
        "predictionFinished": {
            "summary": "Decision Tree Regression Model Prediction Finished",
            "weight": 6
        }
    }
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context, self._scriptWeightDict, self._scriptStages,
        self._slug, "initialization", "info", display=True, emptyBin=False,
        customMsg=None, weightKey="total")

    SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                        sparkSession=self._spark)
    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns) - set(allDateCols))
    result_column = self._dataframe_context.get_result_column()

    def _columns_to_keep(available_columns):
        # Logs the start of the measure analysis and the columns that fall
        # outside the "consider" list, then returns the columns to keep.
        # Each print gets one pre-formatted string so the output is the
        # same under Python 2 and Python 3.
        print("STARTING Measure ANALYSIS ...")
        keep = self._dataframe_context.get_score_consider_columns()
        if len(keep) > 0:
            drop = list(set(available_columns) - set(keep))
        else:
            drop = ["predicted_probability"]
        drop = [x for x in drop
                if x in available_columns and x != result_column]
        print("columns_to_drop %s" % (drop,))
        return keep

    if self._mlEnv == "spark":
        score_data_path = self._dataframe_context.get_score_path() + "/data.csv"
        trained_model_path = "file://" + self._dataframe_context.get_model_path()
        trained_model_path += "/model"
        # The pipeline is stored next to the model directory.
        pipeline_path = "/".join(trained_model_path.split("/")[:-1]) + "/pipeline"
        print("trained_model_path %s" % (trained_model_path,))
        print("pipeline_path %s" % (pipeline_path,))
        print("score_data_path %s" % (score_data_path,))
        pipelineModel = MLUtils.load_pipeline(pipeline_path)
        trained_model = MLUtils.load_dtree_regresssion_pyspark_model(trained_model_path)
        df = self._data_frame
        indexed = pipelineModel.transform(df)
        transformed = trained_model.transform(indexed)
        # Keep an existing target column under another name so the
        # prediction can take over the target column's name.
        if result_column in transformed.columns:
            transformed = transformed.withColumnRenamed(result_column, "originalLabel")
        transformed = transformed.withColumnRenamed("prediction", result_column)
        pandas_scored_df = transformed.select(
            list(set(self._data_frame.columns + [result_column]))).toPandas()
        if score_data_path.startswith("file"):
            # Drop the leading "file://" scheme before writing locally.
            score_data_path = score_data_path[7:]
        pandas_scored_df.to_csv(score_data_path, header=True, index=False)

        columns_to_keep = _columns_to_keep(df.columns)
        spark_scored_df = transformed.select(
            list(set(columns_to_keep + [result_column])))
    elif self._mlEnv == "sklearn":
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context, self._scriptWeightDict, self._scriptStages,
            self._slug, "predictionStart", "info", display=True, emptyBin=False,
            customMsg=None, weightKey="total")
        score_data_path = self._dataframe_context.get_score_path() + "/data.csv"
        trained_model_path = "file://" + self._dataframe_context.get_model_path()
        trained_model_path += "/" + self._dataframe_context.get_model_for_scoring() + ".pkl"
        print("trained_model_path %s" % (trained_model_path,))
        print("score_data_path %s" % (score_data_path,))
        if trained_model_path.startswith("file"):
            trained_model_path = trained_model_path[7:]
        # joblib.load unpickles the file: only trusted model files belong here.
        trained_model = joblib.load(trained_model_path)
        model_columns = self._dataframe_context.get_model_features()
        print("model_columns %s" % (model_columns,))

        df = self._data_frame.toPandas()
        pandas_df = MLUtils.create_dummy_columns(
            df, [x for x in categorical_columns if x != result_column])
        pandas_df = MLUtils.fill_missing_columns(pandas_df, model_columns, result_column)
        if uid_col:
            pandas_df = pandas_df[[x for x in pandas_df.columns if x != uid_col]]
        y_score = trained_model.predict(pandas_df)

        scoreKpiArray = MLUtils.get_scored_data_summary(y_score)
        kpiCard = NormalCard()
        kpiCardData = [KpiData(data=x) for x in scoreKpiArray]
        kpiCard.set_card_data(kpiCardData)
        kpiCard.set_cente_alignment(True)
        print(CommonUtils.convert_python_object_to_json(kpiCard))
        self._result_setter.set_kpi_card_regression_score(kpiCard)

        pandas_df[result_column] = y_score
        df[result_column] = y_score
        df.to_csv(score_data_path, header=True, index=False)
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context, self._scriptWeightDict, self._scriptStages,
            self._slug, "predictionFinished", "info", display=True,
            emptyBin=False, customMsg=None, weightKey="total")

        columns_to_keep = _columns_to_keep(df.columns)
        pandas_scored_df = df[list(set(columns_to_keep + [result_column]))]
        spark_scored_df = SQLctx.createDataFrame(pandas_scored_df)
    else:
        # Fail here with a clear message instead of a NameError further down.
        raise ValueError("Unsupported ML environment: %r" % (self._mlEnv,))

    # TODO update metadata for the newly created dataframe
    self._dataframe_context.update_consider_columns(columns_to_keep)
    # printSchema() prints by itself and returns None, so it is not wrapped
    # in print (that only added a stray "None" line).
    spark_scored_df.printSchema()

    df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context,
                                self._metaParser)
    df_helper.set_params()
    df = df_helper.get_data_frame()

    # The follow-up analyses are best effort: log the failure and carry on.
    try:
        fs = time.time()
        descr_stats_obj = DescriptiveStatsScript(
            df, df_helper, self._dataframe_context, self._result_setter,
            self._spark, self._prediction_narrative,
            scriptWeight=self._scriptWeightDict,
            analysisName="Descriptive analysis")
        descr_stats_obj.Run()
        print("DescriptiveStats Analysis Done in  %s  seconds." % (time.time() - fs,))
    except Exception as e:
        # Message names the analysis that actually failed (it used to say
        # "Frequency") and now includes the error.
        print("DescriptiveStats Analysis Failed %s" % (e,))

    try:
        fs = time.time()
        two_way_obj = TwoWayAnovaScript(
            df, df_helper, self._dataframe_context, self._result_setter,
            self._spark, self._prediction_narrative, self._metaParser,
            scriptWeight=self._scriptWeightDict,
            analysisName="Measure vs. Dimension")
        two_way_obj.Run()
        print("OneWayAnova Analysis Done in  %s  seconds." % (time.time() - fs,))
    except Exception as e:
        print("Anova Analysis Failed %s" % (e,))
def Train(self): st_global = time.time() CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") algosToRun = self._dataframe_context.get_algorithms_to_run() algoSetting = [ x for x in algosToRun if x.get_algorithm_slug() == self._slug ][0] categorical_columns = self._dataframe_helper.get_string_columns() uid_col = self._dataframe_context.get_uid_column() if self._metaParser.check_column_isin_ignored_suggestion(uid_col): categorical_columns = list(set(categorical_columns) - {uid_col}) allDateCols = self._dataframe_context.get_date_columns() categorical_columns = list(set(categorical_columns) - set(allDateCols)) print(categorical_columns) numerical_columns = self._dataframe_helper.get_numeric_columns() result_column = self._dataframe_context.get_result_column() model_path = self._dataframe_context.get_model_path() if model_path.startswith("file"): model_path = model_path[7:] validationDict = self._dataframe_context.get_validation_dict() print("model_path", model_path) pipeline_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/pipeline/" model_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/model" pmml_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/modelPmml" df = self._data_frame if self._mlEnv == "spark": pass elif self._mlEnv == "sklearn": model_filepath = model_path + "/" + self._slug + "/model.pkl" pmml_filepath = str(model_path) + "/" + str( self._slug) + "/traindeModel.pmml" x_train, x_test, y_train, y_test = self._dataframe_helper.get_train_test_data( ) x_train = MLUtils.create_dummy_columns( x_train, [x for x in categorical_columns if x != result_column]) x_test = MLUtils.create_dummy_columns( x_test, [x for x in categorical_columns if x != result_column]) x_test = MLUtils.fill_missing_columns(x_test, x_train.columns, result_column) 
CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "training", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") st = time.time() levels = df[result_column].unique() clf = SVC(kernel='linear', probability=True) labelEncoder = preprocessing.LabelEncoder() labelEncoder.fit(np.concatenate([y_train, y_test])) y_train = pd.Series(labelEncoder.transform(y_train)) y_test = labelEncoder.transform(y_test) classes = labelEncoder.classes_ transformed = labelEncoder.transform(classes) labelMapping = dict(list(zip(transformed, classes))) inverseLabelMapping = dict(list(zip(classes, transformed))) posLabel = inverseLabelMapping[self._targetLevel] appType = self._dataframe_context.get_app_type() print(appType, labelMapping, inverseLabelMapping, posLabel, self._targetLevel) if algoSetting.is_hyperparameter_tuning_enabled(): hyperParamInitParam = algoSetting.get_hyperparameter_params() evaluationMetricDict = { "name": hyperParamInitParam["evaluationMetric"] } evaluationMetricDict[ "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] hyperParamAlgoName = algoSetting.get_hyperparameter_algo_name() params_grid = algoSetting.get_params_dict_hyperparameter() params_grid = { k: v for k, v in list(params_grid.items()) if k in clf.get_params() } print(params_grid) if hyperParamAlgoName == "gridsearchcv": clfGrid = GridSearchCV(clf, params_grid) gridParams = clfGrid.get_params() hyperParamInitParam = { k: v for k, v in list(hyperParamInitParam.items()) if k in gridParams } clfGrid.set_params(**hyperParamInitParam) #clfGrid.fit(x_train,y_train) grid_param = {} grid_param['params'] = ParameterGrid(params_grid) #bestEstimator = clfGrid.best_estimator_ modelFilepath = "/".join(model_filepath.split("/")[:-1]) sklearnHyperParameterResultObj = SklearnGridSearchResult( grid_param, clf, x_train, x_test, y_train, y_test, appType, modelFilepath, 
levels, posLabel, evaluationMetricDict) resultArray = sklearnHyperParameterResultObj.train_and_save_models( ) self._result_setter.set_hyper_parameter_results( self._slug, resultArray) self._result_setter.set_metadata_parallel_coordinates( self._slug, { "ignoreList": sklearnHyperParameterResultObj.get_ignore_list(), "hideColumns": sklearnHyperParameterResultObj.get_hide_columns(), "metricColName": sklearnHyperParameterResultObj. get_comparison_metric_colname(), "columnOrder": sklearnHyperParameterResultObj.get_keep_columns() }) elif hyperParamAlgoName == "randomsearchcv": clfRand = RandomizedSearchCV(clf, params_grid) clfRand.set_params(**hyperParamInitParam) bestEstimator = None else: evaluationMetricDict = { "name": GLOBALSETTINGS.CLASSIFICATION_MODEL_EVALUATION_METRIC } evaluationMetricDict[ "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] self._result_setter.set_hyper_parameter_results( self._slug, None) algoParams = algoSetting.get_params_dict() algoParams = { k: v for k, v in list(algoParams.items()) if k in list(clf.get_params().keys()) } clf.set_params(**algoParams) print("!" * 50) print(clf.get_params()) print("!" 
* 50) if validationDict["name"] == "kFold": defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT[ "value"] numFold = int(validationDict["value"]) if numFold == 0: numFold = 3 kFoldClass = SkleanrKFoldResult( numFold, clf, x_train, x_test, y_train, y_test, appType, levels, posLabel, evaluationMetricDict=evaluationMetricDict) kFoldClass.train_and_save_result() kFoldOutput = kFoldClass.get_kfold_result() bestEstimator = kFoldClass.get_best_estimator() elif validationDict["name"] == "trainAndtest": clf.fit(x_train, y_train) bestEstimator = clf # clf.fit(x_train, y_train) # bestEstimator = clf trainingTime = time.time() - st y_score = bestEstimator.predict(x_test) try: y_prob = bestEstimator.predict_proba(x_test) except: y_prob = [0] * len(y_score) # overall_precision_recall = MLUtils.calculate_overall_precision_recall(y_test,y_score,targetLevel = self._targetLevel) # print overall_precision_recall accuracy = metrics.accuracy_score(y_test, y_score) if len(levels) <= 2: precision = metrics.precision_score(y_test, y_score, pos_label=posLabel, average="binary") recall = metrics.recall_score(y_test, y_score, pos_label=posLabel, average="binary") auc = metrics.roc_auc_score(y_test, y_score) elif len(levels) > 2: precision = metrics.precision_score(y_test, y_score, pos_label=posLabel, average="macro") recall = metrics.recall_score(y_test, y_score, pos_label=posLabel, average="macro") # auc = metrics.roc_auc_score(y_test,y_score,average="weighted") auc = None y_score = labelEncoder.inverse_transform(y_score) y_test = labelEncoder.inverse_transform(y_test) featureImportance = {} feature_importance = dict( sorted(zip(x_train.columns, bestEstimator.feature_importances_), key=lambda x: x[1], reverse=True)) for k, v in feature_importance.items(): feature_importance[k] = CommonUtils.round_sig(v) objs = { "trained_model": bestEstimator, "actual": y_test, "predicted": y_score, "probability": y_prob, "feature_importance": feature_importance, "featureList": list(x_train.columns), 
"labelMapping": labelMapping } if not algoSetting.is_hyperparameter_tuning_enabled(): modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1" modelFilepathArr = model_filepath.split("/")[:-1] modelFilepathArr.append(modelName + ".pkl") joblib.dump(objs["trained_model"], "/".join(modelFilepathArr)) runtime = round((time.time() - st_global), 2) try: modelPmmlPipeline = PMMLPipeline([("pretrained-estimator", objs["trained_model"])]) modelPmmlPipeline.target_field = result_column modelPmmlPipeline.active_fields = np.array( [col for col in x_train.columns if col != result_column]) sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True) pmmlfile = open(pmml_filepath, "r") pmmlText = pmmlfile.read() pmmlfile.close() self._result_setter.update_pmml_object({self._slug: pmmlText}) except: pass cat_cols = list(set(categorical_columns) - {result_column}) overall_precision_recall = MLUtils.calculate_overall_precision_recall( objs["actual"], objs["predicted"], targetLevel=self._targetLevel) self._model_summary = MLModelSummary() self._model_summary.set_algorithm_name("Svm") self._model_summary.set_algorithm_display_name( "Support Vector Machine") self._model_summary.set_slug(self._slug) self._model_summary.set_training_time(runtime) self._model_summary.set_confusion_matrix( MLUtils.calculate_confusion_matrix(objs["actual"], objs["predicted"])) self._model_summary.set_feature_importance(objs["feature_importance"]) self._model_summary.set_feature_list(objs["featureList"]) self._model_summary.set_model_accuracy( round(metrics.accuracy_score(objs["actual"], objs["predicted"]), 2)) self._model_summary.set_training_time(round((time.time() - st), 2)) self._model_summary.set_precision_recall_stats( overall_precision_recall["classwise_stats"]) self._model_summary.set_model_precision( overall_precision_recall["precision"]) self._model_summary.set_model_recall( overall_precision_recall["recall"]) self._model_summary.set_target_variable(result_column) 
self._model_summary.set_prediction_split( overall_precision_recall["prediction_split"]) self._model_summary.set_validation_method("Train and Test") self._model_summary.set_level_map_dict(objs["labelMapping"]) # self._model_summary.set_model_features(list(set(x_train.columns)-set([result_column]))) self._model_summary.set_model_features( [col for col in x_train.columns if col != result_column]) self._model_summary.set_level_counts( self._metaParser.get_unique_level_dict( list(set(categorical_columns)))) self._model_summary.set_num_trees(100) self._model_summary.set_num_rules(300) if not algoSetting.is_hyperparameter_tuning_enabled(): modelDropDownObj = { "name": self._model_summary.get_algorithm_name(), "evaluationMetricValue": self._model_summary.get_model_accuracy(), "evaluationMetricName": "accuracy", "slug": self._model_summary.get_slug(), "Model Id": modelName } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } else: modelDropDownObj = { "name": self._model_summary.get_algorithm_name(), "evaluationMetricValue": resultArray[0]["Accuracy"], "evaluationMetricName": "accuracy", "slug": self._model_summary.get_slug(), "Model Id": resultArray[0]["Model Id"] } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } svmCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_summary_cards(self._model_summary) ] for card in svmCards: self._prediction_narrative.add_a_card(card) 
self._result_setter.set_model_summary({ "svm": json.loads( CommonUtils.convert_python_object_to_json(self._model_summary)) }) self._result_setter.set_svm_model_summary(modelSummaryJson) self._result_setter.set_rf_cards(svmCards) CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "completion", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
def __init__(self, data_frame, column_name, measure_descr_stats, df_helper,
             df_context, result_setter, story_narrative,
             scriptWeight=None, analysisName=None):
    """Set up and immediately generate the descriptive-stats narrative.

    The constructor stores its collaborators, derives display strings from
    ``column_name``, emits a "statNarrativeStart" progress message, calls
    ``self._generate_narratives()``, registers the resulting nodes on the
    story narrative / result setter and finally emits "statNarrativeEnd".

    Args:
        data_frame: Data to describe. Anything exposing ``.shape`` is
            counted via ``shape[0]``; otherwise ``.count()`` is used.
        column_name: Name of the measure column being described.
        measure_descr_stats: Object providing
            ``get_five_point_summary_stats()``.
        df_helper: Helper exposing ``get_numeric_columns()``,
            ``get_string_columns()`` and ``get_timestamp_columns()``.
        df_context: Context object supplying flags, completion status,
            message URL and default analysis name / weights.
        result_setter: Receives the head node and distribution node.
        story_narrative: Receives the measure summary node.
        scriptWeight: Optional weight dict; defaults to
            ``df_context.get_measure_analysis_weight()``.
        analysisName: Optional analysis name; defaults to
            ``df_context.get_analysis_name()``.
    """
    self._story_narrative = story_narrative
    self._result_setter = result_setter
    self._column_name = column_name.lower()
    # Slicing (instead of indexing [0]) keeps an empty column name from
    # raising IndexError; non-empty names produce the same string as before.
    self._capitalized_column_name = column_name[:1].upper() + column_name[1:]
    self._measure_descr_stats = measure_descr_stats
    self._five_point_summary_stats = measure_descr_stats.get_five_point_summary_stats()
    self._data_frame = data_frame
    try:
        self._total_rows = self._data_frame.shape[0]
    except Exception:
        # No usable ``.shape`` on this frame type: fall back to ``.count()``.
        self._total_rows = self._data_frame.count()

    self._dataframe_helper = df_helper
    self._dataframe_context = df_context
    self._pandas_flag = self._dataframe_context._pandas_flag
    self._storyOnScoredData = self._dataframe_context.get_story_on_scored_data()

    # Narrative text placeholders.
    self.title = None
    self.heading = self._capitalized_column_name + ' Performance Analysis'
    self.sub_heading = "Distribution of " + self._capitalized_column_name
    self.summary = None
    self._analysis1 = None
    self._analysis2 = None
    self.analysis = None
    self.take_away = None
    self.card2 = ''
    self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
    self._highlightFlag = "|~HIGHLIGHT~|"
    self._base_dir = "/descriptive/"

    self.num_measures = len(self._dataframe_helper.get_numeric_columns())
    self.num_dimensions = len(self._dataframe_helper.get_string_columns())
    self.num_time_dimensions = len(self._dataframe_helper.get_timestamp_columns())

    self._completionStatus = self._dataframe_context.get_completion_status()
    self._messageURL = self._dataframe_context.get_message_url()
    if analysisName is None:
        self._analysisName = self._dataframe_context.get_analysis_name()
    else:
        self._analysisName = analysisName
    if scriptWeight is None:
        self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight()
    else:
        self._scriptWeightDict = scriptWeight
    self._scriptStages = {
        "statNarrativeStart": {
            "summary": "Started The Descriptive Stats Narratives",
            "weight": 0
        },
        "statNarrativeEnd": {
            "summary": "Narratives For Descriptive Stats Finished",
            "weight": 10
        },
    }

    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context, self._scriptWeightDict, self._scriptStages,
        self._analysisName, "statNarrativeStart", "info",
        display=False, emptyBin=False, customMsg=None, weightKey="narratives")

    self._measureSummaryNode = NarrativesTree()
    self._headNode = NarrativesTree()
    self._headNode.set_name("Overview")
    self._generate_narratives()
    self._story_narrative.add_a_node(self._measureSummaryNode)
    self._result_setter.set_head_node(self._headNode)
    self._result_setter.set_distribution_node(self._measureSummaryNode)

    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context, self._scriptWeightDict, self._scriptStages,
        self._analysisName, "statNarrativeEnd", "info",
        display=False, emptyBin=False, customMsg=None, weightKey="narratives")
def Predict(self):
    """Score the current data frame with a saved PyTorch regression model.

    Steps performed when ``self._mlEnv == "sklearn"`` (the "spark" branch
    is a no-op):

    1. Load the ``.pt`` model named by ``get_model_for_scoring()`` from the
       model path (on CPU).
    2. Dummy-encode categorical columns, align columns with the model
       features, drop the uid column and run a forward pass.
    3. Publish a KPI card, write the scored data to
       ``<score_path>/data.csv`` and report progress.
    4. Build a Spark frame from the scored data and run the descriptive
       stats and two-way ANOVA scripts on it; a failure in either analysis
       is reported on stdout and does not abort scoring.

    NOTE(review): the original source had no indentation; everything after
    the ``elif`` header is placed inside the "sklearn" branch because it
    depends on names bound only there - confirm against the upstream file.
    """
    self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight()
    self._scriptStages = {
        "initialization": {
            "summary": "Initialized The Neural Network (PyTorch) Scripts",
            "weight": 2
        },
        "predictionStart": {
            "summary": "Neural Network (PyTorch) Prediction Started",
            "weight": 2
        },
        "predictionFinished": {
            "summary": "Neural Network (PyTorch) Prediction Finished",
            "weight": 6
        }
    }
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context, self._scriptWeightDict, self._scriptStages,
        self._slug, "initialization", "info",
        display=True, emptyBin=False, customMsg=None, weightKey="total")

    SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                        sparkSession=self._spark)

    # Categorical columns minus the (ignored) uid column and date columns.
    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns) - set(allDateCols))
    result_column = self._dataframe_context.get_result_column()

    if self._mlEnv == "spark":
        pass
    elif self._mlEnv == "sklearn":
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context, self._scriptWeightDict,
            self._scriptStages, self._slug, "predictionStart", "info",
            display=True, emptyBin=False, customMsg=None, weightKey="total")

        score_data_path = self._dataframe_context.get_score_path() + "/data.csv"

        # Strip a leading "file://" from the raw model path. The previous
        # code prepended the scheme and then removed 7 characters, which
        # left a path that already carried the scheme unusable.
        model_dir = self._dataframe_context.get_model_path()
        if model_dir.startswith("file://"):
            model_dir = model_dir[len("file://"):]
        trained_model_path = (model_dir + "/" +
                              self._dataframe_context.get_model_for_scoring() +
                              ".pt")
        print("trained_model_path", trained_model_path)
        print("score_data_path", score_data_path)

        # SECURITY NOTE: torch.load unpickles the file; only load model
        # files produced by a trusted training run.
        trained_model = torch.load(trained_model_path,
                                   map_location=torch.device('cpu'))
        model_columns = self._dataframe_context.get_model_features()
        print("model_columns", model_columns)

        try:
            df = self._data_frame.toPandas()
        except AttributeError:
            # Frame has no ``toPandas`` - use it as is.
            # assumes it is already a pandas frame - TODO confirm
            df = self._data_frame

        feature_categoricals = [x for x in categorical_columns
                                if x != result_column]
        pandas_df = MLUtils.create_dummy_columns(df, feature_categoricals)
        pandas_df = MLUtils.fill_missing_columns(pandas_df, model_columns,
                                                 result_column)
        if uid_col:
            pandas_df = pandas_df[[x for x in pandas_df.columns
                                   if x != uid_col]]

        test_df = np.stack(
            [pandas_df[col].values for col in pandas_df.columns], 1)
        tensored_test_df = torch.tensor(test_df, dtype=torch.float)

        # Inference only: eval mode and no autograd bookkeeping.
        trained_model.eval()
        with torch.no_grad():
            outputs_test_df_tensored = trained_model(tensored_test_df.float())
        # Each output row is a one-element list; keep the scalar.
        y_score = [row[0] for row in outputs_test_df_tensored.tolist()]

        scoreKpiArray = MLUtils.get_scored_data_summary(y_score)
        kpiCard = NormalCard()
        kpiCard.set_card_data([KpiData(data=x) for x in scoreKpiArray])
        kpiCard.set_cente_alignment(True)
        print(CommonUtils.convert_python_object_to_json(kpiCard))
        self._result_setter.set_kpi_card_regression_score(kpiCard)

        pandas_df[result_column] = y_score
        df[result_column] = y_score
        df.to_csv(score_data_path, header=True, index=False)
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context, self._scriptWeightDict,
            self._scriptStages, self._slug, "predictionFinished", "info",
            display=True, emptyBin=False, customMsg=None, weightKey="total")

        print("STARTING Measure ANALYSIS ...")
        columns_to_keep = self._dataframe_context.get_score_consider_columns()
        if len(columns_to_keep) > 0:
            columns_to_drop = list(set(df.columns) - set(columns_to_keep))
        else:
            columns_to_drop = ["predicted_probability"]
        # ``columns_to_drop`` is only reported; the selection below is
        # driven by ``columns_to_keep``.
        columns_to_drop = [
            x for x in columns_to_drop
            if x in df.columns and x != result_column
        ]
        print("columns_to_drop", columns_to_drop)

        pandas_scored_df = df[list(set(columns_to_keep + [result_column]))]
        spark_scored_df = SQLctx.createDataFrame(pandas_scored_df)
        # TODO update metadata for the newly created dataframe
        self._dataframe_context.update_consider_columns(columns_to_keep)
        # printSchema() prints by itself and returns None.
        spark_scored_df.printSchema()

        df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context,
                                    self._metaParser)
        df_helper.set_params()
        df = df_helper.get_data_frame()

        # Follow-up analyses are best effort: report the failure and go on.
        try:
            fs = time.time()
            descr_stats_obj = DescriptiveStatsScript(
                df, df_helper, self._dataframe_context, self._result_setter,
                self._spark, self._prediction_narrative,
                scriptWeight=self._scriptWeightDict,
                analysisName="Descriptive analysis")
            descr_stats_obj.Run()
            print("DescriptiveStats Analysis Done in ", time.time() - fs,
                  " seconds.")
        except Exception as exc:
            print("Frequency Analysis Failed ", exc)

        try:
            fs = time.time()
            two_way_obj = TwoWayAnovaScript(
                df, df_helper, self._dataframe_context, self._result_setter,
                self._spark, self._prediction_narrative, self._metaParser,
                scriptWeight=self._scriptWeightDict,
                analysisName="Measure vs. Dimension")
            two_way_obj.Run()
            print("OneWayAnova Analysis Done in ", time.time() - fs,
                  " seconds.")
        except Exception as exc:
            print("Anova Analysis Failed", exc)
def Train(self):
    """Train a PyTorch regression network and publish its model summary.

    When ``self._mlEnv == "sklearn"`` (the "spark" branch is a no-op) the
    method:

    1. Builds dummy-encoded train/test frames from the data-frame helper.
    2. Builds the network, loss, optimizer and batch size from the
       algorithm's ``nnptr`` parameters and runs the epoch loop.
    3. Saves the network as ``<model_path>/<slug>/<modelName>.pt`` unless
       hyper-parameter tuning is enabled.
    4. Computes regression metrics, MAPE bins and prediction quantiles.
    5. Fills ``self._model_summary`` and a model-management summary, turns
       them into cards / narrative nodes and hands everything to the
       result setter, then emits the "completion" progress message.

    NOTE(review): the original source had no indentation. Everything after
    the ``elif`` header is placed inside the "sklearn" branch because it
    depends on names bound there, and the extents of the two
    tuning-dependent ``else`` blocks were reconstructed - confirm against
    the upstream file.
    """
    st_global = time.time()
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context, self._scriptWeightDict, self._scriptStages,
        self._slug, "initialization", "info",
        display=True, emptyBin=False, customMsg=None, weightKey="total")

    algosToRun = self._dataframe_context.get_algorithms_to_run()
    algoSetting = [
        x for x in algosToRun if x.get_algorithm_slug() == self._slug
    ][0]

    # Categorical columns minus the (ignored) uid column and date columns.
    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns) - set(allDateCols))
    print("CATEGORICAL COLS - ", categorical_columns)
    result_column = self._dataframe_context.get_result_column()

    model_path = self._dataframe_context.get_model_path()
    if model_path.startswith("file"):
        model_path = model_path[7:]
    validationDict = self._dataframe_context.get_validation_dict()
    print("model_path", model_path)
    pmml_filepath = ("file://" + str(model_path) + "/" + str(self._slug) +
                     "/modelPmml")

    if self._mlEnv == "spark":
        pass
    elif self._mlEnv == "sklearn":
        tuning_enabled = algoSetting.is_hyperparameter_tuning_enabled()

        # ---- data preparation -------------------------------------------
        x_train, x_test, y_train, y_test = \
            self._dataframe_helper.get_train_test_data()
        feature_categoricals = [x for x in categorical_columns
                                if x != result_column]
        x_train = MLUtils.create_dummy_columns(x_train, feature_categoricals)
        x_test = MLUtils.create_dummy_columns(x_test, feature_categoricals)
        x_test = MLUtils.fill_missing_columns(x_test, x_train.columns,
                                              result_column)
        print("=" * 150)
        print("X-Train Shape - ", x_train.shape)
        print("Y-Train Shape - ", y_train.shape)
        print("X-Test Shape - ", x_test.shape)
        print("Y-Test Shape - ", y_test.shape)
        print("~" * 50)
        print("X-Train dtype - ", type(x_train))
        print("Y-Train dtype - ", type(y_train))
        print("X-Test dtype - ", type(x_test))
        print("Y-Test dtype - ", type(y_test))
        print("~" * 50)

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context, self._scriptWeightDict,
            self._scriptStages, self._slug, "training", "info",
            display=True, emptyBin=False, customMsg=None, weightKey="total")
        st = time.time()
        self._result_setter.set_hyper_parameter_results(self._slug, None)

        # NOTE(review): the metric configured on ``algoSetting`` was fetched
        # and immediately overwritten in the original; the global default
        # is what is actually used.
        evaluationMetricDict = {
            "name": GLOBALSETTINGS.REGRESSION_MODEL_EVALUATION_METRIC
        }
        evaluationMetricDict["displayName"] = \
            GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[
                evaluationMetricDict["name"]]

        # ---- network construction ---------------------------------------
        x_train_tensored, y_train_tensored, x_test_tensored, y_test_tensored = \
            PYTORCHUTILS.get_tensored_data(x_train, y_train, x_test, y_test)
        trainset = torch_data_utils.TensorDataset(x_train_tensored,
                                                  y_train_tensored)
        nnptr_params = algoSetting.get_nnptr_params_dict()[0]
        layers_for_network = PYTORCHUTILS.get_layers_for_network_module(
            nnptr_params, task_type="REGRESSION",
            first_layer_units=x_train.shape[1])
        # Use GPU if available
        device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        network = PyTorchNetwork(layers_for_network).to(device)
        # NOTE(review): the network is put in eval mode *before* training
        # and never switched back, so dropout / batch-norm layers use their
        # inference behaviour during training. Kept as is because changing
        # it alters training results - confirm whether this is intended.
        network.eval()
        other_params_dict = PYTORCHUTILS.get_other_pytorch_params(
            nnptr_params, task_type="REGRESSION",
            network_params=network.parameters())
        print("~" * 50)
        print("NNPTR-PARAMS - ", nnptr_params)
        print("~" * 50)
        print("OTHER-PARAMS-DICT - ", other_params_dict)
        print("~" * 50)
        print("NEURAL-NETWORK - ", network)
        print("~" * 50)

        criterion = other_params_dict["loss_criterion"]
        n_epochs = other_params_dict["number_of_epochs"]
        batch_size = other_params_dict["batch_size"]
        optimizer = other_params_dict["optimizer"]
        train_loader = torch_data_utils.DataLoader(
            trainset, batch_size=batch_size, shuffle=True)

        # ---- training loop ----------------------------------------------
        # Batchnormalization(num_features) should be equal to units_op for
        # that layer in training config; else -->
        # RuntimeError('running_mean should contain 100 elements not 200',)
        for epoch in range(n_epochs):
            batchwise_losses = []
            for inputs, labels in train_loader:
                inputs = inputs.to(device)
                labels = labels.to(device)
                # Zero the parameter gradients
                optimizer.zero_grad()
                # Forward + backward + optimize
                outputs = network(inputs.float())
                loss = criterion(outputs, labels.float())
                loss.backward()
                optimizer.step()
                batchwise_losses.append(loss.item())
            # Guard against an empty loader instead of reading a stale or
            # unbound loop index.
            average_loss_per_epoch = (sum(batchwise_losses) /
                                      max(len(batchwise_losses), 1))
            print("+" * 80)
            print("EPOCH - ", epoch)
            print("BATCHWISE_LOSSES shape - ", len(batchwise_losses))
            print("AVERAGE LOSS PER EPOCH - ", average_loss_per_epoch)
            print("+" * 80)
        trainingTime = time.time() - st

        # ---- evaluation on the test split -------------------------------
        # The test tensor must live on the same device as the network,
        # otherwise the forward pass fails when CUDA is used.
        with torch.no_grad():
            outputs_x_test_tensored = network(
                x_test_tensored.float().to(device))
        # Each output row is a one-element list; keep the scalar.
        y_score = [row[0] for row in outputs_x_test_tensored.tolist()]
        print("Y-SCORE - ", y_score)
        print("Y-SCORE length - ", len(y_score))

        featuresArray = []  # no feature importances for this model type

        # ``modelName`` is needed by the dropdown below in every case, so it
        # is bound unconditionally; the model file is only written when
        # tuning is disabled (as before). The former per-branch ``runtime``
        # assignments were dead (overwritten below) and one of them read the
        # undefined name ``hyper_st``.
        modelName = ("M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) +
                     "1")
        if not tuning_enabled:
            torch.save(network,
                       model_path + "/" + self._slug + "/" + modelName + ".pt")

        def _try_export_pmml(target_path):
            # Best-effort PMML export; any failure is reported and ignored.
            try:
                modelPmmlPipeline = PMMLPipeline([("pretrained-estimator",
                                                   network)])
                modelPmmlPipeline.target_field = result_column
                modelPmmlPipeline.active_fields = np.array(
                    [col for col in x_train.columns if col != result_column])
                sklearn2pmml(modelPmmlPipeline, target_path, with_repr=True)
                with open(target_path, "r") as pmmlfile:
                    pmmlText = pmmlfile.read()
                self._result_setter.update_pmml_object({self._slug: pmmlText})
            except Exception as exc:
                print("PMML export skipped - ", exc)

        _try_export_pmml(pmml_filepath)

        # ---- regression metrics -----------------------------------------
        metrics = {}
        metrics["r2"] = r2_score(y_test, y_score)
        metrics["neg_mean_squared_error"] = mean_squared_error(
            y_test, y_score)
        metrics["neg_mean_absolute_error"] = mean_absolute_error(
            y_test, y_score)
        metrics["RMSE"] = sqrt(metrics["neg_mean_squared_error"])
        metrics["explained_variance_score"] = explained_variance_score(
            y_test, y_score)

        transformed = pd.DataFrame({
            "prediction": y_score,
            result_column: y_test
        })
        print("TRANSFORMED PREDICTION TYPE - ",
              type(transformed["prediction"]))
        print(transformed["prediction"])
        print("TRANSFORMED RESULT COL TYPE - ",
              type(transformed[result_column]))
        print(transformed[result_column])
        transformed["difference"] = (transformed[result_column] -
                                     transformed["prediction"])
        transformed["mape"] = (np.abs(transformed["difference"]) * 100 /
                               transformed[result_column])

        nrows = transformed.shape[0]
        if nrows > 100:
            sampleData = transformed.sample(n=100, random_state=420)
        else:
            sampleData = transformed
        print(sampleData.head())

        # Work on a copy of the shared bin edges: the previous append/pop on
        # GLOBALSETTINGS.MAPEBINS left the global list modified whenever
        # pd.cut raised, and pop(5) hard-coded its length.
        mape_bins = list(GLOBALSETTINGS.MAPEBINS)
        max_mape = transformed["mape"].max()
        if max_mape > 100:
            mape_bins.append(max_mape)
        mapeCountArr = list(
            pd.cut(transformed["mape"], mape_bins)
            .value_counts().to_dict().items())
        mapeStatsArr = [(str(idx), dictObj) for idx, dictObj in enumerate(
            sorted([{
                "count": x[1],
                "splitRange": (x[0].left, x[0].right)
            } for x in mapeCountArr], key=lambda x: x["splitRange"][0]))]
        print(mapeStatsArr)
        print(mapeCountArr)

        predictionColSummary = transformed["prediction"].describe().to_dict()
        quantileBins = [
            predictionColSummary["min"], predictionColSummary["25%"],
            predictionColSummary["50%"], predictionColSummary["75%"],
            predictionColSummary["max"]
        ]
        print(quantileBins)
        quantileBins = sorted(list(set(quantileBins)))
        transformed["quantileBinId"] = pd.cut(transformed["prediction"],
                                              quantileBins)
        quantileDf = transformed.groupby("quantileBinId").agg({
            "prediction": [np.sum, np.mean, np.size]
        }).reset_index()
        quantileDf.columns = ["prediction", "sum", "mean", "count"]
        print(quantileDf)
        quantileArr = list(quantileDf.T.to_dict().items())
        quantileSummaryArr = [(obj[0], {
            "splitRange": (obj[1]["prediction"].left,
                           obj[1]["prediction"].right),
            "count": obj[1]["count"],
            "mean": obj[1]["mean"],
            "sum": obj[1]["sum"]
        }) for obj in quantileArr]
        print(quantileSummaryArr)

        # ---- model summary ----------------------------------------------
        runtime = round((time.time() - st_global), 2)
        self._model_summary.set_model_type("regression")
        self._model_summary.set_algorithm_name("Neural Network (PyTorch)")
        self._model_summary.set_algorithm_display_name(
            "Neural Network (PyTorch)")
        self._model_summary.set_slug(self._slug)
        # NOTE(review): the first training-time value is immediately
        # replaced by the second call; both kept as in the original.
        self._model_summary.set_training_time(runtime)
        self._model_summary.set_training_time(trainingTime)
        self._model_summary.set_target_variable(result_column)
        self._model_summary.set_validation_method(
            validationDict["displayName"])
        self._model_summary.set_model_evaluation_metrics(metrics)
        self._model_summary.set_model_params(nnptr_params)
        self._model_summary.set_quantile_summary(quantileSummaryArr)
        self._model_summary.set_mape_stats(mapeStatsArr)
        self._model_summary.set_sample_data(sampleData.to_dict())
        self._model_summary.set_feature_importance(featuresArray)
        self._model_summary.set_feature_list(list(x_train.columns))
        self._model_summary.set_model_mse(
            metrics["neg_mean_squared_error"])
        self._model_summary.set_model_mae(
            metrics["neg_mean_absolute_error"])
        self._model_summary.set_rmse(metrics["RMSE"])
        self._model_summary.set_model_rsquared(metrics["r2"])
        self._model_summary.set_model_exp_variance_score(
            metrics["explained_variance_score"])

        _try_export_pmml(
            str(model_path) + "/" + str(self._slug) + "/traindeModel.pmml")

        # Both tuning branches built identical objects; build them once.
        modelDropDownObj = {
            "name": self._model_summary.get_algorithm_name(),
            "evaluationMetricValue": metrics[evaluationMetricDict["name"]],
            "evaluationMetricName": evaluationMetricDict["name"],
            "slug": self._model_summary.get_slug(),
            "Model Id": modelName
        }
        modelSummaryJson = {
            "dropdown": modelDropDownObj,
            "levelcount": self._model_summary.get_level_counts(),
            "modelFeatureList": self._model_summary.get_feature_list(),
            "levelMapping": self._model_summary.get_level_map_dict(),
            "slug": self._model_summary.get_slug(),
            "name": self._model_summary.get_algorithm_name()
        }

        # ---- model management summary -----------------------------------
        modelmanagement_ = nnptr_params
        self._model_management = MLModelSummary()
        if not tuning_enabled:
            self._model_management.set_layer_info(
                data=modelmanagement_['hidden_layer_info'])
            self._model_management.set_loss_function(
                data=modelmanagement_['loss'])
            self._model_management.set_optimizer(
                data=modelmanagement_['optimizer'])
            self._model_management.set_batch_size(
                data=modelmanagement_['batch_size'])
            self._model_management.set_no_epochs(
                data=modelmanagement_['number_of_epochs'])
        self._model_management.set_job_type(
            self._dataframe_context.get_job_name())  # Project name
        self._model_management.set_training_status(
            data="completed")  # training status
        self._model_management.set_no_of_independent_variables(
            data=x_train)  # no of independent variables
        self._model_management.set_training_time(runtime)  # run time
        self._model_management.set_rmse(metrics["RMSE"])
        # Was "Neural Network (TensorFlow)" - a copy/paste slip in this
        # PyTorch script; aligned with the model summary's algorithm name.
        self._model_management.set_algorithm_name(
            "Neural Network (PyTorch)")  # algorithm name
        self._model_management.set_validation_method(
            str(validationDict["displayName"]) + "(" +
            str(validationDict["value"]) + ")")  # validation method
        self._model_management.set_target_variable(
            result_column)  # target column name
        self._model_management.set_creation_date(data=str(
            datetime.now().strftime('%b %d ,%Y %H:%M ')))  # creation date
        self._model_management.set_datasetName(self._datasetName)

        modelManagementSummaryJson = [
            ["Project Name", self._model_management.get_job_type()],
            ["Algorithm", self._model_management.get_algorithm_name()],
            ["Training Status",
             self._model_management.get_training_status()],
            ["RMSE", self._model_management.get_rmse()],
            ["RunTime", self._model_management.get_training_time()],
            ["Created On", self._model_management.get_creation_date()]
        ]
        if tuning_enabled:
            modelManagementModelSettingsJson = []
        else:
            modelManagementModelSettingsJson = [
                ["Training Dataset",
                 self._model_management.get_datasetName()],
                ["Target Column",
                 self._model_management.get_target_variable()],
                ["Number Of Independent Variables",
                 self._model_management.get_no_of_independent_variables()],
                ["Algorithm", self._model_management.get_algorithm_name()],
                ["Model Validation",
                 self._model_management.get_validation_method()],
                ["batch_size",
                 str(self._model_management.get_batch_size())],
                ["Loss", self._model_management.get_loss_function()],
                ["Optimizer", self._model_management.get_optimizer()],
                ["Epochs", self._model_management.get_no_epochs()],
                ["Metrics",
                 self._model_management.get_model_evaluation_metrics()]
            ]
            # One "<layer> <index>:" row per hidden layer listing its
            # settings as "name:value, " pairs.
            hidden_layers = modelmanagement_["hidden_layer_info"]
            for layer_idx in hidden_layers:
                layer_cfg = hidden_layers[layer_idx]
                key = str(layer_cfg["layer"]) + " " + str(layer_idx) + ":"
                string = "".join(
                    str(name) + ":" + str(layer_cfg[name]) + ", "
                    for name in layer_cfg)
                modelManagementModelSettingsJson.append([key, string])
        print(modelManagementModelSettingsJson)

        # ---- cards and narrative nodes ----------------------------------
        def _cards_to_json(card_objs):
            # Round-trip every card through the project's JSON serializer.
            return [
                json.loads(CommonUtils.convert_python_object_to_json(cardObj))
                for cardObj in card_objs
            ]

        nnptrCards = _cards_to_json(
            MLUtils.create_model_summary_cards(self._model_summary))
        nnptrPerformanceCards = _cards_to_json(
            MLUtils.create_model_management_cards_regression(
                self._model_summary))
        nnptrOverviewCards = _cards_to_json(
            MLUtils.create_model_management_card_overview(
                self._model_management, modelManagementSummaryJson,
                modelManagementModelSettingsJson))
        nnptrDeploymentCards = _cards_to_json(
            MLUtils.create_model_management_deploy_empty_card())

        nnptr_Overview_Node = NarrativesTree()
        nnptr_Overview_Node.set_name("Overview")
        nnptr_Performance_Node = NarrativesTree()
        nnptr_Performance_Node.set_name("Performance")
        nnptr_Deployment_Node = NarrativesTree()
        nnptr_Deployment_Node.set_name("Deployment")
        for card in nnptrOverviewCards:
            nnptr_Overview_Node.add_a_card(card)
        for card in nnptrPerformanceCards:
            nnptr_Performance_Node.add_a_card(card)
        for card in nnptrDeploymentCards:
            nnptr_Deployment_Node.add_a_card(card)
        for card in nnptrCards:
            self._prediction_narrative.add_a_card(card)

        # ---- publish results --------------------------------------------
        self._result_setter.set_model_summary({
            "Neural Network (PyTorch)":
            json.loads(
                CommonUtils.convert_python_object_to_json(self._model_summary))
        })
        self._result_setter.set_nnptr_regression_model_summary(
            modelSummaryJson)
        self._result_setter.set_nnptr_cards(nnptrCards)
        self._result_setter.set_nnptr_nodes([
            nnptr_Overview_Node, nnptr_Performance_Node,
            nnptr_Deployment_Node
        ])
        self._result_setter.set_nnptr_fail_card({
            "Algorithm_Name": "Neural Network (PyTorch)",
            "Success": "True"
        })
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context, self._scriptWeightDict,
            self._scriptStages, self._slug, "completion", "info",
            display=True, emptyBin=False, customMsg=None, weightKey="total")
def Train(self):
    """Train the TensorFlow (Keras) regression network and publish its results.

    Builds a ``tf.keras`` Sequential model from the layer description returned
    by ``algoSetting.get_tf_params_dict()``, fits it on the training split,
    scores the test split, computes regression metrics, MAPE and quantile
    summaries, and pushes the summaries/cards to ``self._result_setter`` and
    ``self._prediction_narrative``.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source. As reconstructed, only the ``sklearn``
    environment without hyper-parameter tuning defines every name used later
    (``bestEstimator``, ``modelName``, ``evaluationMetricDict`` ...); confirm
    against the original layout.
    """
    st_global = time.time()
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context,
        self._scriptWeightDict,
        self._scriptStages,
        self._slug,
        "initialization",
        "info",
        display=True,
        emptyBin=False,
        customMsg=None,
        weightKey="total")

    appType = self._dataframe_context.get_app_type()
    algosToRun = self._dataframe_context.get_algorithms_to_run()
    algoSetting = [
        x for x in algosToRun if x.get_algorithm_slug() == self._slug
    ][0]

    # Categorical predictors: string columns minus an ignored uid column and
    # minus every date column.
    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns) - set(allDateCols))
    print(categorical_columns)

    result_column = self._dataframe_context.get_result_column()
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    numerical_columns = [x for x in numerical_columns if x != result_column]

    model_path = self._dataframe_context.get_model_path()
    if model_path.startswith("file"):
        # Drop the first 7 characters (the length of "file://").
        model_path = model_path[7:]
    validationDict = self._dataframe_context.get_validation_dict()
    print("model_path", model_path)
    model_filepath = "file://" + str(model_path) + "/" + str(
        self._slug) + "/model"

    if self._mlEnv == "spark":
        pass
    elif self._mlEnv == "sklearn":
        model_filepath = model_path + "/" + self._slug + "/model.pkl"
        x_train, x_test, y_train, y_test = \
            self._dataframe_helper.get_train_test_data()
        dummy_source_columns = [
            x for x in categorical_columns if x != result_column
        ]
        x_train = MLUtils.create_dummy_columns(x_train, dummy_source_columns)
        x_test = MLUtils.create_dummy_columns(x_test, dummy_source_columns)
        x_test = MLUtils.fill_missing_columns(x_test, x_train.columns,
                                              result_column)

        st = time.time()
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "training",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")

        if algoSetting.is_hyperparameter_tuning_enabled():
            pass
        else:
            self._result_setter.set_hyper_parameter_results(self._slug, None)
            evaluationMetricDict = algoSetting.get_evaluvation_metric(
                Type="Regression")
            evaluationMetricDict["displayName"] = \
                GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[
                    evaluationMetricDict["name"]]
            params_tf = algoSetting.get_tf_params_dict()
            algoParams = algoSetting.get_params_dict()
            algoParams = {k: v for k, v in list(algoParams.items())}

            model = tf.keras.models.Sequential()
            first_layer_flag = True
            # Layers are keyed by their position rendered as a string.
            for i in range(len(list(params_tf['hidden_layer_info'].keys()))):
                layer_cfg = params_tf['hidden_layer_info'][str(i)]
                layer_type = layer_cfg["layer"]
                if layer_type == "Dense":
                    dense_kwargs = {
                        "activation": layer_cfg["activation"],
                        "use_bias": layer_cfg["use_bias"],
                        "kernel_initializer":
                        layer_cfg["kernel_initializer"],
                        "bias_initializer": layer_cfg["bias_initializer"],
                        "kernel_regularizer":
                        layer_cfg["kernel_regularizer"],
                        "bias_regularizer": layer_cfg["bias_regularizer"],
                        "activity_regularizer":
                        layer_cfg["activity_regularizer"],
                        "kernel_constraint": layer_cfg["kernel_constraint"],
                        "bias_constraint": layer_cfg["bias_constraint"],
                    }
                    if first_layer_flag:
                        # Only the first Dense layer declares the input width.
                        dense_kwargs["input_shape"] = (len(x_train.columns), )
                        first_layer_flag = False
                    model.add(
                        tf.keras.layers.Dense(layer_cfg["units"],
                                              **dense_kwargs))
                    # Batch normalisation is best-effort: a missing key or a
                    # failing add is reported and training continues.
                    try:
                        if layer_cfg["batch_normalization"] == "True":
                            model.add(tf.keras.layers.BatchNormalization())
                    except Exception:
                        print("BATCH_NORM_FAILED ##########################")
                elif layer_type == "Dropout":
                    model.add(
                        tf.keras.layers.Dropout(float(layer_cfg["rate"])))
                elif layer_type == "Lambda":
                    operation = layer_cfg["lambda"]
                    if operation in ("Addition", "Multiplication",
                                     "Subtraction", "Division"):
                        # Bind the operand now (as a default argument): a
                        # closure over the loop variable would read whichever
                        # layer the loop ended on when the layer is called
                        # again later.
                        operand = int(layer_cfg["units"])
                        if operation == "Addition":
                            model.add(
                                tf.keras.layers.Lambda(
                                    lambda x, k=operand: x + k))
                        elif operation == "Multiplication":
                            model.add(
                                tf.keras.layers.Lambda(
                                    lambda x, k=operand: x * k))
                        elif operation == "Subtraction":
                            model.add(
                                tf.keras.layers.Lambda(
                                    lambda x, k=operand: x - k))
                        else:
                            model.add(
                                tf.keras.layers.Lambda(
                                    lambda x, k=operand: old_div(x, k)))

            model.compile(optimizer=algoParams["optimizer"],
                          loss=algoParams["loss"],
                          metrics=[algoParams['metrics']])
            model.fit(x_train,
                      y_train,
                      epochs=algoParams["number_of_epochs"],
                      verbose=1,
                      batch_size=algoParams["batch_size"])
            bestEstimator = model
            print(model.summary())

        trainingTime = time.time() - st
        y_score = bestEstimator.predict(x_test)
        y_score = list(y_score.flatten())
        try:
            y_prob = bestEstimator.predict_proba(x_test)
        except Exception:
            # Fall back to a zero placeholder per prediction.
            y_prob = [0] * len(y_score)
        featureImportance = {}
        objs = {
            "trained_model": bestEstimator,
            "actual": y_test,
            "predicted": y_score,
            "probability": y_prob,
            "feature_importance": featureImportance,
            "featureList": list(x_train.columns),
            "labelMapping": {}
        }
        featuresArray = []

        if not algoSetting.is_hyperparameter_tuning_enabled():
            modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH -
                                     1) + "1"
            # Save next to model_filepath, replacing its last path component.
            modelFilepathArr = model_filepath.split("/")[:-1]
            modelFilepathArr.append(modelName + ".h5")
            objs["trained_model"].save("/".join(modelFilepathArr))

        # NOTE: the "neg_*" keys hold the plain (non-negated) error values.
        metrics = {}
        metrics["r2"] = r2_score(y_test, y_score)
        metrics["neg_mean_squared_error"] = mean_squared_error(
            y_test, y_score)
        metrics["neg_mean_absolute_error"] = mean_absolute_error(
            y_test, y_score)
        metrics["RMSE"] = sqrt(metrics["neg_mean_squared_error"])
        metrics["explained_variance_score"] = explained_variance_score(
            y_test, y_score)

        transformed = pd.DataFrame({
            "prediction": y_score,
            result_column: y_test
        })
        transformed["difference"] = transformed[result_column] - transformed[
            "prediction"]
        transformed["mape"] = old_div(
            np.abs(transformed["difference"]) * 100,
            transformed[result_column])

        nrows = transformed.shape[0]
        if nrows > 100:
            sampleData = transformed.sample(n=100, random_state=420)
        else:
            sampleData = transformed
        print(sampleData.head())

        # Work on a private copy of the shared bin edges so the module-level
        # list is never mutated (previously: append + pop(5), which left the
        # global list modified whenever pd.cut raised).
        mapeBins = list(GLOBALSETTINGS.MAPEBINS)
        maxMape = transformed["mape"].max()
        if maxMape > 100:
            mapeBins.append(maxMape)
        mapeCountArr = list(
            pd.cut(transformed["mape"],
                   mapeBins).value_counts().to_dict().items())
        mapeStatsArr = [(str(idx), dictObj) for idx, dictObj in enumerate(
            sorted([{
                "count": x[1],
                "splitRange": (x[0].left, x[0].right)
            } for x in mapeCountArr],
                   key=lambda x: x["splitRange"][0]))]
        print(mapeStatsArr)
        print(mapeCountArr)

        predictionColSummary = transformed["prediction"].describe().to_dict()
        quantileBins = [
            predictionColSummary["min"], predictionColSummary["25%"],
            predictionColSummary["50%"], predictionColSummary["75%"],
            predictionColSummary["max"]
        ]
        print(quantileBins)
        # De-duplicate edges: pd.cut is given only distinct, sorted values.
        quantileBins = sorted(list(set(quantileBins)))
        transformed["quantileBinId"] = pd.cut(transformed["prediction"],
                                              quantileBins)
        quantileDf = transformed.groupby("quantileBinId").agg({
            "prediction": [np.sum, np.mean, np.size]
        }).reset_index()
        # After the rename, the "prediction" column holds the bin interval.
        quantileDf.columns = ["prediction", "sum", "mean", "count"]
        print(quantileDf)
        quantileArr = list(quantileDf.T.to_dict().items())
        quantileSummaryArr = [(obj[0], {
            "splitRange":
            (obj[1]["prediction"].left, obj[1]["prediction"].right),
            "count": obj[1]["count"],
            "mean": obj[1]["mean"],
            "sum": obj[1]["sum"]
        }) for obj in quantileArr]
        print(quantileSummaryArr)

        runtime = round((time.time() - st_global), 2)
        self._model_summary.set_model_type("regression")
        self._model_summary.set_algorithm_name("Neural Network (TensorFlow)")
        self._model_summary.set_algorithm_display_name(
            "Neural Network (TensorFlow)")
        self._model_summary.set_slug(self._slug)
        # NOTE(review): the training time is set twice; presumably the second
        # call (fit-only duration) wins - confirm the setter's semantics.
        self._model_summary.set_training_time(runtime)
        self._model_summary.set_training_time(trainingTime)
        self._model_summary.set_target_variable(result_column)
        self._model_summary.set_validation_method(
            validationDict["displayName"])
        self._model_summary.set_model_evaluation_metrics(metrics)
        self._model_summary.set_model_params(params_tf)
        self._model_summary.set_quantile_summary(quantileSummaryArr)
        self._model_summary.set_mape_stats(mapeStatsArr)
        self._model_summary.set_sample_data(sampleData.to_dict())
        self._model_summary.set_feature_importance(featuresArray)
        self._model_summary.set_feature_list(list(x_train.columns))
        self._model_summary.set_model_mse(metrics["neg_mean_squared_error"])
        self._model_summary.set_model_mae(metrics["neg_mean_absolute_error"])
        self._model_summary.set_rmse(metrics["RMSE"])
        self._model_summary.set_model_rsquared(metrics["r2"])
        self._model_summary.set_model_exp_variance_score(
            metrics["explained_variance_score"])

        # PMML export is best-effort: a failure must not abort training, but
        # it is reported instead of being swallowed silently.
        try:
            pmml_filepath = str(model_path) + "/" + str(
                self._slug) + "/traindeModel.pmml"
            modelPmmlPipeline = PMMLPipeline([("pretrained-estimator",
                                               objs["trained_model"])])
            modelPmmlPipeline.target_field = result_column
            modelPmmlPipeline.active_fields = np.array(
                [col for col in x_train.columns if col != result_column])
            sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True)
            with open(pmml_filepath, "r") as pmmlfile:
                pmmlText = pmmlfile.read()
            self._result_setter.update_pmml_object({self._slug: pmmlText})
        except Exception as pmml_error:
            print("PMML export skipped:", pmml_error)

    # Both hyper-parameter branches built identical objects, so they are
    # built once here.
    modelDropDownObj = {
        "name": self._model_summary.get_algorithm_name(),
        "evaluationMetricValue": metrics[evaluationMetricDict["name"]],
        "evaluationMetricName": evaluationMetricDict["name"],
        "slug": self._model_summary.get_slug(),
        "Model Id": modelName
    }
    modelSummaryJson = {
        "dropdown": modelDropDownObj,
        "levelcount": self._model_summary.get_level_counts(),
        "modelFeatureList": self._model_summary.get_feature_list(),
        "levelMapping": self._model_summary.get_level_map_dict(),
        "slug": self._model_summary.get_slug(),
        "name": self._model_summary.get_algorithm_name()
    }

    # Same dict object as params_tf: the update also changes params_tf.
    modelmanagement_ = params_tf
    modelmanagement_.update(algoParams)

    self._model_management = MLModelSummary()
    if algoSetting.is_hyperparameter_tuning_enabled():
        pass
    else:
        self._model_management.set_layer_info(
            data=modelmanagement_['hidden_layer_info'])
        self._model_management.set_loss_function(
            data=modelmanagement_['loss'])
        self._model_management.set_optimizer(
            data=modelmanagement_['optimizer'])
        self._model_management.set_batch_size(
            data=modelmanagement_['batch_size'])
        self._model_management.set_no_epochs(
            data=modelmanagement_['number_of_epochs'])
        self._model_management.set_model_evaluation_metrics(
            data=modelmanagement_['metrics'])
        # Project name
        self._model_management.set_job_type(
            self._dataframe_context.get_job_name())
        self._model_management.set_training_status(data="completed")
        self._model_management.set_no_of_independent_variables(data=x_train)
        self._model_management.set_training_time(runtime)
        self._model_management.set_rmse(metrics["RMSE"])
        self._model_management.set_algorithm_name(
            "Neural Network (TensorFlow)")
        self._model_management.set_validation_method(
            str(validationDict["displayName"]) + "(" +
            str(validationDict["value"]) + ")")
        self._model_management.set_target_variable(result_column)
        self._model_management.set_creation_date(
            data=str(datetime.now().strftime('%b %d ,%Y %H:%M ')))
        self._model_management.set_datasetName(self._datasetName)

    modelManagementSummaryJson = [
        ["Project Name", self._model_management.get_job_type()],
        ["Algorithm", self._model_management.get_algorithm_name()],
        ["Training Status", self._model_management.get_training_status()],
        ["RMSE", self._model_management.get_rmse()],
        ["RunTime", self._model_management.get_training_time()],
        ["Created On", self._model_management.get_creation_date()]
    ]
    if algoSetting.is_hyperparameter_tuning_enabled():
        modelManagementModelSettingsJson = []
    else:
        modelManagementModelSettingsJson = [
            ["Training Dataset", self._model_management.get_datasetName()],
            ["Target Column", self._model_management.get_target_variable()],
            [
                "Number Of Independent Variables",
                self._model_management.get_no_of_independent_variables()
            ],
            ["Algorithm", self._model_management.get_algorithm_name()],
            [
                "Model Validation",
                self._model_management.get_validation_method()
            ],
            ["batch_size", str(self._model_management.get_batch_size())],
            ["Loss", self._model_management.get_loss_function()],
            ["Optimizer", self._model_management.get_optimizer()],
            ["Epochs", self._model_management.get_no_epochs()],
            [
                "Metrics",
                self._model_management.get_model_evaluation_metrics()
            ]
        ]

    # One settings row per (layer, attribute) pair.
    for i in range(len(list(modelmanagement_['hidden_layer_info'].keys()))):
        layer_settings = modelmanagement_["hidden_layer_info"][str(i)]
        key = "layer No-" + str(i) + "-" + str(layer_settings["layer"] + "-")
        for j in layer_settings:
            modelManagementModelSettingsJson.append(
                [key + j + ":", layer_settings[j]])
    print(modelManagementModelSettingsJson)

    tfregCards = [
        json.loads(CommonUtils.convert_python_object_to_json(cardObj))
        for cardObj in MLUtils.create_model_summary_cards(
            self._model_summary)
    ]
    tfregPerformanceCards = [
        json.loads(CommonUtils.convert_python_object_to_json(cardObj))
        for cardObj in MLUtils.create_model_management_cards_regression(
            self._model_summary)
    ]
    tfregOverviewCards = [
        json.loads(CommonUtils.convert_python_object_to_json(cardObj))
        for cardObj in MLUtils.create_model_management_card_overview(
            self._model_management, modelManagementSummaryJson,
            modelManagementModelSettingsJson)
    ]
    tfregDeploymentCards = [
        json.loads(CommonUtils.convert_python_object_to_json(cardObj))
        for cardObj in MLUtils.create_model_management_deploy_empty_card()
    ]

    TFReg_Overview_Node = NarrativesTree()
    TFReg_Overview_Node.set_name("Overview")
    TFReg_Performance_Node = NarrativesTree()
    TFReg_Performance_Node.set_name("Performance")
    TFReg_Deployment_Node = NarrativesTree()
    TFReg_Deployment_Node.set_name("Deployment")
    for card in tfregOverviewCards:
        TFReg_Overview_Node.add_a_card(card)
    for card in tfregPerformanceCards:
        TFReg_Performance_Node.add_a_card(card)
    for card in tfregDeploymentCards:
        TFReg_Deployment_Node.add_a_card(card)
    for card in tfregCards:
        self._prediction_narrative.add_a_card(card)

    self._result_setter.set_model_summary({
        "Neural Network (TensorFlow)":
        json.loads(
            CommonUtils.convert_python_object_to_json(self._model_summary))
    })
    self._result_setter.set_tfreg_regression_model_summart(modelSummaryJson)
    self._result_setter.set_tfreg_cards(tfregCards)
    self._result_setter.set_tfreg_nodes([
        TFReg_Overview_Node, TFReg_Performance_Node, TFReg_Deployment_Node
    ])
    self._result_setter.set_tfreg_fail_card({
        "Algorithm_Name": "Neural Network (TensorFlow)",
        "Success": "True"
    })
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context,
        self._scriptWeightDict,
        self._scriptStages,
        self._slug,
        "completion",
        "info",
        display=True,
        emptyBin=False,
        customMsg=None,
        weightKey="total")
def __init__(self,
             df_anova_result,
             df_helper,
             df_context,
             result_setter,
             story_narrative,
             scriptWeight=None,
             analysisName=None):
    """Store the collaborators, then generate and publish ANOVA narratives.

    Args:
        df_anova_result: ANOVA result object kept for narrative generation.
        df_helper: data-frame helper, stored as ``self._df_helper``.
        df_context: data-frame context supplying the analysis name, analysis
            dict, completion status, message URL and default script weights.
        result_setter: receives the ANOVA node when it has cards.
        story_narrative: story tree the ANOVA node is attached to.
        scriptWeight: progress-weight dict; when ``None`` the context's
            measure-analysis weights are used.
        analysisName: overrides the context's analysis name when given.

    Construction has side effects: it emits the start/end progress messages
    and calls ``self._generate_narratives()``.
    """
    self._story_narrative = story_narrative
    self._result_setter = result_setter
    self._dataframe_context = df_context
    self._df_anova_result = df_anova_result
    self._df_helper = df_helper
    self.narratives = {}
    self.narratives['variables'] = ''
    self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
    self._base_dir = "/anova/"
    self._analysisName = self._dataframe_context.get_analysis_name()
    self._analysisDict = self._dataframe_context.get_analysis_dict()
    self._completionStatus = \
        self._dataframe_context.get_completion_status()
    self._messageURL = self._dataframe_context.get_message_url()

    # An explicit analysis name / weight dict takes precedence over the
    # values held by the context.
    if analysisName is None:
        self._analysisName = self._dataframe_context.get_analysis_name()
    else:
        self._analysisName = analysisName
    if scriptWeight is None:
        self._scriptWeightDict = \
            self._dataframe_context.get_measure_analysis_weight()
    else:
        self._scriptWeightDict = scriptWeight

    self._scriptStages = {
        "anovaNarrativeStart": {
            "summary": "Started The Anova Narratives",
            "weight": 0
        },
        "anovaNarrativeEnd": {
            "summary": "Narratives For Anova Finished",
            "weight": 10
        },
    }

    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context,
        self._scriptWeightDict,
        self._scriptStages,
        self._analysisName,
        "anovaNarrativeStart",
        "info",
        display=False,
        emptyBin=False,
        customMsg=None,
        weightKey="narratives")

    self._generate_narratives()

    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context,
        self._scriptWeightDict,
        self._scriptStages,
        self._analysisName,
        "anovaNarrativeEnd",
        "info",
        display=False,
        emptyBin=False,
        customMsg=None,
        weightKey="narratives")

    # self._anovaNodes is not assigned in this method; presumably
    # _generate_narratives() creates it - verify.
    # NOTE(review): nesting reconstructed from a whitespace-collapsed source;
    # both publish calls are assumed to sit under the card-count check.
    if self._anovaNodes.get_card_count() > 0:
        self._story_narrative.add_a_node(self._anovaNodes)
        self._result_setter.set_anova_node(self._anovaNodes)
def __init__(self,
             column_name,
             df_helper,
             df_context,
             freq_dimension_stats,
             result_setter,
             story_narrative,
             scriptWeight=None,
             analysisName=None):
    """Build and publish the frequency narratives for one dimension column.

    Args:
        column_name: name of the dimension column being described.
        df_helper: data-frame helper, stored as ``self._dataframe_helper``.
        df_context: data-frame context supplying the app id, analysis name,
            completion status, message URL and default script weights.
        freq_dimension_stats: object whose ``get_frequency_dict()`` returns
            a JSON string (it is parsed with ``json.loads``).
        result_setter: receives the head node and the distribution node.
        story_narrative: story tree the summary node is attached to.
        scriptWeight: progress-weight dict; when ``None`` the context's
            dimension-analysis weights are used.
        analysisName: overrides the context's analysis name when given.

    Construction has side effects: it emits progress messages and calls
    ``self._generate_narratives()``.
    """
    self._story_narrative = story_narrative
    self._result_setter = result_setter
    self._column_name = column_name.lower()
    self._colname = column_name
    # Slicing (instead of indexing [0]) keeps an empty name from raising.
    self._capitalized_column_name = "%s%s" % (column_name[:1].upper(),
                                              column_name[1:])
    self._dimension_col_freq_dict = \
        freq_dimension_stats.get_frequency_dict()
    self.header = None
    self.subheader = None
    self.count = {}
    self.summary = []
    self.analysis = []
    self.frequency_dict = json.loads(self._dimension_col_freq_dict)

    # Template directory; app ids "1" and "2" use their own sub-directory.
    self.appid = df_context.get_app_id()
    self._base_dir = "/dimensions/"
    if self.appid is not None:
        if self.appid == "1":
            self._base_dir += "appid1/"
        elif self.appid == "2":
            self._base_dir += "appid2/"

    self._dataframe_context = df_context
    self._dataframe_helper = df_helper
    self._storyOnScoredData = \
        self._dataframe_context.get_story_on_scored_data()
    self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
    self._dimensionSummaryNode = NarrativesTree()
    self._dimensionSummaryNode.set_name("Overview")
    self._headNode = NarrativesTree()
    self._headNode.set_name("Overview")
    self._completionStatus = \
        self._dataframe_context.get_completion_status()

    # An explicit analysis name / weight dict takes precedence over the
    # values held by the context.
    if analysisName is None:
        self._analysisName = self._dataframe_context.get_analysis_name()
    else:
        self._analysisName = analysisName
    self._messageURL = self._dataframe_context.get_message_url()
    if scriptWeight is None:
        self._scriptWeightDict = \
            self._dataframe_context.get_dimension_analysis_weight()
    else:
        self._scriptWeightDict = scriptWeight

    self._scriptStages = {
        "initialization": {
            "summary": "Initialized the Frequency Narratives",
            "weight": 2
        },
        "summarygeneration": {
            "summary": "summary generation finished",
            "weight": 8
        },
        "completion": {
            "summary": "Frequency Stats Narratives done",
            "weight": 0
        },
    }

    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context,
        self._scriptWeightDict,
        self._scriptStages,
        self._analysisName,
        "initialization",
        "info",
        weightKey="narratives")

    self._generate_narratives()

    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context,
        self._scriptWeightDict,
        self._scriptStages,
        self._analysisName,
        "summarygeneration",
        "info",
        weightKey="narratives")

    self._story_narrative.add_a_node(self._dimensionSummaryNode)
    self._result_setter.set_head_node(self._headNode)
    self._result_setter.set_distribution_node(self._dimensionSummaryNode)

    # Report the dedicated "completion" stage; the previous code repeated
    # "summarygeneration" here and never used the "completion" stage.
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context,
        self._scriptWeightDict,
        self._scriptStages,
        self._analysisName,
        "completion",
        "info",
        weightKey="narratives")
print repr(e), d continue for m in all_measures: try: chisquare_result = self.test_measures(targetDimension, m) df_chisquare_result.add_chisquare_result( targetDimension, m, chisquare_result) except Exception, e: print str(e), m continue CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._analysisName, "completion", "info", display=False, weightKey="script") return df_chisquare_result @accepts(object, basestring, basestring) def test_dimension(self, targetDimension, testDimension): if not targetDimension in self._dataframe_helper.get_string_columns(): raise BIException.non_string_column(testDimension) chisquare_result = ChiSquareResult() pivot_table = self._data_frame.stat.crosstab( "{}".format(targetDimension), testDimension) # rdd = pivot_table.rdd.flatMap(lambda x: x).filter(lambda x: str(x).isdigit()).collect() rdd = list(
def test_all(self, measure_columns=None, dimension_columns=None):
    """Run one-way ANOVA for every measure against every eligible dimension.

    A dimension is eligible when its number of unique values does not exceed
    ``min(GLOBALSETTINGS.ANOVAMAXLEVEL, int(round(sqrt(row count))))`` and it
    is a column of ``self._data_frame``. For each dimension whose p-value is
    below 0.05, the rows of the level with the largest ``total`` are tested
    again against the remaining dimensions.

    Args:
        measure_columns: measures to analyse; defaults to
            ``self._measure_columns``.
        dimension_columns: candidate dimensions; defaults to
            ``self._dimension_columns``.

    Returns:
        The ``DFTwoWayAnovaResult`` holding one ``MeasureAnovaResult`` per
        measure.

    NOTE(review): nesting was reconstructed from a whitespace-collapsed
    source; the whole top-level drill-down is assumed to sit under the
    ``p_value < 0.05`` check - confirm against the original layout.
    """

    def _measure_stats(frame, measure_name):
        # Return (mean, count, total sum of squares) of one measure column.
        # sum/mean/count/pow/col here are the module-level column functions
        # used for the non-pandas frame, not the Python builtins.
        if self._pandas_flag:
            stat_rows = [[
                frame[measure_name].sum().item(),
                frame[measure_name].mean(),
                frame[measure_name].count().item()
            ]]
        else:
            stat_rows = frame.select([
                sum(measure_name).alias("total"),
                mean(measure_name).alias("average"),
                count(measure_name).alias("count")
            ]).collect()
        column_mean = stat_rows[0][1]
        column_count = stat_rows[0][2]
        if self._pandas_flag:
            column_sst = ((frame[measure_name] - column_mean)**2).sum()
        else:
            column_sst = frame.select(
                sum(pow(col(measure_name) - column_mean,
                        2))).collect()[0][0]
        return column_mean, column_count, column_sst

    def _as_one_way_result(stats):
        # Copy the dict returned by one_way_anova_test into a result object.
        one_way = OneWayAnovaResult()
        one_way.set_params(
            df_within=stats["df_within"],
            df_between=stats["df_between"],
            sum_of_squares_between=stats["ss_between"],
            sum_of_squares_within=stats["ss_within"],
            mean_sum_of_squares_between=stats["ms_between"],
            mean_sum_of_squares_within=stats["ms_within"],
            f_value=stats["f_stat"],
            p_value=stats["p_value"],
            eta_squared=stats["eta_squared"],
            f_critical=stats["f_critical"],
            total_number_of_records=stats["n_total"],
            n_groups=stats["n_groups"],
            levelDf=stats["levelDf"])
        return one_way

    measures = measure_columns
    if measure_columns is None:
        measures = self._measure_columns
    dimensions = dimension_columns
    print("===================dimensions================")
    if dimension_columns is None:
        dimensions = self._dimension_columns

    # Looked up but currently unused; a missing entry is tolerated.
    try:
        nColsToUse = self._analysisDict[
            self._analysisName]["noOfColumnsToUse"]
    except Exception:
        nColsToUse = None

    sqrt_nrows = round(self._dataframe_helper.get_num_rows()**0.5)
    acceptable_level_count = GLOBALSETTINGS.ANOVAMAXLEVEL
    print({
        "acceptable_level_count": acceptable_level_count,
        "sqrt_nrows": sqrt_nrows
    })
    # builtins.min: the bare name may be shadowed by a column function.
    max_levels = builtins.min([acceptable_level_count, int(sqrt_nrows)])

    df_anova_result = DFTwoWayAnovaResult()
    dimensions_to_test = [
        dim for dim in dimensions
        if self._dataframe_helper.get_num_unique_values(dim) <= max_levels
    ]
    print(
        "======================= dimensions_to_test ==============================="
    )
    print(dimensions_to_test)
    self._dimensions_to_test = [
        x for x in dimensions_to_test if x in self._data_frame.columns
    ]
    print("dimensions to test ", self._dimensions_to_test)

    for measure in measures:
        measureColMean, measureColCount, measureColSst = _measure_stats(
            self._data_frame, measure)
        self._anova_result = MeasureAnovaResult(
            measureColMean=measureColMean,
            measureColCount=measureColCount,
            measureColSst=measureColSst)
        print(self._dataRangeStats)

        if self._dateFormatDetected:
            grouped_data = NarrativesUtils.get_grouped_data_for_trend(
                self._data_frame, self._dataRangeStats["dataLevel"], measure,
                "measure", self._pandas_flag)
            trendData = TrendData()
            trendData.set_params(grouped_data, None,
                                 self._dataRangeStats["lastDate"],
                                 self._dataRangeStats["firstDate"],
                                 self._dataRangeStats["duration"],
                                 self._dataRangeStats["durationString"],
                                 self._dataRangeStats["dataLevel"])
            self._anova_result.set_trend_data(trendData)

        for dimension in self._dimensions_to_test:
            print("dimension--", dimension)
            anovaResult = self.one_way_anova_test(
                self._data_frame,
                measure,
                dimension,
                measureColMean=measureColMean,
                measureColCount=measureColCount,
                measureColSst=measureColSst)
            self._anova_result.set_oneWayAnovaResultDict(
                dimension, _as_one_way_result(anovaResult))

            # Top-level drill-down, only for significant dimensions.
            if anovaResult["p_value"] < 0.05:
                effect_size = anovaResult["eta_squared"]
                self._dataframe_helper.add_significant_dimension(
                    dimension, effect_size)
                topLevelAnova = TopLevelDfAnovaStats()
                levelDf = anovaResult["levelDf"]
                # idxmax() returns the index *label*, which is what .loc
                # expects; argmax() returns a position on current pandas.
                toplevelStats = levelDf.loc[levelDf["total"].idxmax()]
                print("toplevelStats", toplevelStats)
                topLevelAnova.set_top_level_stat(toplevelStats)

                if self._pandas_flag:
                    topLevelDf = self._data_frame[
                        self._data_frame[dimension].isin(
                            [toplevelStats.levels])]
                else:
                    topLevelDf = self._data_frame.where(
                        col(dimension).isin([toplevelStats.levels]))

                if self._dateFormatDetected:
                    levelPivot = NarrativesUtils.get_level_pivot(
                        self._data_frame,
                        'day',
                        measure,
                        dimension,
                        index_col=None,
                        pandas_flag=self._pandas_flag)
                    topLevelGroupedData = \
                        NarrativesUtils.get_grouped_data_for_trend(
                            topLevelDf, self._dataRangeStats["dataLevel"],
                            measure, "measure", self._pandas_flag)
                    trendData = TrendData()
                    trendData.set_grouped_data(topLevelGroupedData)
                    trendData.set_level_pivot(levelPivot)
                    topLevelAnova.set_trend_data(trendData)

                (topLevelDfMeasureColMean, topLevelDfMeasureColCount,
                 topLevelDfMeasureColSst) = _measure_stats(
                     topLevelDf, measure)

                dimensions_to_test_for_top_level = list(
                    set(self._dimensions_to_test) - {dimension})
                for dimensionlTopLevel in dimensions_to_test_for_top_level:
                    print("top level dimensions", dimensionlTopLevel)
                    topLevelDfAnovaResult = self.one_way_anova_test(
                        topLevelDf,
                        measure,
                        dimensionlTopLevel,
                        measureColMean=topLevelDfMeasureColMean,
                        measureColCount=topLevelDfMeasureColCount,
                        measureColSst=topLevelDfMeasureColSst)
                    topLevelAnova.set_top_level_anova(
                        dimensionlTopLevel,
                        _as_one_way_result(topLevelDfAnovaResult))

                self._anova_result.set_topLevelDfAnovaResult(
                    dimension, topLevelAnova)

        df_anova_result.add_measure_result(measure, self._anova_result)
        print(self._anova_result.get_dimensions_analyzed())
        # Debug output only: skip it when no dimension qualified, so an
        # empty list cannot abort the analysis with an IndexError.
        if self._dimensions_to_test:
            print(
                "checking effect size access",
                self._anova_result.get_OneWayAnovaEffectSize(
                    self._dimensions_to_test[0]))

    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context,
        self._scriptWeightDict,
        self._scriptStages,
        self._analysisName,
        "anovaEnd",
        "info",
        display=False,
        emptyBin=False,
        customMsg=None,
        weightKey="script")
    return df_anova_result
def __init__(self,
             data_frame,
             df_helper,
             df_context,
             meta_parser,
             scriptWeight=None,
             analysisName=None):
    """Prepare the ANOVA script state and publish the "anovaStart" message.

    Args:
        data_frame: data frame to analyse; it is replaced by the frame
            returned from ``NarrativesUtils.calculate_data_range_stats``
            when a date format is detected.
        df_helper: helper queried for numeric, string and timestamp columns.
        df_context: context object queried for configuration (date columns,
            uid column, analysis name, weights, message URL, ...).
        meta_parser: meta parser used to check whether the uid column is an
            ignored suggestion.
        scriptWeight: optional weight dict; when ``None`` the value of
            ``df_context.get_measure_analysis_weight()`` is used.
        analysisName: optional analysis name; when ``None`` the value of
            ``df_context.get_analysis_name()`` is used.
    """
    self._data_frame = data_frame
    self._dataframe_helper = df_helper
    self._dataframe_context = df_context
    self._pandas_flag = df_context._pandas_flag
    self._metaParser = meta_parser
    self._measure_columns = self._dataframe_helper.get_numeric_columns()
    self._dimension_columns = self._dataframe_helper.get_string_columns()
    self._timestamp_columns = self._dataframe_helper.get_timestamp_columns()

    # Explicit arguments win over the values stored in the context.
    if analysisName is None:
        self._analysisName = self._dataframe_context.get_analysis_name()
    else:
        self._analysisName = analysisName
    if scriptWeight is None:
        self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight()
    else:
        self._scriptWeightDict = scriptWeight

    print("=================dimension columns======================")
    print(self._dimension_columns)
    print("=================dimension columns======================")
    print("==================measure_columns ========================")
    print(self._measure_columns)
    print("==================measure_columns ========================")

    self._storyOnScoredData = self._dataframe_context.get_story_on_scored_data()
    self._date_columns = self._dataframe_context.get_date_columns()
    self._uid_col = self._dataframe_context.get_uid_column()
    # The uid column and the date columns are not tested as dimensions.
    if self._metaParser.check_column_isin_ignored_suggestion(self._uid_col):
        self._dimension_columns = list(
            set(self._dimension_columns) - {self._uid_col})
    if self._date_columns:
        self._dimension_columns = list(
            set(self._dimension_columns) - set(self._date_columns))
    self.top_dimension_result = {}

    # Date/trend defaults; they are overwritten below only when a date
    # format is detected.
    self._dataRangeStats = None
    self._dateFormatDetected = False
    self._trend_on_td_column = False
    self._existingDateFormat = None
    self._date_columns_suggested = None
    self._dateFormatConversionDict = NarrativesUtils.date_formats_mapping_dict()
    self._dateColumnFormatDict = df_context.get_date_format_dict()
    self._requestedDateFormat = df_context.get_requested_date_format()

    dateColCheck = None
    print(self._dateColumnFormatDict)
    self._selected_date_columns = self._dataframe_context.get_selected_date_columns()
    if self._selected_date_columns is not None:
        dateColCheck = NarrativesUtils.check_date_column_formats(
            self._selected_date_columns,
            self._timestamp_columns,
            self._dateColumnFormatDict,
            self._dateFormatConversionDict,
            self._requestedDateFormat)

    # Date-format details are only recorded when ANOVA is not running on
    # scored data.
    if not self._dataframe_context.get_anova_on_scored_data() and dateColCheck:
        self._dataframe_context.set_date_format_details(dateColCheck)
        self._dateFormatDetected = dateColCheck["dateFormatDetected"]
        self._trend_on_td_column = dateColCheck["trendOnTdCol"]
        if self._dateFormatDetected:
            self._requestedDateFormat = dateColCheck["requestedDateFormat"]
            self._existingDateFormat = dateColCheck["existingDateFormat"]
            self._date_columns_suggested = dateColCheck["suggestedDateColumn"]

    if self._dateFormatDetected:
        print("self._existingDateFormat", self._existingDateFormat)
        self._data_frame, self._dataRangeStats = NarrativesUtils.calculate_data_range_stats(
            self._data_frame,
            self._existingDateFormat,
            self._date_columns_suggested,
            self._trend_on_td_column,
            self._pandas_flag)

    self._completionStatus = self._dataframe_context.get_completion_status()
    self._analysisDict = self._dataframe_context.get_analysis_dict()
    self._messageURL = self._dataframe_context.get_message_url()

    self._scriptStages = {
        "anovaStart": {
            "summary": "Initialized The Anova Scripts",
            "weight": 0
        },
        "anovaEnd": {
            "summary": "Anova Calculated",
            "weight": 10
        },
    }
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context,
        self._scriptWeightDict,
        self._scriptStages,
        self._analysisName,
        "anovaStart",
        "info",
        display=False,
        emptyBin=False,
        customMsg=None,
        weightKey="script")
def Predict(self):
    """Score the data frame with the saved model and run follow-up analyses.

    Steps, in order:
      1. Publish the "initialization" and "predictionStart" progress messages.
      2. Load the saved pipeline and model, transform ``self._data_frame``
         and write the scored rows to ``<score path>/data.csv``.
      3. Publish "predictionFinished".
      4. Run DescriptiveStatsScript, DecisionTreeRegressionScript and
         TwoWayAnovaScript on the scored frame. Each one is best-effort:
         a failure is printed and the next analysis still runs.
    """
    self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight()
    self._scriptStages = {
        "initialization": {
            "summary": "Initialized the Generalized Linear Regression Scripts",
            "weight": 2
        },
        "predictionStart": {
            "summary": "Generalized Linear Regression Model Prediction Started",
            "weight": 2
        },
        "predictionFinished": {
            "summary": "Generalized Linear Regression Model Prediction Finished",
            "weight": 6
        }
    }
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context, self._scriptWeightDict, self._scriptStages,
        self._slug, "initialization", "info", display=True, emptyBin=False,
        customMsg=None, weightKey="total")

    # NOTE(review): this context is never referenced below; it is kept in
    # case its construction is relied on elsewhere -- confirm and remove.
    SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                        sparkSession=self._spark)

    result_column = self._dataframe_context.get_result_column()
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context, self._scriptWeightDict, self._scriptStages,
        self._slug, "predictionStart", "info", display=True, emptyBin=False,
        customMsg=None, weightKey="total")

    score_data_path = self._dataframe_context.get_score_path() + "/data.csv"
    trained_model_path = "file://" + self._dataframe_context.get_model_path()
    trained_model_path += "/model"
    # The pipeline is stored next to the model directory.
    pipeline_path = "/".join(trained_model_path.split("/")[:-1]) + "/pipeline"
    print("trained_model_path %s" % (trained_model_path,))
    print("pipeline_path %s" % (pipeline_path,))
    print("score_data_path %s" % (score_data_path,))

    pipelineModel = MLUtils.load_pipeline(pipeline_path)
    trained_model = MLUtils.load_generalized_linear_regresssion_pyspark_model(trained_model_path)
    df = self._data_frame
    indexed = pipelineModel.transform(df)
    transformed = trained_model.transform(indexed)
    # Keep an existing target column as "originalLabel" so the prediction
    # can take over the target column name.
    if result_column in transformed.columns:
        transformed = transformed.withColumnRenamed(result_column, "originalLabel")
    transformed = transformed.withColumnRenamed("prediction", result_column)
    pandas_scored_df = transformed.select(
        list(set(self._data_frame.columns + [result_column]))).toPandas()
    if score_data_path.startswith("file"):
        # Drop the leading "file://" before writing through pandas.
        score_data_path = score_data_path[7:]
    pandas_scored_df.to_csv(score_data_path, header=True, index=False)
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context, self._scriptWeightDict, self._scriptStages,
        self._slug, "predictionFinished", "info", display=True,
        emptyBin=False, customMsg=None, weightKey="total")

    print("STARTING Measure ANALYSIS ...")
    columns_to_keep = self._dataframe_context.get_score_consider_columns()
    if len(columns_to_keep) > 0:
        columns_to_drop = list(set(df.columns) - set(columns_to_keep))
    else:
        columns_to_drop = ["predicted_probability"]
    columns_to_drop = [x for x in columns_to_drop
                       if x in df.columns and x != result_column]
    # columns_to_drop is informational only; the select below is what
    # restricts the scored frame.
    print("columns_to_drop %s" % (columns_to_drop,))
    spark_scored_df = transformed.select(
        list(set(columns_to_keep + [result_column])))

    df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context,
                                self._metaParser)
    df_helper.set_params()
    df = df_helper.get_data_frame()

    try:
        fs = time.time()
        descr_stats_obj = DescriptiveStatsScript(
            df, df_helper, self._dataframe_context, self._result_setter,
            self._spark, self._prediction_narrative,
            scriptWeight=self._scriptWeightDict,
            analysisName="Descriptive analysis")
        descr_stats_obj.Run()
        print("DescriptiveStats Analysis Done in  %s  seconds." % (time.time() - fs,))
    except Exception as err:
        print("DescriptiveStats Analysis Failed  %s" % (err,))

    try:
        fs = time.time()
        df_helper.fill_na_dimension_nulls()
        df = df_helper.get_data_frame()
        dt_reg = DecisionTreeRegressionScript(
            df, df_helper, self._dataframe_context, self._result_setter,
            self._spark, self._prediction_narrative, self._metaParser,
            scriptWeight=self._scriptWeightDict,
            analysisName="Predictive modeling")
        dt_reg.Run()
        print("DecisionTrees Analysis Done in  %s  seconds." % (time.time() - fs,))
    except Exception as err:
        print("DTREE FAILED %s" % (err,))

    try:
        fs = time.time()
        two_way_obj = TwoWayAnovaScript(
            df, df_helper, self._dataframe_context, self._result_setter,
            self._spark, self._prediction_narrative, self._metaParser,
            scriptWeight=self._scriptWeightDict,
            analysisName="Measure vs. Dimension")
        two_way_obj.Run()
        print("OneWayAnova Analysis Done in  %s  seconds." % (time.time() - fs,))
    except Exception as err:
        print("Anova Analysis Failed %s" % (err,))
def _generate_narratives(self):
    """Build the chi-square main card and the per-dimension cards.

    For every target dimension in ``self._df_chisquare_result``:
      * variables with p-value <= 0.05 are treated as significant and are
        ordered by effect size, largest first;
      * the main card (heading, narrative, effect-size bar chart and the
        statistical-info block) is assembled and stored in
        ``self.narratives['main_card']``;
      * a ``ChiSquareAnalysis`` is built for each significant dimension that
        is shown and routed according to ``self._appid``.

    Finally the chi-square node is attached to the story narrative and the
    result setter.
    """
    for target_dimension in self._df_chisquare_result.keys():
        target_chisquare_result = self._df_chisquare_result[target_dimension]
        # Materialised as a list so it can be measured and handed to the
        # template on both Python 2 and Python 3.
        analysed_variables = list(target_chisquare_result.keys())
        significant_variables = [
            dim for dim in analysed_variables
            if target_chisquare_result[dim].get_pvalue() <= 0.05
        ]
        effect_sizes = [
            target_chisquare_result[dim].get_effect_size()
            for dim in significant_variables
        ]
        effect_size_dict = dict(zip(significant_variables, effect_sizes))
        # Strongest association first.
        significant_variables = [
            name for (_, name) in sorted(
                zip(effect_sizes, significant_variables), reverse=True)
        ]
        num_analysed_variables = len(analysed_variables)
        num_significant_variables = len(significant_variables)

        data_dict = {
            'num_variables': num_analysed_variables,
            'num_significant_variables': num_significant_variables,
            'significant_variables': significant_variables,
            'target': target_dimension,
            'analysed_dimensions': analysed_variables,
            'blockSplitter': self._blockSplitter
        }
        # Rendered once; used for both the paragraph and the card narrative.
        main_card_html = NarrativesUtils.get_template_output(
            self._base_dir, 'main_card.html', data_dict)

        self.narratives['main_card'] = {}
        self.narratives['main_card'][
            'heading'] = 'Relationship between ' + target_dimension + ' and other factors'
        paragraph = {'header': '', 'content': main_card_html}
        self.narratives['main_card']['paragraphs'] = [paragraph]
        self.narratives['cards'] = []

        chart = {
            'header':
            'Strength of association between ' + target_dimension +
            ' and other dimensions'
        }
        chart['data'] = effect_size_dict
        chart['label_text'] = {
            'x': 'Dimensions',
            'y': 'Effect Size (Cramers-V)'
        }

        chart_data = [{"key": k, "value": float(v)}
                      for k, v in effect_size_dict.items()]
        chartDataValues = [entry["value"] for entry in chart_data]
        chart_data = sorted(chart_data,
                            key=lambda x: x["value"],
                            reverse=True)

        chart_json = ChartJson()
        chart_json.set_data(chart_data)
        chart_json.set_chart_type("bar")
        chart_json.set_label_text({
            'x': ' ',
            'y': 'Effect Size (Cramers-V)'
        })
        chart_json.set_axis_rotation(True)
        chart_json.set_axes({"x": "key", "y": "value"})
        chart_json.set_yaxis_number_format(
            NarrativesUtils.select_y_axis_format(chartDataValues))
        self.narratives['main_card']['chart'] = chart

        main_card = NormalCard()
        header = "<h3>Strength of association between " + target_dimension + " and other dimensions</h3>"
        main_card_data = [HtmlData(data=header)]
        main_card_data += NarrativesUtils.block_splitter(
            main_card_html, self._blockSplitter)

        if len(chart_data) > 0:
            statistical_info_array = [
                ("Test Type", "Chi-Square"),
                ("Effect Size", "Cramer's V"),
                ("Max Effect Size", chart_data[0]["key"]),
                ("Min Effect Size", chart_data[-1]["key"]),
            ]
            target_name = self._dataframe_context.get_result_column()
            if len(chart_data) == 1:
                statistical_inference = (
                    "{} is the only variable that have significant association with the {} (Target) having an "
                    "Effect size of {}").format(
                        chart_data[0]["key"], target_name,
                        round(chart_data[0]["value"], 4))
            elif len(chart_data) == 2:
                statistical_inference = (
                    "There are two variables ({} and {}) that have significant association with the {} (Target) and the "
                    "Effect size ranges are {} and {} respectively").format(
                        chart_data[0]["key"], chart_data[1]["key"],
                        target_name,
                        round(chart_data[0]["value"], 4),
                        round(chart_data[1]["value"], 4))
            else:
                statistical_inference = (
                    "There are {} variables that have significant association with the {} (Target) and the "
                    "Effect size ranges from {} to {}").format(
                        len(chart_data), target_name,
                        round(chart_data[0]["value"], 4),
                        round(chart_data[-1]["value"], 4))
            if statistical_inference != "":
                statistical_info_array.append(
                    ("Inference", statistical_inference))
            statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                statistical_info_array)
        else:
            statistical_info_array = []

        main_card_data.append(
            C3ChartData(data=chart_json, info=statistical_info_array))
        main_card.set_card_data(main_card_data)
        main_card.set_card_name("Key Influencers")

        # NOTE(review): both registrations are assumed to be guarded by the
        # scored-data check -- confirm against the original indentation.
        if self._storyOnScoredData != True:
            self._chiSquareNode.add_a_card(main_card)
            self._result_setter.add_a_score_chi_card(main_card)

        print("target_dimension %s" % (target_dimension,))
        if self._appid == '2' and num_significant_variables > 5:
            significant_variables = significant_variables[:5]
        elif self._nColsToUse is not None:
            significant_variables = significant_variables[:self._nColsToUse]

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "custom",
            "info",
            display=True,
            customMsg="Analyzing key drivers",
            weightKey="narratives")

        for analysed_dimension in significant_variables[:self._noOfSigDimsToShow]:
            chisquare_result = self._df_chisquare.get_chisquare_result(
                target_dimension, analysed_dimension)
            # The analysis object is built the same way for every app id;
            # only the destination of its output differs.
            dimension_analysis = ChiSquareAnalysis(
                self._dataframe_context, self._dataframe_helper,
                chisquare_result, target_dimension, analysed_dimension,
                significant_variables, num_analysed_variables,
                self._data_frame, self._measure_columns, self._base_dir,
                None, target_chisquare_result)
            if self._appid in ('1', '2'):
                print("APPID %s is used" % (self._appid,))
                self._result_setter.add_a_score_chi_card(
                    json.loads(
                        CommonUtils.convert_python_object_to_json(
                            dimension_analysis.get_dimension_card1())))
            else:
                self.narratives['cards'].append(dimension_analysis)
                self._chiSquareNode.add_a_node(
                    dimension_analysis.get_dimension_node())

    self._story_narrative.add_a_node(self._chiSquareNode)
    self._result_setter.set_chisquare_node(self._chiSquareNode)
def Train(self):
    """Train a decision-tree regression model and publish its summary.

    The path taken depends on ``self._mlEnv``:

    * ``"spark"``: fit the feature pipeline, train (optionally with
      cross-validation), save pipeline and model, evaluate on the validation
      split and fill ``self._model_summary``.
    * ``"sklearn"``: train (optionally with grid-search hyper-parameter
      tuning or k-fold validation), persist the model, compute metrics,
      MAPE and quantile summaries, export PMML on a best-effort basis and
      push the model summary and cards to the result setter.

    NOTE(review): both branches call ``DecisionTreeRegressor``; the Spark
    branch passes pyspark keyword arguments and the sklearn branch calls it
    with no arguments, so the file needs distinct imports for the two --
    confirm which class the name resolves to.
    """
    st_global = time.time()
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context, self._scriptWeightDict, self._scriptStages,
        self._slug, "initialization", "info", display=True, emptyBin=False,
        customMsg=None, weightKey="total")

    appType = self._dataframe_context.get_app_type()
    algosToRun = self._dataframe_context.get_algorithms_to_run()
    # A list comprehension (not filter(...)[0]) so indexing also works on
    # Python 3; still raises IndexError when no setting matches the slug.
    algoSetting = [x for x in algosToRun
                   if x.get_algorithm_slug() == self._slug][0]

    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns) - set(allDateCols))
    print(categorical_columns)

    result_column = self._dataframe_context.get_result_column()
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    numerical_columns = [x for x in numerical_columns if x != result_column]

    model_path = self._dataframe_context.get_model_path()
    if model_path.startswith("file"):
        # Drop the leading "file://".
        model_path = model_path[7:]
    validationDict = self._dataframe_context.get_validation_dict()
    print("model_path %s" % (model_path,))
    pipeline_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/pipeline/"
    model_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/model"
    df = self._data_frame

    if self._mlEnv == "spark":
        pipeline = MLUtils.create_pyspark_ml_pipeline(
            numerical_columns, categorical_columns, result_column,
            algoType="regression")
        pipelineModel = pipeline.fit(df)
        indexed = pipelineModel.transform(df)
        # (index, name) pairs describing each slot of the "features" vector.
        featureMapping = sorted(
            (attr["idx"], attr["name"]) for attr in chain(
                *indexed.schema["features"].metadata["ml_attr"]["attrs"].values()))
        MLUtils.save_pipeline_or_model(pipelineModel, pipeline_filepath)

        dtreer = DecisionTreeRegressor(labelCol=result_column,
                                       featuresCol='features',
                                       predictionCol="prediction")
        if validationDict["name"] == "kFold":
            defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT["value"]
            numFold = int(validationDict["value"])
            if numFold == 0:
                numFold = 3
            trainingData, validationData = indexed.randomSplit(
                [defaultSplit, 1 - defaultSplit], seed=12345)
            # Grid over parameters a decision-tree regressor actually has.
            # NOTE(review): the candidate values are a conservative starting
            # grid -- tune as needed.
            paramGrid = ParamGridBuilder()\
                .addGrid(dtreer.maxDepth, [3, 5, 10])\
                .addGrid(dtreer.minInstancesPerNode, [1, 5])\
                .build()
            crossval = CrossValidator(
                estimator=dtreer,
                estimatorParamMaps=paramGrid,
                evaluator=RegressionEvaluator(predictionCol="prediction",
                                              labelCol=result_column),
                numFolds=numFold)
            st = time.time()
            # Fit on the training split only so the validation rows used
            # below are unseen by the model.
            cvModel = crossval.fit(trainingData)
            trainingTime = time.time() - st
            print("cvModel training takes %s" % (trainingTime,))
            bestModel = cvModel.bestModel
        elif validationDict["name"] == "trainAndtest":
            trainingData, validationData = indexed.randomSplit(
                [float(validationDict["value"]),
                 1 - float(validationDict["value"])],
                seed=12345)
            st = time.time()
            fit = dtreer.fit(trainingData)
            trainingTime = time.time() - st
            print("time to train %s" % (trainingTime,))
            bestModel = fit

        featureImportance = bestModel.featureImportances
        print("%s %s" % (featureImportance, type(featureImportance)))
        print(len(featureMapping))
        featuresArray = [(name, featureImportance[idx])
                         for idx, name in featureMapping]
        print(featuresArray)
        MLUtils.save_pipeline_or_model(bestModel, model_filepath)

        transformed = bestModel.transform(validationData)
        transformed = transformed.withColumn(
            result_column, transformed[result_column].cast(DoubleType()))
        transformed = transformed.select(
            [result_column, "prediction",
             transformed[result_column] - transformed["prediction"]])
        transformed = transformed.withColumnRenamed(
            transformed.columns[-1], "difference")
        transformed = transformed.select(
            [result_column, "prediction", "difference",
             FN.abs(transformed["difference"]) * 100 / transformed[result_column]])
        transformed = transformed.withColumnRenamed(
            transformed.columns[-1], "mape")

        # Keep roughly 100 rows as sample data.
        nrows = transformed.count()
        if nrows > 100:
            sampleData = transformed.sample(False, float(100) / nrows, seed=420)
        else:
            sampleData = transformed
        print(sampleData.show())

        evaluator = RegressionEvaluator(predictionCol="prediction",
                                        labelCol=result_column)
        metrics = {}
        metrics["r2"] = evaluator.evaluate(transformed, {evaluator.metricName: "r2"})
        metrics["rmse"] = evaluator.evaluate(transformed, {evaluator.metricName: "rmse"})
        metrics["mse"] = evaluator.evaluate(transformed, {evaluator.metricName: "mse"})
        metrics["mae"] = evaluator.evaluate(transformed, {evaluator.metricName: "mae"})
        runtime = round((time.time() - st_global), 2)

        mapeDf = transformed.select("mape")
        mapeStats = MLUtils.get_mape_stats(mapeDf, "mape")
        mapeStatsArr = sorted(mapeStats.items(), key=lambda x: int(x[0]))
        quantileDf = transformed.select("prediction")
        quantileSummaryDict = MLUtils.get_quantile_summary(quantileDf, "prediction")
        quantileSummaryArr = sorted(quantileSummaryDict.items(),
                                    key=lambda x: int(x[0]))

        # Spark models expose their parameters through a param map.
        modelParams = {param.name: value
                       for param, value in bestModel.extractParamMap().items()}

        self._model_summary.set_model_type("regression")
        self._model_summary.set_algorithm_name("dtree Regression")
        self._model_summary.set_algorithm_display_name("Decision Tree Regression")
        self._model_summary.set_slug(self._slug)
        # NOTE(review): the second call replaces the value set by the first
        # unless the setter accumulates -- confirm which value is wanted.
        self._model_summary.set_training_time(runtime)
        self._model_summary.set_training_time(trainingTime)
        self._model_summary.set_target_variable(result_column)
        self._model_summary.set_validation_method(validationDict["displayName"])
        self._model_summary.set_model_evaluation_metrics(metrics)
        self._model_summary.set_model_params(modelParams)
        self._model_summary.set_quantile_summary(quantileSummaryArr)
        self._model_summary.set_mape_stats(mapeStatsArr)
        self._model_summary.set_sample_data(sampleData.toPandas().to_dict())
        # Same (name, importance) shape as the sklearn branch.
        self._model_summary.set_feature_importance(featuresArray)

    elif self._mlEnv == "sklearn":
        model_filepath = model_path + "/" + self._slug + "/model.pkl"
        x_train, x_test, y_train, y_test = self._dataframe_helper.get_train_test_data()
        dummy_columns = [x for x in categorical_columns if x != result_column]
        x_train = MLUtils.create_dummy_columns(x_train, dummy_columns)
        x_test = MLUtils.create_dummy_columns(x_test, dummy_columns)
        x_test = MLUtils.fill_missing_columns(x_test, x_train.columns, result_column)

        st = time.time()
        est = DecisionTreeRegressor()
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context, self._scriptWeightDict,
            self._scriptStages, self._slug, "training", "info",
            display=True, emptyBin=False, customMsg=None, weightKey="total")

        hyperparameter_tuning = algoSetting.is_hyperparameter_tuning_enabled()
        if hyperparameter_tuning:
            hyperParamInitParam = algoSetting.get_hyperparameter_params()
            evaluationMetricDict = {"name": hyperParamInitParam["evaluationMetric"]}
            evaluationMetricDict["displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[evaluationMetricDict["name"]]
            hyperParamAlgoName = algoSetting.get_hyperparameter_algo_name()
            params_grid = algoSetting.get_params_dict_hyperparameter()
            # Only search over parameters the estimator understands.
            params_grid = {k: v for k, v in params_grid.items()
                           if k in est.get_params()}
            print(params_grid)
            if hyperParamAlgoName == "gridsearchcv":
                estGrid = GridSearchCV(est, params_grid)
                gridParams = estGrid.get_params()
                hyperParamInitParam = {k: v for k, v in hyperParamInitParam.items()
                                       if k in gridParams}
                estGrid.set_params(**hyperParamInitParam)
                estGrid.fit(x_train, y_train)
                bestEstimator = estGrid.best_estimator_
                modelFilepath = "/".join(model_filepath.split("/")[:-1])
                sklearnHyperParameterResultObj = SklearnGridSearchResult(
                    estGrid.cv_results_, est, x_train, x_test, y_train,
                    y_test, appType, modelFilepath,
                    evaluationMetricDict=evaluationMetricDict)
                resultArray = sklearnHyperParameterResultObj.train_and_save_models()
                self._result_setter.set_hyper_parameter_results(self._slug, resultArray)
                self._result_setter.set_metadata_parallel_coordinates(
                    self._slug,
                    {"ignoreList": sklearnHyperParameterResultObj.get_ignore_list(),
                     "hideColumns": sklearnHyperParameterResultObj.get_hide_columns(),
                     "metricColName": sklearnHyperParameterResultObj.get_comparison_metric_colname(),
                     "columnOrder": sklearnHyperParameterResultObj.get_keep_columns()})
            elif hyperParamAlgoName == "randomsearchcv":
                estRand = RandomizedSearchCV(est, params_grid)
                estRand.set_params(**hyperParamInitParam)
                # NOTE(review): the randomized search is configured but never
                # fitted, so the predict() call below fails on None -- this
                # path looks unfinished.
                bestEstimator = None
        else:
            evaluationMetricDict = {"name": GLOBALSETTINGS.REGRESSION_MODEL_EVALUATION_METRIC}
            evaluationMetricDict["displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[evaluationMetricDict["name"]]
            algoParams = algoSetting.get_params_dict()
            algoParams = {k: v for k, v in algoParams.items()
                          if k in est.get_params().keys()}
            est.set_params(**algoParams)
            self._result_setter.set_hyper_parameter_results(self._slug, None)
            if validationDict["name"] == "kFold":
                numFold = int(validationDict["value"])
                if numFold == 0:
                    numFold = 3
                kFoldClass = SkleanrKFoldResult(
                    numFold, est, x_train, x_test, y_train, y_test, appType,
                    evaluationMetricDict=evaluationMetricDict)
                kFoldClass.train_and_save_result()
                bestEstimator = kFoldClass.get_best_estimator()
            elif validationDict["name"] == "trainAndtest":
                est.fit(x_train, y_train)
                bestEstimator = est

        trainingTime = time.time() - st
        y_score = bestEstimator.predict(x_test)
        featureImportance = bestEstimator.feature_importances_
        featuresArray = [(col_name, featureImportance[idx])
                         for idx, col_name in enumerate(x_train.columns)]

        if not hyperparameter_tuning:
            # Single-model run: persist it under the first model name,
            # next to model.pkl.
            modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1"
            modelFilepathArr = model_filepath.split("/")[:-1]
            modelFilepathArr.append(modelName + ".pkl")
            joblib.dump(bestEstimator, "/".join(modelFilepathArr))

        metrics = {}
        metrics["r2"] = r2_score(y_test, y_score)
        metrics["mse"] = mean_squared_error(y_test, y_score)
        metrics["mae"] = mean_absolute_error(y_test, y_score)
        metrics["rmse"] = sqrt(metrics["mse"])

        transformed = pd.DataFrame({"prediction": y_score, result_column: y_test})
        transformed["difference"] = transformed[result_column] - transformed["prediction"]
        transformed["mape"] = np.abs(transformed["difference"]) * 100 / transformed[result_column]

        nrows = transformed.shape[0]
        if nrows > 100:
            sampleData = transformed.sample(n=100, random_state=420)
        else:
            sampleData = transformed
        print(sampleData.head())

        # MAPE histogram over the configured bins, ordered by lower bound.
        mapeCountArr = pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS).value_counts().to_dict().items()
        mapeBuckets = sorted(
            [{"count": x[1], "splitRange": (x[0].left, x[0].right)}
             for x in mapeCountArr],
            key=lambda x: x["splitRange"][0])
        mapeStatsArr = [(str(idx), dictObj)
                        for idx, dictObj in enumerate(mapeBuckets)]

        # Quantile summary of the predictions; duplicate edges are removed
        # because the bin edges must be distinct.
        predictionColSummary = transformed["prediction"].describe().to_dict()
        quantileBins = [predictionColSummary["min"],
                        predictionColSummary["25%"],
                        predictionColSummary["50%"],
                        predictionColSummary["75%"],
                        predictionColSummary["max"]]
        print(quantileBins)
        quantileBins = sorted(list(set(quantileBins)))
        transformed["quantileBinId"] = pd.cut(transformed["prediction"], quantileBins)
        quantileDf = transformed.groupby("quantileBinId").agg(
            {"prediction": [np.sum, np.mean, np.size]}).reset_index()
        # After this rename the "prediction" column holds the bin interval.
        quantileDf.columns = ["prediction", "sum", "mean", "count"]
        print(quantileDf)
        quantileArr = quantileDf.T.to_dict().items()
        quantileSummaryArr = [
            (obj[0], {"splitRange": (obj[1]["prediction"].left,
                                     obj[1]["prediction"].right),
                      "count": obj[1]["count"],
                      "mean": obj[1]["mean"],
                      "sum": obj[1]["sum"]})
            for obj in quantileArr]
        print(quantileSummaryArr)
        runtime = round((time.time() - st_global), 2)

        self._model_summary.set_model_type("regression")
        self._model_summary.set_algorithm_name("DTREE Regression")
        self._model_summary.set_algorithm_display_name("Decision Tree Regression")
        self._model_summary.set_slug(self._slug)
        # NOTE(review): the second call replaces the value set by the first
        # unless the setter accumulates -- confirm which value is wanted.
        self._model_summary.set_training_time(runtime)
        self._model_summary.set_training_time(trainingTime)
        self._model_summary.set_target_variable(result_column)
        self._model_summary.set_validation_method(validationDict["displayName"])
        self._model_summary.set_model_evaluation_metrics(metrics)
        self._model_summary.set_model_params(bestEstimator.get_params())
        self._model_summary.set_quantile_summary(quantileSummaryArr)
        self._model_summary.set_mape_stats(mapeStatsArr)
        self._model_summary.set_sample_data(sampleData.to_dict())
        self._model_summary.set_feature_importance(featuresArray)
        self._model_summary.set_feature_list(list(x_train.columns))

        # PMML export is best-effort: a failure is reported but must not
        # abort training.
        try:
            pmml_filepath = str(model_path) + "/" + str(self._slug) + "/traindeModel.pmml"
            modelPmmlPipeline = PMMLPipeline([
                ("pretrained-estimator", bestEstimator)
            ])
            modelPmmlPipeline.target_field = result_column
            modelPmmlPipeline.active_fields = np.array(
                [col for col in x_train.columns if col != result_column])
            sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True)
            with open(pmml_filepath, "r") as pmmlfile:
                pmmlText = pmmlfile.read()
            self._result_setter.update_pmml_object({self._slug: pmmlText})
        except Exception as err:
            print("PMML export failed %s" % (err,))

        if not hyperparameter_tuning:
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": self._model_summary.get_model_accuracy(),
                "evaluationMetricName": "r2",
                "slug": self._model_summary.get_slug(),
                "Model Id": modelName
            }
        else:
            # With tuning enabled, the first tuned result supplies the
            # dropdown entry.
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": resultArray[0]["R-Squared"],
                "evaluationMetricName": "r2",
                "slug": self._model_summary.get_slug(),
                "Model Id": resultArray[0]["Model Id"]
            }
        modelSummaryJson = {
            "dropdown": modelDropDownObj,
            "levelcount": self._model_summary.get_level_counts(),
            "modelFeatureList": self._model_summary.get_feature_list(),
            "levelMapping": self._model_summary.get_level_map_dict(),
            "slug": self._model_summary.get_slug(),
            "name": self._model_summary.get_algorithm_name()
        }

        dtreerCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_summary_cards(self._model_summary)
        ]
        for card in dtreerCards:
            self._prediction_narrative.add_a_card(card)
        self._result_setter.set_model_summary(
            {"dtreeregression": json.loads(
                CommonUtils.convert_python_object_to_json(self._model_summary))})
        self._result_setter.set_dtree_regression_model_summart(modelSummaryJson)
        self._result_setter.set_dtreer_cards(dtreerCards)
        # NOTE(review): kept with the sklearn path because everything just
        # above depends on sklearn-only names -- confirm whether the
        # completion message should also be sent for the Spark path.
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context, self._scriptWeightDict,
            self._scriptStages, self._slug, "completion", "info",
            display=True, emptyBin=False, customMsg=None, weightKey="total")
def __init__(self, column_name, decision_tree_rules, df_helper, df_context,
             result_setter, story_narrative, scriptWeight=None,
             analysisName=None):
    """Build the decision-tree narrative node and register it with the story.

    Construction does all of the work: it copies the rule data out of
    ``decision_tree_rules``, reports the "dtreeNarrativeStart" stage, calls
    ``self._generate_narratives()``, attaches the resulting node to both
    ``story_narrative`` and ``result_setter``, and finally reports the
    "dtreeNarrativeEnd" stage.

    Args:
        column_name: Name of the column the narrative is about. It is kept
            as given, lower-cased, and with its first character upper-cased.
        decision_tree_rules: Object queried through ``get_decision_rules()``,
            ``get_table()``, ``get_success()``, ``get_total()``,
            ``get_success_percent()``, ``get_significant_vars()`` and
            ``get_target_contributions()``.
        df_helper: Stored on the instance as ``self._df_helper``.
        df_context: Dataframe context; supplies the completion status, the
            message URL and the defaults for the two optional arguments.
        result_setter: Receives the node via ``set_decision_tree_node``.
        story_narrative: Receives the node via ``add_a_node``.
        scriptWeight: Optional weight dictionary used for progress messages.
            When ``None`` it is read from
            ``df_context.get_measure_analysis_weight()``.
        analysisName: Optional analysis name used for progress messages.
            When ``None`` it is read from ``df_context.get_analysis_name()``.
    """
    self._story_narrative = story_narrative
    self._result_setter = result_setter
    self._dataframe_context = df_context

    # Three spellings of the column name are kept on the instance.
    self._column_name = column_name.lower()
    self._colname = column_name
    # Slice rather than index so an empty name yields "" instead of raising
    # IndexError; non-empty names produce exactly the same string as before.
    self._capitalized_column_name = "%s%s" % (column_name[:1].upper(),
                                              column_name[1:])

    # Snapshot everything needed from the rules object.
    self._decision_rules_dict = decision_tree_rules.get_decision_rules()
    self._table = decision_tree_rules.get_table()
    self.new_table = {}
    self.successful_predictions = decision_tree_rules.get_success()
    self.total_predictions = decision_tree_rules.get_total()
    self.success_percent = decision_tree_rules.get_success_percent()
    self._important_vars = decision_tree_rules.get_significant_vars()
    self._target_distribution = decision_tree_rules.get_target_contributions()
    # Called only after the attributes above are in place, as in the
    # original statement order.
    self._get_new_table()

    self._df_helper = df_helper
    self.subheader = None
    self.dropdownComment = None
    self.dropdownValues = None
    self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
    self._base_dir = "/decisiontree/"
    self._decisionTreeNode = NarrativesTree(name='Prediction')

    self._completionStatus = self._dataframe_context.get_completion_status()
    self._messageURL = self._dataframe_context.get_message_url()

    # Identity comparison is the correct test for "argument not supplied".
    if analysisName is None:
        self._analysisName = self._dataframe_context.get_analysis_name()
    else:
        self._analysisName = analysisName
    if scriptWeight is None:
        self._scriptWeightDict = \
            self._dataframe_context.get_measure_analysis_weight()
    else:
        self._scriptWeightDict = scriptWeight

    self._scriptStages = {
        "dtreeNarrativeStart": {
            "summary": "Started the Decision Tree Regression Narratives",
            "weight": 0
        },
        "dtreeNarrativeEnd": {
            "summary": "Narratives for Decision Tree Regression Finished",
            "weight": 10
        },
    }

    # Report the start stage before doing the narrative work.
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context,
        self._scriptWeightDict,
        self._scriptStages,
        self._analysisName,
        "dtreeNarrativeStart",
        "info",
        weightKey="narratives")

    self._generate_narratives()
    self._story_narrative.add_a_node(self._decisionTreeNode)
    self._result_setter.set_decision_tree_node(self._decisionTreeNode)

    # Report the end stage once the node has been registered.
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context,
        self._scriptWeightDict,
        self._scriptStages,
        self._analysisName,
        "dtreeNarrativeEnd",
        "info",
        weightKey="narratives")