def test_all(self, measure_columns=None, dimension_columns=None):
        freq_dimension_result = FreqDimensionResult()
        dimension = dimension_columns[0]
        frequency_dict = {}
        grouped_dataframe = self._data_frame.groupby(
            dimension).count().toPandas()
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "groupby",
            "info",
            weightKey="script")

        frequency_dict[dimension] = grouped_dataframe.to_dict()
        grouped_dataframe = grouped_dataframe.dropna()
        frequency_dict = json.dumps(frequency_dict)
        freq_dimension_result.set_params(frequency_dict)
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "completion",
            "info",
            weightKey="script")
        return freq_dimension_result
Esempio n. 2
0
    def stats_for_measure_column(self, measure_column):
        if not self._dataframe_helper.is_numeric_column(measure_column):
            raise BIException.non_numeric_column(measure_column)

        descr_stats = MeasureDescriptiveStats()
        num_values = self._data_frame.select(measure_column).count()
        min_value = Stats.min(self._data_frame, measure_column)
        max_value = Stats.max(self._data_frame, measure_column)
        total_value = Stats.total(self._data_frame, measure_column)
        mean = Stats.mean(self._data_frame, measure_column)
        variance = Stats.variance(self._data_frame, measure_column)
        std_dev = Stats.std_dev(self._data_frame, measure_column)

        if min_value == max_value:
            skewness = 0
            kurtosis = 0
        else:
            skewness = Stats.skew(self._data_frame, measure_column)
            kurtosis = Stats.kurtosis(self._data_frame, measure_column)

        descr_stats.set_summary_stats(num_values=num_values,
                                      min_value=min_value,
                                      max_value=max_value,
                                      total=total_value,
                                      mean=mean,
                                      variance=variance,
                                      std_dev=std_dev,
                                      skew=skewness,
                                      kurtosis=kurtosis)
        descr_stats.set_five_point_summary_stats(
            self.five_point_summary(measure_column))

        descr_stats.set_histogram(
            Binner(self._data_frame,
                   self._dataframe_helper).get_bins(measure_column))

        #descr_stats.set_raw_data([float(row[0]) for row in self._data_frame.select(measure_column).collect()])
        # self._completionStatus += self._scriptWeightDict[self._analysisName]["script"]
        # progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
        #                             "statCalculationEnd",\
        #                             "info",\
        #                             self._scriptStages["statCalculationEnd"]["summary"],\
        #                             self._completionStatus,\
        #                             self._completionStatus)
        # CommonUtils.save_progress_message(self._messageURL,progressMessage)
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "statCalculationEnd",
            "info",
            display=False,
            emptyBin=False,
            customMsg=None,
            weightKey="script")
        # self._dataframe_context.update_completion_status(self._completionStatus)
        return descr_stats
Esempio n. 3
0
    def test_all(self,
                 measure_columns=None,
                 dimension_columns=None,
                 max_num_levels=40):
        # CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._analysisName,"chisquareStats","info",display=False,weightKey="script")
        targetDimension = dimension_columns[0]
        all_dimensions = self._dimension_columns
        all_dimensions = [x for x in all_dimensions if x != targetDimension]
        # if self._analysisDict != {}:
        #     nColsToUse = self._analysisDict[self._analysisName]["noOfColumnsToUse"]
        # else:
        #     nColsToUse = None
        # if nColsToUse != None:
        #     all_dimensions = all_dimensions[:nColsToUse]
        all_measures = self._measure_columns
        df_chisquare_result = DFChiSquareResult()
        # print "df_chisquare_result"*50
        # print df_chisquare_result
        for d in all_dimensions:
            try:
                chisquare_result = self.test_dimension(targetDimension, d)
                df_chisquare_result.add_chisquare_result(
                    targetDimension, d, chisquare_result)
            except Exception as e:
                print(repr(e), d)
                continue
        for m in all_measures:
            try:
                if self._pandas_flag:
                    if len(self._data_frame[m].unique()
                           ) > self._analysisDict['Dimension vs. Dimension'][
                               'binSetting']['binCardinality']:
                        chisquare_result = self.test_measures(
                            targetDimension, m)
                        df_chisquare_result.add_chisquare_result(
                            targetDimension, m, chisquare_result)
                else:
                    if self._data_frame.select(F.countDistinct(m)).collect(
                    )[0][0] > self._analysisDict['Dimension vs. Dimension'][
                            'binSetting']['binCardinality']:
                        chisquare_result = self.test_measures(
                            targetDimension, m)
                        df_chisquare_result.add_chisquare_result(
                            targetDimension, m, chisquare_result)
            except Exception as e:
                print(str(e), m)
                continue

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "completion",
            "info",
            display=False,
            weightKey="script")
        return df_chisquare_result
Esempio n. 4
0
    def __init__(self, data_frame, df_context, df_helper, spark, meta_parser,scriptWeight=None, analysisName=None):
        self._spark = spark
        self._metaParser = meta_parser
        self._data_frame = data_frame
        self._data_frame1 = data_frame
        self._dataframe_helper = df_helper
        self._dataframe_context = df_context
        self._measure_columns = self._dataframe_helper.get_numeric_columns()
        self._dimension_columns = self._dataframe_helper.get_string_columns()
        self._date_columns = self._dataframe_context.get_date_columns()
        self._uid_col = self._dataframe_context.get_uid_column()
        if self._metaParser.check_column_isin_ignored_suggestion(self._uid_col):
            self._dimension_columns = list(set(self._dimension_columns) - {self._uid_col})
        if len(self._date_columns) >0 :
            self._dimension_columns = list(set(self._dimension_columns)-set(self._date_columns))
        self._mapping_dict = {}
        self._new_rules = {}
        self._total = {}
        self._success = {}
        self._probability = {}
        self._alias_dict = {}
        self._important_vars = {}
        self._numCluster = None

        self._data_frame = self._dataframe_helper.fill_missing_values(self._data_frame)
        self._data_frame1 = self._dataframe_helper.fill_missing_values(self._data_frame1)

        self._completionStatus = self._dataframe_context.get_completion_status()
        self._messageURL = self._dataframe_context.get_message_url()
        if analysisName == None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName
        if scriptWeight == None:
            self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight()
        else:
            self._scriptWeightDict = scriptWeight
        self._scriptStages = {
            "dtreeTrainingStart":{
                "summary":"Started the Decision Tree Regression Script",
                "weight":0
                },
            "dtreeTrainingEnd":{
                "summary":"Decision Tree Regression Learning Finished",
                "weight":10
                },
            }

        CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._analysisName,"dtreeTrainingStart","info",weightKey="script")
Esempio n. 5
0
    def test_all(self, measure_columns=None, dimension_columns=None):
        if dimension_columns is None:
            dimensions = self._dimension_columns
        self._target_dimension = measure_columns[0]
        dimension = self._target_dimension
        max_num_levels = GLOBALSETTINGS.DTREE_TARGET_DIMENSION_MAX_LEVEL
        max_num_levels = min(max_num_levels, round(self._dataframe_helper.get_num_rows()**0.5))
        # all_dimensions = [dim for dim in self._dimension_columns if self._dataframe_helper.get_num_unique_values(dim) <= max_num_levels]
        all_dimensions = [dim for dim in self._dimension_columns if self._metaParser.get_num_unique_values(dim) <= max_num_levels]
        all_measures = [x for x in self._measure_columns if x!=self._target_dimension]
        self.transform_data_frames()
        decision_tree_result = DecisionTreeResult()
        cat_feature_info = [len(self._mapping_dict[c]) for c in all_dimensions]
        if len(cat_feature_info)>0:
            max_length = max(cat_feature_info)
        else:
            max_length=32
        cat_feature_info = dict(enumerate(cat_feature_info))
        # print cat_feature_info
        dimension_classes = self._data_frame.select(dimension).distinct().count()
        self._data_frame = self._data_frame[[dimension] + all_dimensions + all_measures]
        data = self._data_frame.rdd.map(lambda x: LabeledPoint(x[0], x[1:]))
        (trainingData, testData) = data.randomSplit([1.0, 0.0])
        # TO DO : set maxBins at least equal to the max level of categories in dimension column
        model = DecisionTree.trainClassifier(trainingData, numClasses=dimension_classes, categoricalFeaturesInfo=cat_feature_info, impurity='gini', maxDepth=3, maxBins=max_length)
        output_result = model.toDebugString()
        print "output_result",output_result
        decision_tree = self.tree_json(output_result, self._data_frame)
        self._new_tree = self.generate_new_tree(decision_tree)
        self._new_tree = self.wrap_tree(self._new_tree)
        # self._new_tree = utils.recursiveRemoveNullNodes(self._new_tree)
        # decision_tree_result.set_params(self._new_tree, self._new_rules, self._total, self._success, self._probability)
        decision_tree_result.set_params(self._new_tree, self._new_rules, self._total, self._success, self._probability)
        decision_tree_result.set_target_map(self._mapping_dict[self._target_dimension], self._aggr_data, self._important_vars)

        # self._completionStatus += self._scriptWeightDict[self._analysisName]["script"]*self._scriptStages["dtreeTrainingStart"]["weight"]/10
        # progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
        #                             "dtreeTrainingEnd",\
        #                             "info",\
        #                             self._scriptStages["dtreeTrainingEnd"]["summary"],\
        #                             self._completionStatus,\
        #                             self._completionStatus)
        # CommonUtils.save_progress_message(self._messageURL,progressMessage)
        # self._dataframe_context.update_completion_status(self._completionStatus)
        CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._analysisName,"dtreeTrainingEnd","info",weightKey="script")

        # print decision_tree_result
        return decision_tree_result
    def __init__(self,
                 data_frame,
                 df_helper,
                 df_context,
                 scriptWeight=None,
                 analysisName=None):
        self._data_frame = data_frame
        self._dataframe_helper = df_helper
        self._dataframe_context = df_context

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        if analysisName == None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName
        self._messageURL = self._dataframe_context.get_message_url()
        if scriptWeight == None:
            self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight(
            )
        else:
            self._scriptWeightDict = scriptWeight

        self._scriptStages = {
            "freqinitialization": {
                "summary": "Initialized the Frequency Scripts",
                "weight": 4
            },
            "groupby": {
                "summary": "running groupby operations",
                "weight": 6
            },
            "completion": {
                "summary": "Frequency Stats Calculated",
                "weight": 0
            },
        }
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "freqinitialization",
            "info",
            weightKey="script")
Esempio n. 7
0
    def __init__(self,
                 data_frame,
                 df_helper,
                 df_context,
                 meta_parser,
                 scriptWeight=None,
                 analysisName=None):
        self._data_frame = data_frame
        self._dataframe_helper = df_helper
        self._dataframe_context = df_context
        self._pandas_flag = df_context._pandas_flag
        self._metaParser = meta_parser
        self._measure_columns = self._dataframe_helper.get_numeric_columns()
        self._dimension_columns = self._dataframe_helper.get_string_columns()
        self._timestamp_columns = self._dataframe_helper.get_timestamp_columns(
        )
        self._date_columns = self._dataframe_context.get_date_columns()
        self._uid_col = self._dataframe_context.get_uid_column()
        if self._metaParser.check_column_isin_ignored_suggestion(
                self._uid_col):
            self._dimension_columns = list(
                set(self._dimension_columns) - {self._uid_col})
        if len(self._date_columns) > 0:
            self._dimension_columns = list(
                set(self._dimension_columns) - set(self._date_columns))

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        if analysisName == None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName
        self._analysisDict = self._dataframe_context.get_analysis_dict()
        self._messageURL = self._dataframe_context.get_message_url()
        if scriptWeight == None:
            self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight(
            )
        else:
            self._scriptWeightDict = scriptWeight
        self._scriptStages = {
            "initialization": {
                "summary": "Initialized the Chisquare Scripts",
                "weight": 1
            },
            "chisquareStats": {
                "summary": "Running Chisquare For Relevant Dimension Columns",
                "weight": 2
            },
            "completion": {
                "summary": "Chisquare Stats Calculated",
                "weight": 2
            },
        }

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "initialization",
            "info",
            display=False,
            weightKey="script")
    def Train(self):
        st_global = time.time()

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "initialization",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")

        algosToRun = self._dataframe_context.get_algorithms_to_run()
        algoSetting = [
            x for x in algosToRun if x.get_algorithm_slug() == self._slug
        ][0]
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()

        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})

        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns) - set(allDateCols))
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        result_column = self._dataframe_context.get_result_column()
        categorical_columns = [
            x for x in categorical_columns if x != result_column
        ]

        appType = self._dataframe_context.get_app_type()

        model_path = self._dataframe_context.get_model_path()
        if model_path.startswith("file"):
            model_path = model_path[7:]
        validationDict = self._dataframe_context.get_validation_dict()
        print("model_path", model_path)
        pipeline_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/pipeline/"
        model_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/model"
        pmml_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/modelPmml"

        df = self._data_frame
        levels = df.select(result_column).distinct().count()

        appType = self._dataframe_context.get_app_type()

        model_filepath = model_path + "/" + self._slug + "/model"
        pmml_filepath = str(model_path) + "/" + str(
            self._slug) + "/traindeModel.pmml"

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "training",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")

        st = time.time()
        pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns,
                                                      categorical_columns,
                                                      result_column)

        trainingData, validationData = MLUtils.get_training_and_validation_data(
            df, result_column, 0.8)  # indexed

        labelIndexer = StringIndexer(inputCol=result_column, outputCol="label")
        # OriginalTargetconverter = IndexToString(inputCol="label", outputCol="originalTargetColumn")

        # Label Mapping and Inverse
        labelIdx = labelIndexer.fit(trainingData)
        labelMapping = {k: v for k, v in enumerate(labelIdx.labels)}
        inverseLabelMapping = {
            v: float(k)
            for k, v in enumerate(labelIdx.labels)
        }
        if self._dataframe_context.get_trainerMode() == "autoML":
            automl_enable = True
        else:
            automl_enable = False
        clf = NaiveBayes()
        if not algoSetting.is_hyperparameter_tuning_enabled():
            algoParams = algoSetting.get_params_dict()
        else:
            algoParams = algoSetting.get_params_dict_hyperparameter()
        print("=" * 100)
        print(algoParams)
        print("=" * 100)
        clfParams = [prm.name for prm in clf.params]
        algoParams = {
            getattr(clf, k): v if isinstance(v, list) else [v]
            for k, v in algoParams.items() if k in clfParams
        }
        #print("="*100)
        #print("ALGOPARAMS - ",algoParams)
        #print("="*100)

        paramGrid = ParamGridBuilder()
        # if not algoSetting.is_hyperparameter_tuning_enabled():
        #     for k,v in algoParams.items():
        #         if v == [None] * len(v):
        #             continue
        #         if k.name == 'thresholds':
        #             paramGrid = paramGrid.addGrid(k,v[0])
        #         else:
        #             paramGrid = paramGrid.addGrid(k,v)
        #     paramGrid = paramGrid.build()

        # if not algoSetting.is_hyperparameter_tuning_enabled():
        for k, v in algoParams.items():
            print(k, v)
            if v == [None] * len(v):
                continue
            paramGrid = paramGrid.addGrid(k, v)
        paramGrid = paramGrid.build()
        # else:
        #     for k,v in algoParams.items():
        #         print k.name, v
        #         if v[0] == [None] * len(v[0]):
        #             continue
        #         paramGrid = paramGrid.addGrid(k,v[0])
        #     paramGrid = paramGrid.build()

        #print("="*143)
        #print("PARAMGRID - ", paramGrid)
        #print("="*143)

        if len(paramGrid) > 1:
            hyperParamInitParam = algoSetting.get_hyperparameter_params()
            evaluationMetricDict = {
                "name": hyperParamInitParam["evaluationMetric"]
            }
            evaluationMetricDict[
                "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[
                    evaluationMetricDict["name"]]
        else:
            evaluationMetricDict = {
                "name": GLOBALSETTINGS.CLASSIFICATION_MODEL_EVALUATION_METRIC
            }
            evaluationMetricDict[
                "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[
                    evaluationMetricDict["name"]]

        self._result_setter.set_hyper_parameter_results(self._slug, None)

        if validationDict["name"] == "kFold":
            numFold = int(validationDict["value"])
            estimator = Pipeline(stages=[pipeline, labelIndexer, clf])
            if algoSetting.is_hyperparameter_tuning_enabled():
                modelFilepath = "/".join(model_filepath.split("/")[:-1])
                pySparkHyperParameterResultObj = PySparkGridSearchResult(
                    estimator, paramGrid, appType, modelFilepath, levels,
                    evaluationMetricDict, trainingData, validationData,
                    numFold, self._targetLevel, labelMapping,
                    inverseLabelMapping, df)
                resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models(
                )
                self._result_setter.set_hyper_parameter_results(
                    self._slug, resultArray)
                self._result_setter.set_metadata_parallel_coordinates(
                    self._slug, {
                        "ignoreList":
                        pySparkHyperParameterResultObj.get_ignore_list(),
                        "hideColumns":
                        pySparkHyperParameterResultObj.get_hide_columns(),
                        "metricColName":
                        pySparkHyperParameterResultObj.
                        get_comparison_metric_colname(),
                        "columnOrder":
                        pySparkHyperParameterResultObj.get_keep_columns()
                    })

                bestModel = pySparkHyperParameterResultObj.getBestModel()
                prediction = pySparkHyperParameterResultObj.getBestPrediction()

            else:
                if automl_enable:
                    paramGrid = (ParamGridBuilder().addGrid(
                        clf.smoothing, [1.0, 0.2]).build())
                crossval = CrossValidator(
                    estimator=estimator,
                    estimatorParamMaps=paramGrid,
                    evaluator=BinaryClassificationEvaluator()
                    if levels == 2 else MulticlassClassificationEvaluator(),
                    numFolds=3 if numFold is None else
                    numFold)  # use 3+ folds in practice
                cvnb = crossval.fit(trainingData)
                prediction = cvnb.transform(validationData)
                bestModel = cvnb.bestModel

        else:
            train_test_ratio = float(
                self._dataframe_context.get_train_test_split())
            estimator = Pipeline(stages=[pipeline, labelIndexer, clf])
            if algoSetting.is_hyperparameter_tuning_enabled():
                modelFilepath = "/".join(model_filepath.split("/")[:-1])
                pySparkHyperParameterResultObj = PySparkTrainTestResult(
                    estimator, paramGrid, appType, modelFilepath, levels,
                    evaluationMetricDict, trainingData, validationData,
                    train_test_ratio, self._targetLevel, labelMapping,
                    inverseLabelMapping, df)
                resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models(
                )
                self._result_setter.set_hyper_parameter_results(
                    self._slug, resultArray)
                self._result_setter.set_metadata_parallel_coordinates(
                    self._slug, {
                        "ignoreList":
                        pySparkHyperParameterResultObj.get_ignore_list(),
                        "hideColumns":
                        pySparkHyperParameterResultObj.get_hide_columns(),
                        "metricColName":
                        pySparkHyperParameterResultObj.
                        get_comparison_metric_colname(),
                        "columnOrder":
                        pySparkHyperParameterResultObj.get_keep_columns()
                    })

                bestModel = pySparkHyperParameterResultObj.getBestModel()
                prediction = pySparkHyperParameterResultObj.getBestPrediction()

            else:
                tvs = TrainValidationSplit(
                    estimator=estimator,
                    estimatorParamMaps=paramGrid,
                    evaluator=BinaryClassificationEvaluator()
                    if levels == 2 else MulticlassClassificationEvaluator(),
                    trainRatio=train_test_ratio)

                tvspnb = tvs.fit(trainingData)
                prediction = tvspnb.transform(validationData)
                bestModel = tvspnb.bestModel

        modelmanagement_ = {
            param[0].name: param[1]
            for param in bestModel.stages[2].extractParamMap().items()
        }

        MLUtils.save_pipeline_or_model(bestModel, model_filepath)
        predsAndLabels = prediction.select(['prediction',
                                            'label']).rdd.map(tuple)
        # label_classes = prediction.select("label").distinct().collect()
        # label_classes = prediction.agg((F.collect_set('label').alias('label'))).first().asDict()['label']
        #results = transformed.select(["prediction","label"])
        # if len(label_classes) > 2:
        #     metrics = MulticlassMetrics(predsAndLabels) # accuracy of the model
        # else:
        #     metrics = BinaryClassificationMetrics(predsAndLabels)
        posLabel = inverseLabelMapping[self._targetLevel]
        metrics = MulticlassMetrics(predsAndLabels)

        trainingTime = time.time() - st

        f1_score = metrics.fMeasure(inverseLabelMapping[self._targetLevel],
                                    1.0)
        precision = metrics.precision(inverseLabelMapping[self._targetLevel])
        recall = metrics.recall(inverseLabelMapping[self._targetLevel])
        accuracy = metrics.accuracy

        print(f1_score, precision, recall, accuracy)

        #gain chart implementation
        def cal_prob_eval(x):
            if len(x) == 1:
                if x == posLabel:
                    return (float(x[1]))
                else:
                    return (float(1 - x[1]))
            else:
                return (float(x[int(posLabel)]))

        column_name = 'probability'

        def y_prob_for_eval_udf():
            return udf(lambda x: cal_prob_eval(x))

        prediction = prediction.withColumn(
            "y_prob_for_eval",
            y_prob_for_eval_udf()(col(column_name)))

        try:
            pys_df = prediction.select(
                ['y_prob_for_eval', 'prediction', 'label'])
            gain_lift_ks_obj = GainLiftKS(pys_df, 'y_prob_for_eval',
                                          'prediction', 'label', posLabel,
                                          self._spark)
            gain_lift_KS_dataframe = gain_lift_ks_obj.Run().toPandas()
        except:
            try:
                temp_df = pys_df.toPandas()
                gain_lift_ks_obj = GainLiftKS(temp_df, 'y_prob_for_eval',
                                              'prediction', 'label', posLabel,
                                              self._spark)
                gain_lift_KS_dataframe = gain_lift_ks_obj.Rank_Ordering()
            except:
                print("gain chant failed")
                gain_lift_KS_dataframe = None

        #feature_importance = MLUtils.calculate_sparkml_feature_importance(df, bestModel.stages[-1], categorical_columns, numerical_columns)
        act_list = prediction.select('label').collect()
        actual = [int(row.label) for row in act_list]

        pred_list = prediction.select('prediction').collect()
        predicted = [int(row.prediction) for row in pred_list]
        prob_list = prediction.select('probability').collect()
        probability = [list(row.probability) for row in prob_list]
        # objs = {"trained_model":bestModel,"actual":prediction.select('label'),"predicted":prediction.select('prediction'),
        # "probability":prediction.select('probability'),"feature_importance":None,
        # "featureList":list(categorical_columns) + list(numerical_columns),"labelMapping":labelMapping}
        objs = {
            "trained_model": bestModel,
            "actual": actual,
            "predicted": predicted,
            "probability": probability,
            "feature_importance": None,
            "featureList": list(categorical_columns) + list(numerical_columns),
            "labelMapping": labelMapping
        }

        conf_mat_ar = metrics.confusionMatrix().toArray()
        print(conf_mat_ar)
        confusion_matrix = {}
        for i in range(len(conf_mat_ar)):
            confusion_matrix[labelMapping[i]] = {}
            for j, val in enumerate(conf_mat_ar[i]):
                confusion_matrix[labelMapping[i]][labelMapping[j]] = val
        print(confusion_matrix)  # accuracy of the model
        '''ROC CURVE IMPLEMENTATION'''
        y_prob = probability
        y_score = predicted
        y_test = actual
        logLoss = log_loss(y_test, y_prob)
        if levels <= 2:
            positive_label_probs = []
            for val in y_prob:
                positive_label_probs.append(val[int(posLabel)])
            roc_auc = roc_auc_score(y_test, y_score)

            roc_data_dict = {
                "y_score": y_score,
                "y_test": y_test,
                "positive_label_probs": positive_label_probs,
                "y_prob": y_prob,
                "positive_label": posLabel
            }
            roc_dataframe = pd.DataFrame({
                "y_score":
                y_score,
                "y_test":
                y_test,
                "positive_label_probs":
                positive_label_probs
            })
            #roc_dataframe.to_csv("binary_roc_data.csv")
            fpr, tpr, thresholds = roc_curve(y_test,
                                             positive_label_probs,
                                             pos_label=posLabel)
            roc_df = pd.DataFrame({
                "FPR": fpr,
                "TPR": tpr,
                "thresholds": thresholds
            })
            roc_df["tpr-fpr"] = roc_df["TPR"] - roc_df["FPR"]

            optimal_index = np.argmax(np.array(roc_df["tpr-fpr"]))
            fpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "FPR"]
            tpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "TPR"]

            rounded_roc_df = roc_df.round({'FPR': 2, 'TPR': 4})

            unique_fpr = rounded_roc_df["FPR"].unique()

            final_roc_df = rounded_roc_df.groupby("FPR",
                                                  as_index=False)[["TPR"
                                                                   ]].mean()
            endgame_roc_df = final_roc_df.round({'FPR': 2, 'TPR': 3})
        elif levels > 2:
            positive_label_probs = []
            for val in y_prob:
                positive_label_probs.append(val[int(posLabel)])

            y_test_roc_multi = []
            for val in y_test:
                if val != posLabel:
                    val = posLabel + 1
                    y_test_roc_multi.append(val)
                else:
                    y_test_roc_multi.append(val)

            y_score_roc_multi = []
            for val in y_score:
                if val != posLabel:
                    val = posLabel + 1
                    y_score_roc_multi.append(val)
                else:
                    y_score_roc_multi.append(val)

            roc_auc = roc_auc_score(y_test_roc_multi, y_score_roc_multi)

            fpr, tpr, thresholds = roc_curve(y_test_roc_multi,
                                             positive_label_probs,
                                             pos_label=posLabel)
            roc_df = pd.DataFrame({
                "FPR": fpr,
                "TPR": tpr,
                "thresholds": thresholds
            })
            roc_df["tpr-fpr"] = roc_df["TPR"] - roc_df["FPR"]

            optimal_index = np.argmax(np.array(roc_df["tpr-fpr"]))
            fpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "FPR"]
            tpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "TPR"]

            rounded_roc_df = roc_df.round({'FPR': 2, 'TPR': 4})
            unique_fpr = rounded_roc_df["FPR"].unique()
            final_roc_df = rounded_roc_df.groupby("FPR",
                                                  as_index=False)[["TPR"
                                                                   ]].mean()
            endgame_roc_df = final_roc_df.round({'FPR': 2, 'TPR': 3})
        # Calculating prediction_split
        val_cnts = prediction.groupBy('label').count()
        val_cnts = map(lambda row: row.asDict(), val_cnts.collect())
        prediction_split = {}
        total_nos = prediction.select('label').count()
        for item in val_cnts:
            print(labelMapping)
            classname = labelMapping[item['label']]
            prediction_split[classname] = round(
                item['count'] * 100 / float(total_nos), 2)

        if not algoSetting.is_hyperparameter_tuning_enabled():
            modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH -
                                     1) + "1"
            modelFilepathArr = model_filepath.split("/")[:-1]
            modelFilepathArr.append(modelName)
            bestModel.save("/".join(modelFilepathArr))
        runtime = round((time.time() - st_global), 2)

        try:
            print(pmml_filepath)
            pmmlBuilder = PMMLBuilder(self._spark, trainingData,
                                      bestModel).putOption(
                                          clf, 'compact', True)
            pmmlBuilder.buildFile(pmml_filepath)
            pmmlfile = open(pmml_filepath, "r")
            pmmlText = pmmlfile.read()
            pmmlfile.close()
            self._result_setter.update_pmml_object({self._slug: pmmlText})
        except Exception as e:
            print("PMML failed...", str(e))
            pass

        cat_cols = list(set(categorical_columns) - {result_column})
        self._model_summary = MLModelSummary()
        self._model_summary.set_algorithm_name("Naive Bayes")
        self._model_summary.set_algorithm_display_name("Naive Bayes")
        self._model_summary.set_slug(self._slug)
        self._model_summary.set_training_time(runtime)
        self._model_summary.set_confusion_matrix(confusion_matrix)
        # self._model_summary.set_feature_importance(objs["feature_importance"])
        self._model_summary.set_feature_list(objs["featureList"])
        self._model_summary.set_model_accuracy(accuracy)
        self._model_summary.set_training_time(round((time.time() - st), 2))
        self._model_summary.set_precision_recall_stats([precision, recall])
        self._model_summary.set_model_precision(precision)
        self._model_summary.set_model_recall(recall)
        self._model_summary.set_model_F1_score(f1_score)
        self._model_summary.set_model_log_loss(logLoss)
        self._model_summary.set_gain_lift_KS_data(gain_lift_KS_dataframe)
        self._model_summary.set_AUC_score(roc_auc)
        self._model_summary.set_target_variable(result_column)
        self._model_summary.set_prediction_split(prediction_split)
        self._model_summary.set_validation_method("KFold")
        self._model_summary.set_level_map_dict(objs["labelMapping"])
        # self._model_summary.set_model_features(list(set(x_train.columns)-set([result_column])))
        self._model_summary.set_model_features(objs["featureList"])
        self._model_summary.set_level_counts(
            self._metaParser.get_unique_level_dict(
                list(set(categorical_columns)) + [result_column]))
        #self._model_summary.set_num_trees(objs['trained_model'].getNumTrees)
        self._model_summary.set_num_rules(300)
        self._model_summary.set_target_level(self._targetLevel)

        if not algoSetting.is_hyperparameter_tuning_enabled():
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": accuracy,
                "evaluationMetricName": "accuracy",
                "slug": self._model_summary.get_slug(),
                "Model Id": modelName
            }
            modelSummaryJson = {
                "dropdown": modelDropDownObj,
                "levelcount": self._model_summary.get_level_counts(),
                "modelFeatureList": self._model_summary.get_feature_list(),
                "levelMapping": self._model_summary.get_level_map_dict(),
                "slug": self._model_summary.get_slug(),
                "name": self._model_summary.get_algorithm_name()
            }
        else:
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": accuracy,
                "evaluationMetricName": "accuracy",
                "slug": self._model_summary.get_slug(),
                "Model Id": resultArray[0]["Model Id"]
            }
            modelSummaryJson = {
                "dropdown": modelDropDownObj,
                "levelcount": self._model_summary.get_level_counts(),
                "modelFeatureList": self._model_summary.get_feature_list(),
                "levelMapping": self._model_summary.get_level_map_dict(),
                "slug": self._model_summary.get_slug(),
                "name": self._model_summary.get_algorithm_name()
            }
        self._model_management = MLModelSummary()
        print(modelmanagement_)
        self._model_management.set_job_type(
            self._dataframe_context.get_job_name())  #Project name
        self._model_management.set_training_status(
            data="completed")  # training status
        self._model_management.set_target_level(
            self._targetLevel)  # target column value
        self._model_management.set_training_time(runtime)  # run time
        self._model_management.set_model_accuracy(round(metrics.accuracy, 2))
        # self._model_management.set_model_accuracy(round(metrics.accuracy_score(objs["actual"], objs["predicted"]),2))#accuracy
        self._model_management.set_algorithm_name(
            "NaiveBayes")  #algorithm name
        self._model_management.set_validation_method(
            str(validationDict["displayName"]) + "(" +
            str(validationDict["value"]) + ")")  #validation method
        self._model_management.set_target_variable(
            result_column)  #target column name
        self._model_management.set_creation_date(data=str(
            datetime.now().strftime('%b %d ,%Y  %H:%M ')))  #creation date
        self._model_management.set_datasetName(self._datasetName)
        self._model_management.set_model_type(data='classification')
        self._model_management.set_var_smoothing(
            data=int(modelmanagement_['smoothing']))

        # self._model_management.set_no_of_independent_variables(df) #no of independent varables

        modelManagementSummaryJson = [
            ["Project Name",
             self._model_management.get_job_type()],
            ["Algorithm",
             self._model_management.get_algorithm_name()],
            ["Training Status",
             self._model_management.get_training_status()],
            ["Accuracy",
             self._model_management.get_model_accuracy()],
            ["RunTime", self._model_management.get_training_time()],
            #["Owner",None],
            ["Created On",
             self._model_management.get_creation_date()]
        ]

        modelManagementModelSettingsJson = [
            ["Training Dataset",
             self._model_management.get_datasetName()],
            ["Target Column",
             self._model_management.get_target_variable()],
            ["Target Column Value",
             self._model_management.get_target_level()],
            ["Algorithm",
             self._model_management.get_algorithm_name()],
            [
                "Model Validation",
                self._model_management.get_validation_method()
            ],
            ["Model Type",
             self._model_management.get_model_type()],
            ["Smoothing",
             self._model_management.get_var_smoothing()],

            #,["priors",self._model_management.get_priors()]
            #,["var_smoothing",self._model_management.get_var_smoothing()]
        ]

        nbOverviewCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_card_overview(
                self._model_management, modelManagementSummaryJson,
                modelManagementModelSettingsJson)
        ]
        nbPerformanceCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_cards(
                self._model_summary, endgame_roc_df)
        ]
        nbDeploymentCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_deploy_empty_card()
        ]
        nbCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for
            cardObj in MLUtils.create_model_summary_cards(self._model_summary)
        ]
        NB_Overview_Node = NarrativesTree()
        NB_Overview_Node.set_name("Overview")
        NB_Performance_Node = NarrativesTree()
        NB_Performance_Node.set_name("Performance")
        NB_Deployment_Node = NarrativesTree()
        NB_Deployment_Node.set_name("Deployment")
        for card in nbOverviewCards:
            NB_Overview_Node.add_a_card(card)
        for card in nbPerformanceCards:
            NB_Performance_Node.add_a_card(card)
        for card in nbDeploymentCards:
            NB_Deployment_Node.add_a_card(card)
        for card in nbCards:
            self._prediction_narrative.add_a_card(card)

        self._result_setter.set_model_summary({
            "naivebayes":
            json.loads(
                CommonUtils.convert_python_object_to_json(self._model_summary))
        })
        self._result_setter.set_naive_bayes_model_summary(modelSummaryJson)
        self._result_setter.set_nb_cards(nbCards)
        self._result_setter.set_nb_nodes(
            [NB_Overview_Node, NB_Performance_Node, NB_Deployment_Node])
        self._result_setter.set_nb_fail_card({
            "Algorithm_Name": "Naive Bayes",
            "success": "True"
        })

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "completion",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")

        print("\n\n")
Esempio n. 9
0
    def __init__(self,
                 df_helper,
                 df_chisquare_result,
                 spark,
                 df_context,
                 data_frame,
                 story_narrative,
                 result_setter,
                 scriptWeight=None,
                 analysisName=None):
        self._story_narrative = story_narrative
        self._result_setter = result_setter
        self._data_frame = data_frame
        self._dataframe_context = df_context
        self._dataframe_helper = df_helper
        self._storyOnScoredData = self._dataframe_context.get_story_on_scored_data(
        )
        self._measure_columns = df_helper.get_numeric_columns()
        self._df_chisquare = df_chisquare_result
        self._df_chisquare_result = df_chisquare_result.get_result()
        self.narratives = {}
        self._appid = df_context.get_app_id()
        self._chiSquareNode = NarrativesTree()
        self._chiSquareNode.set_name("Association")
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        self._noOfSigDimsToShow = GLOBALSETTINGS.CHISQUARESIGNIFICANTDIMENSIONTOSHOW
        self._base_dir = "/chisquare/"
        self._spark = spark

        ############################DataFrame Measure to Dimesion Column#####################

        pandas_df = self._data_frame.toPandas()
        target_dimension = self._df_chisquare_result.keys()

        bin_data = {}
        for col in self._measure_columns:
            chisquare_result = self._df_chisquare.get_chisquare_result(
                target_dimension[0], col)
            bin_data[col] = chisquare_result.get_contingency_table(
            ).get_column_two_levels()

        for bin_col in bin_data.keys():
            for split in bin_data[bin_col]:
                val = split.split('to')
                pandas_df[bin_col][
                    (pandas_df[bin_col] >= float(val[0].replace(',', '')))
                    & (pandas_df[bin_col] < float(val[1].replace(',', ''))
                       )] = split

        fields = [
            StructField(field_name, StringType(), True)
            for field_name in pandas_df.columns
        ]
        schema = StructType(fields)

        SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                            sparkSession=self._spark)
        self._data_frame = SQLctx.createDataFrame(pandas_df, schema)

        # print self._data_frame
        ############################DataFrame Measure to Dimesion Column#####################

        if self._appid != None:
            if self._appid == "1":
                self._base_dir += "appid1/"
            elif self._appid == "2":
                self._base_dir += "appid2/"

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        if analysisName == None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName

        self._messageURL = self._dataframe_context.get_message_url()
        if scriptWeight == None:
            self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight(
            )
        else:
            self._scriptWeightDict = scriptWeight
        self._analysisDict = self._dataframe_context.get_analysis_dict()
        if self._analysisDict != {}:
            self._nColsToUse = self._analysisDict[
                self._analysisName]["noOfColumnsToUse"]
        else:
            self._nColsToUse = None

        self._scriptStages = {
            "initialization": {
                "summary": "Initialized the Frequency Narratives",
                "weight": 0
            },
            "summarygeneration": {
                "summary": "summary generation finished",
                "weight": 10
            },
            "completion": {
                "summary": "Frequency Stats Narratives done",
                "weight": 0
            },
        }
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "initialization",
            "info",
            display=False,
            weightKey="narratives")

        self._generate_narratives()

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "summarygeneration",
            "info",
            display=False,
            weightKey="narratives")

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "completion",
            "info",
            display=False,
            weightKey="narratives")
Esempio n. 10
0
    def Train(self):
        st_global = time.time()

        CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict,
                                                            self._scriptStages, self._slug, "initialization", "info",
                                                            display=True, emptyBin=False, customMsg=None,
                                                            weightKey="total")

        algosToRun = self._dataframe_context.get_algorithms_to_run()
        algoSetting = [x for x in algosToRun if x.get_algorithm_slug()==self._slug][0]
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()

        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})

        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns) - set(allDateCols))
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        result_column = self._dataframe_context.get_result_column()
        categorical_columns = [x for x in categorical_columns if x != result_column]

        appType = self._dataframe_context.get_app_type()

        model_path = self._dataframe_context.get_model_path()
        if model_path.startswith("file"):
            model_path = model_path[7:]
        validationDict = self._dataframe_context.get_validation_dict()

        # pipeline_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/pipeline/"
        # model_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/model"
        # pmml_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/modelPmml"

        df = self._data_frame
        levels = df.select(result_column).distinct().count()

        appType = self._dataframe_context.get_app_type()

        model_filepath = model_path + "/" + self._slug + "/model"
        pmml_filepath = str(model_path) + "/" + str(self._slug) + "/trainedModel.pmml"

        CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict,
                                                            self._scriptStages, self._slug, "training", "info",
                                                            display=True, emptyBin=False, customMsg=None,
                                                            weightKey="total")

        st = time.time()
        pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns, categorical_columns, result_column)
        vectorFeats = pipeline.getStages()[-1].transform(df)
        input_feats = len(vectorFeats.select('features').take(1)[0][0])

        trainingData, validationData = MLUtils.get_training_and_validation_data(df, result_column, 0.8)  # indexed

        labelIndexer = StringIndexer(inputCol=result_column, outputCol="label")
        # OriginalTargetconverter = IndexToString(inputCol="label", outputCol="originalTargetColumn")

        # Label Mapping and Inverse
        labelIdx = labelIndexer.fit(trainingData)
        labelMapping = {k: v for k, v in enumerate(labelIdx.labels)}
        inverseLabelMapping = {v: float(k) for k, v in enumerate(labelIdx.labels)}

        clf = MultilayerPerceptronClassifier()
        if not algoSetting.is_hyperparameter_tuning_enabled():
            algoParams = algoSetting.get_params_dict()
        else:
            algoParams = algoSetting.get_params_dict_hyperparameter()
        clfParams = [prm.name for prm in clf.params]

        algoParams = {getattr(clf, k): v if isinstance(v, list) else [v] for k, v in algoParams.items() if
                      k in clfParams}

        paramGrid = ParamGridBuilder()
        layer_param_val = algoParams[getattr(clf, 'layers')]

        for layer in layer_param_val:
            layer.insert(0, input_feats)
            layer.append(levels)

        print('layer_param_val =', layer_param_val)

        # if not algoSetting.is_hyperparameter_tuning_enabled():
        #     for k,v in algoParams.items():
        #         if k.name == 'layers':
        #             paramGrid = paramGrid.addGrid(k,layer_param_val)
        #         else:
        #             paramGrid = paramGrid.addGrid(k,v)
        #     paramGrid = paramGrid.build()
        # else:
        for k, v in algoParams.items():
            if v == [None] * len(v):
                continue
            if k.name == 'layers':
                paramGrid = paramGrid.addGrid(k, layer_param_val)
            else:
                paramGrid = paramGrid.addGrid(k, v)
        paramGrid = paramGrid.build()

        if len(paramGrid) > 1:
            hyperParamInitParam = algoSetting.get_hyperparameter_params()
            evaluationMetricDict = {"name": hyperParamInitParam["evaluationMetric"]}
            evaluationMetricDict["displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[
                evaluationMetricDict["name"]]
        else:
            evaluationMetricDict = {"name": GLOBALSETTINGS.CLASSIFICATION_MODEL_EVALUATION_METRIC}
            evaluationMetricDict["displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[
                evaluationMetricDict["name"]]

        self._result_setter.set_hyper_parameter_results(self._slug, None)

        if validationDict["name"] == "kFold":
            numFold = int(validationDict["value"])
            estimator = Pipeline(stages=[pipeline, labelIndexer, clf])
            if algoSetting.is_hyperparameter_tuning_enabled():
                modelFilepath = "/".join(model_filepath.split("/")[:-1])
                pySparkHyperParameterResultObj = PySparkGridSearchResult(estimator, paramGrid, appType, modelFilepath,
                                                                         levels,
                                                                         evaluationMetricDict, trainingData,
                                                                         validationData, numFold, self._targetLevel,
                                                                         labelMapping, inverseLabelMapping,
                                                                         df)
                resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models()
                self._result_setter.set_hyper_parameter_results(self._slug, resultArray)
                self._result_setter.set_metadata_parallel_coordinates(self._slug,
                                                                      {
                                                                          "ignoreList": pySparkHyperParameterResultObj.get_ignore_list(),
                                                                          "hideColumns": pySparkHyperParameterResultObj.get_hide_columns(),
                                                                          "metricColName": pySparkHyperParameterResultObj.get_comparison_metric_colname(),
                                                                          "columnOrder": pySparkHyperParameterResultObj.get_keep_columns()})

                bestModel = pySparkHyperParameterResultObj.getBestModel()
                prediction = pySparkHyperParameterResultObj.getBestPrediction()
                bestModelName = resultArray[0]["Model Id"]

            else:
                crossval = CrossValidator(estimator=estimator,
                                          estimatorParamMaps=paramGrid,
                                          evaluator=BinaryClassificationEvaluator() if levels == 2 else MulticlassClassificationEvaluator(),
                                          numFolds=3 if numFold is None else numFold)  # use 3+ folds in practice
                cvrf = crossval.fit(trainingData)
                prediction = cvrf.transform(validationData)
                bestModel = cvrf.bestModel
                bestModelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1"

        else:
            train_test_ratio = float(self._dataframe_context.get_train_test_split())
            estimator = Pipeline(stages=[pipeline, labelIndexer, clf])
            if algoSetting.is_hyperparameter_tuning_enabled():
                modelFilepath = "/".join(model_filepath.split("/")[:-1])
                pySparkHyperParameterResultObj = PySparkTrainTestResult(estimator, paramGrid, appType, modelFilepath,
                                                                        levels,
                                                                        evaluationMetricDict, trainingData,
                                                                        validationData, train_test_ratio,
                                                                        self._targetLevel, labelMapping,
                                                                        inverseLabelMapping,
                                                                        df)
                resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models()
                self._result_setter.set_hyper_parameter_results(self._slug, resultArray)
                self._result_setter.set_metadata_parallel_coordinates(self._slug,
                                                                      {
                                                                          "ignoreList": pySparkHyperParameterResultObj.get_ignore_list(),
                                                                          "hideColumns": pySparkHyperParameterResultObj.get_hide_columns(),
                                                                          "metricColName": pySparkHyperParameterResultObj.get_comparison_metric_colname(),
                                                                          "columnOrder": pySparkHyperParameterResultObj.get_keep_columns()})

                bestModel = pySparkHyperParameterResultObj.getBestModel()
                prediction = pySparkHyperParameterResultObj.getBestPrediction()
                bestModelName = resultArray[0]["Model Id"]

            else:
                tvs = TrainValidationSplit(estimator=estimator,
                                           estimatorParamMaps=paramGrid,
                                           evaluator=BinaryClassificationEvaluator() if levels == 2 else MulticlassClassificationEvaluator(),
                                           trainRatio=train_test_ratio)

                tvrf = tvs.fit(trainingData)
                prediction = tvrf.transform(validationData)
                bestModel = tvrf.bestModel
                bestModelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1"

        MLUtils.save_pipeline_or_model(bestModel,model_filepath)
        predsAndLabels = prediction.select(['prediction', 'label']).rdd.map(tuple)
        metrics = MulticlassMetrics(predsAndLabels)
        posLabel = inverseLabelMapping[self._targetLevel]

        conf_mat_ar = metrics.confusionMatrix().toArray()
        print(conf_mat_ar)
        confusion_matrix = {}
        for i in range(len(conf_mat_ar)):
            confusion_matrix[labelMapping[i]] = {}
            for j, val in enumerate(conf_mat_ar[i]):
                confusion_matrix[labelMapping[i]][labelMapping[j]] = val
        print(confusion_matrix)

        trainingTime = time.time() - st

        f1_score = metrics.fMeasure(inverseLabelMapping[self._targetLevel], 1.0)
        precision = metrics.precision(inverseLabelMapping[self._targetLevel])
        recall = metrics.recall(inverseLabelMapping[self._targetLevel])
        accuracy = metrics.accuracy
        roc_auc = 'Undefined'
        if levels == 2:
            bin_metrics = BinaryClassificationMetrics(predsAndLabels)
            roc_auc = bin_metrics.areaUnderROC
            precision = metrics.precision(inverseLabelMapping[self._targetLevel])
            recall = metrics.recall(inverseLabelMapping[self._targetLevel])
        print(f1_score,precision,recall,accuracy)

        #gain chart implementation
        def cal_prob_eval(x):
            if len(x) == 1:
                if x == posLabel:
                    return(float(x[1]))
                else:
                    return(float(1 - x[1]))
            else:
                return(float(x[int(posLabel)]))


        column_name= 'probability'
        def y_prob_for_eval_udf():
            return udf(lambda x:cal_prob_eval(x))
        prediction = prediction.withColumn("y_prob_for_eval", y_prob_for_eval_udf()(col(column_name)))

        try:
            pys_df = prediction.select(['y_prob_for_eval','prediction','label'])
            gain_lift_ks_obj = GainLiftKS(pys_df, 'y_prob_for_eval', 'prediction', 'label', posLabel, self._spark)
            gain_lift_KS_dataframe = gain_lift_ks_obj.Run().toPandas()
        except:
            try:
                temp_df = pys_df.toPandas()
                gain_lift_ks_obj = GainLiftKS(temp_df, 'y_prob_for_eval', 'prediction', 'label', posLabel, self._spark)
                gain_lift_KS_dataframe = gain_lift_ks_obj.Rank_Ordering()
            except:
                print("gain chant failed")
                gain_lift_KS_dataframe = None


        objs = {"trained_model": bestModel, "actual": prediction.select('label'),
                "predicted": prediction.select('prediction'),
                "probability": prediction.select('probability'), "feature_importance": None,
                "featureList": list(categorical_columns) + list(numerical_columns), "labelMapping": labelMapping}

        # Calculating prediction_split
        val_cnts = prediction.groupBy('label').count()
        val_cnts = map(lambda row: row.asDict(), val_cnts.collect())
        prediction_split = {}
        total_nos = objs['actual'].count()
        for item in val_cnts:
            classname = labelMapping[item['label']]
            prediction_split[classname] = round(item['count'] * 100 / float(total_nos), 2)

        if not algoSetting.is_hyperparameter_tuning_enabled():
            # modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1"
            modelFilepathArr = model_filepath.split("/")[:-1]
            modelFilepathArr.append(bestModelName)
            bestModel.save("/".join(modelFilepathArr))
        runtime = round((time.time() - st_global), 2)

        try:
            print(pmml_filepath)
            pmmlBuilder = PMMLBuilder(self._spark, trainingData, bestModel).putOption(clf, 'compact', True)
            pmmlBuilder.buildFile(pmml_filepath)
            pmmlfile = open(pmml_filepath, "r")
            pmmlText = pmmlfile.read()
            pmmlfile.close()
            self._result_setter.update_pmml_object({self._slug: pmmlText})
        except Exception as e:
            print("PMML failed...", str(e))
            pass

        cat_cols = list(set(categorical_columns) - {result_column})
        self._model_summary = MLModelSummary()
        self._model_summary.set_algorithm_name("Spark ML Multilayer Perceptron")
        self._model_summary.set_algorithm_display_name("Spark ML Multilayer Perceptron")
        self._model_summary.set_slug(self._slug)
        self._model_summary.set_training_time(runtime)
        self._model_summary.set_confusion_matrix(confusion_matrix)
        self._model_summary.set_feature_importance(objs["feature_importance"])
        self._model_summary.set_feature_list(objs["featureList"])
        self._model_summary.set_model_accuracy(accuracy)
        self._model_summary.set_training_time(round((time.time() - st), 2))
        self._model_summary.set_precision_recall_stats([precision, recall])
        self._model_summary.set_model_precision(precision)
        self._model_summary.set_model_recall(recall)
        self._model_summary.set_target_variable(result_column)
        self._model_summary.set_prediction_split(prediction_split)
        self._model_summary.set_validation_method("KFold")
        self._model_summary.set_level_map_dict(objs["labelMapping"])
        self._model_summary.set_model_features(objs["featureList"])
        self._model_summary.set_level_counts(
            self._metaParser.get_unique_level_dict(list(set(categorical_columns)) + [result_column]))
        self._model_summary.set_num_trees(None)
        self._model_summary.set_num_rules(300)
        self._model_summary.set_target_level(self._targetLevel)

        modelManagementJson = {
            "Model ID": "SPMLP-" + bestModelName,
            "Project Name": self._dataframe_context.get_job_name(),
            "Algorithm": self._model_summary.get_algorithm_name(),
            "Status": 'Completed',
            "Accuracy": accuracy,
            "Runtime": runtime,
            "Created On": "",
            "Owner": "",
            "Deployment": 0,
            "Action": ''
        }

        # if not algoSetting.is_hyperparameter_tuning_enabled():
        #     modelDropDownObj = {
        #         "name": self._model_summary.get_algorithm_name(),
        #         "evaluationMetricValue": locals()[evaluationMetricDict["name"]], # accuracy
        #         "evaluationMetricName": evaluationMetricDict["displayName"], # accuracy
        #         "slug": self._model_summary.get_slug(),
        #         "Model Id": bestModelName
        #     }
        #     modelSummaryJson = {
        #         "dropdown": modelDropDownObj,
        #         "levelcount": self._model_summary.get_level_counts(),
        #         "modelFeatureList": self._model_summary.get_feature_list(),
        #         "levelMapping": self._model_summary.get_level_map_dict(),
        #         "slug": self._model_summary.get_slug(),
        #         "name": self._model_summary.get_algorithm_name()
        #     }
        # else:
        modelDropDownObj = {
            "name": self._model_summary.get_algorithm_name(),
            "evaluationMetricValue": accuracy, #locals()[evaluationMetricDict["name"]],
            "evaluationMetricName": "accuracy", # evaluationMetricDict["name"],
            "slug": self._model_summary.get_slug(),
            "Model Id": bestModelName
        }
        modelSummaryJson = {
            "dropdown": modelDropDownObj,
            "levelcount": self._model_summary.get_level_counts(),
            "modelFeatureList": self._model_summary.get_feature_list(),
            "levelMapping": self._model_summary.get_level_map_dict(),
            "slug": self._model_summary.get_slug(),
            "name": self._model_summary.get_algorithm_name()
        }

        mlpcCards = [json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in
                     MLUtils.create_model_summary_cards(self._model_summary)]
        for card in mlpcCards:
            self._prediction_narrative.add_a_card(card)

        self._result_setter.set_model_summary(
            {"sparkperceptron": json.loads(CommonUtils.convert_python_object_to_json(self._model_summary))})
        self._result_setter.set_spark_multilayer_perceptron_model_summary(modelSummaryJson)
        self._result_setter.set_spark_multilayer_perceptron_management_summary(modelManagementJson)
        self._result_setter.set_mlpc_cards(mlpcCards)

        CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict,
                                                            self._scriptStages, self._slug, "completion", "info",
                                                            display=True, emptyBin=False, customMsg=None,
                                                            weightKey="total")
    def Predict(self):
        self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight()
        self._scriptStages = {
            "initialization":{
                "summary":"Initialized the Decision Tree Regression Scripts",
                "weight":2
                },
            "predictionStart":{
                "summary":"Decision Tree Regression Model Prediction Started",
                "weight":2
                },
            "predictionFinished":{
                "summary":"Decision Tree Regression Model Prediction Finished",
                "weight":6
                }
            }
        CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._slug,"initialization","info",display=True,emptyBin=False,customMsg=None,weightKey="total")

        SQLctx = SQLContext(sparkContext=self._spark.sparkContext, sparkSession=self._spark)
        dataSanity = True
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()
        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})
        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns)-set(allDateCols))
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        result_column = self._dataframe_context.get_result_column()
        test_data_path = self._dataframe_context.get_input_file()

        if self._mlEnv == "spark":
            score_data_path = self._dataframe_context.get_score_path()+"/data.csv"
            trained_model_path = "file://" + self._dataframe_context.get_model_path()
            trained_model_path += "/model"
            pipeline_path = "/".join(trained_model_path.split("/")[:-1])+"/pipeline"
            print "trained_model_path",trained_model_path
            print "pipeline_path",pipeline_path
            print "score_data_path",score_data_path
            pipelineModel = MLUtils.load_pipeline(pipeline_path)
            trained_model = MLUtils.load_dtree_regresssion_pyspark_model(trained_model_path)
            df = self._data_frame
            indexed = pipelineModel.transform(df)
            transformed = trained_model.transform(indexed)
            if result_column in transformed.columns:
                transformed = transformed.withColumnRenamed(result_column,"originalLabel")
            transformed = transformed.withColumnRenamed("prediction",result_column)
            pandas_scored_df = transformed.select(list(set(self._data_frame.columns+[result_column]))).toPandas()
            if score_data_path.startswith("file"):
                score_data_path = score_data_path[7:]
            pandas_scored_df.to_csv(score_data_path,header=True,index=False)

            print "STARTING Measure ANALYSIS ..."
            columns_to_keep = []
            columns_to_drop = []
            columns_to_keep = self._dataframe_context.get_score_consider_columns()
            if len(columns_to_keep) > 0:
                columns_to_drop = list(set(df.columns)-set(columns_to_keep))
            else:
                columns_to_drop += ["predicted_probability"]
            columns_to_drop = [x for x in columns_to_drop if x in df.columns and x != result_column]
            print "columns_to_drop",columns_to_drop
            spark_scored_df = transformed.select(list(set(columns_to_keep+[result_column])))

        elif self._mlEnv == "sklearn":
            CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._slug,"predictionStart","info",display=True,emptyBin=False,customMsg=None,weightKey="total")
            score_data_path = self._dataframe_context.get_score_path()+"/data.csv"
            trained_model_path = "file://" + self._dataframe_context.get_model_path()
            trained_model_path += "/"+self._dataframe_context.get_model_for_scoring()+".pkl"
            print "trained_model_path",trained_model_path
            print "score_data_path",score_data_path
            if trained_model_path.startswith("file"):
                trained_model_path = trained_model_path[7:]
            trained_model = joblib.load(trained_model_path)
            model_columns = self._dataframe_context.get_model_features()
            print "model_columns",model_columns

            df = self._data_frame.toPandas()
            # pandas_df = MLUtils.factorize_columns(df,[x for x in categorical_columns if x != result_column])
            pandas_df = MLUtils.create_dummy_columns(df,[x for x in categorical_columns if x != result_column])
            pandas_df = MLUtils.fill_missing_columns(pandas_df,model_columns,result_column)

            if uid_col:
                pandas_df = pandas_df[[x for x in pandas_df.columns if x != uid_col]]
            y_score = trained_model.predict(pandas_df)

            scoreKpiArray = MLUtils.get_scored_data_summary(y_score)
            kpiCard = NormalCard()
            kpiCardData = [KpiData(data=x) for x in scoreKpiArray]
            kpiCard.set_card_data(kpiCardData)
            kpiCard.set_cente_alignment(True)
            print CommonUtils.convert_python_object_to_json(kpiCard)
            self._result_setter.set_kpi_card_regression_score(kpiCard)

            pandas_df[result_column] = y_score
            df[result_column] = y_score
            df.to_csv(score_data_path,header=True,index=False)
            CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._slug,"predictionFinished","info",display=True,emptyBin=False,customMsg=None,weightKey="total")


            print "STARTING Measure ANALYSIS ..."
            columns_to_keep = []
            columns_to_drop = []
            columns_to_keep = self._dataframe_context.get_score_consider_columns()
            if len(columns_to_keep) > 0:
                columns_to_drop = list(set(df.columns)-set(columns_to_keep))
            else:
                columns_to_drop += ["predicted_probability"]

            columns_to_drop = [x for x in columns_to_drop if x in df.columns and x != result_column]
            print "columns_to_drop",columns_to_drop
            pandas_scored_df = df[list(set(columns_to_keep+[result_column]))]
            spark_scored_df = SQLctx.createDataFrame(pandas_scored_df)
            # spark_scored_df.write.csv(score_data_path+"/data",mode="overwrite",header=True)
            # TODO update metadata for the newly created dataframe
            self._dataframe_context.update_consider_columns(columns_to_keep)
            print spark_scored_df.printSchema()

        df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context,self._metaParser)
        df_helper.set_params()
        df = df_helper.get_data_frame()
        # self._dataframe_context.set_dont_send_message(True)
        try:
            fs = time.time()
            descr_stats_obj = DescriptiveStatsScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark,self._prediction_narrative,scriptWeight=self._scriptWeightDict,analysisName="Descriptive analysis")
            descr_stats_obj.Run()
            print "DescriptiveStats Analysis Done in ", time.time() - fs, " seconds."
        except:
            print "Frequency Analysis Failed "

        # try:
        #     fs = time.time()
        #     df_helper.fill_na_dimension_nulls()
        #     df = df_helper.get_data_frame()
        #     dt_reg = DecisionTreeRegressionScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark,self._prediction_narrative,self._metaParser,scriptWeight=self._scriptWeightDict,analysisName="Predictive modeling")
        #     dt_reg.Run()
        #     print "DecisionTrees Analysis Done in ", time.time() - fs, " seconds."
        # except:
        #     print "DTREE FAILED"

        try:
            fs = time.time()
            two_way_obj = TwoWayAnovaScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark,self._prediction_narrative,self._metaParser,scriptWeight=self._scriptWeightDict,analysisName="Measure vs. Dimension")
            two_way_obj.Run()
            print "OneWayAnova Analysis Done in ", time.time() - fs, " seconds."
        except:
            print "Anova Analysis Failed"
Esempio n. 12
0
    def Train(self):
        st_global = time.time()

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "initialization",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")

        algosToRun = self._dataframe_context.get_algorithms_to_run()
        algoSetting = [
            x for x in algosToRun if x.get_algorithm_slug() == self._slug
        ][0]
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()
        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})
        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns) - set(allDateCols))
        print(categorical_columns)
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        result_column = self._dataframe_context.get_result_column()

        model_path = self._dataframe_context.get_model_path()
        if model_path.startswith("file"):
            model_path = model_path[7:]
        validationDict = self._dataframe_context.get_validation_dict()
        print("model_path", model_path)
        pipeline_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/pipeline/"
        model_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/model"
        pmml_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/modelPmml"

        df = self._data_frame
        if self._mlEnv == "spark":
            pass
        elif self._mlEnv == "sklearn":
            model_filepath = model_path + "/" + self._slug + "/model.pkl"
            pmml_filepath = str(model_path) + "/" + str(
                self._slug) + "/traindeModel.pmml"

            x_train, x_test, y_train, y_test = self._dataframe_helper.get_train_test_data(
            )
            x_train = MLUtils.create_dummy_columns(
                x_train,
                [x for x in categorical_columns if x != result_column])
            x_test = MLUtils.create_dummy_columns(
                x_test, [x for x in categorical_columns if x != result_column])
            x_test = MLUtils.fill_missing_columns(x_test, x_train.columns,
                                                  result_column)

            CommonUtils.create_update_and_save_progress_message(
                self._dataframe_context,
                self._scriptWeightDict,
                self._scriptStages,
                self._slug,
                "training",
                "info",
                display=True,
                emptyBin=False,
                customMsg=None,
                weightKey="total")

            st = time.time()
            levels = df[result_column].unique()
            clf = SVC(kernel='linear', probability=True)

            labelEncoder = preprocessing.LabelEncoder()
            labelEncoder.fit(np.concatenate([y_train, y_test]))
            y_train = pd.Series(labelEncoder.transform(y_train))
            y_test = labelEncoder.transform(y_test)
            classes = labelEncoder.classes_
            transformed = labelEncoder.transform(classes)
            labelMapping = dict(list(zip(transformed, classes)))
            inverseLabelMapping = dict(list(zip(classes, transformed)))
            posLabel = inverseLabelMapping[self._targetLevel]
            appType = self._dataframe_context.get_app_type()

            print(appType, labelMapping, inverseLabelMapping, posLabel,
                  self._targetLevel)

            if algoSetting.is_hyperparameter_tuning_enabled():
                hyperParamInitParam = algoSetting.get_hyperparameter_params()
                evaluationMetricDict = {
                    "name": hyperParamInitParam["evaluationMetric"]
                }
                evaluationMetricDict[
                    "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[
                        evaluationMetricDict["name"]]
                hyperParamAlgoName = algoSetting.get_hyperparameter_algo_name()
                params_grid = algoSetting.get_params_dict_hyperparameter()
                params_grid = {
                    k: v
                    for k, v in list(params_grid.items())
                    if k in clf.get_params()
                }
                print(params_grid)
                if hyperParamAlgoName == "gridsearchcv":
                    clfGrid = GridSearchCV(clf, params_grid)
                    gridParams = clfGrid.get_params()
                    hyperParamInitParam = {
                        k: v
                        for k, v in list(hyperParamInitParam.items())
                        if k in gridParams
                    }
                    clfGrid.set_params(**hyperParamInitParam)
                    #clfGrid.fit(x_train,y_train)
                    grid_param = {}
                    grid_param['params'] = ParameterGrid(params_grid)
                    #bestEstimator = clfGrid.best_estimator_
                    modelFilepath = "/".join(model_filepath.split("/")[:-1])
                    sklearnHyperParameterResultObj = SklearnGridSearchResult(
                        grid_param, clf, x_train, x_test, y_train, y_test,
                        appType, modelFilepath, levels, posLabel,
                        evaluationMetricDict)
                    resultArray = sklearnHyperParameterResultObj.train_and_save_models(
                    )
                    self._result_setter.set_hyper_parameter_results(
                        self._slug, resultArray)
                    self._result_setter.set_metadata_parallel_coordinates(
                        self._slug, {
                            "ignoreList":
                            sklearnHyperParameterResultObj.get_ignore_list(),
                            "hideColumns":
                            sklearnHyperParameterResultObj.get_hide_columns(),
                            "metricColName":
                            sklearnHyperParameterResultObj.
                            get_comparison_metric_colname(),
                            "columnOrder":
                            sklearnHyperParameterResultObj.get_keep_columns()
                        })
                elif hyperParamAlgoName == "randomsearchcv":
                    clfRand = RandomizedSearchCV(clf, params_grid)
                    clfRand.set_params(**hyperParamInitParam)
                    bestEstimator = None
            else:
                evaluationMetricDict = {
                    "name":
                    GLOBALSETTINGS.CLASSIFICATION_MODEL_EVALUATION_METRIC
                }
                evaluationMetricDict[
                    "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[
                        evaluationMetricDict["name"]]
                self._result_setter.set_hyper_parameter_results(
                    self._slug, None)
                algoParams = algoSetting.get_params_dict()
                algoParams = {
                    k: v
                    for k, v in list(algoParams.items())
                    if k in list(clf.get_params().keys())
                }
                clf.set_params(**algoParams)
                print("!" * 50)
                print(clf.get_params())
                print("!" * 50)
                if validationDict["name"] == "kFold":
                    defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT[
                        "value"]
                    numFold = int(validationDict["value"])
                    if numFold == 0:
                        numFold = 3
                    kFoldClass = SkleanrKFoldResult(
                        numFold,
                        clf,
                        x_train,
                        x_test,
                        y_train,
                        y_test,
                        appType,
                        levels,
                        posLabel,
                        evaluationMetricDict=evaluationMetricDict)
                    kFoldClass.train_and_save_result()
                    kFoldOutput = kFoldClass.get_kfold_result()
                    bestEstimator = kFoldClass.get_best_estimator()
                elif validationDict["name"] == "trainAndtest":
                    clf.fit(x_train, y_train)
                    bestEstimator = clf

            # clf.fit(x_train, y_train)
            # bestEstimator = clf
            trainingTime = time.time() - st
            y_score = bestEstimator.predict(x_test)
            try:
                y_prob = bestEstimator.predict_proba(x_test)
            except:
                y_prob = [0] * len(y_score)

            # overall_precision_recall = MLUtils.calculate_overall_precision_recall(y_test,y_score,targetLevel = self._targetLevel)
            # print overall_precision_recall
            accuracy = metrics.accuracy_score(y_test, y_score)
            if len(levels) <= 2:
                precision = metrics.precision_score(y_test,
                                                    y_score,
                                                    pos_label=posLabel,
                                                    average="binary")
                recall = metrics.recall_score(y_test,
                                              y_score,
                                              pos_label=posLabel,
                                              average="binary")
                auc = metrics.roc_auc_score(y_test, y_score)
            elif len(levels) > 2:
                precision = metrics.precision_score(y_test,
                                                    y_score,
                                                    pos_label=posLabel,
                                                    average="macro")
                recall = metrics.recall_score(y_test,
                                              y_score,
                                              pos_label=posLabel,
                                              average="macro")
                # auc = metrics.roc_auc_score(y_test,y_score,average="weighted")
                auc = None
            y_score = labelEncoder.inverse_transform(y_score)
            y_test = labelEncoder.inverse_transform(y_test)

            featureImportance = {}
            feature_importance = dict(
                sorted(zip(x_train.columns,
                           bestEstimator.feature_importances_),
                       key=lambda x: x[1],
                       reverse=True))
            for k, v in feature_importance.items():
                feature_importance[k] = CommonUtils.round_sig(v)
            objs = {
                "trained_model": bestEstimator,
                "actual": y_test,
                "predicted": y_score,
                "probability": y_prob,
                "feature_importance": feature_importance,
                "featureList": list(x_train.columns),
                "labelMapping": labelMapping
            }

            if not algoSetting.is_hyperparameter_tuning_enabled():
                modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH -
                                         1) + "1"
                modelFilepathArr = model_filepath.split("/")[:-1]
                modelFilepathArr.append(modelName + ".pkl")
                joblib.dump(objs["trained_model"], "/".join(modelFilepathArr))
            runtime = round((time.time() - st_global), 2)

        try:
            modelPmmlPipeline = PMMLPipeline([("pretrained-estimator",
                                               objs["trained_model"])])
            modelPmmlPipeline.target_field = result_column
            modelPmmlPipeline.active_fields = np.array(
                [col for col in x_train.columns if col != result_column])
            sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True)
            pmmlfile = open(pmml_filepath, "r")
            pmmlText = pmmlfile.read()
            pmmlfile.close()
            self._result_setter.update_pmml_object({self._slug: pmmlText})
        except:
            pass
        cat_cols = list(set(categorical_columns) - {result_column})
        overall_precision_recall = MLUtils.calculate_overall_precision_recall(
            objs["actual"], objs["predicted"], targetLevel=self._targetLevel)
        self._model_summary = MLModelSummary()
        self._model_summary.set_algorithm_name("Svm")
        self._model_summary.set_algorithm_display_name(
            "Support Vector Machine")
        self._model_summary.set_slug(self._slug)
        self._model_summary.set_training_time(runtime)
        self._model_summary.set_confusion_matrix(
            MLUtils.calculate_confusion_matrix(objs["actual"],
                                               objs["predicted"]))
        self._model_summary.set_feature_importance(objs["feature_importance"])
        self._model_summary.set_feature_list(objs["featureList"])
        self._model_summary.set_model_accuracy(
            round(metrics.accuracy_score(objs["actual"], objs["predicted"]),
                  2))
        self._model_summary.set_training_time(round((time.time() - st), 2))
        self._model_summary.set_precision_recall_stats(
            overall_precision_recall["classwise_stats"])
        self._model_summary.set_model_precision(
            overall_precision_recall["precision"])
        self._model_summary.set_model_recall(
            overall_precision_recall["recall"])
        self._model_summary.set_target_variable(result_column)
        self._model_summary.set_prediction_split(
            overall_precision_recall["prediction_split"])
        self._model_summary.set_validation_method("Train and Test")
        self._model_summary.set_level_map_dict(objs["labelMapping"])
        # self._model_summary.set_model_features(list(set(x_train.columns)-set([result_column])))
        self._model_summary.set_model_features(
            [col for col in x_train.columns if col != result_column])
        self._model_summary.set_level_counts(
            self._metaParser.get_unique_level_dict(
                list(set(categorical_columns))))
        self._model_summary.set_num_trees(100)
        self._model_summary.set_num_rules(300)
        if not algoSetting.is_hyperparameter_tuning_enabled():
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue":
                self._model_summary.get_model_accuracy(),
                "evaluationMetricName": "accuracy",
                "slug": self._model_summary.get_slug(),
                "Model Id": modelName
            }

            modelSummaryJson = {
                "dropdown": modelDropDownObj,
                "levelcount": self._model_summary.get_level_counts(),
                "modelFeatureList": self._model_summary.get_feature_list(),
                "levelMapping": self._model_summary.get_level_map_dict(),
                "slug": self._model_summary.get_slug(),
                "name": self._model_summary.get_algorithm_name()
            }
        else:
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": resultArray[0]["Accuracy"],
                "evaluationMetricName": "accuracy",
                "slug": self._model_summary.get_slug(),
                "Model Id": resultArray[0]["Model Id"]
            }
            modelSummaryJson = {
                "dropdown": modelDropDownObj,
                "levelcount": self._model_summary.get_level_counts(),
                "modelFeatureList": self._model_summary.get_feature_list(),
                "levelMapping": self._model_summary.get_level_map_dict(),
                "slug": self._model_summary.get_slug(),
                "name": self._model_summary.get_algorithm_name()
            }

        svmCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for
            cardObj in MLUtils.create_model_summary_cards(self._model_summary)
        ]
        for card in svmCards:
            self._prediction_narrative.add_a_card(card)

        self._result_setter.set_model_summary({
            "svm":
            json.loads(
                CommonUtils.convert_python_object_to_json(self._model_summary))
        })
        self._result_setter.set_svm_model_summary(modelSummaryJson)
        self._result_setter.set_rf_cards(svmCards)

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "completion",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")
Esempio n. 13
0
    def __init__(self, data_frame,column_name, measure_descr_stats, df_helper, df_context, result_setter, story_narrative,scriptWeight=None, analysisName=None):
        self._story_narrative = story_narrative
        self._result_setter = result_setter
        self._column_name = column_name.lower()
        self._capitalized_column_name = "%s%s" % (column_name[0].upper(), column_name[1:])
        self._measure_descr_stats = measure_descr_stats
        self._five_point_summary_stats = measure_descr_stats.get_five_point_summary_stats()
        self._data_frame = data_frame
        try:
            self._total_rows = self._data_frame.shape[0]
        except:
            self._total_rows = self._data_frame.count()
        # self._histogram = measure_descr_stats.get_histogram()
        # self._num_columns = context.get_column_count()
        # self._num_rows = context.get_row_count()
        # self._measures = context.get_measures()
        # self._dimensions = context.get_dimensions()
        # self._time_dimensions = context.get_time_dimension()
        self._dataframe_helper = df_helper
        self._dataframe_context = df_context
        self._pandas_flag = self._dataframe_context._pandas_flag
        self._storyOnScoredData = self._dataframe_context.get_story_on_scored_data()
        self.title = None
        self.heading = self._capitalized_column_name + ' Performance Analysis'
        self.sub_heading = "Distribution of " + self._capitalized_column_name
        self.summary = None
        self._analysis1 = None
        self._analysis2 = None
        self.analysis = None
        self.take_away = None
        self.card2 = ''
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        self._highlightFlag = "|~HIGHLIGHT~|"
        self._base_dir = "/descriptive/"
        self.num_measures = len(self._dataframe_helper.get_numeric_columns())
        self.num_dimensions = len(self._dataframe_helper.get_string_columns())
        self.num_time_dimensions = len(self._dataframe_helper.get_timestamp_columns())

        self._completionStatus = self._dataframe_context.get_completion_status()
        self._messageURL = self._dataframe_context.get_message_url()
        if analysisName == None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName
        if scriptWeight == None:
            self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight()
        else:
            self._scriptWeightDict = scriptWeight
        self._scriptStages = {
            "statNarrativeStart":{
                "summary":"Started The Descriptive Stats Narratives",
                "weight":0
                },
            "statNarrativeEnd":{
                "summary":"Narratives For Descriptive Stats Finished",
                "weight":10
                },
            }

        CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._analysisName,"statNarrativeStart","info",display=False,emptyBin=False,customMsg=None,weightKey="narratives")


        self._measureSummaryNode = NarrativesTree()
        self._headNode = NarrativesTree()
        self._headNode.set_name("Overview")
        self._generate_narratives()
        self._story_narrative.add_a_node(self._measureSummaryNode)
        self._result_setter.set_head_node(self._headNode)
        self._result_setter.set_distribution_node(self._measureSummaryNode)

        CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._analysisName,"statNarrativeEnd","info",display=False,emptyBin=False,customMsg=None,weightKey="narratives")
Esempio n. 14
0
    def Predict(self):
        self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight(
        )
        self._scriptStages = {
            "initialization": {
                "summary": "Initialized The Neural Network (PyTorch)  Scripts",
                "weight": 2
            },
            "predictionStart": {
                "summary": "Neural Network (PyTorch)  Prediction Started",
                "weight": 2
            },
            "predictionFinished": {
                "summary": "Neural Network (PyTorch)  Prediction Finished",
                "weight": 6
            }
        }
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "initialization",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")

        SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                            sparkSession=self._spark)
        dataSanity = True
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()
        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})
        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns) - set(allDateCols))
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        result_column = self._dataframe_context.get_result_column()
        test_data_path = self._dataframe_context.get_input_file()

        if self._mlEnv == "spark":
            pass

        elif self._mlEnv == "sklearn":
            CommonUtils.create_update_and_save_progress_message(
                self._dataframe_context,
                self._scriptWeightDict,
                self._scriptStages,
                self._slug,
                "predictionStart",
                "info",
                display=True,
                emptyBin=False,
                customMsg=None,
                weightKey="total")
            score_data_path = self._dataframe_context.get_score_path(
            ) + "/data.csv"
            trained_model_path = "file://" + self._dataframe_context.get_model_path(
            )
            trained_model_path += "/" + self._dataframe_context.get_model_for_scoring(
            ) + ".pt"
            print("trained_model_path", trained_model_path)
            print("score_data_path", score_data_path)
            if trained_model_path.startswith("file"):
                trained_model_path = trained_model_path[7:]
            #trained_model = joblib.load(trained_model_path)
            trained_model = torch.load(trained_model_path,
                                       map_location=torch.device('cpu'))
            model_columns = self._dataframe_context.get_model_features()
            print("model_columns", model_columns)
            try:
                df = self._data_frame.toPandas()
            except:
                df = self._data_frame
            # pandas_df = MLUtils.factorize_columns(df,[x for x in categorical_columns if x != result_column])
            pandas_df = MLUtils.create_dummy_columns(
                df, [x for x in categorical_columns if x != result_column])
            pandas_df = MLUtils.fill_missing_columns(pandas_df, model_columns,
                                                     result_column)

            if uid_col:
                pandas_df = pandas_df[[
                    x for x in pandas_df.columns if x != uid_col
                ]]

            test_df = np.stack(
                [pandas_df[col].values for col in pandas_df.columns], 1)
            tensored_test_df = torch.tensor(test_df, dtype=torch.float)

            outputs_test_df_tensored = trained_model(tensored_test_df.float())

            y_score_mid = outputs_test_df_tensored.tolist()
            y_score = [x[0] for x in y_score_mid]

            scoreKpiArray = MLUtils.get_scored_data_summary(y_score)
            kpiCard = NormalCard()
            kpiCardData = [KpiData(data=x) for x in scoreKpiArray]
            kpiCard.set_card_data(kpiCardData)
            kpiCard.set_cente_alignment(True)
            print(CommonUtils.convert_python_object_to_json(kpiCard))
            self._result_setter.set_kpi_card_regression_score(kpiCard)

            pandas_df[result_column] = y_score
            df[result_column] = y_score
            df.to_csv(score_data_path, header=True, index=False)
            CommonUtils.create_update_and_save_progress_message(
                self._dataframe_context,
                self._scriptWeightDict,
                self._scriptStages,
                self._slug,
                "predictionFinished",
                "info",
                display=True,
                emptyBin=False,
                customMsg=None,
                weightKey="total")

            print("STARTING Measure ANALYSIS ...")
            columns_to_keep = []
            columns_to_drop = []
            columns_to_keep = self._dataframe_context.get_score_consider_columns(
            )
            if len(columns_to_keep) > 0:
                columns_to_drop = list(set(df.columns) - set(columns_to_keep))
            else:
                columns_to_drop += ["predicted_probability"]

            columns_to_drop = [
                x for x in columns_to_drop
                if x in df.columns and x != result_column
            ]
            print("columns_to_drop", columns_to_drop)
            pandas_scored_df = df[list(set(columns_to_keep + [result_column]))]
            spark_scored_df = SQLctx.createDataFrame(pandas_scored_df)
            # spark_scored_df.write.csv(score_data_path+"/data",mode="overwrite",header=True)
            # TODO update metadata for the newly created dataframe
            self._dataframe_context.update_consider_columns(columns_to_keep)
            print(spark_scored_df.printSchema())

        df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context,
                                    self._metaParser)
        df_helper.set_params()
        df = df_helper.get_data_frame()
        # self._dataframe_context.set_dont_send_message(True)
        try:
            fs = time.time()
            descr_stats_obj = DescriptiveStatsScript(
                df,
                df_helper,
                self._dataframe_context,
                self._result_setter,
                self._spark,
                self._prediction_narrative,
                scriptWeight=self._scriptWeightDict,
                analysisName="Descriptive analysis")
            descr_stats_obj.Run()
            print("DescriptiveStats Analysis Done in ",
                  time.time() - fs, " seconds.")
        except:
            print("Frequency Analysis Failed ")

        # try:
        #     fs = time.time()
        #     df_helper.fill_na_dimension_nulls()
        #     df = df_helper.get_data_frame()
        #     dt_reg = DecisionTreeRegressionScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark,self._prediction_narrative,self._metaParser,scriptWeight=self._scriptWeightDict,analysisName="Predictive modeling")
        #     dt_reg.Run()
        #     print "DecisionTrees Analysis Done in ", time.time() - fs, " seconds."
        # except:
        #     print "DTREE FAILED"

        try:
            fs = time.time()
            two_way_obj = TwoWayAnovaScript(
                df,
                df_helper,
                self._dataframe_context,
                self._result_setter,
                self._spark,
                self._prediction_narrative,
                self._metaParser,
                scriptWeight=self._scriptWeightDict,
                analysisName="Measure vs. Dimension")
            two_way_obj.Run()
            print("OneWayAnova Analysis Done in ",
                  time.time() - fs, " seconds.")
        except:
            print("Anova Analysis Failed")
Esempio n. 15
0
    def Train(self):
        st_global = time.time()

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "initialization",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")
        appType = self._dataframe_context.get_app_type()
        algosToRun = self._dataframe_context.get_algorithms_to_run()
        algoSetting = [
            x for x in algosToRun if x.get_algorithm_slug() == self._slug
        ][0]
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()
        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})
        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns) - set(allDateCols))
        print("CATEGORICAL COLS - ", categorical_columns)
        result_column = self._dataframe_context.get_result_column()
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        numerical_columns = [
            x for x in numerical_columns if x != result_column
        ]

        model_path = self._dataframe_context.get_model_path()
        if model_path.startswith("file"):
            model_path = model_path[7:]
        validationDict = self._dataframe_context.get_validation_dict()
        print("model_path", model_path)
        pipeline_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/pipeline/"
        model_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/model"
        pmml_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/modelPmml"

        df = self._data_frame
        if self._mlEnv == "spark":
            pass
        elif self._mlEnv == "sklearn":
            model_filepath = model_path + "/" + self._slug + "/model.pkl"

            x_train, x_test, y_train, y_test = self._dataframe_helper.get_train_test_data(
            )
            x_train = MLUtils.create_dummy_columns(
                x_train,
                [x for x in categorical_columns if x != result_column])
            x_test = MLUtils.create_dummy_columns(
                x_test, [x for x in categorical_columns if x != result_column])
            x_test = MLUtils.fill_missing_columns(x_test, x_train.columns,
                                                  result_column)

            print("=" * 150)
            print("X-Train Shape - ", x_train.shape)
            print("Y-Train Shape - ", y_train.shape)
            print("X-Test Shape - ", x_test.shape)
            print("Y-Test Shape - ", y_test.shape)
            print("~" * 50)
            print("X-Train dtype - ", type(x_train))
            print("Y-Train dtype - ", type(y_train))
            print("X-Test dtype - ", type(x_test))
            print("Y-Test dtype - ", type(y_test))
            print("~" * 50)

            CommonUtils.create_update_and_save_progress_message(
                self._dataframe_context,
                self._scriptWeightDict,
                self._scriptStages,
                self._slug,
                "training",
                "info",
                display=True,
                emptyBin=False,
                customMsg=None,
                weightKey="total")

            st = time.time()

            self._result_setter.set_hyper_parameter_results(self._slug, None)
            evaluationMetricDict = algoSetting.get_evaluvation_metric(
                Type="REGRESSION")
            evaluationMetricDict = {
                "name": GLOBALSETTINGS.REGRESSION_MODEL_EVALUATION_METRIC
            }
            evaluationMetricDict[
                "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[
                    evaluationMetricDict["name"]]

            x_train_tensored, y_train_tensored, x_test_tensored, y_test_tensored = PYTORCHUTILS.get_tensored_data(
                x_train, y_train, x_test, y_test)
            trainset = torch_data_utils.TensorDataset(x_train_tensored,
                                                      y_train_tensored)
            testset = torch_data_utils.TensorDataset(x_test_tensored,
                                                     y_test_tensored)

            nnptr_params = algoSetting.get_nnptr_params_dict()[0]
            layers_for_network = PYTORCHUTILS.get_layers_for_network_module(
                nnptr_params,
                task_type="REGRESSION",
                first_layer_units=x_train.shape[1])

            # Use GPU if available
            device = torch.device(
                "cuda:0" if torch.cuda.is_available() else "cpu")
            network = PyTorchNetwork(layers_for_network).to(device)
            network.eval()

            other_params_dict = PYTORCHUTILS.get_other_pytorch_params(
                nnptr_params,
                task_type="REGRESSION",
                network_params=network.parameters())

            print("~" * 50)
            print("NNPTR-PARAMS - ", nnptr_params)
            print("~" * 50)
            print("OTHER-PARAMS-DICT - ", other_params_dict)
            print("~" * 50)
            print("NEURAL-NETWORK - ", network)
            print("~" * 50)

            criterion = other_params_dict["loss_criterion"]
            n_epochs = other_params_dict["number_of_epochs"]
            batch_size = other_params_dict["batch_size"]
            optimizer = other_params_dict["optimizer"]

            dataloader_params = {
                "batch_size": batch_size,
                "shuffle": True
                # "num_workers":
            }

            train_loader = torch_data_utils.DataLoader(trainset,
                                                       **dataloader_params)
            test_loader = torch_data_utils.DataLoader(testset,
                                                      **dataloader_params)
            '''
            Training the network;
            Batchnormalization(num_features) should be equal to units_op for that layer in training config;
            else --> RuntimeError('running_mean should contain 100 elements not 200',)
            '''

            for epoch in range(n_epochs):
                batchwise_losses = []
                average_loss = 0.0

                for i, (inputs, labels) in enumerate(train_loader):
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    # Zero the parameter gradients
                    optimizer.zero_grad()

                    # Forward + backward + optimize
                    outputs = network(inputs.float())
                    loss = criterion(outputs, labels.float())
                    loss.backward()
                    optimizer.step()

                    average_loss += loss.item()
                    batchwise_losses.append(loss.item())

                average_loss_per_epoch = old_div(average_loss, (i + 1))
                print("+" * 80)
                print("EPOCH - ", epoch)
                print("BATCHWISE_LOSSES shape - ", len(batchwise_losses))
                print("AVERAGE LOSS PER EPOCH - ", average_loss_per_epoch)
                print("+" * 80)

            trainingTime = time.time() - st
            bestEstimator = network

            outputs_x_test_tensored = network(x_test_tensored.float())
            y_score_mid = outputs_x_test_tensored.tolist()
            y_score = [x[0] for x in y_score_mid]
            print("Y-SCORE - ", y_score)
            print("Y-SCORE length - ", len(y_score))
            y_prob = None

            featureImportance = {}
            objs = {
                "trained_model": bestEstimator,
                "actual": y_test,
                "predicted": y_score,
                "probability": y_prob,
                "feature_importance": featureImportance,
                "featureList": list(x_train.columns),
                "labelMapping": {}
            }
            #featureImportance = objs["trained_model"].feature_importances_
            #featuresArray = [(col_name, featureImportance[idx]) for idx, col_name in enumerate(x_train.columns)]
            featuresArray = []
            if not algoSetting.is_hyperparameter_tuning_enabled():
                modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH -
                                         1) + "1"
                modelFilepathArr = model_filepath.split("/")[:-1]
                modelFilepathArr.append(modelName + ".pt")
                torch.save(objs["trained_model"], "/".join(modelFilepathArr))
                #joblib.dump(objs["trained_model"],"/".join(modelFilepathArr))
                runtime = round((time.time() - st), 2)
            else:
                runtime = round((time.time() - hyper_st), 2)

            try:
                modelPmmlPipeline = PMMLPipeline([("pretrained-estimator",
                                                   objs["trained_model"])])
                modelPmmlPipeline.target_field = result_column
                modelPmmlPipeline.active_fields = np.array(
                    [col for col in x_train.columns if col != result_column])
                sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True)
                pmmlfile = open(pmml_filepath, "r")
                pmmlText = pmmlfile.read()
                pmmlfile.close()
                self._result_setter.update_pmml_object({self._slug: pmmlText})
            except:
                pass

            metrics = {}
            metrics["r2"] = r2_score(y_test, y_score)
            metrics["neg_mean_squared_error"] = mean_squared_error(
                y_test, y_score)
            metrics["neg_mean_absolute_error"] = mean_absolute_error(
                y_test, y_score)
            metrics["RMSE"] = sqrt(metrics["neg_mean_squared_error"])
            metrics["explained_variance_score"] = explained_variance_score(
                y_test, y_score)
            transformed = pd.DataFrame({
                "prediction": y_score,
                result_column: y_test
            })
            print("TRANSFORMED PREDICTION TYPE - ",
                  type(transformed["prediction"]))
            print(transformed["prediction"])
            print("TRANSFORMED RESULT COL TYPE - ",
                  type(transformed[result_column]))
            print(transformed[result_column])
            transformed["difference"] = transformed[
                result_column] - transformed["prediction"]
            transformed["mape"] = old_div(
                np.abs(transformed["difference"]) * 100,
                transformed[result_column])

            sampleData = None
            nrows = transformed.shape[0]
            if nrows > 100:
                sampleData = transformed.sample(n=100, random_state=420)
            else:
                sampleData = transformed
            print(sampleData.head())
            if transformed["mape"].max() > 100:
                GLOBALSETTINGS.MAPEBINS.append(transformed["mape"].max())
                mapeCountArr = list(
                    pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS).
                    value_counts().to_dict().items())
                GLOBALSETTINGS.MAPEBINS.pop(5)
            else:
                mapeCountArr = list(
                    pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS).
                    value_counts().to_dict().items())
            mapeStatsArr = [(str(idx), dictObj) for idx, dictObj in enumerate(
                sorted([{
                    "count": x[1],
                    "splitRange": (x[0].left, x[0].right)
                } for x in mapeCountArr],
                       key=lambda x: x["splitRange"][0]))]
            print(mapeStatsArr)
            print(mapeCountArr)
            predictionColSummary = transformed["prediction"].describe(
            ).to_dict()
            quantileBins = [
                predictionColSummary["min"], predictionColSummary["25%"],
                predictionColSummary["50%"], predictionColSummary["75%"],
                predictionColSummary["max"]
            ]
            print(quantileBins)
            quantileBins = sorted(list(set(quantileBins)))
            transformed["quantileBinId"] = pd.cut(transformed["prediction"],
                                                  quantileBins)
            quantileDf = transformed.groupby("quantileBinId").agg({
                "prediction": [np.sum, np.mean, np.size]
            }).reset_index()
            quantileDf.columns = ["prediction", "sum", "mean", "count"]
            print(quantileDf)
            quantileArr = list(quantileDf.T.to_dict().items())
            quantileSummaryArr = [(obj[0], {
                "splitRange":
                (obj[1]["prediction"].left, obj[1]["prediction"].right),
                "count":
                obj[1]["count"],
                "mean":
                obj[1]["mean"],
                "sum":
                obj[1]["sum"]
            }) for obj in quantileArr]
            print(quantileSummaryArr)
            runtime = round((time.time() - st_global), 2)

            self._model_summary.set_model_type("regression")
            self._model_summary.set_algorithm_name("Neural Network (PyTorch)")
            self._model_summary.set_algorithm_display_name(
                "Neural Network (PyTorch)")
            self._model_summary.set_slug(self._slug)
            self._model_summary.set_training_time(runtime)
            self._model_summary.set_training_time(trainingTime)
            self._model_summary.set_target_variable(result_column)
            self._model_summary.set_validation_method(
                validationDict["displayName"])
            self._model_summary.set_model_evaluation_metrics(metrics)
            self._model_summary.set_model_params(nnptr_params)
            self._model_summary.set_quantile_summary(quantileSummaryArr)
            self._model_summary.set_mape_stats(mapeStatsArr)
            self._model_summary.set_sample_data(sampleData.to_dict())
            self._model_summary.set_feature_importance(featuresArray)
            self._model_summary.set_feature_list(list(x_train.columns))
            self._model_summary.set_model_mse(
                metrics["neg_mean_squared_error"])
            self._model_summary.set_model_mae(
                metrics["neg_mean_absolute_error"])
            self._model_summary.set_rmse(metrics["RMSE"])
            self._model_summary.set_model_rsquared(metrics["r2"])
            self._model_summary.set_model_exp_variance_score(
                metrics["explained_variance_score"])

            try:
                pmml_filepath = str(model_path) + "/" + str(
                    self._slug) + "/traindeModel.pmml"
                modelPmmlPipeline = PMMLPipeline([("pretrained-estimator",
                                                   objs["trained_model"])])
                modelPmmlPipeline.target_field = result_column
                modelPmmlPipeline.active_fields = np.array(
                    [col for col in x_train.columns if col != result_column])
                sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True)
                pmmlfile = open(pmml_filepath, "r")
                pmmlText = pmmlfile.read()
                pmmlfile.close()
                self._result_setter.update_pmml_object({self._slug: pmmlText})
            except:
                pass

        if algoSetting.is_hyperparameter_tuning_enabled():
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": metrics[evaluationMetricDict["name"]],
                "evaluationMetricName": evaluationMetricDict["name"],
                "slug": self._model_summary.get_slug(),
                "Model Id": modelName
            }

            modelSummaryJson = {
                "dropdown": modelDropDownObj,
                "levelcount": self._model_summary.get_level_counts(),
                "modelFeatureList": self._model_summary.get_feature_list(),
                "levelMapping": self._model_summary.get_level_map_dict(),
                "slug": self._model_summary.get_slug(),
                "name": self._model_summary.get_algorithm_name()
            }
        else:
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": metrics[evaluationMetricDict["name"]],
                "evaluationMetricName": evaluationMetricDict["name"],
                "slug": self._model_summary.get_slug(),
                "Model Id": modelName
            }
            modelSummaryJson = {
                "dropdown": modelDropDownObj,
                "levelcount": self._model_summary.get_level_counts(),
                "modelFeatureList": self._model_summary.get_feature_list(),
                "levelMapping": self._model_summary.get_level_map_dict(),
                "slug": self._model_summary.get_slug(),
                "name": self._model_summary.get_algorithm_name()
            }
        modelmanagement_ = nnptr_params

        self._model_management = MLModelSummary()
        if algoSetting.is_hyperparameter_tuning_enabled():
            pass
        else:
            self._model_management.set_layer_info(
                data=modelmanagement_['hidden_layer_info'])
            self._model_management.set_loss_function(
                data=modelmanagement_['loss'])
            self._model_management.set_optimizer(
                data=modelmanagement_['optimizer'])
            self._model_management.set_batch_size(
                data=modelmanagement_['batch_size'])
            self._model_management.set_no_epochs(
                data=modelmanagement_['number_of_epochs'])
            # self._model_management.set_model_evaluation_metrics(data=modelmanagement_['metrics'])
            self._model_management.set_job_type(
                self._dataframe_context.get_job_name())  #Project name
            self._model_management.set_training_status(
                data="completed")  # training status
            self._model_management.set_no_of_independent_variables(
                data=x_train)  #no of independent varables
            self._model_management.set_training_time(runtime)  # run time
            self._model_management.set_rmse(metrics["RMSE"])
            self._model_management.set_algorithm_name(
                "Neural Network (TensorFlow)")  #algorithm name
            self._model_management.set_validation_method(
                str(validationDict["displayName"]) + "(" +
                str(validationDict["value"]) + ")")  #validation method
            self._model_management.set_target_variable(
                result_column)  #target column name
            self._model_management.set_creation_date(data=str(
                datetime.now().strftime('%b %d ,%Y  %H:%M ')))  #creation date
            self._model_management.set_datasetName(self._datasetName)
        modelManagementSummaryJson = [
            ["Project Name",
             self._model_management.get_job_type()],
            ["Algorithm",
             self._model_management.get_algorithm_name()],
            ["Training Status",
             self._model_management.get_training_status()],
            ["RMSE", self._model_management.get_rmse()],
            ["RunTime", self._model_management.get_training_time()],
            #["Owner",None],
            ["Created On",
             self._model_management.get_creation_date()]
        ]
        if algoSetting.is_hyperparameter_tuning_enabled():
            modelManagementModelSettingsJson = []
        else:
            modelManagementModelSettingsJson = [
                ["Training Dataset",
                 self._model_management.get_datasetName()],
                [
                    "Target Column",
                    self._model_management.get_target_variable()
                ],
                [
                    "Number Of Independent Variables",
                    self._model_management.get_no_of_independent_variables()
                ], ["Algorithm",
                    self._model_management.get_algorithm_name()],
                [
                    "Model Validation",
                    self._model_management.get_validation_method()
                ],
                ["batch_size",
                 str(self._model_management.get_batch_size())],
                ["Loss", self._model_management.get_loss_function()],
                ["Optimizer",
                 self._model_management.get_optimizer()],
                ["Epochs", self._model_management.get_no_epochs()],
                [
                    "Metrics",
                    self._model_management.get_model_evaluation_metrics()
                ]
            ]
            for i in modelmanagement_["hidden_layer_info"]:
                string = ""
                key = str(modelmanagement_["hidden_layer_info"][i]
                          ["layer"]) + " " + str(i) + ":"
                for j in modelmanagement_["hidden_layer_info"][i]:
                    string = string + str(j) + ":" + str(
                        modelmanagement_["hidden_layer_info"][i][j]) + ",   "
                modelManagementModelSettingsJson.append([key, string])
        print(modelManagementModelSettingsJson)

        nnptrCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for
            cardObj in MLUtils.create_model_summary_cards(self._model_summary)
        ]
        nnptrPerformanceCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_cards_regression(
                self._model_summary)
        ]
        nnptrOverviewCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_card_overview(
                self._model_management, modelManagementSummaryJson,
                modelManagementModelSettingsJson)
        ]
        nnptrDeploymentCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_deploy_empty_card()
        ]
        nnptr_Overview_Node = NarrativesTree()
        nnptr_Overview_Node.set_name("Overview")
        nnptr_Performance_Node = NarrativesTree()
        nnptr_Performance_Node.set_name("Performance")
        nnptr_Deployment_Node = NarrativesTree()
        nnptr_Deployment_Node.set_name("Deployment")
        for card in nnptrOverviewCards:
            nnptr_Overview_Node.add_a_card(card)
        for card in nnptrPerformanceCards:
            nnptr_Performance_Node.add_a_card(card)
        for card in nnptrDeploymentCards:
            nnptr_Deployment_Node.add_a_card(card)
        for card in nnptrCards:
            self._prediction_narrative.add_a_card(card)
        self._result_setter.set_model_summary({
            "Neural Network (PyTorch)":
            json.loads(
                CommonUtils.convert_python_object_to_json(self._model_summary))
        })
        self._result_setter.set_nnptr_regression_model_summary(
            modelSummaryJson)
        self._result_setter.set_nnptr_cards(nnptrCards)
        self._result_setter.set_nnptr_nodes([
            nnptr_Overview_Node, nnptr_Performance_Node, nnptr_Deployment_Node
        ])
        self._result_setter.set_nnptr_fail_card({
            "Algorithm_Name": "Neural Network (PyTorch)",
            "Success": "True"
        })
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "completion",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")
Esempio n. 16
0
    def Train(self):
        st_global = time.time()

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "initialization",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")

        appType = self._dataframe_context.get_app_type()
        algosToRun = self._dataframe_context.get_algorithms_to_run()
        algoSetting = [
            x for x in algosToRun if x.get_algorithm_slug() == self._slug
        ][0]
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()
        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})
        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns) - set(allDateCols))
        print(categorical_columns)
        result_column = self._dataframe_context.get_result_column()
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        numerical_columns = [
            x for x in numerical_columns if x != result_column
        ]

        model_path = self._dataframe_context.get_model_path()
        if model_path.startswith("file"):
            model_path = model_path[7:]
        validationDict = self._dataframe_context.get_validation_dict()
        print("model_path", model_path)
        pipeline_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/pipeline/"
        model_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/model"
        pmml_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/modelPmml"

        df = self._data_frame
        if self._mlEnv == "spark":
            pass
        elif self._mlEnv == "sklearn":
            model_filepath = model_path + "/" + self._slug + "/model.pkl"
            x_train, x_test, y_train, y_test = self._dataframe_helper.get_train_test_data(
            )
            x_train = MLUtils.create_dummy_columns(
                x_train,
                [x for x in categorical_columns if x != result_column])
            x_test = MLUtils.create_dummy_columns(
                x_test, [x for x in categorical_columns if x != result_column])
            x_test = MLUtils.fill_missing_columns(x_test, x_train.columns,
                                                  result_column)

            st = time.time()

            CommonUtils.create_update_and_save_progress_message(
                self._dataframe_context,
                self._scriptWeightDict,
                self._scriptStages,
                self._slug,
                "training",
                "info",
                display=True,
                emptyBin=False,
                customMsg=None,
                weightKey="total")

            if algoSetting.is_hyperparameter_tuning_enabled():
                pass
            else:
                self._result_setter.set_hyper_parameter_results(
                    self._slug, None)
                evaluationMetricDict = algoSetting.get_evaluvation_metric(
                    Type="Regression")
                evaluationMetricDict[
                    "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[
                        evaluationMetricDict["name"]]
                params_tf = algoSetting.get_tf_params_dict()
                algoParams = algoSetting.get_params_dict()
                algoParams = {k: v for k, v in list(algoParams.items())}

                model = tf.keras.models.Sequential()
                first_layer_flag = True

                for i in range(len(list(
                        params_tf['hidden_layer_info'].keys()))):
                    if params_tf['hidden_layer_info'][str(
                            i)]["layer"] == "Dense":

                        if first_layer_flag:
                            model.add(
                                tf.keras.layers.Dense(
                                    params_tf['hidden_layer_info'][str(
                                        i)]["units"],
                                    activation=params_tf['hidden_layer_info'][
                                        str(i)]["activation"],
                                    input_shape=(len(x_train.columns), ),
                                    use_bias=params_tf['hidden_layer_info'][
                                        str(i)]["use_bias"],
                                    kernel_initializer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["kernel_initializer"],
                                    bias_initializer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["bias_initializer"],
                                    kernel_regularizer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["kernel_regularizer"],
                                    bias_regularizer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["bias_regularizer"],
                                    activity_regularizer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["activity_regularizer"],
                                    kernel_constraint=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["kernel_constraint"],
                                    bias_constraint=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["bias_constraint"]))
                            try:
                                if params_tf['hidden_layer_info'][str(
                                        i)]["batch_normalization"] == "True":
                                    model.add(
                                        tf.keras.layers.BatchNormalization())
                            except:
                                print(
                                    "BATCH_NORM_FAILED ##########################"
                                )
                                pass
                            first_layer_flag = False
                        else:
                            model.add(
                                tf.keras.layers.Dense(
                                    params_tf['hidden_layer_info'][str(
                                        i)]["units"],
                                    activation=params_tf['hidden_layer_info'][
                                        str(i)]["activation"],
                                    use_bias=params_tf['hidden_layer_info'][
                                        str(i)]["use_bias"],
                                    kernel_initializer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["kernel_initializer"],
                                    bias_initializer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["bias_initializer"],
                                    kernel_regularizer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["kernel_regularizer"],
                                    bias_regularizer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["bias_regularizer"],
                                    activity_regularizer=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["activity_regularizer"],
                                    kernel_constraint=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["kernel_constraint"],
                                    bias_constraint=params_tf[
                                        'hidden_layer_info'][str(
                                            i)]["bias_constraint"]))
                            try:
                                if params_tf['hidden_layer_info'][str(
                                        i)]["batch_normalization"] == "True":
                                    model.add(
                                        tf.keras.layers.BatchNormalization())
                            except:
                                print(
                                    "BATCH_NORM_FAILED ##########################"
                                )
                                pass

                    elif params_tf['hidden_layer_info'][str(
                            i)]["layer"] == "Dropout":
                        model.add(
                            tf.keras.layers.Dropout(
                                float(params_tf['hidden_layer_info'][str(i)]
                                      ["rate"])))

                    elif params_tf['hidden_layer_info'][str(
                            i)]["layer"] == "Lambda":
                        if params_tf['hidden_layer_info'][str(
                                i)]["lambda"] == "Addition":
                            model.add(
                                tf.keras.layers.Lambda(lambda x: x + int(
                                    params_tf['hidden_layer_info'][str(i)][
                                        "units"])))
                        if params_tf['hidden_layer_info'][str(
                                i)]["lambda"] == "Multiplication":
                            model.add(
                                tf.keras.layers.Lambda(lambda x: x * int(
                                    params_tf['hidden_layer_info'][str(i)][
                                        "units"])))
                        if params_tf['hidden_layer_info'][str(
                                i)]["lambda"] == "Subtraction":
                            model.add(
                                tf.keras.layers.Lambda(lambda x: x - int(
                                    params_tf['hidden_layer_info'][str(i)][
                                        "units"])))
                        if params_tf['hidden_layer_info'][str(
                                i)]["lambda"] == "Division":
                            model.add(
                                tf.keras.layers.Lambda(lambda x: old_div(
                                    x,
                                    int(params_tf['hidden_layer_info'][str(i)][
                                        "units"]))))

                model.compile(optimizer=algoParams["optimizer"],
                              loss=algoParams["loss"],
                              metrics=[algoParams['metrics']])

                model.fit(x_train,
                          y_train,
                          epochs=algoParams["number_of_epochs"],
                          verbose=1,
                          batch_size=algoParams["batch_size"])

                bestEstimator = model
            print(model.summary())
            trainingTime = time.time() - st
            y_score = bestEstimator.predict(x_test)
            y_score = list(y_score.flatten())
            try:
                y_prob = bestEstimator.predict_proba(x_test)
            except:
                y_prob = [0] * len(y_score)
            featureImportance = {}

            objs = {
                "trained_model": bestEstimator,
                "actual": y_test,
                "predicted": y_score,
                "probability": y_prob,
                "feature_importance": featureImportance,
                "featureList": list(x_train.columns),
                "labelMapping": {}
            }
            #featureImportance = objs["trained_model"].feature_importances_
            #featuresArray = [(col_name, featureImportance[idx]) for idx, col_name in enumerate(x_train.columns)]
            featuresArray = []
            if not algoSetting.is_hyperparameter_tuning_enabled():
                modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH -
                                         1) + "1"
                modelFilepathArr = model_filepath.split("/")[:-1]
                modelFilepathArr.append(modelName + ".h5")
                objs["trained_model"].save("/".join(modelFilepathArr))
                #joblib.dump(objs["trained_model"],"/".join(modelFilepathArr))
            metrics = {}
            metrics["r2"] = r2_score(y_test, y_score)
            metrics["neg_mean_squared_error"] = mean_squared_error(
                y_test, y_score)
            metrics["neg_mean_absolute_error"] = mean_absolute_error(
                y_test, y_score)
            metrics["RMSE"] = sqrt(metrics["neg_mean_squared_error"])
            metrics["explained_variance_score"] = explained_variance_score(
                y_test, y_score)
            transformed = pd.DataFrame({
                "prediction": y_score,
                result_column: y_test
            })
            transformed["difference"] = transformed[
                result_column] - transformed["prediction"]
            transformed["mape"] = old_div(
                np.abs(transformed["difference"]) * 100,
                transformed[result_column])

            sampleData = None
            nrows = transformed.shape[0]
            if nrows > 100:
                sampleData = transformed.sample(n=100, random_state=420)
            else:
                sampleData = transformed
            print(sampleData.head())
            if transformed["mape"].max() > 100:
                GLOBALSETTINGS.MAPEBINS.append(transformed["mape"].max())
                mapeCountArr = list(
                    pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS).
                    value_counts().to_dict().items())
                GLOBALSETTINGS.MAPEBINS.pop(5)
            else:
                mapeCountArr = list(
                    pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS).
                    value_counts().to_dict().items())
            mapeStatsArr = [(str(idx), dictObj) for idx, dictObj in enumerate(
                sorted([{
                    "count": x[1],
                    "splitRange": (x[0].left, x[0].right)
                } for x in mapeCountArr],
                       key=lambda x: x["splitRange"][0]))]
            print(mapeStatsArr)
            print(mapeCountArr)
            predictionColSummary = transformed["prediction"].describe(
            ).to_dict()
            quantileBins = [
                predictionColSummary["min"], predictionColSummary["25%"],
                predictionColSummary["50%"], predictionColSummary["75%"],
                predictionColSummary["max"]
            ]
            print(quantileBins)
            quantileBins = sorted(list(set(quantileBins)))
            transformed["quantileBinId"] = pd.cut(transformed["prediction"],
                                                  quantileBins)
            quantileDf = transformed.groupby("quantileBinId").agg({
                "prediction": [np.sum, np.mean, np.size]
            }).reset_index()
            quantileDf.columns = ["prediction", "sum", "mean", "count"]
            print(quantileDf)
            quantileArr = list(quantileDf.T.to_dict().items())
            quantileSummaryArr = [(obj[0], {
                "splitRange":
                (obj[1]["prediction"].left, obj[1]["prediction"].right),
                "count":
                obj[1]["count"],
                "mean":
                obj[1]["mean"],
                "sum":
                obj[1]["sum"]
            }) for obj in quantileArr]
            print(quantileSummaryArr)
            runtime = round((time.time() - st_global), 2)

            self._model_summary.set_model_type("regression")
            self._model_summary.set_algorithm_name(
                "Neural Network (TensorFlow)")
            self._model_summary.set_algorithm_display_name(
                "Neural Network (TensorFlow)")
            self._model_summary.set_slug(self._slug)
            self._model_summary.set_training_time(runtime)
            self._model_summary.set_training_time(trainingTime)
            self._model_summary.set_target_variable(result_column)
            self._model_summary.set_validation_method(
                validationDict["displayName"])
            self._model_summary.set_model_evaluation_metrics(metrics)
            self._model_summary.set_model_params(params_tf)
            self._model_summary.set_quantile_summary(quantileSummaryArr)
            self._model_summary.set_mape_stats(mapeStatsArr)
            self._model_summary.set_sample_data(sampleData.to_dict())
            self._model_summary.set_feature_importance(featuresArray)
            self._model_summary.set_feature_list(list(x_train.columns))
            self._model_summary.set_model_mse(
                metrics["neg_mean_squared_error"])
            self._model_summary.set_model_mae(
                metrics["neg_mean_absolute_error"])
            self._model_summary.set_rmse(metrics["RMSE"])
            self._model_summary.set_model_rsquared(metrics["r2"])
            self._model_summary.set_model_exp_variance_score(
                metrics["explained_variance_score"])

            try:
                pmml_filepath = str(model_path) + "/" + str(
                    self._slug) + "/traindeModel.pmml"
                modelPmmlPipeline = PMMLPipeline([("pretrained-estimator",
                                                   objs["trained_model"])])
                modelPmmlPipeline.target_field = result_column
                modelPmmlPipeline.active_fields = np.array(
                    [col for col in x_train.columns if col != result_column])
                sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True)
                pmmlfile = open(pmml_filepath, "r")
                pmmlText = pmmlfile.read()
                pmmlfile.close()
                self._result_setter.update_pmml_object({self._slug: pmmlText})
            except:
                pass

        if algoSetting.is_hyperparameter_tuning_enabled():
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": metrics[evaluationMetricDict["name"]],
                "evaluationMetricName": evaluationMetricDict["name"],
                "slug": self._model_summary.get_slug(),
                "Model Id": modelName
            }

            modelSummaryJson = {
                "dropdown": modelDropDownObj,
                "levelcount": self._model_summary.get_level_counts(),
                "modelFeatureList": self._model_summary.get_feature_list(),
                "levelMapping": self._model_summary.get_level_map_dict(),
                "slug": self._model_summary.get_slug(),
                "name": self._model_summary.get_algorithm_name()
            }
        else:
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": metrics[evaluationMetricDict["name"]],
                "evaluationMetricName": evaluationMetricDict["name"],
                "slug": self._model_summary.get_slug(),
                "Model Id": modelName
            }
            modelSummaryJson = {
                "dropdown": modelDropDownObj,
                "levelcount": self._model_summary.get_level_counts(),
                "modelFeatureList": self._model_summary.get_feature_list(),
                "levelMapping": self._model_summary.get_level_map_dict(),
                "slug": self._model_summary.get_slug(),
                "name": self._model_summary.get_algorithm_name()
            }
        modelmanagement_ = params_tf
        modelmanagement_.update(algoParams)

        self._model_management = MLModelSummary()
        if algoSetting.is_hyperparameter_tuning_enabled():
            pass
        else:
            self._model_management.set_layer_info(
                data=modelmanagement_['hidden_layer_info'])
            self._model_management.set_loss_function(
                data=modelmanagement_['loss'])
            self._model_management.set_optimizer(
                data=modelmanagement_['optimizer'])
            self._model_management.set_batch_size(
                data=modelmanagement_['batch_size'])
            self._model_management.set_no_epochs(
                data=modelmanagement_['number_of_epochs'])
            self._model_management.set_model_evaluation_metrics(
                data=modelmanagement_['metrics'])
            self._model_management.set_job_type(
                self._dataframe_context.get_job_name())  #Project name
            self._model_management.set_training_status(
                data="completed")  # training status
            self._model_management.set_no_of_independent_variables(
                data=x_train)  #no of independent varables
            self._model_management.set_training_time(runtime)  # run time
            self._model_management.set_rmse(metrics["RMSE"])
            self._model_management.set_algorithm_name(
                "Neural Network (TensorFlow)")  #algorithm name
            self._model_management.set_validation_method(
                str(validationDict["displayName"]) + "(" +
                str(validationDict["value"]) + ")")  #validation method
            self._model_management.set_target_variable(
                result_column)  #target column name
            self._model_management.set_creation_date(data=str(
                datetime.now().strftime('%b %d ,%Y  %H:%M ')))  #creation date
            self._model_management.set_datasetName(self._datasetName)
        modelManagementSummaryJson = [
            ["Project Name",
             self._model_management.get_job_type()],
            ["Algorithm",
             self._model_management.get_algorithm_name()],
            ["Training Status",
             self._model_management.get_training_status()],
            ["RMSE", self._model_management.get_rmse()],
            ["RunTime", self._model_management.get_training_time()],
            #["Owner",None],
            ["Created On",
             self._model_management.get_creation_date()]
        ]
        if algoSetting.is_hyperparameter_tuning_enabled():
            modelManagementModelSettingsJson = []
        else:
            modelManagementModelSettingsJson = [
                ["Training Dataset",
                 self._model_management.get_datasetName()],
                [
                    "Target Column",
                    self._model_management.get_target_variable()
                ],
                [
                    "Number Of Independent Variables",
                    self._model_management.get_no_of_independent_variables()
                ], ["Algorithm",
                    self._model_management.get_algorithm_name()],
                [
                    "Model Validation",
                    self._model_management.get_validation_method()
                ],
                ["batch_size",
                 str(self._model_management.get_batch_size())],
                ["Loss", self._model_management.get_loss_function()],
                ["Optimizer",
                 self._model_management.get_optimizer()],
                ["Epochs", self._model_management.get_no_epochs()],
                [
                    "Metrics",
                    self._model_management.get_model_evaluation_metrics()
                ]
            ]
            for i in range(
                    len(list(modelmanagement_['hidden_layer_info'].keys()))):
                string = ""
                key = "layer No-" + str(i) + "-" + str(
                    modelmanagement_["hidden_layer_info"][str(i)]["layer"] +
                    "-")
                for j in modelmanagement_["hidden_layer_info"][str(i)]:
                    modelManagementModelSettingsJson.append([
                        key + j + ":",
                        modelmanagement_["hidden_layer_info"][str(i)][j]
                    ])
        print(modelManagementModelSettingsJson)

        tfregCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for
            cardObj in MLUtils.create_model_summary_cards(self._model_summary)
        ]

        tfregPerformanceCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_cards_regression(
                self._model_summary)
        ]
        tfregOverviewCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_card_overview(
                self._model_management, modelManagementSummaryJson,
                modelManagementModelSettingsJson)
        ]
        tfregDeploymentCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_deploy_empty_card()
        ]
        TFReg_Overview_Node = NarrativesTree()
        TFReg_Overview_Node.set_name("Overview")
        TFReg_Performance_Node = NarrativesTree()
        TFReg_Performance_Node.set_name("Performance")
        TFReg_Deployment_Node = NarrativesTree()
        TFReg_Deployment_Node.set_name("Deployment")
        for card in tfregOverviewCards:
            TFReg_Overview_Node.add_a_card(card)
        for card in tfregPerformanceCards:
            TFReg_Performance_Node.add_a_card(card)
        for card in tfregDeploymentCards:
            TFReg_Deployment_Node.add_a_card(card)
        for card in tfregCards:
            self._prediction_narrative.add_a_card(card)
        self._result_setter.set_model_summary({
            "Neural Network (TensorFlow)":
            json.loads(
                CommonUtils.convert_python_object_to_json(self._model_summary))
        })
        self._result_setter.set_tfreg_regression_model_summart(
            modelSummaryJson)
        self._result_setter.set_tfreg_cards(tfregCards)
        self._result_setter.set_tfreg_nodes([
            TFReg_Overview_Node, TFReg_Performance_Node, TFReg_Deployment_Node
        ])
        self._result_setter.set_tfreg_fail_card({
            "Algorithm_Name": "Neural Network (TensorFlow)",
            "Success": "True"
        })
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "completion",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")
Esempio n. 17
0
    def __init__(self,
                 df_anova_result,
                 df_helper,
                 df_context,
                 result_setter,
                 story_narrative,
                 scriptWeight=None,
                 analysisName=None):
        self._story_narrative = story_narrative
        self._result_setter = result_setter
        self._dataframe_context = df_context
        self._df_anova_result = df_anova_result
        self._df_helper = df_helper
        self.narratives = {}
        self.narratives['variables'] = ''
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        self._base_dir = "/anova/"

        self._analysisName = self._dataframe_context.get_analysis_name()
        self._analysisDict = self._dataframe_context.get_analysis_dict()

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        self._messageURL = self._dataframe_context.get_message_url()
        if analysisName == None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName
        if scriptWeight == None:
            self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight(
            )
        else:
            self._scriptWeightDict = scriptWeight
        self._scriptStages = {
            "anovaNarrativeStart": {
                "summary": "Started The Anova Narratives",
                "weight": 0
            },
            "anovaNarrativeEnd": {
                "summary": "Narratives For Anova Finished",
                "weight": 10
            },
        }
        # self._completionStatus += self._scriptWeightDict[self._analysisName]["narratives"]*self._scriptStages["anovaNarrativeStart"]["weight"]/10
        # progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
        #                             "anovaNarrativeStart",\
        #                             "info",\
        #                             self._scriptStages["anovaNarrativeStart"]["summary"],\
        #                             self._completionStatus,\
        #                             self._completionStatus)
        # CommonUtils.save_progress_message(self._messageURL,progressMessage)
        # self._dataframe_context.update_completion_status(self._completionStatus)
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "anovaNarrativeStart",
            "info",
            display=False,
            emptyBin=False,
            customMsg=None,
            weightKey="narratives")

        self._generate_narratives()

        # self._completionStatus += self._scriptWeightDict[self._analysisName]["narratives"]*self._scriptStages["anovaNarrativeEnd"]["weight"]/10
        # progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
        #                             "anovaNarrativeEnd",\
        #                             "info",\
        #                             self._scriptStages["anovaNarrativeEnd"]["summary"],\
        #                             self._completionStatus,\
        #                             self._completionStatus)
        # CommonUtils.save_progress_message(self._messageURL,progressMessage)
        # self._dataframe_context.update_completion_status(self._completionStatus)
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "anovaNarrativeEnd",
            "info",
            display=False,
            emptyBin=False,
            customMsg=None,
            weightKey="narratives")

        if self._anovaNodes.get_card_count() > 0:
            self._story_narrative.add_a_node(self._anovaNodes)
            #self._generate_take_away()
            self._result_setter.set_anova_node(self._anovaNodes)
Esempio n. 18
0
    def __init__(self,
                 column_name,
                 df_helper,
                 df_context,
                 freq_dimension_stats,
                 result_setter,
                 story_narrative,
                 scriptWeight=None,
                 analysisName=None):
        self._story_narrative = story_narrative
        self._result_setter = result_setter
        self._column_name = column_name.lower()
        self._colname = column_name
        self._capitalized_column_name = "%s%s" % (column_name[0].upper(),
                                                  column_name[1:])
        self._dimension_col_freq_dict = freq_dimension_stats.get_frequency_dict(
        )
        self.header = None
        self.subheader = None
        self.count = {}
        self.summary = []
        self.analysis = []
        self.frequency_dict = json.loads(self._dimension_col_freq_dict)
        self.appid = df_context.get_app_id()
        self._base_dir = "/dimensions/"
        if self.appid != None:
            if self.appid == "1":
                self._base_dir += "appid1/"
            elif self.appid == "2":
                self._base_dir += "appid2/"
        self._dataframe_context = df_context
        self._dataframe_helper = df_helper
        self._storyOnScoredData = self._dataframe_context.get_story_on_scored_data(
        )
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        self._dimensionSummaryNode = NarrativesTree()
        self._dimensionSummaryNode.set_name("Overview")
        self._headNode = NarrativesTree()
        self._headNode.set_name("Overview")

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        if analysisName == None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName
        self._messageURL = self._dataframe_context.get_message_url()
        if scriptWeight == None:
            self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight(
            )
        else:
            self._scriptWeightDict = scriptWeight
        self._scriptStages = {
            "initialization": {
                "summary": "Initialized the Frequency Narratives",
                "weight": 2
            },
            "summarygeneration": {
                "summary": "summary generation finished",
                "weight": 8
            },
            "completion": {
                "summary": "Frequency Stats Narratives done",
                "weight": 0
            },
        }

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "initialization",
            "info",
            weightKey="narratives")
        self._generate_narratives()
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "summarygeneration",
            "info",
            weightKey="narratives")

        self._story_narrative.add_a_node(self._dimensionSummaryNode)

        self._result_setter.set_head_node(self._headNode)
        self._result_setter.set_distribution_node(self._dimensionSummaryNode)
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "summarygeneration",
            "info",
            weightKey="narratives")
Esempio n. 19
0
                print repr(e), d
                continue
        for m in all_measures:
            try:
                chisquare_result = self.test_measures(targetDimension, m)
                df_chisquare_result.add_chisquare_result(
                    targetDimension, m, chisquare_result)
            except Exception, e:
                print str(e), m
                continue

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "completion",
            "info",
            display=False,
            weightKey="script")
        return df_chisquare_result

    @accepts(object, basestring, basestring)
    def test_dimension(self, targetDimension, testDimension):
        if not targetDimension in self._dataframe_helper.get_string_columns():
            raise BIException.non_string_column(testDimension)
        chisquare_result = ChiSquareResult()
        pivot_table = self._data_frame.stat.crosstab(
            "{}".format(targetDimension), testDimension)
        # rdd = pivot_table.rdd.flatMap(lambda x: x).filter(lambda x: str(x).isdigit()).collect()
        rdd = list(
Esempio n. 20
0
    def test_all(self, measure_columns=None, dimension_columns=None):
        measures = measure_columns
        if measure_columns is None:
            measures = self._measure_columns
        dimensions = dimension_columns
        print("===================dimensions================")
        if dimension_columns is None:
            dimensions = self._dimension_columns
        try:
            nColsToUse = self._analysisDict[
                self._analysisName]["noOfColumnsToUse"]
        except:
            nColsToUse = None
        # if nColsToUse != None:
        #     dimensions = dimensions[:nColsToUse]
        sqrt_nrows = round(self._dataframe_helper.get_num_rows()**0.5)
        acceptable_level_count = GLOBALSETTINGS.ANOVAMAXLEVEL
        print({
            "acceptable_level_count": acceptable_level_count,
            "sqrt_nrows": sqrt_nrows
        })
        max_levels = builtins.min([acceptable_level_count, int(sqrt_nrows)])
        df_anova_result = DFTwoWayAnovaResult()
        dimensions_to_test = [
            dim for dim in dimensions
            if self._dataframe_helper.get_num_unique_values(dim) <= max_levels
        ]
        print(
            "======================= dimensions_to_test ==============================="
        )
        print(dimensions_to_test)
        self._dimensions_to_test = [
            x for x in dimensions_to_test if x in self._data_frame.columns
        ]
        print("dimensions to test ", self._dimensions_to_test)
        for measure in measures:
            if self._pandas_flag:
                measureColStat = [[
                    self._data_frame[measure].sum().item(),
                    self._data_frame[measure].mean(),
                    self._data_frame[measure].count().item()
                ]]
            else:
                measureColStat = self._data_frame.select([
                    sum(measure).alias("total"),
                    mean(measure).alias("average"),
                    count(measure).alias("count")
                ]).collect()
            measureColMean = measureColStat[0][1]
            measureColCount = measureColStat[0][2]
            if self._pandas_flag:
                measureColSst = ((self._data_frame[measure] -
                                  measureColMean)**2).sum()
            else:
                measureColSst = self._data_frame.select(
                    sum(pow(col(measure) - measureColMean, 2))).collect()[0][0]
            self._anova_result = MeasureAnovaResult(
                measureColMean=measureColMean,
                measureColCount=measureColCount,
                measureColSst=measureColSst)
            print(self._dataRangeStats)
            if self._dateFormatDetected:
                grouped_data = NarrativesUtils.get_grouped_data_for_trend(
                    self._data_frame, self._dataRangeStats["dataLevel"],
                    measure, "measure", self._pandas_flag)
                trendData = TrendData()
                trendData.set_params(grouped_data,None,\
                                    self._dataRangeStats["lastDate"],\
                                    self._dataRangeStats["firstDate"],\
                                    self._dataRangeStats["duration"],\
                                    self._dataRangeStats["durationString"],\
                                    self._dataRangeStats["dataLevel"]
                                    )
                self._anova_result.set_trend_data(trendData)

            for dimension in self._dimensions_to_test:
                print("dimension--", dimension)
                anovaResult = self.one_way_anova_test(
                    self._data_frame,
                    measure,
                    dimension,
                    measureColMean=measureColMean,
                    measureColCount=measureColCount,
                    measureColSst=measureColSst)
                dimensionAnovaResult = OneWayAnovaResult()
                dimensionAnovaResult.set_params(
                    df_within=anovaResult["df_within"],
                    df_between=anovaResult["df_between"],
                    sum_of_squares_between=anovaResult["ss_between"],
                    sum_of_squares_within=anovaResult["ss_within"],
                    mean_sum_of_squares_between=anovaResult["ms_between"],
                    mean_sum_of_squares_within=anovaResult["ms_within"],
                    f_value=anovaResult["f_stat"],
                    p_value=anovaResult["p_value"],
                    eta_squared=anovaResult["eta_squared"],
                    f_critical=anovaResult["f_critical"],
                    total_number_of_records=anovaResult["n_total"],
                    n_groups=anovaResult["n_groups"],
                    levelDf=anovaResult["levelDf"])
                self._anova_result.set_oneWayAnovaResultDict(
                    dimension, dimensionAnovaResult)
                # for top level
                if anovaResult["p_value"] < 0.05:
                    effect_size = anovaResult["eta_squared"]
                    self._dataframe_helper.add_significant_dimension(
                        dimension, effect_size)
                    topLevelAnova = TopLevelDfAnovaStats()
                    levelDf = anovaResult["levelDf"]
                    toplevelStats = levelDf.loc[levelDf["total"].argmax()]
                    print("toplevelStats", toplevelStats)
                    topLevelAnova.set_top_level_stat(toplevelStats)
                    if self._pandas_flag:
                        topLevelDf = self._data_frame[
                            self._data_frame[dimension].isin(
                                [toplevelStats.levels])]
                    else:
                        topLevelDf = self._data_frame.where(
                            col(dimension).isin([toplevelStats.levels]))
                    if self._dateFormatDetected:
                        levelPivot = NarrativesUtils.get_level_pivot(
                            self._data_frame,
                            'day',
                            measure,
                            dimension,
                            index_col=None,
                            pandas_flag=self._pandas_flag)
                        topLevelGroupedData = NarrativesUtils.get_grouped_data_for_trend(
                            topLevelDf, self._dataRangeStats["dataLevel"],
                            measure, "measure", self._pandas_flag)
                        trendData = TrendData()
                        trendData.set_grouped_data(topLevelGroupedData)
                        trendData.set_level_pivot(levelPivot)
                        topLevelAnova.set_trend_data(trendData)

                    if self._pandas_flag:
                        topLevelDfMeasureColStat = [[
                            topLevelDf[measure].sum().item(),
                            topLevelDf[measure].mean(),
                            topLevelDf[measure].count().item()
                        ]]
                    else:
                        topLevelDfMeasureColStat = topLevelDf.select([
                            sum(measure).alias("total"),
                            mean(measure).alias("average"),
                            count(measure).alias("count")
                        ]).collect()
                    topLevelDfMeasureColMean = topLevelDfMeasureColStat[0][1]
                    topLevelDfMeasureColCount = topLevelDfMeasureColStat[0][2]
                    if self._pandas_flag:
                        topLevelDfMeasureColSst = (
                            (topLevelDf[measure] -
                             topLevelDfMeasureColMean)**2).sum()
                    else:
                        topLevelDfMeasureColSst = topLevelDf.select(
                            sum(pow(
                                col(measure) - topLevelDfMeasureColMean,
                                2))).collect()[0][0]

                    dimensions_to_test_for_top_level = list(
                        set(self._dimensions_to_test) - {dimension})
                    topLevelAnovaDimensions = {}
                    for dimensionlTopLevel in dimensions_to_test_for_top_level:
                        print("top level dimensions", dimensionlTopLevel)
                        topLevelDfAnovaResult = self.one_way_anova_test(
                            topLevelDf,
                            measure,
                            dimensionlTopLevel,
                            measureColMean=topLevelDfMeasureColMean,
                            measureColCount=topLevelDfMeasureColCount,
                            measureColSst=topLevelDfMeasureColSst)
                        dimensiontopLevelAnovaResult = OneWayAnovaResult()
                        dimensiontopLevelAnovaResult.set_params(
                            df_within=topLevelDfAnovaResult["df_within"],
                            df_between=topLevelDfAnovaResult["df_between"],
                            sum_of_squares_between=topLevelDfAnovaResult[
                                "ss_between"],
                            sum_of_squares_within=topLevelDfAnovaResult[
                                "ss_within"],
                            mean_sum_of_squares_between=topLevelDfAnovaResult[
                                "ms_between"],
                            mean_sum_of_squares_within=topLevelDfAnovaResult[
                                "ms_within"],
                            f_value=topLevelDfAnovaResult["f_stat"],
                            p_value=topLevelDfAnovaResult["p_value"],
                            eta_squared=topLevelDfAnovaResult["eta_squared"],
                            f_critical=topLevelDfAnovaResult["f_critical"],
                            total_number_of_records=topLevelDfAnovaResult[
                                "n_total"],
                            n_groups=topLevelDfAnovaResult["n_groups"],
                            levelDf=topLevelDfAnovaResult["levelDf"])
                        topLevelAnova.set_top_level_anova(
                            dimensionlTopLevel, dimensiontopLevelAnovaResult)
                        # contributionDict = self.compute_contributions(topLevelDfAnovaResult["levelDf"])
                        # print contributionDict
                        # topLevelAnova.set_dimension_contributions(dimension,contributionDict)
                    self._anova_result.set_topLevelDfAnovaResult(
                        dimension, topLevelAnova)

            df_anova_result.add_measure_result(measure, self._anova_result)
            print(self._anova_result.get_dimensions_analyzed())
            print(
                "checking effect size access",
                self._anova_result.get_OneWayAnovaEffectSize(
                    self._dimensions_to_test[0]))

        # self._completionStatus += self._scriptWeightDict[self._analysisName]["script"]
        # progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
        #                             "anovaEnd",\
        #                             "info",\
        #                             self._scriptStages["anovaEnd"]["summary"],\
        #                             self._completionStatus,\
        #                             self._completionStatus)
        # CommonUtils.save_progress_message(self._messageURL,progressMessage)
        # self._dataframe_context.update_completion_status(self._completionStatus)
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "anovaEnd",
            "info",
            display=False,
            emptyBin=False,
            customMsg=None,
            weightKey="script")
        return df_anova_result
Esempio n. 21
0
    def __init__(self,
                 data_frame,
                 df_helper,
                 df_context,
                 meta_parser,
                 scriptWeight=None,
                 analysisName=None):
        self._data_frame = data_frame
        self._dataframe_helper = df_helper
        self._dataframe_context = df_context
        self._pandas_flag = df_context._pandas_flag
        self._metaParser = meta_parser
        self._measure_columns = self._dataframe_helper.get_numeric_columns()
        self._dimension_columns = self._dataframe_helper.get_string_columns()
        self._timestamp_columns = self._dataframe_helper.get_timestamp_columns(
        )
        if analysisName == None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName
        if scriptWeight == None:
            self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight(
            )
        else:
            self._scriptWeightDict = scriptWeight

        print("=================dimension columns======================")
        print(self._dimension_columns)
        print("=================dimension columns======================")

        print("==================measure_columns ========================")
        print(self._measure_columns)
        print("==================measure_columns ========================")

        self._storyOnScoredData = self._dataframe_context.get_story_on_scored_data(
        )
        self._date_columns = self._dataframe_context.get_date_columns()
        self._uid_col = self._dataframe_context.get_uid_column()
        if self._metaParser.check_column_isin_ignored_suggestion(
                self._uid_col):
            self._dimension_columns = list(
                set(self._dimension_columns) - {self._uid_col})
        if len(self._date_columns) > 0:
            self._dimension_columns = list(
                set(self._dimension_columns) - set(self._date_columns))
        self.top_dimension_result = {}
        # if selected date col empty then on td_node
        self._dataRangeStats = None
        self._dateFormatDetected = False
        self._trend_on_td_column = False
        self._existingDateFormat = None
        self._dateFormatConversionDict = NarrativesUtils.date_formats_mapping_dict(
        )
        self._dateColumnFormatDict = df_context.get_date_format_dict()
        if self._dataframe_context.get_requested_date_format() != None:
            self._requestedDateFormat = df_context.get_requested_date_format()
        else:
            self._requestedDateFormat = None
        dateColCheck = None
        scriptsToRun = self._dataframe_context.get_analysis_name_list()

        print(self._dateColumnFormatDict)
        self._selected_date_columns = self._dataframe_context.get_selected_date_columns(
        )
        if self._selected_date_columns != None:
            dateColCheck = NarrativesUtils.check_date_column_formats(self._selected_date_columns,\
                                                    self._timestamp_columns,\
                                                    self._dateColumnFormatDict,\
                                                    self._dateFormatConversionDict,
                                                    self._requestedDateFormat)
        # print dateColCheck
        if not self._dataframe_context.get_anova_on_scored_data():
            if dateColCheck:
                self._dataframe_context.set_date_format_details(dateColCheck)
                self._dateFormatDetected = dateColCheck["dateFormatDetected"]
                self._trend_on_td_column = dateColCheck["trendOnTdCol"]
                if self._dateFormatDetected:
                    self._requestedDateFormat = dateColCheck[
                        "requestedDateFormat"]
                    self._existingDateFormat = dateColCheck[
                        "existingDateFormat"]
                    self._date_columns_suggested = dateColCheck[
                        "suggestedDateColumn"]
            if self._dateFormatDetected:
                print("self._existingDateFormat", self._existingDateFormat)
                print("self._existingDateFormat", self._existingDateFormat)
                self._data_frame, self._dataRangeStats = NarrativesUtils.calculate_data_range_stats(
                    self._data_frame, self._existingDateFormat,
                    self._date_columns_suggested, self._trend_on_td_column,
                    self._pandas_flag)

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        self._analysisName = self._dataframe_context.get_analysis_name()
        self._analysisDict = self._dataframe_context.get_analysis_dict()
        self._messageURL = self._dataframe_context.get_message_url()
        self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight(
        )
        if analysisName == None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName
        if scriptWeight == None:
            self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight(
            )
        else:
            self._scriptWeightDict = scriptWeight
        self._scriptStages = {
            "anovaStart": {
                "summary": "Initialized The Anova Scripts",
                "weight": 0
            },
            "anovaEnd": {
                "summary": "Anova Calculated",
                "weight": 10
            },
        }
        # progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
        #                             "anovaStart",\
        #                             "info",\
        #                             self._scriptStages["anovaStart"]["summary"],\
        #                             self._completionStatus,\
        #                             self._completionStatus)
        # CommonUtils.save_progress_message(self._messageURL,progressMessage)
        # self._dataframe_context.update_completion_status(self._completionStatus)
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "anovaStart",
            "info",
            display=False,
            emptyBin=False,
            customMsg=None,
            weightKey="script")
    def Predict(self):
        self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight()
        self._scriptStages = {
            "initialization":{
                "summary":"Initialized the Generalized Linear Regression Scripts",
                "weight":2
                },
            "predictionStart":{
                "summary":"Generalized Linear Regression Model Prediction Started",
                "weight":2
                },
            "predictionFinished":{
                "summary":"Generalized Linear Regression Model Prediction Finished",
                "weight":6
                }
            }
        CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._slug,"initialization","info",display=True,emptyBin=False,customMsg=None,weightKey="total")

        SQLctx = SQLContext(sparkContext=self._spark.sparkContext, sparkSession=self._spark)
        dataSanity = True
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()
        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})
        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns)-set(allDateCols))
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        result_column = self._dataframe_context.get_result_column()
        test_data_path = self._dataframe_context.get_input_file()
        CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._slug,"predictionStart","info",display=True,emptyBin=False,customMsg=None,weightKey="total")

        test_data_path = self._dataframe_context.get_input_file()
        score_data_path = self._dataframe_context.get_score_path()+"/data.csv"
        trained_model_path = "file://" + self._dataframe_context.get_model_path()
        trained_model_path += "/model"
        pipeline_path = "/".join(trained_model_path.split("/")[:-1])+"/pipeline"
        print "trained_model_path",trained_model_path
        print "pipeline_path",pipeline_path
        print "score_data_path",score_data_path
        pipelineModel = MLUtils.load_pipeline(pipeline_path)
        trained_model = MLUtils.load_generalized_linear_regresssion_pyspark_model(trained_model_path)
        df = self._data_frame
        indexed = pipelineModel.transform(df)
        transformed = trained_model.transform(indexed)
        if result_column in transformed.columns:
            transformed = transformed.withColumnRenamed(result_column,"originalLabel")
        transformed = transformed.withColumnRenamed("prediction",result_column)
        pandas_scored_df = transformed.select(list(set(self._data_frame.columns+[result_column]))).toPandas()
        if score_data_path.startswith("file"):
            score_data_path = score_data_path[7:]
        pandas_scored_df.to_csv(score_data_path,header=True,index=False)
        CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._slug,"predictionFinished","info",display=True,emptyBin=False,customMsg=None,weightKey="total")


        print "STARTING Measure ANALYSIS ..."
        columns_to_keep = []
        columns_to_drop = []
        columns_to_keep = self._dataframe_context.get_score_consider_columns()
        if len(columns_to_keep) > 0:
            columns_to_drop = list(set(df.columns)-set(columns_to_keep))
        else:
            columns_to_drop += ["predicted_probability"]
        columns_to_drop = [x for x in columns_to_drop if x in df.columns and x != result_column]
        print "columns_to_drop",columns_to_drop
        spark_scored_df = transformed.select(list(set(columns_to_keep+[result_column])))

        df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context,self._metaParser)
        df_helper.set_params()
        df = df_helper.get_data_frame()
        # self._dataframe_context.set_dont_send_message(True)
        try:
            fs = time.time()
            descr_stats_obj = DescriptiveStatsScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark,self._prediction_narrative,scriptWeight=self._scriptWeightDict,analysisName="Descriptive analysis")
            descr_stats_obj.Run()
            print "DescriptiveStats Analysis Done in ", time.time() - fs, " seconds."
        except:
            print "Frequency Analysis Failed "

        try:
            fs = time.time()
            df_helper.fill_na_dimension_nulls()
            df = df_helper.get_data_frame()
            dt_reg = DecisionTreeRegressionScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark,self._prediction_narrative,self._metaParser,scriptWeight=self._scriptWeightDict,analysisName="Predictive modeling")
            dt_reg.Run()
            print "DecisionTrees Analysis Done in ", time.time() - fs, " seconds."
        except:
            print "DTREE FAILED"

        try:
            fs = time.time()
            two_way_obj = TwoWayAnovaScript(df, df_helper, self._dataframe_context, self._result_setter, self._spark,self._prediction_narrative,self._metaParser,scriptWeight=self._scriptWeightDict,analysisName="Measure vs. Dimension")
            two_way_obj.Run()
            print "OneWayAnova Analysis Done in ", time.time() - fs, " seconds."
        except:
            print "Anova Analysis Failed"
Esempio n. 23
0
    def _generate_narratives(self):
        """
        generate main card narrative and remaining cards are generated by calling ChiSquareAnalysis class for each of analyzed dimensions
        """
        for target_dimension in self._df_chisquare_result.keys():
            target_chisquare_result = self._df_chisquare_result[
                target_dimension]
            analysed_variables = target_chisquare_result.keys(
            )  ## List of all analyzed var.
            # List of significant var out of analyzed var.
            significant_variables = [
                dim for dim in target_chisquare_result.keys()
                if target_chisquare_result[dim].get_pvalue() <= 0.05
            ]
            effect_sizes = [
                target_chisquare_result[dim].get_effect_size()
                for dim in significant_variables
            ]

            effect_size_dict = dict(zip(significant_variables, effect_sizes))
            significant_variables = [
                y
                for (x, y) in sorted(zip(effect_sizes, significant_variables),
                                     reverse=True)
            ]
            #insignificant_variables = [i for i in self._df_chisquare_result[target_dimension] if i['pv']>0.05]

            num_analysed_variables = len(analysed_variables)
            num_significant_variables = len(significant_variables)
            self.narratives['main_card'] = {}
            self.narratives['main_card'][
                'heading'] = 'Relationship between ' + target_dimension + ' and other factors'
            self.narratives['main_card']['paragraphs'] = {}
            data_dict = {
                'num_variables': num_analysed_variables,
                'num_significant_variables': num_significant_variables,
                'significant_variables': significant_variables,
                'target': target_dimension,
                'analysed_dimensions': analysed_variables,
                'blockSplitter': self._blockSplitter
            }  # for both para 1 and para 2
            paragraph = {}
            paragraph['header'] = ''

            paragraph['content'] = NarrativesUtils.get_template_output(
                self._base_dir, 'main_card.html', data_dict)
            self.narratives['main_card']['paragraphs'] = [paragraph]
            self.narratives['cards'] = []
            chart = {
                'header':
                'Strength of association between ' + target_dimension +
                ' and other dimensions'
            }
            chart['data'] = effect_size_dict
            chart['label_text'] = {
                'x': 'Dimensions',
                'y': 'Effect Size (Cramers-V)'
            }

            chart_data = []
            chartDataValues = []
            for k, v in effect_size_dict.items():
                chart_data.append({"key": k, "value": float(v)})
                chartDataValues.append(float(v))
            chart_data = sorted(chart_data,
                                key=lambda x: x["value"],
                                reverse=True)
            chart_json = ChartJson()
            chart_json.set_data(chart_data)
            chart_json.set_chart_type("bar")
            # chart_json.set_label_text({'x':'Dimensions','y':'Effect Size (Cramers-V)'})
            chart_json.set_label_text({
                'x': '  ',
                'y': 'Effect Size (Cramers-V)'
            })
            chart_json.set_axis_rotation(True)
            chart_json.set_axes({"x": "key", "y": "value"})
            # chart_json.set_yaxis_number_format(".4f")
            chart_json.set_yaxis_number_format(
                NarrativesUtils.select_y_axis_format(chartDataValues))
            self.narratives['main_card']['chart'] = chart

            main_card = NormalCard()
            header = "<h3>Strength of association between " + target_dimension + " and other dimensions</h3>"
            main_card_data = [HtmlData(data=header)]
            main_card_narrative = NarrativesUtils.get_template_output(
                self._base_dir, 'main_card.html', data_dict)
            main_card_narrative = NarrativesUtils.block_splitter(
                main_card_narrative, self._blockSplitter)
            main_card_data += main_card_narrative
            # st_info = ["Test : Chi Square", "Threshold for p-value : 0.05", "Effect Size : Cramer's V"]
            # print "chartdata",chart_data
            if len(chart_data) > 0:
                statistical_info_array = [
                    ("Test Type", "Chi-Square"),
                    ("Effect Size", "Cramer's V"),
                    ("Max Effect Size", chart_data[0]["key"]),
                    ("Min Effect Size", chart_data[-1]["key"]),
                ]
                statistical_inferenc = ""
                if len(chart_data) == 1:
                    statistical_inference = "{} is the only variable that have significant association with the {} (Target) having an \
                     Effect size of {}".format(
                        chart_data[0]["key"],
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["value"], 4))
                elif len(chart_data) == 2:
                    statistical_inference = "There are two variables ({} and {}) that have significant association with the {} (Target) and the \
                     Effect size ranges are {} and {} respectively".format(
                        chart_data[0]["key"], chart_data[1]["key"],
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["value"], 4),
                        round(chart_data[1]["value"], 4))
                else:
                    statistical_inference = "There are {} variables that have significant association with the {} (Target) and the \
                     Effect size ranges from {} to {}".format(
                        len(chart_data),
                        self._dataframe_context.get_result_column(),
                        round(chart_data[0]["value"], 4),
                        round(chart_data[-1]["value"], 4))
                if statistical_inference != "":
                    statistical_info_array.append(
                        ("Inference", statistical_inference))
                statistical_info_array = NarrativesUtils.statistical_info_array_formatter(
                    statistical_info_array)
            else:
                statistical_info_array = []
            main_card_data.append(
                C3ChartData(data=chart_json, info=statistical_info_array))
            main_card.set_card_data(main_card_data)
            main_card.set_card_name("Key Influencers")

            if self._storyOnScoredData != True:
                self._chiSquareNode.add_a_card(main_card)
                self._result_setter.add_a_score_chi_card(main_card)

            print "target_dimension", target_dimension
            if self._appid == '2' and num_significant_variables > 5:
                significant_variables = significant_variables[:5]
            else:
                if self._nColsToUse != None:
                    significant_variables = significant_variables[:self.
                                                                  _nColsToUse]

            CommonUtils.create_update_and_save_progress_message(
                self._dataframe_context,
                self._scriptWeightDict,
                self._scriptStages,
                self._analysisName,
                "custom",
                "info",
                display=True,
                customMsg="Analyzing key drivers",
                weightKey="narratives")
            for analysed_dimension in significant_variables[:self.
                                                            _noOfSigDimsToShow]:
                chisquare_result = self._df_chisquare.get_chisquare_result(
                    target_dimension, analysed_dimension)
                if self._appid == '2':
                    print "APPID 2 is used"
                    card = ChiSquareAnalysis(
                        self._dataframe_context, self._dataframe_helper,
                        chisquare_result, target_dimension, analysed_dimension,
                        significant_variables, num_analysed_variables,
                        self._data_frame, self._measure_columns,
                        self._base_dir, None, target_chisquare_result)
                    # self.narratives['cards'].append(card)
                    self._result_setter.add_a_score_chi_card(
                        json.loads(
                            CommonUtils.convert_python_object_to_json(
                                card.get_dimension_card1())))

                elif self._appid == '1':
                    print "APPID 1 is used"
                    card = ChiSquareAnalysis(
                        self._dataframe_context, self._dataframe_helper,
                        chisquare_result, target_dimension, analysed_dimension,
                        significant_variables, num_analysed_variables,
                        self._data_frame, self._measure_columns,
                        self._base_dir, None, target_chisquare_result)
                    # self.narratives['cards'].append(card)
                    self._result_setter.add_a_score_chi_card(
                        json.loads(
                            CommonUtils.convert_python_object_to_json(
                                card.get_dimension_card1())))
                else:
                    target_dimension_card = ChiSquareAnalysis(
                        self._dataframe_context, self._dataframe_helper,
                        chisquare_result, target_dimension, analysed_dimension,
                        significant_variables, num_analysed_variables,
                        self._data_frame, self._measure_columns,
                        self._base_dir, None, target_chisquare_result)
                    self.narratives['cards'].append(target_dimension_card)
                    self._chiSquareNode.add_a_node(
                        target_dimension_card.get_dimension_node())
        self._story_narrative.add_a_node(self._chiSquareNode)
        self._result_setter.set_chisquare_node(self._chiSquareNode)
    def Train(self):
        st_global = time.time()

        CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._slug,"initialization","info",display=True,emptyBin=False,customMsg=None,weightKey="total")

        appType = self._dataframe_context.get_app_type()
        algosToRun = self._dataframe_context.get_algorithms_to_run()
        algoSetting = filter(lambda x:x.get_algorithm_slug()==self._slug,algosToRun)[0]
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()
        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})
        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns)-set(allDateCols))
        print categorical_columns
        result_column = self._dataframe_context.get_result_column()
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        numerical_columns = [x for x in numerical_columns if x != result_column]

        model_path = self._dataframe_context.get_model_path()
        if model_path.startswith("file"):
            model_path = model_path[7:]
        validationDict = self._dataframe_context.get_validation_dict()
        print "model_path",model_path
        pipeline_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/pipeline/"
        model_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/model"
        pmml_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/modelPmml"

        df = self._data_frame
        if self._mlEnv == "spark":
            pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns,categorical_columns,result_column,algoType="regression")

            pipelineModel = pipeline.fit(df)
            indexed = pipelineModel.transform(df)
            featureMapping = sorted((attr["idx"], attr["name"]) for attr in (chain(*indexed.schema["features"].metadata["ml_attr"]["attrs"].values())))

            # print indexed.select([result_column,"features"]).show(5)
            MLUtils.save_pipeline_or_model(pipelineModel,pipeline_filepath)
            # OriginalTargetconverter = IndexToString(inputCol="label", outputCol="originalTargetColumn")
            dtreer = DecisionTreeRegressor(labelCol=result_column, featuresCol='features',predictionCol="prediction")
            if validationDict["name"] == "kFold":
                defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT["value"]
                numFold = int(validationDict["value"])
                if numFold == 0:
                    numFold = 3
                trainingData,validationData = indexed.randomSplit([defaultSplit,1-defaultSplit], seed=12345)
                paramGrid = ParamGridBuilder()\
                    .addGrid(dtreer.regParam, [0.1, 0.01]) \
                    .addGrid(dtreer.fitIntercept, [False, True])\
                    .addGrid(dtreer.elasticNetParam, [0.0, 0.5, 1.0])\
                    .build()
                crossval = CrossValidator(estimator=dtreer,
                              estimatorParamMaps=paramGrid,
                              evaluator=RegressionEvaluator(predictionCol="prediction", labelCol=result_column),
                              numFolds=numFold)
                st = time.time()
                cvModel = crossval.fit(indexed)
                trainingTime = time.time()-st
                print "cvModel training takes",trainingTime
                bestModel = cvModel.bestModel
            elif validationDict["name"] == "trainAndtest":
                trainingData,validationData = indexed.randomSplit([float(validationDict["value"]),1-float(validationDict["value"])], seed=12345)
                st = time.time()
                fit = dtreer.fit(trainingData)
                trainingTime = time.time()-st
                print "time to train",trainingTime
                bestModel = fit

            featureImportance = bestModel.featureImportances
            print featureImportance,type(featureImportance)
            # print featureImportance[0],len(featureImportance[1],len(featureImportance[2]))
            print len(featureMapping)
            featuresArray = [(name, featureImportance[idx]) for idx, name in featureMapping]
            print featuresArray
            MLUtils.save_pipeline_or_model(bestModel,model_filepath)
            transformed = bestModel.transform(validationData)
            transformed = transformed.withColumn(result_column,transformed[result_column].cast(DoubleType()))
            transformed = transformed.select([result_column,"prediction",transformed[result_column]-transformed["prediction"]])
            transformed = transformed.withColumnRenamed(transformed.columns[-1],"difference")
            transformed = transformed.select([result_column,"prediction","difference",FN.abs(transformed["difference"])*100/transformed[result_column]])
            transformed = transformed.withColumnRenamed(transformed.columns[-1],"mape")
            sampleData = None
            nrows = transformed.count()
            if nrows > 100:
                sampleData = transformed.sample(False, float(100)/nrows, seed=420)
            else:
                sampleData = transformed
            print sampleData.show()
            evaluator = RegressionEvaluator(predictionCol="prediction",labelCol=result_column)
            metrics = {}
            metrics["r2"] = evaluator.evaluate(transformed,{evaluator.metricName: "r2"})
            metrics["rmse"] = evaluator.evaluate(transformed,{evaluator.metricName: "rmse"})
            metrics["mse"] = evaluator.evaluate(transformed,{evaluator.metricName: "mse"})
            metrics["mae"] = evaluator.evaluate(transformed,{evaluator.metricName: "mae"})
            runtime = round((time.time() - st_global),2)
            # print transformed.count()
            mapeDf = transformed.select("mape")
            # print mapeDf.show()
            mapeStats = MLUtils.get_mape_stats(mapeDf,"mape")
            mapeStatsArr = mapeStats.items()
            mapeStatsArr = sorted(mapeStatsArr,key=lambda x:int(x[0]))
            # print mapeStatsArr
            quantileDf = transformed.select("prediction")
            # print quantileDf.show()
            quantileSummaryDict = MLUtils.get_quantile_summary(quantileDf,"prediction")
            quantileSummaryArr = quantileSummaryDict.items()
            quantileSummaryArr = sorted(quantileSummaryArr,key=lambda x:int(x[0]))
            # print quantileSummaryArr
            self._model_summary.set_model_type("regression")
            self._model_summary.set_algorithm_name("dtree Regression")
            self._model_summary.set_algorithm_display_name("Decision Tree Regression")
            self._model_summary.set_slug(self._slug)
            self._model_summary.set_training_time(runtime)
            self._model_summary.set_training_time(trainingTime)
            self._model_summary.set_target_variable(result_column)
            self._model_summary.set_validation_method(validationDict["displayName"])
            self._model_summary.set_model_evaluation_metrics(metrics)
            self._model_summary.set_model_params(bestEstimator.get_params())
            self._model_summary.set_quantile_summary(quantileSummaryArr)
            self._model_summary.set_mape_stats(mapeStatsArr)
            self._model_summary.set_sample_data(sampleData.toPandas().to_dict())
            self._model_summary.set_feature_importance(featureImportance)
            # print CommonUtils.convert_python_object_to_json(self._model_summary)
        elif self._mlEnv == "sklearn":
            model_filepath = model_path+"/"+self._slug+"/model.pkl"
            x_train,x_test,y_train,y_test = self._dataframe_helper.get_train_test_data()
            x_train = MLUtils.create_dummy_columns(x_train,[x for x in categorical_columns if x != result_column])
            x_test = MLUtils.create_dummy_columns(x_test,[x for x in categorical_columns if x != result_column])
            x_test = MLUtils.fill_missing_columns(x_test,x_train.columns,result_column)

            st = time.time()
            est = DecisionTreeRegressor()

            CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._slug,"training","info",display=True,emptyBin=False,customMsg=None,weightKey="total")

            if algoSetting.is_hyperparameter_tuning_enabled():
                hyperParamInitParam = algoSetting.get_hyperparameter_params()
                evaluationMetricDict = {"name":hyperParamInitParam["evaluationMetric"]}
                evaluationMetricDict["displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[evaluationMetricDict["name"]]
                hyperParamAlgoName = algoSetting.get_hyperparameter_algo_name()
                params_grid = algoSetting.get_params_dict_hyperparameter()
                params_grid = {k:v for k,v in params_grid.items() if k in est.get_params()}
                print params_grid
                if hyperParamAlgoName == "gridsearchcv":
                    estGrid = GridSearchCV(est,params_grid)
                    gridParams = estGrid.get_params()
                    hyperParamInitParam = {k:v for k,v in hyperParamInitParam.items() if k in gridParams}
                    estGrid.set_params(**hyperParamInitParam)
                    estGrid.fit(x_train,y_train)
                    bestEstimator = estGrid.best_estimator_
                    modelFilepath = "/".join(model_filepath.split("/")[:-1])
                    sklearnHyperParameterResultObj = SklearnGridSearchResult(estGrid.cv_results_,est,x_train,x_test,y_train,y_test,appType,modelFilepath,evaluationMetricDict=evaluationMetricDict)
                    resultArray = sklearnHyperParameterResultObj.train_and_save_models()
                    self._result_setter.set_hyper_parameter_results(self._slug,resultArray)
                    self._result_setter.set_metadata_parallel_coordinates(self._slug,{"ignoreList":sklearnHyperParameterResultObj.get_ignore_list(),"hideColumns":sklearnHyperParameterResultObj.get_hide_columns(),"metricColName":sklearnHyperParameterResultObj.get_comparison_metric_colname(),"columnOrder":sklearnHyperParameterResultObj.get_keep_columns()})

                elif hyperParamAlgoName == "randomsearchcv":
                    estRand = RandomizedSearchCV(est,params_grid)
                    estRand.set_params(**hyperParamInitParam)
                    bestEstimator = None
            else:
                evaluationMetricDict = {"name":GLOBALSETTINGS.REGRESSION_MODEL_EVALUATION_METRIC}
                evaluationMetricDict["displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[evaluationMetricDict["name"]]
                algoParams = algoSetting.get_params_dict()
                algoParams = {k:v for k,v in algoParams.items() if k in est.get_params().keys()}
                est.set_params(**algoParams)
                self._result_setter.set_hyper_parameter_results(self._slug,None)
                if validationDict["name"] == "kFold":
                    defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT["value"]
                    numFold = int(validationDict["value"])
                    if numFold == 0:
                        numFold = 3
                    kFoldClass = SkleanrKFoldResult(numFold,est,x_train,x_test,y_train,y_test,appType,evaluationMetricDict=evaluationMetricDict)
                    kFoldClass.train_and_save_result()
                    kFoldOutput = kFoldClass.get_kfold_result()
                    bestEstimator = kFoldClass.get_best_estimator()
                elif validationDict["name"] == "trainAndtest":
                    est.fit(x_train, y_train)
                    bestEstimator = est
            trainingTime = time.time()-st
            y_score = bestEstimator.predict(x_test)
            try:
                y_prob = bestEstimator.predict_proba(x_test)
            except:
                y_prob = [0]*len(y_score)
            featureImportance={}

            objs = {"trained_model":bestEstimator,"actual":y_test,"predicted":y_score,"probability":y_prob,"feature_importance":featureImportance,"featureList":list(x_train.columns),"labelMapping":{}}
            featureImportance = objs["trained_model"].feature_importances_
            featuresArray = [(col_name, featureImportance[idx]) for idx, col_name in enumerate(x_train.columns)]

            if not algoSetting.is_hyperparameter_tuning_enabled():
                modelName = "M"+"0"*(GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH-1)+"1"
                modelFilepathArr = model_filepath.split("/")[:-1]
                modelFilepathArr.append(modelName+".pkl")
                joblib.dump(objs["trained_model"],"/".join(modelFilepathArr))
            metrics = {}
            metrics["r2"] = r2_score(y_test, y_score)
            metrics["mse"] = mean_squared_error(y_test, y_score)
            metrics["mae"] = mean_absolute_error(y_test, y_score)
            metrics["rmse"] = sqrt(metrics["mse"])
            transformed = pd.DataFrame({"prediction":y_score,result_column:y_test})
            transformed["difference"] = transformed[result_column] - transformed["prediction"]
            transformed["mape"] = np.abs(transformed["difference"])*100/transformed[result_column]

            sampleData = None
            nrows = transformed.shape[0]
            if nrows > 100:
                sampleData = transformed.sample(n=100,random_state=420)
            else:
                sampleData = transformed
            print sampleData.head()

            mapeCountArr = pd.cut(transformed["mape"],GLOBALSETTINGS.MAPEBINS).value_counts().to_dict().items()
            mapeStatsArr = [(str(idx),dictObj) for idx,dictObj in enumerate(sorted([{"count":x[1],"splitRange":(x[0].left,x[0].right)} for x in mapeCountArr],key = lambda x:x["splitRange"][0]))]

            predictionColSummary = transformed["prediction"].describe().to_dict()
            quantileBins = [predictionColSummary["min"],predictionColSummary["25%"],predictionColSummary["50%"],predictionColSummary["75%"],predictionColSummary["max"]]
            print quantileBins
            quantileBins = sorted(list(set(quantileBins)))
            transformed["quantileBinId"] = pd.cut(transformed["prediction"],quantileBins)
            quantileDf = transformed.groupby("quantileBinId").agg({"prediction":[np.sum,np.mean,np.size]}).reset_index()
            quantileDf.columns = ["prediction","sum","mean","count"]
            print quantileDf
            quantileArr = quantileDf.T.to_dict().items()
            quantileSummaryArr = [(obj[0],{"splitRange":(obj[1]["prediction"].left,obj[1]["prediction"].right),"count":obj[1]["count"],"mean":obj[1]["mean"],"sum":obj[1]["sum"]}) for obj in quantileArr]
            print quantileSummaryArr
            runtime = round((time.time() - st_global),2)

            self._model_summary.set_model_type("regression")
            self._model_summary.set_algorithm_name("DTREE Regression")
            self._model_summary.set_algorithm_display_name("Decision Tree Regression")
            self._model_summary.set_slug(self._slug)
            self._model_summary.set_training_time(runtime)
            self._model_summary.set_training_time(trainingTime)
            self._model_summary.set_target_variable(result_column)
            self._model_summary.set_validation_method(validationDict["displayName"])
            self._model_summary.set_model_evaluation_metrics(metrics)
            self._model_summary.set_model_params(bestEstimator.get_params())
            self._model_summary.set_quantile_summary(quantileSummaryArr)
            self._model_summary.set_mape_stats(mapeStatsArr)
            self._model_summary.set_sample_data(sampleData.to_dict())
            self._model_summary.set_feature_importance(featuresArray)
            self._model_summary.set_feature_list(list(x_train.columns))


            try:
                pmml_filepath = str(model_path)+"/"+str(self._slug)+"/traindeModel.pmml"
                modelPmmlPipeline = PMMLPipeline([
                  ("pretrained-estimator", objs["trained_model"])
                ])
                modelPmmlPipeline.target_field = result_column
                modelPmmlPipeline.active_fields = np.array([col for col in x_train.columns if col != result_column])
                sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr = True)
                pmmlfile = open(pmml_filepath,"r")
                pmmlText = pmmlfile.read()
                pmmlfile.close()
                self._result_setter.update_pmml_object({self._slug:pmmlText})
            except:
                pass
        if not algoSetting.is_hyperparameter_tuning_enabled():
            modelDropDownObj = {
                        "name":self._model_summary.get_algorithm_name(),
                        "evaluationMetricValue":self._model_summary.get_model_accuracy(),
                        "evaluationMetricName":"r2",
                        "slug":self._model_summary.get_slug(),
                        "Model Id":modelName
                        }

            modelSummaryJson = {
                "dropdown":modelDropDownObj,
                "levelcount":self._model_summary.get_level_counts(),
                "modelFeatureList":self._model_summary.get_feature_list(),
                "levelMapping":self._model_summary.get_level_map_dict(),
                "slug":self._model_summary.get_slug(),
                "name":self._model_summary.get_algorithm_name()
            }
        else:
            modelDropDownObj = {
                        "name":self._model_summary.get_algorithm_name(),
                        "evaluationMetricValue":resultArray[0]["R-Squared"],
                        "evaluationMetricName":"r2",
                        "slug":self._model_summary.get_slug(),
                        "Model Id":resultArray[0]["Model Id"]
                        }
            modelSummaryJson = {
                "dropdown":modelDropDownObj,
                "levelcount":self._model_summary.get_level_counts(),
                "modelFeatureList":self._model_summary.get_feature_list(),
                "levelMapping":self._model_summary.get_level_map_dict(),
                "slug":self._model_summary.get_slug(),
                "name":self._model_summary.get_algorithm_name()
            }

        dtreerCards = [json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_summary_cards(self._model_summary)]

        for card in dtreerCards:
            self._prediction_narrative.add_a_card(card)
        self._result_setter.set_model_summary({"dtreeregression":json.loads(CommonUtils.convert_python_object_to_json(self._model_summary))})
        self._result_setter.set_dtree_regression_model_summart(modelSummaryJson)
        self._result_setter.set_dtreer_cards(dtreerCards)

        CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._slug,"completion","info",display=True,emptyBin=False,customMsg=None,weightKey="total")
Esempio n. 25
0
    def __init__(self,
                 column_name,
                 decision_tree_rules,
                 df_helper,
                 df_context,
                 result_setter,
                 story_narrative,
                 scriptWeight=None,
                 analysisName=None):
        self._story_narrative = story_narrative
        self._result_setter = result_setter
        self._dataframe_context = df_context
        self._column_name = column_name.lower()
        self._colname = column_name
        self._capitalized_column_name = "%s%s" % (column_name[0].upper(),
                                                  column_name[1:])
        self._decision_rules_dict = decision_tree_rules.get_decision_rules()
        self._table = decision_tree_rules.get_table()
        self.new_table = {}
        self.successful_predictions = decision_tree_rules.get_success()
        self.total_predictions = decision_tree_rules.get_total()
        self.success_percent = decision_tree_rules.get_success_percent()
        self._important_vars = decision_tree_rules.get_significant_vars()
        self._target_distribution = decision_tree_rules.get_target_contributions(
        )
        self._get_new_table()
        self._df_helper = df_helper
        self.subheader = None
        self.dropdownComment = None
        self.dropdownValues = None
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        self._base_dir = "/decisiontree/"
        self._decisionTreeNode = NarrativesTree(name='Prediction')
        # self._decisionTreeNode.set_name("Decision Tree Regression")

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        self._messageURL = self._dataframe_context.get_message_url()
        if analysisName == None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName
        if scriptWeight == None:
            self._scriptWeightDict = self._dataframe_context.get_measure_analysis_weight(
            )
        else:
            self._scriptWeightDict = scriptWeight
        self._scriptStages = {
            "dtreeNarrativeStart": {
                "summary": "Started the Decision Tree Regression Narratives",
                "weight": 0
            },
            "dtreeNarrativeEnd": {
                "summary": "Narratives for Decision Tree Regression Finished",
                "weight": 10
            },
        }
        # self._completionStatus += self._scriptWeightDict[self._analysisName]["narratives"]*self._scriptStages["dtreeNarrativeStart"]["weight"]/10
        # progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
        #                             "dtreeNarrativeStart",\
        #                             "info",\
        #                             self._scriptStages["dtreeNarrativeStart"]["summary"],\
        #                             self._completionStatus,\
        #                             self._completionStatus)
        # CommonUtils.save_progress_message(self._messageURL,progressMessage)
        # self._dataframe_context.update_completion_status(self._completionStatus)
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "dtreeNarrativeStart",
            "info",
            weightKey="narratives")

        self._generate_narratives()
        self._story_narrative.add_a_node(self._decisionTreeNode)
        self._result_setter.set_decision_tree_node(self._decisionTreeNode)

        # self._completionStatus = self._dataframe_context.get_completion_status()
        # self._completionStatus += self._scriptWeightDict[self._analysisName]["narratives"]*self._scriptStages["dtreeNarrativeEnd"]["weight"]/10
        # progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
        #                             "dtreeNarrativeEnd",\
        #                             "info",\
        #                             self._scriptStages["dtreeNarrativeEnd"]["summary"],\
        #                             self._completionStatus,\
        #                             self._completionStatus)
        # CommonUtils.save_progress_message(self._messageURL,progressMessage)
        # self._dataframe_context.update_completion_status(self._completionStatus)
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "dtreeNarrativeEnd",
            "info",
            weightKey="narratives")