コード例 #1
0
    def Train(self):
        """Train, evaluate and persist the configured Spark ML classifier.

        Builds the feature pipeline with
        ``MLUtils.create_pyspark_ml_pipeline``, fits either a
        ``LogisticRegression`` (``self._classifier == "lr"``) or a
        ``OneVsRest`` wrapper around one (``self._classifier == "OneVsRest"``),
        scores the validation split, fills ``self._model_summary`` and writes
        it as JSON to ``<model_path>/LogisticRegression/ModelSummary/summary.json``.
        The fitted pipeline and model are saved under
        ``<model_path>/LogisticRegression/TrainedModels``.

        Raises:
            ValueError: if ``self._classifier`` is neither ``"lr"`` nor
                ``"OneVsRest"``, or if ``"lr"`` is requested and the training
                split contains fewer than two distinct labels.
        """
        st = time.time()

        # Fail fast: previously an unknown classifier surfaced much later as
        # an UnboundLocalError on ``fit``.
        if self._classifier not in ("lr", "OneVsRest"):
            raise ValueError(
                "Unsupported classifier %r; expected 'lr' or 'OneVsRest'" %
                (self._classifier, ))

        categorical_columns = self._dataframe_helper.get_string_columns()
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        result_column = self._dataframe_context.get_result_column()
        # The target must not be fed to the model as a feature.
        categorical_columns = [
            x for x in categorical_columns if x != result_column
        ]

        model_path = self._dataframe_context.get_model_path()
        trained_models_dir = model_path + "/LogisticRegression/TrainedModels"
        pipeline_filepath = trained_models_dir + "/pipeline"
        model_filepath = trained_models_dir + "/model"
        summary_filepath = model_path + "/LogisticRegression/ModelSummary/summary.json"

        df = self._data_frame
        pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns,
                                                      categorical_columns,
                                                      result_column)
        pipelineModel = pipeline.fit(df)
        indexed = pipelineModel.transform(df)
        MLUtils.save_pipeline_or_model(pipelineModel, pipeline_filepath)
        # 0.8 is presumably the training fraction -- confirm against MLUtils.
        trainingData, validationData = MLUtils.get_training_and_validation_data(
            indexed, result_column, 0.8)
        OriginalTargetconverter = IndexToString(
            inputCol="label", outputCol="originalTargetColumn")
        levels = trainingData.select("label").distinct().collect()

        if self._classifier == "lr":
            if len(levels) < 2:
                raise ValueError(
                    "Cannot train logistic regression: training data has "
                    "%d distinct label(s), need at least 2" % len(levels))
            lr_params = {
                "maxIter": 10,
                "regParam": 0.3,
                "elasticNetParam": 0.8,
            }
            if len(levels) > 2:
                lr_params["family"] = "multinomial"
            fit = LogisticRegression(**lr_params).fit(trainingData)
        else:
            fit = OneVsRest(classifier=LogisticRegression()).fit(trainingData)
        transformed = fit.transform(validationData)
        MLUtils.save_pipeline_or_model(fit, model_filepath)

        # Only the plain logistic-regression model exposes these attributes;
        # a OneVsRest model does not, so printing them unconditionally failed.
        if self._classifier == "lr":
            print(fit.coefficientMatrix)
            print(fit.interceptVector)

        label_classes = transformed.select("label").distinct().collect()
        results = transformed.select(["prediction", "label"])
        if len(label_classes) > 2:
            evaluator = MulticlassClassificationEvaluator(
                predictionCol="prediction")
            self._model_summary["model_accuracy"] = evaluator.evaluate(
                results, {evaluator.metricName: "accuracy"})
        else:
            # NOTE(review): the value stored as "model_accuracy" here is
            # areaUnderPR computed from the hard "prediction" column, not an
            # accuracy -- kept as-is, confirm whether this is intended.
            evaluator = BinaryClassificationEvaluator(
                rawPredictionCol="prediction")
            self._model_summary["model_accuracy"] = evaluator.evaluate(
                results, {evaluator.metricName: "areaUnderPR"})

        self._model_summary["runtime_in_seconds"] = round((time.time() - st),
                                                          2)

        transformed = OriginalTargetconverter.transform(transformed)
        # Map label index -> original level using the metadata attached to
        # the "label" column of the transformed frame.
        label_indexer_dict = [
            dict(enumerate(field.metadata["ml_attr"]["vals"]))
            for field in transformed.schema.fields if field.name == "label"
        ][0]
        prediction_to_levels = udf(lambda x: label_indexer_dict[x],
                                   StringType())
        transformed = transformed.withColumn(
            "predictedClass", prediction_to_levels(transformed.prediction))
        prediction_df = transformed.select(
            ["originalTargetColumn", "predictedClass"]).toPandas()
        actual = prediction_df["originalTargetColumn"]
        predicted = prediction_df["predictedClass"]

        self._model_summary[
            "confusion_matrix"] = MLUtils.calculate_confusion_matrix(
                actual, predicted)
        overall_precision_recall = MLUtils.calculate_overall_precision_recall(
            actual, predicted)
        self._model_summary[
            "precision_recall_stats"] = overall_precision_recall[
                "classwise_stats"]
        self._model_summary["model_precision"] = overall_precision_recall[
            "precision"]
        self._model_summary["model_recall"] = overall_precision_recall[
            "recall"]
        self._model_summary["target_variable"] = result_column
        self._model_summary[
            "test_sample_prediction"] = overall_precision_recall[
                "prediction_split"]
        # NOTE(review): this method trains LogisticRegression / OneVsRest but
        # reports "Random Forest" plus tree/rule counts below. Left unchanged
        # because the consumers of the summary are not visible here -- confirm.
        self._model_summary["algorithm_name"] = "Random Forest"
        self._model_summary["validation_method"] = "Train and Test"
        self._model_summary["independent_variables"] = len(
            categorical_columns) + len(numerical_columns)
        self._model_summary["level_counts"] = CommonUtils.get_level_count_dict(
            trainingData,
            categorical_columns,
            self._dataframe_context.get_column_separator(),
            dataType="spark")
        self._model_summary["total_trees"] = 100
        self._model_summary["total_rules"] = 300
        CommonUtils.write_to_file(
            summary_filepath, json.dumps({"modelSummary":
                                          self._model_summary}))
コード例 #2
0
    def Predict(self):
        """Score the current data frame with a saved pipeline and build the score story.

        Steps, in order:

        1. Report the "initialization" progress stage.
        2. Load the pipeline model for ``self._slug`` and transform
           ``self._data_frame``; map numeric predictions back to label
           strings and store them in the result column.
        3. Write the scored rows to ``<score_path>/data.csv`` and, when a UID
           column is available, register a table of the five most probable
           rows per predicted class on ``self._result_setter``.
        4. Report the "prediction" progress stage.
        5. If at least two classes were predicted, run the decision-tree
           analysis; otherwise build a single summary card with a donut chart.
        """

        def _report_stage(stage):
            # Advance the completion counter by this stage's share and
            # publish the corresponding progress message.
            self._completionStatus += self._scriptWeightDict[
                self._analysisName]["total"] * self._scriptStages[stage][
                    "weight"] / 10
            progressMessage = CommonUtils.create_progress_message_object(
                self._analysisName, stage, "info",
                self._scriptStages[stage]["summary"], self._completionStatus,
                self._completionStatus)
            CommonUtils.save_progress_message(self._messageURL,
                                              progressMessage)
            self._dataframe_context.update_completion_status(
                self._completionStatus)

        self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight(
        )
        self._scriptStages = {
            "initialization": {
                "summary": "Initialized the Naive Bayes Scripts",
                "weight": 2
            },
            "prediction": {
                "summary": "Spark ML Naive Bayes Model Prediction Finished",
                "weight": 2
            },
            "frequency": {
                "summary": "descriptive analysis finished",
                "weight": 2
            },
            "chisquare": {
                "summary": "chi Square analysis finished",
                "weight": 4
            },
            "completion": {
                "summary": "all analysis finished",
                "weight": 4
            },
        }

        _report_stage("initialization")

        SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                            sparkSession=self._spark)
        level_counts_train = self._dataframe_context.get_level_count_dict()
        categorical_columns = self._dataframe_helper.get_string_columns()
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        time_dimension_columns = self._dataframe_helper.get_timestamp_columns()
        result_column = self._dataframe_context.get_result_column()
        categorical_columns = [
            x for x in categorical_columns if x != result_column
        ]

        level_counts_score = CommonUtils.get_level_count_dict(
            self._data_frame,
            categorical_columns,
            self._dataframe_context.get_column_separator(),
            output_type="dict",
            dataType="spark")
        # True only when every training key is present in the scoring data
        # with an equal value.
        # NOTE(review): this flag is computed but never read in this method.
        dataSanity = all(
            key in level_counts_score
            and level_counts_train[key] == level_counts_score[key]
            for key in level_counts_train)

        score_data_path = self._dataframe_context.get_score_path(
        ) + "/data.csv"
        trained_model_path = self._dataframe_context.get_model_path()
        # Replace the last path component with "<slug>/<model for scoring>".
        trained_model_path = "/".join(
            trained_model_path.split("/")[:-1]
        ) + "/" + self._slug + "/" + self._dataframe_context.get_model_for_scoring(
        )

        pipelineModel = MLUtils.load_pipeline(trained_model_path)

        df = self._data_frame
        transformed = pipelineModel.transform(df)
        label_indexer_dict = MLUtils.read_string_indexer_mapping(
            trained_model_path, SQLctx)
        prediction_to_levels = udf(lambda x: label_indexer_dict[x],
                                   StringType())
        transformed = transformed.withColumn(
            result_column, prediction_to_levels(transformed.prediction))

        feature_columns = (categorical_columns + time_dimension_columns +
                           numerical_columns)
        self._score_summary["result_column"] = result_column
        if "probability" in transformed.columns:
            probability_dataframe = transformed.select(
                [result_column, "probability"]).toPandas()
            probability_dataframe = probability_dataframe.rename(
                index=str, columns={result_column: "predicted_class"})
            # Confidence of a row = largest entry of its probability vector.
            probability_dataframe[
                "predicted_probability"] = probability_dataframe[
                    "probability"].apply(lambda x: max(x))
            self._score_summary[
                "prediction_split"] = MLUtils.calculate_scored_probability_stats(
                    probability_dataframe)
            scored_dataframe = transformed.select(
                feature_columns + [result_column, "probability"]).toPandas()
            scored_dataframe['predicted_probability'] = probability_dataframe[
                "predicted_probability"].values
        else:
            self._score_summary["prediction_split"] = []
            scored_dataframe = transformed.select(
                feature_columns + [result_column]).toPandas()

        labelMappingDict = self._dataframe_context.get_label_map()
        if score_data_path.startswith("file"):
            # Drops the first 7 characters (the length of "file://").
            score_data_path = score_data_path[7:]
        scored_dataframe.to_csv(score_data_path, header=True, index=False)

        uidCol = self._dataframe_context.get_uid_column()
        if uidCol is None:
            uidCols = self._metaParser.get_suggested_uid_columns()
            if len(uidCols) > 0:
                uidCol = uidCols[0]
        predictedClasses = list(scored_dataframe[result_column].unique())
        uid_table_columns = [uidCol, "predicted_probability", result_column]
        # The table is built from the scored pandas frame, so every column it
        # needs must exist there. "predicted_probability" is missing when the
        # model produced no "probability" column, and the UID column is
        # missing when it was not among the selected feature columns.
        can_build_uid_table = (
            uidCol and uidCol in df.columns and predictedClasses
            and all(c in scored_dataframe.columns for c in uid_table_columns))
        if can_build_uid_table:
            uidTableData = []
            for level in predictedClasses:
                levelDf = scored_dataframe[scored_dataframe[result_column] ==
                                           level]
                levelDf = levelDf[uid_table_columns]
                # Non-inplace sort returns a fresh frame, so the column
                # assignment below does not write into a slice of
                # scored_dataframe.
                levelDf = levelDf.sort_values(by="predicted_probability",
                                              ascending=False)
                levelDf["predicted_probability"] = levelDf[
                    "predicted_probability"].apply(
                        lambda x: humanize.apnumber(x * 100) + "%"
                        if x * 100 >= 10 else str(int(x * 100)) + "%")
                uidTableData.append(levelDf[:5])
            uidTableData = pd.concat(uidTableData)
            uidTableData = [list(arr) for arr in list(uidTableData.values)]
            uidTableData = [[uidCol, "Probability", result_column]
                            ] + uidTableData
            uidTable = TableData()
            uidTable.set_table_width(25)
            uidTable.set_table_data(uidTableData)
            uidTable.set_table_type("normalHideColumn")
            self._result_setter.set_unique_identifier_table(
                json.loads(
                    CommonUtils.convert_python_object_to_json(uidTable)))

        _report_stage("prediction")

        print("STARTING DIMENSION ANALYSIS ...")
        columns_to_drop = []
        columns_to_keep = self._dataframe_context.get_score_consider_columns()
        if columns_to_keep is None:
            # Treat "not configured" like an empty selection instead of
            # failing on len(None).
            columns_to_keep = []

        if len(columns_to_keep) > 0:
            columns_to_drop = list(set(df.columns) - set(columns_to_keep))
        else:
            columns_to_drop += ["predicted_probability"]

        scored_df = transformed.select(feature_columns + [result_column])
        columns_to_drop = [
            x for x in columns_to_drop if x in scored_df.columns
        ]
        modified_df = scored_df.select(
            [x for x in scored_df.columns if x not in columns_to_drop])
        resultColLevelCount = dict(
            modified_df.groupby(result_column).count().collect())
        self._metaParser.update_column_dict(
            result_column, {
                "LevelCount": resultColLevelCount,
                "numberOfUniqueValues": len(resultColLevelCount.keys())
            })
        self._dataframe_context.set_story_on_scored_data(True)

        self._dataframe_context.update_consider_columns(columns_to_keep)
        df_helper = DataFrameHelper(modified_df, self._dataframe_context,
                                    self._metaParser)
        df_helper.set_params()
        spark_scored_df = df_helper.get_data_frame()

        if len(predictedClasses) >= 2:
            try:
                df_decision_tree_obj = DecisionTrees(
                    spark_scored_df,
                    df_helper,
                    self._dataframe_context,
                    self._spark,
                    self._metaParser,
                    scriptWeight=self._scriptWeightDict,
                    analysisName=self._analysisName).test_all(
                        dimension_columns=[result_column])
                narratives_obj = CommonUtils.as_dict(
                    DecisionTreeNarrative(result_column,
                                          df_decision_tree_obj,
                                          self._dataframe_helper,
                                          self._dataframe_context,
                                          self._metaParser,
                                          self._result_setter,
                                          story_narrative=None,
                                          analysisName=self._analysisName,
                                          scriptWeight=self._scriptWeightDict))
                print(narratives_obj)
            except Exception as e:
                # Best effort: a failed decision-tree story must not abort
                # scoring, but the reason is surfaced.
                print("DecisionTree Analysis Failed ", str(e))
        else:
            data_dict = {
                "npred": len(predictedClasses),
                "nactual": len(labelMappingDict.values())
            }

            if data_dict["nactual"] > 2:
                # Previously written to before being created, which raised
                # UnboundLocalError on this branch.
                levelCountDict = {}
                levelCountDict[predictedClasses[0]] = resultColLevelCount[
                    predictedClasses[0]]
                levelCountDict["Others"] = sum([
                    v for k, v in resultColLevelCount.items()
                    if k != predictedClasses[0]
                ])
            else:
                # NOTE(review): this aliases the dict already handed to
                # update_column_dict above, so the zero-count entry added
                # below is visible through that reference too -- kept as-is.
                levelCountDict = resultColLevelCount
                otherClass = list(
                    set(labelMappingDict.values()) - set(predictedClasses))[0]
                levelCountDict[otherClass] = 0

                print(levelCountDict)

            total = float(
                sum([x for x in levelCountDict.values() if x is not None]))
            levelCountTuple = [({
                "name":
                k,
                "count":
                v,
                "percentage":
                humanize.apnumber(v * 100 / total) + "%"
            }) for k, v in levelCountDict.items() if v is not None]
            levelCountTuple = sorted(levelCountTuple,
                                     key=lambda x: x["count"],
                                     reverse=True)
            data_dict["blockSplitter"] = "|~NEWBLOCK~|"
            data_dict["targetcol"] = result_column
            data_dict["nlevel"] = len(levelCountDict.keys())
            data_dict["topLevel"] = levelCountTuple[0]
            data_dict["secondLevel"] = levelCountTuple[1]
            maincardSummary = NarrativesUtils.get_template_output(
                "/apps/", 'scorewithoutdtree.html', data_dict)

            main_card = NormalCard()
            main_card_data = []
            main_card_narrative = NarrativesUtils.block_splitter(
                maincardSummary, "|~NEWBLOCK~|")
            main_card_data += main_card_narrative

            chartData = NormalChartData([levelCountDict]).get_data()
            chartJson = ChartJson(data=chartData)
            chartJson.set_title(result_column)
            chartJson.set_chart_type("donut")
            mainCardChart = C3ChartData(data=chartJson)
            mainCardChart.set_width_percent(33)
            main_card_data.append(mainCardChart)

            uidTable = self._result_setter.get_unique_identifier_table()
            if uidTable is not None:
                main_card_data.append(uidTable)
            main_card.set_card_data(main_card_data)
            main_card.set_card_name(
                "Predicting Key Drivers of {}".format(result_column))
            self._result_setter.set_score_dtree_cards([main_card], {})
コード例 #3
0
    def Predict(self):
        """Score the current data frame with the saved model and run follow-up analyses.

        Loads the fitted pipeline and the classifier selected by
        ``self._classifier`` (``"lr"`` or ``"OneVsRest"``), transforms
        ``self._data_frame``, maps predictions back to label strings, writes
        the scored rows to ``<score_path>/ScoredData/data.csv`` and the score
        summary to ``<score_path>/Summary/summary.json``. It then runs the
        frequency and chi-square analyses on the scored data; each analysis is
        best effort and only prints a message if it fails.

        Raises:
            ValueError: if ``self._classifier`` is neither ``"lr"`` nor
                ``"OneVsRest"``.
        """
        SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                            sparkSession=self._spark)
        level_counts_train = self._dataframe_context.get_level_count_dict()
        categorical_columns = self._dataframe_helper.get_string_columns()
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        time_dimension_columns = self._dataframe_helper.get_timestamp_columns()
        result_column = self._dataframe_context.get_result_column()
        categorical_columns = [
            x for x in categorical_columns if x != result_column
        ]

        level_counts_score = CommonUtils.get_level_count_dict(
            self._data_frame,
            categorical_columns,
            self._dataframe_context.get_column_separator(),
            output_type="dict",
            dataType="spark")
        # True only when every training key is present in the scoring data
        # with an equal value.
        # NOTE(review): this flag is computed but never read in this method.
        dataSanity = all(
            key in level_counts_score
            and level_counts_train[key] == level_counts_score[key]
            for key in level_counts_train)

        score_path = self._dataframe_context.get_score_path()
        score_data_path = score_path + "/ScoredData/data.csv"
        trained_model_path = self._dataframe_context.get_model_path()
        if trained_model_path.endswith(".pkl"):
            # Swap the ".pkl" file name for the sibling "model" entry.
            trained_model_path = "/".join(
                trained_model_path.split("/")[:-1]) + "/model"
        pipeline_path = "/".join(
            trained_model_path.split("/")[:-1]) + "/pipeline"
        score_summary_path = score_path + "/Summary/summary.json"

        pipelineModel = MLUtils.load_pipeline(pipeline_path)
        if self._classifier == "OneVsRest":
            trained_model = MLUtils.load_one_vs_rest_model(trained_model_path)
        elif self._classifier == "lr":
            trained_model = MLUtils.load_logistic_model(trained_model_path)
        else:
            # Previously fell through and failed later with an
            # UnboundLocalError on ``trained_model``.
            raise ValueError(
                "Unsupported classifier %r; expected 'lr' or 'OneVsRest'" %
                (self._classifier, ))

        df = self._data_frame
        indexed = pipelineModel.transform(df)
        transformed = trained_model.transform(indexed)
        label_indexer_dict = MLUtils.read_string_indexer_mapping(
            pipeline_path, SQLctx)
        prediction_to_levels = udf(lambda x: label_indexer_dict[x],
                                   StringType())
        transformed = transformed.withColumn(
            result_column, prediction_to_levels(transformed.prediction))

        feature_columns = (categorical_columns + time_dimension_columns +
                           numerical_columns)
        self._score_summary["result_column"] = result_column
        if "probability" in transformed.columns:
            probability_dataframe = transformed.select(
                [result_column, "probability"]).toPandas()
            probability_dataframe = probability_dataframe.rename(
                index=str, columns={result_column: "predicted_class"})
            # Confidence of a row = largest entry of its probability vector.
            probability_dataframe[
                "predicted_probability"] = probability_dataframe[
                    "probability"].apply(lambda x: max(x))
            self._score_summary[
                "prediction_split"] = MLUtils.calculate_scored_probability_stats(
                    probability_dataframe)
            scored_dataframe = transformed.select(
                feature_columns + [result_column, "probability"]).toPandas()
        else:
            self._score_summary["prediction_split"] = []
            scored_dataframe = transformed.select(
                feature_columns + [result_column]).toPandas()

        if score_data_path.startswith("file"):
            # Drops the first 7 characters (the length of "file://").
            score_data_path = score_data_path[7:]
        scored_dataframe.to_csv(score_data_path, header=True, index=False)
        CommonUtils.write_to_file(
            score_summary_path,
            json.dumps({"scoreSummary": self._score_summary}))

        print("STARTING DIMENSION ANALYSIS ...")
        columns_to_keep = []
        columns_to_drop = []
        considercolumnstype = self._dataframe_context.get_score_consider_columns_type(
        )
        considercolumns = self._dataframe_context.get_score_consider_columns()
        if considercolumnstype is not None and considercolumns is not None:
            if considercolumnstype == ["excluding"]:
                columns_to_drop = considercolumns
            elif considercolumnstype == ["including"]:
                columns_to_keep = considercolumns
        if len(columns_to_keep) > 0:
            columns_to_drop = list(set(df.columns) - set(columns_to_keep))
        scored_df = transformed.select(feature_columns + [result_column])

        # Rebuilds the scored frame from a pandas copy via the SQLContext
        # created at the top of the method (no second context is needed).
        spark_scored_df = SQLctx.createDataFrame(scored_df.toPandas())
        columns_to_drop = [
            x for x in columns_to_drop if x in spark_scored_df.columns
        ]
        modified_df = spark_scored_df.select(
            [x for x in spark_scored_df.columns if x not in columns_to_drop])
        df_helper = DataFrameHelper(modified_df, self._dataframe_context)
        df_helper.set_params()
        df = df_helper.get_data_frame()

        try:
            fs = time.time()
            narratives_file = score_path + "/narratives/FreqDimension/data.json"
            result_file = score_path + "/results/FreqDimension/data.json"
            df_freq_dimension_obj = FreqDimensions(
                spark_scored_df, df_helper, self._dataframe_context).test_all(
                    dimension_columns=[result_column])
            df_freq_dimension_result = CommonUtils.as_dict(
                df_freq_dimension_obj)
            CommonUtils.write_to_file(result_file,
                                      json.dumps(df_freq_dimension_result))
            narratives_obj = DimensionColumnNarrative(result_column, df_helper,
                                                      self._dataframe_context,
                                                      df_freq_dimension_obj)
            narratives = CommonUtils.as_dict(narratives_obj)
            CommonUtils.write_to_file(narratives_file, json.dumps(narratives))
            print("Frequency Analysis Done in  %s  seconds." %
                  (time.time() - fs, ))
        except Exception as e:
            # Best effort, but no longer traps KeyboardInterrupt/SystemExit
            # and no longer hides the cause.
            print("Frequency Analysis Failed %s" % (e, ))

        try:
            fs = time.time()
            narratives_file = score_path + "/narratives/ChiSquare/data.json"
            result_file = score_path + "/results/ChiSquare/data.json"
            df_chisquare_obj = ChiSquare(df, df_helper,
                                         self._dataframe_context).test_all(
                                             dimension_columns=[result_column])
            df_chisquare_result = CommonUtils.as_dict(df_chisquare_obj)
            CommonUtils.write_to_file(result_file,
                                      json.dumps(df_chisquare_result))
            chisquare_narratives = CommonUtils.as_dict(
                ChiSquareNarratives(df_helper, df_chisquare_obj,
                                    self._dataframe_context, df))
            CommonUtils.write_to_file(narratives_file,
                                      json.dumps(chisquare_narratives))
            print("ChiSquare Analysis Done in  %s  seconds." %
                  (time.time() - fs, ))
        except Exception as e:
            print("ChiSquare Analysis Failed %s" % (e, ))