Example No. 1
 def Run(self):
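     # Run a one-way ANOVA test for the configured result column, then write both
     # the raw result and the AnovaNarratives summary as JSON.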
     df_anova_obj = OneWayAnova(self._data_frame, self._dataframe_helper, self._dataframe_context).test_all(measure_columns=(self._dataframe_context.get_result_column(),))
     df_anova_result = CommonUtils.as_dict(df_anova_obj)
     #print 'RESULT: %s' % (json.dumps(df_anova_result, indent=2))
     DataWriter.write_dict_as_json(self._spark, {'RESULT': json.dumps(df_anova_result)}, self._dataframe_context.get_result_file()+'OneWayAnova/')
     #print 'Written Anova Result'
     anova_narratives = CommonUtils.as_dict(AnovaNarratives(len(self._dataframe_helper.get_string_columns()), df_anova_obj, self._dataframe_helper))
     #print "Narratives: %s" % (json.dumps(anova_narratives, indent=2))
     DataWriter.write_dict_as_json(self._spark, anova_narratives, self._dataframe_context.get_narratives_file()+'OneWayAnova/')
Example No. 2
    def Run(self):
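        # Fit a linear regression on the result column, time the fit, and build
        # RegressionNarrative from the fitted model and the precomputed correlations.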
        fs = time.time()
        regression_result_obj = LinearRegression(self._data_frame, self._dataframe_helper, self._dataframe_context,self._metaParser,self._spark).fit(self._dataframe_context.get_result_column())
        regression_result = CommonUtils.as_dict(regression_result_obj)
        print("time taken for regression ", time.time() - fs, " seconds")

        # print 'Regression result: %s' % (json.dumps(regression_result, indent=2))
        # DataWriter.write_dict_as_json(self._spark, regression_result, self._dataframe_context.get_result_file()+'Regression/')

        # regression_narratives_obj = LinearRegressionNarrative(len(self._dataframe_helper.get_numeric_columns()),regression_result_obj, self._correlations,self._dataframe_helper)
        # regression_narratives = CommonUtils.as_dict(regression_narratives_obj)

        regression_narratives_obj = RegressionNarrative(self._dataframe_helper,self._dataframe_context,self._result_setter,self._spark,regression_result_obj,self._correlations,self._story_narrative,self._metaParser)
        regression_narratives = CommonUtils.as_dict(regression_narratives_obj.narratives)
Example No. 3
 def Run(self):
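     # Build the time-series trend narrative for the dataset and serialize it to a dict.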
     trend_narratives_obj = TimeSeriesNarrative(self._dataframe_helper,
                                                self._dataframe_context,
                                                self._result_setter,
                                                self._spark,
                                                self._story_narrative,
                                                self._metaParser)
     trend_narratives = CommonUtils.as_dict(trend_narratives_obj)
Example No. 4
    def Run(self):
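        # Compute correlations for the result column and return the raw Correlation object.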
        correlations_obj = Correlation(
            self._data_frame, self._dataframe_helper,
            self._dataframe_context).correlations_for_one_column(
                self._dataframe_context.get_result_column())
        correlations_result = CommonUtils.as_dict(correlations_obj)

        # print 'RESULT: %s' % (json.dumps(correlations_result, indent=2))
        # DataWriter.write_dict_as_json(self._spark, correlations_result, self._dataframe_context.get_result_file()+'Correlation/')
        return correlations_obj
Example No. 5
    def Run(self):
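        # Run frequency analysis on the result dimension, then build its column narrative.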
        df_freq_dimension_obj = FreqDimensions(
            self._data_frame, self._dataframe_helper,
            self._dataframe_context).test_all(dimension_columns=(
                self._dataframe_context.get_result_column(), ))
        # df_freq_dimension_result = CommonUtils.as_dict(df_freq_dimension_obj)
        # print 'RESULT: %s' % (json.dumps(df_freq_dimension_result, indent=2))
        # DataWriter.write_dict_as_json(self._spark, df_freq_dimension_result, self._dataframe_context.get_result_file()+'FreqDimension/')

        # Narratives
        narratives_obj = DimensionColumnNarrative(
            self._data_frame, self._dataframe_context.get_result_column(),
            self._dataframe_helper, self._dataframe_context,
            df_freq_dimension_obj, self._story_narrative, self._result_setter)
        narratives = CommonUtils.as_dict(narratives_obj)
Example No. 6
 def Run(self):
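     # Run chi-square tests of the result dimension against the other columns and,
     # if any results were produced, build the chi-square narratives.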
     df_chisquare_obj = ChiSquare(
         self._data_frame, self._dataframe_helper, self._dataframe_context,
         self._metaParser).test_all(dimension_columns=(
             self._dataframe_context.get_result_column(), ))
     # df_chisquare_result = CommonUtils.as_dict(df_chisquare_obj)
     # print 'RESULT: %s' % (json.dumps(df_chisquare_result, indent=2))
     # DataWriter.write_dict_as_json(self._spark, df_chisquare_result, self._dataframe_context.get_result_file()+'ChiSquare/')
     # Narratives
     # print self._data_frame.select('Sales').show()
     if df_chisquare_obj.get_result() != {}:
         chisquare_narratives = CommonUtils.as_dict(
             ChiSquareNarratives(self._dataframe_helper, df_chisquare_obj,
                                 self._spark, self._dataframe_context,
                                 self._data_frame, self._story_narrative,
                                 self._result_setter))
Example No. 7
    def Predict(self):
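        # sklearn scoring path for a trained Random Forest classifier: load the pickled
        # model, align dummy/missing feature columns, predict classes and probabilities,
        # persist the scored data, build a per-class UID table, then run decision-tree
        # analysis on the scored data (or emit a single summary card when only one class
        # is predicted).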
        self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight(
        )
        self._scriptStages = {
            "initialization": {
                "summary": "Initialized the Random Forest Scripts",
                "weight": 2
            },
            "prediction": {
                "summary": "Random Forest Model Prediction Finished",
                "weight": 2
            },
            "frequency": {
                "summary": "descriptive analysis finished",
                "weight": 2
            },
            "chisquare": {
                "summary": "chi Square analysis finished",
                "weight": 4
            },
            "completion": {
                "summary": "all analysis finished",
                "weight": 4
            },
        }

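        # Each stage advances the completion status by (script total weight * stage weight) / 10.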
        self._completionStatus += old_div(
            self._scriptWeightDict[self._analysisName]["total"] *
            self._scriptStages["initialization"]["weight"], 10)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "initialization",\
                                    "info",\
                                    self._scriptStages["initialization"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsg)
        self._dataframe_context.update_completion_status(
            self._completionStatus)
        # Match with the level_counts and then clean the data
        dataSanity = True
        level_counts_train = self._dataframe_context.get_level_count_dict()
        cat_cols = self._dataframe_helper.get_string_columns()
        # level_counts_score = CommonUtils.get_level_count_dict(self._data_frame,cat_cols,self._dataframe_context.get_column_separator(),output_type="dict")
        # if level_counts_train != {}:
        #     for key in level_counts_train:
        #         if key in level_counts_score:
        #             if level_counts_train[key] != level_counts_score[key]:
        #                 dataSanity = False
        #         else:
        #             dataSanity = False
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()
        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})
        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns) - set(allDateCols))
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        result_column = self._dataframe_context.get_result_column()
        test_data_path = self._dataframe_context.get_input_file()

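        # The scoring path depends on the ML environment; the Spark branch is a stub here,
        # so the code after this block assumes the sklearn branch populated df and score.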
        if self._mlEnv == "spark":
            pass
        elif self._mlEnv == "sklearn":

            score_data_path = self._dataframe_context.get_score_path(
            ) + "/data.csv"
            if score_data_path.startswith("file"):
                score_data_path = score_data_path[7:]
            trained_model_path = self._dataframe_context.get_model_path()
            trained_model_path += "/" + self._dataframe_context.get_model_for_scoring(
            ) + ".pkl"
            if trained_model_path.startswith("file"):
                trained_model_path = trained_model_path[7:]
            score_summary_path = self._dataframe_context.get_score_path(
            ) + "/Summary/summary.json"
            if score_summary_path.startswith("file"):
                score_summary_path = score_summary_path[7:]
            trained_model = joblib.load(trained_model_path)
            # pandas_df = self._data_frame.toPandas()
            df = self._data_frame.toPandas()
            model_columns = self._dataframe_context.get_model_features()
            pandas_df = MLUtils.create_dummy_columns(
                df, [x for x in categorical_columns if x != result_column])
            pandas_df = MLUtils.fill_missing_columns(pandas_df, model_columns,
                                                     result_column)
            if uid_col:
                pandas_df = pandas_df[[
                    x for x in pandas_df.columns if x != uid_col
                ]]
            y_score = trained_model.predict(pandas_df)
            y_prob = trained_model.predict_proba(pandas_df)
            y_prob = MLUtils.calculate_predicted_probability(y_prob)
            y_prob = list([round(x, 2) for x in y_prob])
            score = {
                "predicted_class": y_score,
                "predicted_probability": y_prob
            }

        df["predicted_class"] = score["predicted_class"]
        labelMappingDict = self._dataframe_context.get_label_map()
        df["predicted_class"] = df["predicted_class"].apply(
            lambda x: labelMappingDict[x] if x != None else "NA")
        df["predicted_probability"] = score["predicted_probability"]
        self._score_summary[
            "prediction_split"] = MLUtils.calculate_scored_probability_stats(
                df)
        self._score_summary["result_column"] = result_column
        if result_column in df.columns:
            df.drop(result_column, axis=1, inplace=True)
        df = df.rename(index=str, columns={"predicted_class": result_column})
        df.to_csv(score_data_path, header=True, index=False)
        uidCol = self._dataframe_context.get_uid_column()
        if uidCol == None:
            uidCols = self._metaParser.get_suggested_uid_columns()
            if len(uidCols) > 0:
                uidCol = uidCols[0]
        uidTableData = []
        predictedClasses = list(df[result_column].unique())
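        # For each predicted class, keep the five rows with the highest predicted
        # probability and publish them as a unique-identifier table.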
        if uidCol:
            if uidCol in df.columns:
                for level in predictedClasses:
                    levelDf = df[df[result_column] == level]
                    levelDf = levelDf[[
                        uidCol, "predicted_probability", result_column
                    ]]
                    levelDf.sort_values(by="predicted_probability",
                                        ascending=False,
                                        inplace=True)
                    levelDf["predicted_probability"] = levelDf[
                        "predicted_probability"].apply(
                            lambda x: humanize.apnumber(x * 100) + "%"
                            if x * 100 >= 10 else str(int(x * 100)) + "%")
                    uidTableData.append(levelDf[:5])
                uidTableData = pd.concat(uidTableData)
                uidTableData = [list(arr) for arr in list(uidTableData.values)]
                uidTableData = [[uidCol, "Probability", result_column]
                                ] + uidTableData
                uidTable = TableData()
                uidTable.set_table_width(25)
                uidTable.set_table_data(uidTableData)
                uidTable.set_table_type("normalHideColumn")
                self._result_setter.set_unique_identifier_table(
                    json.loads(
                        CommonUtils.convert_python_object_to_json(uidTable)))

        self._completionStatus += old_div(
            self._scriptWeightDict[self._analysisName]["total"] *
            self._scriptStages["prediction"]["weight"], 10)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "prediction",\
                                    "info",\
                                    self._scriptStages["prediction"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsg)
        self._dataframe_context.update_completion_status(
            self._completionStatus)
        # CommonUtils.write_to_file(score_summary_path,json.dumps({"scoreSummary":self._score_summary}))

        print("STARTING DIMENSION ANALYSIS ...")
        columns_to_keep = []
        columns_to_drop = []

        # considercolumnstype = self._dataframe_context.get_score_consider_columns_type()
        # considercolumns = self._dataframe_context.get_score_consider_columns()
        # if considercolumnstype != None:
        #     if considercolumns != None:
        #         if considercolumnstype == ["excluding"]:
        #             columns_to_drop = considercolumns
        #         elif considercolumnstype == ["including"]:
        #             columns_to_keep = considercolumns

        columns_to_keep = self._dataframe_context.get_score_consider_columns()
        if len(columns_to_keep) > 0:
            columns_to_drop = list(set(df.columns) - set(columns_to_keep))
        else:
            columns_to_drop += ["predicted_probability"]
        columns_to_drop = [
            x for x in columns_to_drop
            if x in df.columns and x != result_column
        ]
        print("columns_to_drop", columns_to_drop)
        df.drop(columns_to_drop, axis=1, inplace=True)

        resultColLevelCount = dict(df[result_column].value_counts())
        # self._metaParser.update_level_counts(result_column,resultColLevelCount)
        self._metaParser.update_column_dict(
            result_column, {
                "LevelCount": resultColLevelCount,
                "numberOfUniqueValues": len(list(resultColLevelCount.keys()))
            })
        self._dataframe_context.set_story_on_scored_data(True)
        SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                            sparkSession=self._spark)
        spark_scored_df = SQLctx.createDataFrame(df)
        # spark_scored_df.write.csv(score_data_path+"/data",mode="overwrite",header=True)
        # TODO update metadata for the newly created dataframe
        self._dataframe_context.update_consider_columns(columns_to_keep)
        df_helper = DataFrameHelper(spark_scored_df, self._dataframe_context,
                                    self._metaParser)
        df_helper.set_params()
        spark_scored_df = df_helper.get_data_frame()
        # try:
        #     fs = time.time()
        #     narratives_file = self._dataframe_context.get_score_path()+"/narratives/FreqDimension/data.json"
        #     if narratives_file.startswith("file"):
        #         narratives_file = narratives_file[7:]
        #     result_file = self._dataframe_context.get_score_path()+"/results/FreqDimension/data.json"
        #     if result_file.startswith("file"):
        #         result_file = result_file[7:]
        #     init_freq_dim = FreqDimensions(df, df_helper, self._dataframe_context,scriptWeight=self._scriptWeightDict,analysisName=self._analysisName)
        #     df_freq_dimension_obj = init_freq_dim.test_all(dimension_columns=[result_column])
        #     df_freq_dimension_result = CommonUtils.as_dict(df_freq_dimension_obj)
        #     narratives_obj = DimensionColumnNarrative(result_column, df_helper, self._dataframe_context, df_freq_dimension_obj,self._result_setter,self._prediction_narrative,scriptWeight=self._scriptWeightDict,analysisName=self._analysisName)
        #     narratives = CommonUtils.as_dict(narratives_obj)
        #
        #     print "Frequency Analysis Done in ", time.time() - fs,  " seconds."
        #     self._completionStatus += self._scriptWeightDict[self._analysisName]["total"]*self._scriptStages["frequency"]["weight"]/10
        #     progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
        #                                 "frequency",\
        #                                 "info",\
        #                                 self._scriptStages["frequency"]["summary"],\
        #                                 self._completionStatus,\
        #                                 self._completionStatus)
        #     CommonUtils.save_progress_message(self._messageURL,progressMessage,ignore=self._ignoreMsg)
        #     self._dataframe_context.update_completion_status(self._completionStatus)
        #     print "Frequency ",self._completionStatus
        # except:
        #     print "Frequency Analysis Failed "
        #
        # try:
        #     fs = time.time()
        #     narratives_file = self._dataframe_context.get_score_path()+"/narratives/ChiSquare/data.json"
        #     if narratives_file.startswith("file"):
        #         narratives_file = narratives_file[7:]
        #     result_file = self._dataframe_context.get_score_path()+"/results/ChiSquare/data.json"
        #     if result_file.startswith("file"):
        #         result_file = result_file[7:]
        #     init_chisquare_obj = ChiSquare(df, df_helper, self._dataframe_context,scriptWeight=self._scriptWeightDict,analysisName=self._analysisName)
        #     df_chisquare_obj = init_chisquare_obj.test_all(dimension_columns= [result_column])
        #     df_chisquare_result = CommonUtils.as_dict(df_chisquare_obj)
        #     chisquare_narratives = CommonUtils.as_dict(ChiSquareNarratives(df_helper, df_chisquare_obj, self._dataframe_context,df,self._prediction_narrative,self._result_setter,scriptWeight=self._scriptWeightDict,analysisName=self._analysisName))
        # except:
        #     print "ChiSquare Analysis Failed "
        if len(predictedClasses) >= 2:
            try:
                fs = time.time()
                df_decision_tree_obj = DecisionTrees(
                    spark_scored_df,
                    df_helper,
                    self._dataframe_context,
                    self._spark,
                    self._metaParser,
                    scriptWeight=self._scriptWeightDict,
                    analysisName=self._analysisName).test_all(
                        dimension_columns=[result_column])
                narratives_obj = CommonUtils.as_dict(
                    DecisionTreeNarrative(result_column,
                                          df_decision_tree_obj,
                                          self._dataframe_helper,
                                          self._dataframe_context,
                                          self._metaParser,
                                          self._result_setter,
                                          story_narrative=None,
                                          analysisName=self._analysisName,
                                          scriptWeight=self._scriptWeightDict))
                print(narratives_obj)
            except:
                print("DecisionTree Analysis Failed ")
        else:
            data_dict = {
                "npred": len(predictedClasses),
                "nactual": len(list(labelMappingDict.values()))
            }
            levelCountDict = {}  # initialise before the branches below populate it
            if data_dict["nactual"] > 2:
                levelCountDict[predictedClasses[0]] = resultColLevelCount[
                    predictedClasses[0]]
                levelCountDict["Others"] = sum([
                    v for k, v in list(resultColLevelCount.items())
                    if k != predictedClasses[0]
                ])
            else:
                levelCountDict = resultColLevelCount
                otherClass = list(
                    set(labelMappingDict.values()) - set(predictedClasses))[0]
                levelCountDict[otherClass] = 0

                print(levelCountDict)

            total = float(
                sum([x for x in list(levelCountDict.values()) if x != None]))
            levelCountTuple = [({
                "name":
                k,
                "count":
                v,
                "percentage":
                humanize.apnumber(old_div(v * 100, total)) +
                "%" if old_div(v * 100, total) >= 10 else
                str(int(old_div(v * 100, total))) + "%"
            }) for k, v in list(levelCountDict.items()) if v != None]
            levelCountTuple = sorted(levelCountTuple,
                                     key=lambda x: x["count"],
                                     reverse=True)
            data_dict["blockSplitter"] = "|~NEWBLOCK~|"
            data_dict["targetcol"] = result_column
            data_dict["nlevel"] = len(list(levelCountDict.keys()))
            data_dict["topLevel"] = levelCountTuple[0]
            data_dict["secondLevel"] = levelCountTuple[1]
            maincardSummary = NarrativesUtils.get_template_output(
                "/apps/", 'scorewithoutdtree.html', data_dict)

            main_card = NormalCard()
            main_card_data = []
            main_card_narrative = NarrativesUtils.block_splitter(
                maincardSummary, "|~NEWBLOCK~|")
            main_card_data += main_card_narrative

            chartData = NormalChartData([levelCountDict]).get_data()
            chartJson = ChartJson(data=chartData)
            chartJson.set_title(result_column)
            chartJson.set_chart_type("donut")
            mainCardChart = C3ChartData(data=chartJson)
            mainCardChart.set_width_percent(33)
            main_card_data.append(mainCardChart)

            uidTable = self._result_setter.get_unique_identifier_table()
            if uidTable != None:
                main_card_data.append(uidTable)
            main_card.set_card_data(main_card_data)
            main_card.set_card_name(
                "Predicting Key Drivers of {}".format(result_column))
            self._result_setter.set_score_dtree_cards([main_card], {})
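# A hedged, standalone sketch (not part of the source) of the probability formatting
# applied to "predicted_probability" in the Predict() methods above and below:
# percentages of 10 or more go through humanize.apnumber (which simply stringifies
# integers of 10 or more), while smaller values are truncated to whole-number percentages.
import humanize

def format_predicted_probability(p):
    # p is a class probability in [0, 1]; the function name is illustrative only.
    pct = p * 100
    if pct >= 10:
        return humanize.apnumber(pct) + "%"
    return str(int(pct)) + "%"

# e.g. format_predicted_probability(0.87) -> "87%", format_predicted_probability(0.04) -> "4%"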
Example No. 8
 def Run(self):
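     # Generate the executive summary narrative for the current analysis.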
     print("executive summary script called")
     executive_summary_obj = ExecutiveSummaryNarrative(
         self._dataframe_helper, self._dataframe_context,
         self._result_setter, self._spark)
     executive_summary = CommonUtils.as_dict(executive_summary_obj)
Example No. 9
    def Predict(self):
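        # Spark ML scoring path for a trained Naive Bayes pipeline: transform the scoring
        # data, map prediction indices back to class labels, compute probability-split
        # stats, write the scored CSV, build a per-class UID table, then run decision-tree
        # analysis (or a one-class summary card).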
        self._scriptWeightDict = self._dataframe_context.get_ml_model_prediction_weight(
        )
        self._scriptStages = {
            "initialization": {
                "summary": "Initialized the Naive Bayes Scripts",
                "weight": 2
            },
            "prediction": {
                "summary": "Spark ML Naive Bayes Model Prediction Finished",
                "weight": 2
            },
            "frequency": {
                "summary": "descriptive analysis finished",
                "weight": 2
            },
            "chisquare": {
                "summary": "chi Square analysis finished",
                "weight": 4
            },
            "completion": {
                "summary": "all analysis finished",
                "weight": 4
            },
        }

        self._completionStatus += self._scriptWeightDict[self._analysisName][
            "total"] * self._scriptStages["initialization"]["weight"] / 10
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "initialization",\
                                    "info",\
                                    self._scriptStages["initialization"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL, progressMessage)
        self._dataframe_context.update_completion_status(
            self._completionStatus)

        SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                            sparkSession=self._spark)
        dataSanity = True
        level_counts_train = self._dataframe_context.get_level_count_dict()
        categorical_columns = self._dataframe_helper.get_string_columns()
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        time_dimension_columns = self._dataframe_helper.get_timestamp_columns()
        result_column = self._dataframe_context.get_result_column()
        categorical_columns = [
            x for x in categorical_columns if x != result_column
        ]

        level_counts_score = CommonUtils.get_level_count_dict(
            self._data_frame,
            categorical_columns,
            self._dataframe_context.get_column_separator(),
            output_type="dict",
            dataType="spark")
        for key in level_counts_train:
            if key in level_counts_score:
                if level_counts_train[key] != level_counts_score[key]:
                    dataSanity = False
            else:
                dataSanity = False

        test_data_path = self._dataframe_context.get_input_file()
        score_data_path = self._dataframe_context.get_score_path(
        ) + "/data.csv"
        trained_model_path = self._dataframe_context.get_model_path()
        trained_model_path = "/".join(
            trained_model_path.split("/")[:-1]
        ) + "/" + self._slug + "/" + self._dataframe_context.get_model_for_scoring(
        )
        # score_summary_path = self._dataframe_context.get_score_path()+"/Summary/summary.json"

        pipelineModel = MLUtils.load_pipeline(trained_model_path)

        df = self._data_frame
        transformed = pipelineModel.transform(df)
        label_indexer_dict = MLUtils.read_string_indexer_mapping(
            trained_model_path, SQLctx)
        prediction_to_levels = udf(lambda x: label_indexer_dict[x],
                                   StringType())
        transformed = transformed.withColumn(
            result_column, prediction_to_levels(transformed.prediction))

        if "probability" in transformed.columns:
            probability_dataframe = transformed.select(
                [result_column, "probability"]).toPandas()
            probability_dataframe = probability_dataframe.rename(
                index=str, columns={result_column: "predicted_class"})
            probability_dataframe[
                "predicted_probability"] = probability_dataframe[
                    "probability"].apply(lambda x: max(x))
            self._score_summary[
                "prediction_split"] = MLUtils.calculate_scored_probability_stats(
                    probability_dataframe)
            self._score_summary["result_column"] = result_column
            scored_dataframe = transformed.select(
                categorical_columns + time_dimension_columns +
                numerical_columns + [result_column, "probability"]).toPandas()
            scored_dataframe['predicted_probability'] = probability_dataframe[
                "predicted_probability"].values
            # scored_dataframe = scored_dataframe.rename(index=str, columns={"predicted_probability": "probability"})
        else:
            self._score_summary["prediction_split"] = []
            self._score_summary["result_column"] = result_column
            scored_dataframe = transformed.select(categorical_columns +
                                                  time_dimension_columns +
                                                  numerical_columns +
                                                  [result_column]).toPandas()

        labelMappingDict = self._dataframe_context.get_label_map()
        if score_data_path.startswith("file"):
            score_data_path = score_data_path[7:]
        scored_dataframe.to_csv(score_data_path, header=True, index=False)

        uidCol = self._dataframe_context.get_uid_column()
        if uidCol == None:
            uidCols = self._metaParser.get_suggested_uid_columns()
            if len(uidCols) > 0:
                uidCol = uidCols[0]
        uidTableData = []
        predictedClasses = list(scored_dataframe[result_column].unique())
        if uidCol:
            if uidCol in df.columns:
                for level in predictedClasses:
                    levelDf = scored_dataframe[scored_dataframe[result_column]
                                               == level]
                    levelDf = levelDf[[
                        uidCol, "predicted_probability", result_column
                    ]]
                    levelDf.sort_values(by="predicted_probability",
                                        ascending=False,
                                        inplace=True)
                    levelDf["predicted_probability"] = levelDf[
                        "predicted_probability"].apply(
                            lambda x: humanize.apnumber(x * 100) + "%"
                            if x * 100 >= 10 else str(int(x * 100)) + "%")
                    uidTableData.append(levelDf[:5])
                uidTableData = pd.concat(uidTableData)
                uidTableData = [list(arr) for arr in list(uidTableData.values)]
                uidTableData = [[uidCol, "Probability", result_column]
                                ] + uidTableData
                uidTable = TableData()
                uidTable.set_table_width(25)
                uidTable.set_table_data(uidTableData)
                uidTable.set_table_type("normalHideColumn")
                self._result_setter.set_unique_identifier_table(
                    json.loads(
                        CommonUtils.convert_python_object_to_json(uidTable)))

        self._completionStatus += self._scriptWeightDict[self._analysisName][
            "total"] * self._scriptStages["prediction"]["weight"] / 10
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "prediction",\
                                    "info",\
                                    self._scriptStages["prediction"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL, progressMessage)
        self._dataframe_context.update_completion_status(
            self._completionStatus)

        print("STARTING DIMENSION ANALYSIS ...")
        columns_to_keep = []
        columns_to_drop = []

        columns_to_keep = self._dataframe_context.get_score_consider_columns()

        if len(columns_to_keep) > 0:
            columns_to_drop = list(set(df.columns) - set(columns_to_keep))
        else:
            columns_to_drop += ["predicted_probability"]

        scored_df = transformed.select(categorical_columns +
                                       time_dimension_columns +
                                       numerical_columns + [result_column])
        columns_to_drop = [
            x for x in columns_to_drop if x in scored_df.columns
        ]
        modified_df = scored_df.select(
            [x for x in scored_df.columns if x not in columns_to_drop])
        resultColLevelCount = dict(
            modified_df.groupby(result_column).count().collect())
        self._metaParser.update_column_dict(
            result_column, {
                "LevelCount": resultColLevelCount,
                "numberOfUniqueValues": len(resultColLevelCount.keys())
            })
        self._dataframe_context.set_story_on_scored_data(True)

        self._dataframe_context.update_consider_columns(columns_to_keep)
        df_helper = DataFrameHelper(modified_df, self._dataframe_context,
                                    self._metaParser)
        df_helper.set_params()
        spark_scored_df = df_helper.get_data_frame()

        if len(predictedClasses) >= 2:
            try:
                fs = time.time()
                df_decision_tree_obj = DecisionTrees(
                    spark_scored_df,
                    df_helper,
                    self._dataframe_context,
                    self._spark,
                    self._metaParser,
                    scriptWeight=self._scriptWeightDict,
                    analysisName=self._analysisName).test_all(
                        dimension_columns=[result_column])
                narratives_obj = CommonUtils.as_dict(
                    DecisionTreeNarrative(result_column,
                                          df_decision_tree_obj,
                                          self._dataframe_helper,
                                          self._dataframe_context,
                                          self._metaParser,
                                          self._result_setter,
                                          story_narrative=None,
                                          analysisName=self._analysisName,
                                          scriptWeight=self._scriptWeightDict))
                print(narratives_obj)
            except Exception as e:
                print("DecisionTree Analysis Failed ", str(e))
        else:
            data_dict = {
                "npred": len(predictedClasses),
                "nactual": len(labelMappingDict.values())
            }

            levelCountDict = {}  # initialise before the branches below populate it
            if data_dict["nactual"] > 2:
                levelCountDict[predictedClasses[0]] = resultColLevelCount[
                    predictedClasses[0]]
                levelCountDict["Others"] = sum([
                    v for k, v in resultColLevelCount.items()
                    if k != predictedClasses[0]
                ])
            else:
                levelCountDict = resultColLevelCount
                otherClass = list(
                    set(labelMappingDict.values()) - set(predictedClasses))[0]
                levelCountDict[otherClass] = 0

                print(levelCountDict)

            total = float(
                sum([x for x in levelCountDict.values() if x != None]))
            levelCountTuple = [({
                "name":
                k,
                "count":
                v,
                "percentage":
                humanize.apnumber(v * 100 / total) + "%"
            }) for k, v in levelCountDict.items() if v != None]
            levelCountTuple = sorted(levelCountTuple,
                                     key=lambda x: x["count"],
                                     reverse=True)
            data_dict["blockSplitter"] = "|~NEWBLOCK~|"
            data_dict["targetcol"] = result_column
            data_dict["nlevel"] = len(levelCountDict.keys())
            data_dict["topLevel"] = levelCountTuple[0]
            data_dict["secondLevel"] = levelCountTuple[1]
            maincardSummary = NarrativesUtils.get_template_output(
                "/apps/", 'scorewithoutdtree.html', data_dict)

            main_card = NormalCard()
            main_card_data = []
            main_card_narrative = NarrativesUtils.block_splitter(
                maincardSummary, "|~NEWBLOCK~|")
            main_card_data += main_card_narrative

            chartData = NormalChartData([levelCountDict]).get_data()
            chartJson = ChartJson(data=chartData)
            chartJson.set_title(result_column)
            chartJson.set_chart_type("donut")
            mainCardChart = C3ChartData(data=chartJson)
            mainCardChart.set_width_percent(33)
            main_card_data.append(mainCardChart)

            uidTable = self._result_setter.get_unique_identifier_table()
            if uidTable != None:
                main_card_data.append(uidTable)
            main_card.set_card_data(main_card_data)
            main_card.set_card_name(
                "Predicting Key Drivers of {}".format(result_column))
            self._result_setter.set_score_dtree_cards([main_card], {})
Example No. 10
    def Predict(self):
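        # Scoring path for a saved Spark ML pipeline plus a OneVsRest or logistic
        # regression model: write the scored data and score summary, then run frequency
        # and chi-square analyses on the scored dataframe.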
        SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                            sparkSession=self._spark)
        dataSanity = True
        level_counts_train = self._dataframe_context.get_level_count_dict()
        categorical_columns = self._dataframe_helper.get_string_columns()
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        time_dimension_columns = self._dataframe_helper.get_timestamp_columns()
        result_column = self._dataframe_context.get_result_column()
        categorical_columns = [
            x for x in categorical_columns if x != result_column
        ]

        level_counts_score = CommonUtils.get_level_count_dict(
            self._data_frame,
            categorical_columns,
            self._dataframe_context.get_column_separator(),
            output_type="dict",
            dataType="spark")
        for key in level_counts_train:
            if key in level_counts_score:
                if level_counts_train[key] != level_counts_score[key]:
                    dataSanity = False
            else:
                dataSanity = False

        test_data_path = self._dataframe_context.get_input_file()
        score_data_path = self._dataframe_context.get_score_path(
        ) + "/ScoredData/data.csv"
        trained_model_path = self._dataframe_context.get_model_path()
        if trained_model_path.endswith(".pkl"):
            trained_model_path = "/".join(
                trained_model_path.split("/")[:-1]) + "/model"
        pipeline_path = "/".join(
            trained_model_path.split("/")[:-1]) + "/pipeline"
        score_summary_path = self._dataframe_context.get_score_path(
        ) + "/Summary/summary.json"

        pipelineModel = MLUtils.load_pipeline(pipeline_path)
        if self._classifier == "OneVsRest":
            trained_model = MLUtils.load_one_vs_rest_model(trained_model_path)
        elif self._classifier == "lr":
            trained_model = MLUtils.load_logistic_model(trained_model_path)

        df = self._data_frame
        indexed = pipelineModel.transform(df)
        transformed = trained_model.transform(indexed)
        label_indexer_dict = MLUtils.read_string_indexer_mapping(
            pipeline_path, SQLctx)
        prediction_to_levels = udf(lambda x: label_indexer_dict[x],
                                   StringType())
        transformed = transformed.withColumn(
            result_column, prediction_to_levels(transformed.prediction))

        # udf_to_calculate_probability = udf(lambda x:max(x[0]))
        # transformed = transformed.withColumn("predicted_probability",udf_to_calculate_probability(transformed.probability))
        # print transformed.select("predicted_probability").show(5)

        if "probability" in transformed.columns:
            probability_dataframe = transformed.select(
                [result_column, "probability"]).toPandas()
            probability_dataframe = probability_dataframe.rename(
                index=str, columns={result_column: "predicted_class"})
            probability_dataframe[
                "predicted_probability"] = probability_dataframe[
                    "probability"].apply(lambda x: max(x))
            self._score_summary[
                "prediction_split"] = MLUtils.calculate_scored_probability_stats(
                    probability_dataframe)
            self._score_summary["result_column"] = result_column
            scored_dataframe = transformed.select(
                categorical_columns + time_dimension_columns +
                numerical_columns + [result_column, "probability"]).toPandas()
            # scored_dataframe = scored_dataframe.rename(index=str, columns={"predicted_probability": "probability"})
        else:
            self._score_summary["prediction_split"] = []
            self._score_summary["result_column"] = result_column
            scored_dataframe = transformed.select(categorical_columns +
                                                  time_dimension_columns +
                                                  numerical_columns +
                                                  [result_column]).toPandas()

        if score_data_path.startswith("file"):
            score_data_path = score_data_path[7:]
        scored_dataframe.to_csv(score_data_path, header=True, index=False)
        # print json.dumps({"scoreSummary":self._score_summary},indent=2)
        CommonUtils.write_to_file(
            score_summary_path,
            json.dumps({"scoreSummary": self._score_summary}))

        print("STARTING DIMENSION ANALYSIS ...")
        columns_to_keep = []
        columns_to_drop = []
        considercolumnstype = self._dataframe_context.get_score_consider_columns_type(
        )
        considercolumns = self._dataframe_context.get_score_consider_columns()
        if considercolumnstype != None:
            if considercolumns != None:
                if considercolumnstype == ["excluding"]:
                    columns_to_drop = considercolumns
                elif considercolumnstype == ["including"]:
                    columns_to_keep = considercolumns
        if len(columns_to_keep) > 0:
            columns_to_drop = list(set(df.columns) - set(columns_to_keep))
        # spark_scored_df = transformed.select(categorical_columns+time_dimension_columns+numerical_columns+[result_column])
        scored_df = transformed.select(categorical_columns +
                                       time_dimension_columns +
                                       numerical_columns + [result_column])

        SQLctx = SQLContext(sparkContext=self._spark.sparkContext,
                            sparkSession=self._spark)
        spark_scored_df = SQLctx.createDataFrame(scored_df.toPandas())
        columns_to_drop = [
            x for x in columns_to_drop if x in spark_scored_df.columns
        ]
        modified_df = spark_scored_df.select(
            [x for x in spark_scored_df.columns if x not in columns_to_drop])
        df_helper = DataFrameHelper(modified_df, self._dataframe_context)
        df_helper.set_params()
        df = df_helper.get_data_frame()
        try:
            fs = time.time()
            narratives_file = self._dataframe_context.get_score_path(
            ) + "/narratives/FreqDimension/data.json"
            result_file = self._dataframe_context.get_score_path(
            ) + "/results/FreqDimension/data.json"
            df_freq_dimension_obj = FreqDimensions(
                spark_scored_df, df_helper, self._dataframe_context).test_all(
                    dimension_columns=[result_column])
            df_freq_dimension_result = CommonUtils.as_dict(
                df_freq_dimension_obj)
            CommonUtils.write_to_file(result_file,
                                      json.dumps(df_freq_dimension_result))
            narratives_obj = DimensionColumnNarrative(result_column, df_helper,
                                                      self._dataframe_context,
                                                      df_freq_dimension_obj)
            narratives = CommonUtils.as_dict(narratives_obj)
            CommonUtils.write_to_file(narratives_file, json.dumps(narratives))
            print("Frequency Analysis Done in ", time.time() - fs, " seconds.")
        except:
            print("Frequency Analysis Failed ")

        try:
            fs = time.time()
            narratives_file = self._dataframe_context.get_score_path(
            ) + "/narratives/ChiSquare/data.json"
            result_file = self._dataframe_context.get_score_path(
            ) + "/results/ChiSquare/data.json"
            df_chisquare_obj = ChiSquare(df, df_helper,
                                         self._dataframe_context).test_all(
                                             dimension_columns=[result_column])
            df_chisquare_result = CommonUtils.as_dict(df_chisquare_obj)
            # print 'RESULT: %s' % (json.dumps(df_chisquare_result, indent=2))
            CommonUtils.write_to_file(result_file,
                                      json.dumps(df_chisquare_result))
            chisquare_narratives = CommonUtils.as_dict(
                ChiSquareNarratives(df_helper, df_chisquare_obj,
                                    self._dataframe_context, df))
            # print 'Narratives: %s' %(json.dumps(chisquare_narratives, indent=2))
            CommonUtils.write_to_file(narratives_file,
                                      json.dumps(chisquare_narratives))
            print("ChiSquare Analysis Done in ", time.time() - fs, " seconds.")
        except:
            print("ChiSquare Analysis Failed ")
Example No. 11
    def __init__(self,
                 column_name,
                 decision_tree_rules,
                 df_helper,
                 df_context,
                 meta_parser,
                 result_setter,
                 story_narrative=None,
                 analysisName=None,
                 scriptWeight=None):
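        # Build narratives from the decision tree result: pull rules, tables and
        # significant variables, report progress before and after, and register the
        # generated cards with the result setter.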
        self._story_narrative = story_narrative
        self._metaParser = meta_parser
        self._dataframe_context = df_context
        self._ignoreMsg = self._dataframe_context.get_message_ignore()
        self._result_setter = result_setter
        self._blockSplitter = GLOBALSETTINGS.BLOCKSPLITTER
        self._column_name = column_name.lower()
        self._colname = column_name

        self._capitalized_column_name = "%s%s" % (column_name[0].upper(),
                                                  column_name[1:])
        self._decision_rules_dict = decision_tree_rules.get_decision_rules()
        self._decision_tree_json = CommonUtils.as_dict(decision_tree_rules)
        self._decision_tree_raw = self._decision_rules_dict
        # self._decision_tree_raw = {"tree":{"children":None}}
        # self._decision_tree_raw['tree']["children"] = self._decision_tree_json['tree']["children"]
        self._table = decision_tree_rules.get_table()
        self._new_table = {}
        self.successful_predictions = decision_tree_rules.get_success()
        self.total_predictions = decision_tree_rules.get_total()
        self.success_percent = decision_tree_rules.get_success_percent()
        self._important_vars = decision_tree_rules.get_significant_vars()
        self._target_distribution = decision_tree_rules.get_target_contributions(
        )
        self._get_new_table()
        self._df_helper = df_helper
        self.subheader = None
        #self.table = {}
        self.dropdownComment = None
        self.dropdownValues = None
        self._base_dir = "/decisiontree/"

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        if analysisName == None:
            self._analysisName = self._dataframe_context.get_analysis_name()
        else:
            self._analysisName = analysisName
        self._messageURL = self._dataframe_context.get_message_url()
        if scriptWeight == None:
            self._scriptWeightDict = self._dataframe_context.get_dimension_analysis_weight(
            )
        else:
            self._scriptWeightDict = scriptWeight
        self._scriptStages = {
            "dtreeNarrativeStart": {
                "summary": "Started the Decision Tree Narratives",
                "weight": 0
            },
            "dtreeNarrativeEnd": {
                "summary": "Narratives for Decision Tree Finished",
                "weight": 10
            },
        }
        self._completionStatus += self._scriptWeightDict[
            self._analysisName]["narratives"] * self._scriptStages[
                "dtreeNarrativeStart"]["weight"] / 10
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "dtreeNarrativeStart",\
                                    "info",\
                                    self._scriptStages["dtreeNarrativeStart"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsg)
        self._dataframe_context.update_completion_status(
            self._completionStatus)

        self._decisionTreeNode = NarrativesTree()
        self._decisionTreeNode.set_name("Prediction")
        self._generate_narratives()
        # self._story_narrative.add_a_node(self._decisionTreeNode)
        self._result_setter.set_decision_tree_node(self._decisionTreeNode)
        self._result_setter.set_score_dtree_cards(
            json.loads(
                CommonUtils.convert_python_object_to_json(
                    self._decisionTreeNode.get_all_cards())))

        self._completionStatus = self._dataframe_context.get_completion_status(
        )
        self._completionStatus += self._scriptWeightDict[
            self._analysisName]["narratives"] * self._scriptStages[
                "dtreeNarrativeEnd"]["weight"] / 10
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "dtreeNarrativeEnd",\
                                    "info",\
                                    self._scriptStages["dtreeNarrativeEnd"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsg)
        self._dataframe_context.update_completion_status(
            self._completionStatus)
Example No. 12
 def Run(self):
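     # Bin the result column with Binner and serialize the histogram data.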
     binner = Binner(self._data_frame, self._dataframe_helper)
     #histogram_data = CommonUtils.as_dict(binner.get_bins_for_all_measure_columns())
     histogram_data = CommonUtils.as_dict(
         binner.get_bins(self._dataframe_context.get_result_column()))