def createGraphData(self, regressor, regressionInfo, etlStats):
        # getting data from regressionInfo
        modelId = regressionInfo.get(PredictiveConstants.MODELID)
        locationAddress = regressionInfo.get(
            PredictiveConstants.LOCATIONADDRESS)
        modelName = regressionInfo.get(PredictiveConstants.MODELSHEETNAME)
        spark = regressionInfo.get(PredictiveConstants.SPARK)

        # getting data from the ETL stats
        labelColm = etlStats.get(PredictiveConstants.LABELCOLM)
        trainData = etlStats.get(PredictiveConstants.TRAINDATA)
        testData = etlStats.get(PredictiveConstants.TESTDATA)

        trainPredictedData = regressor.transform(trainData)
        testPredictedData = regressor.transform(testData)
        # training Actual vs Predicted dataset
        trainingPredictionActual = \
            trainPredictedData.select(labelColm, modelName)
        trainingPredictionActualGraphFileName = \
            pUtil.writeToParquet(fileName="trainingPredictedVsActualEnsemble",
                                 locationAddress=locationAddress,
                                 userId=modelId,
                                 data=trainingPredictionActual)
        # test Actual vs Predicted dataset
        testPredictionActual = \
            testPredictedData.select(labelColm, modelName)
        testPredictionActualGraphFileName = \
            pUtil.writeToParquet(fileName="testPredictedVsActualEnsemble",
                                 locationAddress=locationAddress,
                                 userId=modelId,
                                 data=testPredictionActual)
        # creating the residual vs fitted graph data (residual = actual - predicted)
        residualDataColm = trainingPredictionActual.withColumn(
            'residuals',
            col(labelColm) - col(modelName))
        residualDataColm = residualDataColm.select('residuals')
        residualsPredictiveDataTraining = \
            pUtil.residualsFittedGraph(residualsData=residualDataColm,
                                       predictionData=trainingPredictionActual,
                                       modelSheetName=modelName,
                                       spark=spark)
        residualsVsFittedGraphFileName = \
            pUtil.writeToParquet(fileName="residualsVsFittedEnsemble",
                                 locationAddress=locationAddress,
                                 userId=modelId,
                                 data=residualsPredictiveDataTraining)

        graphNameDict = {
            PredictiveConstants.RESIDUALSVSFITTEDGRAPHFILENAME:
            residualsVsFittedGraphFileName,
            PredictiveConstants.TRAININGPREDICTIONACTUALFILENAME:
            trainingPredictionActualGraphFileName,
            PredictiveConstants.TESTPREDICTIONACTUALFILENAME:
            testPredictionActualGraphFileName
        }
        return graphNameDict
    def writeDataset(self, dataset, infoData):
        """
        Write the dataset if it does not exist; if it does, append the new data to it.
        Keep the datasetId information; conversationId must be unique within the dataset.
        """
        storageLocation = infoData.get(pc.STORAGELOCATION)
        modelName = infoData.get(pc.MODELNAME)
        userId = infoData.get(pc.USERID)
        datasetInfo = pu.writeToParquet(modelName, storageLocation, userId,
                                        dataset)
        return datasetInfo
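# The docstring in writeDataset describes append-or-create semantics with a
# uniqueness guarantee on conversationId. A minimal sketch of that behavior,
# assuming a plain SparkSession; appendUnique, path, newData, and the
# conversationId column are illustrative names, not the codebase's own API:
def appendUnique(spark, path, newData):
    try:
        existing = spark.read.parquet(path)
        # keep only rows whose conversationId is not already stored
        freshRows = newData.join(existing.select("conversationId"),
                                 on="conversationId", how="left_anti")
        freshRows.write.parquet(path, mode="append")
    except Exception:
        # first write: the dataset does not exist yet
        newData.write.parquet(path, mode="overwrite")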
    def featureStats(self, etlStats, predictiveData):
        numericalFeatures = etlStats.get(PredictiveConstants.NUMERICALFEATURES)
        label = etlStats.get(PredictiveConstants.LABELCOLM)
        dataset = etlStats.get(PredictiveConstants.DATASET)
        categoricalFeatures = etlStats.get(
            PredictiveConstants.CATEGORICALFEATURES)
        categoryColmStats = etlStats.get(PredictiveConstants.CATEGORYCOLMSTATS)

        locationAddress = predictiveData.get(
            PredictiveConstants.LOCATIONADDRESS)
        featureId = predictiveData.get(PredictiveConstants.MODELID)

        # statistics
        columnListForfeaturesStats = numericalFeatures.copy()
        columnListForfeaturesStats.insert(0, label)
        dataTransformationObj = PredictiveDataTransformation(dataset=dataset)
        dataStatsResult = \
            dataTransformationObj.dataStatistics(categoricalFeatures=categoricalFeatures,
                                                 numericalFeatures=columnListForfeaturesStats,
                                                 categoricalColmStat=categoryColmStats)
        summaryDict = dataStatsResult

        # creating the dataset for the stats-chart visualization in the feature selection chart
        datasetForStatsChart = dataset.select(columnListForfeaturesStats)
        datasetForStatsChartFileName = \
            PredictiveUtilities.writeToParquet(fileName="datasetForStatsChart",
                                               locationAddress=locationAddress,
                                               userId=featureId,
                                               data=datasetForStatsChart)

        featuresStatsDict = {
            "columnsName": columnListForfeaturesStats,
            "datasetFileName": datasetForStatsChartFileName
        }

        featureStatistics = {
            PredictiveConstants.SUMMARYDICT: summaryDict,
            PredictiveConstants.FEATURESSTATSDICT: featuresStatsDict
        }

        return featureStatistics
    def loadModel(self):

        if self.algoName in ("linear_reg", "ridge_reg", "lasso_reg"):
            regressionPrediction = LinearRegressionModel.load(self.modelStorageLocation)
        elif self.algoName == "RandomForestAlgo":
            regressionPrediction = RandomForestRegressionModel.load(self.modelStorageLocation)
        elif self.algoName == "GradientBoostAlgo":
            regressionPrediction = GBTRegressionModel.load(self.modelStorageLocation)

        # dropping any prediction column left over from a previous run of the same model
        self.dataset = self.dataset.drop(self.modelSheetName)

        predictionData = regressionPrediction.transform(self.dataset)
        predictionData = predictionData.drop(self.featuresColm)

        # dropping the extra columns added during feature transformation
        if self.indexedFeatures:
            self.indexedFeatures.extend(self.oneHotEncodedFeaturesList)
            predictionData = predictionData.drop(*self.indexedFeatures)

        # overwriting the original dataset

        '''Spark evaluates lazily and reads data in partitions rather than all at
        once, so a dataset cannot be overwritten in place while it is still being
        read. Writing to a temporary location and reading it back works around this.'''
        emptyUserId = ''
        fileNameWithPathTemp = self.locationAddress + emptyUserId + self.datasetName + "_temp.parquet"
        predictionData.write.parquet(fileNameWithPathTemp, mode="overwrite")
        predictionDataReadAgain = self.spark.read.parquet(fileNameWithPathTemp)

        predictionTableData = \
            PredictiveUtilities.writeToParquet(fileName=self.datasetName,
                                               locationAddress=self.locationAddress,
                                               userId=emptyUserId,
                                               data=predictionDataReadAgain)
        return predictionTableData
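# The temp-write/read-back step above recurs whenever a dataset has to be
# overwritten in place. A minimal standalone sketch of the pattern, assuming
# only a SparkSession and a parquet path (overwriteParquetInPlace and its
# parameters are illustrative names, not part of this codebase):
def overwriteParquetInPlace(spark, path, dataFrame):
    # write to a side location first: Spark evaluates lazily, so it cannot
    # overwrite a parquet directory the DataFrame is still reading from
    tempPath = path + "_temp.parquet"
    dataFrame.write.parquet(tempPath, mode="overwrite")
    # re-read so the lineage points at the temp copy, then overwrite the original
    rereadData = spark.read.parquet(tempPath)
    rereadData.write.parquet(path, mode="overwrite")
    return spark.read.parquet(path)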
    def featuresSelection(self, dataset_add, feature_colm,
                          label_colm, relation_list, relation, userId, algoName,
                          locationAddress):
        dataset = self.spark.read.parquet(dataset_add)
        # PredictiveUtilities = PredictiveUtilities()

        # changing the relationship of the column (log, square root, exponential)
        dataTransformationObj = PredictiveDataTransformation(dataset=dataset)
        dataset = dataTransformationObj.colmTransformation(colmTransformationList=relation_list) \
            if relation == PredictiveConstants.NON_LINEAR else dataset
        # transformation
        dataTransformationObj = PredictiveDataTransformation(dataset=dataset)
        dataTransformationResult = dataTransformationObj.dataTranform(labelColm=label_colm,
                                                                      featuresColm=feature_colm,
                                                                      userId=userId)
        dataset = dataTransformationResult.get(PredictiveConstants.DATASET)
        categoricalFeatures = dataTransformationResult.get(PredictiveConstants.CATEGORICALFEATURES)
        numericalFeatures = dataTransformationResult.get(PredictiveConstants.NUMERICALFEATURES)
        maxCategories = dataTransformationResult.get(PredictiveConstants.MAXCATEGORIES)
        categoryColmStats = dataTransformationResult.get(PredictiveConstants.CATEGORYCOLMSTATS)
        indexedFeatures = dataTransformationResult.get(PredictiveConstants.INDEXEDFEATURES)
        label = dataTransformationResult.get(PredictiveConstants.LABEL)
        idNameFeaturesOrdered = dataTransformationResult.get(PredictiveConstants.IDNAMEFEATURESORDERED)
        oneHotEncodedFeaturesList = dataTransformationResult.get(PredictiveConstants.ONEHOTENCODEDFEATURESLIST)
        indexedLabelNameDict = dataTransformationResult.get(PredictiveConstants.INDEXEDLABELNAMEDICT)
        featuresColm = dataTransformationResult.get(PredictiveConstants.VECTORFEATURES)

        # statistics
        columnListForfeaturesStats = numericalFeatures.copy()
        columnListForfeaturesStats.insert(0, label)
        dataTransformationObj = PredictiveDataTransformation(dataset=dataset)
        dataStatsResult = \
            dataTransformationObj.dataStatistics(categoricalFeatures=categoricalFeatures,
                                                 numericalFeatures=columnListForfeaturesStats,
                                                 categoricalColmStat=categoryColmStats)
        summaryDict = dataStatsResult

        # creating the dataset for the stats-chart visualization in the feature selection chart
        datasetForStatsChart = dataset.select(columnListForfeaturesStats)
        datasetForStatsChartFileName = \
            PredictiveUtilities.writeToParquet(fileName="datasetForStatsChart",
                                               locationAddress=locationAddress,
                                               userId=userId,
                                               data=datasetForStatsChart)

        featuresStatsDict = {"columnsName": columnListForfeaturesStats,
                             "datasetFileName": datasetForStatsChartFileName}

        # applying the algorithm
        # running the statistical test (Pearson for the regressor, chi-square for the classifier)
        trainData, testData = dataset.randomSplit([0.80, 0.20], seed=40)

        keyStatsTest = ''
        statisticalTestResult = {}
        if algoName == PredictiveConstants.RANDOMREGRESSOR:
            statisticalTestObj = PredictiveStatisticalTest(dataset=dataset,
                                                           features=numericalFeatures,
                                                           labelColm=label)
            statisticalTestResult = statisticalTestObj.pearsonTest()
            randomForestModel = \
                RandomForestRegressor(labelCol=label,
                                      featuresCol=featuresColm,
                                      numTrees=10)
            keyStatsTest = "pearson_test_data"
        if algoName == PredictiveConstants.RANDOMCLASSIFIER:
            statisticalTestObj = PredictiveStatisticalTest(dataset=dataset,
                                                           features=indexedFeatures,
                                                           labelColm=label)
            statisticalTestResult = \
                statisticalTestObj.chiSquareTest(categoricalFeatures=categoricalFeatures,
                                                 maxCategories=maxCategories)
            randomForestModel = RandomForestClassifier(labelCol=label,
                                                       featuresCol=featuresColm,
                                                       numTrees=10)
            keyStatsTest = "ChiSquareTestData"
        randomForestModelFit = randomForestModel.fit(trainData)
        # predictions = randomForestModelFit.transform(testData)
        print(randomForestModelFit.featureImportances)
        # feature_importance = randomForestModelFit.featureImportances.toArray().tolist()
        # print(feature_importance)
        import builtins
        # ensure the Python builtin round is used, in case pyspark.sql.functions
        # has shadowed it via a star import elsewhere
        round = getattr(builtins, 'round')

        featuresImportance = list(randomForestModelFit.featureImportances)
        featuresImportance = [round(x, 4) for x in featuresImportance]
        featuresImportanceDict = {}
        # enumerate instead of list.index(): index() returns the first match,
        # which would collapse features sharing the same importance value
        for position, importance in enumerate(featuresImportance):
            featuresImportanceDict[position] = round(importance, 4)

        featuresImportanceDictWithName = \
            PredictiveUtilities.summaryTable(featuresName=idNameFeaturesOrdered,
                                             featuresStat=featuresImportanceDict)

        featuresColmList = idNameFeaturesOrdered
        feat = list(featuresColmList.values())
        feature_imp = {PredictiveConstants.FEATURE_IMPORTANCE: featuresImportance,
                       "feature_column": feat}

        response_dict = {
            PredictiveConstants.FEATURE_IMPORTANCE: feature_imp,
            keyStatsTest: statisticalTestResult,
            'summaryDict': summaryDict,
            'categoricalSummary': categoryColmStats,
            "featuresImportanceDict": featuresImportanceDictWithName,
            "featuresStatsDict": featuresStatsDict
        }
        return response_dict
    def csvToParquet(self):
        dataset = spark.read.csv(reviewDatasetPath, header=True)
        dataset = dataset.select(colsName)  # keep only the required columns
        dataset = dataset.withColumnRenamed("Document Class", "Sentiment")
        dataset = dataset.withColumnRenamed("Prediction (Document Class)", "prediction_knime")
        PredictiveUtilities.writeToParquet("knimeTestDataset", "/home/fidel/Documents/", "", dataset)
    def randomGradientRegressionModelEvaluation(self, regressor):
        trainPredictedData = regressor.transform(self.trainData)
        testPredictedData = regressor.transform(self.testData)
        from pyspark.ml.evaluation import RegressionEvaluator
        # map the evaluator's metric keys to the constant names used in the response
        metricNameMapping = {
            'r2': PredictiveConstants.RSQUARE,
            'rmse': PredictiveConstants.RMSE,
            'mse': PredictiveConstants.MSE,
            'mae': PredictiveConstants.MAE
        }
        trainDataMetrics = {}
        for metric, metricName in metricNameMapping.items():
            evaluator = RegressionEvaluator(labelCol=self.labelColm,
                                            predictionCol=self.modelSheetName,
                                            metricName=metric)
            metricValue = evaluator.evaluate(trainPredictedData)
            trainDataMetrics[metricName] = metricValue

        # training Actual vs Predicted dataset
        trainingPredictionActual = \
            trainPredictedData.select(self.labelColm, self.modelSheetName)
        trainingPredictionActualGraphFileName = \
            PredictiveUtilities.writeToParquet(fileName="trainingPredictedVsActualEnsemble",
                                               locationAddress=self.locationAddress,
                                               userId=self.userId,
                                               data=trainingPredictionActual)
        # test Actual vs Predicted dataset
        testPredictionActual = \
            testPredictedData.select(self.labelColm, self.modelSheetName)
        testPredictionActualGraphFileName = \
            PredictiveUtilities.writeToParquet(fileName="testPredictedVsActualEnsemble",
                                               locationAddress=self.locationAddress,
                                               userId=self.userId,
                                               data=testPredictionActual)

        # summary stats
        noTrees = regressor.getNumTrees
        treeWeights = regressor.treeWeights
        treeNodes = list(regressor.trees)
        totalNoNodes = regressor.totalNumNodes
        debugString = regressor.toDebugString

        debugString = str(debugString).splitlines()

        featuresImportance = list(regressor.featureImportances)
        featuresImportance = [round(x, 4) for x in featuresImportance]
        print(featuresImportance)
        featuresImportanceDict = {}
        # enumerate instead of list.index(): index() returns the first match,
        # which would collapse features sharing the same importance value
        for position, importance in enumerate(featuresImportance):
            featuresImportanceDict[position] = importance

        featuresImportanceDictWithName = \
            PredictiveUtilities.summaryTable(featuresName=self.idNameFeaturesOrdered,
                                             featuresStat=featuresImportanceDict)

        trainDataMetrics["No Trees"] = noTrees
        trainDataMetrics["Total Nodes"] = totalNoNodes

        summaryStats = {
            'noTrees': noTrees,
            'treeWeights': treeWeights,
            'totalNodes': totalNoNodes,
            'featuresImportance': featuresImportanceDictWithName,
            'metrics': trainDataMetrics,
            'debugString': debugString,
        }

        # creating the residual vs fitted graph data (residual = actual - predicted)
        residualDataColm = trainingPredictionActual.withColumn(
            'residuals',
            col(self.labelColm) - col(self.modelSheetName))
        residualDataColm = residualDataColm.select('residuals')
        residualsPredictiveDataTraining = \
            PredictiveUtilities.residualsFittedGraph(residualsData=residualDataColm,
                                                     predictionData=trainingPredictionActual,
                                                     modelSheetName=self.modelSheetName)
        residualsVsFittedGraphFileName = \
            PredictiveUtilities.writeToParquet(fileName="residualsVsFittedEnsemble",
                                               locationAddress=self.locationAddress,
                                               userId=self.userId,
                                               data=residualsPredictiveDataTraining)

        graphNameDict = {
            PredictiveConstants.RESIDUALSVSFITTEDGRAPHFILENAME:
            residualsVsFittedGraphFileName,
            PredictiveConstants.TRAININGPREDICTIONACTUALFILENAME:
            trainingPredictionActualGraphFileName,
            PredictiveConstants.TESTPREDICTIONACTUALFILENAME:
            testPredictionActualGraphFileName
        }

        response = {
            PredictiveConstants.STATDATA: summaryStats,
            PredictiveConstants.GRAPHDATA: graphNameDict
        }

        return response
    def regressionModelEvaluation(self, regressor, spark):

        import builtins
        round = getattr(builtins, 'round')

        try:
            coefficientStdErrorList = regressor.summary.coefficientStandardErrors
            coefficientStdErrorDict = {}

            coefficientStdErrorDictWithName = self.statsDict(
                coefficientStdErrorList, coefficientStdErrorDict)

            pValuesList = regressor.summary.pValues
            pValuesDict = {}

            pValuesDictWithName = self.statsDict(pValuesList, pValuesDict)

            tValuesList = regressor.summary.tValues
            tValuesDict = {}

            tValuesDictWithName = self.statsDict(tValuesList, tValuesDict)

            # R-style significance codes derived from the p-values
            significanceDict = {}
            for pkey, pVal in pValuesDict.items():
                if 0 <= pVal < 0.001:
                    significanceDict[pkey] = '***'
                elif 0.001 <= pVal < 0.01:
                    significanceDict[pkey] = '**'
                elif 0.01 <= pVal < 0.05:
                    significanceDict[pkey] = '*'
                elif 0.05 <= pVal < 0.1:
                    significanceDict[pkey] = '.'
                elif 0.1 <= pVal < 1:
                    significanceDict[pkey] = '-'
            significanceDictWithName = \
                PredictiveUtilities.summaryTable(featuresName=self.idNameFeaturesOrdered,
                                                 featuresStat=significanceDict)
        except Exception:
            # fall back to empty stats when the model summary does not expose
            # standard errors, p-values, or t-values (e.g. lasso regression)
            coefficientStdErrorDictWithName = {}
            pValuesDictWithName = {}
            tValuesDictWithName = {}
            significanceDictWithName = {}

        coefficientList = list(map(float, list(regressor.coefficients)))
        coefficientDict = {}
        coefficientDictWithName = self.statsDict(coefficientList,
                                                 coefficientDict)

        # creating the table chart data
        summaryTableChartList = []
        if self.algoName != "lasso_reg":
            for (keyOne, valueOne), valueTwo, valueThree, valueFour, valueFive in \
                    zip(coefficientStdErrorDictWithName.items(), coefficientDictWithName.values(),
                        pValuesDictWithName.values(),
                        tValuesDictWithName.values(), significanceDictWithName.values()):
                chartList = [
                    keyOne, valueOne, valueTwo, valueThree, valueFour,
                    valueFive
                ]
                summaryTableChartList.append(chartList)
            schemaSummaryTable = StructType([
                StructField("Column_Name", StringType(), True),
                StructField("std_Error", DoubleType(), True),
                StructField("coefficient", DoubleType(), True),
                StructField("P_value", DoubleType(), True),
                StructField("T_value", DoubleType(), True),
                StructField("significance", StringType(), True)
            ])

        if (coefficientStdErrorDictWithName == {}
                or self.algoName == "lasso_reg"):
            for (keyOne, valueOne) in coefficientDictWithName.items():
                chartList = [keyOne, valueOne]
                summaryTableChartList.append(chartList)

            schemaSummaryTable = StructType([
                StructField("Column_Name", StringType(), True),
                StructField("coefficient", DoubleType(), True)
            ])

        summaryTableChartData = spark.createDataFrame(
            summaryTableChartList, schema=schemaSummaryTable)
        summaryTableChartDataFileName = \
            PredictiveUtilities.writeToParquet(fileName="summaryTableChart",
                                               locationAddress=self.locationAddress,
                                               userId=self.userId,
                                               data=summaryTableChartData)

        # creating the equation for the regression model as a flat list,
        # e.g. [label, '=', intercept, '+', coeff1, '*', feature1, '+', ...]
        intercept = round(regressor.intercept, 4)
        equation = self.labelColm, "=", intercept, "+"
        for feature, coeff in zip(self.idNameFeaturesOrdered.values(),
                                  coefficientDict.values()):
            coeffFeature = coeff, "*", feature, "+"
            equation += coeffFeature
        equation = list(equation[:-1])  # drop the trailing "+"

        # training summary
        trainingSummary = regressor.summary
        RMSE = round(trainingSummary.rootMeanSquaredError, 4)
        MAE = round(trainingSummary.meanAbsoluteError, 4)
        MSE = round(trainingSummary.meanSquaredError, 4)
        rSquare = round(trainingSummary.r2, 4)
        adjustedRSquare = round(trainingSummary.r2adj, 4)
        degreeOfFreedom = trainingSummary.degreesOfFreedom
        explainedVariance = round(trainingSummary.explainedVariance, 4)
        totalNumberOfFeatures = regressor.numFeatures
        residualsTraining = trainingSummary.residuals  # sparkDataframe

        # test and training data predicted vs actual graphdata

        trainingPredictionAllColm = trainingSummary.predictions
        trainingPredictionActual = \
            trainingPredictionAllColm.select(self.labelColm, self.modelSheetName)
        trainingPredictionActualGraphFileName = \
            PredictiveUtilities.writeToParquet(fileName="trainingPredictedVsActual",
                                               locationAddress=self.locationAddress,
                                               userId=self.userId,
                                               data=trainingPredictionActual)
        testPredictionAllColm = regressor.transform(self.testData)
        testPredictionActual = \
            testPredictionAllColm.select(self.labelColm, self.modelSheetName)
        testPredictionActualGraphFileName = \
            PredictiveUtilities.writeToParquet(fileName="testPredictedVsActual",
                                               locationAddress=self.locationAddress,
                                               userId=self.userId,
                                               data=testPredictionActual)

        # appending train and test dataset together
        # for future use only
        trainTestMerged = trainingPredictionAllColm.union(
            testPredictionAllColm)
        trainTestMergedFileName = \
            PredictiveUtilities.writeToParquet(fileName="trainTestMerged",
                                               locationAddress=self.locationAddress,
                                               userId=self.userId,
                                               data=trainTestMerged)

        # residual vs fitted graph

        residualsPredictiveDataTraining = \
            PredictiveUtilities.residualsFittedGraph(residualsData=residualsTraining,
                                                     predictionData=trainingPredictionActual,
                                                     modelSheetName=self.modelSheetName)
        residualsVsFittedGraphFileName = \
            PredictiveUtilities.writeToParquet(fileName="residualsVsFitted",
                                               locationAddress=self.locationAddress,
                                               userId=self.userId,
                                               data=residualsPredictiveDataTraining)
        # scale location plot
        sqrtStdResiduals = \
            PredictiveUtilities.scaleLocationGraph(label=self.labelColm,
                                                   predictionTargetData=trainingPredictionActual,
                                                   residualsData=residualsTraining,
                                                   modelSheetName=self.modelSheetName)
        scaleLocationGraphFileName = \
            PredictiveUtilities.writeToParquet(fileName="scaleLocation",
                                               locationAddress=self.locationAddress,
                                               userId=self.userId,
                                               data=sqrtStdResiduals)
        # quantile plot
        quantileQuantileData = \
            PredictiveUtilities.quantileQuantileGraph(residualsData=residualsTraining,
                                                      spark=spark)

        quantileQuantileGraphFileName = \
            PredictiveUtilities.writeToParquet(fileName="quantileQuantile",
                                               locationAddress=self.locationAddress,
                                               userId=self.userId,
                                               data=quantileQuantileData)

        # creating dictionary for the graph data and summary stats
        graphNameDict = {
            PredictiveConstants.RESIDUALSVSFITTEDGRAPHFILENAME:
            residualsVsFittedGraphFileName,
            PredictiveConstants.SCALELOCATIONGRAPHFILENAME:
            scaleLocationGraphFileName,
            PredictiveConstants.QUANTILEQUANTILEGRAPHFILENAME:
            quantileQuantileGraphFileName,
            PredictiveConstants.TRAININGPREDICTIONACTUALFILENAME:
            trainingPredictionActualGraphFileName,
            PredictiveConstants.TESTPREDICTIONACTUALFILENAME:
            testPredictionActualGraphFileName
        }
        summaryStats = {
            PredictiveConstants.RMSE: RMSE,
            PredictiveConstants.MSE: MSE,
            PredictiveConstants.MAE: MAE,
            PredictiveConstants.RSQUARE: rSquare,
            PredictiveConstants.ADJRSQUARE: adjustedRSquare,
            PredictiveConstants.INTERCEPT: intercept,
            PredictiveConstants.DOF: degreeOfFreedom,
            PredictiveConstants.EXPLAINEDVARIANCE: explainedVariance,
            PredictiveConstants.TOTALFEATURES: totalNumberOfFeatures
        }

        summaryTable = {
            "summaryTableChartDataFileName": summaryTableChartDataFileName
        }

        response = {
            PredictiveConstants.GRAPHDATA: graphNameDict,
            PredictiveConstants.STATDATA: summaryStats,
            PredictiveConstants.TABLEDATA: summaryTable,
            PredictiveConstants.EQUATION: equation
        }

        return response
    def prediction(self, predictiveData):

        '''create a duplicate dataset so that the datatypes of the original
        dataset are not changed by the transformations below'''
        datasetAdd = predictiveData.get(PredictiveConstants.DATASETADD)
        spark = predictiveData.get(PredictiveConstants.SPARK)
        dataset = spark.read.parquet(datasetAdd)

        # adding extra index column in the dataset
        dataset = PredictiveUtilities.addInternalId(dataset)
        predictiveData.update({
            PredictiveConstants.DATASET: dataset
        })

        etlStats = PredictiveUtilities.performETL(etlInfo=predictiveData)
        dataset = etlStats.get(PredictiveConstants.DATASET)
        originalDataset = etlStats.get(PredictiveConstants.ORIGINALDATASET)

        algoName = predictiveData.get(PredictiveConstants.ALGORITHMNAME)
        modelStorageLocation = predictiveData.get(PredictiveConstants.MODELSTORAGELOCATION)
        modelName = predictiveData.get(PredictiveConstants.MODELSHEETNAME)
        datasetName = predictiveData.get(PredictiveConstants.DATASETNAME)
        locationAddress = predictiveData.get(PredictiveConstants.LOCATIONADDRESS)

        if algoName in (PredictiveConstants.LINEAR_REG,
                        PredictiveConstants.RIDGE_REG,
                        PredictiveConstants.LASSO_REG):
            regressionPrediction = LinearRegressionModel.load(modelStorageLocation)
        elif algoName == PredictiveConstants.RANDOMFORESTALGO:
            regressionPrediction = RandomForestRegressionModel.load(modelStorageLocation)
        elif algoName == PredictiveConstants.GRADIENTBOOSTALGO:
            regressionPrediction = GBTRegressionModel.load(modelStorageLocation)

        dataset = dataset.drop(modelName)
        originalDataset = originalDataset.drop(modelName)
        dataset = regressionPrediction.transform(dataset)
        dataset = dataset.select(PredictiveConstants.DMXINDEX, modelName)
        finalDataset = originalDataset.join(dataset, on=[PredictiveConstants.DMXINDEX]) \
            .sort(PredictiveConstants.DMXINDEX).drop(PredictiveConstants.DMXINDEX)


        # overwriting the original dataset
        '''Spark evaluates lazily and reads data in partitions rather than all at
        once, so a dataset cannot be overwritten in place while it is still being
        read. Writing to a temporary location and reading it back works around this.'''
        emptyUserId = ''
        randomUUID = str(uuid.uuid4())
        fileNameWithPathTemp = locationAddress + randomUUID + datasetName + "_temp.parquet" #correct the name.
        finalDataset.write.parquet(fileNameWithPathTemp, mode="overwrite")  # send this path to java for deletion
        predictionDataReadAgain = spark.read.parquet(fileNameWithPathTemp)

        predictionTableData = \
            PredictiveUtilities.writeToParquet(fileName=datasetName,
                                               locationAddress=locationAddress,
                                               userId=emptyUserId,
                                               data=predictionDataReadAgain)
        return predictionTableData
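# prediction() keeps the original datatypes by predicting on an ETL copy and
# joining only the prediction column back over an internal index. A minimal
# sketch of that join-back pattern; joinPredictionsBack and its parameters are
# illustrative assumptions, not the codebase's own helpers:
def joinPredictionsBack(originalDf, predictedDf, indexCol, predictionCol):
    # keep only the index and the new prediction from the transformed copy
    predictions = predictedDf.select(indexCol, predictionCol)
    # join on the index so the original columns keep their untouched datatypes,
    # then restore the row order and drop the helper index
    return (originalDf.join(predictions, on=[indexCol])
            .sort(indexCol)
            .drop(indexCol))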