def __init__(self, trainDataRatio, dataset_add, feature_colm, label_colm,
             relation_list, relation, userId, locationAddress, algoName, spark):
    self.trainDataRatio = trainDataRatio
    self.datasetAdd = dataset_add
    self.featuresColmList = feature_colm
    self.labelColmList = label_colm
    self.relationshipList = relation_list
    self.relation = relation
    self.userId = userId
    self.locationAddress = locationAddress
    self.algoName = algoName
    self.spark = spark

    # only for the ETL part of the dataset
    self.predictiveUtilitiesObj = PredictiveUtilities()
    ETLOnDatasetStats = self.predictiveUtilitiesObj.ETLOnDataset(
        datasetAdd=self.datasetAdd,
        featuresColmList=self.featuresColmList,
        labelColmList=self.labelColmList,
        relationshipList=self.relationshipList,
        relation=self.relation,
        trainDataRatio=self.trainDataRatio,
        spark=spark,
        userId=userId)
    self.dataset = ETLOnDatasetStats.get("dataset")
    self.featuresColm = ETLOnDatasetStats.get("featuresColm")
    self.labelColm = ETLOnDatasetStats.get("labelColm")
    self.trainData = ETLOnDatasetStats.get("trainData")
    self.testData = ETLOnDatasetStats.get("testData")
    self.idNameFeaturesOrdered = ETLOnDatasetStats.get("idNameFeaturesOrdered")
def createGraphData(self, regressor, regressionInfo, etlStats):
    # data from regressionInfo
    modelId = regressionInfo.get(PredictiveConstants.MODELID)
    locationAddress = regressionInfo.get(PredictiveConstants.LOCATIONADDRESS)
    modelName = regressionInfo.get(PredictiveConstants.MODELSHEETNAME)
    spark = regressionInfo.get(PredictiveConstants.SPARK)

    # data from the ETL stats
    labelColm = etlStats.get(PredictiveConstants.LABELCOLM)
    trainData = etlStats.get(PredictiveConstants.TRAINDATA)
    testData = etlStats.get(PredictiveConstants.TESTDATA)

    trainPredictedData = regressor.transform(trainData)
    testPredictedData = regressor.transform(testData)

    # training actual vs predicted dataset
    trainingPredictionActual = trainPredictedData.select(labelColm, modelName)
    trainingPredictionActualGraphFileName = pUtil.writeToParquet(
        fileName="trainingPredictedVsActualEnsemble",
        locationAddress=locationAddress,
        userId=modelId,
        data=trainingPredictionActual)

    # test actual vs predicted dataset
    testPredictionActual = testPredictedData.select(labelColm, modelName)
    testPredictionActualGraphFileName = pUtil.writeToParquet(
        fileName="testPredictedVsActualEnsemble",
        locationAddress=locationAddress,
        userId=modelId,
        data=testPredictionActual)

    # residuals vs fitted graph data
    residualDataColm = trainingPredictionActual.withColumn(
        'residuals', col(labelColm) - col(modelName))
    residualDataColm = residualDataColm.select('residuals')
    residualsPredictiveDataTraining = pUtil.residualsFittedGraph(
        residualsData=residualDataColm,
        predictionData=trainingPredictionActual,
        modelSheetName=modelName,
        spark=spark)
    residualsVsFittedGraphFileName = pUtil.writeToParquet(
        fileName="residualsVsFittedEnsemble",
        locationAddress=locationAddress,
        userId=modelId,
        data=residualsPredictiveDataTraining)

    graphNameDict = {
        PredictiveConstants.RESIDUALSVSFITTEDGRAPHFILENAME: residualsVsFittedGraphFileName,
        PredictiveConstants.TRAININGPREDICTIONACTUALFILENAME: trainingPredictionActualGraphFileName,
        PredictiveConstants.TESTPREDICTIONACTUALFILENAME: testPredictionActualGraphFileName
    }
    return graphNameDict
def sentimentAnalysis(self, infoData):
    infoData = self.textProcessing(infoData)
    infoData = self.viveknSentimentAnalysis(infoData)
    storagePath = infoData.get(pc.STORAGELOCATION)
    modelName = infoData.get(pc.MODELSHEETNAME)
    modelPath = storagePath + modelName

    # write the info data to a JSON file for later use in sentiment prediction;
    # drop the objects that are not JSON-serializable first
    infoData.pop(pc.SPARK, "None")
    infoData.pop(pc.DATASET, "None")
    infoData.pop(pc.TESTDATA, "None")
    pu.writeToJson(modelPath, infoData)
    print("success")
def __init__(self, trainDataRatio, dataset_add, feature_colm, label_colm,
             relation_list, relation, userId, locationAddress,
             modelStorageLocation, algoName, modelSheetName, datasetName, spark):
    self.trainDataRatio = trainDataRatio
    self.datasetAdd = dataset_add
    self.featuresColmList = feature_colm
    self.labelColmList = label_colm
    self.relationshipList = relation_list
    self.relation = relation
    self.userId = userId
    self.locationAddress = locationAddress
    self.modelStorageLocation = modelStorageLocation
    self.algoName = algoName
    self.modelSheetName = "prediction_" + modelSheetName
    self.datasetName = datasetName
    self.spark = spark

    # only for the ETL part of the dataset
    ETLOnDatasetStats = PredictiveUtilities.ETLOnDataset(
        datasetAdd=self.datasetAdd,
        featuresColmList=self.featuresColmList,
        labelColmList=self.labelColmList,
        relationshipList=self.relationshipList,
        relation=self.relation,
        trainDataRatio=self.trainDataRatio,
        spark=spark,
        userId=userId)
    self.dataset = ETLOnDatasetStats.get("dataset")
    self.featuresColm = ETLOnDatasetStats.get("featuresColm")
    self.indexedFeatures = ETLOnDatasetStats.get("indexedFeatures")
    self.oneHotEncodedFeaturesList = ETLOnDatasetStats.get("oneHotEncodedFeaturesList")
def sentimentData(self, sentimentDataInfo):
    sentimentDataInfo = self.sentimentAnalysis(sentimentDataInfo)
    sentimentDataInfo = self.trainModel(sentimentDataInfo)
    sentimentDataInfo = self.invertIndexColm(sentimentDataInfo)
    modelName = sentimentDataInfo.get(pc.MODELSHEETNAME)
    storagePath = sentimentDataInfo.get(pc.STORAGELOCATION)
    jsonStorageLocation = storagePath + modelName

    # TODO (sahil): storing the info data as JSON; move this into a separate method.
    # Drop the objects that are not JSON-serializable before writing.
    sentimentDataInfo.pop(pc.SPARK, "None")
    sentimentDataInfo.pop(pc.DATASET, "None")
    sentimentDataInfo.pop(pc.TESTDATA, "None")
    sentimentDataInfo.pop(pc.TRAINDATA, "None")
    sentimentDataInfo.pop(pc.MODEL, "None")
    # json.dump(sentimentDataInfo, open(storagePath + modelName + ".json", 'w'))
    pu.writeToJson(jsonStorageLocation, sentimentDataInfo)
def mergePosNegDataset(self, infoData):
    positiveDatasetPath = infoData.get(pc.POSITIVEDATASETPATH)
    negativeDatasetPath = infoData.get(pc.NEGATIVEDATASETPATH)
    spark = infoData.get(pc.SPARK)

    # positive dictionary dataset -- call the convert-to-string method from here
    positiveDataset = spark.read.format("com.databricks.spark.csv") \
        .option("header", "true").option("inferSchema", "true").load(positiveDatasetPath)
    positiveDataset = positiveDataset.select(positiveDataset.columns[:1])
    positiveColName = str(positiveDataset.schema.names[0])
    positiveDataset = positiveDataset.withColumnRenamed(positiveColName, pc.DMXDICTIONARYCOLNAME)
    positiveDataset = positiveDataset.withColumn(pc.DMXSENTIMENT, lit(pc.DMXPOSITIVE)) \
        .select(pc.DMXDICTIONARYCOLNAME, pc.DMXSENTIMENT)

    # negative dictionary dataset
    negativeDataset = spark.read.format("com.databricks.spark.csv") \
        .option("header", "true").option("inferSchema", "true").load(negativeDatasetPath)
    negativeDataset = negativeDataset.select(negativeDataset.columns[:1])
    negativeColName = str(negativeDataset.schema.names[0])
    negativeDataset = negativeDataset.withColumnRenamed(negativeColName, pc.DMXDICTIONARYCOLNAME)
    negativeDataset = negativeDataset.withColumn(pc.DMXSENTIMENT, lit(pc.DMXNEGATIVE)) \
        .select(pc.DMXDICTIONARYCOLNAME, pc.DMXSENTIMENT)

    # column names must match before appending the negative dataset to the positive one
    posNegDataset = positiveDataset.union(negativeDataset)
    posNegDataset = pu.addInternalId(posNegDataset)
    return posNegDataset
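# --- Standalone sketch (not part of the pipeline above) ---
# A minimal illustration of the rename/tag/union pattern mergePosNegDataset applies to the
# positive and negative dictionary files. The column names "word" and "sentiment" are
# placeholders for the pc.* constants used above, and the data is a toy example.
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

spark = SparkSession.builder.master("local[*]").appName("posNegUnionSketch").getOrCreate()

# toy positive and negative word lists with differing column names
positive = spark.createDataFrame([("good",), ("great",)], ["PosWord"])
negative = spark.createDataFrame([("bad",), ("awful",)], ["NegWord"])

# align the column names, tag each row with its sentiment, then union the two datasets
positive = positive.withColumnRenamed("PosWord", "word").withColumn("sentiment", lit("positive"))
negative = negative.withColumnRenamed("NegWord", "word").withColumn("sentiment", lit("negative"))
posNeg = positive.union(negative)
posNeg.show()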
def __init__(self, trainDataRatio, dataset_add, feature_colm, label_colm,
             relation_list, relation, userId, locationAddress, algoName,
             modelSheetName, spark):
    self.trainDataRatio = trainDataRatio
    self.datasetAdd = dataset_add
    self.featuresColmList = feature_colm
    self.labelColmList = label_colm
    self.relationshipList = relation_list
    self.relation = relation
    self.userId = userId
    self.locationAddress = locationAddress
    self.algoName = algoName
    self.modelSheetName = PredictiveConstants.PREDICTION_ + modelSheetName
    # self.spark = spark

    # only for the ETL part of the dataset
    ETLOnDatasetStats = PredictiveUtilities.ETLOnDataset(
        datasetAdd=self.datasetAdd,
        featuresColmList=self.featuresColmList,
        labelColmList=self.labelColmList,
        relationshipList=self.relationshipList,
        relation=self.relation,
        trainDataRatio=self.trainDataRatio,
        spark=spark,
        userId=userId)
    self.dataset = ETLOnDatasetStats.get(PredictiveConstants.DATASET)
    self.featuresColm = ETLOnDatasetStats.get(PredictiveConstants.FEATURESCOLM)
    self.labelColm = ETLOnDatasetStats.get(PredictiveConstants.LABELCOLM)
    self.trainData = ETLOnDatasetStats.get(PredictiveConstants.TRAINDATA)
    self.testData = ETLOnDatasetStats.get(PredictiveConstants.TESTDATA)
    self.idNameFeaturesOrdered = ETLOnDatasetStats.get(PredictiveConstants.IDNAMEFEATURESORDERED)
def getFeatureImportance(self, randomForestModelFit, idNameFeaturesOrdered):
    import builtins
    round = getattr(builtins, 'round')

    featuresImportance = list(randomForestModelFit.featureImportances)
    featuresImportance = [round(x, 4) for x in featuresImportance]
    # use enumerate rather than list.index(): index() returns the first match,
    # so duplicate importance values would all collapse onto the same key
    featuresImportanceDict = {}
    for index, importance in enumerate(featuresImportance):
        featuresImportanceDict[index] = round(importance, 4)
    featuresImportanceDictWithName = PredictiveUtilities.summaryTable(
        featuresName=idNameFeaturesOrdered,
        featuresStat=featuresImportanceDict)

    featuresColmList = idNameFeaturesOrdered
    feat = []
    for val in featuresColmList.values():
        feat.append(val)
    feature_imp = {
        PredictiveConstants.FEATURE_IMPORTANCE: featuresImportance,
        "feature_column": feat
    }
    featuresImportanceData = {
        PredictiveConstants.FEATURE_IMPORTANCE: feature_imp,
        PredictiveConstants.FEATURESIMPORTANCEDICT: featuresImportanceDictWithName
    }
    return featuresImportanceData
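# --- Standalone sketch (not part of the pipeline above) ---
# A minimal illustration of how featureImportances from a fitted random forest lines up with
# the assembled feature order, which is what summaryTable / idNameFeaturesOrdered relies on.
# The feature names "f1" and "f2" and the toy data are placeholders.
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor

spark = SparkSession.builder.master("local[*]").appName("featureImportanceSketch").getOrCreate()

df = spark.createDataFrame(
    [(1.0, 10.0, 5.0), (2.0, 20.0, 9.0), (3.0, 30.0, 13.0), (4.0, 40.0, 17.0)],
    ["f1", "f2", "label"])
assembled = VectorAssembler(inputCols=["f1", "f2"], outputCol="features").transform(df)

model = RandomForestRegressor(labelCol="label", featuresCol="features", numTrees=5).fit(assembled)

# featureImportances is a vector aligned with the assembler's input column order
importances = [round(x, 4) for x in model.featureImportances.toArray().tolist()]
print(dict(zip(["f1", "f2"], importances)))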
def getExtractedLemmaData(self, dataset, indexList, lemmatizedList):
    # build a small DataFrame of (index, lemmatized text) and join it back on the internal id;
    # `spark` here is presumably the module-level SparkSession set elsewhere in this module
    zipData = zip(indexList, lemmatizedList)
    columnList = [pc.DMXINDEX, pc.DMXLEMMATIZED]
    pandasDataframe = pd.DataFrame(zipData, columns=columnList)
    lemmatizedDataset = spark.createDataFrame(pandasDataframe)
    dataset = dataset.drop(pc.DMXLEMMATIZED)
    dataset = pu.joinDataset(dataset, lemmatizedDataset, pc.DMXINDEX)
    return dataset
def sentimentAnalysis(self, sentimentInfoData):
    spark = sentimentInfoData.get(pc.SPARK)
    datasetPath = sentimentInfoData.get(pc.SENTIMENTDATASETPATH)
    dataset = spark.read.parquet(datasetPath)
    dataset = pu.addInternalId(dataset)
    sentimentInfoData.update({pc.DATASET: dataset})
    isNgram = sentimentInfoData.get(pc.ISNGRAM)

    sentimentDataset = self.textPreProcessing(sentimentInfoData)

    # do the one-hot encoding after that
    textProcessing = TextProcessing()
    sentimentDataset = textProcessing.lemmatization(sentimentDataset, pc.DMXSTOPWORDS)
    # sentimentDataset = textProcessing.sparkLemmatizer(sentimentDataset, pc.DMXSTOPWORDS)
    if isNgram:
        ngramPara = sentimentInfoData.get(pc.NGRAMPARA)
        sentimentDataset = textProcessing.ngrams(sentimentDataset, pc.DMXLEMMATIZED, ngramPara)  # with n-grams

    modelName = sentimentInfoData.get(pc.MODELSHEETNAME)
    labelColm = sentimentInfoData.get(pc.LABELCOLM)
    indexedColm = pc.INDEXED_ + labelColm
    encodedColm = pc.ONEHOTENCODED_ + pc.DMXLEMMATIZED
    featuresColm = modelName + pc.DMXFEATURE
    sentimentInfoData.update({
        pc.COLMTOENCODE: pc.DMXLEMMATIZED,
        pc.DATASET: sentimentDataset,
        pc.COLMTOINDEX: labelColm,
        pc.INDEXEDCOLM: indexedColm,
        pc.ENCODEDCOLM: encodedColm,
        pc.COLMTOVECTORIZED: encodedColm,
        pc.FEATURESCOLM: featuresColm,
        pc.ORIGINALCOLMNAME: labelColm
    })
    sentimentInfoData = pu.stringIndexer(sentimentInfoData)  # after this we have the indexed label
    sentimentInfoData = pu.countVectorizer(sentimentInfoData)  # using the lemmatized column for now
    if isNgram:
        sentimentInfoData.update({
            pc.COLMTOENCODE: pc.DMXNGRAMS,
            pc.ENCODEDCOLM: pc.ONEHOTENCODED_ + pc.DMXNGRAMS
        })
        sentimentInfoData = pu.countVectorizer(sentimentInfoData)
        sentimentInfoData.update({
            pc.COLMTOVECTORIZED: [pc.ONEHOTENCODED_ + pc.DMXLEMMATIZED,
                                  pc.ONEHOTENCODED_ + pc.DMXNGRAMS]
        })
    sentimentInfoData = pu.featureAssembler(sentimentInfoData)  # creating the feature vector
    return sentimentInfoData
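# --- Standalone sketch (not part of the pipeline above) ---
# The textProcessing.ngrams helper is not shown in this section; Spark ML's NGram transformer
# is one plausible building block for it. A minimal sketch, assuming an already-tokenized
# column; "tokens" and "ngrams" are placeholder column names.
from pyspark.sql import SparkSession
from pyspark.ml.feature import NGram

spark = SparkSession.builder.master("local[*]").appName("ngramSketch").getOrCreate()

df = spark.createDataFrame([(0, ["the", "movie", "was", "great"])], ["id", "tokens"])

# n plays the role the ngramPara value plays above
ngram = NGram(n=2, inputCol="tokens", outputCol="ngrams")
ngram.transform(df).select("ngrams").show(truncate=False)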
def writeDataset(self, dataset, infoData):
    storageLocation = infoData.get(pc.STORAGELOCATION)
    modelName = infoData.get(pc.MODELNAME)
    userId = infoData.get(pc.USERID)
    """
    Write the dataset if it does not exist; if it does, append the new data to it.
    Keep the datasetID information, and the conversationID should be unique within the dataset.
    """
    datasetInfo = pu.writeToParquet(modelName, storageLocation, userId, dataset)
    return datasetInfo
def prediction(self, infoData):
    isNgram = infoData.get(pc.ISNGRAM)
    isNgram = False if isNgram is None else isNgram
    predictionColm = infoData.get(pc.PREDICTIONCOLM)
    algoName = infoData.get(pc.ALGORITHMNAME)
    modelStorageLocation = infoData.get(pc.MODELSTORAGELOCATION)
    spark = infoData.get(pc.SPARK)
    datasetPath = infoData.get(pc.SENTIMENTDATASETPATH)

    originalDataset = spark.read.parquet(datasetPath)
    originalDataset = pu.addInternalId(originalDataset)
    infoData.update({pc.DATASET: originalDataset})
    infoData = self.dataTransformation(infoData)

    dataset = infoData.get(pc.DATASET)
    if isNgram:
        # TODO (sahil): handle a missing n-gram parameter at data-creation time
        textProcessing = TextProcessing()
        ngramPara = infoData.get(pc.NGRAMPARA)
        dataset = textProcessing.ngrams(dataset, pc.DMXLEMMATIZED, ngramPara)

    # TODO (sahil): algorithm names are hard-coded for comparison; handle this when finalising
    if algoName == "GradientBoostClassifier":
        predictionModel = GBTClassificationModel.load(modelStorageLocation)
    elif algoName == "DecisionTreeClassifier":
        predictionModel = DecisionTreeClassificationModel.load(modelStorageLocation)

    dataset = dataset.drop(predictionColm)
    originalDataset = originalDataset.drop(predictionColm)
    dataset = predictionModel.transform(dataset)

    # call the indexToString method after the prediction
    infoData.update({pc.DATASET: dataset})
    infoData = self.invertIndex(infoData)
    dataset = infoData.get(pc.DATASET)
    dataset = dataset.select(pc.DMXINDEX, predictionColm)
    finalDataset = pu.joinDataset(originalDataset, dataset, pc.DMXINDEX)
    return finalDataset
class PredictiveClassificationModel():
    def __init__(self, trainDataRatio, dataset_add, feature_colm, label_colm,
                 relation_list, relation, userId, locationAddress, algoName, spark):
        self.trainDataRatio = trainDataRatio
        self.datasetAdd = dataset_add
        self.featuresColmList = feature_colm
        self.labelColmList = label_colm
        self.relationshipList = relation_list
        self.relation = relation
        self.userId = userId
        self.locationAddress = locationAddress
        self.algoName = algoName
        self.spark = spark

        # only for the ETL part of the dataset
        self.predictiveUtilitiesObj = PredictiveUtilities()
        ETLOnDatasetStats = self.predictiveUtilitiesObj.ETLOnDataset(
            datasetAdd=self.datasetAdd,
            featuresColmList=self.featuresColmList,
            labelColmList=self.labelColmList,
            relationshipList=self.relationshipList,
            relation=self.relation,
            trainDataRatio=self.trainDataRatio,
            spark=spark,
            userId=userId)
        self.dataset = ETLOnDatasetStats.get("dataset")
        self.featuresColm = ETLOnDatasetStats.get("featuresColm")
        self.labelColm = ETLOnDatasetStats.get("labelColm")
        self.trainData = ETLOnDatasetStats.get("trainData")
        self.testData = ETLOnDatasetStats.get("testData")
        self.idNameFeaturesOrdered = ETLOnDatasetStats.get("idNameFeaturesOrdered")

    def classificationModelStat(self, classifier):
        trainingSummary = classifier.summary

    def logisticRegression(self):
        # family can be "auto", "multinomial" or "binomial"
        logisticRegressionModelFit = LogisticRegression(
            featuresCol=self.featuresColm,
            labelCol=self.labelColm,
            maxIter=5,
            regParam=0.1,
            elasticNetParam=1.0,
            threshold=0.3,
            family="auto")
        classifier = logisticRegressionModelFit.fit(self.trainData)

    def randomForestClassifierModel(self):
        randomForestClassifierModelFit = RandomForestClassifier(
            labelCol=self.labelColm,
            featuresCol=self.featuresColm,
            numTrees=10)
        classifier = randomForestClassifierModelFit.fit(self.trainData)
def invertIndexColm(self, infoData):
    originalColm = infoData.get(pc.COLMTOINDEX)
    stringIndexerPath = (infoData.get(pc.INDEXERPATHMAPPING)).get(originalColm)
    inverterColm = infoData.get(pc.PREDICTIONCOLM)
    testDataset = infoData.get(pc.TESTDATA)
    trainDataset = infoData.get(pc.TRAINDATA)
    infoData.update({
        pc.INDEXERPATH: stringIndexerPath,
        pc.COLMTOINVERT: inverterColm
    })
    """
    Run the index-to-string step on the train and test datasets separately,
    since the results need to be shown to the user per split.
    """
    infoData.update({pc.DATASET: trainDataset})
    trainDataset = pu.indexToString(infoData)
    infoData.update({pc.DATASET: testDataset})
    testDataset = pu.indexToString(infoData)
    infoData.update({pc.TRAINDATA: trainDataset, pc.TESTDATA: testDataset})
    return infoData
def invertIndex(self, infoData):
    originalColName = infoData.get(pc.ORIGINALCOLMNAME)
    indexerPath = (infoData.get(pc.INDEXERPATHMAPPING)).get(originalColName)
    infoData.update({pc.INDEXERPATH: indexerPath})
    dataset = pu.indexToString(infoData)
    infoData.update({pc.DATASET: dataset})
    """
    Leftover debug export, kept for reference:
    datasetTest = datasetTest.select("Text", "Sentiment", "prediction_knime", predictionColm)
    datasetTest.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv") \
        .option("header", "true").csv("/home/fidel/Documents/decisionTreeKNIMEPrediction.csv")
    """
    return infoData
def countVectorizer(self, infoData):
    originalColName = infoData.get(pc.ORIGINALCOLMNAME)
    dataset = infoData.get(pc.DATASET)
    oneHotEncoderMapping = infoData.get(pc.ONEHOTENCODERPATHMAPPING)
    countVectorizerPath = oneHotEncoderMapping.get(originalColName)
    countVectorizer = CountVectorizerModel.load(countVectorizerPath)
    encodedColmName = infoData.get(pc.ENCODEDCOLM)
    dataset = dataset.drop(encodedColmName)
    dataset = countVectorizer.transform(dataset)
    infoData.update({pc.DATASET: dataset})
    infoData = pu.featureAssembler(infoData)
    return infoData
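# --- Standalone sketch (not part of the pipeline above) ---
# A minimal sketch of the persist-and-reload pattern this method relies on: fit a
# CountVectorizer at training time, save the model, then load the same vocabulary by path at
# prediction time (the paths stored in ONEHOTENCODERPATHMAPPING above). The column names and
# the temp path here are placeholders.
import os
import tempfile
from pyspark.sql import SparkSession
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel

spark = SparkSession.builder.master("local[*]").appName("countVectorizerSketch").getOrCreate()

df = spark.createDataFrame([(0, ["good", "movie"]), (1, ["bad", "movie"])], ["id", "tokens"])

# fit once at training time and persist the fitted model
cvPath = os.path.join(tempfile.mkdtemp(), "cvModel")
cvModel = CountVectorizer(inputCol="tokens", outputCol="tokenCounts").fit(df)
cvModel.write().overwrite().save(cvPath)

# reload the same vocabulary at prediction time and transform new data
loaded = CountVectorizerModel.load(cvPath)
loaded.transform(df).select("tokens", "tokenCounts").show(truncate=False)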
def createTaggedDataset(self, dataset, indexList, taggedRowList, positiveNum,
                        negativeNum, totalNum, sentimentScores):
    zipData = zip(indexList, taggedRowList, positiveNum, negativeNum, totalNum, sentimentScores)
    columnList = [
        pc.DMXINDEX, pc.DMXTAGGEDCOLM, pc.POSITIVENUM,
        pc.NEGATIVENUM, pc.TOTALWORDS, pc.SENTIMENTSCORE
    ]
    pandasDataframe = pd.DataFrame(zipData, columns=columnList)
    taggedDataset = self.spark.createDataFrame(pandasDataframe)
    dataset = pu.joinDataset(dataset, taggedDataset, pc.DMXINDEX)
    # For now the neutral-sentiment rows are not dropped; whether to drop them is still to be decided.
    # dataset = self.dropNeutral(dataset)
    dataset = self.performSentimentAnalysis(dataset)
    return dataset
def createTaggedDataset(self, dataset, indexList, taggedRowList, positiveNum,
                        negativeNum, totalNum, sentimentScores):
    zipData = zip(indexList, taggedRowList, positiveNum, negativeNum, totalNum, sentimentScores)
    columnList = [
        pc.DMXINDEX, pc.DMXTAGGEDCOLM, pc.POSITIVENUM,
        pc.NEGATIVENUM, pc.TOTALWORDS, pc.SENTIMENTSCORE
    ]
    pandasDataframe = pd.DataFrame(zipData, columns=columnList)
    taggedDataset = spark.createDataFrame(pandasDataframe)
    dataset = PredictiveUtilities.joinDataset(dataset, taggedDataset, pc.DMXINDEX)
    dataset = self.dropNeutral(dataset)
    dataset = self.performSentimentAnalysis(dataset)
    return dataset
def textAnalytics(self, infoData):
    sparkSession = infoData.get(pc.SPARK)
    global spark
    spark = sparkSession
    datasetPath = infoData.get(pc.DATASETPATH)
    try:
        dataset = spark.read.parquet(datasetPath)
    except Exception:
        dataset = spark.read.csv(datasetPath, header=True)
    dataset = pu.addInternalId(dataset)
    infoData.update({pc.DATASET: dataset})
    # the textPreProcessing method below is shared with sentiment analysis;
    # make sure it stays common to both sentiment and text analytics
    dataset = self.textPreProcessing(infoData)
    # after that, use the TF-IDF method for finding term frequencies and related stats
    clusteredDataset = self.calTFIDF(dataset, pc.DMXLEMMATIZED)
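# --- Standalone sketch (not part of the pipeline above) ---
# calTFIDF is not shown in this section; one plausible implementation uses Spark ML's
# HashingTF followed by IDF on the lemmatized token column. A minimal sketch; the column
# names and data are placeholders.
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF

spark = SparkSession.builder.master("local[*]").appName("tfidfSketch").getOrCreate()

df = spark.createDataFrame(
    [(0, ["spark", "text", "analytics"]), (1, ["spark", "sentiment"])],
    ["id", "lemmatized"])

# term frequencies via hashing, then IDF weighting on top of them
tf = HashingTF(inputCol="lemmatized", outputCol="rawFeatures", numFeatures=1 << 10).transform(df)
idfModel = IDF(inputCol="rawFeatures", outputCol="tfidf").fit(tf)
idfModel.transform(tf).select("lemmatized", "tfidf").show(truncate=False)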
def sentimentAnalysis(self, sentimentInfoData):
    '''
    Requirements:
    1. the column name containing the sentiment sentences
    2. the sentiment dataset parquet path
    3. the positive dictionary parquet path
    4. the negative dictionary parquet path
    5. the positive and negative dictionary column names containing the words/sentiment
    '''
    sparkSession = sentimentInfoData.get(pc.SPARK)
    global spark
    spark = sparkSession
    datasetPath = sentimentInfoData.get(pc.SENTIMENTDATASETPATH)
    dataset = spark.read.parquet(datasetPath)
    dataset = pu.addInternalId(dataset)
    sentimentInfoData.update({pc.DATASET: dataset})
    sentimentDataset = self.textPreProcessing(sentimentInfoData)
    textProcessing = TextProcessing()
    posNegDataset = textProcessing.mergePosNegDataset(sentimentInfoData)
    dataset = self.addTag(sentimentDataset, pc.DMXSTEMMEDWORDS, posNegDataset)
def featureStats(self, etlStats, predictiveData):
    numericalFeatures = etlStats.get(PredictiveConstants.NUMERICALFEATURES)
    label = etlStats.get(PredictiveConstants.LABELCOLM)
    dataset = etlStats.get(PredictiveConstants.DATASET)
    categoricalFeatures = etlStats.get(PredictiveConstants.CATEGORICALFEATURES)
    categoryColmStats = etlStats.get(PredictiveConstants.CATEGORYCOLMSTATS)
    locationAddress = predictiveData.get(PredictiveConstants.LOCATIONADDRESS)
    featureId = predictiveData.get(PredictiveConstants.MODELID)

    # statistics
    columnListForfeaturesStats = numericalFeatures.copy()
    columnListForfeaturesStats.insert(0, label)
    dataTransformationObj = PredictiveDataTransformation(dataset=dataset)
    dataStatsResult = dataTransformationObj.dataStatistics(
        categoricalFeatures=categoricalFeatures,
        numericalFeatures=columnListForfeaturesStats,
        categoricalColmStat=categoryColmStats)
    summaryDict = dataStatsResult

    # dataset for the stats-chart visualization in the feature-selection chart
    datasetForStatsChart = dataset.select(columnListForfeaturesStats)
    datasetForStatsChartFileName = PredictiveUtilities.writeToParquet(
        fileName="datasetForStatsChart",
        locationAddress=locationAddress,
        userId=featureId,
        data=datasetForStatsChart)

    featuresStatsDict = {
        "columnsName": columnListForfeaturesStats,
        "datasetFileName": datasetForStatsChartFileName
    }
    featureStatistics = {
        PredictiveConstants.SUMMARYDICT: summaryDict,
        PredictiveConstants.FEATURESSTATSDICT: featuresStatsDict
    }
    return featureStatistics
def loadModel(self):
    if self.algoName == "linear_reg" or self.algoName == "ridge_reg" or self.algoName == "lasso_reg":
        regressionPrediction = LinearRegressionModel.load(self.modelStorageLocation)
    if self.algoName == "RandomForestAlgo":
        regressionPrediction = RandomForestRegressionModel.load(self.modelStorageLocation)
    if self.algoName == "GradientBoostAlgo":
        regressionPrediction = GBTRegressionModel.load(self.modelStorageLocation)

    # drop any prediction column already created by an earlier run of the same model
    self.dataset = self.dataset.drop(self.modelSheetName)
    predictionData = regressionPrediction.transform(self.dataset)
    predictionData = predictionData.drop(self.featuresColm)

    # drop the extra columns added during transformation
    if self.indexedFeatures:
        self.indexedFeatures.extend(self.oneHotEncodedFeaturesList)
        predictionData = predictionData.drop(*self.indexedFeatures)

    # overwrite the original dataset
    '''This extra write step is needed because Spark does not read or write the whole dataset
    at once: it only brings limited data into memory and evaluates lazily, so overwriting the
    same dataset that is still being read is not possible. Write to a temporary location first,
    read it back, then write the final dataset.'''
    emptyUserId = ''
    fileNameWithPathTemp = self.locationAddress + emptyUserId + self.datasetName + "_temp.parquet"
    predictionData.write.parquet(fileNameWithPathTemp, mode="overwrite")
    predictionDataReadAgain = self.spark.read.parquet(fileNameWithPathTemp)
    predictionTableData = PredictiveUtilities.writeToParquet(
        fileName=self.datasetName,
        locationAddress=self.locationAddress,
        userId=emptyUserId,
        data=predictionDataReadAgain)
    return predictionTableData
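# --- Standalone sketch (not part of the pipeline above) ---
# A minimal illustration of the workaround described in the docstring above: stage the
# transformed data in a temporary location, read it back, then overwrite the original path.
# The paths, column names, and data here are throwaway placeholders.
import os
import tempfile
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("overwriteSketch").getOrCreate()

base = tempfile.mkdtemp()
finalPath = os.path.join(base, "dataset.parquet")
tempPath = os.path.join(base, "dataset_temp.parquet")

# seed an existing dataset on disk
spark.range(5).toDF("value").write.parquet(finalPath, mode="overwrite")

df = spark.read.parquet(finalPath)
transformed = df.withColumn("doubled", df["value"] * 2)

# writing `transformed` straight back to finalPath is unsafe because Spark is still lazily
# reading from that location, so stage to a temp path, re-read, then overwrite
transformed.write.parquet(tempPath, mode="overwrite")
spark.read.parquet(tempPath).write.parquet(finalPath, mode="overwrite")
spark.read.parquet(finalPath).show()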
def featureSelection(self, predictiveData):
    algoName = predictiveData.get(PredictiveConstants.ALGORITHMNAME)
    etlStats = PredictiveUtilities.performETL(predictiveData)
    featureStatistics = self.featureStats(etlStats, predictiveData)
    categoryColmStats = etlStats.get(PredictiveConstants.CATEGORYCOLMSTATS)
    idNameFeaturesOrdered = etlStats.get(PredictiveConstants.IDNAMEFEATURESORDERED)

    featureAnalysis = self.featureAnalysis(etlStats, algoName)
    randomForestModelFit = featureAnalysis.get(PredictiveConstants.RANDOMFORESTMODEL)
    featureImportanceData = self.getFeatureImportance(randomForestModelFit, idNameFeaturesOrdered)
    featureImportanceDict = featureImportanceData.get(PredictiveConstants.FEATURESIMPORTANCEDICT)
    featureImportance = featureImportanceData.get(PredictiveConstants.FEATURE_IMPORTANCE)

    summaryDict = featureStatistics.get(PredictiveConstants.SUMMARYDICT)
    featuresStatsDict = featureStatistics.get(PredictiveConstants.FEATURESSTATSDICT)
    keyStatsTest = featureAnalysis.get(PredictiveConstants.KEYSTATSTEST)
    statisticalTestResult = featureAnalysis.get(PredictiveConstants.STATISTICALTESTRESULT)

    responseData = {
        PredictiveConstants.FEATURE_IMPORTANCE: featureImportance,
        keyStatsTest: statisticalTestResult,
        PredictiveConstants.SUMMARYDICT: summaryDict,
        PredictiveConstants.CATEGORICALSUMMARY: categoryColmStats,
        PredictiveConstants.FEATURESIMPORTANCEDICT: featureImportanceDict,
        PredictiveConstants.FEATURESSTATSDICT: featuresStatsDict
    }
    return responseData
def etlOperation(self, etlInfo):
    etlStats = PredictiveUtilities.performETL(etlInfo)
    return etlStats
def regressionEvaluation(self, regressor, regressionInfo, etlStats):
    modelName = regressionInfo.get(PredictiveConstants.MODELSHEETNAME)

    # data from the ETL stats
    labelColm = etlStats.get(PredictiveConstants.LABELCOLM)
    trainData = etlStats.get(PredictiveConstants.TRAINDATA)
    idNameFeaturesOrdered = etlStats.get(PredictiveConstants.IDNAMEFEATURESORDERED)

    trainPredictedData = regressor.transform(trainData)

    metricsList = ['r2', 'rmse', 'mse', 'mae']
    trainDataMetrics = {}
    metricName = ''
    for metric in metricsList:
        if metric == "r2":
            metricName = PredictiveConstants.RSQUARE
        elif metric == "rmse":
            metricName = PredictiveConstants.RMSE
        elif metric == "mse":
            metricName = PredictiveConstants.MSE
        elif metric == "mae":
            metricName = PredictiveConstants.MAE
        evaluator = RegressionEvaluator(labelCol=labelColm,
                                        predictionCol=modelName,
                                        metricName=metric)
        metricValue = evaluator.evaluate(trainPredictedData)
        trainDataMetrics[metricName] = metricValue

    # summary stats
    noTrees = regressor.getNumTrees
    treeWeights = regressor.treeWeights
    treeNodes = list(regressor.trees)
    totalNoNodes = regressor.totalNumNodes
    debugString = regressor.toDebugString
    debugString = str(debugString).splitlines()

    featuresImportance = list(regressor.featureImportances)
    featuresImportance = [round(x, 4) for x in featuresImportance]
    # enumerate keeps duplicate importance values on distinct keys (list.index() would not)
    featuresImportanceDict = {}
    for index, importance in enumerate(featuresImportance):
        featuresImportanceDict[index] = importance
    featuresImportanceDictWithName = pUtil.summaryTable(
        featuresName=idNameFeaturesOrdered,
        featuresStat=featuresImportanceDict)

    trainDataMetrics["No Trees"] = noTrees
    trainDataMetrics["Total Nodes"] = totalNoNodes
    summaryStats = {
        'noTrees': noTrees,
        'treeWeights': treeWeights,
        'totalNodes': totalNoNodes,
        'featuresImportance': featuresImportanceDictWithName,
        'metrics': trainDataMetrics,
        'debugString': debugString,
    }

    graphDataInfo = self.createGraphData(regressor, regressionInfo, etlStats)

    response = {
        PredictiveConstants.STATDATA: summaryStats,
        PredictiveConstants.GRAPHDATA: graphDataInfo
    }
    return response
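# --- Standalone sketch (not part of the pipeline above) ---
# A minimal standalone version of the RegressionEvaluator loop used above, run on a toy
# DataFrame that already holds the label and prediction columns. The column names "label"
# and "prediction" are placeholders for labelColm / modelName.
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator

spark = SparkSession.builder.master("local[*]").appName("regressionMetricsSketch").getOrCreate()

scored = spark.createDataFrame(
    [(3.0, 2.5), (5.0, 5.4), (7.0, 6.9)], ["label", "prediction"])

metrics = {}
for metric in ["r2", "rmse", "mse", "mae"]:
    evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName=metric)
    metrics[metric] = evaluator.evaluate(scored)
print(metrics)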
def regressionModelEvaluation(self, regressor, spark):
    import builtins
    round = getattr(builtins, 'round')

    try:
        coefficientStdErrorList = regressor.summary.coefficientStandardErrors
        coefficientStdErrorDict = {}
        statsDictName = "coefficientStdErrorDictWithName"
        coefficientStdErrorDictWithName = self.statsDict(coefficientStdErrorList,
                                                         coefficientStdErrorDict)

        pValuesList = regressor.summary.pValues
        pValuesDict = {}
        pValuesDictWithName = self.statsDict(pValuesList, pValuesDict)

        tValuesList = regressor.summary.tValues
        tValuesDict = {}
        tValuesDictWithName = self.statsDict(tValuesList, tValuesDict)

        significanceDict = {}
        for pkey, pVal in pValuesDict.items():
            if 0 <= pVal < 0.001:
                significanceDict[pkey] = '***'
            if 0.001 <= pVal < 0.01:
                significanceDict[pkey] = '**'
            if 0.01 <= pVal < 0.05:
                significanceDict[pkey] = '*'
            if 0.05 <= pVal < 0.1:
                significanceDict[pkey] = '.'
            if 0.1 <= pVal < 1:
                significanceDict[pkey] = '-'
        significanceDictWithName = PredictiveUtilities.summaryTable(
            featuresName=self.idNameFeaturesOrdered,
            featuresStat=significanceDict)
    except Exception:
        # not every regressor exposes these summary statistics (e.g. lasso)
        coefficientStdErrorDictWithName = {}
        pValuesDictWithName = {}
        tValuesDictWithName = {}
        significanceDictWithName = {}

    coefficientList = list(map(float, list(regressor.coefficients)))
    coefficientDict = {}
    coefficientDictWithName = self.statsDict(coefficientList, coefficientDict)

    # creating the table-chart data
    summaryTableChartList = []
    if self.algoName != "lasso_reg":
        for (keyOne, valueOne), valueTwo, valueThree, valueFour, valueFive in \
                zip(coefficientStdErrorDictWithName.items(),
                    coefficientDictWithName.values(),
                    pValuesDictWithName.values(),
                    tValuesDictWithName.values(),
                    significanceDictWithName.values()):
            chartList = [keyOne, valueOne, valueTwo, valueThree, valueFour, valueFive]
            summaryTableChartList.append(chartList)
        schemaSummaryTable = StructType([
            StructField("Column_Name", StringType(), True),
            StructField("std_Error", DoubleType(), True),
            StructField("coefficient", DoubleType(), True),
            StructField("P_value", DoubleType(), True),
            StructField("T_value", DoubleType(), True),
            StructField("significance", StringType(), True)
        ])
    if coefficientStdErrorDictWithName == {} or self.algoName == "lasso_reg":
        for keyOne, valueOne in coefficientDictWithName.items():
            chartList = [keyOne, valueOne]
            summaryTableChartList.append(chartList)
        schemaSummaryTable = StructType([
            StructField("Column_Name", StringType(), True),
            StructField("coefficient", DoubleType(), True)
        ])
    summaryTableChartData = spark.createDataFrame(summaryTableChartList,
                                                  schema=schemaSummaryTable)
    summaryTableChartDataFileName = PredictiveUtilities.writeToParquet(
        fileName="summaryTableChart",
        locationAddress=self.locationAddress,
        userId=self.userId,
        data=summaryTableChartData)

    # creating the equation for the regression model
    intercept = round(regressor.intercept, 4)
    equation = self.labelColm, "=", intercept, "+"
    for feature, coeff in zip(self.idNameFeaturesOrdered.values(), coefficientDict.values()):
        coeffFeature = coeff, "*", feature, "+"
        equation += coeffFeature
    equation = list(equation[:-1])

    # training summary
    trainingSummary = regressor.summary
    RMSE = round(trainingSummary.rootMeanSquaredError, 4)
    MAE = round(trainingSummary.meanAbsoluteError, 4)
    MSE = round(trainingSummary.meanSquaredError, 4)
    rSquare = round(trainingSummary.r2, 4)
    adjustedRSquare = round(trainingSummary.r2adj, 4)
    degreeOfFreedom = trainingSummary.degreesOfFreedom
    explainedVariance = round(trainingSummary.explainedVariance, 4)
    totalNumberOfFeatures = regressor.numFeatures
    residualsTraining = trainingSummary.residuals  # Spark DataFrame

    # test and training predicted vs actual graph data
    trainingPredictionAllColm = trainingSummary.predictions
    trainingPredictionActual = trainingPredictionAllColm.select(self.labelColm, self.modelSheetName)
    trainingPredictionActualGraphFileName = PredictiveUtilities.writeToParquet(
        fileName="trainingPredictedVsActual",
        locationAddress=self.locationAddress,
        userId=self.userId,
        data=trainingPredictionActual)
    testPredictionAllColm = regressor.transform(self.testData)
    testPredictionActual = testPredictionAllColm.select(self.labelColm, self.modelSheetName)
    testPredictionActualGraphFileName = PredictiveUtilities.writeToParquet(
        fileName="testPredictedVsActual",
        locationAddress=self.locationAddress,
        userId=self.userId,
        data=testPredictionActual)

    # appending the train and test datasets together (for future use only)
    trainTestMerged = trainingPredictionAllColm.union(testPredictionAllColm)
    trainTestMergedFileName = PredictiveUtilities.writeToParquet(
        fileName="trainTestMerged",
        locationAddress=self.locationAddress,
        userId=self.userId,
        data=trainTestMerged)

    # residuals vs fitted graph
    residualsPredictiveDataTraining = PredictiveUtilities.residualsFittedGraph(
        residualsData=residualsTraining,
        predictionData=trainingPredictionActual,
        modelSheetName=self.modelSheetName)
    residualsVsFittedGraphFileName = PredictiveUtilities.writeToParquet(
        fileName="residualsVsFitted",
        locationAddress=self.locationAddress,
        userId=self.userId,
        data=residualsPredictiveDataTraining)

    # scale-location plot
    sqrtStdResiduals = PredictiveUtilities.scaleLocationGraph(
        label=self.labelColm,
        predictionTargetData=trainingPredictionActual,
        residualsData=residualsTraining,
        modelSheetName=self.modelSheetName)
    scaleLocationGraphFileName = PredictiveUtilities.writeToParquet(
        fileName="scaleLocation",
        locationAddress=self.locationAddress,
        userId=self.userId,
        data=sqrtStdResiduals)

    # quantile-quantile plot
    quantileQuantileData = PredictiveUtilities.quantileQuantileGraph(
        residualsData=residualsTraining,
        spark=spark)
    quantileQuantileGraphFileName = PredictiveUtilities.writeToParquet(
        fileName="quantileQuantile",
        locationAddress=self.locationAddress,
        userId=self.userId,
        data=quantileQuantileData)

    # dictionaries for the graph data and summary stats
    graphNameDict = {
        PredictiveConstants.RESIDUALSVSFITTEDGRAPHFILENAME: residualsVsFittedGraphFileName,
        PredictiveConstants.SCALELOCATIONGRAPHFILENAME: scaleLocationGraphFileName,
        PredictiveConstants.QUANTILEQUANTILEGRAPHFILENAME: quantileQuantileGraphFileName,
        PredictiveConstants.TRAININGPREDICTIONACTUALFILENAME: trainingPredictionActualGraphFileName,
        PredictiveConstants.TESTPREDICTIONACTUALFILENAME: testPredictionActualGraphFileName
    }
    summaryStats = {
        PredictiveConstants.RMSE: RMSE,
        PredictiveConstants.MSE: MSE,
        PredictiveConstants.MAE: MAE,
        PredictiveConstants.RSQUARE: rSquare,
        PredictiveConstants.ADJRSQUARE: adjustedRSquare,
        PredictiveConstants.INTERCEPT: intercept,
        PredictiveConstants.DOF: degreeOfFreedom,
        PredictiveConstants.EXPLAINEDVARIANCE: explainedVariance,
        PredictiveConstants.TOTALFEATURES: totalNumberOfFeatures
    }
    summaryTable = {"summaryTableChartDataFileName": summaryTableChartDataFileName}
    response = {
        PredictiveConstants.GRAPHDATA: graphNameDict,
        PredictiveConstants.STATDATA: summaryStats,
        PredictiveConstants.TABLEDATA: summaryTable,
        PredictiveConstants.EQUATION: equation
    }
    return response
def randomGradientRegressionModelEvaluation(self, regressor):
    trainPredictedData = regressor.transform(self.trainData)
    testPredictedData = regressor.transform(self.testData)

    from pyspark.ml.evaluation import RegressionEvaluator
    metricsList = ['r2', 'rmse', 'mse', 'mae']
    trainDataMetrics = {}
    metricName = ''
    for metric in metricsList:
        if metric == "r2":
            metricName = PredictiveConstants.RSQUARE
        elif metric == "rmse":
            metricName = PredictiveConstants.RMSE
        elif metric == "mse":
            metricName = PredictiveConstants.MSE
        elif metric == "mae":
            metricName = PredictiveConstants.MAE
        evaluator = RegressionEvaluator(labelCol=self.labelColm,
                                        predictionCol=self.modelSheetName,
                                        metricName=metric)
        metricValue = evaluator.evaluate(trainPredictedData)
        trainDataMetrics[metricName] = metricValue

    # training actual vs predicted dataset
    trainingPredictionActual = trainPredictedData.select(self.labelColm, self.modelSheetName)
    trainingPredictionActualGraphFileName = PredictiveUtilities.writeToParquet(
        fileName="trainingPredictedVsActualEnsemble",
        locationAddress=self.locationAddress,
        userId=self.userId,
        data=trainingPredictionActual)

    # test actual vs predicted dataset
    testPredictionActual = testPredictedData.select(self.labelColm, self.modelSheetName)
    testPredictionActualGraphFileName = PredictiveUtilities.writeToParquet(
        fileName="testPredictedVsActualEnsemble",
        locationAddress=self.locationAddress,
        userId=self.userId,
        data=testPredictionActual)

    # summary stats
    noTrees = regressor.getNumTrees
    treeWeights = regressor.treeWeights
    treeNodes = list(regressor.trees)
    totalNoNodes = regressor.totalNumNodes
    debugString = regressor.toDebugString
    debugString = str(debugString).splitlines()

    featuresImportance = list(regressor.featureImportances)
    featuresImportance = [round(x, 4) for x in featuresImportance]
    print(featuresImportance)
    # enumerate keeps duplicate importance values on distinct keys (list.index() would not)
    featuresImportanceDict = {}
    for index, importance in enumerate(featuresImportance):
        featuresImportanceDict[index] = importance
    featuresImportanceDictWithName = PredictiveUtilities.summaryTable(
        featuresName=self.idNameFeaturesOrdered,
        featuresStat=featuresImportanceDict)

    trainDataMetrics["No Trees"] = noTrees
    trainDataMetrics["Total Nodes"] = totalNoNodes
    summaryStats = {
        'noTrees': noTrees,
        'treeWeights': treeWeights,
        'totalNodes': totalNoNodes,
        'featuresImportance': featuresImportanceDictWithName,
        'metrics': trainDataMetrics,
        'debugString': debugString,
    }

    # residuals vs fitted graph data
    residualDataColm = trainingPredictionActual.withColumn(
        'residuals', col(self.labelColm) - col(self.modelSheetName))
    residualDataColm = residualDataColm.select('residuals')
    residualsPredictiveDataTraining = PredictiveUtilities.residualsFittedGraph(
        residualsData=residualDataColm,
        predictionData=trainingPredictionActual,
        modelSheetName=self.modelSheetName)
    residualsVsFittedGraphFileName = PredictiveUtilities.writeToParquet(
        fileName="residualsVsFittedEnsemble",
        locationAddress=self.locationAddress,
        userId=self.userId,
        data=residualsPredictiveDataTraining)

    graphNameDict = {
        PredictiveConstants.RESIDUALSVSFITTEDGRAPHFILENAME: residualsVsFittedGraphFileName,
        PredictiveConstants.TRAININGPREDICTIONACTUALFILENAME: trainingPredictionActualGraphFileName,
        PredictiveConstants.TESTPREDICTIONACTUALFILENAME: testPredictionActualGraphFileName
    }
    response = {
        PredictiveConstants.STATDATA: summaryStats,
        PredictiveConstants.GRAPHDATA: graphNameDict
    }
    return response
def statsDict(self, statList, statDict):
    for index, value in enumerate(statList):
        statDict[index] = round(value, 4)
    return PredictiveUtilities.summaryTable(featuresName=self.idNameFeaturesOrdered,
                                            featuresStat=statDict)
def csvToParquet(self):
    # reviewDatasetPath, colsName and spark are presumably defined at module level
    dataset = spark.read.csv(reviewDatasetPath, header=True)
    dataset = dataset.select(colsName)  # as per the requirement
    dataset = dataset.withColumnRenamed("Document Class", "Sentiment")
    dataset = dataset.withColumnRenamed("Prediction (Document Class)", "prediction_knime")
    PredictiveUtilities.writeToParquet("knimeTestDataset", "/home/fidel/Documents/", "", dataset)
def featuresSelection(self, dataset_add, feature_colm, label_colm, relation_list,
                      relation, userId, algoName, locationAddress):
    dataset = self.spark.read.parquet(dataset_add)

    # changing the relationship of the columns (log, square root, exponential)
    dataTransformationObj = PredictiveDataTransformation(dataset=dataset)
    dataset = dataTransformationObj.colmTransformation(colmTransformationList=relation_list) \
        if relation == PredictiveConstants.NON_LINEAR else dataset

    # transformation
    dataTransformationObj = PredictiveDataTransformation(dataset=dataset)
    dataTransformationResult = dataTransformationObj.dataTranform(labelColm=label_colm,
                                                                  featuresColm=feature_colm,
                                                                  userId=userId)
    dataset = dataTransformationResult.get(PredictiveConstants.DATASET)
    categoricalFeatures = dataTransformationResult.get(PredictiveConstants.CATEGORICALFEATURES)
    numericalFeatures = dataTransformationResult.get(PredictiveConstants.NUMERICALFEATURES)
    maxCategories = dataTransformationResult.get(PredictiveConstants.MAXCATEGORIES)
    categoryColmStats = dataTransformationResult.get(PredictiveConstants.CATEGORYCOLMSTATS)
    indexedFeatures = dataTransformationResult.get(PredictiveConstants.INDEXEDFEATURES)
    label = dataTransformationResult.get(PredictiveConstants.LABEL)
    idNameFeaturesOrdered = dataTransformationResult.get(PredictiveConstants.IDNAMEFEATURESORDERED)
    oneHotEncodedFeaturesList = dataTransformationResult.get(PredictiveConstants.ONEHOTENCODEDFEATURESLIST)
    indexedLabelNameDict = dataTransformationResult.get(PredictiveConstants.INDEXEDLABELNAMEDICT)
    featuresColm = dataTransformationResult.get(PredictiveConstants.VECTORFEATURES)

    # statistics
    columnListForfeaturesStats = numericalFeatures.copy()
    columnListForfeaturesStats.insert(0, label)
    dataTransformationObj = PredictiveDataTransformation(dataset=dataset)
    dataStatsResult = dataTransformationObj.dataStatistics(
        categoricalFeatures=categoricalFeatures,
        numericalFeatures=columnListForfeaturesStats,
        categoricalColmStat=categoryColmStats)
    summaryDict = dataStatsResult

    # creating the dataset for the stats-chart visualization in the feature-selection chart
    datasetForStatsChart = dataset.select(columnListForfeaturesStats)
    datasetForStatsChartFileName = PredictiveUtilities.writeToParquet(
        fileName="datasetForStatsChart",
        locationAddress=locationAddress,
        userId=userId,
        data=datasetForStatsChart)
    featuresStatsDict = {"columnsName": columnListForfeaturesStats,
                         "datasetFileName": datasetForStatsChartFileName}

    # applying the algorithm and the statistical test (pearson / chi-square)
    trainData, testData = dataset.randomSplit([0.80, 0.20], seed=40)
    keyStatsTest = ''
    statisticalTestResult = {}
    if algoName == PredictiveConstants.RANDOMREGRESSOR:
        statisticalTestObj = PredictiveStatisticalTest(dataset=dataset,
                                                       features=numericalFeatures,
                                                       labelColm=label)
        statisticalTestResult = statisticalTestObj.pearsonTest()
        randomForestModel = RandomForestRegressor(labelCol=label,
                                                  featuresCol=featuresColm,
                                                  numTrees=10)
        keyStatsTest = "pearson_test_data"
    if algoName == PredictiveConstants.RANDOMCLASSIFIER:
        statisticalTestObj = PredictiveStatisticalTest(dataset=dataset,
                                                       features=indexedFeatures,
                                                       labelColm=label)
        statisticalTestResult = statisticalTestObj.chiSquareTest(
            categoricalFeatures=categoricalFeatures,
            maxCategories=maxCategories)
        randomForestModel = RandomForestClassifier(labelCol=label,
                                                   featuresCol=featuresColm,
                                                   numTrees=10)
        keyStatsTest = "ChiSquareTestData"
    randomForestModelFit = randomForestModel.fit(trainData)
    # predictions = randomForestModelFit.transform(testData)
    print(randomForestModelFit.featureImportances)

    import builtins
    round = getattr(builtins, 'round')
    featuresImportance = list(randomForestModelFit.featureImportances)
    featuresImportance = [round(x, 4) for x in featuresImportance]
    # enumerate keeps duplicate importance values on distinct keys (list.index() would not)
    featuresImportanceDict = {}
    for index, importance in enumerate(featuresImportance):
        featuresImportanceDict[index] = round(importance, 4)
    featuresImportanceDictWithName = PredictiveUtilities.summaryTable(
        featuresName=idNameFeaturesOrdered,
        featuresStat=featuresImportanceDict)

    featuresColmList = idNameFeaturesOrdered
    feat = []
    for val in featuresColmList.values():
        feat.append(val)
    feature_imp = {PredictiveConstants.FEATURE_IMPORTANCE: featuresImportance,
                   "feature_column": feat}
    response_dict = {
        PredictiveConstants.FEATURE_IMPORTANCE: feature_imp,
        keyStatsTest: statisticalTestResult,
        'summaryDict': summaryDict,
        'categoricalSummary': categoryColmStats,
        "featuresImportanceDict": featuresImportanceDictWithName,
        "featuresStatsDict": featuresStatsDict
    }
    return response_dict