def __init__(self, trainDataRatio, dataset_add, feature_colm, label_colm,
             relation_list, relation, userId, locationAddress, algoName, spark):
    self.trainDataRatio = trainDataRatio
    self.datasetAdd = dataset_add
    self.featuresColmList = feature_colm
    self.labelColmList = label_colm
    self.relationshipList = relation_list
    self.relation = relation
    self.userId = userId
    self.locationAddress = locationAddress
    self.algoName = algoName
    self.spark = spark

    # only for the ETL part of the dataset
    self.predictiveUtilitiesObj = PredictiveUtilities()
    ETLOnDatasetStats = self.predictiveUtilitiesObj.ETLOnDataset(
        datasetAdd=self.datasetAdd,
        featuresColmList=self.featuresColmList,
        labelColmList=self.labelColmList,
        relationshipList=self.relationshipList,
        relation=self.relation,
        trainDataRatio=self.trainDataRatio,
        spark=spark,
        userId=userId)
    self.dataset = ETLOnDatasetStats.get("dataset")
    self.featuresColm = ETLOnDatasetStats.get("featuresColm")
    self.labelColm = ETLOnDatasetStats.get("labelColm")
    self.trainData = ETLOnDatasetStats.get("trainData")
    self.testData = ETLOnDatasetStats.get("testData")
    self.idNameFeaturesOrdered = ETLOnDatasetStats.get("idNameFeaturesOrdered")
def createGraphData(self, regressor, regressionInfo, etlStats):
    # data from regressionInfo
    modelId = regressionInfo.get(PredictiveConstants.MODELID)
    locationAddress = regressionInfo.get(PredictiveConstants.LOCATIONADDRESS)
    modelName = regressionInfo.get(PredictiveConstants.MODELSHEETNAME)
    spark = regressionInfo.get(PredictiveConstants.SPARK)

    # data from the ETL stats
    labelColm = etlStats.get(PredictiveConstants.LABELCOLM)
    trainData = etlStats.get(PredictiveConstants.TRAINDATA)
    testData = etlStats.get(PredictiveConstants.TESTDATA)

    trainPredictedData = regressor.transform(trainData)
    testPredictedData = regressor.transform(testData)

    # training actual vs predicted dataset
    trainingPredictionActual = trainPredictedData.select(labelColm, modelName)
    trainingPredictionActualGraphFileName = pUtil.writeToParquet(
        fileName="trainingPredictedVsActualEnsemble",
        locationAddress=locationAddress,
        userId=modelId,
        data=trainingPredictionActual)

    # test actual vs predicted dataset
    testPredictionActual = testPredictedData.select(labelColm, modelName)
    testPredictionActualGraphFileName = pUtil.writeToParquet(
        fileName="testPredictedVsActualEnsemble",
        locationAddress=locationAddress,
        userId=modelId,
        data=testPredictionActual)

    # residuals vs fitted graph data
    residualDataColm = trainingPredictionActual.withColumn(
        'residuals', col(labelColm) - col(modelName))
    residualDataColm = residualDataColm.select('residuals')
    residualsPredictiveDataTraining = pUtil.residualsFittedGraph(
        residualsData=residualDataColm,
        predictionData=trainingPredictionActual,
        modelSheetName=modelName,
        spark=spark)
    residualsVsFittedGraphFileName = pUtil.writeToParquet(
        fileName="residualsVsFittedEnsemble",
        locationAddress=locationAddress,
        userId=modelId,
        data=residualsPredictiveDataTraining)

    graphNameDict = {
        PredictiveConstants.RESIDUALSVSFITTEDGRAPHFILENAME: residualsVsFittedGraphFileName,
        PredictiveConstants.TRAININGPREDICTIONACTUALFILENAME: trainingPredictionActualGraphFileName,
        PredictiveConstants.TESTPREDICTIONACTUALFILENAME: testPredictionActualGraphFileName
    }
    return graphNameDict
def sentimentAnalysis(self, infoData):
    infoData = self.textProcessing(infoData)
    infoData = self.viveknSentimentAnalysis(infoData)
    storagePath = infoData.get(pc.STORAGELOCATION)
    modelName = infoData.get(pc.MODELSHEETNAME)
    modelPath = storagePath + modelName

    # write the info data to a JSON file for later use in sentiment prediction;
    # drop the objects that are not JSON-serializable first
    infoData.pop(pc.SPARK, "None")
    infoData.pop(pc.DATASET, "None")
    infoData.pop(pc.TESTDATA, "None")
    pu.writeToJson(modelPath, infoData)
    print("success")
def __init__(self, trainDataRatio, dataset_add, feature_colm, label_colm,
             relation_list, relation, userId, locationAddress,
             modelStorageLocation, algoName, modelSheetName, datasetName, spark):
    self.trainDataRatio = trainDataRatio
    self.datasetAdd = dataset_add
    self.featuresColmList = feature_colm
    self.labelColmList = label_colm
    self.relationshipList = relation_list
    self.relation = relation
    self.userId = userId
    self.locationAddress = locationAddress
    self.modelStorageLocation = modelStorageLocation
    self.algoName = algoName
    self.modelSheetName = "prediction_" + modelSheetName
    self.datasetName = datasetName
    self.spark = spark

    # only for the ETL part of the dataset
    ETLOnDatasetStats = PredictiveUtilities.ETLOnDataset(
        datasetAdd=self.datasetAdd,
        featuresColmList=self.featuresColmList,
        labelColmList=self.labelColmList,
        relationshipList=self.relationshipList,
        relation=self.relation,
        trainDataRatio=self.trainDataRatio,
        spark=spark,
        userId=userId)
    self.dataset = ETLOnDatasetStats.get("dataset")
    self.featuresColm = ETLOnDatasetStats.get("featuresColm")
    self.indexedFeatures = ETLOnDatasetStats.get("indexedFeatures")
    self.oneHotEncodedFeaturesList = ETLOnDatasetStats.get("oneHotEncodedFeaturesList")
def sentimentData(self, sentimentDataInfo):
    sentimentDataInfo = self.sentimentAnalysis(sentimentDataInfo)
    sentimentDataInfo = self.trainModel(sentimentDataInfo)
    sentimentDataInfo = self.invertIndexColm(sentimentDataInfo)
    modelName = sentimentDataInfo.get(pc.MODELSHEETNAME)
    storagePath = sentimentDataInfo.get(pc.STORAGELOCATION)
    jsonStorageLocation = storagePath + modelName

    # TODO (sahil): storing the info data as JSON; move this into a separate method.
    # Drop the objects that are not JSON-serializable before writing.
    sentimentDataInfo.pop(pc.SPARK, "None")
    sentimentDataInfo.pop(pc.DATASET, "None")
    sentimentDataInfo.pop(pc.TESTDATA, "None")
    sentimentDataInfo.pop(pc.TRAINDATA, "None")
    sentimentDataInfo.pop(pc.MODEL, "None")
    # json.dump(sentimentDataInfo, open(storagePath + modelName + ".json", 'w'))
    pu.writeToJson(jsonStorageLocation, sentimentDataInfo)
def mergePosNegDataset(self, infoData):
    positiveDatasetPath = infoData.get(pc.POSITIVEDATASETPATH)
    negativeDatasetPath = infoData.get(pc.NEGATIVEDATASETPATH)
    spark = infoData.get(pc.SPARK)

    # positive dictionary dataset -- call the convert-to-string method from here
    positiveDataset = spark.read.format("com.databricks.spark.csv") \
        .option("header", "true").option("inferSchema", "true").load(positiveDatasetPath)
    positiveDataset = positiveDataset.select(positiveDataset.columns[:1])
    positiveColName = str(positiveDataset.schema.names[0])
    positiveDataset = positiveDataset.withColumnRenamed(positiveColName, pc.DMXDICTIONARYCOLNAME)
    positiveDataset = positiveDataset.withColumn(pc.DMXSENTIMENT, lit(pc.DMXPOSITIVE)) \
        .select(pc.DMXDICTIONARYCOLNAME, pc.DMXSENTIMENT)

    # negative dictionary dataset
    negativeDataset = spark.read.format("com.databricks.spark.csv") \
        .option("header", "true").option("inferSchema", "true").load(negativeDatasetPath)
    negativeDataset = negativeDataset.select(negativeDataset.columns[:1])
    negativeColName = str(negativeDataset.schema.names[0])
    negativeDataset = negativeDataset.withColumnRenamed(negativeColName, pc.DMXDICTIONARYCOLNAME)
    negativeDataset = negativeDataset.withColumn(pc.DMXSENTIMENT, lit(pc.DMXNEGATIVE)) \
        .select(pc.DMXDICTIONARYCOLNAME, pc.DMXSENTIMENT)

    # column names must match before appending the negative dataset to the positive one
    posNegDataset = positiveDataset.union(negativeDataset)
    posNegDataset = pu.addInternalId(posNegDataset)
    return posNegDataset
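# --- Standalone sketch (not part of the pipeline above) ---
# A minimal illustration of the rename/tag/union pattern mergePosNegDataset applies to the
# positive and negative dictionary files. The column names "word" and "sentiment" are
# placeholders for the pc.* constants used above, and the data is a toy example.
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

spark = SparkSession.builder.master("local[*]").appName("posNegUnionSketch").getOrCreate()

# toy positive and negative word lists with differing column names
positive = spark.createDataFrame([("good",), ("great",)], ["PosWord"])
negative = spark.createDataFrame([("bad",), ("awful",)], ["NegWord"])

# align the column names, tag each row with its sentiment, then union the two datasets
positive = positive.withColumnRenamed("PosWord", "word").withColumn("sentiment", lit("positive"))
negative = negative.withColumnRenamed("NegWord", "word").withColumn("sentiment", lit("negative"))
posNeg = positive.union(negative)
posNeg.show()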
def __init__(self, trainDataRatio, dataset_add, feature_colm, label_colm,
             relation_list, relation, userId, locationAddress, algoName,
             modelSheetName, spark):
    self.trainDataRatio = trainDataRatio
    self.datasetAdd = dataset_add
    self.featuresColmList = feature_colm
    self.labelColmList = label_colm
    self.relationshipList = relation_list
    self.relation = relation
    self.userId = userId
    self.locationAddress = locationAddress
    self.algoName = algoName
    self.modelSheetName = PredictiveConstants.PREDICTION_ + modelSheetName
    # self.spark = spark

    # only for the ETL part of the dataset
    ETLOnDatasetStats = PredictiveUtilities.ETLOnDataset(
        datasetAdd=self.datasetAdd,
        featuresColmList=self.featuresColmList,
        labelColmList=self.labelColmList,
        relationshipList=self.relationshipList,
        relation=self.relation,
        trainDataRatio=self.trainDataRatio,
        spark=spark,
        userId=userId)
    self.dataset = ETLOnDatasetStats.get(PredictiveConstants.DATASET)
    self.featuresColm = ETLOnDatasetStats.get(PredictiveConstants.FEATURESCOLM)
    self.labelColm = ETLOnDatasetStats.get(PredictiveConstants.LABELCOLM)
    self.trainData = ETLOnDatasetStats.get(PredictiveConstants.TRAINDATA)
    self.testData = ETLOnDatasetStats.get(PredictiveConstants.TESTDATA)
    self.idNameFeaturesOrdered = ETLOnDatasetStats.get(PredictiveConstants.IDNAMEFEATURESORDERED)
def getFeatureImportance(self, randomForestModelFit, idNameFeaturesOrdered):
    import builtins
    round = getattr(builtins, 'round')

    featuresImportance = list(randomForestModelFit.featureImportances)
    featuresImportance = [round(x, 4) for x in featuresImportance]
    # use enumerate rather than list.index(): index() returns the first match,
    # so duplicate importance values would all collapse onto the same key
    featuresImportanceDict = {}
    for index, importance in enumerate(featuresImportance):
        featuresImportanceDict[index] = round(importance, 4)
    featuresImportanceDictWithName = PredictiveUtilities.summaryTable(
        featuresName=idNameFeaturesOrdered,
        featuresStat=featuresImportanceDict)

    featuresColmList = idNameFeaturesOrdered
    feat = []
    for val in featuresColmList.values():
        feat.append(val)
    feature_imp = {
        PredictiveConstants.FEATURE_IMPORTANCE: featuresImportance,
        "feature_column": feat
    }
    featuresImportanceData = {
        PredictiveConstants.FEATURE_IMPORTANCE: feature_imp,
        PredictiveConstants.FEATURESIMPORTANCEDICT: featuresImportanceDictWithName
    }
    return featuresImportanceData
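# --- Standalone sketch (not part of the pipeline above) ---
# A minimal illustration of how featureImportances from a fitted random forest lines up with
# the assembled feature order, which is what summaryTable / idNameFeaturesOrdered relies on.
# The feature names "f1" and "f2" and the toy data are placeholders.
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor

spark = SparkSession.builder.master("local[*]").appName("featureImportanceSketch").getOrCreate()

df = spark.createDataFrame(
    [(1.0, 10.0, 5.0), (2.0, 20.0, 9.0), (3.0, 30.0, 13.0), (4.0, 40.0, 17.0)],
    ["f1", "f2", "label"])
assembled = VectorAssembler(inputCols=["f1", "f2"], outputCol="features").transform(df)

model = RandomForestRegressor(labelCol="label", featuresCol="features", numTrees=5).fit(assembled)

# featureImportances is a vector aligned with the assembler's input column order
importances = [round(x, 4) for x in model.featureImportances.toArray().tolist()]
print(dict(zip(["f1", "f2"], importances)))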
def getExtractedLemmaData(self, dataset, indexList, lemmatizedList):
    # build a small DataFrame of (index, lemmatized text) and join it back on the internal id;
    # `spark` here is presumably the module-level SparkSession set elsewhere in this module
    zipData = zip(indexList, lemmatizedList)
    columnList = [pc.DMXINDEX, pc.DMXLEMMATIZED]
    pandasDataframe = pd.DataFrame(zipData, columns=columnList)
    lemmatizedDataset = spark.createDataFrame(pandasDataframe)
    dataset = dataset.drop(pc.DMXLEMMATIZED)
    dataset = pu.joinDataset(dataset, lemmatizedDataset, pc.DMXINDEX)
    return dataset
def sentimentAnalysis(self, sentimentInfoData):
    spark = sentimentInfoData.get(pc.SPARK)
    datasetPath = sentimentInfoData.get(pc.SENTIMENTDATASETPATH)
    dataset = spark.read.parquet(datasetPath)
    dataset = pu.addInternalId(dataset)
    sentimentInfoData.update({pc.DATASET: dataset})
    isNgram = sentimentInfoData.get(pc.ISNGRAM)

    sentimentDataset = self.textPreProcessing(sentimentInfoData)

    # do the one-hot encoding after that
    textProcessing = TextProcessing()
    sentimentDataset = textProcessing.lemmatization(sentimentDataset, pc.DMXSTOPWORDS)
    # sentimentDataset = textProcessing.sparkLemmatizer(sentimentDataset, pc.DMXSTOPWORDS)
    if isNgram:
        ngramPara = sentimentInfoData.get(pc.NGRAMPARA)
        sentimentDataset = textProcessing.ngrams(sentimentDataset, pc.DMXLEMMATIZED, ngramPara)  # with n-grams

    modelName = sentimentInfoData.get(pc.MODELSHEETNAME)
    labelColm = sentimentInfoData.get(pc.LABELCOLM)
    indexedColm = pc.INDEXED_ + labelColm
    encodedColm = pc.ONEHOTENCODED_ + pc.DMXLEMMATIZED
    featuresColm = modelName + pc.DMXFEATURE
    sentimentInfoData.update({
        pc.COLMTOENCODE: pc.DMXLEMMATIZED,
        pc.DATASET: sentimentDataset,
        pc.COLMTOINDEX: labelColm,
        pc.INDEXEDCOLM: indexedColm,
        pc.ENCODEDCOLM: encodedColm,
        pc.COLMTOVECTORIZED: encodedColm,
        pc.FEATURESCOLM: featuresColm,
        pc.ORIGINALCOLMNAME: labelColm
    })
    sentimentInfoData = pu.stringIndexer(sentimentInfoData)  # after this we have the indexed label
    sentimentInfoData = pu.countVectorizer(sentimentInfoData)  # using the lemmatized column for now
    if isNgram:
        sentimentInfoData.update({
            pc.COLMTOENCODE: pc.DMXNGRAMS,
            pc.ENCODEDCOLM: pc.ONEHOTENCODED_ + pc.DMXNGRAMS
        })
        sentimentInfoData = pu.countVectorizer(sentimentInfoData)
        sentimentInfoData.update({
            pc.COLMTOVECTORIZED: [pc.ONEHOTENCODED_ + pc.DMXLEMMATIZED,
                                  pc.ONEHOTENCODED_ + pc.DMXNGRAMS]
        })
    sentimentInfoData = pu.featureAssembler(sentimentInfoData)  # creating the feature vector
    return sentimentInfoData
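# --- Standalone sketch (not part of the pipeline above) ---
# The textProcessing.ngrams helper is not shown in this section; Spark ML's NGram transformer
# is one plausible building block for it. A minimal sketch, assuming an already-tokenized
# column; "tokens" and "ngrams" are placeholder column names.
from pyspark.sql import SparkSession
from pyspark.ml.feature import NGram

spark = SparkSession.builder.master("local[*]").appName("ngramSketch").getOrCreate()

df = spark.createDataFrame([(0, ["the", "movie", "was", "great"])], ["id", "tokens"])

# n plays the role the ngramPara value plays above
ngram = NGram(n=2, inputCol="tokens", outputCol="ngrams")
ngram.transform(df).select("ngrams").show(truncate=False)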
def writeDataset(self, dataset, infoData):
    storageLocation = infoData.get(pc.STORAGELOCATION)
    modelName = infoData.get(pc.MODELNAME)
    userId = infoData.get(pc.USERID)
    """
    Write the dataset if it does not exist; if it does, append the new data to it.
    Keep the datasetID information, and the conversationID should be unique within the dataset.
    """
    datasetInfo = pu.writeToParquet(modelName, storageLocation, userId, dataset)
    return datasetInfo
def prediction(self, infoData):
    isNgram = infoData.get(pc.ISNGRAM)
    isNgram = False if isNgram is None else isNgram
    predictionColm = infoData.get(pc.PREDICTIONCOLM)
    algoName = infoData.get(pc.ALGORITHMNAME)
    modelStorageLocation = infoData.get(pc.MODELSTORAGELOCATION)
    spark = infoData.get(pc.SPARK)
    datasetPath = infoData.get(pc.SENTIMENTDATASETPATH)

    originalDataset = spark.read.parquet(datasetPath)
    originalDataset = pu.addInternalId(originalDataset)
    infoData.update({pc.DATASET: originalDataset})
    infoData = self.dataTransformation(infoData)

    dataset = infoData.get(pc.DATASET)
    if isNgram:
        # TODO (sahil): handle a missing n-gram parameter at data-creation time
        textProcessing = TextProcessing()
        ngramPara = infoData.get(pc.NGRAMPARA)
        dataset = textProcessing.ngrams(dataset, pc.DMXLEMMATIZED, ngramPara)

    # TODO (sahil): algorithm names are hard-coded for comparison; handle this when finalising
    if algoName == "GradientBoostClassifier":
        predictionModel = GBTClassificationModel.load(modelStorageLocation)
    elif algoName == "DecisionTreeClassifier":
        predictionModel = DecisionTreeClassificationModel.load(modelStorageLocation)

    dataset = dataset.drop(predictionColm)
    originalDataset = originalDataset.drop(predictionColm)
    dataset = predictionModel.transform(dataset)

    # call the indexToString method after the prediction
    infoData.update({pc.DATASET: dataset})
    infoData = self.invertIndex(infoData)
    dataset = infoData.get(pc.DATASET)
    dataset = dataset.select(pc.DMXINDEX, predictionColm)
    finalDataset = pu.joinDataset(originalDataset, dataset, pc.DMXINDEX)
    return finalDataset
class PredictiveClassificationModel():
    def __init__(self, trainDataRatio, dataset_add, feature_colm, label_colm,
                 relation_list, relation, userId, locationAddress, algoName, spark):
        self.trainDataRatio = trainDataRatio
        self.datasetAdd = dataset_add
        self.featuresColmList = feature_colm
        self.labelColmList = label_colm
        self.relationshipList = relation_list
        self.relation = relation
        self.userId = userId
        self.locationAddress = locationAddress
        self.algoName = algoName
        self.spark = spark

        # only for the ETL part of the dataset
        self.predictiveUtilitiesObj = PredictiveUtilities()
        ETLOnDatasetStats = self.predictiveUtilitiesObj.ETLOnDataset(
            datasetAdd=self.datasetAdd,
            featuresColmList=self.featuresColmList,
            labelColmList=self.labelColmList,
            relationshipList=self.relationshipList,
            relation=self.relation,
            trainDataRatio=self.trainDataRatio,
            spark=spark,
            userId=userId)
        self.dataset = ETLOnDatasetStats.get("dataset")
        self.featuresColm = ETLOnDatasetStats.get("featuresColm")
        self.labelColm = ETLOnDatasetStats.get("labelColm")
        self.trainData = ETLOnDatasetStats.get("trainData")
        self.testData = ETLOnDatasetStats.get("testData")
        self.idNameFeaturesOrdered = ETLOnDatasetStats.get("idNameFeaturesOrdered")

    def classificationModelStat(self, classifier):
        trainingSummary = classifier.summary

    def logisticRegression(self):
        # family can be "auto", "multinomial" or "binomial"
        logisticRegressionModelFit = LogisticRegression(
            featuresCol=self.featuresColm,
            labelCol=self.labelColm,
            maxIter=5,
            regParam=0.1,
            elasticNetParam=1.0,
            threshold=0.3,
            family="auto")
        classifier = logisticRegressionModelFit.fit(self.trainData)

    def randomForestClassifierModel(self):
        randomForestClassifierModelFit = RandomForestClassifier(
            labelCol=self.labelColm,
            featuresCol=self.featuresColm,
            numTrees=10)
        classifier = randomForestClassifierModelFit.fit(self.trainData)
def invertIndexColm(self, infoData):
    originalColm = infoData.get(pc.COLMTOINDEX)
    stringIndexerPath = (infoData.get(pc.INDEXERPATHMAPPING)).get(originalColm)
    inverterColm = infoData.get(pc.PREDICTIONCOLM)
    testDataset = infoData.get(pc.TESTDATA)
    trainDataset = infoData.get(pc.TRAINDATA)
    infoData.update({
        pc.INDEXERPATH: stringIndexerPath,
        pc.COLMTOINVERT: inverterColm
    })
    """
    Run the index-to-string step on the train and test datasets separately,
    since the results need to be shown to the user per split.
    """
    infoData.update({pc.DATASET: trainDataset})
    trainDataset = pu.indexToString(infoData)
    infoData.update({pc.DATASET: testDataset})
    testDataset = pu.indexToString(infoData)
    infoData.update({pc.TRAINDATA: trainDataset, pc.TESTDATA: testDataset})
    return infoData
def invertIndex(self, infoData):
    originalColName = infoData.get(pc.ORIGINALCOLMNAME)
    indexerPath = (infoData.get(pc.INDEXERPATHMAPPING)).get(originalColName)
    infoData.update({pc.INDEXERPATH: indexerPath})
    dataset = pu.indexToString(infoData)
    infoData.update({pc.DATASET: dataset})
    """
    Leftover debug export, kept for reference:
    datasetTest = datasetTest.select("Text", "Sentiment", "prediction_knime", predictionColm)
    datasetTest.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv") \
        .option("header", "true").csv("/home/fidel/Documents/decisionTreeKNIMEPrediction.csv")
    """
    return infoData
def countVectorizer(self, infoData):
    originalColName = infoData.get(pc.ORIGINALCOLMNAME)
    dataset = infoData.get(pc.DATASET)
    oneHotEncoderMapping = infoData.get(pc.ONEHOTENCODERPATHMAPPING)
    countVectorizerPath = oneHotEncoderMapping.get(originalColName)
    countVectorizer = CountVectorizerModel.load(countVectorizerPath)
    encodedColmName = infoData.get(pc.ENCODEDCOLM)
    dataset = dataset.drop(encodedColmName)
    dataset = countVectorizer.transform(dataset)
    infoData.update({pc.DATASET: dataset})
    infoData = pu.featureAssembler(infoData)
    return infoData
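# --- Standalone sketch (not part of the pipeline above) ---
# A minimal sketch of the persist-and-reload pattern this method relies on: fit a
# CountVectorizer at training time, save the model, then load the same vocabulary by path at
# prediction time (the paths stored in ONEHOTENCODERPATHMAPPING above). The column names and
# the temp path here are placeholders.
import os
import tempfile
from pyspark.sql import SparkSession
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel

spark = SparkSession.builder.master("local[*]").appName("countVectorizerSketch").getOrCreate()

df = spark.createDataFrame([(0, ["good", "movie"]), (1, ["bad", "movie"])], ["id", "tokens"])

# fit once at training time and persist the fitted model
cvPath = os.path.join(tempfile.mkdtemp(), "cvModel")
cvModel = CountVectorizer(inputCol="tokens", outputCol="tokenCounts").fit(df)
cvModel.write().overwrite().save(cvPath)

# reload the same vocabulary at prediction time and transform new data
loaded = CountVectorizerModel.load(cvPath)
loaded.transform(df).select("tokens", "tokenCounts").show(truncate=False)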
def createTaggedDataset(self, dataset, indexList, taggedRowList, positiveNum,
                        negativeNum, totalNum, sentimentScores):
    zipData = zip(indexList, taggedRowList, positiveNum, negativeNum, totalNum, sentimentScores)
    columnList = [
        pc.DMXINDEX, pc.DMXTAGGEDCOLM, pc.POSITIVENUM,
        pc.NEGATIVENUM, pc.TOTALWORDS, pc.SENTIMENTSCORE
    ]
    pandasDataframe = pd.DataFrame(zipData, columns=columnList)
    taggedDataset = self.spark.createDataFrame(pandasDataframe)
    dataset = pu.joinDataset(dataset, taggedDataset, pc.DMXINDEX)
    # For now the neutral-sentiment rows are not dropped; whether to drop them is still to be decided.
    # dataset = self.dropNeutral(dataset)
    dataset = self.performSentimentAnalysis(dataset)
    return dataset
def createTaggedDataset(self, dataset, indexList, taggedRowList, positiveNum,
                        negativeNum, totalNum, sentimentScores):
    zipData = zip(indexList, taggedRowList, positiveNum, negativeNum, totalNum, sentimentScores)
    columnList = [
        pc.DMXINDEX, pc.DMXTAGGEDCOLM, pc.POSITIVENUM,
        pc.NEGATIVENUM, pc.TOTALWORDS, pc.SENTIMENTSCORE
    ]
    pandasDataframe = pd.DataFrame(zipData, columns=columnList)
    taggedDataset = spark.createDataFrame(pandasDataframe)
    dataset = PredictiveUtilities.joinDataset(dataset, taggedDataset, pc.DMXINDEX)
    dataset = self.dropNeutral(dataset)
    dataset = self.performSentimentAnalysis(dataset)
    return dataset
def textAnalytics(self, infoData):
    sparkSession = infoData.get(pc.SPARK)
    global spark
    spark = sparkSession
    datasetPath = infoData.get(pc.DATASETPATH)
    try:
        dataset = spark.read.parquet(datasetPath)
    except Exception:
        dataset = spark.read.csv(datasetPath, header=True)
    dataset = pu.addInternalId(dataset)
    infoData.update({pc.DATASET: dataset})
    # the textPreProcessing method below is shared with sentiment analysis;
    # make sure it stays common to both sentiment and text analytics
    dataset = self.textPreProcessing(infoData)
    # after that, use the TF-IDF method for finding term frequencies and related stats
    clusteredDataset = self.calTFIDF(dataset, pc.DMXLEMMATIZED)
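# --- Standalone sketch (not part of the pipeline above) ---
# calTFIDF is not shown in this section; one plausible implementation uses Spark ML's
# HashingTF followed by IDF on the lemmatized token column. A minimal sketch; the column
# names and data are placeholders.
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF

spark = SparkSession.builder.master("local[*]").appName("tfidfSketch").getOrCreate()

df = spark.createDataFrame(
    [(0, ["spark", "text", "analytics"]), (1, ["spark", "sentiment"])],
    ["id", "lemmatized"])

# term frequencies via hashing, then IDF weighting on top of them
tf = HashingTF(inputCol="lemmatized", outputCol="rawFeatures", numFeatures=1 << 10).transform(df)
idfModel = IDF(inputCol="rawFeatures", outputCol="tfidf").fit(tf)
idfModel.transform(tf).select("lemmatized", "tfidf").show(truncate=False)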
def sentimentAnalysis(self, sentimentInfoData):
    '''
    Requirements:
    1. the column name containing the sentiment sentences
    2. the sentiment dataset parquet path
    3. the positive dictionary parquet path
    4. the negative dictionary parquet path
    5. the positive and negative dictionary column names containing the words/sentiment
    '''
    sparkSession = sentimentInfoData.get(pc.SPARK)
    global spark
    spark = sparkSession
    datasetPath = sentimentInfoData.get(pc.SENTIMENTDATASETPATH)
    dataset = spark.read.parquet(datasetPath)
    dataset = pu.addInternalId(dataset)
    sentimentInfoData.update({pc.DATASET: dataset})
    sentimentDataset = self.textPreProcessing(sentimentInfoData)
    textProcessing = TextProcessing()
    posNegDataset = textProcessing.mergePosNegDataset(sentimentInfoData)
    dataset = self.addTag(sentimentDataset, pc.DMXSTEMMEDWORDS, posNegDataset)
def featureStats(self, etlStats, predictiveData):
    numericalFeatures = etlStats.get(PredictiveConstants.NUMERICALFEATURES)
    label = etlStats.get(PredictiveConstants.LABELCOLM)
    dataset = etlStats.get(PredictiveConstants.DATASET)
    categoricalFeatures = etlStats.get(PredictiveConstants.CATEGORICALFEATURES)
    categoryColmStats = etlStats.get(PredictiveConstants.CATEGORYCOLMSTATS)
    locationAddress = predictiveData.get(PredictiveConstants.LOCATIONADDRESS)
    featureId = predictiveData.get(PredictiveConstants.MODELID)

    # statistics
    columnListForfeaturesStats = numericalFeatures.copy()
    columnListForfeaturesStats.insert(0, label)
    dataTransformationObj = PredictiveDataTransformation(dataset=dataset)
    dataStatsResult = dataTransformationObj.dataStatistics(
        categoricalFeatures=categoricalFeatures,
        numericalFeatures=columnListForfeaturesStats,
        categoricalColmStat=categoryColmStats)
    summaryDict = dataStatsResult

    # dataset for the stats-chart visualization in the feature-selection chart
    datasetForStatsChart = dataset.select(columnListForfeaturesStats)
    datasetForStatsChartFileName = PredictiveUtilities.writeToParquet(
        fileName="datasetForStatsChart",
        locationAddress=locationAddress,
        userId=featureId,
        data=datasetForStatsChart)

    featuresStatsDict = {
        "columnsName": columnListForfeaturesStats,
        "datasetFileName": datasetForStatsChartFileName
    }
    featureStatistics = {
        PredictiveConstants.SUMMARYDICT: summaryDict,
        PredictiveConstants.FEATURESSTATSDICT: featuresStatsDict
    }
    return featureStatistics
def loadModel(self):
    if self.algoName == "linear_reg" or self.algoName == "ridge_reg" or self.algoName == "lasso_reg":
        regressionPrediction = LinearRegressionModel.load(self.modelStorageLocation)
    if self.algoName == "RandomForestAlgo":
        regressionPrediction = RandomForestRegressionModel.load(self.modelStorageLocation)
    if self.algoName == "GradientBoostAlgo":
        regressionPrediction = GBTRegressionModel.load(self.modelStorageLocation)

    # drop any prediction column already created by an earlier run of the same model
    self.dataset = self.dataset.drop(self.modelSheetName)
    predictionData = regressionPrediction.transform(self.dataset)
    predictionData = predictionData.drop(self.featuresColm)

    # drop the extra columns added during transformation
    if self.indexedFeatures:
        self.indexedFeatures.extend(self.oneHotEncodedFeaturesList)
        predictionData = predictionData.drop(*self.indexedFeatures)

    # overwrite the original dataset
    '''This extra write step is needed because Spark does not read or write the whole dataset
    at once: it only brings limited data into memory and evaluates lazily, so overwriting the
    same dataset that is still being read is not possible. Write to a temporary location first,
    read it back, then write the final dataset.'''
    emptyUserId = ''
    fileNameWithPathTemp = self.locationAddress + emptyUserId + self.datasetName + "_temp.parquet"
    predictionData.write.parquet(fileNameWithPathTemp, mode="overwrite")
    predictionDataReadAgain = self.spark.read.parquet(fileNameWithPathTemp)
    predictionTableData = PredictiveUtilities.writeToParquet(
        fileName=self.datasetName,
        locationAddress=self.locationAddress,
        userId=emptyUserId,
        data=predictionDataReadAgain)
    return predictionTableData
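# --- Standalone sketch (not part of the pipeline above) ---
# A minimal illustration of the workaround described in the docstring above: stage the
# transformed data in a temporary location, read it back, then overwrite the original path.
# The paths, column names, and data here are throwaway placeholders.
import os
import tempfile
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("overwriteSketch").getOrCreate()

base = tempfile.mkdtemp()
finalPath = os.path.join(base, "dataset.parquet")
tempPath = os.path.join(base, "dataset_temp.parquet")

# seed an existing dataset on disk
spark.range(5).toDF("value").write.parquet(finalPath, mode="overwrite")

df = spark.read.parquet(finalPath)
transformed = df.withColumn("doubled", df["value"] * 2)

# writing `transformed` straight back to finalPath is unsafe because Spark is still lazily
# reading from that location, so stage to a temp path, re-read, then overwrite
transformed.write.parquet(tempPath, mode="overwrite")
spark.read.parquet(tempPath).write.parquet(finalPath, mode="overwrite")
spark.read.parquet(finalPath).show()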
def featureSelection(self, predictiveData):
    algoName = predictiveData.get(PredictiveConstants.ALGORITHMNAME)
    etlStats = PredictiveUtilities.performETL(predictiveData)
    featureStatistics = self.featureStats(etlStats, predictiveData)
    categoryColmStats = etlStats.get(PredictiveConstants.CATEGORYCOLMSTATS)
    idNameFeaturesOrdered = etlStats.get(PredictiveConstants.IDNAMEFEATURESORDERED)

    featureAnalysis = self.featureAnalysis(etlStats, algoName)
    randomForestModelFit = featureAnalysis.get(PredictiveConstants.RANDOMFORESTMODEL)
    featureImportanceData = self.getFeatureImportance(randomForestModelFit, idNameFeaturesOrdered)
    featureImportanceDict = featureImportanceData.get(PredictiveConstants.FEATURESIMPORTANCEDICT)
    featureImportance = featureImportanceData.get(PredictiveConstants.FEATURE_IMPORTANCE)

    summaryDict = featureStatistics.get(PredictiveConstants.SUMMARYDICT)
    featuresStatsDict = featureStatistics.get(PredictiveConstants.FEATURESSTATSDICT)
    keyStatsTest = featureAnalysis.get(PredictiveConstants.KEYSTATSTEST)
    statisticalTestResult = featureAnalysis.get(PredictiveConstants.STATISTICALTESTRESULT)

    responseData = {
        PredictiveConstants.FEATURE_IMPORTANCE: featureImportance,
        keyStatsTest: statisticalTestResult,
        PredictiveConstants.SUMMARYDICT: summaryDict,
        PredictiveConstants.CATEGORICALSUMMARY: categoryColmStats,
        PredictiveConstants.FEATURESIMPORTANCEDICT: featureImportanceDict,
        PredictiveConstants.FEATURESSTATSDICT: featuresStatsDict
    }
    return responseData
def etlOperation(self, etlInfo):
    etlStats = PredictiveUtilities.performETL(etlInfo)
    return etlStats
def regressionEvaluation(self, regressor, regressionInfo, etlStats):
    modelName = regressionInfo.get(PredictiveConstants.MODELSHEETNAME)

    # data from the ETL stats
    labelColm = etlStats.get(PredictiveConstants.LABELCOLM)
    trainData = etlStats.get(PredictiveConstants.TRAINDATA)
    idNameFeaturesOrdered = etlStats.get(PredictiveConstants.IDNAMEFEATURESORDERED)

    trainPredictedData = regressor.transform(trainData)

    metricsList = ['r2', 'rmse', 'mse', 'mae']
    trainDataMetrics = {}
    metricName = ''
    for metric in metricsList:
        if metric == "r2":
            metricName = PredictiveConstants.RSQUARE
        elif metric == "rmse":
            metricName = PredictiveConstants.RMSE
        elif metric == "mse":
            metricName = PredictiveConstants.MSE
        elif metric == "mae":
            metricName = PredictiveConstants.MAE
        evaluator = RegressionEvaluator(labelCol=labelColm,
                                        predictionCol=modelName,
                                        metricName=metric)
        metricValue = evaluator.evaluate(trainPredictedData)
        trainDataMetrics[metricName] = metricValue

    # summary stats
    noTrees = regressor.getNumTrees
    treeWeights = regressor.treeWeights
    treeNodes = list(regressor.trees)
    totalNoNodes = regressor.totalNumNodes
    debugString = regressor.toDebugString
    debugString = str(debugString).splitlines()

    featuresImportance = list(regressor.featureImportances)
    featuresImportance = [round(x, 4) for x in featuresImportance]
    # enumerate keeps duplicate importance values on distinct keys (list.index() would not)
    featuresImportanceDict = {}
    for index, importance in enumerate(featuresImportance):
        featuresImportanceDict[index] = importance
    featuresImportanceDictWithName = pUtil.summaryTable(
        featuresName=idNameFeaturesOrdered,
        featuresStat=featuresImportanceDict)

    trainDataMetrics["No Trees"] = noTrees
    trainDataMetrics["Total Nodes"] = totalNoNodes
    summaryStats = {
        'noTrees': noTrees,
        'treeWeights': treeWeights,
        'totalNodes': totalNoNodes,
        'featuresImportance': featuresImportanceDictWithName,
        'metrics': trainDataMetrics,
        'debugString': debugString,
    }

    graphDataInfo = self.createGraphData(regressor, regressionInfo, etlStats)

    response = {
        PredictiveConstants.STATDATA: summaryStats,
        PredictiveConstants.GRAPHDATA: graphDataInfo
    }
    return response
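# --- Standalone sketch (not part of the pipeline above) ---
# A minimal standalone version of the RegressionEvaluator loop used above, run on a toy
# DataFrame that already holds the label and prediction columns. The column names "label"
# and "prediction" are placeholders for labelColm / modelName.
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator

spark = SparkSession.builder.master("local[*]").appName("regressionMetricsSketch").getOrCreate()

scored = spark.createDataFrame(
    [(3.0, 2.5), (5.0, 5.4), (7.0, 6.9)], ["label", "prediction"])

metrics = {}
for metric in ["r2", "rmse", "mse", "mae"]:
    evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName=metric)
    metrics[metric] = evaluator.evaluate(scored)
print(metrics)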
def regressionModelEvaluation(self, regressor, spark):
    import builtins
    round = getattr(builtins, 'round')

    try:
        coefficientStdErrorList = regressor.summary.coefficientStandardErrors
        coefficientStdErrorDict = {}
        statsDictName = "coefficientStdErrorDictWithName"
        coefficientStdErrorDictWithName = self.statsDict(coefficientStdErrorList,
                                                         coefficientStdErrorDict)

        pValuesList = regressor.summary.pValues
        pValuesDict = {}
        pValuesDictWithName = self.statsDict(pValuesList, pValuesDict)

        tValuesList = regressor.summary.tValues
        tValuesDict = {}
        tValuesDictWithName = self.statsDict(tValuesList, tValuesDict)

        significanceDict = {}
        for pkey, pVal in pValuesDict.items():
            if 0 <= pVal < 0.001:
                significanceDict[pkey] = '***'
            if 0.001 <= pVal < 0.01:
                significanceDict[pkey] = '**'
            if 0.01 <= pVal < 0.05:
                significanceDict[pkey] = '*'
            if 0.05 <= pVal < 0.1:
                significanceDict[pkey] = '.'
            if 0.1 <= pVal < 1:
                significanceDict[pkey] = '-'
        significanceDictWithName = PredictiveUtilities.summaryTable(
            featuresName=self.idNameFeaturesOrdered,
            featuresStat=significanceDict)
    except Exception:
        # not every regressor exposes these summary statistics (e.g. lasso)
        coefficientStdErrorDictWithName = {}
        pValuesDictWithName = {}
        tValuesDictWithName = {}
        significanceDictWithName = {}

    coefficientList = list(map(float, list(regressor.coefficients)))
    coefficientDict = {}
    coefficientDictWithName = self.statsDict(coefficientList, coefficientDict)

    # creating the table-chart data
    summaryTableChartList = []
    if self.algoName != "lasso_reg":
        for (keyOne, valueOne), valueTwo, valueThree, valueFour, valueFive in \
                zip(coefficientStdErrorDictWithName.items(),
                    coefficientDictWithName.values(),
                    pValuesDictWithName.values(),
                    tValuesDictWithName.values(),
                    significanceDictWithName.values()):
            chartList = [keyOne, valueOne, valueTwo, valueThree, valueFour, valueFive]
            summaryTableChartList.append(chartList)
        schemaSummaryTable = StructType([
            StructField("Column_Name", StringType(), True),
            StructField("std_Error", DoubleType(), True),
            StructField("coefficient", DoubleType(), True),
            StructField("P_value", DoubleType(), True),
            StructField("T_value", DoubleType(), True),
            StructField("significance", StringType(), True)
        ])
    if coefficientStdErrorDictWithName == {} or self.algoName == "lasso_reg":
        for keyOne, valueOne in coefficientDictWithName.items():
            chartList = [keyOne, valueOne]
            summaryTableChartList.append(chartList)
        schemaSummaryTable = StructType([
            StructField("Column_Name", StringType(), True),
            StructField("coefficient", DoubleType(), True)
        ])
    summaryTableChartData = spark.createDataFrame(summaryTableChartList,
                                                  schema=schemaSummaryTable)
    summaryTableChartDataFileName = PredictiveUtilities.writeToParquet(
        fileName="summaryTableChart",
        locationAddress=self.locationAddress,
        userId=self.userId,
        data=summaryTableChartData)

    # creating the equation for the regression model
    intercept = round(regressor.intercept, 4)
    equation = self.labelColm, "=", intercept, "+"
    for feature, coeff in zip(self.idNameFeaturesOrdered.values(), coefficientDict.values()):
        coeffFeature = coeff, "*", feature, "+"
        equation += coeffFeature
    equation = list(equation[:-1])

    # training summary
    trainingSummary = regressor.summary
    RMSE = round(trainingSummary.rootMeanSquaredError, 4)
    MAE = round(trainingSummary.meanAbsoluteError, 4)
    MSE = round(trainingSummary.meanSquaredError, 4)
    rSquare = round(trainingSummary.r2, 4)
    adjustedRSquare = round(trainingSummary.r2adj, 4)
    degreeOfFreedom = trainingSummary.degreesOfFreedom
    explainedVariance = round(trainingSummary.explainedVariance, 4)
    totalNumberOfFeatures = regressor.numFeatures
    residualsTraining = trainingSummary.residuals  # Spark DataFrame

    # test and training predicted vs actual graph data
    trainingPredictionAllColm = trainingSummary.predictions
    trainingPredictionActual = trainingPredictionAllColm.select(self.labelColm, self.modelSheetName)
    trainingPredictionActualGraphFileName = PredictiveUtilities.writeToParquet(
        fileName="trainingPredictedVsActual",
        locationAddress=self.locationAddress,
        userId=self.userId,
        data=trainingPredictionActual)
    testPredictionAllColm = regressor.transform(self.testData)
    testPredictionActual = testPredictionAllColm.select(self.labelColm, self.modelSheetName)
    testPredictionActualGraphFileName = PredictiveUtilities.writeToParquet(
        fileName="testPredictedVsActual",
        locationAddress=self.locationAddress,
        userId=self.userId,
        data=testPredictionActual)

    # appending the train and test datasets together (for future use only)
    trainTestMerged = trainingPredictionAllColm.union(testPredictionAllColm)
    trainTestMergedFileName = PredictiveUtilities.writeToParquet(
        fileName="trainTestMerged",
        locationAddress=self.locationAddress,
        userId=self.userId,
        data=trainTestMerged)

    # residuals vs fitted graph
    residualsPredictiveDataTraining = PredictiveUtilities.residualsFittedGraph(
        residualsData=residualsTraining,
        predictionData=trainingPredictionActual,
        modelSheetName=self.modelSheetName)
    residualsVsFittedGraphFileName = PredictiveUtilities.writeToParquet(
        fileName="residualsVsFitted",
        locationAddress=self.locationAddress,
        userId=self.userId,
        data=residualsPredictiveDataTraining)

    # scale-location plot
    sqrtStdResiduals = PredictiveUtilities.scaleLocationGraph(
        label=self.labelColm,
        predictionTargetData=trainingPredictionActual,
        residualsData=residualsTraining,
        modelSheetName=self.modelSheetName)
    scaleLocationGraphFileName = PredictiveUtilities.writeToParquet(
        fileName="scaleLocation",
        locationAddress=self.locationAddress,
        userId=self.userId,
        data=sqrtStdResiduals)

    # quantile-quantile plot
    quantileQuantileData = PredictiveUtilities.quantileQuantileGraph(
        residualsData=residualsTraining,
        spark=spark)
    quantileQuantileGraphFileName = PredictiveUtilities.writeToParquet(
        fileName="quantileQuantile",
        locationAddress=self.locationAddress,
        userId=self.userId,
        data=quantileQuantileData)

    # dictionaries for the graph data and summary stats
    graphNameDict = {
        PredictiveConstants.RESIDUALSVSFITTEDGRAPHFILENAME: residualsVsFittedGraphFileName,
        PredictiveConstants.SCALELOCATIONGRAPHFILENAME: scaleLocationGraphFileName,
        PredictiveConstants.QUANTILEQUANTILEGRAPHFILENAME: quantileQuantileGraphFileName,
        PredictiveConstants.TRAININGPREDICTIONACTUALFILENAME: trainingPredictionActualGraphFileName,
        PredictiveConstants.TESTPREDICTIONACTUALFILENAME: testPredictionActualGraphFileName
    }
    summaryStats = {
        PredictiveConstants.RMSE: RMSE,
        PredictiveConstants.MSE: MSE,
        PredictiveConstants.MAE: MAE,
        PredictiveConstants.RSQUARE: rSquare,
        PredictiveConstants.ADJRSQUARE: adjustedRSquare,
        PredictiveConstants.INTERCEPT: intercept,
        PredictiveConstants.DOF: degreeOfFreedom,
        PredictiveConstants.EXPLAINEDVARIANCE: explainedVariance,
        PredictiveConstants.TOTALFEATURES: totalNumberOfFeatures
    }
    summaryTable = {"summaryTableChartDataFileName": summaryTableChartDataFileName}
    response = {
        PredictiveConstants.GRAPHDATA: graphNameDict,
        PredictiveConstants.STATDATA: summaryStats,
        PredictiveConstants.TABLEDATA: summaryTable,
        PredictiveConstants.EQUATION: equation
    }
    return response
def randomGradientRegressionModelEvaluation(self, regressor):
    trainPredictedData = regressor.transform(self.trainData)
    testPredictedData = regressor.transform(self.testData)

    from pyspark.ml.evaluation import RegressionEvaluator
    metricsList = ['r2', 'rmse', 'mse', 'mae']
    trainDataMetrics = {}
    metricName = ''
    for metric in metricsList:
        if metric == "r2":
            metricName = PredictiveConstants.RSQUARE
        elif metric == "rmse":
            metricName = PredictiveConstants.RMSE
        elif metric == "mse":
            metricName = PredictiveConstants.MSE
        elif metric == "mae":
            metricName = PredictiveConstants.MAE
        evaluator = RegressionEvaluator(labelCol=self.labelColm,
                                        predictionCol=self.modelSheetName,
                                        metricName=metric)
        metricValue = evaluator.evaluate(trainPredictedData)
        trainDataMetrics[metricName] = metricValue

    # training actual vs predicted dataset
    trainingPredictionActual = trainPredictedData.select(self.labelColm, self.modelSheetName)
    trainingPredictionActualGraphFileName = PredictiveUtilities.writeToParquet(
        fileName="trainingPredictedVsActualEnsemble",
        locationAddress=self.locationAddress,
        userId=self.userId,
        data=trainingPredictionActual)

    # test actual vs predicted dataset
    testPredictionActual = testPredictedData.select(self.labelColm, self.modelSheetName)
    testPredictionActualGraphFileName = PredictiveUtilities.writeToParquet(
        fileName="testPredictedVsActualEnsemble",
        locationAddress=self.locationAddress,
        userId=self.userId,
        data=testPredictionActual)

    # summary stats
    noTrees = regressor.getNumTrees
    treeWeights = regressor.treeWeights
    treeNodes = list(regressor.trees)
    totalNoNodes = regressor.totalNumNodes
    debugString = regressor.toDebugString
    debugString = str(debugString).splitlines()

    featuresImportance = list(regressor.featureImportances)
    featuresImportance = [round(x, 4) for x in featuresImportance]
    print(featuresImportance)
    # enumerate keeps duplicate importance values on distinct keys (list.index() would not)
    featuresImportanceDict = {}
    for index, importance in enumerate(featuresImportance):
        featuresImportanceDict[index] = importance
    featuresImportanceDictWithName = PredictiveUtilities.summaryTable(
        featuresName=self.idNameFeaturesOrdered,
        featuresStat=featuresImportanceDict)

    trainDataMetrics["No Trees"] = noTrees
    trainDataMetrics["Total Nodes"] = totalNoNodes
    summaryStats = {
        'noTrees': noTrees,
        'treeWeights': treeWeights,
        'totalNodes': totalNoNodes,
        'featuresImportance': featuresImportanceDictWithName,
        'metrics': trainDataMetrics,
        'debugString': debugString,
    }

    # residuals vs fitted graph data
    residualDataColm = trainingPredictionActual.withColumn(
        'residuals', col(self.labelColm) - col(self.modelSheetName))
    residualDataColm = residualDataColm.select('residuals')
    residualsPredictiveDataTraining = PredictiveUtilities.residualsFittedGraph(
        residualsData=residualDataColm,
        predictionData=trainingPredictionActual,
        modelSheetName=self.modelSheetName)
    residualsVsFittedGraphFileName = PredictiveUtilities.writeToParquet(
        fileName="residualsVsFittedEnsemble",
        locationAddress=self.locationAddress,
        userId=self.userId,
        data=residualsPredictiveDataTraining)

    graphNameDict = {
        PredictiveConstants.RESIDUALSVSFITTEDGRAPHFILENAME: residualsVsFittedGraphFileName,
        PredictiveConstants.TRAININGPREDICTIONACTUALFILENAME: trainingPredictionActualGraphFileName,
        PredictiveConstants.TESTPREDICTIONACTUALFILENAME: testPredictionActualGraphFileName
    }
    response = {
        PredictiveConstants.STATDATA: summaryStats,
        PredictiveConstants.GRAPHDATA: graphNameDict
    }
    return response
def statsDict(self, statList, statDict):
    for index, value in enumerate(statList):
        statDict[index] = round(value, 4)
    return PredictiveUtilities.summaryTable(featuresName=self.idNameFeaturesOrdered,
                                            featuresStat=statDict)
def csvToParquet(self):
    # reviewDatasetPath, colsName and spark are presumably defined at module level
    dataset = spark.read.csv(reviewDatasetPath, header=True)
    dataset = dataset.select(colsName)  # as per the requirement
    dataset = dataset.withColumnRenamed("Document Class", "Sentiment")
    dataset = dataset.withColumnRenamed("Prediction (Document Class)", "prediction_knime")
    PredictiveUtilities.writeToParquet("knimeTestDataset", "/home/fidel/Documents/", "", dataset)
def featuresSelection(self, dataset_add, feature_colm, label_colm, relation_list,
                      relation, userId, algoName, locationAddress):
    dataset = self.spark.read.parquet(dataset_add)

    # changing the relationship of the columns (log, square root, exponential)
    dataTransformationObj = PredictiveDataTransformation(dataset=dataset)
    dataset = dataTransformationObj.colmTransformation(colmTransformationList=relation_list) \
        if relation == PredictiveConstants.NON_LINEAR else dataset

    # transformation
    dataTransformationObj = PredictiveDataTransformation(dataset=dataset)
    dataTransformationResult = dataTransformationObj.dataTranform(labelColm=label_colm,
                                                                  featuresColm=feature_colm,
                                                                  userId=userId)
    dataset = dataTransformationResult.get(PredictiveConstants.DATASET)
    categoricalFeatures = dataTransformationResult.get(PredictiveConstants.CATEGORICALFEATURES)
    numericalFeatures = dataTransformationResult.get(PredictiveConstants.NUMERICALFEATURES)
    maxCategories = dataTransformationResult.get(PredictiveConstants.MAXCATEGORIES)
    categoryColmStats = dataTransformationResult.get(PredictiveConstants.CATEGORYCOLMSTATS)
    indexedFeatures = dataTransformationResult.get(PredictiveConstants.INDEXEDFEATURES)
    label = dataTransformationResult.get(PredictiveConstants.LABEL)
    idNameFeaturesOrdered = dataTransformationResult.get(PredictiveConstants.IDNAMEFEATURESORDERED)
    oneHotEncodedFeaturesList = dataTransformationResult.get(PredictiveConstants.ONEHOTENCODEDFEATURESLIST)
    indexedLabelNameDict = dataTransformationResult.get(PredictiveConstants.INDEXEDLABELNAMEDICT)
    featuresColm = dataTransformationResult.get(PredictiveConstants.VECTORFEATURES)

    # statistics
    columnListForfeaturesStats = numericalFeatures.copy()
    columnListForfeaturesStats.insert(0, label)
    dataTransformationObj = PredictiveDataTransformation(dataset=dataset)
    dataStatsResult = dataTransformationObj.dataStatistics(
        categoricalFeatures=categoricalFeatures,
        numericalFeatures=columnListForfeaturesStats,
        categoricalColmStat=categoryColmStats)
    summaryDict = dataStatsResult

    # creating the dataset for the stats-chart visualization in the feature-selection chart
    datasetForStatsChart = dataset.select(columnListForfeaturesStats)
    datasetForStatsChartFileName = PredictiveUtilities.writeToParquet(
        fileName="datasetForStatsChart",
        locationAddress=locationAddress,
        userId=userId,
        data=datasetForStatsChart)
    featuresStatsDict = {"columnsName": columnListForfeaturesStats,
                         "datasetFileName": datasetForStatsChartFileName}

    # applying the algorithm and the statistical test (pearson / chi-square)
    trainData, testData = dataset.randomSplit([0.80, 0.20], seed=40)
    keyStatsTest = ''
    statisticalTestResult = {}
    if algoName == PredictiveConstants.RANDOMREGRESSOR:
        statisticalTestObj = PredictiveStatisticalTest(dataset=dataset,
                                                       features=numericalFeatures,
                                                       labelColm=label)
        statisticalTestResult = statisticalTestObj.pearsonTest()
        randomForestModel = RandomForestRegressor(labelCol=label,
                                                  featuresCol=featuresColm,
                                                  numTrees=10)
        keyStatsTest = "pearson_test_data"
    if algoName == PredictiveConstants.RANDOMCLASSIFIER:
        statisticalTestObj = PredictiveStatisticalTest(dataset=dataset,
                                                       features=indexedFeatures,
                                                       labelColm=label)
        statisticalTestResult = statisticalTestObj.chiSquareTest(
            categoricalFeatures=categoricalFeatures,
            maxCategories=maxCategories)
        randomForestModel = RandomForestClassifier(labelCol=label,
                                                   featuresCol=featuresColm,
                                                   numTrees=10)
        keyStatsTest = "ChiSquareTestData"
    randomForestModelFit = randomForestModel.fit(trainData)
    # predictions = randomForestModelFit.transform(testData)
    print(randomForestModelFit.featureImportances)

    import builtins
    round = getattr(builtins, 'round')
    featuresImportance = list(randomForestModelFit.featureImportances)
    featuresImportance = [round(x, 4) for x in featuresImportance]
    # enumerate keeps duplicate importance values on distinct keys (list.index() would not)
    featuresImportanceDict = {}
    for index, importance in enumerate(featuresImportance):
        featuresImportanceDict[index] = round(importance, 4)
    featuresImportanceDictWithName = PredictiveUtilities.summaryTable(
        featuresName=idNameFeaturesOrdered,
        featuresStat=featuresImportanceDict)

    featuresColmList = idNameFeaturesOrdered
    feat = []
    for val in featuresColmList.values():
        feat.append(val)
    feature_imp = {PredictiveConstants.FEATURE_IMPORTANCE: featuresImportance,
                   "feature_column": feat}
    response_dict = {
        PredictiveConstants.FEATURE_IMPORTANCE: feature_imp,
        keyStatsTest: statisticalTestResult,
        'summaryDict': summaryDict,
        'categoricalSummary': categoryColmStats,
        "featuresImportanceDict": featuresImportanceDictWithName,
        "featuresStatsDict": featuresStatsDict
    }
    return response_dict