def randomClassifier(dataset_add, feature_colm, label_colm, relation_list, relation):
    try:
        # read the ';'-separated CSV source
        dataset = spark.read.csv(dataset_add, header=True, inferSchema=True, sep=';')
        dataset.show()

        # the label column is passed in as a (single-element) list
        label = ''
        for y in label_colm:
            label = y
        print(label)

        # split the requested feature columns into string and numerical ones
        schemaDataset = dataset.schema
        stringFeatures = []
        numericalFeatures = []
        for x in schemaDataset:
            if str(x.dataType) == "StringType":
                for y in feature_colm:
                    if x.name == y:
                        stringFeatures.append(x.name)
            else:
                for y in feature_colm:
                    if x.name == y:
                        numericalFeatures.append(x.name)
        print(stringFeatures)
        print(numericalFeatures)

        # per-column summary statistics (mean, stddev, min, max, variance)
        summaryList = ['mean', 'stddev', 'min', 'max']
        summaryDict = {}
        for colm in numericalFeatures:
            summaryListTemp = []
            for value in summaryList:
                summ = list(dataset.select(colm).summary(value).toPandas()[colm])
                summaryListTemp.append(summ)
            varianceListTemp = list(dataset.select(variance(col(colm)).alias(colm)).toPandas()[colm])
            summaryListTemp.append(varianceListTemp)
            summaryDict[colm] = summaryListTemp
        summaryList.append('variance')
        summaryDict['summaryName'] = summaryList
        summaryDict['categoricalColumn'] = stringFeatures
        print(summaryDict)

        # optional non-linear transformation of the features
        if relation == 'linear':
            dataset = dataset
        if relation == 'non_linear':
            dataset = Relationship(dataset, relation_list)

        # Pearson correlation test between the numerical features and the label
        response_pearson_test = Correlation_test_imp(
            dataset=dataset, features=numericalFeatures, label_col=label)

        # index the categorical feature columns
        indexed_features = []
        for colm in stringFeatures:
            indexer = StringIndexer(inputCol=colm, outputCol='indexed_' + colm).fit(dataset)
            indexed_features.append('indexed_' + colm)
            dataset = indexer.transform(dataset)
        print(indexed_features)

        # combining both kinds of feature columns together
        final_features = numericalFeatures + indexed_features
        print(final_features)

        # assemble the feature vector
        featureassembler = VectorAssembler(inputCols=final_features, outputCol="features")
        dataset = featureassembler.transform(dataset)
        dataset.show()

        # vector indexer marks low-cardinality columns as categorical
        vec_indexer = VectorIndexer(inputCol='features', outputCol='vec_indexed_features',
                                    maxCategories=4).fit(dataset)
        categorical_features = vec_indexer.categoryMaps
        print("Chose %d categorical features: %s" %
              (len(categorical_features), ", ".join(str(k) for k in categorical_features.keys())))
        vec_indexed = vec_indexer.transform(dataset)
        vec_indexed.show()

        # preparing the finalized data
        finalized_data = vec_indexed.select(label, 'vec_indexed_features')
        finalized_data.show()

        # splitting the dataset into training and testing
        train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=40)

        # random forest regressor, used here for feature-importance estimation
        rf = RandomForestRegressor(labelCol=label, featuresCol='vec_indexed_features', numTrees=10)
        model = rf.fit(train_data)
        predictions = model.transform(test_data)

        print(model.featureImportances)
        feature_importance = model.featureImportances.toArray().tolist()
        print(feature_importance)
        features_column_for_user = numericalFeatures + stringFeatures
        feature_imp = {'feature_importance': feature_importance,
                       'feature_column': features_column_for_user}
        response_dict = {
            'feature_importance': feature_imp,
            'pearson_test_data': response_pearson_test,
            'summaryDict': summaryDict
        }
        print(response_dict)
        return response_dict
    except Exception as e:
        print("exception is = " + str(e))
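# Illustrative usage sketch for randomClassifier() above (added for clarity, not part
# of the original call path). The CSV path and column names are placeholders, and an
# active SparkSession `spark` plus the Relationship / Correlation_test_imp helpers
# are assumed to exist at module level.
def _randomForestFeatureImportanceExample():
    demo_response = randomClassifier(
        dataset_add='/tmp/bank.csv',              # ';'-separated CSV, as read above
        feature_colm=['age', 'balance', 'job'],   # mix of numerical and string columns
        label_colm=['deposit'],                   # single-element list; the last entry is used
        relation_list=[],
        relation='linear')
    print(demo_response)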
def lasso(self, dataset_add, feature_colm, label_colm, relation_list, relation):
    Rsqr_list = []
    Rsqr_regPara = {}
    print(self.xt)
    try:
        dataset = spark.read.parquet(dataset_add)
        dataset.show()

        # the label column is passed in as a (single-element) list
        label = ''
        for y in label_colm:
            label = y
        print(label)

        # optional non-linear transformation of the features
        if relation == 'linear':
            print('linear relationship')
        if relation == 'non_linear':
            dataset = Relationship(dataset, relation_list)
        dataset.show()

        # assemble the feature vector
        featureassembler = VectorAssembler(inputCols=feature_colm, outputCol="Independent_features")
        output = featureassembler.transform(dataset)
        output.show()
        output.select("Independent_features").show()
        finalized_data = output.select("Independent_features", label)
        finalized_data.show()

        # splitting the dataset into training and testing
        train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=40)

        # lasso: grid-search the regularisation parameter over self.xt, keeping track
        # of the training R-square obtained for each candidate value
        for t in self.xt:
            lr1 = LinearRegression(featuresCol="Independent_features", labelCol=label,
                                   elasticNetParam=1, regParam=t)
            regressor1 = lr1.fit(train_data)
            print(t)
            print("coefficient : " + str(regressor1.coefficients))
            reg_sum = regressor1.summary
            r2 = reg_sum.r2
            Rsqr_list.append(r2)
            Rsqr_regPara[r2] = t
            print(r2)
        print(Rsqr_list)
        print(max(Rsqr_list))
        maximum_rsqr = max(Rsqr_list)
        print(Rsqr_regPara)

        # pick the regParam value(s) that gave the best R-square
        final_regPara = []
        for key, val in Rsqr_regPara.items():
            if key == maximum_rsqr:
                print(val)
                final_regPara.append(val)

        # refit the lasso model with the selected regParam
        for reg in final_regPara:
            lr_lasso = LinearRegression(featuresCol="Independent_features", labelCol=label,
                                        elasticNetParam=1, regParam=reg)
            regressor = lr_lasso.fit(train_data)
            training_summary = regressor.summary
            r2 = training_summary.r2
            print(r2)

        # coefficients & intercept
        print("coefficient : " + str(regressor.coefficients))
        coefficient_t = str(regressor.coefficients)
        print("intercept : " + str(regressor.intercept))
        intercept_t = str(regressor.intercept)

        # evaluation on the test data
        prediction = regressor.evaluate(test_data)
        prediction_val = prediction.predictions
        prediction_val.show()
        prediction_val_pand = prediction_val.select(label, "prediction").toPandas()
        prediction_val_pand = prediction_val_pand.assign(
            residual_vall=prediction_val_pand[label] - prediction_val_pand["prediction"])
        prediction_val_pand_residual = prediction_val_pand["residual_vall"]
        prediction_val_pand_label = prediction_val_pand[label]
        prediction_val_pand_predict = prediction_val_pand["prediction"]

        lr_prediction = regressor.transform(test_data)
        lr_prediction.groupBy(label, "prediction").count().show()
        lr_prediction_quantile = lr_prediction.select(label, "prediction")
        lr_prediction_onlypred = lr_prediction.select('prediction')

        # training summary metrics
        print("numof_Iterations...%d\n" % training_summary.totalIterations)
        print("ObjectiveHistory...%s\n" % str(training_summary.objectiveHistory))
        print("RMSE...%f\n" % training_summary.rootMeanSquaredError)
        RMSE = training_summary.rootMeanSquaredError
        print("MSE....%f\n" % training_summary.meanSquaredError)
        MSE = training_summary.meanSquaredError
        print("r**2(r-square)....::%f\n" % training_summary.r2)
        r_square = training_summary.r2
        print("r**2(r-square adjusted)....%f\n" % training_summary.r2adj)
        adjsted_r_square = training_summary.r2adj
        print("deviance residuals %s" % str(training_summary.devianceResiduals))
        training_summary.residuals.show()

        table_response = {
            "Intercept": intercept_t,
            "Coefficients": coefficient_t,
            "RMSE": RMSE,
            "MSE": MSE,
            "R_square": r_square,
            "Adj_R_square": adjsted_r_square
        }

        # residual vs predicted value
        prediction_data = regressor.summary.predictions
        prediction_data.show()
        prediction_data.select(['prediction']).show()
        predicted = prediction_data.select(['prediction'])
        regressor.summary.residuals.show()
        residuals = regressor.summary.residuals
        pred_d = predicted.withColumn('row_index', f.monotonically_increasing_id())
        res_d = residuals.withColumn('row_index', f.monotonically_increasing_id())
        pred_residuals = pred_d.join(res_d, on=['row_index']).sort('row_index').drop('row_index')
        pred_residuals.show()
        pred_residuals.write.parquet(
            'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/Q_Q_PLOT.parquet',
            mode='overwrite')

        # appending the predicted value to the dataset
        target = dataset.select(label)
        pred = prediction_data.select(['prediction'])
        pred_d = pred.withColumn('row_index', f.monotonically_increasing_id())
        target_d = target.withColumn('row_index', f.monotonically_increasing_id())
        pred_target = pred_d.join(target_d, on=['row_index']).drop('row_index')
        pred_target.show()
        dataset.show()
        pred_target_data_update = dataset.join(pred_target, on=[label])
        pred_target_data_update.show(100)

        # DATA VISUALIZATION PART
        # quantiles of label and prediction for the Q-Q plot
        import matplotlib.pyplot as plt
        y = 0.1
        x = []
        for i in range(0, 90):
            x.append(y)
            y = round(y + 0.01, 2)
        quantile_label = lr_prediction_quantile.approxQuantile(label, x, 0.01)
        quantile_prediction = lr_prediction_quantile.approxQuantile("prediction", x, 0.01)

        # residual vs fitted data, collected as a string and written to a csv file
        fitted_residual = ''
        print(len(prediction_val_pand_residual))
        length = len(prediction_val_pand_residual)
        for i in range(0, len(prediction_val_pand_residual)):
            fitted_residual += str(prediction_val_pand_predict[i]) + '\t' + \
                str(prediction_val_pand_residual[i]) + '\n'
        with open('residual_vs_fitted.csv', 'w') as r_f:
            writer_r_f = csv.writer(r_f)
            writer_r_f.writerows((prediction_val_pand_predict, prediction_val_pand_residual))

        # residual vs leverage graph data: extreme value in the prediction column
        prediction_col_extremeval = lr_prediction_quantile.agg({"prediction": "max"})

        # scale-location graph data
        prediction_val_pand_residual_abs = prediction_val_pand_residual.abs()
        import math
        sqrt_residual = [math.sqrt(x) for x in prediction_val_pand_residual_abs]

        # standardised residuals (residual divided by its standard deviation)
        import statistics
        print(statistics.stdev(prediction_val_pand_residual))
        stdev_pred = statistics.stdev(prediction_val_pand_residual)
        std_res = [x / stdev_pred for x in prediction_val_pand_residual]
        print(std_res)
        sqr_std_res = [math.sqrt(abs(x)) for x in std_res]
        print(sqr_std_res)

        # QUANTILE: theoretical vs observed z-scores of the sorted standardised residuals
        sorted_std_res = sorted(std_res)
        print(sorted_std_res)
        mean = statistics.mean(sorted_std_res)
        stdev = statistics.stdev(sorted_std_res)
        print(mean)
        n = len(sorted_std_res)
        print(n)
        quantile = [(x - 0.5) / n for x in range(0, n)]
        print(quantile)
        # theoretical z-scores
        from scipy.stats import norm
        z_theory = [norm.ppf(abs(x)) for x in quantile]
        print(z_theory)
        # z-scores of the observed values
        z_pract = [(x - mean) / stdev for x in sorted_std_res]

        y = 0.1
        x = []
        for i in range(0, 90):
            x.append(y)
            y = round(y + 0.01, 2)
        quantile_std_res = spark.createDataFrame(std_res, FloatType())
        quantile_std_res.show()
        quantile_std_res_t = quantile_std_res.approxQuantile('value', x, 0.01)
        print(quantile_std_res_t)
        print(x)

        Q_label_pred = ''
        print(len(quantile_label))
        length = len(quantile_label)
        for quant, val in zip(z_theory, z_pract):
            Q_label_pred += str(val) + '\t' + str(quant) + '\n'
        plt.scatter(z_theory, z_pract)
        plt.savefig('q_q')

        # scale-location data: residuals standardised by sqrt(|label|)
        sqrt_label = [math.sqrt(abs(x)) for x in prediction_val_pand_label]
        std_residual = [resid / sqr for sqr, resid in zip(sqrt_label, prediction_val_pand_residual)]
        sqrt_std_residuals = [math.sqrt(abs(x)) for x in std_residual]
        print(sqrt_std_residuals)
        scale_predict_residual = ''
        for pre, res in zip(prediction_val_pand_predict, sqrt_std_residuals):
            scale_predict_residual += str(pre) + '\t' + str(res) + '\n'
        print(scale_predict_residual)

        graph_response = {
            "Q_Q_plot": Q_label_pred,
            "residual_fitted": fitted_residual,
            "scale_location": scale_predict_residual
        }
        json_response = {
            'table_data': table_response,
            'graph_data': graph_response
        }
        return json_response
    except Exception as e:
        print('exception is =' + str(e))
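# Minimal, self-contained sketch of the regParam selection bookkeeping used in lasso()
# above: R-square values act as dictionary keys mapping back to the regularisation
# parameter that produced them, and the parameter belonging to the best R-square is
# recovered. The numbers below are made up for illustration. Note that keying the
# dictionary by R-square means two parameters with an identical R-square would collide.
def _bestRegParamExample():
    Rsqr_regPara = {0.71: 0.0, 0.74: 0.01, 0.69: 0.5}   # r2 -> regParam (illustrative values)
    Rsqr_list = list(Rsqr_regPara.keys())
    maximum_rsqr = max(Rsqr_list)
    final_regPara = [val for key, val in Rsqr_regPara.items() if key == maximum_rsqr]
    print(final_regPara)   # -> [0.01]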
def linearReg(self, dataset_add, feature_colm, label_colm, relation_list, relation, userId):
    try:
        dataset = spark.read.csv(dataset_add, header=True, inferSchema=True)
        dataset.show()

        # the label column is passed in as a (single-element) list
        label = ''
        for val in label_colm:
            label = val

        # split the requested feature columns into string-like and numerical ones
        Schema = dataset.schema
        stringFeatures = []
        numericalFeatures = []
        for x in Schema:
            if (str(x.dataType) == "StringType" or str(x.dataType) == 'TimestampType'
                    or str(x.dataType) == 'DateType' or str(x.dataType) == 'BooleanType'
                    or str(x.dataType) == 'BinaryType'):
                for y in feature_colm:
                    if x.name == y:
                        dataset = dataset.withColumn(y, dataset[y].cast(StringType()))
                        stringFeatures.append(x.name)
            else:
                for y in feature_colm:
                    if x.name == y:
                        numericalFeatures.append(x.name)

        # optional non-linear transformation of the features
        if relation == 'linear':
            print('linear relationship')
        if relation == 'non_linear':
            dataset = Relationship(dataset, relation_list)
        dataset.show()

        # index the label if it is a string column
        for x in Schema:
            if str(x.dataType) == "StringType" and x.name == label:
                for labelkey in label_colm:
                    label_indexer = StringIndexer(inputCol=label, outputCol='indexed_' + label,
                                                  handleInvalid="skip").fit(dataset)
                    dataset = label_indexer.transform(dataset)
                    label = 'indexed_' + label
            else:
                label = label

        # index the categorical features, assemble and vector-index the feature vector
        indexed_features = []
        for colm in stringFeatures:
            indexer = StringIndexer(inputCol=colm, outputCol='indexed_' + colm,
                                    handleInvalid="skip").fit(dataset)
            indexed_features.append('indexed_' + colm)
            dataset = indexer.transform(dataset)
        featureAssembler = VectorAssembler(inputCols=indexed_features + numericalFeatures,
                                           outputCol='features', handleInvalid="skip")
        dataset = featureAssembler.transform(dataset)
        vectorIndexer = VectorIndexer(inputCol='features', outputCol='vectorIndexedFeatures',
                                      maxCategories=4, handleInvalid="skip").fit(dataset)
        dataset = vectorIndexer.transform(dataset)

        # train/test split driven by the configured ratio
        trainDataRatioTransformed = self.trainDataRatio
        testDataRatio = 1 - trainDataRatioTransformed
        trainingData, testData = dataset.randomSplit([trainDataRatioTransformed, testDataRatio],
                                                     seed=40)

        # applying the model and persisting it per user
        lr = LinearRegression(featuresCol="vectorIndexedFeatures", labelCol=label)
        regressor = lr.fit(trainingData)
        locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/'
        modelPersist = 'linearRegressorModel.parquet'
        modelStorageLocation = locationAddress + userId + modelPersist
        regressor.write().overwrite().save(modelStorageLocation)

        # coefficients & intercept
        print("coefficient : " + str(regressor.coefficients))
        coefficient_t = str(regressor.coefficients)
        print("intercept : " + str(regressor.intercept))
        intercept_t = str(regressor.intercept)

        # evaluation on the test data
        prediction = regressor.evaluate(testData)
        prediction_val = prediction.predictions
        prediction_val.show()
        prediction_val_pand = prediction_val.select(label, "prediction").toPandas()
        prediction_val_pand = prediction_val_pand.assign(
            residual_vall=prediction_val_pand[label] - prediction_val_pand["prediction"])
        prediction_val_pand_residual = prediction_val_pand["residual_vall"]
        prediction_val_pand_label = prediction_val_pand[label]
        prediction_val_pand_predict = prediction_val_pand["prediction"]

        lr_prediction = regressor.transform(testData)
        lr_prediction.groupBy(label, "prediction").count().show()
        lr_prediction_quantile = lr_prediction.select(label, "prediction")
        lr_prediction_onlypred = lr_prediction.select('prediction')

        # training summary metrics
        training_summary = regressor.summary
        print("numof_Iterations...%d\n" % training_summary.totalIterations)
        print("ObjectiveHistory...%s\n" % str(training_summary.objectiveHistory))
        print("RMSE...%f\n" % training_summary.rootMeanSquaredError)
        RMSE = training_summary.rootMeanSquaredError
        print("MSE....%f\n" % training_summary.meanSquaredError)
        MSE = training_summary.meanSquaredError
        print("r**2(r-square)....::%f\n" % training_summary.r2)
        r_square = training_summary.r2
        print("r**2(r-square adjusted)....%f\n" % training_summary.r2adj)
        adjsted_r_square = training_summary.r2adj
        print("deviance residuals %s" % str(training_summary.devianceResiduals))
        training_summary.residuals.show()
        print("coefficient standard errors: \n" + str(training_summary.coefficientStandardErrors))
        coefficientStdError = str(training_summary.coefficientStandardErrors)
        print(" Tvalues :\n" + str(training_summary.tValues))
        T_values = str(training_summary.tValues)
        tValuesList = training_summary.tValues
        print(" p values :\n" + str(training_summary.pValues))
        P_values = str(training_summary.pValues)

        # regression equation, built as a tuple of tokens
        intercept_t = float(intercept_t)
        coefficientList = list(regressor.coefficients)
        equation = label, '=', intercept_t, '+'
        for feature, coeff in zip(feature_colm, coefficientList):
            coeffFeature = coeff, '*', feature, '+'
            equation += coeffFeature
        equation = equation[:-1]
        print(equation)
        st = list(equation)

        # significance codes for the p-values
        PValuesList = training_summary.pValues
        significanceObject = {}
        for pValue in PValuesList:
            if 0 <= pValue < 0.001:
                significanceObject[pValue] = '***'
            if 0.001 <= pValue < 0.01:
                significanceObject[pValue] = '**'
            if 0.01 <= pValue < 0.05:
                significanceObject[pValue] = '*'
            if 0.05 <= pValue < 0.1:
                significanceObject[pValue] = '.'
            if 0.1 <= pValue < 1:
                significanceObject[pValue] = '-'
        print(significanceObject)

        # residual vs predicted value
        prediction_data = regressor.summary.predictions
        prediction_data.show()
        prediction_data.select(['prediction']).show()
        predicted = prediction_data.select(['prediction'])
        regressor.summary.residuals.show()
        residuals = regressor.summary.residuals
        pred_d = predicted.withColumn('row_index', f.monotonically_increasing_id())
        res_d = residuals.withColumn('row_index', f.monotonically_increasing_id())
        pred_residuals = pred_d.join(res_d, on=['row_index']).sort('row_index').drop('row_index')
        pred_residuals.show()

        # scale-location plot data on the training predictions
        from pyspark.sql.functions import abs as ab, sqrt, mean as meann, stddev as stdDev
        df_label = prediction_data.select(label, 'prediction',
                                          sqrt(ab(prediction_data[label])).alias("sqrt_label"))
        df_label.show()
        df_sqrt_label_index = df_label.withColumn('row_index', f.monotonically_increasing_id())
        df_sqrt_label_index.show()
        res_d.show()
        sqrt_label_residual_join = df_sqrt_label_index.join(res_d, on=['row_index']) \
            .sort('row_index').drop('row_index')
        sqrt_label_residual_join.show()
        std_resid = sqrt_label_residual_join.select(
            'sqrt_label', 'prediction',
            (sqrt_label_residual_join['residuals'] / sqrt_label_residual_join['sqrt_label']).alias('std_res'))
        std_resid.show()
        sqrt_std_res = std_resid.select("std_res", 'prediction',
                                        sqrt(ab(std_resid["std_res"])).alias("sqrt_std_resid"))
        sqrt_std_res.show()
        sqrt_std_res_fitted = sqrt_std_res.select('prediction', 'sqrt_std_resid')

        # QUANTILE: theoretical vs observed z-scores of the sorted training residuals
        from scipy.stats import norm
        import statistics
        import math
        res_d.show()
        sorted_res = res_d.sort('residuals')
        sorted_res.show()
        # length of the sorted residuals
        count = sorted_res.groupBy().count().toPandas()
        countList = count.values.tolist()
        tuple1 = ()
        for k in countList:
            tuple1 = k
        for tu in tuple1:
            lengthResiduals = tu
        print(lengthResiduals)
        quantileList = [(x - 0.5) / lengthResiduals for x in range(0, lengthResiduals)]
        print(quantileList)
        # z-score on the theoretical quantiles
        zTheoriticalTrain = [norm.ppf(abs(x)) for x in quantileList]
        print(zTheoriticalTrain)
        sortedResidualPDF = sorted_res.select('residuals').toPandas()
        sortedResidualPDF = sortedResidualPDF['residuals']
        stdevResidualTrain = statistics.stdev(sortedResidualPDF)
        meanResidualTrain = statistics.mean(sortedResidualPDF)
        zPracticalTrain = [(x - meanResidualTrain) / stdevResidualTrain for x in sortedResidualPDF]

        # appending the predicted value to the dataset
        target = dataset.select(label)
        pred = prediction_data.select(['prediction'])
        pred_d = pred.withColumn('row_index', f.monotonically_increasing_id())
        target_d = target.withColumn('row_index', f.monotonically_increasing_id())
        pred_target = pred_d.join(target_d, on=['row_index']).drop('row_index')
        pred_target.show()
        dataset.show()
        pred_target_data_update = dataset.join(pred_target, on=[label])
        pred_target_data_update.show(100)

        # DATA VISUALIZATION PART
        import matplotlib.pyplot as plt
        import math

        # residual vs fitted data, collected as a string and written to a csv file
        fitted_residual = ''
        print(len(prediction_val_pand_residual))
        length = len(prediction_val_pand_residual)
        for i in range(0, len(prediction_val_pand_residual)):
            fitted_residual += str(prediction_val_pand_predict[i]) + '|' + \
                str(prediction_val_pand_residual[i]) + '\n'
        with open('residual_vs_fitted.csv', 'w') as r_f:
            writer_r_f = csv.writer(r_f)
            writer_r_f.writerows((prediction_val_pand_predict, prediction_val_pand_residual))

        # residual vs leverage graph data: extreme value in the prediction column
        prediction_col_extremeval = lr_prediction_quantile.agg({"prediction": "max"})

        # scale-location graph data
        prediction_val_pand_residual_abs = prediction_val_pand_residual.abs()
        sqrt_residual = [math.sqrt(x) for x in prediction_val_pand_residual_abs]

        # standardised residuals (residual divided by its standard deviation)
        import statistics
        print(statistics.stdev(prediction_val_pand_residual))
        stdev_pred = statistics.stdev(prediction_val_pand_residual)
        std_res = [x / stdev_pred for x in prediction_val_pand_residual]
        print(std_res)
        sqr_std_res = [math.sqrt(abs(x)) for x in std_res]
        print(sqr_std_res)

        # QUANTILE on the test residuals
        sorted_std_res = sorted(std_res)
        print(sorted_std_res)
        mean = statistics.mean(sorted_std_res)
        stdev = statistics.stdev(sorted_std_res)
        print(mean)
        n = len(sorted_std_res)
        print(n)
        quantile = [(x - 0.5) / n for x in range(0, n)]
        print(quantile)
        # theoretical z-scores
        from scipy.stats import norm
        z_theory = [norm.ppf(abs(x)) for x in quantile]
        print(z_theory)
        # z-scores of the observed values
        z_pract = [(x - mean) / stdev for x in sorted_std_res]

        y = 0.1
        x = []
        for i in range(0, 90):
            x.append(y)
            y = round(y + 0.01, 2)
        quantile_std_res = spark.createDataFrame(std_res, FloatType())
        quantile_std_res.show()
        quantile_std_res_t = quantile_std_res.approxQuantile('value', x, 0.01)
        print(quantile_std_res_t)
        print(x)

        Q_label_pred = ''
        for quant, val in zip(z_theory, z_pract):
            Q_label_pred += str(val) + '\t' + str(quant) + '\n'
        plt.scatter(z_theory, z_pract)
        plt.savefig('q_q')

        # scale-location data: residuals standardised by sqrt(|label|)
        sqrt_label = [math.sqrt(abs(x)) for x in prediction_val_pand_label]
        std_residual = [resid / sqr for sqr, resid in zip(sqrt_label, prediction_val_pand_residual)]
        sqrt_std_residuals = [math.sqrt(abs(x)) for x in std_residual]
        print(sqrt_std_residuals)
        scale_predict_residual = ''
        for pre, res in zip(prediction_val_pand_predict, sqrt_std_residuals):
            scale_predict_residual += str(pre) + '\t' + str(res) + '\n'
        print(scale_predict_residual)

        tableContent = {
            'coefficientValuesKey': coefficientList,
            'tValuesKey': tValuesList,
            'pValuesKey': PValuesList,
            'significanceValuesKey': significanceObject,
            'interceptValuesKey': intercept_t,
            "RMSE": RMSE,
            "RSquare": r_square,
            "AdjRSquare": adjsted_r_square,
            "CoefficientStdError": coefficientStdError,
        }
        print(tableContent)
        json_response = {
            "Intercept": intercept_t,
            "Coefficients": coefficient_t,
            "RMSE": RMSE,
            "MSE": MSE,
            "R_square": r_square,
            "Adj_R_square": adjsted_r_square,
            "Coefficient_error": coefficientStdError,
            "T_value": T_values,
            "P_value": P_values,
            'Q_Q_plot': Q_label_pred,
            'residual_fitted': fitted_residual,
            'scale_location': scale_predict_residual
        }
        return json_response
    except Exception as e:
        print('exception is =' + str(e))
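# Minimal, self-contained sketch of the p-value -> significance-code mapping used in
# linearReg() above, applied to a few made-up p-values for illustration.
def _significanceCodeExample():
    samplePValues = [0.0004, 0.03, 0.2]   # illustrative values only
    significanceObject = {}
    for pValue in samplePValues:
        if 0 <= pValue < 0.001:
            significanceObject[pValue] = '***'
        if 0.001 <= pValue < 0.01:
            significanceObject[pValue] = '**'
        if 0.01 <= pValue < 0.05:
            significanceObject[pValue] = '*'
        if 0.05 <= pValue < 0.1:
            significanceObject[pValue] = '.'
        if 0.1 <= pValue < 1:
            significanceObject[pValue] = '-'
    print(significanceObject)   # -> {0.0004: '***', 0.03: '*', 0.2: '-'}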
def randomClassifier(dataset_add, feature_colm, label_colm, relation_list, relation):
    try:
        dataset = spark.read.parquet(dataset_add)

        # the label column is passed in as a (single-element) list
        label = ''
        for y in label_colm:
            label = y

        # split the requested feature columns into string and numerical ones
        Schema = dataset.schema
        stringFeatures = []
        numericalFeatures = []
        for x in Schema:
            if str(x.dataType) == "StringType":
                for y in feature_colm:
                    if x.name == y:
                        stringFeatures.append(x.name)
            else:
                for y in feature_colm:
                    if x.name == y:
                        numericalFeatures.append(x.name)

        # per-column summary statistics, rounded to four decimals
        summaryList = ['mean', 'stddev', 'min', 'max']
        summaryDict = {}
        import pyspark.sql.functions as F
        import builtins
        round = getattr(builtins, 'round')   # recover the builtin shadowed by pyspark's round
        for colm in numericalFeatures:
            summaryListTemp = []
            for value in summaryList:
                summ = list(dataset.select(colm).summary(value).toPandas()[colm])
                summaryListSubTemp = [round(float(val), 4) for val in summ]
                summaryListTemp.append(summaryListSubTemp)
            summaryDict[colm] = summaryListTemp
        summaryDict['summaryName'] = summaryList
        summaryDict['categoricalColumn'] = stringFeatures

        # skewness, kurtosis and variance per numerical column
        skewnessList = []
        kurtosisList = []
        varianceList = []
        skewKurtVarDict = {}
        for colm in numericalFeatures:
            skewness = dataset.select(F.skewness(dataset[colm])).toPandas()
            for i, row in skewness.iterrows():
                for j, column in row.items():
                    skewnessList.append(round(column, 4))
            kurtosis = dataset.select(F.kurtosis(dataset[colm])).toPandas()
            for i, row in kurtosis.iterrows():
                for j, column in row.items():
                    kurtosisList.append(round(column, 4))
            variance = dataset.select(F.variance(dataset[colm])).toPandas()
            for i, row in variance.iterrows():
                for j, column in row.items():
                    varianceList.append(round(column, 4))
        for skew, kurt, var, colm in zip(skewnessList, kurtosisList, varianceList, numericalFeatures):
            print(skew, kurt, var)
            skewKurtVarDict[colm] = [skew, kurt, var]

        # merge the skewness/kurtosis/variance values into the summary dictionary
        for (keyOne, valueOne), (keyTwo, valueTwo) in zip(summaryDict.items(), skewKurtVarDict.items()):
            print(keyOne, valueOne, keyTwo, valueTwo)
            if keyOne == keyTwo:
                valueOne.extend(valueTwo)
                summaryDict[keyOne] = valueOne
        print(summaryDict)
        summaryList.extend(['skewness', 'kurtosis', 'variance'])
        print(summaryList)
        print(summaryDict)

        # optional non-linear transformation of the features
        if relation == 'linear':
            dataset = dataset
        if relation == 'non_linear':
            dataset = Relationship(dataset, relation_list)
        dataset.show()

        # index the label if it is a string column
        for x in Schema:
            if str(x.dataType) == "StringType" and x.name == label:
                for labelkey in label_colm:
                    label_indexer = StringIndexer(inputCol=label, outputCol='indexed_' + label).fit(dataset)
                    dataset = label_indexer.transform(dataset)
                    label = 'indexed_' + label
            else:
                label = label

        # index the categorical features
        indexed_features = []
        for colm in stringFeatures:
            indexer = StringIndexer(inputCol=colm, outputCol='indexed_' + colm).fit(dataset)
            indexed_features.append('indexed_' + colm)
            dataset = indexer.transform(dataset)
        final_features = numericalFeatures + indexed_features

        # chi-square test between the indexed categorical features and the label
        response_chi_test = chi_square_test(dataset=dataset, features=indexed_features,
                                            label_col=label, stringFeatures=stringFeatures)

        # assemble and vector-index the feature vector
        featureassembler = VectorAssembler(inputCols=final_features, outputCol="features")
        dataset = featureassembler.transform(dataset)
        dataset.show()
        vec_indexer = VectorIndexer(inputCol='features', outputCol='vec_indexed_features',
                                    maxCategories=4).fit(dataset)
        categorical_features = vec_indexer.categoryMaps
        print("Chose %d categorical features: %s" %
              (len(categorical_features), ", ".join(str(k) for k in categorical_features.keys())))
        vec_indexed = vec_indexer.transform(dataset)
        vec_indexed.show()
        finalized_data = vec_indexed.select(label, 'vec_indexed_features')

        # splitting the dataset into training and testing
        train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=40)

        # random forest classifier, used here for feature-importance estimation
        rf = RandomForestClassifier(labelCol=label, featuresCol='vec_indexed_features', numTrees=10)
        model = rf.fit(train_data)
        predictions = model.transform(test_data)
        print(model.featureImportances)
        feature_importance = model.featureImportances.toArray().tolist()
        print(feature_importance)
        featureImportance = [round(x, 4) for x in feature_importance]
        print(featureImportance)

        features_column_for_user = numericalFeatures + stringFeatures
        feature_imp = {'feature_importance': featureImportance,
                       'feature_column': features_column_for_user}
        response_dict = {
            'feature_importance': feature_imp,
            'ChiSquareTestData': response_chi_test,
            'summaryDict': summaryDict
        }
        return response_dict
    except Exception as e:
        print("exception is = " + str(e))
def GradientBoostingClassification(self, dataset_add, feature_colm, label_colm, relation_list, relation):
    try:
        dataset = spark.read.csv(dataset_add, sep=';', header=True, inferSchema=True)
        dataset.show()
        stepSize = self.learningRate

        # the label column is passed in as a (single-element) list
        label = ''
        for val in label_colm:
            label = val

        # ETL part: split the requested feature columns into string-like and numerical ones
        Schema = dataset.schema
        stringFeatures = []
        numericalFeatures = []
        for x in Schema:
            if (str(x.dataType) == "StringType" or str(x.dataType) == 'TimestampType'
                    or str(x.dataType) == 'DateType' or str(x.dataType) == 'BooleanType'
                    or str(x.dataType) == 'BinaryType'):
                for y in feature_colm:
                    if x.name == y:
                        dataset = dataset.withColumn(y, dataset[y].cast(StringType()))
                        stringFeatures.append(x.name)
            else:
                for y in feature_colm:
                    if x.name == y:
                        numericalFeatures.append(x.name)

        # optional non-linear transformation of the features
        if relation == 'linear':
            dataset = dataset
        if relation == 'non_linear':
            dataset = Relationship(dataset, relation_list)

        # per-category counts for every categorical column; the largest count drives
        # maxCategories for the vector indexer
        categoryColmList = []
        categoryColmListFinal = []
        categoryColmListDict = {}
        countOfCategoricalColmList = []
        for value in stringFeatures:
            listValue = []
            categoryColm = dataset.groupby(value).count()
            countOfCategoricalColmList.append(categoryColm.count())
            categoryColmJson = categoryColm.toJSON()
            for row in categoryColmJson.collect():
                categoryColmSummary = json.loads(row)
                listValue.append(categoryColmSummary)
            categoryColmListDict[value] = listValue
        if not stringFeatures:
            maxCategories = 5
        else:
            maxCategories = max(countOfCategoricalColmList)

        # index the label if it is a string column
        for x in Schema:
            if str(x.dataType) == "StringType" and x.name == label:
                for labelkey in label_colm:
                    label_indexer = StringIndexer(inputCol=label, outputCol='indexed_' + label).fit(dataset)
                    dataset = label_indexer.transform(dataset)
                    label = 'indexed_' + label
            else:
                label = label

        # index the categorical features, assemble and vector-index the feature vector
        indexed_features = []
        for colm in stringFeatures:
            indexer = StringIndexer(inputCol=colm, outputCol='indexed_' + colm).fit(dataset)
            indexed_features.append('indexed_' + colm)
            dataset = indexer.transform(dataset)
        final_features = numericalFeatures + indexed_features
        featureassembler = VectorAssembler(inputCols=final_features, outputCol="features")
        dataset = featureassembler.transform(dataset)
        vectorIndexer = VectorIndexer(inputCol='features', outputCol='vectorIndexedFeatures',
                                      maxCategories=maxCategories).fit(dataset)
        dataset = vectorIndexer.transform(dataset)

        # train/test split driven by the configured ratio
        trainDataRatioTransformed = self.trainDataRatio
        testDataRatio = 1 - trainDataRatioTransformed
        trainingData, testData = dataset.randomSplit([trainDataRatioTransformed, testDataRatio], seed=0)

        # gradient-boosted tree classifier
        gradientBoostingmodel = GBTClassifier(labelCol=label, featuresCol='vectorIndexedFeatures',
                                              maxIter=10, stepSize=stepSize)
        gradientBoostFittingTrainingData = gradientBoostingmodel.fit(trainingData)
        gBPredictionTrainData = gradientBoostFittingTrainingData.transform(trainingData)
        gBPredictionTestData = gradientBoostFittingTrainingData.transform(testData)
        gBPredictionTestData.select('prediction', label).show()
        featureImportance = gradientBoostFittingTrainingData.featureImportances.toArray().tolist()
        print(featureImportance)

        # prediction graph data: residuals on train and test predictions
        from pyspark.sql.functions import col
        TrainPredictedTargetData = gBPredictionTrainData.select(label, 'prediction', 'probability', 'rawPrediction')
        residualsTrainData = TrainPredictedTargetData.withColumn('residuals', col(label) - col('prediction'))
        residualsTrainData.show()
        TestPredictedTargetData = gBPredictionTestData.select(label, 'prediction', 'probability', 'rawPrediction')
        residualsTestData = TestPredictedTargetData.withColumn('residuals', col(label) - col('prediction'))
        residualsTestData.show()

        # train/test data metrics
        gBPredictionDataDict = {
            'gBPredictionTestData': gBPredictionTestData,
            'gBPredictionTrainData': gBPredictionTrainData
        }
        metricsList = ['f1', 'weightedPrecision', 'weightedRecall', 'accuracy']
        for key, value in gBPredictionDataDict.items():
            if key == 'gBPredictionTestData':
                testDataMetrics = {}
                for metric in metricsList:
                    evaluator = MulticlassClassificationEvaluator(labelCol=label,
                                                                  predictionCol="prediction",
                                                                  metricName=metric)
                    metricValue = evaluator.evaluate(gBPredictionTestData)
                    testDataMetrics[metric] = metricValue
                print('testDataMetrics :', testDataMetrics)
            if key == 'gBPredictionTrainData':
                trainDataMetrics = {}
                for metric in metricsList:
                    evaluator = MulticlassClassificationEvaluator(labelCol=label,
                                                                  predictionCol="prediction",
                                                                  metricName=metric)
                    metricValue = evaluator.evaluate(gBPredictionTrainData)
                    trainDataMetrics[metric] = metricValue
                print('trainDataMetrics :', trainDataMetrics)

        # information gathered while fitting the training data
        totalNumberTrees = gradientBoostFittingTrainingData.getNumTrees
        print('Total number of trees used is :', totalNumberTrees)
        totalNumberNodes = gradientBoostFittingTrainingData.totalNumNodes
        print('Total number of nodes is :', totalNumberNodes)
        treeWeight = gradientBoostFittingTrainingData.treeWeights
        print('Weights on each tree is :', treeWeight)
        treeInfo = gradientBoostFittingTrainingData.trees
        for eachTree in treeInfo:
            print('info of each tree is :', eachTree)
    except Exception as e:
        print('exception is --', e)
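# Illustrative usage sketch for GradientBoostingClassification (added for clarity, not
# part of the original call path). The enclosing class is assumed to expose
# `learningRate` and `trainDataRatio` attributes, as the method body above reads them
# from `self`; the path and column names are placeholders.
def _gradientBoostingExample(predictiveAnalysisObject):
    predictiveAnalysisObject.learningRate = 0.1       # becomes the GBTClassifier stepSize
    predictiveAnalysisObject.trainDataRatio = 0.75    # train/test split ratio
    predictiveAnalysisObject.GradientBoostingClassification(
        dataset_add='/tmp/bank.csv',                  # ';'-separated CSV, as read above
        feature_colm=['age', 'balance', 'job'],
        label_colm=['deposit'],
        relation_list=[],
        relation='linear')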
def lassoRegression(self, dataset_add, feature_colm, label_colm, relation_list, relation, userId): try: dataset = spark.read.parquet(dataset_add) dataset.show() Rsqr_list = [] Rsqr_regPara = {} print(self.xt) # print(data_add) label = '' for val in label_colm: label = val #ETL part Schema = dataset.schema stringFeatures = [] numericalFeatures = [] for x in Schema: if (str(x.dataType) == "StringType" or str(x.dataType) == 'TimestampType' or str(x.dataType) == 'DateType' or str(x.dataType) == 'BooleanType' or str(x.dataType) == 'BinaryType'): for y in feature_colm: if x.name == y: dataset = dataset.withColumn( y, dataset[y].cast(StringType())) stringFeatures.append(x.name) else: for y in feature_colm: if x.name == y: numericalFeatures.append(x.name) if relation == 'linear': dataset = dataset if relation == 'non_linear': dataset = Relationship(dataset, relation_list) categoryColmList = [] categoryColmListFinal = [] categoryColmListDict = {} countOfCategoricalColmList = [] for value in stringFeatures: categoryColm = value listValue = value listValue = [] categoryColm = dataset.groupby(value).count() countOfCategoricalColmList.append(categoryColm.count()) categoryColmJson = categoryColm.toJSON() for row in categoryColmJson.collect(): categoryColmSummary = json.loads(row) listValue.append(categoryColmSummary) categoryColmListDict[value] = listValue if not stringFeatures: maxCategories = 5 else: maxCategories = max(countOfCategoricalColmList) for x in Schema: if (str(x.dataType) == "StringType" and x.name == label): for labelkey in label_colm: label_indexer = StringIndexer( inputCol=label, outputCol='indexed_' + label, handleInvalid="skip").fit(dataset) dataset = label_indexer.transform(dataset) label = 'indexed_' + label else: label = label indexed_features = [] encodedFeatures = [] for colm in stringFeatures: indexer = StringIndexer(inputCol=colm, outputCol='indexed_' + colm, handleInvalid="skip").fit(dataset) indexed_features.append('indexed_' + colm) dataset = indexer.transform(dataset) featureAssembler = VectorAssembler(inputCols=indexed_features + numericalFeatures, outputCol='features', handleInvalid="skip") dataset = featureAssembler.transform(dataset) vectorIndexer = VectorIndexer(inputCol='features', outputCol='vectorIndexedFeatures', maxCategories=maxCategories, handleInvalid="skip").fit(dataset) dataset = vectorIndexer.transform(dataset) trainDataRatioTransformed = self.trainDataRatio testDataRatio = 1 - trainDataRatioTransformed train_data, test_data = dataset.randomSplit( [trainDataRatioTransformed, testDataRatio], seed=40) ######################################################################33 # lasso final for t in self.xt: lr1 = LinearRegression(featuresCol="vectorIndexedFeatures", labelCol=label, elasticNetParam=1, regParam=t) regressor1 = lr1.fit(train_data) print(t) print("coefficient : " + str(regressor1.coefficients)) reg_sum = regressor1.summary r2 = reg_sum.r2 Rsqr_list.append(r2) Rsqr_regPara[r2] = t print(r2) print(Rsqr_list) print(max(Rsqr_list)) maximum_rsqr = max(Rsqr_list) print(Rsqr_regPara) final_regPara = [] for key, val in Rsqr_regPara.items(): if (key == maximum_rsqr): print(val) final_regPara.append(val) for reg in final_regPara: lr_lasso = LinearRegression( featuresCol="vectorIndexedFeatures", labelCol=label, elasticNetParam=1, regParam=reg) regressor = lr_lasso.fit(train_data) training_summary = regressor.summary r2 = training_summary.r2 print(r2) print("coefficient : " + str(regressor.coefficients)) coefficient_t = str(regressor.coefficients) 
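                # NOTE: elasticNetParam=1 makes the penalty pure L1 (lasso); this loop
                # simply refits the model at the regParam value that produced the highest
                # training R-squared in the search loop above.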
print("intercept : " + str(regressor.intercept)) intercept_t = str(regressor.intercept) prediction = regressor.evaluate(test_data) prediction_val = prediction.predictions prediction_val.show() prediction_val_pand = prediction_val.select( label, "prediction").toPandas() prediction_val_pand = prediction_val_pand.assign( residual_vall=prediction_val_pand[label] - prediction_val_pand["prediction"]) prediction_val_pand_residual = prediction_val_pand["residual_vall"] prediction_val_pand_label = prediction_val_pand[label] prediction_val_pand_predict = prediction_val_pand["prediction"] lr_prediction = regressor.transform(test_data) lr_prediction.groupBy(label, "prediction").count().show() lr_prediction_quantile = lr_prediction.select(label, "prediction") lr_prediction_onlypred = lr_prediction.select('prediction') # lr_prediction_quantile.show() # training_summary = regressor.summary print("numof_Iterations...%d\n" % training_summary.totalIterations) print("ObjectiveHistory...%s\n" % str(training_summary.objectiveHistory)) print("RMSE...%f\n" % training_summary.rootMeanSquaredError) RMSE = training_summary.rootMeanSquaredError print("MSE....%f\n" % training_summary.meanSquaredError) MSE = training_summary.meanSquaredError print("r**2(r-square)....::%f\n" % training_summary.r2) r_square = training_summary.r2 print("r**2(r-square adjusted)....%f\n" % training_summary.r2adj) adjsted_r_square = training_summary.r2adj print("deviance residuals %s" % str(training_summary.devianceResiduals)) training_summary.residuals.show() # residual_graph = training_summary.residuals # test = (residual_graph, lr_prediction_onlypred) # residual_graph.write.csv('/home/fidel/PycharmProjects/predictive_analysis_git', header=True, mode='append' ) # print(test) # test.write.csv('/home/fidel/PycharmProjects/predictive_analysis_git', header=True, mode= 'append') # residual_graph_pandas = residual_graph.toPandas() # print("coefficient standard errors: \n" + str(training_summary.coefficientStandardErrors)) # coefficient_error = str(training_summary.coefficientStandardErrors) # print(" Tvalues :\n" + str(training_summary.tValues)) # T_values = str(training_summary.tValues) # print(" p values :\n" + str(training_summary.pValues)) # P_values = str(training_summary.pValues) ####################################################################################################### table_response = { "Intercept": intercept_t, "Coefficients": coefficient_t, "RMSE": RMSE, "MSE": MSE, "R_square": r_square, "Adj_R_square": adjsted_r_square } ####################################################################################################### # residual vs predicted value prediction_data = regressor.summary.predictions prediction_data.show() prediction_data.select(['prediction']).show() predicted = prediction_data.select(['prediction']) regressor.summary.residuals.show() residuals = regressor.summary.residuals pred_d = predicted.withColumn('row_index', f.monotonically_increasing_id()) res_d = residuals.withColumn('row_index', f.monotonically_increasing_id()) pred_residuals = pred_d.join( res_d, on=['row_index']).sort('row_index').drop('row_index') pred_residuals.show() QQPlot = 'QQPlot.parquet' locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/' # userId = '6786103f-b49b-42f2-ba40-aa8168b65e67' QQPlotAddress = locationAddress + userId + QQPlot pred_residuals.write.parquet(QQPlotAddress, mode='overwrite') # pred_residuals.write.parquet('hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/Q_Q_PLOT.parquet', # 
mode='overwrite') #################################################################################3 # scale location plot from pyspark.sql.functions import abs as ab, sqrt, mean as meann, stddev as stdDev df_label = prediction_data.select( label, 'prediction', sqrt(ab(prediction_data[label])).alias("sqrt_label")) df_label.show() df_sqrt_label_index = df_label.withColumn( 'row_index', f.monotonically_increasing_id()) df_sqrt_label_index.show() res_d.show() sqrt_label_residual_join = df_sqrt_label_index.join( res_d, on=['row_index']).sort('row_index').drop('row_index') sqrt_label_residual_join.show() std_resid = sqrt_label_residual_join.select( 'sqrt_label', 'prediction', (sqrt_label_residual_join['residuals'] / sqrt_label_residual_join['sqrt_label']).alias('std_res')) std_resid.show() sqrt_std_res = std_resid.select( "std_res", 'prediction', sqrt(ab(std_resid["std_res"])).alias("sqrt_std_resid")) sqrt_std_res.show() sqrt_std_res_fitted = sqrt_std_res.select('prediction', 'sqrt_std_resid') scaleLocationPlot = 'scaleLocation.parquet' scaleLocationPlotAddress = locationAddress + userId + scaleLocationPlot sqrt_std_res_fitted.write.parquet(scaleLocationPlotAddress, mode='overwrite') # sqrt_std_res_fitted.write.parquet( # 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/scale_location_train.parquet', # mode='overwrite') ########### #QQplot # QUANTILE from scipy.stats import norm import statistics import math res_d.show() sorted_res = res_d.sort('residuals') sorted_res.show() # stdev_ress = sorted_res.select(stdDev(col('residuals')).alias('std_dev'), # meann(col('residuals')).alias('mean')) # stdev_ress.show() # mean_residual = stdev_ress.select(['mean']).toPandas() # l = mean_residual.values.tolist() # print(l) # stddev_residual = stdev_ress.select(['std_dev']).toPandas() # length of the sorted std residuals count = sorted_res.groupBy().count().toPandas() countList = count.values.tolist() tuple1 = () for k in countList: tuple1 = k for tu in tuple1: lengthResiduals = tu print(lengthResiduals) quantileList = [] for x in range(0, lengthResiduals): quantileList.append((x - 0.5) / (lengthResiduals)) print(quantileList) # Z-score on theoritical quantile zTheoriticalTrain = [] for x in quantileList: zTheoriticalTrain.append(norm.ppf(abs(x))) print(zTheoriticalTrain) sortedResidualPDF = sorted_res.select('residuals').toPandas() sortedResidualPDF = sortedResidualPDF['residuals'] stdevResidualTrain = statistics.stdev(sortedResidualPDF) meanResidualTrain = statistics.mean(sortedResidualPDF) zPracticalTrain = [] for x in sortedResidualPDF: zPracticalTrain.append( (x - meanResidualTrain) / stdevResidualTrain) ########## target = dataset.select(label) pred = prediction_data.select(['prediction']) pred_d = pred.withColumn('row_index', f.monotonically_increasing_id()) target_d = target.withColumn('row_index', f.monotonically_increasing_id()) pred_target = pred_d.join(target_d, on=['row_index']).drop('row_index') pred_target.show() dataset.show() pred_target_data_update = dataset.join(pred_target, on=[label]) pred_target_data_update.show(100) ########################################################################################## # scale location plot # for scale location plot # from pyspark.sql.functions import udf # # def std_res(x): # res_list = [] # res_list.append(x) # # std_residuals = udf(lambda y: std_res(y), FloatType()) # # residuals_std = residuals.withColumn('residuals', std_residuals(col('residuals').cast(FloatType()))) # # import statistics # import numpy as np # residuals_panda = 
residuals.toPandas() # # residuals_panda.residuals = range(residuals_panda.shape[1]) # residuals_panda = residuals_panda.values # print(residuals_panda) # stdev_training = statistics.stdev(residuals_panda) # print(stdev_training) ############################################################################################################ # creating the dictionary for storing the result # json_response = coefficient_t # print(json_response) # json_response = {"adjusted r**2 value" : training_summary.r2adj} # DATA VISUALIZATION PART # finding the quantile in the dataset(Q_Q plot) import matplotlib.pyplot as plt y = 0.1 x = [] for i in range(0, 90): x.append(y) y = round(y + 0.01, 2) quantile_label = lr_prediction_quantile.approxQuantile( label, x, 0.01) quantile_prediction = lr_prediction_quantile.approxQuantile( "prediction", x, 0.01) Q_label_pred = '' print(len(quantile_label)) length = len(quantile_label) for i in range(0, len(quantile_label)): Q_label_pred += str(quantile_label[i]) + 't' + str( quantile_prediction[i]) + 'n' import math fitted_residual = '' print(len(prediction_val_pand_residual)) length = len(prediction_val_pand_residual) for i in range(0, len(prediction_val_pand_residual)): fitted_residual += str( prediction_val_pand_predict[i]) + 't' + str( prediction_val_pand_residual[i]) + 'n' ## scale location graph data prediction_val_pand_residual prediction_val_pand_predict prediction_val_pand_residual_abs = prediction_val_pand_residual.abs( ) import math sqrt_residual = [] for x in prediction_val_pand_residual_abs: sqrt_residual.append(math.sqrt(x)) # print ("____________________ ",x) sqrt_residual # calculating std deviation import statistics print(statistics.stdev(prediction_val_pand_residual)) stdev_ = statistics.stdev(prediction_val_pand_residual) # calcuate stnd residuals std_res = [] for x in prediction_val_pand_residual: std_res.append(x / stdev_) print(std_res) # calculating the square root of std_res import math sqr_std_res = [] for x in std_res: sqr_std_res.append(math.sqrt(abs(x))) print(sqr_std_res) scale_predict_residual = '' for pre, res in zip(prediction_val_pand_predict, sqr_std_res): scale_predict_residual += str(pre) + 't' + str(res) + 'n' print(scale_predict_residual) # QUANTILE y = 0.1 x = [] for i in range(0, 90): x.append(y) y = round(y + 0.01, 2) quantile_std_res = spark.createDataFrame(std_res, FloatType()) quantile_std_res.show() quantile_std_res_t = quantile_std_res.approxQuantile( 'value', x, 0.01) print(quantile_std_res_t) print(x) # calculating the z_score from scipy.stats import norm ## sort the list sorted_std_res = sorted(std_res) mean = statistics.mean(sorted_std_res) stdev = statistics.stdev(sorted_std_res) # print(mean) quantile = [] n = len(std_res) print(n) for x in range(0, n): quantile.append((x - 0.5) / (n)) print(quantile) # z_score theoratical z_theory = [] for x in quantile: z_theory.append(norm.ppf(abs(x))) # z score for real val z_pract = [] for x in sorted_std_res: z_pract.append((x - mean) / stdev) Q_label_pred = '' for quant, val in zip(z_theory, z_pract): Q_label_pred += str(quant) + 't' + str(val) + 'n' graph_response = { "Q_Q_plot": Q_label_pred, "residual_fitted": fitted_residual, "scale_location": scale_predict_residual } json_response = { 'table_data': table_response, 'graph_data': graph_response } return json_response except Exception as e: print('exception is =' + str(e))
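
# A condensed sketch (not the original implementation) of the regParam search performed
# in lassoRegression above: fit a pure-L1 (elasticNetParam=1) model for each candidate
# regularisation value and keep the one with the highest training R-squared. The feature
# column name and the candidate list are placeholder assumptions.
from pyspark.ml.regression import LinearRegression


def bestLassoRegParam(train_data, label, candidates=(0.01, 0.05, 0.1, 0.5)):
    bestR2, bestParam = float('-inf'), None
    for regParam in candidates:
        model = LinearRegression(featuresCol='vectorIndexedFeatures', labelCol=label,
                                 elasticNetParam=1, regParam=regParam).fit(train_data)
        r2 = model.summary.r2
        if r2 > bestR2:
            bestR2, bestParam = r2, regParam
    return bestParam, bestR2
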
def ridge(self, dataset_add, feature_colm, label_colm, relation_list, relation, userId): try: dataset = spark.read.csv(dataset_add, header=True, inferSchema=True) dataset.show() Rsqr_list = [] Rsqr_regPara = {} print(self.xt) # print(data_add) # data = spark.read.csv('/home/fidel/mltest/BI.csv', header=True, inferSchema=True) # data.show() # f_data = data.select('Sub Total', 'Tax Amount', 'Freight', 'Profit') # f_data.show() # class A(): # def __init__(self, feature='sahil', label='fcuk'): # self.feature = feature # # feature = 'sahil' # self.label = label # # self.test # self.name = 'bro' # # def linear_c(self): # print(self.feature, '\n', self.label) # print(self.name) # # # a = A(feature='test', label='f_t') # A(feature='test', label='f_t').linear_c() # renaming the colm # print(label_colm) # dataset.withColumnRenamed(label_colm, "label") # print(label_colm) # dataset.show() label = '' for y in label_colm: label = y print(label) # relationship if relation == 'linear': print('linear relationship') if relation == 'non_linear': dataset = Relationship(dataset, relation_list) dataset.show() # implementing the vector assembler featureassembler = VectorAssembler( inputCols=feature_colm, outputCol="Independent_features") output = featureassembler.transform(dataset) output.show() output.select("Independent_features").show() finalized_data = output.select("Independent_features", label) finalized_data.show() # splitting the dataset into taining and testing train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=40) ######################################################################33 # lasso final for t in self.xt: lr1 = LinearRegression(featuresCol="Independent_features", labelCol=label, elasticNetParam=0, regParam=t) regressor1 = lr1.fit(train_data) print(t) print("coefficient : " + str(regressor1.coefficients)) reg_sum = regressor1.summary r2 = reg_sum.r2 Rsqr_list.append(r2) Rsqr_regPara[r2] = t print(r2) print(Rsqr_list) print(max(Rsqr_list)) maximum_rsqr = max(Rsqr_list) print(Rsqr_regPara) final_regPara = [] for key, val in Rsqr_regPara.items(): if (key == maximum_rsqr): print(val) final_regPara.append(val) for reg in final_regPara: lr_lasso = LinearRegression(featuresCol="Independent_features", labelCol=label, elasticNetParam=0, regParam=reg) regressor = lr_lasso.fit(train_data) training_summary = regressor.summary r2 = training_summary.r2 print(r2) print("coefficient : " + str(regressor.coefficients)) coefficient_t = str(regressor.coefficients) print("intercept : " + str(regressor.intercept)) intercept_t = str(regressor.intercept) prediction = regressor.evaluate(test_data) prediction_val = prediction.predictions prediction_val.show() prediction_val_pand = prediction_val.select( label, "prediction").toPandas() prediction_val_pand = prediction_val_pand.assign( residual_vall=prediction_val_pand[label] - prediction_val_pand["prediction"]) prediction_val_pand_residual = prediction_val_pand["residual_vall"] prediction_val_pand_label = prediction_val_pand[label] # print prediction_val_pand_residual prediction_val_pand_predict = prediction_val_pand["prediction"] # print prediction_val_pand_predict # test_summary = prediction.summary # for test data lr_prediction = regressor.transform(test_data) lr_prediction.groupBy(label, "prediction").count().show() lr_prediction_quantile = lr_prediction.select(label, "prediction") lr_prediction_onlypred = lr_prediction.select('prediction') # lr_prediction_quantile.show() # training_summary = regressor.summary print("numof_Iterations...%d\n" % 
training_summary.totalIterations) print("ObjectiveHistory...%s\n" % str(training_summary.objectiveHistory)) print("RMSE...%f\n" % training_summary.rootMeanSquaredError) RMSE = training_summary.rootMeanSquaredError print("MSE....%f\n" % training_summary.meanSquaredError) MSE = training_summary.meanSquaredError print("r**2(r-square)....::%f\n" % training_summary.r2) r_square = training_summary.r2 print("r**2(r-square adjusted)....%f\n" % training_summary.r2adj) adjsted_r_square = training_summary.r2adj print("deviance residuals %s" % str(training_summary.devianceResiduals)) training_summary.residuals.show() # residual_graph = training_summary.residuals # test = (residual_graph, lr_prediction_onlypred) # residual_graph.write.csv('/home/fidel/PycharmProjects/predictive_analysis_git', header=True, mode='append' ) # print(test) # test.write.csv('/home/fidel/PycharmProjects/predictive_analysis_git', header=True, mode= 'append') # residual_graph_pandas = residual_graph.toPandas() print("coefficient standard errors: \n" + str(training_summary.coefficientStandardErrors)) coefficient_error = str(training_summary.coefficientStandardErrors) print(" Tvalues :\n" + str(training_summary.tValues)) T_values = str(training_summary.tValues) print(" p values :\n" + str(training_summary.pValues)) P_values = str(training_summary.pValues) ####################################################################################################### table_response = { "Intercept": intercept_t, "Coefficients": coefficient_t, "RMSE": RMSE, "MSE": MSE, "R_square": r_square, "Adj_R_square": adjsted_r_square, "Coefficient_error": coefficient_error, "T_value": T_values, "P_value": P_values } ####################################################################################################### # residual vs fitted graph prediction_data = regressor.summary.predictions prediction_data.show() prediction_data.select(['prediction']).show() predicted = prediction_data.select(['prediction']) regressor.summary.residuals.show() residuals = regressor.summary.residuals pred_d = predicted.withColumn('row_index', f.monotonically_increasing_id()) res_d = residuals.withColumn('row_index', f.monotonically_increasing_id()) pred_residuals = pred_d.join( res_d, on=['row_index']).sort('row_index').drop('row_index') pred_residuals.show() pred_residuals.write.parquet( 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/residual_fitted_train.parquet', mode='overwrite') ###################################################################################### # scale location plot training data from pyspark.sql.functions import sqrt from pyspark.sql.functions import abs as ab df_label = prediction_data.select( label, 'prediction', sqrt(ab(prediction_data[label])).alias("sqrt_label")) df_label.show() df_sqrt_label_index = df_label.withColumn( 'row_index', f.monotonically_increasing_id()) df_sqrt_label_index.show() # df_residual_index = df_residual.withColumn('row_index', f.monotonically_increasing_id()) # df_residual_index.show() res_d.show() sqrt_label_residual_join = df_sqrt_label_index.join( res_d, on=['row_index']).sort('row_index').drop('row_index') sqrt_label_residual_join.show() std_resid = sqrt_label_residual_join.select( 'sqrt_label', 'prediction', (sqrt_label_residual_join['residuals'] / sqrt_label_residual_join['sqrt_label']).alias('std_res')) std_resid.show() # std_resid_std_res = std_resid.select("std_res") sqrt_std_res = std_resid.select( "std_res", 'prediction', sqrt(ab(std_resid["std_res"])).alias("sqrt_std_resid")) # sqrt_std_res = 
sqrt(abs(std_resid_std_res["std_res"])) sqrt_std_res.show() sqrt_std_res_fitted = sqrt_std_res.select('prediction', 'sqrt_std_resid') sqrt_std_res_fitted.write.parquet( 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/scale_location_train.parquet', mode='overwrite') ###################################################################################### # QUANTILE ''' from pyspark.sql.functions import * res_d.show() sorted_res = res_d.sort('residuals') sorted_res.show() stdev_ress = sorted_res.select(stddev(col('residuals')).alias('std_dev'),mean(col('residuals')).alias('mean')) stdev_ress.show() mean_residual = stdev_ress.select(['mean']).toPandas() stddev_residual = stdev_ress.select(['std_dev']).toPandas() for x in range(0, 5): print(x/mean_residual) ''' #################################################################################### # appending predicted value to the dataset target = dataset.select(label) pred = prediction_data.select(['prediction']) pred_d = pred.withColumn('row_index', f.monotonically_increasing_id()) target_d = target.withColumn('row_index', f.monotonically_increasing_id()) pred_target = pred_d.join(target_d, on=['row_index']).drop('row_index') pred_target.show() dataset.show() pred_target_data_update = dataset.join(pred_target, on=[label]) pred_target_data_update.show(100) ########################################################### import matplotlib.pyplot as plt y = 0.1 x = [] for i in range(0, 90): x.append(y) y = round(y + 0.01, 2) # # for z in x: # print ("~~~~~ ",z) # quantile_label = lr_prediction_quantile.approxQuantile( label, x, 0.01) # print quantile_label quantile_prediction = lr_prediction_quantile.approxQuantile( "prediction", x, 0.01) # creating the csv file and writitng into it fitted_residual = '' print(len(prediction_val_pand_residual)) length = len(prediction_val_pand_residual) for i in range(0, len(prediction_val_pand_residual)): fitted_residual += str( prediction_val_pand_predict[i]) + 't' + str( prediction_val_pand_residual[i]) + 'n' with open('residual_vs_fitted.csv', 'w') as r_f: writer_r_f = csv.writer(r_f) writer_r_f.writerows((prediction_val_pand_predict, prediction_val_pand_residual)) # parquet file writing ## residual vs leverage graph data prediction_val_pand_residual # extreme value in the predictor colm prediction_col_extremeval = lr_prediction_quantile.agg( {"prediction": "max"}) # prediction_col_extremeval.show() # plt.plot(prediction_col_extremeval, prediction_val_pand_residual) # plt.show() ## scale location graph data import math prediction_val_pand_residual prediction_val_pand_predict prediction_val_pand_residual_abs = prediction_val_pand_residual.abs( ) sqrt_residual = [] for x in prediction_val_pand_residual_abs: sqrt_residual.append(math.sqrt(x)) # print ("____________________ ",x) sqrt_residual # plt.scatter(sqrt_residual, prediction_val_pand_predict) ####################################################################################3 # calculating std deviation import statistics print(statistics.stdev(prediction_val_pand_residual)) stdev_pred = statistics.stdev(prediction_val_pand_residual) # mean = statistics.mean(prediction_val_pand_residual) # calcuate stnd residuals std_res = [] for x in prediction_val_pand_residual: std_res.append(x / stdev_pred) print(std_res) # calculating the square root of std_res import math sqr_std_res = [] for x in std_res: sqr_std_res.append(math.sqrt(abs(x))) print(sqr_std_res) #######################################################################################3 # QUANTILE ## sort 
the list sorted_std_res = sorted(std_res) print(sorted_std_res) # mean = statistics.mean(sorted_std_res) stdev = statistics.stdev(sorted_std_res) print(mean) quantile = [] n = len(sorted_std_res) print(n) for x in range(0, n): quantile.append((x - 0.5) / (n)) print(quantile) # # z_score theoritical from scipy.stats import norm z_theory = [] for x in quantile: z_theory.append((norm.ppf(abs(x)))) print(z_theory) # z score for real val z_pract = [] for x in sorted_std_res: z_pract.append((x - mean) / stdev) # y = 0.1 x = [] for i in range(0, 90): x.append(y) y = round(y + 0.01, 2) quantile_std_res = spark.createDataFrame(std_res, FloatType()) quantile_std_res.show() quantile_std_res_t = quantile_std_res.approxQuantile( 'value', x, 0.01) print(quantile_std_res_t) print(x) Q_label_pred = '' print(len(quantile_label)) length = len(quantile_label) for quant, val in zip(z_theory, z_pract): Q_label_pred += str(val) + 't' + str(quant) + 'n' plt.scatter(z_theory, z_pract) plt.savefig('q_q') #################################################### # creating the std residuals # square root of label sqrt_label = [] for x in prediction_val_pand_label: sqrt_label.append(math.sqrt(abs(x))) sqrt_label prediction_val_pand_residual std_residual = [] for sqr, resid in zip(sqrt_label, prediction_val_pand_residual): std_residual.append(resid / sqr) # print(std_sqrt_residual) # creating the std sqr root sqrt_std_residuals = [] for x in std_residual: # print(math.sqrt(abs(x))) sqrt_std_residuals.append(math.sqrt(abs(x))) print(sqrt_std_residuals) # print(std_sqrt_residual) scale_predict_residual = '' for pre, res in zip(prediction_val_pand_predict, sqrt_std_residuals): scale_predict_residual += str(pre) + 't' + str(res) + 'n' print(scale_predict_residual) ########################################################################## """ pred_residuals.show() pred_residuals_pandas = pred_residuals.toPandas() print(pred_residuals_pandas) res_pandas = pred_residuals_pandas['residuals'] pred_pandas = pred_residuals_pandas['prediction'] label_list = [] # for res, pred in zip(res_pandas, pred_pandas): # label_list.append(res+pred) label_pand = prediction_data.select([label]).toPandas() labe_panda = label_pand[label] # sqrt of label column sqrt_lab = [] for lab in labe_panda: sqrt_lab.append(math.sqrt(abs(lab))) print(res_pandas) stdev_res = statistics.stdev(res_pandas) std_res_list = [] for valr, labe in zip(res_pandas,sqrt_lab): std_res_list.append(valr/labe) print(std_res_list) """ ########################################################################## ########################################################################## # import math # sqrt_stdres = [] # for x in std_sqrt_residual: # sqrt_stdres.append(math.sqrt(x)) # # scale_predict_residual = '' # for pre, res in zip(prediction_val_pand_predict, sqrt_stdres): # scale_predict_residual += str(pre) + 't' + str(res) + 'n' # print(scale_predict_residual) ###################################3 # plt.show() # scale_predict_residual='' # # print(len(sqrt_residual)) # length = len(sqrt_residual) # # for i in range(0, len(std_sqrt_residual)): # scale_predict_residual += str(prediction_val_pand_predict[i]) + '|' + str(std_sqrt_residual[i]) + '\n' # with open('scale_location_plot.csv', 'w') as s_l: # writer_s_l = csv.writer(s_l) # writer_s_l.writerows((prediction_val_pand_predict, sqrt_residual)) # writing to the parquet # prediction_val_pand_predict_tospark = spark.createDataFrame(prediction_val_pand_predict, FloatType()) # prediction_val_pand_predict_tospark = 
prediction_val_pand_predict_tospark.withColumnRenamed("value", # "prediction") # # sqrt_residual_tospark= spark.createDataFrame(sqrt_residual, FloatType()) # sqrt_residual_tospark = sqrt_residual_tospark.withColumnRenamed("value", # "sqrt_residual") # # pred_spark = prediction_val_pand_predict_tospark.withColumn('row_index', f.monotonically_increasing_id()) # res_spark = sqrt_residual_tospark.withColumn('row_index', f.monotonically_increasing_id()) # # final_scale_fitted = pred_spark.join(res_spark,on=['row_index']) \ # .sort('row_index').drop('row_index') # # final_scale_fitted.show() # # final_scale_fitted.write.parquet( # 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/SCALE_LOCATION_PLOT.parquet', # mode='overwrite') # # dumping the dictionary into json object # json_response = {'run_status': 'success', 'PredictiveResponse': resultdf} graph_response = { "Q_Q_plot": Q_label_pred, "residual_fitted": fitted_residual, "scale_location": scale_predict_residual } json_response = { 'table_data': table_response, 'graph_data': graph_response } return json_response except Exception as e: print('exception is =' + str(e))
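
# A standalone sketch of the Q-Q plot data that both lassoRegression and ridge assemble:
# theoretical normal quantiles paired with sorted, standardized residuals. It uses the
# common (i + 0.5) / n plotting positions; the methods above use a slightly different
# quantile convention. The residuals argument is a plain Python sequence from the caller.
import statistics
from scipy.stats import norm


def qqPlotPoints(residuals):
    sortedResiduals = sorted(residuals)
    mean = statistics.mean(sortedResiduals)
    stdev = statistics.stdev(sortedResiduals)
    n = len(sortedResiduals)
    theoretical = [norm.ppf((i + 0.5) / n) for i in range(n)]
    empirical = [(r - mean) / stdev for r in sortedResiduals]
    return list(zip(theoretical, empirical))
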
def loadModel(dataset_add, feature_colm, label_colm, relation_list, relation):
    try:
        # dataset = spark.read.csv('/home/fidel/mltest/testData.csv', header=True, inferSchema=True)
        # testDataFetched = testDataFetched.select('Independent_features', 'MPG')
        # testDataFetched.show()
        # testDataFetched.printSchema()
        dataset = spark.read.csv(dataset_add, header=True, inferSchema=True)
        dataset.show()
        # renaming the colm
        # print(label_colm)
        # dataset.withColumnRenamed(label_colm, "label")
        # print(label_colm)
        # dataset.show()
        label = ''
        for y in label_colm:
            label = y
        print(label)
        dictionary_list = {
            'log_list': ["CYLINDERS"],
            'sqrt_list': ["WEIGHT"],
            'cubic_list': ["ACCELERATION"]
        }
        relationship_val = 'linear_reg'
        if relationship_val == 'linear_reg':
            print('linear relationship')
        else:
            dataset = Relationship(dataset, dictionary_list)
            dataset.show()
        # implementing the vector assembler
        featureassembler = VectorAssembler(inputCols=feature_colm,
                                           outputCol="Independent_features")
        output = featureassembler.transform(dataset)
        output.show()
        output = output.select("Independent_features")
        # finalized_data = output.select("Independent_features", label)
        # finalized_data.show()
        regressorTest = LinearRegressionModel.load(
            '/home/fidel/mltest/linearRegressorFitModel')
        predictedData = regressorTest.transform(output)
        predictedData.show()
    except Exception as e:
        print('exception ' + str(e))


# if __name__== '__main__':
#     loadModel()
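
# A minimal sketch of the scoring flow in loadModel above: assemble the feature vector
# and run it through a previously persisted LinearRegressionModel. The model path and
# output column name are placeholders, not values taken from the original code.
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegressionModel


def scoreWithPersistedModel(dataset, feature_colm, modelPath):
    assembler = VectorAssembler(inputCols=feature_colm,
                                outputCol='Independent_features')
    assembled = assembler.transform(dataset).select('Independent_features')
    model = LinearRegressionModel.load(modelPath)
    return model.transform(assembled)
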
def linearReg(self, dataset_add, feature_colm, label_colm, relation_list, relation, userId, locationAddress): try: dataset = spark.read.parquet(dataset_add) dataset.show() label = '' for val in label_colm: label = val #ETL part Schema = dataset.schema stringFeatures = [] numericalFeatures = [] for x in Schema: if (str(x.dataType) == "StringType" or str(x.dataType) == 'TimestampType' or str(x.dataType) == 'DateType' or str(x.dataType) == 'BooleanType' or str(x.dataType) == 'BinaryType'): for y in feature_colm: if x.name == y: dataset = dataset.withColumn( y, dataset[y].cast(StringType())) stringFeatures.append(x.name) else: for y in feature_colm: if x.name == y: numericalFeatures.append(x.name) if relation == 'linear': dataset = dataset if relation == 'non_linear': dataset = Relationship(dataset, relation_list) categoryColmList = [] categoryColmListFinal = [] categoryColmListDict = {} countOfCategoricalColmList = [] for value in stringFeatures: categoryColm = value listValue = value listValue = [] categoryColm = dataset.groupby(value).count() countOfCategoricalColmList.append(categoryColm.count()) categoryColmJson = categoryColm.toJSON() for row in categoryColmJson.collect(): categoryColmSummary = json.loads(row) listValue.append(categoryColmSummary) categoryColmListDict[value] = listValue if not stringFeatures: maxCategories = 5 else: maxCategories = max(countOfCategoricalColmList) for x in Schema: if (str(x.dataType) == "StringType" and x.name == label): for labelkey in label_colm: label_indexer = StringIndexer( inputCol=label, outputCol='indexed_' + label, handleInvalid="skip").fit(dataset) dataset = label_indexer.transform(dataset) label = 'indexed_' + label else: label = label indexed_features = [] # encodedFeatures = [] for colm in stringFeatures: indexer = StringIndexer(inputCol=colm, outputCol='indexed_' + colm, handleInvalid="skip").fit(dataset) indexed_features.append('indexed_' + colm) dataset = indexer.transform(dataset) '''from pyspark.ml.feature import OneHotEncoderEstimator oneHotEncodedFeaturesList = [] for colm in stringFeatures: indexer = StringIndexer(inputCol=colm, outputCol='indexed_' + colm, handleInvalid="skip").fit(dataset) indexed_features.append('indexed_' + colm) dataset = indexer.transform(dataset) oneHotEncodedFeaturesList.append('OneHotEncoded_' + colm) oneHotEncoder=OneHotEncoderEstimator(inputCols=indexed_features, outputCols=oneHotEncodedFeaturesList) oneHotEncoderFit=oneHotEncoder.fit(dataset) oneHotEncoderFeaturesDataset=oneHotEncoderFit.transform(dataset)''' featureAssembler = VectorAssembler(inputCols=indexed_features + numericalFeatures, outputCol='features', handleInvalid="skip") dataset = featureAssembler.transform(dataset) vectorIndexer = VectorIndexer(inputCol='features', outputCol='vectorIndexedFeatures', maxCategories=maxCategories, handleInvalid="skip").fit(dataset) dataset = vectorIndexer.transform(dataset) trainDataRatioTransformed = self.trainDataRatio testDataRatio = 1 - trainDataRatioTransformed train_data, test_data = dataset.randomSplit( [trainDataRatioTransformed, testDataRatio], seed=40) lr = LinearRegression(featuresCol="vectorIndexedFeatures", labelCol=label) regressor = lr.fit(train_data) # locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/' print("coefficient : " + str(regressor.coefficients)) coefficient_t = str(regressor.coefficients) print("intercept : " + str(regressor.intercept)) intercept_t = str(regressor.intercept) featurePredictedLabel = feature_colm featurePredictedLabel.append('prediction') 
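            # NOTE: featurePredictedLabel is the same list object as feature_colm, so the
            # append() calls here and below also extend the caller's feature_colm list;
            # copy the list first if that side effect is not intended.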
featurePredictedLabel.append(label) # testDataEvaluation = regressor.evaluate(test_data) # testDataPrediction = testDataEvaluation.predictions # testDataPrediction.select(featurePredictedLabel).show() prediction = regressor.evaluate(test_data) prediction_val = prediction.predictions testDataPrediction = prediction_val.select(featurePredictedLabel) # storing test predicted value to the dataset prediction_val_pand = prediction_val.select( label, "prediction").toPandas() prediction_val_pand = prediction_val_pand.assign( residual_vall=prediction_val_pand[label] - prediction_val_pand["prediction"]) prediction_val_pand_residual = prediction_val_pand["residual_vall"] prediction_val_pand_label = prediction_val_pand[label] prediction_val_pand_predict = prediction_val_pand["prediction"] lr_prediction = regressor.transform(test_data) lr_prediction.groupBy(label, "prediction").count().show() lr_prediction_quantile = lr_prediction.select(label, "prediction") training_summary = regressor.summary print("numof_Iterations...%d\n" % training_summary.totalIterations) print("ObjectiveHistory...%s\n" % str(training_summary.objectiveHistory)) print("RMSE...%f\n" % training_summary.rootMeanSquaredError) RMSE = training_summary.rootMeanSquaredError print("MSE....%f\n" % training_summary.meanSquaredError) MSE = training_summary.meanSquaredError print("r**2(r-square)....::%f\n" % training_summary.r2) r_square = training_summary.r2 print("r**2(r-square adjusted)....%f\n" % training_summary.r2adj) adjsted_r_square = training_summary.r2adj print("deviance residuals %s" % str(training_summary.devianceResiduals)) training_summary.residuals.show() residual_graph = training_summary.residuals residual_graph_pandas = residual_graph.toPandas() print("coefficient standard errors: \n" + str(training_summary.coefficientStandardErrors)) coefficientStdError = str( training_summary.coefficientStandardErrors) print(" Tvalues :\n" + str(training_summary.tValues)) T_values = str(training_summary.tValues) tValuesList = training_summary.tValues print(" p values :\n" + str(training_summary.pValues)) P_values = str(training_summary.pValues) coefficientList = list(regressor.coefficients) #summaryData import pyspark.sql.functions as F import builtins round = getattr(builtins, 'round') print(coefficientList) coefficientListRounded = [] for value in coefficientList: coefficientListRounded.append(round(value, 4)) # print(coefficientListRounded) # print(intercept_t) interceptRounded = round(float(intercept_t), 4) # print(interceptRounded) # print(RMSE) RMSERounded = round(RMSE, 4) # print(RMSERounded) MSERounded = round(MSE, 4) rSquareRounded = round(r_square, 4) adjustedrSquareRounded = round(adjsted_r_square, 4) coefficientStdError = training_summary.coefficientStandardErrors coefficientStdErrorRounded = [] for value in coefficientStdError: coefficientStdErrorRounded.append(round(float(value), 4)) print(coefficientStdErrorRounded) tValuesListRounded = [] for value in tValuesList: tValuesListRounded.append(round(value, 4)) print(tValuesListRounded) pValuesListRounded = [] PValuesList = training_summary.pValues for value in PValuesList: pValuesListRounded.append(round(value, 4)) print(pValuesListRounded) # regression equation intercept_t = float(intercept_t) coefficientList = list(regressor.coefficients) equation = label, '=', interceptRounded, '+' for feature, coeff in zip(feature_colm, coefficientListRounded): coeffFeature = coeff, '*', feature, '+' equation += coeffFeature equation = equation[:-1] print(equation) equationAsList = 
list(equation) '''# statTable function def summaryTable(self,featuresName,featuresStat): statTable={} for name, stat in zip(featuresName.values(), featuresStat.values()): print(name, ": ", stat) statTable[name]=stat return statTable ''' # significance value PValuesList = training_summary.pValues significanceObject = {} for pValue in pValuesListRounded: if (0 <= pValue < 0.001): significanceObject[pValue] = '***' if (0.001 <= pValue < 0.01): significanceObject[pValue] = '**' if (0.01 <= pValue < 0.05): significanceObject[pValue] = '*' if (0.05 <= pValue < 0.1): significanceObject[pValue] = '.' if (0.1 <= pValue < 1): significanceObject[pValue] = '-' print(significanceObject) # storing test predicted value to the dataset predictionData = 'prediction.parquet' predictionDataStoring = locationAddress + userId + predictionData testDataPrediction.write.parquet(predictionDataStoring, mode='overwrite') # residual vs predicted value prediction_data = regressor.summary.predictions prediction_data.show() prediction_data.select(['prediction']).show() predicted = prediction_data.select(['prediction']) regressor.summary.residuals.show() residuals = regressor.summary.residuals pred_d = predicted.withColumn('row_index', f.monotonically_increasing_id()) res_d = residuals.withColumn('row_index', f.monotonically_increasing_id()) pred_residuals = pred_d.join( res_d, on=['row_index']).sort('row_index').drop('row_index') pred_residuals.show() QQPlot = 'QQPlot.parquet' # locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/' # userId = '6786103f-b49b-42f2-ba40-aa8168b65e67' QQPlotAddress = locationAddress + userId + QQPlot pred_residuals.write.parquet(QQPlotAddress, mode='overwrite') # pred_residuals.write.parquet('hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/Q_Q_PLOT.parquet', # mode='overwrite') #################################################################################3 # scale location plot from pyspark.sql.functions import abs as ab, sqrt, mean as meann, stddev as stdDev df_label = prediction_data.select( label, 'prediction', sqrt(ab(prediction_data[label])).alias("sqrt_label")) df_label.show() df_sqrt_label_index = df_label.withColumn( 'row_index', f.monotonically_increasing_id()) df_sqrt_label_index.show() res_d.show() sqrt_label_residual_join = df_sqrt_label_index.join( res_d, on=['row_index']).sort('row_index').drop('row_index') sqrt_label_residual_join.show() std_resid = sqrt_label_residual_join.select( 'sqrt_label', 'prediction', (sqrt_label_residual_join['residuals'] / sqrt_label_residual_join['sqrt_label']).alias('std_res')) std_resid.show() sqrt_std_res = std_resid.select( "std_res", 'prediction', sqrt(ab(std_resid["std_res"])).alias("sqrt_std_resid")) sqrt_std_res.show() sqrt_std_res_fitted = sqrt_std_res.select('prediction', 'sqrt_std_resid') scaleLocationPlot = 'scaleLocation.parquet' scaleLocationPlotAddress = locationAddress + userId + scaleLocationPlot sqrt_std_res_fitted.write.parquet(scaleLocationPlotAddress, mode='overwrite') # sqrt_std_res_fitted.write.parquet( # 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/scale_location_train.parquet', # mode='overwrite') ########### #QQplot # QUANTILE from scipy.stats import norm import statistics import math res_d.show() sorted_res = res_d.sort('residuals') sorted_res.show() # stdev_ress = sorted_res.select(stdDev(col('residuals')).alias('std_dev'), # meann(col('residuals')).alias('mean')) # stdev_ress.show() # mean_residual = stdev_ress.select(['mean']).toPandas() # l = mean_residual.values.tolist() # print(l) 
# stddev_residual = stdev_ress.select(['std_dev']).toPandas() # length of the sorted std residuals count = sorted_res.groupBy().count().toPandas() countList = count.values.tolist() tuple1 = () for k in countList: tuple1 = k for tu in tuple1: lengthResiduals = tu print(lengthResiduals) quantileList = [] for x in range(0, lengthResiduals): quantileList.append((x - 0.5) / (lengthResiduals)) print(quantileList) # Z-score on theoritical quantile zTheoriticalTrain = [] for x in quantileList: zTheoriticalTrain.append(norm.ppf(abs(x))) print(zTheoriticalTrain) sortedResidualPDF = sorted_res.select('residuals').toPandas() sortedResidualPDF = sortedResidualPDF['residuals'] stdevResidualTrain = statistics.stdev(sortedResidualPDF) meanResidualTrain = statistics.mean(sortedResidualPDF) zPracticalTrain = [] for x in sortedResidualPDF: zPracticalTrain.append( (x - meanResidualTrain) / stdevResidualTrain) ########## target = dataset.select(label) pred = prediction_data.select(['prediction']) pred_d = pred.withColumn('row_index', f.monotonically_increasing_id()) target_d = target.withColumn('row_index', f.monotonically_increasing_id()) pred_target = pred_d.join(target_d, on=['row_index']).drop('row_index') pred_target.show() dataset.show() pred_target_data_update = dataset.join(pred_target, on=[label]) pred_target_data_update.show(100) ##########3 # table_response = { # # "Intercept": intercept_t, # "Coefficients": coefficient_t, # "RMSE": RMSE, # "MSE": MSE, # "R_square": r_square, # "Adj_R_square": adjsted_r_square, # "coefficientStdError": coefficientStdError, # "T_value": T_values, # "P_value": P_values # # } y = 0.1 x = [] for i in range(0, 90): x.append(y) y = round(y + 0.01, 2) quantile_label = lr_prediction_quantile.approxQuantile( label, x, 0.01) quantile_prediction = lr_prediction_quantile.approxQuantile( "prediction", x, 0.01) Q_label_pred = '' print(len(quantile_label)) length = len(quantile_label) for i in range(0, len(quantile_label)): Q_label_pred += str(quantile_label[i]) + 't' + str( quantile_prediction[i]) + 'n' import math fitted_residual = '' print(len(prediction_val_pand_residual)) length = len(prediction_val_pand_residual) for i in range(0, len(prediction_val_pand_residual)): fitted_residual += str( prediction_val_pand_predict[i]) + 't' + str( prediction_val_pand_residual[i]) + 'n' ## scale location graph data prediction_val_pand_residual prediction_val_pand_predict prediction_val_pand_residual_abs = prediction_val_pand_residual.abs( ) import math sqrt_residual = [] for x in prediction_val_pand_residual_abs: sqrt_residual.append(math.sqrt(x)) # print ("____________________ ",x) sqrt_residual # calculating std deviation import statistics print(statistics.stdev(prediction_val_pand_residual)) stdev_ = statistics.stdev(prediction_val_pand_residual) # calcuate stnd residuals std_res = [] for x in prediction_val_pand_residual: std_res.append(x / stdev_) print(std_res) # calculating the square root of std_res import math sqr_std_res = [] for x in std_res: sqr_std_res.append(math.sqrt(abs(x))) print(sqr_std_res) scale_predict_residual = '' for pre, res in zip(prediction_val_pand_predict, sqr_std_res): scale_predict_residual += str(pre) + 't' + str(res) + 'n' print(scale_predict_residual) # QUANTILE y = 0.1 x = [] for i in range(0, 90): x.append(y) y = round(y + 0.01, 2) quantile_std_res = spark.createDataFrame(std_res, FloatType()) quantile_std_res.show() quantile_std_res_t = quantile_std_res.approxQuantile( 'value', x, 0.01) print(quantile_std_res_t) print(x) # calculating the z_score from 
scipy.stats import norm ## sort the list sorted_std_res = sorted(std_res) mean = statistics.mean(sorted_std_res) stdev = statistics.stdev(sorted_std_res) # print(mean) quantile = [] n = len(std_res) print(n) for x in range(0, n): quantile.append((x - 0.5) / (n)) print(quantile) # z_score theoratical z_theory = [] for x in quantile: z_theory.append(norm.ppf(abs(x))) # z score for real val z_pract = [] for x in sorted_std_res: z_pract.append((x - mean) / stdev) Q_label_pred = '' for quant, val in zip(z_theory, z_pract): Q_label_pred += str(quant) + 't' + str(val) + 'n' graph_response = { "Q_Q_plot": Q_label_pred, "residual_fitted": fitted_residual, "scale_location": scale_predict_residual } tableContent = \ { 'coefficientValuesKey': coefficientListRounded, 'tValuesKey': tValuesListRounded, 'pValuesKey': pValuesListRounded, 'significanceValuesKey': significanceObject, 'interceptValuesKey': interceptRounded, "RMSE": RMSERounded, "RSquare": rSquareRounded, "AdjRSquare": adjustedrSquareRounded, "CoefficientStdError": coefficientStdErrorRounded, 'equationKey': equation } json_response = { 'table_data': tableContent, 'graph_data': graph_response } print(json_response) return (json_response) except Exception as e: print('exception is =' + str(e))
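
# A small helper-style sketch of the significance coding used in linearReg above: each
# p-value is mapped to the usual R-style significance symbol with the same thresholds as
# the method. This is an illustrative rewrite, not code taken from the original file.
def significanceCodes(pValues):
    codes = {}
    for p in pValues:
        if p < 0.001:
            codes[p] = '***'
        elif p < 0.01:
            codes[p] = '**'
        elif p < 0.05:
            codes[p] = '*'
        elif p < 0.1:
            codes[p] = '.'
        else:
            codes[p] = '-'
    return codes
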
def Linear_reg(dataset_add, feature_colm, label_colm, relation_list, relation): try: dataset = spark.read.parquet(dataset_add) dataset.show() label = '' for y in label_colm: label = y print(label) # relationship if relation == 'linear': print('linear relationship') if relation == 'non_linear': dataset = Relationship(dataset, relation_list) dataset.show() # renaming the colm # print (label) # dataset.withColumnRenamed(label,"label") # print (label) # dataset.show() featureassembler = VectorAssembler(inputCols=feature_colm, outputCol="Independent_features") output = featureassembler.transform(dataset) output.show() output.select("Independent_features").show() finalized_data = output.select("Independent_features", label) finalized_data.show() # splitting the dataset into taining and testing train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=40) # applying the model lr = LinearRegression(featuresCol="Independent_features", labelCol=label) regressor = lr.fit(train_data) # print regressor.featureImportances # print(dataset.orderBy(feature_colm, ascending=True)) # pred = regressor.transform(test_data) # coefficeint & intercept print("coefficient : " + str(regressor.coefficients)) coefficient_t = str(regressor.coefficients) print("intercept : " + str(regressor.intercept)) intercept_t = str(regressor.intercept) prediction = regressor.evaluate(test_data) VI_IMP = 2 prediction_val = prediction.predictions prediction_val.show() prediction_val_pand = prediction_val.select(label, "prediction").toPandas() prediction_val_pand = prediction_val_pand.assign( residual_vall=prediction_val_pand[label] - prediction_val_pand["prediction"]) prediction_val_pand_residual = prediction_val_pand["residual_vall"] prediction_val_pand_label = prediction_val_pand[label] # print prediction_val_pand_residual prediction_val_pand_predict = prediction_val_pand["prediction"] # print prediction_val_pand_predict # test_summary = prediction.summary # for test data lr_prediction = regressor.transform(test_data) lr_prediction.groupBy(label, "prediction").count().show() lr_prediction_quantile = lr_prediction.select(label, "prediction") # lr_prediction_quantile.show() training_summary = regressor.summary print("numof_Iterations...%d\n" % training_summary.totalIterations) print("ObjectiveHistory...%s\n" % str(training_summary.objectiveHistory)) print("RMSE...%f\n" % training_summary.rootMeanSquaredError) RMSE = training_summary.rootMeanSquaredError print("MSE....%f\n" % training_summary.meanSquaredError) MSE = training_summary.meanSquaredError print("r**2(r-square)....::%f\n" % training_summary.r2) r_square = training_summary.r2 print("r**2(r-square adjusted)....%f\n" % training_summary.r2adj) adjsted_r_square = training_summary.r2adj print("deviance residuals %s" % str(training_summary.devianceResiduals)) training_summary.residuals.show() residual_graph = training_summary.residuals residual_graph_pandas = residual_graph.toPandas() print("coefficient standard errors: \n" + str(training_summary.coefficientStandardErrors)) coefficient_error = str(training_summary.coefficientStandardErrors) print(" Tvalues :\n" + str(training_summary.tValues)) T_values = str(training_summary.tValues) print(" p values :\n" + str(training_summary.pValues)) P_values = str(training_summary.pValues) ####################################################################################################### # residual vs predicted value prediction_data = regressor.summary.predictions prediction_data.show() 
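        # The block below pairs each prediction with its residual by attaching a
        # monotonically increasing row index to both DataFrames and joining on it,
        # since neither DataFrame carries a shared key column.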
prediction_data.select(['prediction']).show() predicted = prediction_data.select(['prediction']) regressor.summary.residuals.show() residuals = regressor.summary.residuals pred_d = predicted.withColumn('row_index', f.monotonically_increasing_id()) res_d = residuals.withColumn('row_index', f.monotonically_increasing_id()) pred_residuals = pred_d.join(res_d, on=['row_index' ]).sort('row_index').drop('row_index') pred_residuals.show() pred_residuals.write.parquet( 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/residual_fitted_plot.parquet', mode='overwrite') # scale location plot ############################################################################################################ #################################################################################### # appending predicted value to the dataset target = dataset.select(label) pred = prediction_data.select(['prediction']) pred_d = pred.withColumn('row_index', f.monotonically_increasing_id()) target_d = target.withColumn('row_index', f.monotonically_increasing_id()) pred_target = pred_d.join(target_d, on=['row_index']).drop('row_index') pred_target.show() dataset.show() pred_target_data_update = dataset.join(pred_target, on=[label]) pred_target_data_update.show(100) ########################################################################################## # creating the dictionary for storing the result table_response = { "Intercept": intercept_t, "Coefficients": coefficient_t, "RMSE": RMSE, "MSE": MSE, "R_square": r_square, "Adj_R_square": adjsted_r_square, "Coefficient_error": coefficient_error, "T_value": T_values, "P_value": P_values } # json_response = coefficient_t # json_response = {"adjusted r**2 value" : training_summary.r2adj} # DATA VISUALIZATION PART # finding the quantile in the dataset(Q_Q plot) y = 0.1 x = [] for i in range(0, 90): x.append(y) y = round(y + 0.01, 2) # # for z in x: # print ("~~~~~ ",z) # quantile_label = lr_prediction_quantile.approxQuantile(label, x, 0.01) # print quantile_label quantile_prediction = lr_prediction_quantile.approxQuantile( "prediction", x, 0.01) # print quantile_prediction Q_label_pred = '' print(len(quantile_label)) length = len(quantile_label) for i in range(0, len(quantile_label)): Q_label_pred += str(quantile_label[i]) + 't' + str( quantile_prediction[i]) + 'n' # # # with open('Q_Q_plot.csv', 'w') as Q_Q: # writer_Q_Q = csv.writer(Q_Q) # writer_Q_Q.writerows((quantile_label, quantile_prediction)) # plt.scatter(quantile_label, quantile_prediction) # plt.show() ## finding the residual vs fitted graph data # plt.scatter(prediction_val_pand_predict, prediction_val_pand_residual) # plt.axhline(y=0.0, color="red") # plt.xlabel("prediction") # plt.ylabel("residual") # plt.title("residual vs fitted ") # # plt.show() # creating the csv file and writitng into it fitted_residual = '' print(len(prediction_val_pand_residual)) length = len(prediction_val_pand_residual) for i in range(0, len(prediction_val_pand_residual)): fitted_residual += str(prediction_val_pand_predict[i]) + 't' + str( prediction_val_pand_residual[i]) + 'n' # # with open('residual_vs_fitted.csv', 'w') as r_f: # writer_r_f = csv.writer(r_f) # writer_r_f.writerows((prediction_val_pand_predict, prediction_val_pand_residual)) ## residual vs leverage graph data # prediction_val_pand_residual # extreme value in the predictor colm # prediction_col_extremeval = lr_prediction_quantile.agg({"prediction": "max"}) # prediction_col_extremeval.show() # plt.plot(prediction_col_extremeval, prediction_val_pand_residual) # 
plt.show() ## scale location graph data prediction_val_pand_residual prediction_val_pand_predict prediction_val_pand_residual_abs = prediction_val_pand_residual.abs() sqrt_residual = [] for x in prediction_val_pand_residual_abs: sqrt_residual.append(math.sqrt(x)) # print ("____________________ ",x) sqrt_residual ######################################## # calculating std deviation print(statistics.stdev(prediction_val_pand_residual)) stdev_ = statistics.stdev(prediction_val_pand_residual) # calcuate stnd residuals std_res = [] for x in prediction_val_pand_residual: std_res.append(x / stdev_) print(std_res) # calculating the square root of std_res sqr_std_res = [] for x in std_res: sqr_std_res.append(math.sqrt(abs(x))) print(sqr_std_res) ####################################### # # # square root of label # sqrt_label = [] # for x in prediction_val_pand_label: # sqrt_label.append(math.sqrt(abs(x))) # # sqrt_label # prediction_val_pand_residual # std_residual = [] # for sqr, resid in zip(sqrt_label, prediction_val_pand_residual): # std_residual.append(resid / sqr) # # print(std_sqrt_residual) # # # creating the std sqr root # # sqrt_std_residuals = [] # for x in std_residual: # # print(math.sqrt(abs(x))) # sqrt_std_residuals.append(math.sqrt(abs(x))) # print(sqrt_std_residuals) # # # # t_sqrt_std_residuals = [] # for x in sqrt_std_residuals: # # print(math.sqrt(abs(x))) # t_sqrt_std_residuals.append(math.sqrt(abs(x))) # # print(sqrt_std_residuals) # # print(std_sqrt_residual) scale_predict_residual = '' for pre, res in zip(prediction_val_pand_predict, sqr_std_res): scale_predict_residual += str(pre) + 't' + str(res) + 'n' print(scale_predict_residual) #######################################################################################3 # QUANTILE y = 0.1 x = [] for i in range(0, 90): x.append(y) y = round(y + 0.01, 2) quantile_std_res = spark.createDataFrame(std_res, FloatType()) quantile_std_res.show() quantile_std_res_t = quantile_std_res.approxQuantile('value', x, 0.01) print(quantile_std_res_t) print(x) # calculating the z_score ## sort the list sorted_std_res = sorted(std_res) mean = statistics.mean(sorted_std_res) stdev = statistics.stdev(sorted_std_res) # print(mean) quantile = [] n = len(std_res) print(n) for x in range(0, n): quantile.append((x - 0.5) / (n)) print(quantile) # z_score theoratical z_theory = [] for x in quantile: z_theory.append(norm.ppf(abs(x))) # z score for real val z_pract = [] for x in sorted_std_res: z_pract.append((x - mean) / stdev) Q_label_pred = '' # print(len(quantile_label)) # length = len(quantile_label) # z=[-2.0,-1.5,-1.0,-0.5,0, 0.5,1.0,1.5,2.0,2.5] for quant, val in zip(z_theory, z_pract): Q_label_pred += str(quant) + 't' + str(val) + 'n' # plt.scatter(z_pract,z_theory) # plt.savefig() # # plt.scatter(z_theory,z_pract) # plt.show() #################################################### ########################################################################################## # # plt.scatter(sqrt_residual, prediction_val_pand_predict) # # plt.show() # # # # # scale_predict_residual='' # # print(len(sqrt_residual)) # length = len(sqrt_residual) # # for i in range(0, len(sqrt_residual)): # scale_predict_residual += str(prediction_val_pand_predict[i]) + 't' + str(sqrt_residual[i]) + 'n' # # # with open('scale_location_plot.csv', 'w') as s_l: # writer_s_l = csv.writer(s_l) # writer_s_l.writerows((prediction_val_pand_predict, sqrt_residual)) # dumping the dictionary into json object graph_response = { "Q_Q_plot": Q_label_pred, "residual_fitted": 
                          fitted_residual,
                          "scale_location": scale_predict_residual}
        json_response = {
            'table_data': table_response,
            'graph_data': graph_response
        }
        # json_response = coefficient_t
        print(json_response)
        # json_response = {'run_status': 'success', 'PredictiveResponse': resultdf}
        return (json_response)
    except Exception as e:
        print('exception is =' + str(e))
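
# A self-contained sketch of the row-index join used throughout this file to line up two
# DataFrames (for example predictions and residuals) that share row order but have no key
# column. As in the methods above, it assumes both inputs keep identical partitioning and
# ordering so that the generated ids match row for row.
import pyspark.sql.functions as f


def zipByRowOrder(left, right):
    leftIndexed = left.withColumn('row_index', f.monotonically_increasing_id())
    rightIndexed = right.withColumn('row_index', f.monotonically_increasing_id())
    return (leftIndexed.join(rightIndexed, on=['row_index'])
            .sort('row_index').drop('row_index'))
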
def linearRegPersist(self, dataset_add, feature_colm, label_colm, relation_list,
                     relation, userId):
    try:
        dataset = spark.read.csv(dataset_add, header=True, inferSchema=True)
        dataset.show()
        label = ''
        for val in label_colm:
            label = val
        Schema = dataset.schema
        stringFeatures = []
        numericalFeatures = []
        for x in Schema:
            if (str(x.dataType) == "StringType"):
                for y in feature_colm:
                    if x.name == y:
                        stringFeatures.append(x.name)
            else:
                for y in feature_colm:
                    if x.name == y:
                        numericalFeatures.append(x.name)
        if relation == 'linear':
            print('linear relationship')
        if relation == 'non_linear':
            dataset = Relationship(dataset, relation_list)
            dataset.show()
        for x in Schema:
            if (str(x.dataType) == "StringType" and x.name == label):
                for labelkey in label_colm:
                    label_indexer = StringIndexer(
                        inputCol=label, outputCol='indexed_' + label).fit(dataset)
                    dataset = label_indexer.transform(dataset)
                    label = 'indexed_' + label
            else:
                label = label
        indexed_features = []
        for colm in stringFeatures:
            indexer = StringIndexer(inputCol=colm,
                                    outputCol='indexed_' + colm).fit(dataset)
            indexed_features.append('indexed_' + colm)
            dataset = indexer.transform(dataset)
        final_features = numericalFeatures + indexed_features
        featureassembler = VectorAssembler(inputCols=final_features,
                                           outputCol="features")
        dataset = featureassembler.transform(dataset)
        vectorIndexer = VectorIndexer(inputCol='features',
                                      outputCol='vectorIndexedFeatures',
                                      maxCategories=4).fit(dataset)
        dataset = vectorIndexer.transform(dataset)
        # Loading the persisted model
        locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/'
        modelPersist = 'linearRegressorModel.parquet'
        persistedModelLocation = locationAddress + userId + modelPersist
        regressorTest = LinearRegressionModel.load(persistedModelLocation)
        predictedData = regressorTest.transform(dataset)
        predictedData.show()
    except Exception as e:
        print('exception is :', e)
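
# A condensed sketch of the shared ETL pattern in the methods above: index each string
# feature, assemble indexed and numerical features into one vector, then let VectorIndexer
# flag categorical dimensions. Column lists and maxCategories are placeholder arguments.
from pyspark.ml.feature import StringIndexer, VectorAssembler, VectorIndexer


def buildFeatureVector(dataset, stringFeatures, numericalFeatures, maxCategories=5):
    indexedNames = []
    for colm in stringFeatures:
        indexer = StringIndexer(inputCol=colm, outputCol='indexed_' + colm,
                                handleInvalid='skip').fit(dataset)
        dataset = indexer.transform(dataset)
        indexedNames.append('indexed_' + colm)
    assembler = VectorAssembler(inputCols=indexedNames + numericalFeatures,
                                outputCol='features', handleInvalid='skip')
    dataset = assembler.transform(dataset)
    vecIndexer = VectorIndexer(inputCol='features', outputCol='vectorIndexedFeatures',
                               maxCategories=maxCategories,
                               handleInvalid='skip').fit(dataset)
    return vecIndexer.transform(dataset)
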