def predictionexample():
    """Fit and display a linear regression on the e-commerce customers CSV.

    Reads ``Ecommerce_Customers.csv`` (header row, inferred schema), packs
    four numeric columns into a single feature vector, splits the rows
    roughly 80/20 at random, fits a ``LinearRegression`` on the larger part
    and shows up to 40 predictions for the smaller part.

    Returns:
        None. All results are printed through ``DataFrame.show()``.
    """
    session = SparkSession.builder.appName('Customers').getOrCreate()
    customers = session.read.csv(
        "Ecommerce_Customers.csv", inferSchema=True, header=True
    )

    # Build the feature vector with VectorAssembler.
    predictor_columns = [
        "Avg Session Length",
        "Time on App",
        "Time on Website",
        "Length of Membership",
    ]
    assembler = VectorAssembler(
        inputCols=predictor_columns, outputCol="Independent Features"
    )
    assembled = assembler.transform(customers)
    assembled.show()

    # Keep only the feature vector and the label column.
    model_input = assembled.select("Independent Features", "Yearly Amount Spent")
    model_input.show()

    # Random 80% / 20% split; no seed is passed, so each run splits differently.
    train_part, test_part = model_input.randomSplit([0.80, 0.20])

    # Linear regression model.
    estimator = LinearRegression(
        featuresCol='Independent Features', labelCol='Yearly Amount Spent'
    )
    fitted_model = estimator.fit(train_part)
    evaluation = fitted_model.evaluate(test_part)
    evaluation.predictions.show(40)
def train_and_pred(train, test_data, tech_only=False):
    """Fit one linear regression and score it on each company's test data.

    Args:
        train: Training data handed to ``LinearRegression.fit``; it needs a
            ``scaledFeatures`` column and the ``TARGET`` label column.
        test_data: Container iterated for its keys and indexed as
            ``test_data[company]`` to obtain the frame to score. Keys must
            support ``.upper()``.
        tech_only: Value written unchanged into the ``Tech_Only_Pred`` column.

    Returns:
        dict: Maps each key of ``test_data`` to a pandas DataFrame holding
        the predictions (without ``scaledFeatures``) plus the columns
        ``Instrument``, ``Error_Pct``, ``Tech_Only_Pred`` and ``row_num``.
    """
    # Train the linear regression model.
    estimator = LinearRegression(
        featuresCol='scaledFeatures',
        labelCol=TARGET,
        maxIter=300,
        regParam=1,
        elasticNetParam=1,
    )
    model = estimator.fit(train)
    print('Coefficients: {}'.format(str(model.coefficients)))
    print('Intercept: {}'.format(str(model.intercept)))

    # Summarize the fit on the training data.
    fit_summary = model.summary
    print('Training r2 = {}'.format(float(fit_summary.r2)))
    print('Training RMSE = {}\n'.format(float(fit_summary.rootMeanSquaredError)))

    # One r2 evaluator serves every company; it carries no per-company state.
    r2_evaluator = RegressionEvaluator(
        predictionCol='prediction', labelCol=TARGET, metricName='r2'
    )

    results = {}
    for company in test_data:
        company_frame = test_data[company]
        scored = model.transform(company_frame)

        # Model evaluation: r2 from the evaluator, RMSE from the model summary.
        company_r2 = r2_evaluator.evaluate(scored)
        label = company.upper()
        print('{}, testing r2 = {}'.format(label, company_r2))
        company_rmse = model.evaluate(company_frame).rootMeanSquaredError
        print('{}, testing RMSE = {}\n'.format(label, company_rmse))

        # Drop the feature vector and tag each row before leaving Spark.
        annotated = (
            scored.drop('scaledFeatures')
            .withColumn('Instrument', lit(company))
            .withColumn('Error_Pct', error_pct_udf(array(TARGET, 'prediction')))
            .withColumn('Tech_Only_Pred', lit(tech_only))
        )
        as_pandas = annotated.toPandas().reset_index()
        results[company] = as_pandas.rename(columns={'index': 'row_num'})
    return results
# Run the validation set through the existing `assembler` and keep only the
# assembled "Attributes" vector plus the column at position 11, which is used
# as the label below.
valid_output = assembler.transform(validationdataset)
valid_finalized_data = valid_output.select("Attributes", validationdataset.columns[11])
valid_finalized_data.show()

# 80/20 split train / test (no seed is passed, so the split differs per run)
train_data, test_data = finalized_data.randomSplit([0.8, 0.2])
regressor = LinearRegression(featuresCol='Attributes', labelCol=dataset.columns[11])

# Train model with training split.
# `regressor` is rebound from the estimator to the fitted model here.
regressor = regressor.fit(train_data)
pred = regressor.evaluate(test_data)

# Show predictions for the held-out test split, then for the validation set.
pred.predictions.show()
predictions = regressor.transform(valid_finalized_data)
predictions.show()

# Row count per distinct value of the "quality" column.
dataset.groupby("quality").count().show()

# ################################################################################################################
# Export the trained model, replacing any model already saved at "cs643".
# NOTE(review): the zip step mentioned for download is not in this span;
# `shutil` and `LinearRegressionModel` are presumably used further down.
import shutil
from pyspark.ml.regression import LinearRegressionModel
regressor.write().overwrite().save("cs643")
# DataFrame.show() prints the table itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line after each
# table.
output.show()
finalized_data = output.select("features", "Close")
finalized_data.show()

# Split the dataset in ratio 8:2 (no seed, so the split differs per run).
train_data, test_data = finalized_data.randomSplit([0.80, 0.20])

# Train the model; `regressor` is rebound from the estimator to the fitted
# model.
regressor = LinearRegression(featuresCol='features', labelCol='Close')
regressor = regressor.fit(train_data)

# Coefficients and intercept of the fitted model.
print(regressor.coefficients)
print(regressor.intercept)

# Predictions on the held-out split.
pred_results = regressor.evaluate(test_data)
pred_results.predictions.show()

# NOTE: imported here but not used in this span.
from pyspark.ml.evaluation import RegressionEvaluator

# Training-set RMSE and coefficient of determination. Reporting is best
# effort, but only ordinary errors are caught (never KeyboardInterrupt or
# SystemExit) and the cause is printed rather than hidden.
try:
    trainingSummary = regressor.summary
    print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
    print("r2: %f" % trainingSummary.r2)
except Exception as exc:
    print(" Model Test have a Problem")
    print("Cause: %r" % (exc,))

# Save the model. overwrite() lets the script be re-run: a plain save()
# raises once the target path already exists.
regressor.write().overwrite().save("StockPricepred_Model")
# Fit a linear regression of 't2mTemp' on the 'Features' vector column.
# `regressor` is rebound from the estimator to the fitted model.
regressor = LinearRegression(featuresCol='Features', labelCol='t2mTemp')
regressor = regressor.fit(train_data)

# COMMAND ----------

# DBTITLE 1,Regression Coefficients
# Bare expression: the notebook renders the value as the cell output.
regressor.coefficients

# COMMAND ----------

# Bare expression: the notebook renders the intercept as the cell output.
regressor.intercept

# COMMAND ----------

# DBTITLE 1,Evaluate model with test data
# Two evaluations: one on `test_data`, one on `finalized_dataTest`
# (both defined outside this span).
pred_results = regressor.evaluate(test_data)
pred_resultsTest = regressor.evaluate(finalized_dataTest)

# COMMAND ----------

#pred_results.predictions.show()

# COMMAND ----------

# DBTITLE 1,Predicted temperature
# Only the `finalized_dataTest` predictions are displayed.
display(pred_resultsTest.predictions)

# COMMAND ----------

import numpy as np
import matplotlib.pyplot as plt
##### Random Forest Ends ##### # In[108]: #### Linear Regression ##### regressor = LinearRegression(featuresCol="independentFeatures",labelCol="quality") regressor=regressor.fit(train) # In[109]: predResults = regressor.evaluate(val) # In[110]: predResults = predResults.predictions # In[111]: regressor.write().overwrite().save("lrModel") # In[112]:
def _vector_to_scalar(value):
    """Return the first component of a ``DenseVector``; pass other values through."""
    if isinstance(value, DenseVector):
        return value.values[0]
    return value


def main():
    """Fit a linear trend to Australia's confirmed COVID-19 counts and plot it.

    Steps:
        1. Load ``time_series_19-covid-Confirmed_archived_0325.csv`` and keep
           the rows whose ``Country/Region`` is ``Australia``.
        2. Drop the descriptive columns and the 1/22/20-1/31/20 dates, then
           sum the remaining date columns over all rows into one row.
        3. Transpose so that each remaining date becomes one row with a day
           index and an ``Infections`` total.
        4. Train ``LinearRegression`` on day indexes below 50 and evaluate it
           on day indexes above 49.
        5. Print the model parameters and training metrics, then plot actual
           versus predicted values for the evaluated days.

    Returns:
        None. Output goes to stdout and a matplotlib window.
    """
    # making sparksession object
    conf = SparkConf().setAppName('Covid-19')
    sc = SparkContext(conf=conf)
    sc.setLogLevel("ERROR")
    spark = SparkSession(sc)

    # load data into spark dataframe
    ####################################
    df = spark.read.format("csv").option("header", "true").load(
        "time_series_19-covid-Confirmed_archived_0325.csv")

    # prepare data
    ####################################
    df = df.filter(F.col("Country/Region") == "Australia")
    columns_to_drop = [
        'Country/Region', 'Province/State', 'Lat', 'Long',
        '1/22/20', '1/23/20', '1/24/20', '1/25/20', '1/26/20',
        '1/27/20', '1/28/20', '1/29/20', '1/30/20', '1/31/20',
    ]
    df = df.drop(*columns_to_drop)

    # Sum all rows into one row; the k-th date column is renamed to str(k)
    # so that the column name can later be cast to a numeric day index.
    df = df.select([
        F.sum(value).alias(str(index))
        for index, value in enumerate(df.columns)
    ])

    # Transpose: one row per day. reset_index() exposes the day number as
    # 'index'; the totals land in pandas column 0, which is selected as "0"
    # and renamed to "Infections" on the Spark side just below.
    df_p = df.toPandas().transpose().reset_index()
    df_s = spark.createDataFrame(df_p)
    df_s = df_s.select(F.col('index'), F.col("0").alias("Infections"))

    # linear regression
    ####################################
    df_s = df_s.withColumn("index_double", df_s['index'].cast(DoubleType()))
    featureassembler = VectorAssembler(inputCols=["index_double"],
                                       outputCol="new_index")
    output = featureassembler.transform(df_s)

    # Split on index_double while the column is still present, then project
    # down to the feature vector and label.
    model_columns = ["new_index", "Infections"]
    test_data = output.where(F.col('index_double') > 49).select(*model_columns)
    train_data = output.where(F.col('index_double') < 50).select(*model_columns)
    train_data.show(50)
    test_data.show()

    regressor = LinearRegression(featuresCol='new_index',
                                 labelCol='Infections')
    regressor = regressor.fit(train_data)
    pred_results = regressor.evaluate(test_data)
    pred_results.predictions.show(60)

    print("Coefficients: " + str(regressor.coefficients))
    print("Intercept: " + str(regressor.intercept))
    trainingSummary = regressor.summary
    print("numIterations: %d" % trainingSummary.totalIterations)
    print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
    print("r2: %f" % trainingSummary.r2)

    # visualize results
    ####################################
    # Collect once so that actual and predicted values come from the same
    # rows, and unwrap the one-element feature vector to a scalar x value
    # for both curves.
    rows = pred_results.predictions.select(
        'new_index', 'Infections', 'prediction').collect()
    days = [_vector_to_scalar(row['new_index']) for row in rows]
    actual = [row['Infections'] for row in rows]
    predicted = [row['prediction'] for row in rows]

    plt.plot(days, actual)
    plt.plot(days, predicted)
    plt.show()
outputCol="Independent Features")
# The line above closes a call that starts outside this span; judging by the
# next statement it builds `featureassembler`.
output = featureassembler.transform(dataset)
finalized_data = output.select("Independent Features", "Close")

# Random 75/25 train/test split (no seed, so the split differs per run).
train_data, test_data = finalized_data.randomSplit([0.75, 0.25])
regressor = LinearRegression(featuresCol='Independent Features', labelCol='Close')
# `regressor` is rebound from the estimator to the fitted model.
regressor = regressor.fit(train_data)
predictions = regressor.transform(test_data)
#predictions.show()

# r2 comes from a RegressionEvaluator over `predictions`; RMSE, MAE and MSE
# come from the summary returned by evaluate() on the same test split.
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="Close",metricName="r2")
test_result = regressor.evaluate(test_data)
print("R Squared (R2) = %g" % lr_evaluator.evaluate(predictions))
print("Root Mean Squared Error (RMSE) = %g" % test_result.rootMeanSquaredError)
print("Mean Absolute Error = %g" % test_result.meanAbsoluteError)
print("Mean Squared Error = %g" % test_result.meanSquaredError)

# NOTE(review): actual and predicted values are pulled by two separate
# toPandas() calls; the plot assumes both return rows in the same order -
# confirm, or take both columns from `predictions` in one call.
actual = test_data.toPandas()['Close'].values.tolist()
predicted = predictions.toPandas()['prediction'].values.tolist()
plt.figure(figsize=(20, 10))
plt.plot(actual, label='Actual', color='green')
plt.plot(predicted, color='red', label='Predicted')
plt.legend(loc="upper left")

# `earlier` is defined outside this span; presumably a start timestamp, which
# would make `diff` the elapsed run time - TODO confirm.
now = datetime.datetime.now()
diff = now - earlier
# Report the fitted parameters.
print("coefficient : " + str(regressor.coefficients))
# NOTE(review): coefficents_m holds the *string* form of the coefficients,
# so the commented-out plot below (coefficents_m*output_features) would need
# the numeric coefficients instead if it is ever re-enabled.
coefficents_m = str(regressor.coefficients)
print("intercept : " + str(regressor.intercept))
intercept_b = regressor.intercept
#
# plt.plot(output_features, output_label)
# plt.plot(output_features, intercept_b + coefficents_m*output_features, "-")
# plt.show()
#
# Evaluate on the test split and show the resulting predictions.
prediction_va = regressor.evaluate(test_data)
prediction_val = prediction_va.predictions
prediction_val.show()
#############################################################################################################
# Pull the label ("MPG") and the prediction into pandas, then build a Spark
# DataFrame from that pandas frame again; the print shows the type of the
# round-tripped object.
prediction_val_pand = prediction_val.select("MPG", "prediction").toPandas()
prediction_val_pand_sprk = spark.createDataFrame(prediction_val_pand)
print(type(prediction_val_pand_sprk))
# prediction_val_pand_sprk.write.csv('/home/fidel/PycharmProjects/predictive_analysis_git', header=True, mode='append')
# Adds a `residual_vall` column; the expression continues past this span.
prediction_val_pand = prediction_val_pand.assign(
    residual_vall=prediction_val_pand["MPG"] -