def predictionexample():
    """Fit a linear regression on the e-commerce customers CSV and show results.

    Reads "Ecommerce_Customers.csv" (header row, inferred schema), assembles
    four numeric columns into a single feature vector, fits a
    LinearRegression that predicts "Yearly Amount Spent" on a random 80%
    split, and prints up to 40 rows of predictions for the remaining 20%.

    The intermediate DataFrames are also printed with ``show()``.
    Returns nothing.
    """
    feature_columns = [
        "Avg Session Length",
        "Time on App",
        "Time on Website",
        "Length of Membership",
    ]
    features_col = "Independent Features"
    label_col = "Yearly Amount Spent"

    session = SparkSession.builder.appName('Customers').getOrCreate()
    customers = session.read.csv("Ecommerce_Customers.csv",
                                 inferSchema=True,
                                 header=True)

    # Pack the numeric inputs into the single vector column the estimator reads.
    assembler = VectorAssembler(inputCols=feature_columns,
                                outputCol=features_col)
    assembled = assembler.transform(customers)
    assembled.show()

    # Only the feature vector and the label are needed from here on.
    model_input = assembled.select(features_col, label_col)
    model_input.show()

    # 80% of the rows train the model, the remaining 20% are held out.
    train_part, test_part = model_input.randomSplit([0.80, 0.20])

    estimator = LinearRegression(featuresCol=features_col, labelCol=label_col)
    model = estimator.fit(train_part)

    # Score the held-out rows and display the first 40 predictions.
    held_out_summary = model.evaluate(test_part)
    held_out_summary.predictions.show(40)
Exemple #2
0
def train_and_pred(train, test_data, tech_only=False):
    """Train a linear regression on ``train`` and predict for each company.

    Args:
        train: Spark DataFrame containing a 'scaledFeatures' vector column
            and the label column named by the module-level ``TARGET``.
        test_data: mapping of company identifier (a string; it is
            upper-cased for logging) to the Spark DataFrame to score for
            that company.
        tech_only: value written unchanged into the 'Tech_Only_Pred' column
            of every result frame.

    Returns:
        dict mapping each company to a pandas DataFrame of its predictions.
        Each frame has 'scaledFeatures' dropped, the columns 'Instrument',
        'Error_Pct' and 'Tech_Only_Pred' added, and the former pandas index
        exposed as a 'row_num' column.

    Side effects:
        Prints the fitted coefficients, intercept, training r2/RMSE and the
        per-company testing r2/RMSE.
    """
    # train the linear regression model
    lr_model = LinearRegression(featuresCol='scaledFeatures',
                                labelCol=TARGET,
                                maxIter=300,
                                regParam=1,
                                elasticNetParam=1).fit(train)
    print('Coefficients: {}'.format(lr_model.coefficients))
    print('Intercept: {}'.format(lr_model.intercept))

    # summarize the training
    training_summary = lr_model.summary
    print('Training r2 = {}'.format(float(training_summary.r2)))
    print('Training RMSE = {}\n'.format(
        float(training_summary.rootMeanSquaredError)))

    # The evaluator's configuration does not depend on the company, so build
    # it once instead of once per loop iteration.
    r2_evaluator = RegressionEvaluator(predictionCol='prediction',
                                       labelCol=TARGET,
                                       metricName='r2')

    predictions_dict = {}
    for company, company_df in test_data.items():
        lr_predictions = lr_model.transform(company_df)

        # Model evaluation on this company's rows
        test_r2 = r2_evaluator.evaluate(lr_predictions)
        print('{}, testing r2 = {}'.format(company.upper(), test_r2))

        test_result = lr_model.evaluate(company_df)
        print('{}, testing RMSE = {}\n'.format(
            company.upper(), test_result.rootMeanSquaredError))

        # Tag every row with its company, the error computed by
        # error_pct_udf from (label, prediction), and the tech_only flag.
        enriched = (lr_predictions.drop('scaledFeatures')
                    .withColumn('Instrument', lit(company))
                    .withColumn('Error_Pct',
                                error_pct_udf(array(TARGET, 'prediction')))
                    .withColumn('Tech_Only_Pred', lit(tech_only)))

        # reset_index() turns the positional pandas index into a column,
        # which is then exposed under the name 'row_num'.
        predictions_dict[company] = enriched.toPandas().reset_index().rename(
            columns={'index': 'row_num'})

    return predictions_dict
# Assemble the validation rows into feature vectors using `assembler`
# (defined earlier in the script, outside this excerpt).
valid_output = assembler.transform(validationdataset)

# Keep only the assembled "Attributes" vector and the column at index 11 of
# the validation dataset.
valid_finalized_data = valid_output.select("Attributes",
                                           validationdataset.columns[11])
valid_finalized_data.show()

# 80/20 split train / test
# NOTE(review): no seed is passed to randomSplit, so the split is not
# reproducible between runs.
train_data, test_data = finalized_data.randomSplit([0.8, 0.2])
# The label is the column at index 11 of `dataset`.
regressor = LinearRegression(featuresCol='Attributes',
                             labelCol=dataset.columns[11])

# Train model with training split (rebinds `regressor` from the estimator to
# the fitted model)
regressor = regressor.fit(train_data)

# Evaluate the fitted model on the held-out test split
pred = regressor.evaluate(test_data)

# Show the predictions produced for the test split
pred.predictions.show()

# Score the separate validation data with the fitted model
predictions = regressor.transform(valid_finalized_data)
predictions.show()

# Row count for each distinct value of the "quality" column
dataset.groupby("quality").count().show()

# ################################################################################################################
# export the trained model and create a zip file for ease of download
# NOTE(review): only the model save is visible in this excerpt; the zip step
# presumably follows and uses `shutil` — confirm.
import shutil
from pyspark.ml.regression import LinearRegressionModel
# Save the fitted model to "cs643", overwriting any model already saved there.
regressor.write().overwrite().save("cs643")
Exemple #4
0
# DataFrame.show() prints the table itself and returns None, so wrapping it
# in print() only adds a stray "None" line after each table.
output.show()

# Keep only the feature vector and the label column for modelling.
finalized_data = output.select("features", "Close")

finalized_data.show()
# splitting the dataset in ratio 8:2
train_data, test_data = finalized_data.randomSplit([0.80, 0.20])
# training the model (rebinds `regressor` from the estimator to the fitted model)
regressor = LinearRegression(featuresCol='features', labelCol='Close')
regressor = regressor.fit(train_data)
# finding coefficients
print(regressor.coefficients)
# finding intercept
print(regressor.intercept)

# Evaluate the fitted model on the held-out split.
pred_results = regressor.evaluate(test_data)

pred_results.predictions.show()

from pyspark.ml.evaluation import RegressionEvaluator
# Finding coefficient of determination and RMSE values (both are taken from
# the training summary, i.e. they describe the fit on the training data)
try:
    # training summary
    trainingSummary = regressor.summary
    print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
    print("r2: %f" % trainingSummary.r2)
except Exception as exc:
    # Catch Exception instead of using a bare `except:` so KeyboardInterrupt
    # and SystemExit still propagate, and report the cause instead of
    # silently discarding it.
    print(" Model Test have a Problem")
    print(exc)

# saving the model
regressor.save("StockPricepred_Model")
# Linear regression reading the 'Features' vector column and predicting the
# 't2mTemp' column.
regressor = LinearRegression(featuresCol='Features', labelCol='t2mTemp')
# Fit on the training split; `regressor` now refers to the fitted model.
regressor = regressor.fit(train_data)

# COMMAND ----------

# DBTITLE 1,Regression Coefficients
# Bare expression: its value is only displayed when run as a notebook cell.
regressor.coefficients

# COMMAND ----------

# Bare expression: its value is only displayed when run as a notebook cell.
regressor.intercept

# COMMAND ----------

# DBTITLE 1,Evaluate model with test data
# Evaluate on the held-out split and on `finalized_dataTest`
# (a separate dataset defined outside this excerpt).
pred_results = regressor.evaluate(test_data)
pred_resultsTest = regressor.evaluate(finalized_dataTest)

# COMMAND ----------

#pred_results.predictions.show()

# COMMAND ----------

# DBTITLE 1,Predicted temperature
# `display` is not defined in this excerpt — presumably the notebook
# environment's built-in renderer; confirm before running elsewhere.
display(pred_resultsTest.predictions)

# COMMAND ----------

import numpy as np
import matplotlib.pyplot as plt
##### Random Forest  Ends #####


# In[108]:


#### Linear Regression  #####
# Linear regression reading the "independentFeatures" vector column and
# predicting the "quality" column.
regressor = LinearRegression(featuresCol="independentFeatures",labelCol="quality")
# Fit on `train`; `regressor` is rebound from the estimator to the fitted model.
regressor=regressor.fit(train)


# In[109]:


# Evaluate the fitted model on `val` (defined outside this excerpt;
# presumably the validation split — confirm).
predResults = regressor.evaluate(val)


# In[110]:


# Rebind the name from the evaluation summary to its predictions DataFrame.
predResults = predResults.predictions


# In[111]:


# Save the fitted model to "lrModel", overwriting any model already saved there.
regressor.write().overwrite().save("lrModel")


# In[112]:
Exemple #7
0
def main():
    """Fit a linear trend to Australia's confirmed COVID-19 counts and plot it.

    Steps:
      1. Load "time_series_19-covid-Confirmed_archived_0325.csv" with Spark.
      2. Keep the Australian rows, drop the metadata and January columns and
         sum the remaining per-date columns into a single row.
      3. Transpose that row (via pandas) into one record per date position:
         ``index`` (0, 1, 2, ...) and ``Infections``.
      4. Train a LinearRegression of Infections on the date position for
         positions below 50 and evaluate it on positions above 49.
      5. Print the coefficients and training metrics, then plot actual versus
         predicted values for the test positions.

    Returns nothing; output is printed and shown in a matplotlib window.
    """

    # making sparksession object
    conf = SparkConf().setAppName('Covid-19')
    sc = SparkContext(conf=conf)
    sc.setLogLevel("ERROR")
    spark = SparkSession(sc)

    # load data into spark dataframe
    ####################################
    df = spark.read.format("csv").option(
        "header",
        "true").load("time_series_19-covid-Confirmed_archived_0325.csv")

    # prepare data
    ####################################
    df = df.filter(F.col("Country/Region") == "Australia")
    columns_to_drop = [
        'Country/Region', 'Province/State', 'Lat', 'Long', '1/22/20',
        '1/23/20', '1/24/20', '1/25/20', '1/26/20', '1/27/20', '1/28/20',
        '1/29/20', '1/30/20', '1/31/20'
    ]  # '3/22/20', '3/23/20' will be the test data, will remove bit later
    df = df.drop(*columns_to_drop)
    # sum up all rows data into 1 row; each date column is renamed to its
    # position ("0", "1", ...) so it can serve as a numeric feature later
    df = df.select([
        F.sum(column_name).alias(str(position))
        for position, column_name in enumerate(df.columns)
    ])
    # transpose the dataframe: one row per date position, with the summed
    # count in the pandas column labelled 0
    df_p = df.toPandas().transpose().reset_index()
    # The value column is renamed on the Spark side below (selected as "0").
    # The previous pandas rename() call discarded its result and therefore
    # did nothing, so it has been removed.
    df_s = spark.createDataFrame(df_p)
    df_s = df_s.select(F.col('index'), F.col("0").alias("Infections"))

    # linear regression
    ####################################
    df_s = df_s.withColumn("index_double", df_s['index'].cast(DoubleType()))
    # df_s = df_s.withColumn("infections_double", df_s['Infections'].cast(DoubleType()))
    featureassembler = VectorAssembler(inputCols=["index_double"],
                                       outputCol="new_index")
    output = featureassembler.transform(df_s)
    # Split on index_double while that column is still present, then project
    # down to the feature vector and the label.
    model_columns = ("new_index", "Infections")
    test_data = output.where(F.col('index_double') > 49).select(*model_columns)
    train_data = output.where(F.col('index_double') < 50).select(*model_columns)
    train_data.show(50)
    test_data.show()
    regressor = LinearRegression(featuresCol='new_index',
                                 labelCol='Infections')
    regressor = regressor.fit(train_data)
    pred_results = regressor.evaluate(test_data)
    pred_results.predictions.show(60)
    print("Coefficients: " + str(regressor.coefficients))
    print("Intercept: " + str(regressor.intercept))

    trainingSummary = regressor.summary
    print("numIterations: %d" % trainingSummary.totalIterations)
    print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
    print("r2: %f" % trainingSummary.r2)

    # visualize results
    ####################################
    actual_values = pred_results.predictions.select('new_index',
                                                    'Infections').collect()
    predicted_values = pred_results.predictions.select('new_index',
                                                       'prediction').collect()

    def rows_to_lists(rows):
        """Convert collected Rows to plain lists, unwrapping 1-element vectors."""
        converted = []
        for row in rows:
            converted.append([
                val.values[0] if isinstance(val, DenseVector) else val
                for val in row
            ])
        return converted

    av_list = rows_to_lists(actual_values)
    pv_list = rows_to_lists(predicted_values)
    x1 = [pair[0] for pair in av_list]
    y1 = [pair[1] for pair in av_list]
    # Use the converted list here as well: the raw Rows still carry the x
    # value as a DenseVector rather than a plain number.
    x2 = [pair[0] for pair in pv_list]
    y2 = [pair[1] for pair in pv_list]
    plt.plot(x1, y1)
    plt.plot(x2, y2)
    plt.show()
Exemple #8
0
                                   outputCol="Independent Features")

# Assemble the feature vector column for every row of the dataset.
output = featureassembler.transform(dataset)

# Keep only the feature vector and the label, then split 75/25.
finalized_data = output.select("Independent Features", "Close")
train_data, test_data = finalized_data.randomSplit([0.75, 0.25])
regressor = LinearRegression(featuresCol='Independent Features',
                             labelCol='Close')
# Rebinds `regressor` from the estimator to the fitted model.
regressor = regressor.fit(train_data)
predictions = regressor.transform(test_data)
#predictions.show()


# r2 is computed by the evaluator from the predictions DataFrame; the
# remaining metrics come from the model's own evaluation summary.
lr_evaluator = RegressionEvaluator(predictionCol="prediction",
                                   labelCol="Close", metricName="r2")
test_result = regressor.evaluate(test_data)

print("R Squared (R2) = %g" % lr_evaluator.evaluate(predictions))
print("Root Mean Squared Error (RMSE) = %g" % test_result.rootMeanSquaredError)
print("Mean Absolute Error = %g" % test_result.meanAbsoluteError)
print("Mean Squared Error = %g" % test_result.meanSquaredError)

# `predictions` already carries the 'Close' label next to 'prediction'.
# Collect both columns in a single job so each actual value is paired with
# its own prediction; two separate toPandas() calls run the plan twice and
# do not guarantee the same row order.
results_pdf = predictions.select('Close', 'prediction').toPandas()
actual = results_pdf['Close'].values.tolist()
predicted = results_pdf['prediction'].values.tolist()

plt.figure(figsize=(20, 10))
plt.plot(actual, label='Actual', color='green')
plt.plot(predicted, color='red', label='Predicted')
plt.legend(loc="upper left")
# Elapsed time since `earlier` (a timestamp taken outside this excerpt).
now = datetime.datetime.now()
diff = now - earlier
        print("coefficient : " + str(regressor.coefficients))

        coefficents_m = str(regressor.coefficients)

        print("intercept : " + str(regressor.intercept))

        intercept_b = regressor.intercept

        #
        # plt.plot(output_features, output_label)
        # plt.plot(output_features, intercept_b + coefficents_m*output_features, "-")
        # plt.show()
        #

        prediction_va = regressor.evaluate(test_data)

        prediction_val = prediction_va.predictions
        prediction_val.show()

        #############################################################################################################

        prediction_val_pand = prediction_val.select("MPG",
                                                    "prediction").toPandas()

        prediction_val_pand_sprk = spark.createDataFrame(prediction_val_pand)
        print(type(prediction_val_pand_sprk))
        # prediction_val_pand_sprk.write.csv('/home/fidel/PycharmProjects/predictive_analysis_git', header=True, mode='append')

        prediction_val_pand = prediction_val_pand.assign(
            residual_vall=prediction_val_pand["MPG"] -