Example #1
0
def train_and_pred(train, test_data, tech_only=False):
    """Fit one lasso linear regression and score it on each company.

    The model is trained once on ``train`` (features read from the
    'scaledFeatures' column, label from the module-level ``TARGET`` column)
    and then applied to every per-company test set.  Coefficients,
    intercept, and the training/testing r2 and RMSE are printed as a side
    effect.

    Args:
        train: Spark DataFrame used to fit the model.
        test_data: Mapping from company identifier (upper-cased for
            display, so expected to be a string) to that company's test
            Spark DataFrame.
        tech_only: Value written verbatim into the 'Tech_Only_Pred' column
            of every returned frame.

    Returns:
        dict: company identifier -> pandas DataFrame holding the model's
        predictions (without 'scaledFeatures') plus the added columns
        'Instrument', 'Error_Pct' and 'Tech_Only_Pred', and a 'row_num'
        column containing the former pandas index.
    """
    # regParam=1 together with elasticNetParam=1 gives a pure L1 penalty.
    lr_model = LinearRegression(featuresCol='scaledFeatures',
                                labelCol=TARGET,
                                maxIter=300,
                                regParam=1,
                                elasticNetParam=1).fit(train)
    print('Coefficients: {}'.format(str(lr_model.coefficients)))
    print('Intercept: {}'.format(str(lr_model.intercept)))

    # summarize the training
    trainingSummary = lr_model.summary
    print('Training r2 = {}'.format(float(trainingSummary.r2)))
    print('Training RMSE = {}\n'.format(
        float(trainingSummary.rootMeanSquaredError)))

    # The evaluator carries no per-company state, so it is built once here
    # rather than once per loop iteration.
    lr_evaluator = RegressionEvaluator(predictionCol='prediction',
                                       labelCol=TARGET,
                                       metricName='r2')

    predictions_dict = {}
    for company in test_data:
        test_company_data = test_data[company]
        lr_predictions = lr_model.transform(test_company_data)

        # Model evaluation: r2 via the evaluator, RMSE via the model summary
        test_r2 = lr_evaluator.evaluate(lr_predictions)
        print('{}, testing r2 = {}'.format(company.upper(), test_r2))

        test_result = lr_model.evaluate(test_company_data)
        print('{}, testing RMSE = {}\n'.format(
            company.upper(), test_result.rootMeanSquaredError))

        # Drop the feature vector and tag every row with its company, its
        # error percentage and the tech_only flag.
        new_df = lr_predictions.drop('scaledFeatures').withColumn(
            'Instrument', lit(company))
        new_df = new_df.withColumn('Error_Pct',
                                   error_pct_udf(array(TARGET, 'prediction')))
        new_df = new_df.withColumn('Tech_Only_Pred', lit(tech_only))

        # reset_index() exposes the pandas index as a column named 'index',
        # which is then renamed to 'row_num'.
        predictions_dict[company] = new_df.toPandas().reset_index().rename(
            columns={'index': 'row_num'})

    return predictions_dict
Example #2
0
# In the previous exercise you added more predictors to the flight duration model. The model performed well on testing data, but with so many coefficients it was difficult to interpret.

# In this exercise you'll use Lasso regression (regularized with a L1 penalty) to create a more parsimonious model. Many of the coefficients in the resulting model will be set to zero. This means that only a subset of the predictors actually contribute to the model. Despite the simpler model, it still produces a good RMSE on the testing data.

# You'll use a specific value for the regularization strength. Later you'll learn how to find the best value using cross validation.

# The data (same as previous exercise) are available as flights, randomly split into flights_train and flights_test.

# Instructions
# 100 XP
# Fit a linear regression model to the training data.
# Calculate the RMSE on the testing data.
# Look at the model coefficients.
# Get the count of coefficients equal to 0.
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Lasso model: elasticNetParam=1 (alpha = 1) selects a pure L1 penalty and
# regParam=1 fixes the regularization strength for this exercise.
lasso_estimator = LinearRegression(
    labelCol='duration',
    regParam=1,
    elasticNetParam=1,
)
regression = lasso_estimator.fit(flights_train)

# Score the held-out flights and compute the RMSE
test_predictions = regression.transform(flights_test)
rmse_evaluator = RegressionEvaluator(labelCol='duration')
rmse = rmse_evaluator.evaluate(test_predictions)
print("The test RMSE is", rmse)

# Inspect the fitted coefficients
coeffs = regression.coefficients
print(coeffs)

# Count how many coefficients are exactly zero
zero_coeff = sum(beta == 0 for beta in regression.coefficients)
print("Number of ceofficients equal to 0:", zero_coeff)
# Preview the assembled validation set
valid_finalized_data.show()

# Hold out 20% of the rows for testing
train_data, test_data = finalized_data.randomSplit([0.8, 0.2])

# The label is the dataset's column at index 11; fit on the training split
label_column = dataset.columns[11]
regressor = LinearRegression(featuresCol='Attributes',
                             labelCol=label_column).fit(train_data)

# Evaluate on the test split and display the per-row predictions
pred = regressor.evaluate(test_data)
pred.predictions.show()

# Apply the fitted model to the validation set
predictions = regressor.transform(valid_finalized_data)
predictions.show()

# Row count per "quality" value
quality_counts = dataset.groupby("quality").count()
quality_counts.show()

# ################################################################################################################
# export the trained model and create a zip file for ease of download
import shutil
from pyspark.ml.regression import LinearRegressionModel
# Save the fitted model to the path "cs643", replacing any earlier export
model_writer = regressor.write().overwrite()
model_writer.save("cs643")

# Bundle the "cs643" directory into cs643.zip; make_archive returns the
# name of the archive it created.
path_drv = shutil.make_archive(base_name="cs643",
                               format="zip",
                               base_dir="cs643")
shutil.unpack_archive(
    "cs643.zip",
    "test",
    format='zip',
# dow (departure day of week, one-hot encoded, 7 levels) and
# mon (departure month, one-hot encoded, 12 levels).
# These have been assembled into the features column, which is a sparse representation of 32 columns (remember one-hot encoding produces a number of columns which is one fewer than the number of levels).

# The data are available as flights, randomly split into flights_train and flights_test. The object predictions is also available.

# Instructions
# 100 XP
# Fit a linear regression model to the training data.
# Generate predictions for the testing data.
# Calculate the RMSE on the testing data.
# Look at the model coefficients. Are any of them zero?


from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Build an unregularized linear regression on 'duration' and fit it
duration_estimator = LinearRegression(labelCol='duration')
regression = duration_estimator.fit(flights_train)

# Predict durations for the held-out flights
predictions = regression.transform(flights_test)

# RMSE of those predictions against the true durations
rmse_evaluator = RegressionEvaluator(labelCol='duration')
rmse = rmse_evaluator.evaluate(predictions)
print("The test RMSE is", rmse)

# Inspect the fitted coefficients
coeffs = regression.coefficients
print(coeffs)
Example #5
0
# Collapse the six predictor columns into a single 'features' vector
predictor_columns = [
    'weight_kg',
    'cyl',
    'type_dummy',
    'density',
    'density_area',
    'density_volume',
]
assembler = VectorAssembler(inputCols=predictor_columns,
                            outputCol='features')
cars = assembler.transform(cars)

# Keep only the label and the assembled features
kars = cars.select('consumption', 'features')

# Show 12 randomly sampled rows
kars_preview = kars.toPandas().sample(12)
print(kars_preview)

# 80/20 train/test split with a fixed seed
kars_train, kars_test = kars.randomSplit([0.8, 0.2], seed=23)

# --- Standard (unregularized) linear regression ---
plain_estimator = LinearRegression(labelCol='consumption')
regression = plain_estimator.fit(kars_train)

predictions = regression.transform(kars_test)
print("\nStandard Linear Regression")

# Coefficients plus the RMSE reported by the model's training summary
trainingSummary = regression.summary
print("Coefficients: %s" % str(regression.coefficients))
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)

# --- Ridge regression: elasticNetParam=0 selects a pure L2 penalty ---
ridge_estimator = LinearRegression(labelCol='consumption',
                                   elasticNetParam=0,
                                   regParam=0.1)
ridge = ridge_estimator.fit(kars_train)

predictions = ridge.transform(kars_test)
print("\nRidge Regression")
# Build the one-hot encoder for the indexed 'org' column and fit it to the
# flights data in one step.
onehot = OneHotEncoderEstimator(inputCols=['org_idx'],
                                outputCols=['org_dummy']).fit(flights)

# Encode the flights data
flights_onehot = onehot.transform(flights)

# Show each distinct (org, org_idx, org_dummy) combination, ordered by index
org_encoding = flights_onehot.select('org', 'org_idx', 'org_dummy').distinct()
org_encoding.sort('org_idx').show()

# Regression: fit a linear model for 'consumption' on the training split
from pyspark.ml.regression import LinearRegression
regression = LinearRegression(labelCol='consumption')

regression = regression.fit(cars_train)
predictions = regression.transform(cars_test)

# Calculate RMSE
from pyspark.ml.evaluation import RegressionEvaluator
# The frame produced above is named `predictions`; the original evaluated
# the undefined name `prediction` and discarded the result.  The RMSE is now
# kept in `rmse` and printed alongside the other diagnostics.
# Other metrics: mae, r2, mse
rmse = RegressionEvaluator(labelCol='consumption').evaluate(predictions)
print(rmse)
# Examine intercept
print(regression.intercept)
# Examine Coefficients
print(regression.coefficients)

from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Create a regression object and train on training data
# Assemble the predictor columns into a single vector column
flights_assembled = assembler.transform(flites)

# Keep only the assembled features and the label
flites = flights_assembled.select('features', 'duration')

# 80/20 train/test split with a fixed seed
flights_train, flights_test = flites.randomSplit([0.8, 0.2], seed=23)

# Lasso regression: elasticNetParam=1 selects a pure L1 penalty
lasso_estimator = LinearRegression(labelCol="duration",
                                   elasticNetParam=1,
                                   regParam=1)
lasso = lasso_estimator.fit(flights_train)

# Predict on the test split and print 12 randomly sampled rows
predictions = lasso.transform(flights_test)
print("\nLasso Regression")
sampled_rows = predictions.toPandas().sample(12)
print(sampled_rows)

# RMSE on the test predictions
rmse_evaluator = RegressionEvaluator(labelCol="duration")
print("\nRMSE", rmse_evaluator.evaluate(predictions))

# Fitted coefficients and intercept
print("\nCoefficients: %s" % str(lasso.coefficients))
print("Intercept: %s" % str(lasso.intercept))

# Keep the training summary available for further inspection
trainingSummary = lasso.summary
Example #8
0
# Metric of the earlier model's predictions
accuracy = evalVal.evaluate(predictions)

# Score the validation set and round the predictions to whole numbers
transformed_data = model.transform(val).withColumn(
    "prediction", func.round("prediction"))


# In[123]:


####### Linear Regression Accuracy and f1 ############


# Evaluator comparing the "quality" label with the rounded prediction
evaluatorMulti = MulticlassClassificationEvaluator(
    labelCol="quality",
    predictionCol="prediction",
)

# Predict with the regressor, keep label and prediction, then round the
# prediction so it can be compared with the label as a class.
predictionAndTarget = regressor.transform(val).select("quality", "prediction")
predictionAndTarget = predictionAndTarget.withColumn(
    "prediction", func.round("prediction"))

# Evaluate the same frame twice, overriding the metric name per call
acc = evaluatorMulti.evaluate(
    predictionAndTarget, {evaluatorMulti.metricName: "accuracy"})
f1 = evaluatorMulti.evaluate(
    predictionAndTarget, {evaluatorMulti.metricName: "f1"})


# In[124]:

print("##### Testdataset Accuracy  #####")
print("Accuracy :", acc * 100, "%")
print("f1 Score :", f1)

# Assemble the predictor columns into a single vector column
kars_assembled = assembler.transform(kars)

# Keep the assembled features and the label, then show 8 distinct rows
kars = kars_assembled.select('features', 'consumption')
kars.distinct().show(8, truncate=False)

# 80/20 train/test split with a fixed seed
kars_train, kars_test = kars.randomSplit([0.8, 0.2], seed=23)

# Fit a linear regression model (not a logistic one) to the training data
regression = LinearRegression(labelCol='consumption').fit(kars_train)

# Predict on the testing data
prediction = regression.transform(kars_test)

# Count rows per (actual consumption, predicted value) pair
pair_counts = prediction.groupBy("consumption", 'prediction').count()
pair_counts.show(8)

# Fitted coefficients and intercept
print("Coefficients: %s" % str(regression.coefficients))
print("Intercept: %s" % str(regression.intercept))

# Metrics from the model's training summary
trainingSummary = regression.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show(8)
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)
Example #10
0
# Load the stock CSV, letting Spark infer column types from the data
dataset = spark.read.csv(
    "C:/Users/bansi/spark-3.0.1-bin-hadoop2.7/Stock Data Google.csv",
    inferSchema=True,
    header=True)

# Pack the four predictor columns into one vector column
featureassembler = VectorAssembler(inputCols=["Open", "High", "Low", "Volume"],
                                   outputCol="Independent Features")

output = featureassembler.transform(dataset)

# Keep only the feature vector and the label, then split 75/25
finalized_data = output.select("Independent Features", "Close")
train_data, test_data = finalized_data.randomSplit([0.75, 0.25])

# Fit a linear regression of Close on the assembled features
regressor = LinearRegression(featuresCol='Independent Features',
                             labelCol='Close')
regressor = regressor.fit(train_data)
predictions = regressor.transform(test_data)
#predictions.show()


# r2 comes from the evaluator; RMSE/MAE/MSE come from the model's summary
lr_evaluator = RegressionEvaluator(predictionCol="prediction",
                                   labelCol="Close",
                                   metricName="r2")
test_result = regressor.evaluate(test_data)

print("R Squared (R2) = %g" % lr_evaluator.evaluate(predictions))
print("Root Mean Squared Error (RMSE) = %g" % test_result.rootMeanSquaredError)
print("Mean Absolute Error = %g" % test_result.meanAbsoluteError)
print("Mean Squared Error = %g" % test_result.meanSquaredError)

# `predictions` still carries the 'Close' column, so collect it once and read
# both series from the same pandas frame.  This keeps actual/predicted rows
# aligned by construction and avoids a second Spark job over test_data.
predictions_pdf = predictions.toPandas()
actual = predictions_pdf['Close'].values.tolist()
predicted = predictions_pdf['prediction'].values.tolist()
Example #11
0
# Tail of a plotting cell whose beginning is not shown here: label the
# y-axis and render the figure.
plt.ylabel('Beta Coefficients')
plt.show()

# In[39]:

# Bare expression: displays the intercept in a notebook, no effect in a script
regressor.intercept

# In[40]:

# Report how many iterations the fit took, from the training summary
trainingSummary = regressor.summary
print("numIterations: %d" % trainingSummary.totalIterations)

# In[46]:

# Assemble features for the test frame, score it, and keep a pandas copy
test_data = featureassembler.transform(df_test)
rest = regressor.transform(test_data)
df = rest.toPandas()
# Show the first 6 rows of label text, label index and prediction
rest.select("type", "type_index", "prediction").show(6)

# In[42]:

from pyspark.ml.evaluation import BinaryClassificationEvaluator
# NOTE(review): the 'prediction' column produced by `regressor` is used here
# as the raw score for a binary evaluator against 'type_index' -- confirm
# this is intended and that 'type_index' is a 0/1 label.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                          labelCol='type_index')

# In[44]:

# Evaluate the scored test frame and print the result
print("The area under ROC for test set is {}".format(evaluator.evaluate(rest)))

# In[ ]:
Example #12
0
The data (same as previous exercise) are available as flights, randomly split into flights_train and flights_test.

Instructions
100 XP
Fit a linear regression model to the training data.
Calculate the RMSE on the testing data.
Look at the model coefficients.
Get the count of coefficients equal to 0.
'''
SOLUTION

from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Lasso model (alpha = 1): pure L1 penalty with regularization strength 1
lasso_estimator = LinearRegression(labelCol='duration',
                                   regParam=1,
                                   elasticNetParam=1)
regression = lasso_estimator.fit(flights_train)

# Score the testing data and compute the RMSE
test_predictions = regression.transform(flights_test)
rmse_evaluator = RegressionEvaluator(labelCol='duration')
rmse = rmse_evaluator.evaluate(test_predictions)
print("The test RMSE is", rmse)

# Inspect the fitted coefficients
coeffs = regression.coefficients
print(coeffs)

# Count how many coefficients are exactly zero
zero_coeff = sum(beta == 0 for beta in regression.coefficients)
print("Number of ceofficients equal to 0:", zero_coeff)
Example #13
0
# Every training column not listed in `ignore` becomes a predictor
feature_columns = [x for x in train.columns if x not in ignore]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Reduce both frames to the label and the assembled feature vector
lp_columns = ['label', 'features']
train_LP = assembler.transform(train).select(lp_columns)
evaluation_LP = assembler.transform(evaluation).select(lp_columns)

# Define and fit the model.  The original comment called this logistic
# regression, but the estimator is LinearRegression with an elastic-net
# penalty (regParam=0.05, elasticNetParam=0.05).
model_regresion = LinearRegression(
    maxIter=50,
    regParam=0.05,
    elasticNetParam=0.05,
).fit(train_LP)

# Predict on the evaluation set
predictions = model_regresion.transform(evaluation_LP)

# Fitted coefficients and intercept
print("Coefficients: %s" % str(model_regresion.coefficients))
print("Intercept: %s" % str(model_regresion.intercept))

# Metrics from the model's training summary
trainingSummary = model_regresion.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)
Example #14
0
# Assemble the predictor columns into a single vector column
flights_assembled = assembler.transform(flites)

# Keep only the assembled features and the label
flites = flights_assembled.select('features', 'duration')

# 80/20 train/test split with a fixed seed
flights_train, flights_test = flites.randomSplit([0.8, 0.2], seed=23)

# Ridge regression: elasticNetParam=0 selects a pure L2 penalty
ridge_estimator = LinearRegression(labelCol="duration",
                                   elasticNetParam=0,
                                   regParam=0.1)
ridge = ridge_estimator.fit(flights_train)

# Predict on the test split and print 12 randomly sampled rows
predictions = ridge.transform(flights_test)
print("\nRidge Regression")
sampled_rows = predictions.toPandas().sample(12)
print(sampled_rows)

# RMSE on the test predictions
rmse_evaluator = RegressionEvaluator(labelCol="duration")
print("\nRMSE", rmse_evaluator.evaluate(predictions))

# Fitted coefficients and intercept
print("\nCoefficients: %s" % str(ridge.coefficients))
print("Intercept: %s" % str(ridge.intercept))

# Keep the training summary available for further inspection
trainingSummary = ridge.summary
Example #15
0
# Plot on a Google Map using Bokeh (include Maps.py for the plot method)
# execfile('Maps.py')
# plot(lats, lons, count)

# Tuple parameters in lambdas/defs were removed from Python 3 (PEP 3113).
# The record-shaped callbacks below unpack inside the function body instead,
# which keeps the original unpacking semantics and runs on Python 2 and 3.
# NOTE(review): field meanings (postcode key, coordinate pair, (year, month,
# day) date, price) are inferred from how the fields are used -- confirm
# against the code that builds `post` and `valid`.


def _is_reference_postcode(record):
    # Keep only records keyed by the reference postcode.
    k, _ = record
    return k == 'W1J 7NT'


def _coords(record):
    # Turn (key, (c1, c2)) into the list [c1, c2].
    _, (c1, c2) = record
    return [c1, c2]


def _is_year_2010(record):
    # True when the first element of the record's third field is 2010.
    _, _, (y, _, _), _, _ = record
    return y == 2010


def _price_and_distance(record):
    # Pair the record's fourth field with its distance from the reference
    # point l; `l` is looked up when the function is called.
    _, (c0, c1), _, p, _ = record
    return (p, distance(c0, c1, l[0], l[1]))


# Coordinates of the reference postcode; matching [c1, c2] lists are
# concatenated, so l[0] and l[1] are the first pair.
l = post.filter(_is_reference_postcode).map(_coords).reduce(
    lambda x, y: x + y)
p10 = valid.filter(_is_year_2010)

# dist holds 2-tuples built by _price_and_distance, so the pairs below can be
# indexed directly.
dist = p10.map(_price_and_distance)
vectors = dist.map(lambda pair: Vectors.dense([pair[0], pair[1]]))

print(Statistics.corr(vectors, method='spearman'))

# Label = first element of each pair, single feature = second element
parsedData = dist.map(
    lambda pair: LabeledPoint(float(pair[0]), Vectors.dense(pair[1])))
model = LinearRegression(maxIter=10, regParam=0.3,
                         elasticNetParam=0.8).fit(parsedData.toDF())

Beta = model.coefficients
intercept = model.intercept
x = dist.map(lambda pair: Vectors.dense(pair[1]))
y = dist.map(lambda pair: pair[0])
sd_y = y.sampleStdev()
sd_x = x.sampleStdev()
# Rescale the slope by the ratio of the sample standard deviations
r = Beta / (sd_y / sd_x)

predict = model.transform(parsedData.toDF())

evaluator = RegressionEvaluator(metricName='rmse')
RMSE = evaluator.evaluate(predict)  # value noted by the original author: 226861.44751570973