Example #1
 def train(self, rdd):
     """
     This ignores the optimizer parameter since it makes config difficult for Linear Regression.
     :return:  Trained model to be passed to test.
     """
     options = self.options
     if options.loss == "l2":
         if options.reg_type in ["none", "l1", "l2"]:
             return LinearRegressionWithSGD.train(data=rdd,
                                                  iterations=options.num_iterations,
                                                  step=options.step_size,
                                                  miniBatchFraction=1.0,
                                                  regParam=options.reg_param,
                                                  regType=options.reg_type)
         elif options.reg_type == "elastic-net":  # use spark.ml
             lr = MLLinearRegression(maxIter=options.num_iterations, regParam=options.reg_param,
                                     elasticNetParam=options.elastic_net_param)
             # TODO: Do not include time for conversion to DataFrame (but this currently matches
             #       the Scala tests)
             df = rdd.toDF()
             lrModel = lr.fit(df)
             return LinearRegressionModel(lrModel.weights, lrModel.intercept)
         else:
             raise Exception("GLMRegressionTest cannot run with loss = %s, reg_type = %s" \
                             % (options.loss, options.reg_type))
     else:
         raise Exception("GLMRegressionTest does not recognize loss: %s" % options.loss)
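A hedged usage sketch, not part of the original benchmark code: assuming `test_case` is an instance of the test class above, with `test_case.options` already populated, and `train_rdd` / `test_rdd` are RDDs of LabeledPoint, the returned mllib model could be scored on held-out data like this:

model = test_case.train(train_rdd)
values_and_preds = test_rdd.map(lambda p: (p.label, model.predict(p.features)))
mse = values_and_preds.map(lambda vp: (vp[0] - vp[1]) ** 2).mean()
print("held-out MSE: %g" % mse)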
Example #2
    def test_java_object_gets_detached(self):
        df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                         (0.0, 2.0, Vectors.sparse(1, [], []))],
                                        ["label", "weight", "features"])
        lr = LinearRegression(maxIter=1, regParam=0.0, solver="normal", weightCol="weight",
                              fitIntercept=False)

        model = lr.fit(df)
        summary = model.summary

        self.assertIsInstance(model, JavaWrapper)
        self.assertIsInstance(summary, JavaWrapper)
        self.assertIsInstance(model, JavaParams)
        self.assertNotIsInstance(summary, JavaParams)

        error_no_object = 'Target Object ID does not exist for this gateway'

        self.assertIn("LinearRegression_", model._java_obj.toString())
        self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

        model.__del__()

        with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
            model._java_obj.toString()
        self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

        try:
            summary.__del__()
        except:
            pass

        with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
            model._java_obj.toString()
        with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
            summary._java_obj.toString()
Example #3
 def test_linear_regression_pmml_basic(self):
     # Most of the validation is done in the Scala side, here we just check
     # that we output text rather than parquet (e.g. that the format flag
     # was respected).
     df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], []))],
                                     ["label", "weight", "features"])
     lr = LinearRegression(maxIter=1)
     model = lr.fit(df)
     path = tempfile.mkdtemp()
     lr_path = path + "/lr-pmml"
     model.write().format("pmml").save(lr_path)
     pmml_text_list = self.sc.textFile(lr_path).collect()
     pmml_text = "\n".join(pmml_text_list)
     self.assertIn("Apache Spark", pmml_text)
     self.assertIn("PMML", pmml_text)
Example #4
    def test_linear_regression_with_huber_loss(self):

        data_path = "data/mllib/sample_linear_regression_data.txt"
        df = self.spark.read.format("libsvm").load(data_path)

        lir = LinearRegression(loss="huber", epsilon=2.0)
        model = lir.fit(df)

        expectedCoefficients = [0.136, 0.7648, -0.7761, 2.4236, 0.537,
                                1.2612, -0.333, -0.5694, -0.6311, 0.6053]
        expectedIntercept = 0.1607
        expectedScale = 9.758

        self.assertTrue(
            np.allclose(model.coefficients.toArray(), expectedCoefficients, atol=1E-3))
        self.assertTrue(np.isclose(model.intercept, expectedIntercept, atol=1E-3))
        self.assertTrue(np.isclose(model.scale, expectedScale, atol=1E-3))
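For context, and as a standard definition rather than something taken from this test: with loss="huber" the squared error is replaced by the Huber loss, which is quadratic for small residuals, r^2 / 2 when |r| <= delta, and linear for large ones, delta * (|r| - delta / 2) otherwise. In Spark's formulation the residuals are scaled by an estimated scale sigma (exposed as model.scale, expected to be about 9.758 above), and the epsilon parameter (2.0 here) sets the cutoff between the two regimes on the scaled residual.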
Example #5
 def test_linear_regression_summary(self):
     df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], []))],
                                     ["label", "weight", "features"])
     lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight",
                           fitIntercept=False)
     model = lr.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     # test that api is callable and returns expected types
     self.assertGreater(s.totalIterations, 0)
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.predictionCol, "prediction")
     self.assertEqual(s.labelCol, "label")
     self.assertEqual(s.featuresCol, "features")
     objHist = s.objectiveHistory
     self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
     self.assertAlmostEqual(s.explainedVariance, 0.25, 2)
     self.assertAlmostEqual(s.meanAbsoluteError, 0.0)
     self.assertAlmostEqual(s.meanSquaredError, 0.0)
     self.assertAlmostEqual(s.rootMeanSquaredError, 0.0)
     self.assertAlmostEqual(s.r2, 1.0, 2)
     self.assertAlmostEqual(s.r2adj, 1.0, 2)
     self.assertTrue(isinstance(s.residuals, DataFrame))
     self.assertEqual(s.numInstances, 2)
     self.assertEqual(s.degreesOfFreedom, 1)
     devResiduals = s.devianceResiduals
     self.assertTrue(isinstance(devResiduals, list) and isinstance(devResiduals[0], float))
     coefStdErr = s.coefficientStandardErrors
     self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
     tValues = s.tValues
     self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))
     pValues = s.pValues
     self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned
     # The child class LinearRegressionTrainingSummary runs full test
     sameSummary = model.evaluate(df)
     self.assertAlmostEqual(sameSummary.explainedVariance, s.explainedVariance)
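As a hedged cross-check that is not part of the original test: the RMSE reported by the summary can be recomputed from s.predictions with a RegressionEvaluator, assuming the SparkSession and the names defined above.

from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
rmse_from_predictions = evaluator.evaluate(s.predictions)
# expected to match s.rootMeanSquaredError up to floating-point noise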
Example #6
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

# $example on$
from pyspark.ml.regression import LinearRegression
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("LinearRegressionWithElasticNet").getOrCreate()

    # $example on$
    # Load training data
    training = spark.read.format("libsvm")\
        .load("data/mllib/sample_linear_regression_data.txt")

    lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

    # Fit the model
    lrModel = lr.fit(training)

    # Print the coefficients and intercept for linear regression
    print("Coefficients: " + str(lrModel.coefficients))
    print("Intercept: " + str(lrModel.intercept))
    # $example off$

    spark.stop()
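For reference (standard Spark ML behavior, not something specific to this example): regParam is the overall regularization strength lambda and elasticNetParam is the L1/L2 mixing parameter alpha, so the penalty added to the least-squares objective is lambda * (alpha * ||w||_1 + (1 - alpha)/2 * ||w||_2^2). With regParam=0.3 and elasticNetParam=0.8 as above, the penalty is mostly L1 (lasso-like); alpha=0 would be pure ridge and alpha=1 pure lasso.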
    df = data.toDF(colNames)

    # Note, there are lots of cases where you can avoid going from an RDD to a DataFrame.
    # Perhaps you're importing data from a real database. Or you are using structured streaming
    # to get your data.

    # Let's split our data into training data and testing data
    trainTest = df.randomSplit([0.5, 0.5])
    trainingDF = trainTest[0]
    testDF = trainTest[1]

    # Now create our linear regression model
    lir = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

    # Train the model using our training data
    model = lir.fit(trainingDF)

    # Now see if we can predict values in our test data.
    # Generate predictions using our linear regression model for all features in our
    # test dataframe:
    fullPredictions = model.transform(testDF).cache()

    # Extract the predictions and the "known" correct labels.
    predictions = fullPredictions.select("prediction").rdd.map(lambda x: x[0])
    labels = fullPredictions.select("label").rdd.map(lambda x: x[0])

    # Zip them together
    predictionAndLabel = predictions.zip(labels).collect()

    # Print out the predicted and actual values for each point
    for prediction in predictionAndLabel:
        print(prediction)
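As a hedged illustration of the note above about avoiding the RDD-to-DataFrame hop (every name and file path here is an assumption, not part of the original script): the same label/features frame could be built straight from a columnar source such as a CSV file.

from pyspark.ml.feature import VectorAssembler

raw = (spark.read
       .option("header", "true")
       .option("inferSchema", "true")
       .csv("ratings_per_day.csv"))  # hypothetical file with columns 'label' and 'day'
assembler = VectorAssembler(inputCols=["day"], outputCol="features")
df_from_csv = assembler.transform(raw).select("label", "features")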
Example #8
df_vector = spark.createDataFrame(
    input_data, ["fare", "features"])  # Create a vector dataframe
# Scale the Pclass values to make it more fit for analysis
standardScaler = StandardScaler(inputCol="features",
                                outputCol="features_scaled")
scaler = standardScaler.fit(df_vector)
df_scaled = scaler.transform(df_vector)

# Create train and test data for the regression model
train_data, test_data = df_scaled.randomSplit([.8, .2], seed=1234)
# Create a Linear Regression model
lr = LinearRegression(labelCol="fare",
                      maxIter=10,
                      regParam=0.3,
                      elasticNetParam=0.8)
model = lr.fit(train_data)

print('\n------------- Question 3 -------------')
# Print some important statistics from the regression model
print(
    'Linear Regression model statistics for dependent Fare and independent Pclass:'
)
print("Coefficient(s): %s" % str(model.coefficients))
print("Intercept: %s" % str(model.intercept))
print("RMSE: %f" % model.summary.rootMeanSquaredError)
print("r2: %f" % model.summary.r2)
print('\n')

# Answer y = b + ax or fare = intercept + coefficient * pclass
# round output to 2 decimals
q3a = round(model.intercept + (model.coefficients[0] * 1), 2)
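Applying the same equation to the other passenger classes (a continuation sketch; q3b and q3c are hypothetical names, not from the original):

q3b = round(model.intercept + (model.coefficients[0] * 2), 2)  # predicted fare for Pclass 2
q3c = round(model.intercept + (model.coefficients[0] * 3), 2)  # predicted fare for Pclass 3
print(q3a, q3b, q3c)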
Example #9
# Replace `df` with the new DataFrame
Dataframe = spark.createDataFrame(input_data, ["label", "features"])
from pyspark.ml.feature import StandardScaler
standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")

scaler = standardScaler.fit(Dataframe)
scaled_df = scaler.transform(Dataframe)

train_data, test_data = scaled_df.randomSplit([.8,.2],seed=1234)

from pyspark.ml.regression import LinearRegression

lr = LinearRegression(labelCol="label", maxIter=10, regParam=0.3, elasticNetParam=0.8)

linearModel = lr.fit(train_data)

I got an error here:

predicted = linearModel.transform(test_data)
predictions = predicted.select("prediction").rdd.map(lambda x: x[0])
labels = predicted.select("label").rdd.map(lambda x: x[0])
predictionAndLabel = predictions.zip(labels).collect()
predictionAndLabel[:5]
linearModel.coefficients
linearModel.intercept


# Get the RMSE
linearModel.summary.rootMeanSquaredError
#The RMSE measures the error between two datasets, comparing a predicted value against an observed or known value.
#The smaller an RMSE value, the closer predicted and observed values are.
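A hedged sketch of the RMSE described in the comments above, computed on the test predictions from this snippet (the `predicted` DataFrame defined earlier):

from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
print(evaluator.evaluate(predicted))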
Example #10
    def ridgeRegression(self, dataset_add, feature_colm, label_colm, relation_list, relation,userId):
        try:
            dataset = spark.read.parquet(dataset_add)
            dataset.show()
            Rsqr_list = []
            Rsqr_regPara = {}
            print(self.xt)
            # print(data_add)

            label = ''
            for val in label_colm:
                label = val
            #ETL part
            Schema = dataset.schema
            stringFeatures = []
            numericalFeatures = []
            for x in Schema:
                if (str(x.dataType) == "StringType" or str(x.dataType) == 'TimestampType' or str(
                        x.dataType) == 'DateType' or str(x.dataType) == 'BooleanType' or str(x.dataType) == 'BinaryType'):
                    for y in feature_colm:
                        if x.name == y:
                            dataset = dataset.withColumn(y, dataset[y].cast(StringType()))
                            stringFeatures.append(x.name)
                else:
                    for y in feature_colm:
                        if x.name == y:
                            numericalFeatures.append(x.name)

            if relation == 'linear':
                dataset = dataset
            if relation == 'non_linear':
                dataset = Relationship(dataset, relation_list)


            categoryColmList = []
            categoryColmListFinal = []
            categoryColmListDict = {}
            countOfCategoricalColmList = []
            for value in stringFeatures:
                categoryColm = value
                listValue = value
                listValue = []
                categoryColm = dataset.groupby(value).count()
                countOfCategoricalColmList.append(categoryColm.count())
                categoryColmJson = categoryColm.toJSON()
                for row in categoryColmJson.collect():
                    categoryColmSummary = json.loads(row)
                    listValue.append(categoryColmSummary)
                categoryColmListDict[value] = listValue

            if not stringFeatures:
                maxCategories = 5
            else:
                maxCategories = max(countOfCategoricalColmList)
            for x in Schema:
                if (str(x.dataType) == "StringType" and x.name == label):
                    for labelkey in label_colm:
                        label_indexer = StringIndexer(inputCol=label, outputCol='indexed_' + label, handleInvalid="skip").fit(dataset)
                        dataset = label_indexer.transform(dataset)
                        label = 'indexed_' + label
                else:
                    label = label
            indexed_features = []
            encodedFeatures = []
            for colm in stringFeatures:
                indexer = StringIndexer(inputCol=colm, outputCol='indexed_' + colm, handleInvalid="skip").fit(dataset)
                indexed_features.append('indexed_' + colm)
                dataset = indexer.transform(dataset)
            featureAssembler = VectorAssembler(inputCols=indexed_features + numericalFeatures, outputCol='features', handleInvalid="skip")
            dataset = featureAssembler.transform(dataset)
            vectorIndexer = VectorIndexer(inputCol='features', outputCol='vectorIndexedFeatures', maxCategories=maxCategories, handleInvalid="skip").fit(
                dataset)
            dataset = vectorIndexer.transform(dataset)
            trainDataRatioTransformed = self.trainDataRatio
            testDataRatio = 1 - trainDataRatioTransformed
            train_data, test_data = dataset.randomSplit([trainDataRatioTransformed, testDataRatio], seed=40)

            ######################################################################

            for t in self.xt:
                lr1 = LinearRegression(featuresCol="vectorIndexedFeatures", labelCol=label, elasticNetParam=0,
                                       regParam=t)
                regressor1 = lr1.fit(train_data)
                print(t)
                print("coefficient : " + str(regressor1.coefficients))
                reg_sum = regressor1.summary
                r2 = reg_sum.r2
                Rsqr_list.append(r2)
                Rsqr_regPara[r2] = t
                print(r2)

            print(Rsqr_list)
            print(max(Rsqr_list))
            maximum_rsqr = max(Rsqr_list)
            print(Rsqr_regPara)
            final_regPara = []

            for key, val in Rsqr_regPara.items():
                if (key == maximum_rsqr):
                    print(val)
                    final_regPara.append(val)

            for reg in final_regPara:
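                # Note: elasticNetParam=0 is a pure L2 (ridge) penalty, so despite the
                # `lr_lasso` name this refits a ridge model at the best regParam found above.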
                lr_lasso = LinearRegression(featuresCol="vectorIndexedFeatures", labelCol=label, elasticNetParam=0,
                                            regParam=reg)
                regressor = lr_lasso.fit(train_data)
                training_summary = regressor.summary
                r2 = training_summary.r2
                print(r2)

            print("coefficient : " + str(regressor.coefficients))
            coefficient_t = str(regressor.coefficients)
            print("intercept : " + str(regressor.intercept))
            intercept_t = str(regressor.intercept)
            prediction = regressor.evaluate(test_data)
            prediction_val = prediction.predictions
            prediction_val.show()
            prediction_val_pand = prediction_val.select(label, "prediction").toPandas()
            prediction_val_pand = prediction_val_pand.assign(
                residual_vall=prediction_val_pand[label] - prediction_val_pand["prediction"])

            prediction_val_pand_residual = prediction_val_pand["residual_vall"]
            prediction_val_pand_label = prediction_val_pand[label]
            prediction_val_pand_predict = prediction_val_pand["prediction"]
            lr_prediction = regressor.transform(test_data)
            lr_prediction.groupBy(label, "prediction").count().show()
            lr_prediction_quantile = lr_prediction.select(label, "prediction")
            lr_prediction_onlypred = lr_prediction.select('prediction')

            # training_summary = regressor.summary

            print("numof_Iterations...%d\n" % training_summary.totalIterations)
            print("ObjectiveHistory...%s\n" % str(training_summary.objectiveHistory))
            print("RMSE...%f\n" % training_summary.rootMeanSquaredError)
            RMSE = training_summary.rootMeanSquaredError
            print("MSE....%f\n" % training_summary.meanSquaredError)
            MSE = training_summary.meanSquaredError
            print("r**2(r-square)....::%f\n" % training_summary.r2)
            r_square = training_summary.r2
            print("r**2(r-square adjusted)....%f\n" % training_summary.r2adj)
            adjsted_r_square = training_summary.r2adj
            print("deviance residuals %s" % str(training_summary.devianceResiduals))
            training_summary.residuals.show()
            residual_graph = training_summary.residuals
            residual_graph_pandas = residual_graph.toPandas()
            print("coefficient standard errors: \n" + str(training_summary.coefficientStandardErrors))
            coefficientStdError = str(training_summary.coefficientStandardErrors)
            print(" Tvalues :\n" + str(training_summary.tValues))
            T_values = str(training_summary.tValues)
            tValuesList = training_summary.tValues
            print(" p values :\n" + str(training_summary.pValues))
            P_values = str(training_summary.pValues)
            coefficientList = list(regressor.coefficients)

            #summaryData
            import pyspark.sql.functions as F
            import builtins
            round = getattr(builtins, 'round')
            print(coefficientList)
            coefficientListRounded = []
            for value in coefficientList:
                coefficientListRounded.append(round(value, 4))
            # print(coefficientListRounded)
            # print(intercept_t)
            interceptRounded = round(float(intercept_t), 4)
            # print(interceptRounded)
            # print(RMSE)
            RMSERounded = round(RMSE, 4)
            # print(RMSERounded)
            MSERounded = round(MSE, 4)
            rSquareRounded = round(r_square, 4)
            adjustedrSquareRounded = round(adjsted_r_square, 4)
            coefficientStdError = training_summary.coefficientStandardErrors
            coefficientStdErrorRounded = []
            for value in coefficientStdError:
                coefficientStdErrorRounded.append(round(float(value), 4))
            print(coefficientStdErrorRounded)
            tValuesListRounded = []
            for value in tValuesList:
                tValuesListRounded.append(round(value, 4))
            print(tValuesListRounded)
            pValuesListRounded = []
            PValuesList = training_summary.pValues

            for value in PValuesList:
                pValuesListRounded.append(round(value, 4))
            print(pValuesListRounded)

            # regression equation
            intercept_t = float(intercept_t)
            coefficientList = list(regressor.coefficients)
            equation = label, '=', interceptRounded, '+'
            for feature, coeff in zip(feature_colm, coefficientListRounded):
                coeffFeature = coeff, '*', feature, '+'
                equation += coeffFeature
            equation = equation[:-1]
            print(equation)
            equationAsList = list(equation)

            # significance value

            PValuesList = training_summary.pValues
            significanceObject = {}

            for pValue in pValuesListRounded:
                if (0 <= pValue < 0.001):
                    significanceObject[pValue] = '***'
                if (0.001 <= pValue < 0.01):
                    significanceObject[pValue] = '**'
                if (0.01 <= pValue < 0.05):
                    significanceObject[pValue] = '*'
                if (0.05 <= pValue < 0.1):
                    significanceObject[pValue] = '.'
                if (0.1 <= pValue < 1):
                    significanceObject[pValue] = '-'
            print(significanceObject)


            # residual  vs predicted value

            prediction_data = regressor.summary.predictions
            prediction_data.show()
            prediction_data.select(['prediction']).show()
            predicted = prediction_data.select(['prediction'])
            regressor.summary.residuals.show()
            residuals = regressor.summary.residuals
            pred_d = predicted.withColumn('row_index', f.monotonically_increasing_id())
            res_d = residuals.withColumn('row_index', f.monotonically_increasing_id())

            pred_residuals = pred_d.join(res_d, on=['row_index']).sort('row_index').drop('row_index')
            pred_residuals.show()

            QQPlot = 'QQPlot.parquet'
            locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/'

            # userId = '6786103f-b49b-42f2-ba40-aa8168b65e67'

            QQPlotAddress = locationAddress + userId + QQPlot
            pred_residuals.write.parquet(QQPlotAddress, mode='overwrite')

            # pred_residuals.write.parquet('hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/Q_Q_PLOT.parquet',
            #                              mode='overwrite')


            #################################################################################
            # scale location plot
            from pyspark.sql.functions import abs as ab, sqrt, mean as meann, stddev as stdDev

            df_label = prediction_data.select(label, 'prediction',
                                              sqrt(ab(prediction_data[label])).alias("sqrt_label"))

            df_label.show()
            df_sqrt_label_index = df_label.withColumn('row_index', f.monotonically_increasing_id())
            df_sqrt_label_index.show()
            res_d.show()
            sqrt_label_residual_join = df_sqrt_label_index.join(res_d, on=['row_index']).sort('row_index').drop(
                'row_index')
            sqrt_label_residual_join.show()
            std_resid = sqrt_label_residual_join.select('sqrt_label', 'prediction', (
                    sqrt_label_residual_join['residuals'] / sqrt_label_residual_join['sqrt_label']).alias(
                'std_res'))
            std_resid.show()
            sqrt_std_res = std_resid.select("std_res", 'prediction',
                                            sqrt(ab(std_resid["std_res"])).alias("sqrt_std_resid"))
            sqrt_std_res.show()
            sqrt_std_res_fitted = sqrt_std_res.select('prediction', 'sqrt_std_resid')

            scaleLocationPlot = 'scaleLocation.parquet'

            scaleLocationPlotAddress = locationAddress + userId + scaleLocationPlot
            sqrt_std_res_fitted.write.parquet(scaleLocationPlotAddress, mode='overwrite')

            # sqrt_std_res_fitted.write.parquet(
            #     'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/scale_location_train.parquet',
            #     mode='overwrite')
            ###########
            #QQplot
            # QUANTILE

            from scipy.stats import norm
            import statistics
            import math

            res_d.show()
            sorted_res = res_d.sort('residuals')
            sorted_res.show()
            # stdev_ress = sorted_res.select(stdDev(col('residuals')).alias('std_dev'),
            #                                meann(col('residuals')).alias('mean'))
            # stdev_ress.show()
            # mean_residual = stdev_ress.select(['mean']).toPandas()
            # l = mean_residual.values.tolist()
            # print(l)
            # stddev_residual = stdev_ress.select(['std_dev']).toPandas()
            # length of the sorted std residuals
            count = sorted_res.groupBy().count().toPandas()
            countList = count.values.tolist()
            tuple1 = ()
            for k in countList:
                tuple1 = k
            for tu in tuple1:
                lengthResiduals = tu
            print(lengthResiduals)
            quantileList = []
            for x in range(0, lengthResiduals):
                quantileList.append((x - 0.5) / (lengthResiduals))

            print(quantileList)

            # Z-scores of the theoretical quantiles

            zTheoriticalTrain = []
            for x in quantileList:
                zTheoriticalTrain.append(norm.ppf(abs(x)))
            print(zTheoriticalTrain)

            sortedResidualPDF = sorted_res.select('residuals').toPandas()
            sortedResidualPDF = sortedResidualPDF['residuals']
            stdevResidualTrain = statistics.stdev(sortedResidualPDF)
            meanResidualTrain = statistics.mean(sortedResidualPDF)

            zPracticalTrain = []
            for x in sortedResidualPDF:
                zPracticalTrain.append((x - meanResidualTrain) / stdevResidualTrain)




            ##########
            target = dataset.select(label)
            pred = prediction_data.select(['prediction'])
            pred_d = pred.withColumn('row_index', f.monotonically_increasing_id())
            target_d = target.withColumn('row_index', f.monotonically_increasing_id())

            pred_target = pred_d.join(target_d, on=['row_index']).drop('row_index')
            pred_target.show()

            dataset.show()

            pred_target_data_update = dataset.join(pred_target, on=[label])

            pred_target_data_update.show(100)


            ##########
            # table_response = {
            #
            #     "Intercept": intercept_t,
            #     "Coefficients": coefficient_t,
            #     "RMSE": RMSE,
            #     "MSE": MSE,
            #     "R_square": r_square,
            #     "Adj_R_square": adjsted_r_square,
            #     "coefficientStdError": coefficientStdError,
            #     "T_value": T_values,
            #     "P_value": P_values
            #
            # }
            y = 0.1
            x = []

            for i in range(0, 90):
                x.append(y)
                y = round(y + 0.01, 2)
            quantile_label = lr_prediction_quantile.approxQuantile(label, x, 0.01)
            quantile_prediction = lr_prediction_quantile.approxQuantile("prediction", x, 0.01)
            Q_label_pred=''
            print(len(quantile_label))
            length = len(quantile_label)

            for i in range(0,len(quantile_label)):
                Q_label_pred += str(quantile_label[i]) + '\t' + str(quantile_prediction[i]) + '\n'
            import math

            fitted_residual = ''
            print(len(prediction_val_pand_residual))
            length = len(prediction_val_pand_residual)

            for i in range(0, len(prediction_val_pand_residual)):
                fitted_residual += str(prediction_val_pand_predict[i]) + '\t' + str(prediction_val_pand_residual[i]) + '\n'
            ## scale location graph data

            prediction_val_pand_residual
            prediction_val_pand_predict
            prediction_val_pand_residual_abs = prediction_val_pand_residual.abs()
            import math
            sqrt_residual = []
            for x in prediction_val_pand_residual_abs:
                sqrt_residual.append(math.sqrt(x))
                # print ("____________________  ",x)

            sqrt_residual
            # calculating std deviation
            import statistics

            print(statistics.stdev(prediction_val_pand_residual))
            stdev_ = statistics.stdev(prediction_val_pand_residual)

            # calculate standardized residuals
            std_res = []
            for x in prediction_val_pand_residual:
                std_res.append(x / stdev_)
            print(std_res)

            # calculating the square root of std_res
            import math
            sqr_std_res = []
            for x in std_res:
                sqr_std_res.append(math.sqrt(abs(x)))
            print(sqr_std_res)

            scale_predict_residual = ''
            for pre, res in zip(prediction_val_pand_predict, sqr_std_res):
                scale_predict_residual += str(pre) + '\t' + str(res) + '\n'
            print(scale_predict_residual)
            # QUANTILE

            y = 0.1
            x = []

            for i in range(0, 90):
                x.append(y)
                y = round(y + 0.01, 2)

            quantile_std_res = spark.createDataFrame(std_res, FloatType())
            quantile_std_res.show()
            quantile_std_res_t = quantile_std_res.approxQuantile('value', x, 0.01)
            print(quantile_std_res_t)
            print(x)
            # calculating the z_score
            from scipy.stats import norm

            ## sort the list
            sorted_std_res = sorted(std_res)

            mean = statistics.mean(sorted_std_res)
            stdev = statistics.stdev(sorted_std_res)
            # print(mean)
            quantile = []
            n = len(std_res)
            print(n)
            for x in range(0,n):
                quantile.append((x-0.5) / (n))

            print(quantile)
            # theoretical z-scores
            z_theory = []
            for x in quantile:
                z_theory.append(norm.ppf(abs(x)))
            # z-scores for the actual values
            z_pract = []
            for x in sorted_std_res:
                z_pract.append((x-mean)/stdev)
            Q_label_pred = ''
            for quant,val in zip(z_theory,z_pract):
                Q_label_pred += str(quant) + '\t' + str(val) + '\n'
            graph_response = {
                "Q_Q_plot": Q_label_pred,
                "residual_fitted": fitted_residual,
                "scale_location": scale_predict_residual
            }

            tableContent = \
                {
                    'coefficientValuesKey': coefficientListRounded,
                    'tValuesKey': tValuesListRounded,
                    'pValuesKey': pValuesListRounded,
                    'significanceValuesKey': significanceObject,
                    'interceptValuesKey': interceptRounded,
                    "RMSE": RMSERounded,
                    "RSquare": rSquareRounded,
                    "AdjRSquare": adjustedrSquareRounded,
                    "CoefficientStdError": coefficientStdErrorRounded,
                    'equationKey': equation
                }

            json_response = {

                'table_data': tableContent,
                'graph_data' : graph_response


            }
            print(json_response)
            return (json_response)
        except Exception as e:
            print('exception is =' + str(e))
#VECTORIZE TRAIN DATA
energi_nuclear_train = ssc.textFileStream("train_nuclear.txt")
energi_nuclear_train_labeled = energi_nuclear_train.map(parse_train)
energi_nuclear_train_labeled_DF = SQLContext.createDataFrame(energi_nuclear_train_labeled["label", "features"])
print(energi_nuclear_train_labeled_DF)

#VECTORIZE TEST DATA
energi_nuclear_test = ssc.textFileStream("test_nuclear.txt")
energi_nuclear_test_labeled = energi_nuclear_test.map(parse_test)
energi_nuclear_test_labeled_DF = SQLContext.createDataFrame(energi_nuclear_test_labeled["label", "features"])
print(energi_nuclear_test_labeled_DF)

#Create Model
numFeatures = 3
lr = LinearRegression(maxIter=50)
lrModel = lr.fit(energi_nuclear_train_labeled_DF)

#See what the model does
print("Coefficients: "+str(lrModel.coefficients))
print("Intercept: "+str(lrModel.intercept))

#Predict on the test data
predictions = lrModel.transform(energi_nuclear_test_labeled_DF)
predictions.select("prediction","label", "features").show()

#Evaluate the predictions
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="r2")
evaluator.evaluate(predictions)
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
sample_test_data_path = 'test_input/linear_regression/sample_linear_regression_data.txt'
spark = SparkSession.builder.appName('lrex').getOrCreate()

all_data = spark.read.format('libsvm').load(sample_test_data_path)

# Split the data into training and test
training_data, test_data = all_data.randomSplit([0.7, 0.3])

# Initialize model
lr = LinearRegression(featuresCol='features',
                      labelCol='label',
                      predictionCol='prediction')

# Fit the model
lrModel = lr.fit(training_data)

test_results = lrModel.evaluate(test_data)

rms = test_results.rootMeanSquaredError
print(rms)
# Unlabelled data

unlabelled_data = test_data.select('features')

predictions = lrModel.transform(unlabelled_data)

print(predictions)
from pyspark.sql.functions import udf


# Linear regression model parameter values
num_iters = 500  # iterations
reg = 1e-1  # regParam
alpha = .2  # elasticNetParam
use_intercept = True  # intercept

# parsed_train_data_df = parsed_train_data_df.withColumn("Year", parsed_train_data_df["Year"].cast(DoubleType()))
parsed_train_data_df = parsed_train_data_df.rdd.map(lambda row: (Vectors.dense(row["Features"]), float(row['Year'])))
parsed_train_data_df = sqlContext.createDataFrame(parsed_train_data_df,["features","label"])
parsed_train_data_df
lin_reg = LinearRegression(maxIter = num_iters, regParam = reg, elasticNetParam = alpha, fitIntercept = use_intercept, labelCol = 'label', featuresCol = 'features')

first_model = lin_reg.fit(parsed_train_data_df)

%pyspark

coeffs_LR1 = first_model.coefficients
intercept_LR1 = first_model.intercept
print(coeffs_LR1, intercept_LR1)

%pyspark

parsed_val_data_df = parsed_val_data_df.rdd.map(lambda row: (Vectors.dense(row["Features"]), float(row['Year'])))
parsed_val_data_df = sqlContext.createDataFrame(parsed_val_data_df,["features","label"])

#parsed_val_data_df = parsed_val_data_df.withColumn("label", parsed_val_data_df["label"].cast(DoubleType()))
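# The next line evaluates with an `evaluator` that is never defined in this snippet; a minimal
# assumption-based definition (not from the original notebook) would be:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='label', metricName='rmse')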
val_pred_df = first_model.transform(parsed_val_data_df)
rmse_val_LR1 = evaluator.evaluate(val_pred_df)
Example #14
output.printSchema()

output.head(1)

final_data = output.select('features', 'Yearly Amount Spent')

final_data.show()

train, test = final_data.randomSplit([0.7,0.3])

train.describe().show()

test.describe().show()

## Create the model with the training data set
## and evaluate it with the test data set

lr = LinearRegression(labelCol = "Yearly Amount Spent")

linear_regression_model = lr.fit(train)

results = linear_regression_model.evaluate(test)

results.residuals.show() ## residuals: the differences between actual and predicted values

results.rootMeanSquaredError ## typical prediction error; our label values span roughly +/- 500, so an RMSE around 10 is low

results.r2  ## an r2 near 0.98 indicates a good model fit

Example #15
def main(input_path, output_attribute_index, scikit_output_path,
         spark_output_path):

    # Instantiate the Passive Aggressive Regressor model
    regressor = PassiveAggressiveRegressor()
    for file_path in hdfs.ls(input_path):
        # Load the file contents and build a string matrix from them
        content = hdfs.load(file_path)
        temp = content.split("\n")
        temp = list(map(lambda x: x.split(","), temp))
        temp = list(filter(lambda x: len(x) > 1, temp))
        raw_matrix = np.array(temp)
        # Load the numpy matrix and parse it into a matrix of real values
        # which is then used to train the model
        # raw_matrix = np.genfromtxt(file_path, delimiter=',', dtype='string')
        input_matrix = raw_matrix[1:, 3:-5].astype('float64')
        output_vector = raw_matrix[1:, -5 +
                                   output_attribute_index].astype('float64')
        # Train the model incrementally (partial fit)
        regressor.partial_fit(input_matrix, output_vector)
        # Print the path of the processed file to the console
        print(file_path)

    # Save the created model to the output path
    # that was passed in as an argument
    with hdfs.open(scikit_output_path, 'w') as opened_file:
        pickle.dump(regressor, opened_file)

    # Initialize the application configuration and execution context
    configuration = SparkConf().setAppName("BigDataProj3_Trainer")
    context = SparkContext(conf=configuration)
    context.setLogLevel("ERROR")
    # Initialize the session
    # (required in order to write the model)
    session = SparkSession(context)

    # Load the RDD data from the input path
    input_data = context.textFile(input_path)
    # Split each row into tokens
    input_data = input_data.map(lambda x: x.split(","))
    # Skip the header rows
    input_data = input_data.filter(lambda x: x[0] != "Timestamp")
    # Skip the first three columns (Timestamp, Latitude and Longitude)
    # and select the appropriate output column
    # (depending on the output_attribute_index variable)
    input_data = input_data.map(lambda x: list(map(lambda y: float(y), x[
        3:-5])) + [float(x[-5 + output_attribute_index])])

    # Build the corresponding DataFrame object
    # (VectorAssembler is used to create the columns
    # that allow the linear regression fit method to be used)
    input_cols = []
    for i in range(15):
        input_cols.append("_" + str(i + 1))
    assembler = VectorAssembler(inputCols=input_cols, outputCol='features')
    data_frame = assembler.transform(input_data.toDF())

    # Instantiate the LinearRegression object, train it,
    # and then save it to the given path
    regression = LinearRegression(featuresCol='features', labelCol='_16')
    model = regression.fit(data_frame)
    model.write().overwrite().save(spark_output_path)
from pyspark.ml.regression import LinearRegression

pp_df = spark.read.csv(
    "/Users/danemorgan/Documents/DataScience/CCPP/powerplant.csv",
    header="True",
    inferSchema=True)
pp_df
from pyspark.ml.feature import VectorAssembler

vectorAssembler = VectorAssembler(inputCols=["AT", "V", "AP", "RH"],
                                  outputCol="features")

vpp_df = vectorAssembler.transform(pp_df)

vpp_df.take(1)

LR = LinearRegression(featuresCol="features", labelCol="PE")
lr_model = LR.fit(vpp_df)

lr_model.coefficients
#should output: DenseVector([-1.9775, -0.2339, 0.0621, -0.1581])
lr_model.intercept
#should output: 454.6092744523414
lr_model.summary.rootMeanSquaredError
#should output: 4.557126016749488
lr_model.save("linearRegression1.model")
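A hedged round-trip check, not in the original snippet: the saved model can be reloaded with the matching model class.

from pyspark.ml.regression import LinearRegressionModel
reloaded = LinearRegressionModel.load("linearRegression1.model")
reloaded.coefficients
#should match the coefficients printed above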
Example #17
from pyspark.ml.linalg import Vectors
ad_df = ad.rdd.map(lambda x: [Vectors.dense(x[2:-2]), x[-1]]).toDF(
    ['features', 'crew'])
ad_df.show(10)

# In[4]:
# Build linear regression model

from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol='features', labelCol='crew')

# In[5]:
# Fit the model

lr_model = lr.fit(ad_df)

# In[6]:
# Prediction

pred = lr_model.transform(ad_df)
pred.show(10)

# In[7]:
# Module evaluation

from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='crew')
evaluator.evaluate(pred)
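RegressionEvaluator defaults to RMSE; a small hedged add-on, not in the original cell, requesting r2 instead:

evaluator_r2 = RegressionEvaluator(predictionCol='prediction', labelCol='crew', metricName='r2')
evaluator_r2.evaluate(pred)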

Example #18
trainingData = spark_sql_output.rdd.map(
    lambda x: (Vectors.dense(x[0:-1]), x[-1])).toDF(["features", "label"])
trainingData.show()
featureIndexer =\
VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(trainingData)

(trainingData, testData) = trainingData.randomSplit([0.7, 0.3])

#################### SPARK ML  ####################

# Define LinearRegression algorithm
lr = LinearRegression()

# Fit 2 models, using different regularization parameters
modelA = lr.fit(trainingData, {lr.regParam: 0.0})
modelB = lr.fit(trainingData, {lr.regParam: 100.0})

# Make predictions
predictionsA = modelA.transform(trainingData)
print('-' * 70)
print('MODEL A : ')
predictionsA.select("prediction", "label", "features").show(30)
print('-' * 70)

predictionsB = modelB.transform(trainingData)
print('-' * 70)
print('MODEL B : ')
predictionsB.select("prediction", "label", "features").show(30)
print('-' * 70)
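The split above also produced testData, which the original snippet never touches; a hedged follow-up comparing the two regularization settings on it:

from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="rmse")
print("MODEL A test RMSE:", evaluator.evaluate(modelA.transform(testData)))
print("MODEL B test RMSE:", evaluator.evaluate(modelB.transform(testData)))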
df = spark.read.load("/data/regression")


# COMMAND ----------

from pyspark.ml.regression import LinearRegression
lr = LinearRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
print(lr.explainParams())
lrModel = lr.fit(df)


# COMMAND ----------

summary = lrModel.summary
summary.residuals.show()
print(summary.totalIterations)
print(summary.objectiveHistory)
print(summary.rootMeanSquaredError)
print(summary.r2)


# COMMAND ----------

from pyspark.ml.regression import GeneralizedLinearRegression
glr = GeneralizedLinearRegression()\
  .setFamily("gaussian")\
  .setLink("identity")\
  .setMaxIter(10)\
  .setRegParam(0.3)\
  .setLinkPredictionCol("linkOut")
print(glr.explainParams())
Example #20
    d.pop('success_metric', None)
    values = [float(x) for x in d.values()] ##this block is unusable until we have our Hive Data
    return (pred, Vectors.dense(values))

# training set
trainParsed = sc.parallelize(map(parsePoint, train_dict))
# test set
testParsed = sc.parallelize(map(parsePoint, test_dict))


## create validation set

trainDf = sqlContext.createDataFrame(trainParsed, ["label", "features"])
testDf = sqlContext.createDataFrame(testParsed, ["label", "features"])
lm_model = LinearRegression(featuresCol="features", predictionCol="prediction", maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6)
lm_model_fit = lm_model.fit(trainDf)
lm_transform = lm_model_fit.transform(trainDf)
results = lm_transform.select(lm_transform['prediction'], lm_transform['label'])
MSE = results.rdd.map(lambda pl: (pl[0] - pl[1]) ** 2).reduce(lambda x, y: x + y) / results.count()
print("Linear Regression training Mean Squared Error = " + str(MSE))

lm_transform = lm_model_fit.transform(testDf)
results = lm_transform.select(lm_transform['prediction'], lm_transform['label'])
MSE = results.rdd.map(lambda pl: (pl[0] - pl[1]) ** 2).reduce(lambda x, y: x + y) / results.count()
print("Linear Regression testing Mean Squared Error = " + str(MSE))

res = results.collect()
predsAndLabels = sc.parallelize([i.asDict().values() for i in res])
metrics = RegressionMetrics(predsAndLabels)
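`metrics` is constructed above but never queried; a hedged continuation printing the standard mllib regression metrics:

print("RegressionMetrics RMSE = " + str(metrics.rootMeanSquaredError))
print("RegressionMetrics r2 = " + str(metrics.r2))
print("RegressionMetrics MAE = " + str(metrics.meanAbsoluteError))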

Example #21
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

data = spark.read.csv("mlinput.csv", inferSchema=True)
data.printSchema()
feature_columns = data.columns[1:]

# numpy needs to be installed if it isn't already.
# Command: pip install numpy
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
transformed_data = assembler.transform(data)

train, test = transformed_data.randomSplit([0.8, 0.2])

from pyspark.ml.regression import LinearRegression
linearregression = LinearRegression(featuresCol="features", labelCol="_c0")
model = linearregression.fit(train)

predictions = model.transform(test)

predictions.show()
# Prepare a (label, features) DataFrame as required by Spark ML
data = spark.sparkContext.parallelize(ratingsPerDayDict.items()) \
        .map(lambda x: (float(x[1]), Vectors.dense(float(x[0]))))
df = data.toDF(["label", "features"])

# Let's split our data into training data and testing data
trainTest = df.randomSplit([0.5, 0.5])
trainingDF = trainTest[0]
testDF = trainTest[1]

# Now create the linear regression model
lir = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Train the model using our training data
model = lir.fit(trainingDF)

# Generate predictions for test data using our linear regression model 
fullPredictions = model.transform(testDF).cache()

# Extract the predictions and the "known" correct labels.
predictions = fullPredictions.select("prediction").rdd.map(lambda x: x[0])
labels = fullPredictions.select("label").rdd.map(lambda x: x[0])

# Zip them together
predictionAndLabel = predictions.zip(labels).collect()

# Print out the predicted and actual values for each point
for prediction in predictionAndLabel:
    print(prediction)
training = spark.read.format('libsvm').load(
    '/FileStore/tables/sample_linear_regression_data.txt')

# COMMAND ----------

training.show()

# COMMAND ----------

lr = LinearRegression(featuresCol='features',
                      labelCol='label',
                      predictionCol='prediction')

# COMMAND ----------

lrModel = lr.fit(training)

# COMMAND ----------

lrModel.coefficients

# COMMAND ----------

training_summary = lrModel.summary

# COMMAND ----------

training_summary.rootMeanSquaredError

# COMMAND ----------
Example #24
# MAGIC 
# MAGIC **References**
# MAGIC * [MLlib LinearRegression user guide](http://spark.apache.org/docs/latest/ml-classification-regression.html#linear-regression)
# MAGIC * [PySpark LinearRegression API](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.regression.LinearRegression)

# COMMAND ----------

# Import LinearRegression class
from pyspark.ml.regression import LinearRegression
# Define LinearRegression algorithm
lr = LinearRegression()

# COMMAND ----------

# Fit 2 models, using different regularization parameters
modelA = lr.fit(dataset, {lr.regParam:0.0})
modelB = lr.fit(dataset, {lr.regParam:100.0})
print(">>>> ModelA intercept: %r, coefficient: %r" % (modelA.intercept, modelA.coefficients[0]))
print(">>>> ModelB intercept: %r, coefficient: %r" % (modelB.intercept, modelB.coefficients[0]))

# COMMAND ----------

# MAGIC %md ## Make predictions
# MAGIC 
# MAGIC Calling `transform()` on data adds a new column of predictions.

# COMMAND ----------

# Make predictions
predictionsA = modelA.transform(dataset)
display(predictionsA)
Example #25
import pyspark.sql.functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import SparseVector
spark = SparkSession.builder.appName("Regression").getOrCreate()
df = spark.read.format("csv").option("header", True)\
.option("inferSchema", True).option("delimiter", ",")\
.load("imports-85.data")
data = df.withColumnRenamed("wheel-base",
                            "label").select("label", "length", "width",
                                            "height")
data.show()
from pyspark.ml.regression import LinearRegression
assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features")
y = assembler.transform(data)
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
model = lr.fit(y)
# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(model.coefficients))
print("Intercept: %s" % str(model.intercept))
# Summarize the model over the training set and print out some metrics
trainingSummary = model.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)
from pyspark.sql.functions import col, when
logistic_df = df.withColumn(
    "label",
    when(col("num-of-doors") == "four",
         1).otherwise(0)).select("label", "length", "width", "height")
Example #26
'''see the vectorized feature'''
output.select("Independent Features").show()
output.columns
'''get the sorted column'''
finalized_data = output.select("Independent Features", "Close")
finalized_data.show()
'''Divide the data for Training and Testing'''

train_data, test_data = finalized_data.randomSplit([0.75, 0.25])
'''BUILDING MODEL'''
'''Use the linear regression algorithm for model fitting'''

from pyspark.ml.regression import LinearRegression
regressor = LinearRegression(featuresCol='Independent Features',
                             labelCol='Close')
regressor = regressor.fit(train_data)

lr = LinearRegression(featuresCol='Independent Features',
                      labelCol='Close',
                      maxIter=10,
                      regParam=0.3,
                      elasticNetParam=0.8)
lr_model = lr.fit(train_data)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))
'''TESTING'''
'''test the model and get its accuracy using root mean squared error'''

lr_predictions = lr_model.transform(test_data)
lr_predictions.select("Close", "Independent Features", "prediction").show(5)
'''EVALUATION'''
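A hedged completion of the EVALUATION step announced above, using only names already defined in this snippet (labelCol 'Close' and the lr_predictions DataFrame):

from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(labelCol='Close', predictionCol='prediction', metricName='rmse')
print("RMSE on test data: %g" % evaluator.evaluate(lr_predictions))
print("r2 on test data: %g" % evaluator.evaluate(lr_predictions, {evaluator.metricName: 'r2'}))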
train_df.show()
test_df.show()

# # Regression models on item price

# # Linear Regression model

# In[129]:

# create and train the model
lr = LinearRegression(featuresCol='features',
                      labelCol='price',
                      maxIter=100,
                      regParam=0.2,
                      elasticNetParam=0.2)
lr_model = lr.fit(train_df)

# now we can make some predictions and evaluate performance
lr_predictions = lr_model.transform(test_df)
test_prediction = lr_predictions.select("prediction", "price")

test_prediction.show()

evaluator = RegressionEvaluator(labelCol="price")

print("\nModelo de Regresión Lineal")
print("R Squared (R2) on test data = %g" %
      evaluator.evaluate(test_prediction, {evaluator.metricName: "r2"}))
print("Root Mean Squared Error (RMSE) on test data = %g" %
      evaluator.evaluate(test_prediction, {evaluator.metricName: "rmse"}))
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import RegressionMetrics

# Load training data
training = sqlContext.read.format('com.databricks.spark.csv').options(
    header='true',
    inferschema='true').load('file:///home/pkatta/Downloads/spark/CASP.csv')

vecAssembler = VectorAssembler(
    inputCols=["F1", "F2", "F3", "F5", "F6", "F7", "F8", "F9"],
    outputCol="features")
t = vecAssembler.transform(training)

(trainingData, testData) = t.randomSplit([0.7, 0.3])

lr = LinearRegression(maxIter=10,
                      regParam=0.3,
                      elasticNetParam=0.8,
                      featuresCol="features",
                      labelCol="RMSD",
                      predictionCol="prediction")

lrModel = lr.fit(t)

# Print the coefficients and intercept for linear regression
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

#valuator = RegressionEvaluator(predictionCol='prediction', labelCol='label')
# Load the JSON strings as a Spark Dataframe.
natality_data = spark.read.json(table_json)
# Create a view so that Spark SQL queries can be run against the data.
natality_data.createOrReplaceTempView("natality")


# As a precaution, run a query in Spark SQL to ensure no NULL values exist.
sql_query = """
SELECT *
from natality
where weight_pounds is not null
and mother_age is not null
and father_age is not null
and gestation_weeks is not null
"""
clean_data = spark.sql(sql_query)

# Create an input DataFrame for Spark ML using the above function.
training_data = clean_data.rdd.map(vector_from_inputs).toDF(["label",
                                                             "features"])
training_data.cache()

# Construct a new LinearRegression object and fit the training data.
lr = LinearRegression(maxIter=5, regParam=0.2, solver="normal")
model = lr.fit(training_data)
# Print the model summary.
print "Coefficients:" + str(model.coefficients)
print "Intercept:" + str(model.intercept)
print "R^2:" + str(model.summary.r2)
model.summary.residuals.show()
# Top 10 correlated Crime Types= ['OTHERTRAFFICINFRACTION', 'VEHICLEANDTRAFFICLAWS', 'CRIMINALTRESPASS', 'DANGEROUSDRUGS', 'INTOXICATED&IMPAIREDDRIVING', 'OTHEROFFENSESRELATEDTOTHEFT', 'THEFT-FRAUD', 'GRANDLARCENY', 'OTHERSTATELAWS', 'PARKINGOFFENSES']
####### Linear regression to validate Hypothesis Testing between every crime type and unemployment rate #######

for i in range(len(total_columns)):
    columns = [total_columns[i]]
    mergedDf = mergedDfTotal.select(['UnemplymentRate'] + columns)
    ####### Validated results with and without normalising data per crime; RSS does better without normalising, so normalization via MinMaxScaler is skipped #######
    assembler = VectorAssembler(inputCols=['UnemplymentRate'],
                                outputCol="features")
    vgrouped_arrests_unemp = assembler.transform(mergedDf)
    lr = LinearRegression(featuresCol='features',
                          labelCol=columns[0],
                          maxIter=200,
                          regParam=0.3,
                          elasticNetParam=0.8)
    lr_model = lr.fit(vgrouped_arrests_unemp)
    print("Coefficients: " + str(lr_model.coefficients))
    print("Intercept: " + str(lr_model.intercept))
    trainingSummary = lr_model.summary
    print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
    print("r2: %f" % trainingSummary.r2)
    vgrouped_arrests_unemp.describe().show()
    lr_predictions = lr_model.transform(vgrouped_arrests_unemp)
    output = np.array(
        lr_predictions.select(["prediction"] + ['UnemplymentRate'] +
                              columns).collect())
    X, y, y_pred = output[:, 2].reshape(-1, 1), output[:, 1].reshape(
        -1, 1), output[:, 0].reshape(-1, 1)
    coefficient_significance = slope_significance_hyp_testing(
        X, y, y_pred,
        np.array(lr_model.coefficients).reshape(-1, 1), correlation[i])
                                            float(p[7]),
                                            float(p[8]),
                                            float(p[9]),
                                            float(p[10])
                                        ])))

# In[36]:

# Create the data frame containing the training data, with two columns: 1) the actual output or label, and 2) the vector containing the features
trainingDF = spark.createDataFrame(wineDataRDD, ['label', 'features'])
trainingDF.show()
# Create the Linear Regression estimator with its parameters;
# maxIter=10 makes lr.fit() use at most 10 iterations
lr = LinearRegression(maxIter=10)
# Create a trained model by fitting the parameters using the training data
model = lr.fit(trainingDF)

# In[37]:

# Once the model is trained, prepare test data containing labels and feature vectors to evaluate it
testDF = spark.createDataFrame(
    [(5.0,
      Vectors.dense(
          [7.4, 0.7, 0.0, 1.9, 0.076, 25.0, 67.0, 0.9968, 3.2, 0.68, 9.8])),
     (5.0,
      Vectors.dense(
          [7.8, 0.88, 0.0, 2.6, 0.098, 11.0, 34.0, 0.9978, 3.51, 0.56, 9.4])),
     (7.0,
      Vectors.dense(
          [7.3, 0.65, 0.0, 1.2, 0.065, 15.0, 18.0, 0.9968, 3.36, 0.57, 9.5]))],
    ["label", "features"])
Example #32
#Define Pipeline
pipeline = Pipeline(stages=[
    Neighborhood_indexer, YearBuilt_indexer, MoSold_indexer, YrSold_indexer,
    assembler, lr
])

# COMMAND ----------

ind_model = pipeline.fit(train)
train_final = ind_model.transform(test)
display(train_final)

# COMMAND ----------

# Fit the model
lrModel = lr.fit(train_final)

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

# COMMAND ----------
    df.drop('Date', axis=1, inplace=True)

    df.to_csv(os.path.join(dir_path, 'temp_' + key['Key']),
              index=False,
              sep=' ',
              header=False)

    data = spark.read.format("libsvm") \
        .load(os.path.join(dir_path,'temp_' + key['Key']))

    test_data = spark.read.format("libsvm").load("final.csv")

    lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

    # Fit the model
    lrModel = lr.fit(data)

    real_preds = lrModel.transform(test_data)

    real_preds = real_preds.select(real_preds['features'],
                                   real_preds['prediction'])

    clean = udf(clean_features)

    real_preds = real_preds.select(
        clean(real_preds['features']).alias('Date'),
        real_preds['prediction'].alias('Value'))

    real_preds.show()

    real_preds.write.option("header", "false").csv("temp")
Beispiel #34
0
# Transform to a DataFrame for input to Machine Learning
# Drop columns that are not required (low correlation)

usdLP = usdVectors.map(transformationLR.transformToLabeledPoint)
usdDF = sqlContext.createDataFrame(usdLP, ["label", "features"])
usdDF.select("label", "features").show(10)

#Split into training and testing data
(trainingData, testData) = usdDF.randomSplit([0.7, 0.3])
trainingData.count()
testData.count()

#Build the model on training data
lr = LinearRegression(maxIter=10)
lrModel = lr.fit(trainingData)
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

#Predict on the test data
predictions = lrModel.transform(testData)
predictions.select("prediction","label","features").show()

evaluator = RegressionEvaluator(predictionCol="prediction",
                                labelCol="label", metricName="r2")
evaluator.evaluate(predictions)
#Streaming data

from pyspark.streaming import StreamingContext
ssc = StreamingContext(sc, 1)
inputStream = ssc.textFileStream("../Forex DT/data/1440/streaming1440.csv")
print(spark_sql_output.take(10))

trainingData = spark_sql_output.rdd.map(lambda x: (Vectors.dense(x[0:-1]), x[-1])).toDF(["features", "label"])
trainingData.show()
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                               maxCategories=4).fit(trainingData)

(trainingData, testData) = trainingData.randomSplit([0.7, 0.3])

#################### SPARK ML  ####################

# Define LinearRegression algorithm
lr = LinearRegression()

# Fit 2 models, using different regularization parameters
modelA = lr.fit(trainingData, {lr.regParam:0.0})
modelB = lr.fit(trainingData, {lr.regParam:100.0})

# Make predictions
predictionsA = modelA.transform(trainingData)
print ('-'*70)
print ('MODEL A : ')
predictionsA.select("prediction", "label", "features").show(30)
print ('-'*70)

predictionsB = modelB.transform(trainingData)
print ('-'*70)
print ('MODEL B : ')
predictionsB.select("prediction", "label", "features").show(30)
print ('-'*70)
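# A possible follow-up (an assumption, not in the original script): compare the two
# regularization settings on the held-out split with a common metric.
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="rmse")
print("MODEL A RMSE: %f" % evaluator.evaluate(modelA.transform(testData)))
print("MODEL B RMSE: %f" % evaluator.evaluate(modelB.transform(testData)))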
Beispiel #36
0
  run.log("Model Name", model_name)
  run.log("Max Iterations", maxIters)
  run.log("Regularization Rate", regParam)
  run.log_list("Feature Columns", feature_cols)

  ###############
  # TRAIN MODEL #
  ###############

  print("  * Training {0} model".format(model_name))
  # Instantiate New LinearRegression Object
  lr = LinearRegression(featuresCol='features', labelCol='duration_minutes', maxIter=maxIters, regParam=regParam, solver="auto")

  # Train model on transformed training data
  lr_model = lr.fit(trainDF_transformed)

  lr_full_model = feature_model.copy()
  lr_full_model.stages.append(lr_model)

  print("  * Model trained, scoring validation data")
  # Run the full model (feature steps and trained model)
  validation_scored = lr_full_model.transform(validDF)

  #####################
  # MODEL PERFORMANCE #
  #####################

  print("  * Calculating performance metrics")
  # Calculate Regression Performance
  rmse = evaluator.evaluate(validation_scored, {evaluator.metricName: "rmse"})
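  # A likely next step (an assumption, not shown in the original): record the computed
  # metric on the run alongside the parameters logged above.
  run.log("RMSE", rmse)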
    def _train_model_spark(self, data):
        df = self._prepare_data_spark(data)
        input_num = len(data.keys().difference({self.CHANGE_AMOUNT, self.CHANGE_DIRECTION, self.TARGET_PRICE,
                                                self.TODAY_PRICE}))

        if self.ann_hidden_nodes_num is None:
            self.ann_hidden_nodes_num = input_num // 2 + 1
        ann_layers = [input_num,
                      # input_num / 3 * 2,
                      # input_num / 3,
                      self.ann_hidden_nodes_num,
                      2]

        self.logger.info('layer settings are {}'.format(ann_layers))
        self.logger.info('training method is {}'.format(self._train_method))
        self.logger.info('trees num is {}'.format(self.random_forest_tree_number))
        if isinstance(self._train_method, dict):
            if self._model is not None and self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
                self._model[self.CHANGE_AMOUNT].stop_server()
            self._model = {self.CHANGE_AMOUNT: None,
                           self.CHANGE_DIRECTION: None}

            if self._train_method[self.CHANGE_AMOUNT] == self.LINEAR_REGRESSION:
                lr = LinearRegression(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                      maxIter=self.linear_regression_training_times,
                                      regParam=self.linear_regression_regularization_parameter,
                                      predictionCol='AmountPrediction')
                self._model[self.CHANGE_AMOUNT] = lr.fit(df)
            elif self._train_method[self.CHANGE_AMOUNT] == self.RANDOM_FOREST:
                rfr = RandomForestRegressor(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                            numTrees=self.random_forest_tree_number,
                                            maxDepth=self.random_forest_tree_max_depth,
                                            predictionCol='AmountPrediction')
                self._model[self.CHANGE_AMOUNT] = rfr.fit(df)
            elif self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
                ann_layers[-1] = 1
                self._model[self.CHANGE_AMOUNT] = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
                                                                          num_workers=self.spark_worker_numbers,
                                                                          epoch=self.ann_epoch_number,
                                                                          featuresCol="features",
                                                                          labelCol=self.CHANGE_AMOUNT,
                                                                          predictionCol='AmountPrediction'
                                                                          )
                self._model[self.CHANGE_AMOUNT].fit(df)
            else:
                self.logger.warn('Unsupported training method {}'.format(self._train_method))
                raise ValueError('Unsupported training method {}'.format(self._train_method))

            if self._train_method[self.CHANGE_DIRECTION] == self.LOGISTIC_REGRESSION:
                lr = LogisticRegression(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                        maxIter=self.logistic_regression_training_times,
                                        regParam=self.linear_regression_regularization_parameter,
                                        predictionCol='DirPrediction')
                self._model[self.CHANGE_DIRECTION] = lr.fit(df)
            elif self._train_method[self.CHANGE_DIRECTION] == self.RANDOM_FOREST:
                rfc = RandomForestClassifier(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                             numTrees=self.random_forest_tree_number,
                                             maxDepth=self.random_forest_tree_max_depth,
                                             predictionCol='DirPrediction')
                self._model[self.CHANGE_DIRECTION] = rfc.fit(df)

            elif self._train_method[self.CHANGE_DIRECTION] == self.ARTIFICIAL_NEURAL_NETWORK:
                ann_layers[-1] = 2
                mlpc = MultilayerPerceptronClassifier(featuresCol="features",
                                                      labelCol=self.CHANGE_DIRECTION,
                                                      layers=ann_layers,
                                                      predictionCol='DirPrediction')
                self._model[self.CHANGE_DIRECTION] = mlpc.fit(df)

            else:
                self.logger.warn('Unsupported training method {}'.format(self._train_method))
                raise ValueError('Unsupported training method {}'.format(self._train_method))

        else:
            if self._train_method == self.LINEAR_REGRESSION:
                lr = LinearRegression(featuresCol="features", labelCol=self.TARGET_PRICE, predictionCol='prediction',
                                      regParam=self.linear_regression_regularization_parameter,
                                      maxIter=self.linear_regression_training_times)
                self._model = lr.fit(df)
            elif self._train_method == self.RANDOM_FOREST:
                rfr = RandomForestRegressor(featuresCol="features", labelCol=self.TARGET_PRICE,
                                            predictionCol='prediction',
                                            numTrees=self.random_forest_tree_number,
                                            maxDepth=self.random_forest_tree_max_depth)
                self._model = rfr.fit(df)

            elif self._train_method == self.ARTIFICIAL_NEURAL_NETWORK:
                ann_layers[-1] = 1
                if self._model is not None:
                    self._model.stop_server()
                self.logger.warn('layers are {}'.format(ann_layers))
                self._model = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
                                                      num_workers=self.spark_worker_numbers, epoch=100,
                                                      featuresCol="features", labelCol=self.TARGET_PRICE,
                                                      predictionCol='prediction'
                                                      )
                self._model.fit(df)

            else:
                self.logger.warn('Unsupported training method {}'.format(self._train_method))
                raise ValueError('Unsupported training method {}'.format(self._train_method))

        return self._model
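    # A minimal usage sketch (an assumption, not part of the original class): when
    # `_train_method` is a dict, the two fitted models each add their own prediction column.
    #   amount_scored = self._model[self.CHANGE_AMOUNT].transform(test_df)    # adds 'AmountPrediction'
    #   dir_scored = self._model[self.CHANGE_DIRECTION].transform(test_df)    # adds 'DirPrediction'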
Beispiel #38
0
# Assemble the feature columns into a single vector column in a DataFrame named `assembled`
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
assembled = assembler.transform(selected)

# Split the `assembled` DataFrame into training and test
# sets
(train, test) = assembled.randomSplit([0.8, 0.2], 12345)

# ## Specifying and training the model

# Instantiate the Spark ML linear regression estimator
lr = LinearRegression(featuresCol="features", labelCol="weight")

# Call the `fit` method to fit (train) the linear regression
# model
lr_model = lr.fit(train)

# ## Evaluating the trained model

# Generate predictions on the test set
test_with_predictions = lr_model.transform(test)

# Create an instance of `RegressionEvaluator` class
evaluator = RegressionEvaluator(predictionCol="prediction",
                                labelCol="weight",
                                metricName="r2")

# Compute the R-squared
evaluator.evaluate(test_with_predictions)

# ## Interpreting the model
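# The rest of the interpretation section is not shown here; a minimal sketch of what it
# might cover (an assumption, not the original notebook code): inspect the learned
# intercept and the coefficient for each input feature column.
print(lr_model.intercept)
for name, coef in zip(feature_columns, lr_model.coefficients):
    print(name, coef)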
from vectorizer import VectorizeData
from pyspark.ml.regression import LinearRegression
from pyspark.sql.types import IntegerType

if __name__ == "__main__":

    train, test = VectorizeData().get_train_test_data()

    reg = LinearRegression(featuresCol='features',
                           labelCol='G3',
                           maxIter=10,
                           regParam=0.3,
                           elasticNetParam=0.8)
    regModel = reg.fit(train)
    tSummary = regModel.summary

    print(tSummary.rootMeanSquaredError, tSummary.r2)
    '''
    Root Mean Squared Error = 1.8853871486836264
    r2 = 0.8266004476656317
    '''

    predictionDF = regModel.transform(test)
    predictionDF = predictionDF.withColumn(
        "prediction", predictionDF["prediction"].cast(IntegerType()))
    output_DF = predictionDF.select("prediction", "G3")
    output_DF.show()
    '''
    +----------+---+
    |prediction| G3|
    +----------+---+
Beispiel #40
0
assembler = VectorAssembler(inputCols=[
    'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density',
    'ship_indexer', 'cruise_indexer'
],
                            outputCol='features')
output = assembler.transform(result)
scaler = StandardScaler(inputCol="features",
                        outputCol="scaledFeatures",
                        withStd=True,
                        withMean=False)
output_scaled = scaler.fit(output).transform(output)

data = output_scaled.select('scaledFeatures', 'crew')
train_data, test_data = data.randomSplit([0.7, 0.3])

lr_model = LinearRegression(featuresCol="scaledFeatures", labelCol="crew")
model = lr_model.fit(train_data)

test_results = model.evaluate(test_data)
# test_results.residuals.show()
print(test_results.rootMeanSquaredError)
print(test_results.r2)

data.describe().show()

# check why model performs so well
from pyspark.sql.functions import corr

df.select(corr('crew', 'passengers')).show()
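# The same check could be extended to other features (an assumption, not in the
# original snippet), e.g.:
# df.select(corr('crew', 'cabins')).show()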
# COMMAND ----------

finalized_data = output.select('Features', 't2mTemp')
finalized_data.show()

# COMMAND ----------

# DBTITLE 1,Split data
train_data, test_data = finalized_data.randomSplit([0.8, 0.2])

# COMMAND ----------

# DBTITLE 1,Train data with LR
from pyspark.ml.regression import LinearRegression
regressor = LinearRegression(featuresCol='Features', labelCol='t2mTemp')
regressor = regressor.fit(train_data)

# COMMAND ----------

# DBTITLE 1,Regression Coefficients
regressor.coefficients

# COMMAND ----------

regressor.intercept

# COMMAND ----------

# DBTITLE 1,Evaluate model with test data
pred_results = regressor.evaluate(test_data)
pred_resultsTest = regressor.evaluate(finalized_dataTest)
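# A possible follow-up (an assumption, not in the original notebook): report the
# held-out metrics from the evaluation summary.
print("Test RMSE: %f" % pred_results.rootMeanSquaredError)
print("Test r2: %f" % pred_results.r2)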
#VECTORIZE TRAIN DATA
energi_habis_train = ssc.textFileStream("train_habis.txt")
energi_habis_train_labeled = energi_habis_train.map(parse_train)
# assumes a SQLContext instance named `sqlContext` is available in this script
energi_habis_train_labeled_DF = sqlContext.createDataFrame(energi_habis_train_labeled, ["label", "features"])
print(energi_habis_train_labeled_DF)

#VECTORIZE TEST DATA
energi_habis_test = ssc.textFileStream("test_habis.txt")
energi_habis_test_labeled = energi_habis_test.map(parse_test)
energi_habis_test_labeled_DF = sqlContext.createDataFrame(energi_habis_test_labeled, ["label", "features"])
print(energi_habis_test_labeled_DF)

#Create Model
numFeatures = 3
lr = LinearRegression(maxIter=50)
lrModel = lr.fit(energi_habis_train_labeled_DF)

# See what the model does
print("Coefficients: "+str(lrModel.coefficients))
print("Intercept: "+str(lrModel.intercept))

# Predict on the test data
predictions = lrModel.transform(energi_habis_test_labeled_DF)
predictions.select("prediction","label", "features").show()

#Evaluate the predictions
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="r2")
evaluator.evaluate(predictions)
Beispiel #43
0
#Find correlations
numFeatures = autoDF.take(1)[0].features.size
labelRDD = autoDF.map(lambda lp: float(lp.label))
for i in range(numFeatures):
    featureRDD = autoDF.map(lambda lp: lp.features[i])
    corr = Statistics.corr(labelRDD, featureRDD, 'pearson')
    print('%d\t%g' % (i, corr))

#Split into training and testing data
(trainingData, testData) = autoDF.randomSplit([0.9, 0.1])
trainingData.count()
testData.count()

#Build the model on training data
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(maxIter=10)
lrModel = lr.fit(trainingData)

print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

#Predict on the test data
predictions = lrModel.transform(testData)
predictions.select("prediction","label","features").show()

from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol="prediction",
                                labelCol="label", metricName="r2")
evaluator.evaluate(predictions)
from pyspark.mllib.linalg import Vectors
from pyspark.ml.regression import LinearRegression
from pyspark.mllib.regression import LabeledPoint

data = [LabeledPoint(0.0, Vectors.dense([0.0])),
        LabeledPoint(0.99, Vectors.dense([1.0])),
        LabeledPoint(2.0, Vectors.dense([2.0])),
        LabeledPoint(3.01, Vectors.dense([3.0]))]
training = sqlContext.createDataFrame(data)

lr = LinearRegression(maxIter=100, regParam=0.05, elasticNetParam=0.8)
lrModel = lr.fit(training)
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

# Load data and select feature and label columns
data = spark.read.format("csv").option("header", True).option(
    "inferSchema", True
).option("delimiter", ",").load(
    "/home/charan/workspaces/big_data_programming/bigdata_progamming_m2_icp/icp7/apps/datasets/imports-85.data"
)
data = data.withColumnRenamed("symboling",
                              "label").select("label", "length", "width",
                                              "height")

# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features")
data = assembler.transform(data)

lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
model = lr.fit(data)

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(model.coefficients))
print("Intercept: %s" % str(model.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = model.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)
# Load training data
#data = spark.read.format("libsvm")\
#    .load("sample_linear_regression_data.txt")
# or read it from a local disk (if working with a local Spark)

data = spark.read.format("libsvm")\
    .load("file:///home/hadoop/spark/data/mllib/sample_linear_regression_data.txt")

# split into training and test data
(train, test) = data.randomSplit([0.7, 0.3])

lr = LinearRegression(maxIter=100, regParam=0.3, elasticNetParam=0.8)


# Fit the model
lrModel = lr.fit(train)

print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)
# Residuals help show whether the LR systematically over- or under-predicts the data (bias)
trainingSummary.residuals.show()
# Root Mean Squared Error (RMSE) on the training data
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
# R-squared = Explained variation / Total variation (between 0-100%)
# R-squared cannot determine whether the coefficient estimates and 
# predictions are biased, which is why you must assess the residual plots.
print("r2: %f" % trainingSummary.r2)
#VECTORIZE TRAIN DATA
energi_terbarukan_train = sc.textFile("train_terbarukan.txt")
energi_terbarukan_train_labeled = energi_terbarukan_train.map(parse_train)
# assumes a SQLContext instance named `sqlContext` is available in this script
energi_terbarukan_train_labeled_DF = sqlContext.createDataFrame(energi_terbarukan_train_labeled, ["label", "features"])
print(energi_terbarukan_train_labeled_DF)

#VECTORIZE TEST DATA
energi_terbarukan_test = ssc.textFileStream("test_terbarukan.txt")
energi_terbarukan_test_labeled = energi_terbarukan_test.map(parse_test)
energi_terbarukan_test_labeled_DF = sqlContext.createDataFrame(energi_terbarukan_test_labeled, ["label", "features"])
print(energi_terbarukan_test_labeled_DF)

#Create Model
numFeatures = 3
lr = LinearRegression(maxIter=50)
lrModel = lr.fit(energi_terbarukan_train_labeled_DF)

# See what the model does
print("Coefficients: "+str(lrModel.coefficients))
print("Intercept: "+str(lrModel.intercept))

# Predict on the test data
predictions = lrModel.transform(energi_terbarukan_test_labeled_DF)
predictions.select("prediction","label", "features").show()

#Evaluate the predictions
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="r2")
evaluator.evaluate(predictions)