def test_prediction(self):
        """Test prediction on a model with weights already set."""
        # Create a model with initial weights equal to the true coefficients
        slr = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25)
        slr.setInitialWeights([10.0, 10.0])

        # Create ten batches with 100 sample points in each.
        batches = []
        for i in range(10):
            batch = LinearDataGenerator.generateLinearInput(
                0.0, [10.0, 10.0], [0.0, 0.0], [1.0 / 3.0, 1.0 / 3.0], 100,
                42 + i, 0.1)
            batches.append(
                self.sc.parallelize(batch).map(
                    lambda lp: (lp.label, lp.features)))

        input_stream = self.ssc.queueStream(batches)
        output_stream = slr.predictOnValues(input_stream)
        samples = []
        output_stream.foreachRDD(lambda x: samples.append(x.collect()))

        self.ssc.start()

        def condition():
            self.assertEqual(len(samples), len(batches))
            return True

        # We want all batches to finish for this test.
        eventually(condition, catch_assertions=True)

        # Test that mean absolute error on each batch is less than 0.1
        for batch in samples:
            true, predicted = zip(*batch)
            self.assertTrue(mean(abs(array(true) - array(predicted))) < 0.1)

    def test_parameter_accuracy(self):
        """Test that coefs are predicted accurately by fitting on toy data."""

        # Test that fitting (10*X1 + 10*X2), (X1, X2) gives coefficients
        # (10, 10)
        slr = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25)
        slr.setInitialWeights([0.0, 0.0])
        xMean = [0.0, 0.0]
        xVariance = [1.0 / 3.0, 1.0 / 3.0]

        # Create ten batches with 100 sample points in each.
        batches = []
        for i in range(10):
            batch = LinearDataGenerator.generateLinearInput(
                0.0, [10.0, 10.0], xMean, xVariance, 100, 42 + i, 0.1)
            batches.append(self.sc.parallelize(batch))

        input_stream = self.ssc.queueStream(batches)
        slr.trainOn(input_stream)
        self.ssc.start()

        def condition():
            self.assertArrayAlmostEqual(slr.latestModel().weights.array,
                                        [10., 10.], 1)
            self.assertAlmostEqual(slr.latestModel().intercept, 0.0, 1)
            return True

        eventually(condition, catch_assertions=True)

    def test_parameter_convergence(self):
        """Test that the model parameters improve with streaming data."""
        slr = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25)
        slr.setInitialWeights([0.0])

        # Create ten batches with 100 sample points in each.
        batches = []
        for i in range(10):
            batch = LinearDataGenerator.generateLinearInput(
                0.0, [10.0], [0.0], [1.0 / 3.0], 100, 42 + i, 0.1)
            batches.append(self.sc.parallelize(batch))

        model_weights = []
        input_stream = self.ssc.queueStream(batches)
        input_stream.foreachRDD(
            lambda x: model_weights.append(slr.latestModel().weights[0]))
        slr.trainOn(input_stream)
        self.ssc.start()

        def condition():
            self.assertEqual(len(model_weights), len(batches))
            return True

        # We want all batches to finish for this test.
        eventually(condition, 90, catch_assertions=True)

        w = array(model_weights)
        diff = w[1:] - w[:-1]
        self.assertTrue(all(diff >= -0.1))
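All three tests above poll through an eventually helper (shipped in recent Spark as pyspark.testing.utils.eventually). A minimal sketch of the behavior they assume, with the (condition, timeout, catch_assertions) signature used above:

import time

def eventually(condition, timeout=30.0, catch_assertions=False):
    # Minimal sketch: poll condition() until it returns True or the timeout
    # expires. With catch_assertions=True, assertion failures are swallowed
    # while polling and the last one is re-raised on timeout.
    start = time.time()
    last = None
    while time.time() - start < timeout:
        if catch_assertions:
            try:
                last = condition()
            except AssertionError as e:
                last = e
        else:
            last = condition()
        if last is True:
            return
        time.sleep(0.01)
    if isinstance(last, AssertionError):
        raise last
    raise AssertionError(
        "Test failed due to timeout after %g sec; last value: %r" % (timeout, last))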
Example #4
    def test_train_prediction(self):
        """Test that error on test data improves as model is trained."""
        slr = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25)
        slr.setInitialWeights([0.0])

        # Create ten batches with 100 sample points in each.
        batches = []
        for i in range(10):
            batch = LinearDataGenerator.generateLinearInput(
                0.0, [10.0], [0.0], [1.0 / 3.0], 100, 42 + i, 0.1)
            batches.append(self.sc.parallelize(batch))

        predict_batches = [
            b.map(lambda lp: (lp.label, lp.features)) for b in batches]
        errors = []

        def func(rdd):
            true, predicted = zip(*rdd.collect())
            # With zero initial weights, |predicted| grows toward |true| as
            # training proceeds, so this gap shrinks batch over batch.
            errors.append(mean(abs(true) - abs(predicted)))

        input_stream = self.ssc.queueStream(batches)
        output_stream = self.ssc.queueStream(predict_batches)
        slr.trainOn(input_stream)
        output_stream = slr.predictOnValues(output_stream)
        output_stream.foreachRDD(func)
        self.ssc.start()

        def condition():
            if len(errors) == len(predict_batches):
                self.assertGreater(errors[1] - errors[-1], 2)
            if len(errors) >= 3 and errors[1] - errors[-1] > 2:
                return True
            return "Latest errors: " + ", ".join(map(lambda x: str(x), errors))

        self._eventually(condition)
Example #5
def fit_and_predict(sparkSession, ts):
    import numpy as np
    from sklearn.model_selection import train_test_split
    from pyspark.streaming import StreamingContext
    from pyspark.mllib.regression import StreamingLinearRegressionWithSGD

    def to_scaled_rdd(pandasDataFrame):
        import pandas as pd
        from sklearn.preprocessing import RobustScaler
        from pyspark.mllib.regression import LabeledPoint

        regressors = pandasDataFrame.columns[1:]
        num_regressors = len(regressors)
        # FIXME: As a bonus exercise, read the last paragraph of the section on
        # residual plots and make the necessary bug fix! Compare the behavior of
        # this version with the fixed one and see whether you can decipher
        # anything from the outputs.
        scaler = RobustScaler()
        scaled_regressors = scaler.fit_transform(pandasDataFrame[regressors])
        scaled_pandasDataFrame = pd.DataFrame(scaled_regressors,
                                              columns=regressors)
        scaled_pandasDataFrame['target'] = pandasDataFrame[
            pandasDataFrame.columns[0]].values

        sparkDataFrame = sparkSession.createDataFrame(scaled_pandasDataFrame)
        return sparkDataFrame.rdd.map(lambda row: LabeledPoint(
            row[num_regressors], row[:num_regressors]))

    def report_accuracy(result_rdd):
        from pyspark.mllib.evaluation import RegressionMetrics

        if not result_rdd.isEmpty():
            metrics = RegressionMetrics(
                result_rdd.map(lambda t: (float(t[1]), float(t[0]))))
            print("MSE = %s" % metrics.meanSquaredError)
            print("RMSE = %s" % metrics.rootMeanSquaredError)
            print("R-squared = %s" % metrics.r2)
            print("MAE = %s" % metrics.meanAbsoluteError)
            print("Explained variance = %s" % metrics.explainedVariance)

    df_train, df_test = train_test_split(ts, test_size=0.2, shuffle=False)
    train_rdd = to_scaled_rdd(df_train)
    test_rdd = to_scaled_rdd(df_test)

    streamContext = StreamingContext(sparkSession.sparkContext, 1)
    train_stream = streamContext.queueStream([train_rdd])
    test_stream = streamContext.queueStream([test_rdd])

    numFeatures = len(ts.columns) - 1
    model = StreamingLinearRegressionWithSGD(stepSize=0.05, numIterations=300)
    np.random.seed(0)
    model.setInitialWeights(np.random.rand(numFeatures))

    model.trainOn(train_stream)
    result_stream = model.predictOnValues(
        test_stream.map(lambda lp: (lp.label, lp.features)))
    result_stream.cache()
    result_stream.foreachRDD(report_accuracy)

    streamContext.start()
    streamContext.awaitTermination()
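fit_and_predict blocks in awaitTermination(), so it is typically launched from a short driver script. A hypothetical invocation sketch, assuming a local SparkSession and a synthetic pandas DataFrame whose first column is the target:

# Hypothetical driver for fit_and_predict (names and data invented).
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .master("local[2]")
         .appName("fitAndPredict")
         .getOrCreate())

rng = np.random.RandomState(0)
X = rng.rand(500, 3)
y = X.dot(np.array([10.0, -5.0, 2.0])) + rng.normal(scale=0.1, size=500)
ts = pd.DataFrame(np.column_stack([y, X]),
                  columns=["target", "x1", "x2", "x3"])

fit_and_predict(spark, ts)  # blocks until the streaming context terminates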
Example #6
import sys

import numpy as np
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.regression import LabeledPoint, StreamingLinearRegressionWithSGD

# initialize sc, then initialize ssc
sc = SparkContext(appName="StreamingModel1")
# only print errors from the streaming job
sc.setLogLevel("ERROR")

ssc = StreamingContext(sc, batchDuration=5)  # batch RDDs every 5 seconds
stream = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))
# create three models with different step sizes
len_term = 100
Model1 = StreamingLinearRegressionWithSGD(stepSize=0.01, numIterations=50)
Model1.setInitialWeights(np.array([0.0] * len_term))
Model2 = StreamingLinearRegressionWithSGD(stepSize=0.1, numIterations=50)
Model2.setInitialWeights(np.array([0.0] * len_term))
Model3 = StreamingLinearRegressionWithSGD(stepSize=1, numIterations=50)
Model3.setInitialWeights(np.array([0.0] * len_term))
# 0.01 converges too slowly; 1 diverges; 0.1 works well

# create actions
labeledStream = stream.map(
    lambda x: tuple(x.split(','))
).map(
    lambda x: (float(x[0]), np.array([float(i) for i in x[1:]]))
).map(
    lambda x: LabeledPoint(label=x[0], features=x[1])
)
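The snippet above stops before any model is trained. A hedged completion, mirroring the later socket-stream example, would train each model on the stream and start the context:

# Hedged completion (not in the original snippet): train each model on the
# stream, print predictions from the well-tuned model, and start the context.
labeledStream.pprint(1)
Model1.trainOn(labeledStream)
Model2.trainOn(labeledStream)
Model3.trainOn(labeledStream)
Model2.predictOn(labeledStream.map(lambda x: x.features)).pprint(1)

ssc.start()
ssc.awaitTermination()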
Example #7
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import StreamingLinearRegressionWithSGD

# create a local StreamingContext with two
# worker threads and a batch interval of 1 second
sc = SparkContext("local[2]", "streaming_lr")
ssc = StreamingContext(sc, 1)


def parse(lp):
    label = float(lp[lp.find("(") + 1:lp.find(",")])
    vec = Vectors.dense(lp[lp.find("[") + 1:lp.find("]")].split(","))
    return LabeledPoint(label, vec)


trainingData = ssc.textFileStream("/training/data/dir").map(parse).cache()
testData = ssc.textFileStream("/testing/data/dir").map(parse)

numFeatures = 3
model = StreamingLinearRegressionWithSGD()
model.setInitialWeights([0.0, 0.0, 0.0])

model.trainOn(trainingData)
model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))).pprint()

ssc.start()
ssc.awaitTermination()
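For reference, parse() expects each line in the monitored directories to be a stringified LabeledPoint; a hypothetical sample for this three-feature model:

# Hypothetical input line in the format parse() expects: (label,[x1,x2,x3])
sample = "(1.0,[0.5,0.3,0.1])"
lp = parse(sample)  # LabeledPoint(1.0, [0.5, 0.3, 0.1])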
Example #8

import sys

import numpy as np
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import StreamingLinearRegressionWithSGD

# initialize sc, then initialize ssc
sc = SparkContext(appName="StreamingModel")
# only print errors from the streaming job
sc.setLogLevel("ERROR")

ssc = StreamingContext(sc, batchDuration=5)  # batch RDDs every 5 seconds
stream = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))

# create model
len_term = 100
Model = StreamingLinearRegressionWithSGD(stepSize=0.1, numIterations=50)
Model.setInitialWeights(np.array([0.0] * len_term))

# create actions
labeledStream = stream.map(lambda x: tuple(x.split(','))).map(
    lambda x: (float(x[0]), np.array([float(i) for i in x[1:]]))).map(
        lambda x: LabeledPoint(label=x[0], features=x[1]))

labeledStream.pprint(1)
Model.trainOn(labeledStream)
Model.predictOn(labeledStream.map(lambda x: x.features)).pprint(1)

# start the streaming context and keep it running
ssc.start()
ssc.awaitTermination()
Example #9

import threading
import time

from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.mllib.regression import StreamingLinearRegressionWithSGD

sc = SparkContext("local[5]", "Tester")
sc.setLogLevel("OFF")
conf = SparkConf()
model = StreamingLinearRegressionWithSGD(stepSize=0.01)


class SparkThread(threading.Thread):
    global sc, model

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        """
        from OneTestTrainer import train
        ssc = StreamingContext(sc, 5)
        train(model=model, Context=sc, streamingContext=ssc)
        ssc.stop(stopSparkContext=False)
        """
        ssc = StreamingContext(sc, 5)
        from OneTestScenario import SparkApp
        SparkApp(model=model, Context=sc, streamingContext=ssc)
        print "\nSpark thread ended.\n"

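The original snippet ends with the class definition. A hypothetical driver, assuming the thread is simply started and joined from the main program:

# Hypothetical driver (not in the original snippet): run the streaming job
# on its own thread, wait for it to finish, then stop Spark.
if __name__ == "__main__":
    worker = SparkThread()
    worker.start()
    worker.join()
    sc.stop()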
Example #10
    def __init__(self, initial_weights):
        self.model = StreamingLinearRegressionWithSGD(stepSize=0.01,
                                                      miniBatchFraction=0.5)
        self.model.setInitialWeights(initial_weights)
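Only the constructor of this wrapper survives in the listing. A hypothetical continuation, assuming it is meant to expose thin train/predict hooks around the streaming model (method names invented):

    # Hypothetical methods (invented names) delegating to the wrapped model.
    def train(self, labeled_stream):
        self.model.trainOn(labeled_stream)

    def predict(self, feature_stream):
        return self.model.predictOn(feature_stream)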