def test_parameter_accuracy(self):
        """Test that coefs are predicted accurately by fitting on toy data."""

        # Test that fitting (10*X1 + 10*X2), (X1, X2) gives coefficients
        # (10, 10)
        slr = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25)
        slr.setInitialWeights([0.0, 0.0])
        xMean = [0.0, 0.0]
        xVariance = [1.0 / 3.0, 1.0 / 3.0]

        # Create ten batches with 100 sample points in each.
        batches = []
        for i in range(10):
            batch = LinearDataGenerator.generateLinearInput(
                0.0, [10.0, 10.0], xMean, xVariance, 100, 42 + i, 0.1)
            batches.append(self.sc.parallelize(batch))

        input_stream = self.ssc.queueStream(batches)
        slr.trainOn(input_stream)
        self.ssc.start()

        def condition():
            self.assertArrayAlmostEqual(
                slr.latestModel().weights.array, [10., 10.], 1)
            self.assertAlmostEqual(slr.latestModel().intercept, 0.0, 1)
            return True

        self._eventually(condition, catch_assertions=True)

    def test_prediction(self):
        """Test prediction on a model with weights already set."""
        # Create a model with initial weights equal to the true coefficients
        slr = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25)
        slr.setInitialWeights([10.0, 10.0])

        # Create ten batches with 100 sample points in each.
        batches = []
        for i in range(10):
            batch = LinearDataGenerator.generateLinearInput(
                0.0, [10.0, 10.0], [0.0, 0.0], [1.0 / 3.0, 1.0 / 3.0],
                100, 42 + i, 0.1)
            batches.append(
                self.sc.parallelize(batch).map(lambda lp: (lp.label, lp.features)))

        input_stream = self.ssc.queueStream(batches)
        output_stream = slr.predictOnValues(input_stream)
        samples = []
        output_stream.foreachRDD(lambda x: samples.append(x.collect()))

        self.ssc.start()

        def condition():
            self.assertEqual(len(samples), len(batches))
            return True

        # We want all batches to finish for this test.
        self._eventually(condition, catch_assertions=True)

        # Test that mean absolute error on each batch is less than 0.1
        for batch in samples:
            true, predicted = zip(*batch)
            self.assertTrue(mean(abs(array(true) - array(predicted))) < 0.1)
Example #4
    def test_train_prediction(self):
        """Test that error on test data improves as model is trained."""
        slr = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25)
        slr.setInitialWeights([0.0])

        # Create ten batches with 100 sample points in each.
        batches = []
        for i in range(10):
            batch = LinearDataGenerator.generateLinearInput(
                0.0, [10.0], [0.0], [1.0 / 3.0], 100, 42 + i, 0.1)
            batches.append(self.sc.parallelize(batch))

        predict_batches = [
            b.map(lambda lp: (lp.label, lp.features)) for b in batches]
        errors = []

        def func(rdd):
            true, predicted = zip(*rdd.collect())
            errors.append(mean(abs(array(true) - array(predicted))))

        input_stream = self.ssc.queueStream(batches)
        output_stream = self.ssc.queueStream(predict_batches)
        slr.trainOn(input_stream)
        output_stream = slr.predictOnValues(output_stream)
        output_stream.foreachRDD(func)
        self.ssc.start()

        def condition():
            if len(errors) == len(predict_batches):
                self.assertGreater(errors[1] - errors[-1], 2)
            if len(errors) >= 3 and errors[1] - errors[-1] > 2:
                return True
            return "Latest errors: " + ", ".join(map(lambda x: str(x), errors))

        self._eventually(condition)
Example #5
def fit_and_predict(sparkSession, ts):
    import numpy as np
    from sklearn.model_selection import train_test_split
    from pyspark.streaming import StreamingContext
    from pyspark.mllib.regression import StreamingLinearRegressionWithSGD

    def to_scaled_rdd(pandasDataFrame):
        import pandas as pd
        from sklearn.preprocessing import RobustScaler
        from pyspark.mllib.regression import LabeledPoint

        regressors = pandasDataFrame.columns[1:]
        num_regressors = len(regressors)
        # FIX ME: As a bonus exercise, read the last paragraph of the section on
        # residual plots and make the necessary bug fix (a sketch of one fix follows
        # this function). Compare the behavior of this version with the fixed one
        # and see whether you can decipher anything from the outputs.
        scaler = RobustScaler()
        scaled_regressors = scaler.fit_transform(pandasDataFrame[regressors])
        scaled_pandasDataFrame = pd.DataFrame(scaled_regressors,
                                              columns=regressors)
        scaled_pandasDataFrame['target'] = pandasDataFrame[
            pandasDataFrame.columns[0]].values

        sparkDataFrame = sparkSession.createDataFrame(scaled_pandasDataFrame)
        return sparkDataFrame.rdd.map(lambda row: LabeledPoint(
            row[num_regressors], row[:num_regressors]))

    def report_accuracy(result_rdd):
        from pyspark.mllib.evaluation import RegressionMetrics

        if not result_rdd.isEmpty():
            metrics = RegressionMetrics(
                result_rdd.map(lambda t: (float(t[1]), float(t[0]))))
            print("MSE = %s" % metrics.meanSquaredError)
            print("RMSE = %s" % metrics.rootMeanSquaredError)
            print("R-squared = %s" % metrics.r2)
            print("MAE = %s" % metrics.meanAbsoluteError)
            print("Explained variance = %s" % metrics.explainedVariance)

    df_train, df_test = train_test_split(ts, test_size=0.2, shuffle=False)
    train_rdd = to_scaled_rdd(df_train)
    test_rdd = to_scaled_rdd(df_test)

    streamContext = StreamingContext(sparkSession.sparkContext, 1)
    train_stream = streamContext.queueStream([train_rdd])
    test_stream = streamContext.queueStream([test_rdd])

    numFeatures = len(ts.columns) - 1
    model = StreamingLinearRegressionWithSGD(stepSize=0.05, numIterations=300)
    np.random.seed(0)
    model.setInitialWeights(np.random.rand(numFeatures))

    model.trainOn(train_stream)
    result_stream = model.predictOnValues(
        test_stream.map(lambda lp: (lp.label, lp.features)))
    result_stream.cache()
    result_stream.foreachRDD(report_accuracy)

    streamContext.start()
    streamContext.awaitTermination()
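The FIX ME in to_scaled_rdd points at a data-leakage bug: a fresh RobustScaler is fitted on whatever DataFrame comes in, so the test set is scaled with its own statistics rather than the training set's. A hedged sketch of one fix, fitting the scaler once on the training data and reusing it (make_scaled_rdd is an illustrative name, not from the original):

def make_scaled_rdd(sparkSession, pandasDataFrame, scaler=None):
    import pandas as pd
    from sklearn.preprocessing import RobustScaler
    from pyspark.mllib.regression import LabeledPoint

    regressors = pandasDataFrame.columns[1:]
    num_regressors = len(regressors)
    if scaler is None:
        # Fit only on the data we are given (intended to be the training set).
        scaler = RobustScaler().fit(pandasDataFrame[regressors])
    scaled = pd.DataFrame(scaler.transform(pandasDataFrame[regressors]),
                          columns=regressors)
    scaled['target'] = pandasDataFrame[pandasDataFrame.columns[0]].values
    rdd = sparkSession.createDataFrame(scaled).rdd.map(
        lambda row: LabeledPoint(row[num_regressors], row[:num_regressors]))
    return rdd, scaler

# train_rdd, scaler = make_scaled_rdd(sparkSession, df_train)
# test_rdd, _ = make_scaled_rdd(sparkSession, df_test, scaler=scaler)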
    def test_parameter_convergence(self):
        """Test that the model parameters improve with streaming data."""
        slr = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25)
        slr.setInitialWeights([0.0])

        # Create ten batches with 100 sample points in each.
        batches = []
        for i in range(10):
            batch = LinearDataGenerator.generateLinearInput(
                0.0, [10.0], [0.0], [1.0 / 3.0], 100, 42 + i, 0.1)
            batches.append(self.sc.parallelize(batch))

        model_weights = []
        input_stream = self.ssc.queueStream(batches)
        input_stream.foreachRDD(
            lambda x: model_weights.append(slr.latestModel().weights[0]))
        slr.trainOn(input_stream)
        self.ssc.start()

        def condition():
            self.assertEqual(len(model_weights), len(batches))
            return True

        # We want all batches to finish for this test.
        self._eventually(condition, 90, catch_assertions=True)

        w = array(model_weights)
        diff = w[1:] - w[:-1]
        self.assertTrue(all(diff >= -0.1))
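These tests poll through an eventually/self._eventually helper that none of the snippets define. A minimal sketch of such a helper, assuming a timeout in seconds and a condition that returns True on success (the real pyspark test utility may differ in detail):

import time

def eventually(condition, timeout=30.0, catch_assertions=False):
    # Poll `condition` until it returns True or the timeout elapses.
    start = time.time()
    last = None
    while time.time() - start < timeout:
        if catch_assertions:
            try:
                last = condition()
            except AssertionError as e:
                last = e
        else:
            last = condition()
        if last is True:
            return
        time.sleep(0.01)
    if isinstance(last, AssertionError):
        raise last
    raise AssertionError(
        "Condition not met within %s seconds; last value: %r" % (timeout, last))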
Example #10
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint, StreamingLinearRegressionWithSGD


class DStreamService:
    def __init__(self, initial_weights):
        self.model = StreamingLinearRegressionWithSGD(stepSize=0.01,
                                                      miniBatchFraction=0.5)
        self.model.setInitialWeights(initial_weights)

    def train(self, ds):
        def transform(v):
            # first element is the label, the rest are features
            values = list(v)
            return LabeledPoint(values[0], Vectors.dense(values[1:]))

        self.model.trainOn(ds.map(transform))

    def predict(self, ds):
        def transform(v):
            # keep the label with the features so predictOnValues can pair them
            values = list(v)
            return values[0], Vectors.dense(values[1:])

        self.model.predictOnValues(ds.map(transform)).pprint()
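A hypothetical usage sketch, assuming an existing SparkContext sc and StreamingContext ssc, with rows whose first element is the label (the sample data is illustrative):

rows = [[1.0, 0.1, 0.2], [2.0, 0.3, 0.4]]
service = DStreamService(initial_weights=[0.0, 0.0])
stream = ssc.queueStream([sc.parallelize(rows)])
service.train(stream)
service.predict(stream)  # prints (label, prediction) pairs per batch
ssc.start()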
Example #11
########################################### Spark Initialisation ###################################################
###################################################################################################################

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.regression import StreamingLinearRegressionWithSGD

conf = SparkConf()
conf.setAppName("FPL")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
ssc = StreamingContext(sc, 100)
ssc.checkpoint("checkpoint_FPL")

####################################################################################################################
########################################### Quadratic Regression ###################################################
####################################################################################################################

numFeatures = 2
model = StreamingLinearRegressionWithSGD()
model.setInitialWeights([0.0, 0.0])

####################################################################################################################
###################################### Reading players.csv, teams.csv ##############################################
####################################################################################################################

# Reading players and teams csv files
players = sqlContext.read.load(
    "file:///C:\\Users\\navan\\Desktop\\Source_Code\\players.csv",
    format="csv",
    header="true",
    inferSchema="true")
playerBirthDate = players.select("Id", "birthDate").rdd.collectAsMap()
playerBirthDate = sc.broadcast(playerBirthDate)
# teams = sqlContext.read.load("file:///home/navaneeth/Desktop/Project/Source_Code/teams.csv", format="csv", header="true", inferSchema="true")
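The banner advertises quadratic regression, but the model itself is linear in its two weights, so presumably each raw value x is expanded into the features [x, x**2] before training. A hypothetical sketch of that feature mapping (the names are illustrative; the original feature-building code is not shown):

from pyspark.mllib.regression import LabeledPoint

def to_quadratic_point(y, x):
    # A linear model over [x, x**2] fits y = w1*x + w2*x**2.
    return LabeledPoint(y, [x, x * x])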
Example #12
import sys
import math
from datetime import datetime

import numpy as np
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.regression import LabeledPoint, StreamingLinearRegressionWithSGD

# initialize sc, then initialize ssc
sc = SparkContext(appName="StreamingModel1")
# log only errors so the streaming output stays readable
sc.setLogLevel("ERROR")

ssc = StreamingContext(sc, batchDuration=5)  # one batch RDD every 5 seconds
stream = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))

# create three models with different step sizes
len_term = 100
Model1 = StreamingLinearRegressionWithSGD(stepSize=0.01, numIterations=50)
Model1.setInitialWeights(np.array([0.0] * len_term))
Model2 = StreamingLinearRegressionWithSGD(stepSize=0.1, numIterations=50)
Model2.setInitialWeights(np.array([0.0] * len_term))
Model3 = StreamingLinearRegressionWithSGD(stepSize=1, numIterations=50)
Model3.setInitialWeights(np.array([0.0] * len_term))
# 0.01 learns slowly; 1 diverges; 0.1 is a good middle ground

# create actions
labeledStream = stream.map(lambda x: tuple(x.split(','))).map(
    lambda x: (float(x[0]), np.array([float(i) for i in x[1:]]))).map(
        lambda x: LabeledPoint(label=x[0], features=x[1]))
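The snippet above ends before the models are wired to the stream. A minimal completion sketch, assuming the intent is to train all three models on the same stream and compare the step sizes side by side (the loop and pprint count are illustrative, not part of the original):

# Hedged completion: train each model on the shared stream and print a few
# predictions per batch so the three step sizes can be compared.
for m in (Model1, Model2, Model3):
    m.trainOn(labeledStream)
    m.predictOn(labeledStream.map(lambda lp: lp.features)).pprint(3)

ssc.start()
ssc.awaitTermination()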
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import StreamingLinearRegressionWithSGD

# Create a local StreamingContext with two working threads
# and a batch interval of 10 seconds
sc = SparkContext("local[2]", "Streaming online learning")
ssc = StreamingContext(sc, 10)

stream = ssc.socketTextStream("localhost", 9999)

numFeatures = 100

zeroVector = Vectors.zeros(numFeatures)

model = StreamingLinearRegressionWithSGD(stepSize=0.01, numIterations=1)
model.setInitialWeights(zeroVector)



def parsePoint(line):
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(label=values[0], features=Vectors.dense(values[1:]))

labeledStream = stream.map(parsePoint)

model.trainOn(labeledStream)

model.predictOn(labeledStream.map(lambda lp: lp.features)).pprint()

ssc.start()
ssc.awaitTermination()
Example #14
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import StreamingLinearRegressionWithSGD

# create a local StreamingContext with two working
# threads and a batch interval of 1 second
sc = SparkContext("local[2]", "streaming_lr")
ssc = StreamingContext(sc, 1)


def parse(lp):
    label = float(lp[lp.find("(") + 1:lp.find(",")])
    vec = Vectors.dense(lp[lp.find("[") + 1:lp.find("]")].split(","))
    return LabeledPoint(label, vec)


trainingData = ssc.textFileStream("/training/data/dir").map(parse).cache()
testData = ssc.textFileStream("/testing/data/dir").map(parse)

numFeatures = 3
model = StreamingLinearRegressionWithSGD()
model.setInitialWeights([0.0, 0.0, 0.0])

model.trainOn(trainingData)
model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))).pprint()

ssc.start()
ssc.awaitTermination()
import math
from operator import add

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint, StreamingLinearRegressionWithSGD


# Create a local StreamingContext with two working threads and a batch interval of 10 seconds
sc = SparkContext("local[2]", "Streaming online learning performance comparison")
ssc = StreamingContext(sc, 10)

stream = ssc.socketTextStream("localhost", 9999)

numFeatures = 100

zeroVector = Vectors.zeros(numFeatures)

model1 = StreamingLinearRegressionWithSGD(stepSize=0.01, numIterations=1)
model1.setInitialWeights(zeroVector)

model2 = StreamingLinearRegressionWithSGD(stepSize=1, numIterations=1)
model2.setInitialWeights(zeroVector)

def parsePoint(line):
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(label=values[0], features=Vectors.dense(values[1:]))

labeledStream = stream.map(parsePoint)

model1.trainOn(labeledStream)
model2.trainOn(labeledStream)
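Training alone prints nothing, so there is no comparison output yet. A hedged sketch of one way to finish the job, assuming per-batch mean squared error is the metric of interest (the helper name print_mse is hypothetical):

def print_mse(model, name):
    # Collect (label, prediction) pairs per batch and print this model's MSE.
    def handle(time, rdd):
        pairs = rdd.collect()
        if pairs:
            mse = sum((label - pred) ** 2 for label, pred in pairs) / len(pairs)
            print("%s MSE at %s: %.4f" % (name, time, mse))

    predictions = model.predictOnValues(
        labeledStream.map(lambda lp: (lp.label, lp.features)))
    predictions.foreachRDD(handle)

print_mse(model1, "model1 (stepSize=0.01)")
print_mse(model2, "model2 (stepSize=1)")

ssc.start()
ssc.awaitTermination()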

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import StreamingLinearRegressionWithSGD
import numpy as np

# initialize sc, then initialize ssc
sc = SparkContext(appName="StreamingModel")
# log only errors so the streaming output stays readable
sc.setLogLevel("ERROR")

ssc = StreamingContext(sc, batchDuration=5)  # one batch RDD every 5 seconds
stream = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))

# create model
len_term = 100
Model = StreamingLinearRegressionWithSGD(stepSize=0.1, numIterations=50)
Model.setInitialWeights(np.array([0.0] * len_term))

# create actions
labeledStream = stream.map(lambda x: tuple(x.split(','))).map(
    lambda x: (float(x[0]), np.array([float(i) for i in x[1:]]))).map(
        lambda x: LabeledPoint(label=x[0], features=x[1]))

labeledStream.pprint(1)
Model.trainOn(labeledStream)
Model.predictOn(labeledStream.map(lambda x: x.features)).pprint(1)

# start the streaming computation and block until it is terminated
ssc.start()
ssc.awaitTermination()
import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
# $example on$
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import StreamingLinearRegressionWithSGD
# $example off$

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: streaming_linear_regression_example.py <trainingDir> <testDir>",
              file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="PythonStreamingLinearRegressionExample")
    ssc = StreamingContext(sc, 1)

    # $example on$
    def parse(lp):
        label = float(lp[lp.find('(') + 1: lp.find(',')])
        vec = Vectors.dense(lp[lp.find('[') + 1: lp.find(']')].split(','))
        return LabeledPoint(label, vec)

    trainingData = ssc.textFileStream(sys.argv[1]).map(parse).cache()
    testData = ssc.textFileStream(sys.argv[2]).map(parse)

    numFeatures = 3
    model = StreamingLinearRegressionWithSGD()
    model.setInitialWeights([0.0, 0.0, 0.0])

    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))).pprint()

    ssc.start()
    ssc.awaitTermination()
    # $example off$
import threading, time
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.mllib.regression import StreamingLinearRegressionWithSGD

sc = SparkContext("local[5]", "Tester")
sc.setLogLevel("OFF")
conf = SparkConf()
model = StreamingLinearRegressionWithSGD(stepSize=0.01)


class SparkThread(threading.Thread):
    global sc, model

    def __init__(self):
        threading.Thread.__init__(self)
        pass

    def run(self):
        """
        from OneTestTrainer import train
        ssc = StreamingContext(sc, 5)
        train(model=model, Context=sc, streamingContext=ssc)
        ssc.stop(stopSparkContext=False)
        """
        ssc = StreamingContext(sc, 5)
        from OneTestScenario import SparkApp
        SparkApp(model=model, Context=sc, streamingContext=ssc)
        print "\nSpark thread ended.\n"

Example #19
# NOTE: this snippet opens mid-file; the dangling return presumably closes the
# test-row transformer referenced below (reconstructed here so the code parses).
def transform_test_row(row):
    return Vectors.dense(row["x"])


if __name__ == "__main__":
    # Create a local StreamingContext with one working thread
    # and a batch interval of 5 seconds
    sc = SparkContext("local[1]", "Streaming Linear Regression")
    sc.setLogLevel("FATAL")  # suppress everything below FATAL so only results show
    ssc = StreamingContext(sc, 5)

    directKafkaStream = KafkaUtils.createDirectStream(ssc,
                                                      ["trendy3-topic"],
                                                      {"metadata.broker.list": "localhost:9092"})


    model = StreamingLinearRegressionWithSGD()
    model.setInitialWeights(np.random.rand(NUM_FEATURES))

    numStream = directKafkaStream.flatMap(extract_data_rows_from_json)

    trainingStream = numStream.filter(lambda row: row["known"]).map(transform_training_row_into_lp)
    testStream = numStream.filter(lambda row: not row["known"]).map(transform_test_row)

    model.trainOn(trainingStream)
    predictionStream = model.predictOn(testStream)
    predictionStream.pprint()

    ssc.start()
    ssc.awaitTermination()