def test_parameter_accuracy(self):
    """Test that coefs are predicted accurately by fitting on toy data."""

    # Test that fitting (10*X1 + 10*X2), (X1, X2) gives coefficients
    # (10, 10)
    slr = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25)
    slr.setInitialWeights([0.0, 0.0])
    xMean = [0.0, 0.0]
    xVariance = [1.0 / 3.0, 1.0 / 3.0]

    # Create ten batches with 100 sample points in each.
    batches = []
    for i in range(10):
        batch = LinearDataGenerator.generateLinearInput(
            0.0, [10.0, 10.0], xMean, xVariance, 100, 42 + i, 0.1)
        batches.append(self.sc.parallelize(batch))

    input_stream = self.ssc.queueStream(batches)
    slr.trainOn(input_stream)
    self.ssc.start()

    def condition():
        self.assertArrayAlmostEqual(
            slr.latestModel().weights.array, [10., 10.], 1)
        self.assertAlmostEqual(slr.latestModel().intercept, 0.0, 1)
        return True

    self._eventually(condition, catch_assertions=True)
def test_prediction(self):
    """Test prediction on a model with weights already set."""
    # Create a model with initial weights equal to coefs
    slr = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25)
    slr.setInitialWeights([10.0, 10.0])

    # Create ten batches with 100 sample points in each.
    batches = []
    for i in range(10):
        batch = LinearDataGenerator.generateLinearInput(
            0.0, [10.0, 10.0], [0.0, 0.0], [1.0 / 3.0, 1.0 / 3.0],
            100, 42 + i, 0.1)
        batches.append(
            self.sc.parallelize(batch).map(lambda lp: (lp.label, lp.features)))

    input_stream = self.ssc.queueStream(batches)
    output_stream = slr.predictOnValues(input_stream)
    samples = []
    output_stream.foreachRDD(lambda x: samples.append(x.collect()))

    self.ssc.start()

    def condition():
        self.assertEqual(len(samples), len(batches))
        return True

    # We want all batches to finish for this test.
    self._eventually(condition, catch_assertions=True)

    # Test that mean absolute error on each batch is less than 0.1
    for batch in samples:
        true, predicted = zip(*batch)
        self.assertTrue(mean(abs(array(true) - array(predicted))) < 0.1)
def test_train_prediction(self):
    """Test that error on test data improves as model is trained."""
    slr = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25)
    slr.setInitialWeights([0.0])

    # Create ten batches with 100 sample points in each.
    batches = []
    for i in range(10):
        batch = LinearDataGenerator.generateLinearInput(
            0.0, [10.0], [0.0], [1.0 / 3.0], 100, 42 + i, 0.1)
        batches.append(self.sc.parallelize(batch))

    predict_batches = [
        b.map(lambda lp: (lp.label, lp.features)) for b in batches]
    errors = []

    def func(rdd):
        true, predicted = zip(*rdd.collect())
        errors.append(mean(abs(true) - abs(predicted)))

    input_stream = self.ssc.queueStream(batches)
    output_stream = self.ssc.queueStream(predict_batches)
    slr.trainOn(input_stream)
    output_stream = slr.predictOnValues(output_stream)
    output_stream.foreachRDD(func)

    self.ssc.start()

    def condition():
        if len(errors) == len(predict_batches):
            self.assertGreater(errors[1] - errors[-1], 2)
        if len(errors) >= 3 and errors[1] - errors[-1] > 2:
            return True
        return "Latest errors: " + ", ".join(map(lambda x: str(x), errors))

    self._eventually(condition)
def fit_and_predict(sparkSession, ts):
    import numpy as np
    from sklearn.model_selection import train_test_split
    from pyspark.streaming import StreamingContext
    from pyspark.mllib.regression import StreamingLinearRegressionWithSGD

    def to_scaled_rdd(pandasDataFrame):
        import pandas as pd
        from sklearn.preprocessing import RobustScaler
        from pyspark.mllib.regression import LabeledPoint

        regressors = pandasDataFrame.columns[1:]
        num_regressors = len(regressors)
        # FIX ME: As a bonus exercise, read the last paragraph from the section about
        # residual plots and make the necessary bug fix! Compare the behavior of this
        # version with the fixed one and see whether you can decipher anything from
        # the outputs.
        scaler = RobustScaler()
        scaled_regressors = scaler.fit_transform(pandasDataFrame[regressors])
        scaled_pandasDataFrame = pd.DataFrame(scaled_regressors, columns=regressors)
        scaled_pandasDataFrame['target'] = pandasDataFrame[
            pandasDataFrame.columns[0]].values

        sparkDataFrame = sparkSession.createDataFrame(scaled_pandasDataFrame)
        return sparkDataFrame.rdd.map(lambda row: LabeledPoint(
            row[num_regressors], row[:num_regressors]))

    def report_accuracy(result_rdd):
        from pyspark.mllib.evaluation import RegressionMetrics

        if not result_rdd.isEmpty():
            metrics = RegressionMetrics(
                result_rdd.map(lambda t: (float(t[1]), float(t[0]))))
            print("MSE = %s" % metrics.meanSquaredError)
            print("RMSE = %s" % metrics.rootMeanSquaredError)
            print("R-squared = %s" % metrics.r2)
            print("MAE = %s" % metrics.meanAbsoluteError)
            print("Explained variance = %s" % metrics.explainedVariance)

    df_train, df_test = train_test_split(ts, test_size=0.2, shuffle=False)
    train_rdd = to_scaled_rdd(df_train)
    test_rdd = to_scaled_rdd(df_test)

    streamContext = StreamingContext(sparkSession.sparkContext, 1)
    train_stream = streamContext.queueStream([train_rdd])
    test_stream = streamContext.queueStream([test_rdd])

    numFeatures = len(ts.columns) - 1
    model = StreamingLinearRegressionWithSGD(stepSize=0.05, numIterations=300)
    np.random.seed(0)
    model.setInitialWeights(np.random.rand(numFeatures))

    model.trainOn(train_stream)
    result_stream = model.predictOnValues(
        test_stream.map(lambda lp: (lp.label, lp.features)))
    result_stream.cache()
    result_stream.foreachRDD(report_accuracy)

    streamContext.start()
    streamContext.awaitTermination()
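# Minimal usage sketch for fit_and_predict above (not from the original source): the
# SparkSession, the synthetic data, and the column names here are illustrative
# assumptions. The only requirement implied by the helper is that the first column of
# `ts` is the target and the remaining columns are regressors.
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").appName("StreamingLRSketch").getOrCreate()

rng = np.random.RandomState(0)
x1, x2 = rng.rand(500), rng.rand(500)
ts = pd.DataFrame({"target": 3.0 * x1 - 2.0 * x2 + 0.05 * rng.randn(500),
                   "x1": x1, "x2": x2})

fit_and_predict(spark, ts)  # blocks in awaitTermination(); interrupt to stop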
def test_parameter_convergence(self):
    """Test that the model parameters improve with streaming data."""
    slr = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25)
    slr.setInitialWeights([0.0])

    # Create ten batches with 100 sample points in each.
    batches = []
    for i in range(10):
        batch = LinearDataGenerator.generateLinearInput(
            0.0, [10.0], [0.0], [1.0 / 3.0], 100, 42 + i, 0.1)
        batches.append(self.sc.parallelize(batch))

    model_weights = []
    input_stream = self.ssc.queueStream(batches)
    input_stream.foreachRDD(
        lambda x: model_weights.append(slr.latestModel().weights[0]))
    slr.trainOn(input_stream)
    self.ssc.start()

    def condition():
        self.assertEqual(len(model_weights), len(batches))
        return True

    # We want all batches to finish for this test.
    self._eventually(condition, catch_assertions=True)

    w = array(model_weights)
    diff = w[1:] - w[:-1]
    self.assertTrue(all(diff >= -0.1))
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint, StreamingLinearRegressionWithSGD


class DStreamService:

    def __init__(self, initial_weights):
        self.model = StreamingLinearRegressionWithSGD(stepSize=0.01,
                                                      miniBatchFraction=0.5)
        self.model.setInitialWeights(initial_weights)

    def train(self, ds):
        # Each record is a sequence whose first element is the label and the rest
        # are features; convert it into a LabeledPoint for trainOn().
        def transform(v):
            values = [v[x] for x in range(len(v))]
            label = values[0]
            features = values[1:]
            return LabeledPoint(label, Vectors.dense(features))

        ds_labeled = ds.map(lambda x: transform(x))
        self.model.trainOn(ds_labeled)

    def predict(self, ds):
        # predictOnValues() expects (key, features) pairs; keep the label as the
        # key so predictions can be compared against the true values.
        def transform(v):
            values = [v[x] for x in range(len(v))]
            label = values[0]
            features = values[1:]
            return label, Vectors.dense(features)

        ds_labeled = ds.map(lambda x: transform(x))
        self.model.predictOnValues(ds_labeled).pprint()
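# Minimal driver sketch for DStreamService above (not part of the original snippet):
# the local contexts, the queue stream, and the sample rows are illustrative
# assumptions; each row is (label, feature1, feature2).
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext("local[2]", "DStreamServiceSketch")
ssc = StreamingContext(sc, 1)

rows = [(1.0, 0.5, 0.5), (2.0, 1.0, 1.0), (3.0, 1.5, 1.5)]
stream = ssc.queueStream([sc.parallelize(rows), sc.parallelize(rows)])

service = DStreamService(initial_weights=[0.0, 0.0])
service.train(stream)
service.predict(stream)

ssc.start()
ssc.awaitTerminationOrTimeout(15)
ssc.stop()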
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.regression import StreamingLinearRegressionWithSGD

########################################### Spark Initialisation ###################################################
#####################################################################################################################
conf = SparkConf()
conf.setAppName("FPL")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
ssc = StreamingContext(sc, 100)
ssc.checkpoint("checkpoint_FPL")

#####################################################################################################################
########################################### Quadratic Regression ###################################################
#####################################################################################################################
numFeatures = 2
model = StreamingLinearRegressionWithSGD()
model.setInitialWeights([0.0, 0.0])

#####################################################################################################################
###################################### Reading players.csv, teams.csv ##############################################
#####################################################################################################################
# Reading players and teams csv files
players = sqlContext.read.load(
    "file:///C:\\Users\\navan\\Desktop\\Source_Code\\players.csv",
    format="csv", header="true", inferSchema="true")
playerBirthDate = players.select("Id", "birthDate").rdd.collectAsMap()
playerBirthDate = sc.broadcast(playerBirthDate)
# teams = sqlContext.read.load("file:///home/navaneeth/Desktop/Project/Source_Code/teams.csv",
#                              format="csv", header="true", inferSchema="true")
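# Illustrative sketch (not from the original script): with numFeatures = 2 the streaming
# model can fit a quadratic trend in a single scalar x by expanding each observation into
# the feature vector [x, x**2]; the helper name below is hypothetical.
from pyspark.mllib.regression import LabeledPoint


def to_quadratic_point(y, x):
    # The learned weights then correspond to the coefficients of x and x**2.
    return LabeledPoint(y, [x, x ** 2])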
import sys
import math
from datetime import datetime

import numpy as np

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.regression import LabeledPoint, StreamingLinearRegressionWithSGD

# initialize sc, then initialize ssc
sc = SparkContext(appName="StreamingModel1")
# just print ssc results and errors
sc.setLogLevel("ERROR")
ssc = StreamingContext(sc, batchDuration=5)  # one batch RDD per 5 seconds
stream = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))

# create models
len_term = 100
Model1 = StreamingLinearRegressionWithSGD(stepSize=0.01, numIterations=50)
Model1.setInitialWeights(np.array([0.0] * len_term))
Model2 = StreamingLinearRegressionWithSGD(stepSize=0.1, numIterations=50)
Model2.setInitialWeights(np.array([0.0] * len_term))
Model3 = StreamingLinearRegressionWithSGD(stepSize=1, numIterations=50)
Model3.setInitialWeights(np.array([0.0] * len_term))
# 0.01 is slow; 1 diverges; 0.1 is good

# create actions
labeledStream = stream.map(
    lambda x: tuple(x.split(','))
).map(
    lambda x: (float(x[0]), np.array([float(i) for i in x[1:]]))
).map(
    lambda x: LabeledPoint(label=x[0], features=x[1])
)
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint, StreamingLinearRegressionWithSGD

# Create a local StreamingContext with two working threads and a batch interval of 10 seconds
sc = SparkContext("local[2]", "Streaming online learning")
ssc = StreamingContext(sc, 10)

stream = ssc.socketTextStream("localhost", 9999)

numFeatures = 100
zeroVector = Vectors.zeros(numFeatures)
model = StreamingLinearRegressionWithSGD(stepSize=0.01, numIterations=1)
model.setInitialWeights(Vectors.dense([0] * numFeatures))


def parsePoint(line):
    # Each line is "label,f1,f2,...,f100"
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(label=values[0], features=Vectors.dense(values[1:]))


labeledStream = stream.map(lambda line: parsePoint(line))

model.trainOn(labeledStream)
model.predictOn(labeledStream.map(lambda lp: lp.features)).pprint()
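# Hypothetical data generator (not from the original snippets) for the socket-based
# examples in this collection: it listens on localhost:9999, waits for the Spark
# receiver to connect, and then pushes "label,f1,...,f100" lines matching parsePoint().
# All names and the data-generating process here are illustrative assumptions.
import random
import socket
import time


def serve(host="localhost", port=9999, n_features=100):
    srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    srv.bind((host, port))
    srv.listen(1)
    conn, _ = srv.accept()  # socketTextStream connects as a client
    weights = [random.uniform(-1.0, 1.0) for _ in range(n_features)]
    while True:
        x = [random.uniform(0.0, 1.0) for _ in range(n_features)]
        y = sum(w * xi for w, xi in zip(weights, x)) + random.gauss(0.0, 0.1)
        line = ",".join(str(v) for v in [y] + x) + "\n"
        conn.sendall(line.encode("utf-8"))
        time.sleep(0.05)


if __name__ == "__main__":
    serve()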
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import StreamingLinearRegressionWithSGD

# Create a local StreamingContext with two working threads
# and a batch interval of 1 second
sc = SparkContext("local[2]", "streaming_lr")
ssc = StreamingContext(sc, 1)


def parse(lp):
    label = float(lp[lp.find("(") + 1:lp.find(",")])
    vec = Vectors.dense(lp[lp.find("[") + 1:lp.find("]")].split(","))
    return LabeledPoint(label, vec)


trainingData = ssc.textFileStream("/training/data/dir").map(parse).cache()
testData = ssc.textFileStream("/testing/data/dir").map(parse)

numFeatures = 3
model = StreamingLinearRegressionWithSGD()
model.setInitialWeights([0.0, 0.0, 0.0])

model.trainOn(trainingData)
print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))))

ssc.start()
ssc.awaitTermination()
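# Illustrative only (not part of the example above): textFileStream() picks up files that
# appear in the watched directory after the streaming job starts, and parse() expects each
# line in the LabeledPoint string form "(label,[f1,f2,f3])". The file name below is a
# hypothetical placeholder; any writable directory works in place of /training/data/dir.
import os

train_dir = "/training/data/dir"
os.makedirs(train_dir, exist_ok=True)
with open(os.path.join(train_dir, "batch-000.txt"), "w") as f:
    f.write("(2.5,[1.0,0.0,3.0])\n")
    f.write("(1.0,[0.5,1.5,0.0])\n")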
import math
from operator import add

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import (LabeledPoint, LinearRegressionWithSGD,
                                      LinearRegressionModel, StreamingLinearRegressionWithSGD)

# Create a local StreamingContext with two working threads and a batch interval of 10 seconds
sc = SparkContext("local[2]", "Streaming online learning performance comparison")
ssc = StreamingContext(sc, 10)

stream = ssc.socketTextStream("localhost", 9999)

numFeatures = 100
zeroVector = Vectors.zeros(numFeatures)
model1 = StreamingLinearRegressionWithSGD(stepSize=0.01, numIterations=1)
model1.setInitialWeights(Vectors.dense([0] * numFeatures))
model2 = StreamingLinearRegressionWithSGD(stepSize=1, numIterations=1)
model2.setInitialWeights(Vectors.dense([0] * numFeatures))


def parsePoint(line):
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(label=values[0], features=Vectors.dense(values[1:]))


labeledStream = stream.map(lambda line: parsePoint(line))

model1.trainOn(labeledStream)
model2.trainOn(labeledStream)
import sys

import numpy as np

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import StreamingLinearRegressionWithSGD

# initialize sc, then initialize ssc
sc = SparkContext(appName="StreamingModel")
# just print ssc results and errors
sc.setLogLevel("ERROR")
ssc = StreamingContext(sc, batchDuration=5)  # one batch RDD per 5 seconds
stream = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))

# create model
len_term = 100
Model = StreamingLinearRegressionWithSGD(stepSize=0.1, numIterations=50)
Model.setInitialWeights(np.array([0.0] * len_term))

# create actions
labeledStream = stream.map(lambda x: tuple(x.split(','))).map(
    lambda x: (float(x[0]), np.array([float(i) for i in x[1:]]))).map(
    lambda x: LabeledPoint(label=x[0], features=x[1]))
labeledStream.pprint(1)
Model.trainOn(labeledStream)
Model.predictOn(labeledStream.map(lambda x: x.features)).pprint(1)

# start the streaming context
ssc.start()
# $example off$
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: streaming_linear_regression_example.py <trainingDir> <testDir>",
              file=sys.stderr)
        sys.exit(-1)

    sc = SparkContext(appName="PythonStreamingLinearRegressionExample")
    ssc = StreamingContext(sc, 1)

    # $example on$
    def parse(lp):
        label = float(lp[lp.find('(') + 1: lp.find(',')])
        vec = Vectors.dense(lp[lp.find('[') + 1: lp.find(']')].split(','))
        return LabeledPoint(label, vec)

    trainingData = ssc.textFileStream(sys.argv[1]).map(parse).cache()
    testData = ssc.textFileStream(sys.argv[2]).map(parse)

    numFeatures = 3
    model = StreamingLinearRegressionWithSGD()
    model.setInitialWeights([0.0, 0.0, 0.0])

    model.trainOn(trainingData)
    print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))))

    ssc.start()
    ssc.awaitTermination()
    # $example off$
import threading
import time

from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.mllib.regression import StreamingLinearRegressionWithSGD

sc = SparkContext("local[5]", "Tester")
sc.setLogLevel("OFF")
conf = SparkConf()
model = StreamingLinearRegressionWithSGD(stepSize=0.01)


class SparkThread(threading.Thread):
    global sc, model

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        """
        from OneTestTrainer import train
        ssc = StreamingContext(sc, 5)
        train(model=model, Context=sc, streamingContext=ssc)
        ssc.stop(stopSparkContext=False)
        """
        ssc = StreamingContext(sc, 5)
        from OneTestScenario import SparkApp
        SparkApp(model=model, Context=sc, streamingContext=ssc)
        print("\nSpark thread ended.\n")
import numpy as np

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import StreamingLinearRegressionWithSGD


def transform_test_row(row):
    # Only the tail of this helper survived in the original snippet; it maps an
    # unlabeled row to its feature vector for predictOn().
    return Vectors.dense(row["x"])


if __name__ == "__main__":
    # Create a local StreamingContext with a batch interval of 5 seconds
    sc = SparkContext("local[1]", "Streaming Linear Regression")
    sc.setLogLevel("FATAL")

    # Ignores the whole history and reads only the latest messages;
    # the 2nd argument is the batch duration
    ssc = StreamingContext(sc, 5)
    directKafkaStream = KafkaUtils.createDirectStream(
        ssc, ["trendy3-topic"], {"metadata.broker.list": "localhost:9092"})

    model = StreamingLinearRegressionWithSGD()
    model.setInitialWeights(np.random.rand(NUM_FEATURES))

    numStream = directKafkaStream.flatMap(extract_data_rows_from_json)
    trainingStream = numStream.filter(lambda row: row["known"]).map(transform_training_row_into_lp)
    testStream = numStream.filter(lambda row: not row["known"]).map(transform_test_row)

    model.trainOn(trainingStream)
    predictionStream = model.predictOn(testStream)
    predictionStream.pprint()

    ssc.start()
    ssc.awaitTermination()