def test_accuracy_for_single_center(self): """Test that parameters obtained are correct for a single center.""" centers, batches = self.streamingKMeansDataGenerator(batches=5, numPoints=5, k=1, d=5, r=0.1, seed=0) stkm = StreamingKMeans(1) stkm.setInitialCenters([[0., 0., 0., 0., 0.]], [0.]) input_stream = self.ssc.queueStream( [self.sc.parallelize(batch, 1) for batch in batches]) stkm.trainOn(input_stream) self.ssc.start() def condition(): self.assertEqual(stkm.latestModel().clusterWeights, [25.0]) return True eventually(condition, catch_assertions=True) realCenters = array_sum(array(centers), axis=0) for i in range(5): modelCenters = stkm.latestModel().centers[0][i] self.assertAlmostEqual(centers[0][i], modelCenters, 1) self.assertAlmostEqual(realCenters[i], modelCenters, 1)
def test_parameter_convergence(self): """Test that the model parameters improve with streaming data.""" slr = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25) slr.setInitialWeights([0.0]) # Create ten batches with 100 sample points in each. batches = [] for i in range(10): batch = LinearDataGenerator.generateLinearInput( 0.0, [10.0], [0.0], [1.0 / 3.0], 100, 42 + i, 0.1) batches.append(self.sc.parallelize(batch)) model_weights = [] input_stream = self.ssc.queueStream(batches) input_stream.foreachRDD( lambda x: model_weights.append(slr.latestModel().weights[0])) slr.trainOn(input_stream) self.ssc.start() def condition(): self.assertEqual(len(model_weights), len(batches)) return True # We want all batches to finish for this test. eventually(condition, 90, catch_assertions=True) w = array(model_weights) diff = w[1:] - w[:-1] self.assertTrue(all(diff >= -0.1))
def test_prediction(self): """Test prediction on a model with weights already set.""" # Create a model with initial Weights equal to coefs slr = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25) slr.setInitialWeights([10.0, 10.0]) # Create ten batches with 100 sample points in each. batches = [] for i in range(10): batch = LinearDataGenerator.generateLinearInput( 0.0, [10.0, 10.0], [0.0, 0.0], [1.0 / 3.0, 1.0 / 3.0], 100, 42 + i, 0.1) batches.append( self.sc.parallelize(batch).map(lambda lp: (lp.label, lp.features))) input_stream = self.ssc.queueStream(batches) output_stream = slr.predictOnValues(input_stream) samples = [] output_stream.foreachRDD(lambda x: samples.append(x.collect())) self.ssc.start() def condition(): self.assertEqual(len(samples), len(batches)) return True # We want all batches to finish for this test. eventually(condition, catch_assertions=True) # Test that mean absolute error on each batch is less than 0.1 for batch in samples: true, predicted = zip(*batch) self.assertTrue(mean(abs(array(true) - array(predicted))) < 0.1)
def test_predictions(self): """Test predicted values on a toy model.""" input_batches = [] for i in range(20): batch = self.sc.parallelize( self.generateLogisticInput(0, 1.5, 100, 42 + i)) input_batches.append(batch.map(lambda x: (x.label, x.features))) input_stream = self.ssc.queueStream(input_batches) slr = StreamingLogisticRegressionWithSGD(stepSize=0.2, numIterations=25) slr.setInitialWeights([1.5]) predict_stream = slr.predictOnValues(input_stream) true_predicted = [] predict_stream.foreachRDD(lambda x: true_predicted.append(x.collect())) self.ssc.start() def condition(): self.assertEqual(len(true_predicted), len(input_batches)) return True eventually(condition, catch_assertions=True) # Test that the accuracy error is no more than 0.4 on each batch. for batch in true_predicted: true, predicted = zip(*batch) self.assertTrue( self.calculate_accuracy_error(true, predicted) < 0.4)
def test_parameter_accuracy(self): """Test that coefs are predicted accurately by fitting on toy data.""" # Test that fitting (10*X1 + 10*X2), (X1, X2) gives coefficients # (10, 10) slr = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25) slr.setInitialWeights([0.0, 0.0]) xMean = [0.0, 0.0] xVariance = [1.0 / 3.0, 1.0 / 3.0] # Create ten batches with 100 sample points in each. batches = [] for i in range(10): batch = LinearDataGenerator.generateLinearInput( 0.0, [10.0, 10.0], xMean, xVariance, 100, 42 + i, 0.1) batches.append(self.sc.parallelize(batch)) input_stream = self.ssc.queueStream(batches) slr.trainOn(input_stream) self.ssc.start() def condition(): self.assertArrayAlmostEqual(slr.latestModel().weights.array, [10., 10.], 1) self.assertAlmostEqual(slr.latestModel().intercept, 0.0, 1) return True eventually(condition, catch_assertions=True)
def test_trainOn_predictOn(self): """Test that prediction happens on the updated model.""" stkm = StreamingKMeans(decayFactor=0.0, k=2) stkm.setInitialCenters([[0.0], [1.0]], [1.0, 1.0]) # Since decay factor is set to zero, once the first batch # is passed the clusterCenters are updated to [-0.5, 0.7] # which causes 0.2 & 0.3 to be classified as 1, even though the # classification based in the initial model would have been 0 # proving that the model is updated. batches = [[[-0.5], [0.6], [0.8]], [[0.2], [-0.1], [0.3]]] batches = [self.sc.parallelize(batch) for batch in batches] input_stream = self.ssc.queueStream(batches) predict_results = [] def collect(rdd): rdd_collect = rdd.collect() if rdd_collect: predict_results.append(rdd_collect) stkm.trainOn(input_stream) predict_stream = stkm.predictOn(input_stream) predict_stream.foreachRDD(collect) self.ssc.start() def condition(): self.assertEqual(predict_results, [[0, 1, 1], [1, 0, 1]]) return True eventually(condition, catch_assertions=True)
def test_convergence(self): """ Test that weights converge to the required value on toy data. """ input_batches = [ self.sc.parallelize(self.generateLogisticInput( 0, 1.5, 100, 42 + i)) for i in range(20) ] input_stream = self.ssc.queueStream(input_batches) models = [] slr = StreamingLogisticRegressionWithSGD(stepSize=0.2, numIterations=25) slr.setInitialWeights([0.0]) slr.trainOn(input_stream) input_stream.foreachRDD( lambda x: models.append(slr.latestModel().weights[0])) self.ssc.start() def condition(): self.assertEqual(len(models), len(input_batches)) return True # We want all batches to finish for this test. eventually(condition, 60.0, catch_assertions=True) t_models = array(models) diff = t_models[1:] - t_models[:-1] # Test that weights improve with a small tolerance self.assertTrue(all(diff >= -0.1)) self.assertTrue(array_sum(diff > 0) > 1)
def test_trainOn_model(self): """Test the model on toy data with four clusters.""" stkm = StreamingKMeans() initCenters = [[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]] stkm.setInitialCenters(centers=initCenters, weights=[1.0, 1.0, 1.0, 1.0]) # Create a toy dataset by setting a tiny offset for each point. offsets = [[0, 0.1], [0, -0.1], [0.1, 0], [-0.1, 0]] batches = [] for offset in offsets: batches.append([[offset[0] + center[0], offset[1] + center[1]] for center in initCenters]) batches = [self.sc.parallelize(batch, 1) for batch in batches] input_stream = self.ssc.queueStream(batches) stkm.trainOn(input_stream) self.ssc.start() # Give enough time to train the model. def condition(): finalModel = stkm.latestModel() self.assertTrue(all(finalModel.centers == array(initCenters))) self.assertEqual(finalModel.clusterWeights, [5.0, 5.0, 5.0, 5.0]) return True eventually(condition, catch_assertions=True)
def test_predictOn_model(self): """Test that the model predicts correctly on toy data.""" stkm = StreamingKMeans() stkm._model = StreamingKMeansModel(clusterCenters=[[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]], clusterWeights=[1.0, 1.0, 1.0, 1.0]) predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]], [[1.5, -1.5]]] predict_data = [ self.sc.parallelize(batch, 1) for batch in predict_data ] predict_stream = self.ssc.queueStream(predict_data) predict_val = stkm.predictOn(predict_stream) result = [] def update(rdd): rdd_collect = rdd.collect() if rdd_collect: result.append(rdd_collect) predict_val.foreachRDD(update) self.ssc.start() def condition(): self.assertEqual(result, [[0], [1], [2], [3]]) return True eventually(condition, catch_assertions=True)
def test_train_prediction(self): """Test that error on test data improves as model is trained.""" slr = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25) slr.setInitialWeights([0.0]) # Create fifteen batches with 100 sample points in each. batches = [] for i in range(15): batch = LinearDataGenerator.generateLinearInput( 0.0, [10.0], [0.0], [1.0 / 3.0], 100, 42 + i, 0.1) batches.append(self.sc.parallelize(batch)) predict_batches = [ b.map(lambda lp: (lp.label, lp.features)) for b in batches] errors = [] def func(rdd): true, predicted = zip(*rdd.collect()) errors.append(mean(abs(true) - abs(predicted))) input_stream = self.ssc.queueStream(batches) output_stream = self.ssc.queueStream(predict_batches) slr.trainOn(input_stream) output_stream = slr.predictOnValues(output_stream) output_stream.foreachRDD(func) self.ssc.start() def condition(): if len(errors) == len(predict_batches): self.assertGreater(errors[1] - errors[-1], 2) if len(errors) >= 3 and errors[1] - errors[-1] > 2: return True return "Latest errors: " + ", ".join(map(lambda x: str(x), errors)) eventually(condition, timeout=180.0)
def test_training_and_prediction(self): """Test that the model improves on toy data with no. of batches""" input_batches = [ self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i)) for i in range(40)] predict_batches = [ b.map(lambda lp: (lp.label, lp.features)) for b in input_batches] slr = StreamingLogisticRegressionWithSGD( stepSize=0.01, numIterations=25) slr.setInitialWeights([-0.1]) errors = [] def collect_errors(rdd): true, predicted = zip(*rdd.collect()) errors.append(self.calculate_accuracy_error(true, predicted)) true_predicted = [] input_stream = self.ssc.queueStream(input_batches) predict_stream = self.ssc.queueStream(predict_batches) slr.trainOn(input_stream) ps = slr.predictOnValues(predict_stream) ps.foreachRDD(lambda x: collect_errors(x)) self.ssc.start() def condition(): # Test that the improvement in error is > 0.3 if len(errors) == len(predict_batches): self.assertGreater(errors[1] - errors[-1], 0.3) if len(errors) >= 3 and errors[1] - errors[-1] > 0.3: return True return "Latest errors: " + ", ".join(map(lambda x: str(x), errors)) eventually(condition, timeout=180.0)
def test_reuse_worker_of_parallelize_range(self): def check_reuse_worker_of_parallelize_range(): rdd = self.sc.parallelize(range(20), 8) previous_pids = rdd.map(lambda x: os.getpid()).collect() current_pids = rdd.map(lambda x: os.getpid()).collect() for pid in current_pids: self.assertTrue(pid in previous_pids) return True eventually(check_reuse_worker_of_parallelize_range, catch_assertions=True)
def test_java_object_gets_detached(self): df = self.spark.createDataFrame( [(1.0, 2.0, Vectors.dense(1.0)), (0.0, 2.0, Vectors.sparse(1, [], []))], ["label", "weight", "features"], ) lr = LinearRegression(maxIter=1, regParam=0.0, solver="normal", weightCol="weight", fitIntercept=False) model = lr.fit(df) summary = model.summary self.assertIsInstance(model, JavaWrapper) self.assertIsInstance(summary, JavaWrapper) self.assertIsInstance(model, JavaParams) self.assertNotIsInstance(summary, JavaParams) error_no_object = "Target Object ID does not exist for this gateway" self.assertIn("LinearRegression_", model._java_obj.toString()) self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString()) model.__del__() def condition(): with self.assertRaisesRegex(py4j.protocol.Py4JError, error_no_object): model._java_obj.toString() self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString()) return True eventually(condition, timeout=10, catch_assertions=True) try: summary.__del__() except: pass def condition(): with self.assertRaisesRegex(py4j.protocol.Py4JError, error_no_object): model._java_obj.toString() with self.assertRaisesRegex(py4j.protocol.Py4JError, error_no_object): summary._java_obj.toString() return True eventually(condition, timeout=10, catch_assertions=True)
def test_parameter_accuracy(self): """ Test that the final value of weights is close to the desired value. """ input_batches = [ self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i)) for i in range(20) ] input_stream = self.ssc.queueStream(input_batches) slr = StreamingLogisticRegressionWithSGD(stepSize=0.2, numIterations=25) slr.setInitialWeights([0.0]) slr.trainOn(input_stream) self.ssc.start() def condition(): rel = (1.5 - slr.latestModel().weights.array[0]) / 1.5 self.assertAlmostEqual(rel, 0.1, 1) return True eventually(condition, timeout=120.0, catch_assertions=True)
def test_task_context_correct_with_python_worker_reuse(self): # Retrying the check as the PIDs from Python workers might be different even # when reusing Python workers is enabled if a Python worker is dead for some reasons # (e.g., socket connection failure) and new Python worker is created. eventually( self.check_task_context_correct_with_python_worker_reuse, catch_assertions=True)