def test_predictions(self):
    """Test predicted values on a toy model."""
    input_batches = []
    for i in range(20):
        batch = self.sc.parallelize(
            self.generateLogisticInput(0, 1.5, 100, 42 + i))
        input_batches.append(batch.map(lambda x: (x.label, x.features)))
    input_stream = self.ssc.queueStream(input_batches)

    slr = StreamingLogisticRegressionWithSGD(stepSize=0.2, numIterations=25)
    slr.setInitialWeights([1.5])
    predict_stream = slr.predictOnValues(input_stream)
    true_predicted = []
    predict_stream.foreachRDD(lambda x: true_predicted.append(x.collect()))
    self.ssc.start()

    def condition():
        self.assertEqual(len(true_predicted), len(input_batches))
        return True

    eventually(condition, catch_assertions=True)

    # Test that the accuracy error is no more than 0.4 on each batch.
    for batch in true_predicted:
        true, predicted = zip(*batch)
        self.assertTrue(
            self.calculate_accuracy_error(true, predicted) < 0.4)
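These tests lean on two helpers defined on the test class but not shown in this listing. A minimal sketch of what they might look like, modeled on the helpers in Spark's own MLlib test suite (on the real class both are static methods; treat the exact bodies here as an assumption, not this project's code):

# Sketch only: modeled on Spark's MLlib test helpers, not taken verbatim
# from the suite above. On the test class both would be @staticmethods.
from numpy import array, dot, exp
from numpy.random import RandomState

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint


def generateLogisticInput(offset, scale, nPoints, seed):
    """Draw nPoints 1-D points; label 1 with probability sigmoid(x * scale + offset)."""
    rng = RandomState(seed)
    x = rng.randn(nPoints)
    sigmoid = 1. / (1 + exp(-(dot(x, scale) + offset)))
    y_p = rng.rand(nPoints)
    cut_off = y_p <= sigmoid
    y_p[cut_off] = 1.0   # below the sigmoid curve -> positive label
    y_p[~cut_off] = 0.0  # otherwise -> negative label
    return [LabeledPoint(y_p[i], Vectors.dense([x[i]]))
            for i in range(nPoints)]


def calculate_accuracy_error(true, predicted):
    """Mean absolute difference between true and predicted 0/1 labels."""
    return sum(abs(array(true) - array(predicted))) / len(true)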
def test_training_and_prediction(self):
    """Test that the model improves as the number of batches grows."""
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)]
    predict_batches = [
        b.map(lambda lp: (lp.label, lp.features)) for b in input_batches]

    slr = StreamingLogisticRegressionWithSGD(
        stepSize=0.01, numIterations=25)
    slr.setInitialWeights([-0.1])
    errors = []

    def collect_errors(rdd):
        true, predicted = zip(*rdd.collect())
        errors.append(self.calculate_accuracy_error(true, predicted))

    input_stream = self.ssc.queueStream(input_batches)
    predict_stream = self.ssc.queueStream(predict_batches)
    slr.trainOn(input_stream)
    ps = slr.predictOnValues(predict_stream)
    ps.foreachRDD(lambda x: collect_errors(x))

    self.ssc.start()

    def condition():
        # Test that the improvement in error is > 0.3
        if len(errors) == len(predict_batches):
            self.assertGreater(errors[1] - errors[-1], 0.3)
        if len(errors) >= 3 and errors[1] - errors[-1] > 0.3:
            return True
        return "Latest errors: " + ", ".join(map(lambda x: str(x), errors))

    self._eventually(condition, timeout=60.0)
def test_convergence(self):
    """
    Test that weights converge to the required value on toy data.
    """
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)]
    input_stream = self.ssc.queueStream(input_batches)
    models = []

    slr = StreamingLogisticRegressionWithSGD(stepSize=0.2, numIterations=25)
    slr.setInitialWeights([0.0])
    slr.trainOn(input_stream)
    input_stream.foreachRDD(
        lambda x: models.append(slr.latestModel().weights[0]))

    self.ssc.start()

    def condition():
        self.assertEqual(len(models), len(input_batches))
        return True

    # We want all batches to finish for this test.
    eventually(condition, 60.0, catch_assertions=True)

    t_models = array(models)
    diff = t_models[1:] - t_models[:-1]
    # Test that weights improve with a small tolerance
    self.assertTrue(all(diff >= -0.1))
    self.assertTrue(array_sum(diff > 0) > 1)
def test_convergence(self):
    """
    Test that weights converge to the required value on toy data.
    """
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)]
    input_stream = self.ssc.queueStream(input_batches)
    models = []

    slr = StreamingLogisticRegressionWithSGD(
        stepSize=0.2, numIterations=25)
    slr.setInitialWeights([0.0])
    slr.trainOn(input_stream)
    input_stream.foreachRDD(
        lambda x: models.append(slr.latestModel().weights[0]))

    self.ssc.start()

    def condition():
        self.assertEqual(len(models), len(input_batches))
        return True

    # We want all batches to finish for this test.
    self._eventually(condition, 60.0, catch_assertions=True)

    t_models = array(models)
    diff = t_models[1:] - t_models[:-1]
    # Test that weights improve with a small tolerance
    self.assertTrue(all(diff >= -0.1))
    self.assertTrue(array_sum(diff > 0) > 1)
def test_predictions(self):
    """Test predicted values on a toy model."""
    input_batches = []
    for i in range(20):
        batch = self.sc.parallelize(
            self.generateLogisticInput(0, 1.5, 100, 42 + i))
        input_batches.append(batch.map(lambda x: (x.label, x.features)))
    input_stream = self.ssc.queueStream(input_batches)

    slr = StreamingLogisticRegressionWithSGD(
        stepSize=0.2, numIterations=25)
    slr.setInitialWeights([1.5])
    predict_stream = slr.predictOnValues(input_stream)
    true_predicted = []
    predict_stream.foreachRDD(lambda x: true_predicted.append(x.collect()))
    self.ssc.start()

    def condition():
        self.assertEqual(len(true_predicted), len(input_batches))
        return True

    self._eventually(condition, catch_assertions=True)

    # Test that the accuracy error is no more than 0.4 on each batch.
    for batch in true_predicted:
        true, predicted = zip(*batch)
        self.assertTrue(
            self.calculate_accuracy_error(true, predicted) < 0.4)
def test_training_and_prediction(self):
    """Test that the model improves as the number of batches grows."""
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)]
    predict_batches = [
        b.map(lambda lp: (lp.label, lp.features)) for b in input_batches]

    slr = StreamingLogisticRegressionWithSGD(stepSize=0.01, numIterations=25)
    slr.setInitialWeights([-0.1])
    errors = []

    def collect_errors(rdd):
        true, predicted = zip(*rdd.collect())
        errors.append(self.calculate_accuracy_error(true, predicted))

    input_stream = self.ssc.queueStream(input_batches)
    predict_stream = self.ssc.queueStream(predict_batches)
    slr.trainOn(input_stream)
    ps = slr.predictOnValues(predict_stream)
    ps.foreachRDD(lambda x: collect_errors(x))

    t = time()
    self.ssc.start()
    self._ssc_wait(t, 20.0, 0.01)

    # Test that the improvement in error is at least 0.3
    self.assertTrue(errors[1] - errors[-1] > 0.3)
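This variant polls with `self._ssc_wait` rather than the condition-based `_eventually` used above. Its definition is not in this listing; a sketch consistent with call sites like `self._ssc_wait(t, 20.0, 0.01)`, assuming it simply sleeps until a deadline:

# Sketch of the assumed _ssc_wait helper; on the real test base class
# this would be a @staticmethod. Signature inferred from the calls above.
from time import sleep, time


def _ssc_wait(start_time, end_time, sleep_time):
    """Sleep in sleep_time increments until end_time seconds after start_time."""
    while time() - start_time < end_time:
        sleep(sleep_time)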
def test_training_and_prediction(self):
    """Test that the model improves as the number of batches grows."""
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(40)]
    predict_batches = [
        b.map(lambda lp: (lp.label, lp.features)) for b in input_batches]

    slr = StreamingLogisticRegressionWithSGD(
        stepSize=0.01, numIterations=25)
    slr.setInitialWeights([-0.1])
    errors = []

    def collect_errors(rdd):
        true, predicted = zip(*rdd.collect())
        errors.append(self.calculate_accuracy_error(true, predicted))

    input_stream = self.ssc.queueStream(input_batches)
    predict_stream = self.ssc.queueStream(predict_batches)
    slr.trainOn(input_stream)
    ps = slr.predictOnValues(predict_stream)
    ps.foreachRDD(lambda x: collect_errors(x))

    self.ssc.start()

    def condition():
        # Test that the improvement in error is > 0.3
        if len(errors) == len(predict_batches):
            self.assertGreater(errors[1] - errors[-1], 0.3)
        if len(errors) >= 3 and errors[1] - errors[-1] > 0.3:
            return True
        return "Latest errors: " + ", ".join(map(lambda x: str(x), errors))

    self._eventually(condition, timeout=60.0)
def test_convergence(self):
    """
    Test that weights converge to the required value on toy data.
    """
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)]
    input_stream = self.ssc.queueStream(input_batches)
    models = []

    slr = StreamingLogisticRegressionWithSGD(stepSize=0.2, numIterations=25)
    slr.setInitialWeights([0.0])
    slr.trainOn(input_stream)
    input_stream.foreachRDD(
        lambda x: models.append(slr.latestModel().weights[0]))

    t = time()
    self.ssc.start()
    self._ssc_wait(t, 15.0, 0.01)

    t_models = array(models)
    diff = t_models[1:] - t_models[:-1]
    # Test that weights improve with a small tolerance
    self.assertTrue(all(diff >= -0.1))
    self.assertTrue(array_sum(diff > 0) > 1)
def test_parameter_accuracy(self):
    """
    Test that the final value of weights is close to the desired value.
    """
    input_batches = [
        self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
        for i in range(20)]
    input_stream = self.ssc.queueStream(input_batches)

    slr = StreamingLogisticRegressionWithSGD(
        stepSize=0.2, numIterations=25)
    slr.setInitialWeights([0.0])
    slr.trainOn(input_stream)

    t = time()
    self.ssc.start()
    self._ssc_wait(t, 20.0, 0.01)
    rel = (1.5 - slr.latestModel().weights.array[0]) / 1.5
    self.assertAlmostEqual(rel, 0.1, 1)
if __name__ == '__main__':
    # creating a SparkContext object
    sc = SparkContext.getOrCreate()
    # setting the log level to avoid printing logs in the console
    sc.setLogLevel("WARN")
    # creating a Spark Streaming Context
    ssc = StreamingContext(sparkContext=sc, batchDuration=10)
    # setting up a model
    lr = StreamingLogisticRegressionWithSGD()
    # loading the pre-trained parameters
    parameters = json.load(open('model.json', 'r'))
    # assigning the pre-trained parameters to the logistic regression
    lr.setInitialWeights(parameters['weights'])
    # loading stop words
    stop_words = load_stopwords()
    # loading common words
    common_words = load_common_words()
    # creating the reference table
    reference_table = create_hash_table(common_words=common_words,
                                        stop_words=stop_words)
    # opening the stream
    kafkaStream = KafkaUtils.createDirectStream(
        ssc=ssc, topics=['trump'],
        kafkaParams={"metadata.broker.list": 'localhost:9092'})
    # getting only the useful information
    dfs = kafkaStream.map(lambda stream: stream[1].encode('utf-8'))
    # parsing data into a dictionary
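The snippet breaks off after its last comment. A hypothetical continuation, assuming each Kafka payload is a JSON-encoded tweet (the `text` field name is an assumption, not from the original code):

    # Hypothetical continuation of the elided parsing step, assuming each
    # message is a JSON-encoded tweet; 'text' is an assumed field name.
    tweets = dfs.map(lambda raw: json.loads(raw.decode('utf-8')))
    texts = tweets.map(lambda tweet: tweet.get('text', ''))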
# (fragment: the first lines continue a .map() chain begun in code elided above)
    lambda tweet: (filtering(tweet[0].split(" ")), tweet[1])).map(
        lambda tweet: ([model.value.get(word) for word in tweet[0]], tweet[1]))

# SUM among vectors
features_training = features_training.filter(
    lambda tweet: check_None(tweet[0])).map(
    lambda tweet: (media(tweet[0], vectorSize), tweet[1]))
features_test = features_test.filter(lambda tweet: check_None(tweet[0])).map(
    lambda tweet: (media(tweet[0], vectorSize), tweet[1]))
features_training = features_training.map(lambda tweet: LabeledPoint(
    tweet[1], tweet[0])).filter(lambda labeled: labeled.features)
features_test = features_test.map(lambda tweet: LabeledPoint(
    tweet[1], tweet[0])).filter(lambda labeled: labeled.features)

model_2 = StreamingLogisticRegressionWithSGD()
model_2.setInitialWeights([0.0] * vectorSize)
model_2.trainOn(features_training)

# Test
predictions = model_2.predictOnValues(
    features_test.map(lambda tweet: (tweet.label, tweet.features)))

# 0 - ITA
# 1 - ENG
true_eng = predictions.window(test_seconds, 1) \
    .filter(lambda prediction: prediction[0] == 1.0 and prediction[1] == 1) \
    .map(lambda prediction: (prediction, 1)) \
    .reduceByKey(lambda a, b: a + b).pprint()
model_pol_tech = StreamingLogisticRegressionWithSGD()
model_pol_ent = StreamingLogisticRegressionWithSGD()
model_pol_crime = StreamingLogisticRegressionWithSGD()
model_fin_sports = StreamingLogisticRegressionWithSGD()
model_fin_tech = StreamingLogisticRegressionWithSGD()
model_fin_ent = StreamingLogisticRegressionWithSGD()
model_fin_crime = StreamingLogisticRegressionWithSGD()
model_sports_tech = StreamingLogisticRegressionWithSGD()
model_sports_ent = StreamingLogisticRegressionWithSGD()
model_sports_crime = StreamingLogisticRegressionWithSGD()
model_tech_ent = StreamingLogisticRegressionWithSGD()
model_tech_crime = StreamingLogisticRegressionWithSGD()
model_ent_crime = StreamingLogisticRegressionWithSGD()

if clear:
    model_pol_fin.setInitialWeights([0.0] * num_features)
    model_pol_sports.setInitialWeights([0.0] * num_features)
    model_pol_tech.setInitialWeights([0.0] * num_features)
    model_pol_ent.setInitialWeights([0.0] * num_features)
    model_pol_crime.setInitialWeights([0.0] * num_features)
    model_fin_sports.setInitialWeights([0.0] * num_features)
    model_fin_tech.setInitialWeights([0.0] * num_features)
    model_fin_ent.setInitialWeights([0.0] * num_features)
    model_fin_crime.setInitialWeights([0.0] * num_features)
    model_sports_tech.setInitialWeights([0.0] * num_features)
    model_sports_ent.setInitialWeights([0.0] * num_features)
    model_sports_crime.setInitialWeights([0.0] * num_features)
    model_tech_ent.setInitialWeights([0.0] * num_features)
    model_tech_crime.setInitialWeights([0.0] * num_features)
    model_ent_crime.setInitialWeights([0.0] * num_features)
else:
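The fifteen models above implement a one-vs-one scheme over what appears to be six topic classes. A more compact construction would build them in a dict keyed by class pair; a sketch, where `clear` and `num_features` come from the snippet and the class list is inferred from the variable names:

# Sketch: one-vs-one models built programmatically instead of fifteen
# variables. The class list is an inference from the names above.
from itertools import combinations

from pyspark.mllib.classification import StreamingLogisticRegressionWithSGD

classes = ['pol', 'fin', 'sports', 'tech', 'ent', 'crime']

# One model per unordered pair of classes: 6 choose 2 = 15 models.
models = {
    pair: StreamingLogisticRegressionWithSGD()
    for pair in combinations(classes, 2)
}

if clear:
    # Start every pairwise model from zero weights.
    for model in models.values():
        model.setInitialWeights([0.0] * num_features)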
if __name__ == '__main__':
    # Get user input first
    with open('config/malicious_ips.txt', 'r') as f:
        for line in f:
            MALICIOUS_IPS.append(str(line.replace('\n', '')))

    # First create the streaming context
    sc = SparkContext(appName="Realtime Packet Classifier")
    sc.setLogLevel("ERROR")
    ssc = StreamingContext(sc, UPDATE_TIMER)

    # Create the data streams for the training and streaming directory
    trainingStream = ssc.textFileStream(TRAINING_DIR).map(processTrainingLine)
    secondaryTrainingStream = ssc.textFileStream(SEC_TRAINING_DIR).map(
        processGeneratedLine)
    testingStream = ssc.textFileStream(STREAMING_DIR).map(processGeneratedLine)

    # Create the model and train it on the training data
    model = StreamingLogisticRegressionWithSGD(numIterations=500)
    model.setInitialWeights([0 for i in range(75)])
    model.trainOn(trainingStream)
    model.trainOn(secondaryTrainingStream)

    # Get the model to predict on values incoming in the streaming directory
    model.predictOnValues(
        testingStream.map(lambda lp: (lp.label, lp.features))).pprint(50)

    # Start the stream and await manual termination
    ssc.start()
    ssc.awaitTermination()
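For comparison with the file-based pipeline above, here is a minimal self-contained driver exercising the same trainOn/predictOnValues cycle against an in-memory queueStream, so it runs without Kafka, input directories, or a test harness. All names and data in it are synthetic:

# Minimal, self-contained sketch of the StreamingLogisticRegressionWithSGD
# train/predict cycle; synthetic data, no external inputs required.
import random

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.classification import StreamingLogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint

sc = SparkContext(appName="SLRQueueStreamExample")
ssc = StreamingContext(sc, 1)  # 1-second batches


def make_batch(seed, n=100):
    # Label is 1.0 when the single feature is positive, 0.0 otherwise.
    rng = random.Random(seed)
    return [LabeledPoint(1.0 if x > 0 else 0.0, [x])
            for x in (rng.gauss(0, 1) for _ in range(n))]


train_batches = [sc.parallelize(make_batch(i)) for i in range(10)]
test_batches = [sc.parallelize(make_batch(100 + i)) for i in range(10)]

model = StreamingLogisticRegressionWithSGD(stepSize=0.1, numIterations=25)
model.setInitialWeights([0.0])

# Train on one queued batch per streaming interval, predict on held-out data.
model.trainOn(ssc.queueStream(train_batches))
model.predictOnValues(
    ssc.queueStream(test_batches).map(lambda lp: (lp.label, lp.features))
).pprint()

ssc.start()
ssc.awaitTerminationOrTimeout(15)
ssc.stop(stopSparkContext=True, stopGraceFully=True)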