def test_predictions(self):
        """Test predicted values on a toy model."""
        input_batches = []
        for i in range(20):
            batch = self.sc.parallelize(
                self.generateLogisticInput(0, 1.5, 100, 42 + i))
            input_batches.append(batch.map(lambda x: (x.label, x.features)))
        input_stream = self.ssc.queueStream(input_batches)

        slr = StreamingLogisticRegressionWithSGD(
            stepSize=0.2, numIterations=25)
        slr.setInitialWeights([1.5])
        predict_stream = slr.predictOnValues(input_stream)
        true_predicted = []
        predict_stream.foreachRDD(lambda x: true_predicted.append(x.collect()))
        self.ssc.start()

        def condition():
            self.assertEqual(len(true_predicted), len(input_batches))
            return True

        self._eventually(condition, catch_assertions=True)

        # Test that the accuracy error is no more than 0.4 on each batch.
        for batch in true_predicted:
            true, predicted = zip(*batch)
            self.assertTrue(
                self.calculate_accuracy_error(true, predicted) < 0.4)
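Note: this test relies on generateLogisticInput and calculate_accuracy_error, helpers defined elsewhere in the test class. A minimal sketch, assuming labels are 0.0/1.0 and features are one-dimensional as used above:

from numpy import array, exp, random
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

def generateLogisticInput(offset, scale, nPoints, seed):
    """Sample x uniformly in [0, 1); label is 1 with probability sigmoid(x*scale + offset)."""
    rng = random.RandomState(seed)
    x = rng.rand(nPoints)
    sigmoid = 1 / (1 + exp(-(x * scale + offset)))
    y = (rng.rand(nPoints) <= sigmoid).astype(float)
    return [LabeledPoint(y[i], Vectors.dense([x[i]])) for i in range(nPoints)]

def calculate_accuracy_error(true, predicted):
    """Fraction of misclassified points."""
    return sum(abs(array(true) - array(predicted))) / len(true)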
Example 2
    def test_predictions(self):
        """Test predicted values on a toy model."""
        input_batches = []
        for i in range(20):
            batch = self.sc.parallelize(
                self.generateLogisticInput(0, 1.5, 100, 42 + i))
            input_batches.append(batch.map(lambda x: (x.label, x.features)))
        input_stream = self.ssc.queueStream(input_batches)

        slr = StreamingLogisticRegressionWithSGD(stepSize=0.2,
                                                 numIterations=25)
        slr.setInitialWeights([1.5])
        predict_stream = slr.predictOnValues(input_stream)
        true_predicted = []
        predict_stream.foreachRDD(lambda x: true_predicted.append(x.collect()))
        self.ssc.start()

        def condition():
            self.assertEqual(len(true_predicted), len(input_batches))
            return True

        eventually(condition, catch_assertions=True)

        # Test that the accuracy error is no more than 0.4 on each batch.
        for batch in true_predicted:
            true, predicted = zip(*batch)
            self.assertTrue(
                self.calculate_accuracy_error(true, predicted) < 0.4)
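Note: eventually is a polling helper from the test utilities. A sketch of its behavior, assuming the usual poll-until-timeout semantics:

import time

def eventually(condition, timeout=30.0, catch_assertions=False):
    """Poll condition() until it returns True or timeout seconds elapse."""
    start = time.time()
    last_value = None
    while time.time() - start < timeout:
        if catch_assertions:
            try:
                last_value = condition()
            except AssertionError as e:
                last_value = e
        else:
            last_value = condition()
        if last_value is True:
            return
        time.sleep(0.01)
    if isinstance(last_value, AssertionError):
        raise last_value
    raise AssertionError(
        "Condition not met within %s seconds; last value: %r" % (timeout, last_value))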
Example 3
    def test_training_and_prediction(self):
        """Test that the model improves on toy data with no. of batches"""
        input_batches = [
            self.sc.parallelize(self.generateLogisticInput(
                0, 1.5, 100, 42 + i)) for i in range(20)
        ]
        predict_batches = [
            b.map(lambda lp: (lp.label, lp.features)) for b in input_batches
        ]

        slr = StreamingLogisticRegressionWithSGD(stepSize=0.01,
                                                 numIterations=25)
        slr.setInitialWeights([-0.1])
        errors = []

        def collect_errors(rdd):
            true, predicted = zip(*rdd.collect())
            errors.append(self.calculate_accuracy_error(true, predicted))

        input_stream = self.ssc.queueStream(input_batches)
        predict_stream = self.ssc.queueStream(predict_batches)
        slr.trainOn(input_stream)
        ps = slr.predictOnValues(predict_stream)
        ps.foreachRDD(lambda x: collect_errors(x))

        t = time()
        self.ssc.start()
        self._ssc_wait(t, 20.0, 0.01)

        # Test that the improvement in error is at least 0.3
        self.assertTrue(errors[1] - errors[-1] > 0.3)
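Note: older variants of these tests use _ssc_wait instead of eventually; it simply sleeps in small increments until a deadline measured from the recorded start time. A sketch of the method:

from time import time, sleep

def _ssc_wait(self, start_time, end_time, sleep_time):
    """Block until end_time seconds have elapsed since start_time."""
    while time() - start_time < end_time:
        sleep(sleep_time)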
Example 4
    def test_convergence(self):
        """
        Test that weights converge to the required value on toy data.
        """
        input_batches = [
            self.sc.parallelize(self.generateLogisticInput(
                0, 1.5, 100, 42 + i)) for i in range(20)
        ]
        input_stream = self.ssc.queueStream(input_batches)
        models = []

        slr = StreamingLogisticRegressionWithSGD(stepSize=0.2,
                                                 numIterations=25)
        slr.setInitialWeights([0.0])
        slr.trainOn(input_stream)
        input_stream.foreachRDD(
            lambda x: models.append(slr.latestModel().weights[0]))

        t = time()
        self.ssc.start()
        self._ssc_wait(t, 15.0, 0.01)
        t_models = array(models)
        diff = t_models[1:] - t_models[:-1]

        # Test that weights improve with a small tolerance
        self.assertTrue(all(diff >= -0.1))
        self.assertTrue(array_sum(diff > 0) > 1)
Example 5
    def test_convergence(self):
        """
        Test that weights converge to the required value on toy data.
        """
        input_batches = [
            self.sc.parallelize(self.generateLogisticInput(
                0, 1.5, 100, 42 + i)) for i in range(20)
        ]
        input_stream = self.ssc.queueStream(input_batches)
        models = []

        slr = StreamingLogisticRegressionWithSGD(stepSize=0.2,
                                                 numIterations=25)
        slr.setInitialWeights([0.0])
        slr.trainOn(input_stream)
        input_stream.foreachRDD(
            lambda x: models.append(slr.latestModel().weights[0]))

        self.ssc.start()

        def condition():
            self.assertEqual(len(models), len(input_batches))
            return True

        # We want all batches to finish for this test.
        eventually(condition, 60.0, catch_assertions=True)

        t_models = array(models)
        diff = t_models[1:] - t_models[:-1]
        # Test that weights improve with a small tolerance
        self.assertTrue(all(diff >= -0.1))
        self.assertTrue(array_sum(diff > 0) > 1)
Example 6
    def test_training_and_prediction(self):
        """Test that the model improves on toy data with no. of batches"""
        input_batches = [
            self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
            for i in range(40)]
        predict_batches = [
            b.map(lambda lp: (lp.label, lp.features)) for b in input_batches]

        slr = StreamingLogisticRegressionWithSGD(
            stepSize=0.01, numIterations=25)
        slr.setInitialWeights([-0.1])
        errors = []

        def collect_errors(rdd):
            true, predicted = zip(*rdd.collect())
            errors.append(self.calculate_accuracy_error(true, predicted))

        input_stream = self.ssc.queueStream(input_batches)
        predict_stream = self.ssc.queueStream(predict_batches)
        slr.trainOn(input_stream)
        ps = slr.predictOnValues(predict_stream)
        ps.foreachRDD(lambda x: collect_errors(x))

        self.ssc.start()

        def condition():
            # Test that the improvement in error is > 0.3
            if len(errors) == len(predict_batches):
                self.assertGreater(errors[1] - errors[-1], 0.3)
            if len(errors) >= 3 and errors[1] - errors[-1] > 0.3:
                return True
            return "Latest errors: " + ", ".join(map(lambda x: str(x), errors))

        self._eventually(condition, timeout=60.0)
Example 7
def get_model(weight, pretrained=True):
    """ Initiate a streaming model."""
    if pretrained:
        trained_model = _load_pre_trained_model()
        model = MyStreamingLogisticRegressionWithSGD(
            trained_model=trained_model)
    else:
        model = StreamingLogisticRegressionWithSGD()
        model.setInitialWeights(weight)
    return model
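Note: _load_pre_trained_model and MyStreamingLogisticRegressionWithSGD are project-specific and not shown. One plausible (hypothetical) shape, inferred only from how get_model uses it, is a subclass that seeds the internal model with the pre-trained one:

from pyspark.mllib.classification import StreamingLogisticRegressionWithSGD

class MyStreamingLogisticRegressionWithSGD(StreamingLogisticRegressionWithSGD):
    """Hypothetical wrapper: start streaming SGD from an already-trained model."""
    def __init__(self, trained_model, **kwargs):
        super(MyStreamingLogisticRegressionWithSGD, self).__init__(**kwargs)
        # Seed the latest model with the pre-trained weights instead of
        # calling setInitialWeights.
        self._model = trained_model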
Example 8
    def test_convergence(self):
        """
        Test that weights converge to the required value on toy data.
        """
        input_batches = [
            self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
            for i in range(20)]
        input_stream = self.ssc.queueStream(input_batches)
        models = []

        slr = StreamingLogisticRegressionWithSGD(
            stepSize=0.2, numIterations=25)
        slr.setInitialWeights([0.0])
        slr.trainOn(input_stream)
        input_stream.foreachRDD(
            lambda x: models.append(slr.latestModel().weights[0]))

        t = time()
        self.ssc.start()
        self._ssc_wait(t, 15.0, 0.01)
        t_models = array(models)
        diff = t_models[1:] - t_models[:-1]

        # Test that weights improve with a small tolerance
        self.assertTrue(all(diff >= -0.1))
        self.assertTrue(array_sum(diff > 0) > 1)
Example 9
    def test_training_and_prediction(self):
        """Test that the model improves on toy data as the number of batches increases."""
        input_batches = [
            self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
            for i in range(20)]
        predict_batches = [
            b.map(lambda lp: (lp.label, lp.features)) for b in input_batches]

        slr = StreamingLogisticRegressionWithSGD(
            stepSize=0.01, numIterations=25)
        slr.setInitialWeights([-0.1])
        errors = []

        def collect_errors(rdd):
            true, predicted = zip(*rdd.collect())
            errors.append(self.calculate_accuracy_error(true, predicted))

        input_stream = self.ssc.queueStream(input_batches)
        predict_stream = self.ssc.queueStream(predict_batches)
        slr.trainOn(input_stream)
        ps = slr.predictOnValues(predict_stream)
        ps.foreachRDD(lambda x: collect_errors(x))

        self.ssc.start()

        def condition():
            # Test that the improvement in error is > 0.3
            if len(errors) == len(predict_batches):
                self.assertGreater(errors[1] - errors[-1], 0.3)
            if len(errors) >= 3 and errors[1] - errors[-1] > 0.3:
                return True
            return "Latest errors: " + ", ".join(map(lambda x: str(x), errors))

        self._eventually(condition, timeout=60.0)
Example 10
    def test_training_and_prediction(self):
        """Test that the model improves on toy data with no. of batches"""
        input_batches = [
            self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
            for i in range(20)]
        predict_batches = [
            b.map(lambda lp: (lp.label, lp.features)) for b in input_batches]

        slr = StreamingLogisticRegressionWithSGD(
            stepSize=0.01, numIterations=25)
        slr.setInitialWeights([-0.1])
        errors = []

        def collect_errors(rdd):
            true, predicted = zip(*rdd.collect())
            errors.append(self.calculate_accuracy_error(true, predicted))

        input_stream = self.ssc.queueStream(input_batches)
        predict_stream = self.ssc.queueStream(predict_batches)
        slr.trainOn(input_stream)
        ps = slr.predictOnValues(predict_stream)
        ps.foreachRDD(lambda x: collect_errors(x))

        t = time()
        self.ssc.start()
        self._ssc_wait(t, 20.0, 0.01)

        # Test that the improvement in error is at least 0.3
        self.assertTrue(errors[1] - errors[-1] > 0.3)
Example 11
    def test_convergence(self):
        """
        Test that weights converge to the required value on toy data.
        """
        input_batches = [
            self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
            for i in range(20)]
        input_stream = self.ssc.queueStream(input_batches)
        models = []

        slr = StreamingLogisticRegressionWithSGD(
            stepSize=0.2, numIterations=25)
        slr.setInitialWeights([0.0])
        slr.trainOn(input_stream)
        input_stream.foreachRDD(
            lambda x: models.append(slr.latestModel().weights[0]))

        self.ssc.start()

        def condition():
            self.assertEqual(len(models), len(input_batches))
            return True

        # We want all batches to finish for this test.
        self._eventually(condition, 60.0, catch_assertions=True)

        t_models = array(models)
        diff = t_models[1:] - t_models[:-1]
        # Test that weights improve with a small tolerance
        self.assertTrue(all(diff >= -0.1))
        self.assertTrue(array_sum(diff > 0) > 1)
Example 12
def get_model(pretrained=True):
    '''Initialize a streaming model.
    If pretrained=True, initialize the streaming model with the trained
    parameters; if not, set the initial weights to all zeros.
    '''
    if pretrained:
        trained_model = _load_pre_trained_model()
        model = MyStreamingLogisticRegressionWithSGD(
            trained_model=trained_model)
    else:
        model = StreamingLogisticRegressionWithSGD()
        model.setInitialWeights([0.0] * NUM_FEATURES)
    return model
Example 13
    def test_parameter_accuracy(self):
        """
        Test that the final value of weights is close to the desired value.
        """
        input_batches = [
            self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i))
            for i in range(20)]
        input_stream = self.ssc.queueStream(input_batches)

        slr = StreamingLogisticRegressionWithSGD(
            stepSize=0.2, numIterations=25)
        slr.setInitialWeights([0.0])
        slr.trainOn(input_stream)

        self.ssc.start()

        def condition():
            rel = (1.5 - slr.latestModel().weights.array[0]) / 1.5
            self.assertAlmostEqual(rel, 0.1, 1)
            return True

        self._eventually(condition, catch_assertions=True)
Example 14
    def test_parameter_accuracy(self):
        """
        Test that the final value of weights is close to the desired value.
        """
        input_batches = [
            self.sc.parallelize(self.generateLogisticInput(
                0, 1.5, 100, 42 + i)) for i in range(20)
        ]
        input_stream = self.ssc.queueStream(input_batches)

        slr = StreamingLogisticRegressionWithSGD(stepSize=0.2,
                                                 numIterations=25)
        slr.setInitialWeights([0.0])
        slr.trainOn(input_stream)

        t = time()
        self.ssc.start()
        self._ssc_wait(t, 20.0, 0.01)
        rel = (1.5 - slr.latestModel().weights.array[0]) / 1.5
        self.assertAlmostEqual(rel, 0.1, 1)
Example 15
if __name__ == '__main__':
    # Load the list of known malicious IPs first
    with open('config/malicious_ips.txt', 'r') as f:
        for line in f:
            MALICIOUS_IPS.append(line.strip())

    # First create the streaming context
    sc = SparkContext(appName="Realtime Packet Classifier")
    sc.setLogLevel("ERROR")
    ssc = StreamingContext(sc, UPDATE_TIMER)

    # Create the data streams for the training and streaming directory
    trainingStream = ssc.textFileStream(TRAINING_DIR).map(processTrainingLine)
    secondaryTrainingStream = ssc.textFileStream(SEC_TRAINING_DIR).map(
        processGeneratedLine)
    testingStream = ssc.textFileStream(STREAMING_DIR).map(processGeneratedLine)

    # Create the model and train it on the training data
    model = StreamingLogisticRegressionWithSGD(numIterations=500)
    model.setInitialWeights([0.0] * 75)
    model.trainOn(trainingStream)
    model.trainOn(secondaryTrainingStream)

    # Get the model to predict on values incoming in the streaming directory
    model.predictOnValues(
        testingStream.map(lambda lp: (lp.label, lp.features))).pprint(50)

    # Start the stream and await manual termination
    ssc.start()
    ssc.awaitTermination()
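Note: processTrainingLine and processGeneratedLine are not shown. Since the model expects 75 features and predictOnValues expects LabeledPoint-style input, a hypothetical line parser might look like:

from pyspark.mllib.regression import LabeledPoint

def processGeneratedLine(line):
    """Hypothetical parser: 'label,f1,...,f75' -> LabeledPoint (assumed CSV format)."""
    parts = [float(x) for x in line.split(',')]
    return LabeledPoint(parts[0], parts[1:])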
Example 16
    pol_sports = pol_sports.map(p)
    pol_tech = pol_tech.map(p)
    pol_ent = pol_ent.map(p)
    pol_crime = pol_crime.map(p)
    fin_sports = fin_sports.map(f)
    fin_tech = fin_tech.map(f)
    fin_ent = fin_ent.map(f)
    fin_crime = fin_crime.map(f)
    sports_tech = sports_tech.map(s)
    sports_ent = sports_ent.map(s)
    sports_crime = sports_crime.map(s)
    tech_ent = tech_ent.map(t)
    tech_crime = tech_crime.map(t)
    ent_crime = ent_crime.map(e)

    model_pol_fin = StreamingLogisticRegressionWithSGD()
    model_pol_sports = StreamingLogisticRegressionWithSGD()
    model_pol_tech = StreamingLogisticRegressionWithSGD()
    model_pol_ent = StreamingLogisticRegressionWithSGD()
    model_pol_crime = StreamingLogisticRegressionWithSGD()
    model_fin_sports = StreamingLogisticRegressionWithSGD()
    model_fin_tech = StreamingLogisticRegressionWithSGD()
    model_fin_ent = StreamingLogisticRegressionWithSGD()
    model_fin_crime = StreamingLogisticRegressionWithSGD()
    model_sports_tech = StreamingLogisticRegressionWithSGD()
    model_sports_ent = StreamingLogisticRegressionWithSGD()
    model_sports_crime = StreamingLogisticRegressionWithSGD()
    model_tech_ent = StreamingLogisticRegressionWithSGD()
    model_tech_crime = StreamingLogisticRegressionWithSGD()
    model_ent_crime = StreamingLogisticRegressionWithSGD()
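Note: the fifteen hand-written model variables cover every pair of the six categories (one-vs-one classification). The same setup can be expressed compactly with a dictionary keyed on category pairs:

from itertools import combinations
from pyspark.mllib.classification import StreamingLogisticRegressionWithSGD

categories = ['pol', 'fin', 'sports', 'tech', 'ent', 'crime']
# C(6, 2) = 15 pairwise models, matching the variables above
models = {pair: StreamingLogisticRegressionWithSGD()
          for pair in combinations(categories, 2)}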
Example 17
features_test = test.map(
    lambda tweet: (filtering(tweet[0].split(" ")), tweet[1])).map(
        lambda tweet: ([model.value.get(word) for word in tweet[0]], tweet[1]))

# Average the word vectors of each tweet into a single fixed-size vector
features_training = features_training.filter(
    lambda tweet: check_None(tweet[0])).map(
        lambda tweet: (media(tweet[0], vectorSize), tweet[1]))
features_test = features_test.filter(
    lambda tweet: check_None(tweet[0])).map(
        lambda tweet: (media(tweet[0], vectorSize), tweet[1]))

features_training = features_training.map(
    lambda tweet: LabeledPoint(tweet[1], tweet[0])).filter(
        lambda labeled: labeled.features)
features_test = features_test.map(
    lambda tweet: LabeledPoint(tweet[1], tweet[0])).filter(
        lambda labeled: labeled.features)

model_2 = StreamingLogisticRegressionWithSGD()
model_2.setInitialWeights([0.0] * vectorSize)
model_2.trainOn(features_training)

# Test
predictions = model_2.predictOnValues(
    features_test.map(lambda tweet: (tweet.label, tweet.features)))

# 0 - ITA
# 1 - ENG


# Count correctly-classified English tweets over the test window
# (pprint() returns None, so the original assignment to true_eng was dropped)
predictions.window(test_seconds, 1) \
    .filter(lambda prediction: prediction[0] == 1.0 and prediction[1] == 1) \
    .map(lambda prediction: (prediction, 1)) \
    .reduceByKey(lambda a, b: a + b).pprint()
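Note: check_None and media are not shown; media (Italian for "mean") evidently averages a tweet's word vectors into one vector. A sketch under that assumption:

def check_None(vectors):
    """Keep only tweets whose words were all found in the Word2Vec vocabulary (assumed)."""
    return all(v is not None for v in vectors)

def media(vectors, vector_size):
    """Element-wise average of a list of equal-length word vectors (assumed helper)."""
    totals = [0.0] * vector_size
    for vec in vectors:
        for i in range(vector_size):
            totals[i] += vec[i]
    return [t / len(vectors) for t in totals]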
Example 18
        else:
            table.put(row=date, data={'tweet_count:pos': str(data[1])})

    connection.close()


if __name__ == '__main__':
    # creating a SparkContext object
    sc = SparkContext.getOrCreate()
    # setting the log level to avoid printing logs in the console
    sc.setLogLevel("WARN")
    # creating a Spark Streaming Context
    ssc = StreamingContext(sparkContext=sc, batchDuration=10)

    # setting up a model
    lr = StreamingLogisticRegressionWithSGD()
    # loading the pre-trained parameters
    with open('model.json', 'r') as f:
        parameters = json.load(f)
    # assigning the pre-trained parameters to the logistic regression
    lr.setInitialWeights(parameters['weights'])
    # loading stop words
    stop_words = load_stopwords()
    # loading common words
    common_words = load_common_words()
    # creating the reference table
    reference_table = create_hash_table(common_words=common_words, stop_words=stop_words)

    # opening the stream
    kafkaStream = KafkaUtils.createDirectStream(ssc=ssc,
                                                topics=['trump'],
                                                kafkaParams={"metadata.broker.list": 'localhost:9092'})
Example 19
def append_key_to_dictionary(dictionary, key, value):
    dictionary[key] = value
    return dictionary


def insert_into_table(values, table_name, host, port):
    pass
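
Note: insert_into_table is left as a stub here; the table.put and connection.close calls in Example 18 suggest happybase. A hypothetical completion under that assumption:

import happybase

def insert_into_table(values, table_name, host, port):
    """Hypothetical completion: write (row_key, data_dict) pairs via happybase."""
    connection = happybase.Connection(host=host, port=port)
    table = connection.table(table_name)
    for row_key, data in values:
        table.put(row=row_key, data=data)
    connection.close()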


if __name__ == '__main__':

    sc = SparkContext(appName='PythonSparkStreamingKafka')
    sc.setLogLevel("WARN")  # avoid printing logs

    # setting up a model
    lr = StreamingLogisticRegressionWithSGD()
    with open('model.json', 'r') as f:
        parameters = json.load(f)
    # lr.setInitialWeights(parameters['weights'])
    # The streaming model is replaced by a scikit-learn-style classifier
    # built from the same pre-trained parameters.
    lr = create_logistic_regression_skl(parameters['weights'],
                                        parameters['intercept'])
    stop_words = load_stopwords()
    common_words = load_common_words()
    reference_table = create_hash_table(common_words=common_words,
                                        stop_words=stop_words)

    ssc = StreamingContext(sparkContext=sc, batchDuration=2)
    spark_sql = SQLContext(sparkContext=sc)

    kafkaStream = KafkaUtils.createDirectStream(
        ssc=ssc,
        topics=['trump'],
        # kafkaParams assumed to match the broker used in Example 18
        kafkaParams={"metadata.broker.list": 'localhost:9092'})
Example 20
    tech_ent = tech_ent.map(t)
    tech_crime = tech_crime.map(t)
    ent_crime = ent_crime.map(e)

    allrdd = [pol_fin, pol_sports, pol_tech, pol_ent, pol_crime,
              fin_sports, fin_tech, fin_ent, fin_crime,
              sports_tech, sports_ent, sports_crime,
              tech_ent, tech_crime, ent_crime]
    # Build the model
    # numFeatures = 3
    # model.setInitialWeights([0.0, 0.0, 0.0])
    models = []  # in case needed
    labelsAndPreds = []
    df = []
    for irdd in allrdd:
        print(irdd)
        # modellr = LogisticRegressionWithSGD.train(irdd.map(lambda x: x[0]))
        modellr = StreamingLogisticRegressionWithSGD()
        modellr.trainOn(irdd.map(lambda x: x[0]))
        print(modellr)
        models.append(modellr)
        #outputrdd = parsedData.map(lambda p: (p[0].label, models[i].predict(p[0].features)))
        outputrdd = modellr.predictOnValues(
            parsedTestData.map(lambda lp: (lp[0].label, lp[0].features)))
        labelsAndPreds.append(outputrdd)
        outputdf = outputrdd.toDF(['label', 'prediction']).toPandas()
        df.append(outputdf)

    lab_count = np.zeros((parsedTestData.count(), len(labels)), dtype="int32")
    for i in range(len(allrdd)):
        lab_count = makePredOVO(df[i],labels_num[i],lab_count)

    cz, correct = 0, 0
    parsedTestDataDF['PredictedClass'] = np.empty((len(testData), 0)).tolist()