def update_row(cls, current, past, *args, **kwargs):
    """Combine this feature's past and current values via ``update_mean``.

    ``past`` is read with ``.get()`` while ``current`` is indexed
    directly, so a key missing from ``current`` raises instead of being
    passed on as a missing value. Extra positional and keyword arguments
    are accepted but not used here.

    :param current: mapping of feature name to value for the current
        period; must contain this feature and the request total.
    :param past: mapping of feature name to value for the past period.
    :return: whatever ``update_mean`` returns for the four values.
    """
    own_key = cls.feature_name_from_class()
    past_value = past.get(own_key)
    current_value = current[own_key]

    # NOTE(review): the request totals look like the weights of the two
    # means - confirm against update_mean's signature.
    total_key = FeatureRequestTotal.feature_name_from_class()
    past_total = past.get(total_key)
    current_total = current[total_key]

    return update_mean(past_value, current_value, past_total, current_total)
Esempio n. 2
0
 def update_row(cls, current, past, *args, **kwargs):
     """Combine past and current path/request totals via ``update_ratio``.

     The two past values are passed first, followed by the two current
     values. ``past`` is read with ``.get()`` while ``current`` is
     indexed directly. Extra positional and keyword arguments are
     accepted but not used here.

     :param current: mapping of feature name to value for the current
         period; must contain the unique-path total and request total.
     :param past: mapping of feature name to value for the past period.
     :return: whatever ``update_ratio`` returns for the four values.
     """
     path_key = FeatureUniquePathTotal.feature_name_from_class()
     past_paths = past.get(path_key)

     request_key = FeatureRequestTotal.feature_name_from_class()
     past_requests = past.get(request_key)

     current_paths = current[path_key]
     current_requests = current[request_key]

     # NOTE(review): argument order is (past, past, current, current),
     # unlike the mean/variance helpers - confirm against update_ratio.
     return update_ratio(
         past_paths, past_requests, current_paths, current_requests
     )
Esempio n. 3
0
    def test_update(self):
        """Check ``update`` on a one-row dataframe of feature maps.

        The row holds a current value of 6 (request total 3) and a past
        value of 2 (request total 1); the updated column is expected to
        equal ``0.75 * 6 + 0.25 * 2`` to two decimal places.
        """
        request_total = FeatureRequestTotal.feature_name_from_class()
        # Both columns share the same string -> float map type.
        features_type = T.MapType(T.StringType(), T.FloatType())
        schema = T.StructType([
            T.StructField(self.feature.current_features_column,
                          features_type),
            T.StructField(self.feature.past_features_column,
                          features_type),
        ])

        row = {
            self.feature.current_features_column: {
                self.feature.feature_name: 6.,
                request_total: 3.,
            },
            self.feature.past_features_column: {
                self.feature.feature_name: 2.,
                request_total: 1.,
            },
        }
        sub_df = self.session.createDataFrame([row], schema=schema)
        result_df = self.feature.update(sub_df)

        # No result_df.show() here: it only printed the frame to stdout
        # and ran an extra Spark action without asserting anything.
        updated_col = self.feature.updated_feature_col_name
        value = result_df.select(updated_col).collect()[0][updated_col]

        expected_value = 0.75 * 6. + 0.25 * 2.
        self.assertAlmostEqual(value, expected_value, places=2)
Esempio n. 4
0
 def update_row(cls, current, past, *args, **kwargs):
     """Combine past and current values of this feature via ``update_variance``.

     Values are passed as (past, current) pairs for, in order: this
     feature, the request total, and the path-depth average. ``past`` is
     read with ``.get()`` while ``current`` is indexed directly. Extra
     positional and keyword arguments are accepted but not used here.

     :param current: mapping of feature name to value for the current
         period; must contain all three features.
     :param past: mapping of feature name to value for the past period.
     :return: whatever ``update_variance`` returns for the six values.
     """
     own_key = cls.feature_name_from_class()
     past_value = past.get(own_key)
     current_value = current[own_key]

     total_key = FeatureRequestTotal.feature_name_from_class()
     past_total = past.get(total_key)
     current_total = current[total_key]

     mean_key = FeaturePathDepthAverage.feature_name_from_class()
     past_mean = past.get(mean_key)
     current_mean = current[mean_key]

     return update_variance(
         past_value, current_value,
         past_total, current_total,
         past_mean, current_mean,
     )
Esempio n. 5
0
    def test_update(self):
        """Check ``update`` against ``update_variance`` on a one-row dataframe.

        The row holds, for the current and past feature maps, this
        feature's value, the request total and the path-depth average.
        The updated column must match
        ``update_variance(2., 6., 1., 3., 4., 5.)`` to two decimal places.
        """
        # Imported locally, as in the original test body.
        from baskerville.features.helpers import update_variance

        count_col = FeatureRequestTotal.feature_name_from_class()
        mean_col = FeaturePathDepthAverage.feature_name_from_class()
        # Both columns share the same string -> float map type.
        features_type = T.MapType(T.StringType(), T.FloatType())
        schema = T.StructType([
            T.StructField(self.feature.current_features_column,
                          features_type),
            T.StructField(self.feature.past_features_column,
                          features_type),
        ])

        row = {
            self.feature.current_features_column: {
                self.feature.feature_name: 6.,
                count_col: 3.,
                mean_col: 5.,
            },
            self.feature.past_features_column: {
                self.feature.feature_name: 2.,
                count_col: 1.,
                mean_col: 4.,
            },
        }
        sub_df = self.session.createDataFrame([row], schema=schema)
        result_df = self.feature.update(sub_df)

        # Debug output (result_df.show() / print) was dropped: it only
        # wrote to stdout and did not take part in any assertion.
        updated_col = self.feature.updated_feature_col_name
        value = result_df.select(updated_col).collect()[0][updated_col]

        expected_value = update_variance(2., 6., 1., 3., 4., 5.)
        self.assertAlmostEqual(value, expected_value, places=2)
Esempio n. 6
0
 def update(self, df, feat_column='features', old_feat_column='old_features'):
     """Delegate to the parent ``update`` with three feature names.

     The names passed on are, in order: this feature's own name, the
     request-total name and the request-interval-average name.
     ``feat_column`` and ``old_feat_column`` are accepted but not used
     in this body.

     :param df: dataframe handed unchanged to the parent ``update``.
     :return: whatever the parent ``update`` returns.
     """
     own_name = self.feature_name
     total_name = FeatureRequestTotal.feature_name_from_class()
     interval_name = FeatureRequestIntervalAverage.feature_name_from_class()
     return super().update(df, own_name, total_name, interval_name)
Esempio n. 7
0
 def update(self,
            df,
            feat_column='features',
            old_feat_column='old_features'):
     """Delegate to the parent ``update`` with two feature names.

     The names passed on are, in order: the top-page-total name and the
     request-total name. ``feat_column`` and ``old_feat_column`` are
     accepted but not used in this body.

     :param df: dataframe handed unchanged to the parent ``update``.
     :return: whatever the parent ``update`` returns.
     """
     top_page_name = FeatureTopPageTotal.feature_name_from_class()
     total_name = FeatureRequestTotal.feature_name_from_class()
     return super().update(df, top_page_name, total_name)
Esempio n. 8
0
    def test_update_row(self):
        """Check ``update_row`` on plain dicts of current and past values.

        Current value 6 with request total 3 and past value 2 with
        request total 1 are expected to give ``0.75 * 6 + 0.25 * 2`` to
        two decimal places.
        """
        total_name = FeatureRequestTotal().feature_name
        own_name = self.feature.feature_name

        current_row = {own_name: 6., total_name: 3.}
        past_row = {own_name: 2., total_name: 1.}

        merged = self.feature.update_row(current_row, past_row)

        self.assertAlmostEqual(merged, 0.75 * 6. + 0.25 * 2., places=2)
Esempio n. 9
0
    def test_update_row(self):
        """Check ``update_row`` against a direct ``update_variance`` call.

        The current and past dicts each hold this feature's value, the
        request total and the path-depth average; the result must match
        ``update_variance(2., 6., 1., 3., 4., 5.)`` to two decimal places.
        """
        total_name = FeatureRequestTotal().feature_name
        depth_mean_name = FeaturePathDepthAverage().feature_name
        own_name = self.feature.feature_name

        current_row = {own_name: 6., total_name: 3., depth_mean_name: 5.}
        past_row = {own_name: 2., total_name: 1., depth_mean_name: 4.}

        merged = self.feature.update_row(current_row, past_row)

        # Imported locally, as in the original test body.
        from baskerville.features.helpers import update_variance
        reference = update_variance(2., 6., 1., 3., 4., 5.)

        self.assertAlmostEqual(merged, reference, places=2)
 def update(self, df):
     """Delegate to the parent ``update`` with a numerator and denominator.

     The request-total name is passed as ``numerator`` and the
     minutes-total name as ``denominator``.

     :param df: dataframe handed unchanged to the parent ``update``.
     :return: whatever the parent ``update`` returns.
     """
     numerator_name = FeatureRequestTotal.feature_name_from_class()
     denominator_name = FeatureMinutesTotal.feature_name_from_class()
     return super().update(
         df, numerator=numerator_name, denominator=denominator_name
     )
Esempio n. 11
0
 def setUp(self):
     """Run the parent fixture set-up, then create the feature under test."""
     super().setUp()
     self.feature = FeatureRequestTotal()
Esempio n. 12
0
class TestSparkRequestTotal(FeatureSparkTestCase):
    """Tests for ``FeatureRequestTotal``: attributes, compute and update."""

    def setUp(self):
        """Run the parent fixture set-up, then create the feature under test."""
        super().setUp()
        self.feature = FeatureRequestTotal()

    def test_instance(self):
        """Check that the expected attributes exist with the expected values."""
        for attr in (
                'feature_name',
                'COLUMNS',
                'DEPENDENCIES',
                'DEFAULT_VALUE',
                'compute_type',
        ):
            # The attribute name doubles as the failure message.
            self.assertTrue(hasattr(self.feature, attr), attr)

        # assertEqual reports both operands on failure, unlike
        # assertTrue(a == b), which only reports "False is not true".
        self.assertEqual(self.feature.feature_name, 'request_total')
        self.assertEqual(self.feature.columns, ['@timestamp'])
        self.assertEqual(self.feature.dependencies, [])
        self.assertEqual(self.feature.DEFAULT_VALUE, 0.)
        self.assertEqual(self.feature.compute_type, FeatureComputeType.total)
        self.assertIsNotNone(self.feature.feature_name)
        self.assertIsNotNone(self.feature.feature_default)

        self.assertIsInstance(self.feature.feature_name, str)
        self.assertIsInstance(self.feature.feature_default, float)

    def test_compute_single_record(self):
        """A single input record must give a feature value of 1."""
        ats_record = {
            "client_ip": '55.555.55.55',
            "@timestamp": '2018-01-17T08:30:00.000Z',
            "content_type": 'html',
            "client_url": 'page1/page2/page3?query',
        }
        sub_df = self.get_sub_df_for_feature(self.feature, [ats_record])
        result = self.feature.compute(sub_df)
        expected_df = sub_df.withColumn(self.feature.feature_name,
                                        F.lit(1.).cast('float'))

        self.assertDataFrameEqual(result, expected_df)

    def test_compute_multiple_records(self):
        """Two input records from the same client must give a value of 2."""
        first_ats_record = {
            "client_ip": '55.555.55.55',
            "@timestamp": '2018-01-17T08:30:00.000Z',
            "content_type": 'html',
            "client_url": 'page1/page2/page3',
        }
        second_ats_record = {
            "client_ip": '55.555.55.55',
            "@timestamp": '2018-01-17T08:30:00.000Z',
            "content_type": 'html',
            "client_url": 'page1/page2',
        }
        sub_df = self.get_sub_df_for_feature(self.feature, [
            first_ats_record,
            second_ats_record,
        ])
        result = self.feature.compute(sub_df)

        expected_df = sub_df.withColumn(self.feature.feature_name,
                                        F.lit(2.).cast('float'))
        self.assertDataFrameEqual(result, expected_df)

    def test_update_row(self):
        """A current value of 2 and a past value of 1 must update to 3."""
        test_current = {self.feature.feature_name: 2.}
        test_past = {self.feature.feature_name: 1.}
        value = self.feature.update_row(test_current, test_past)

        self.assertAlmostEqual(value, 3., places=2)

    def test_update(self):
        """Check ``update`` on a one-row dataframe: current 2, past 1 -> 3."""
        # Both columns share the same string -> float map type.
        features_type = T.MapType(T.StringType(), T.FloatType())
        schema = T.StructType([
            T.StructField(self.feature.current_features_column,
                          features_type),
            T.StructField(self.feature.past_features_column,
                          features_type),
        ])

        row = {
            self.feature.current_features_column: {
                self.feature.feature_name: 2.,
            },
            self.feature.past_features_column: {
                self.feature.feature_name: 1.,
            },
        }
        sub_df = self.session.createDataFrame([row], schema=schema)
        result_df = self.feature.update(sub_df)

        # No result_df.show() here: it only printed the frame to stdout
        # and ran an extra Spark action without asserting anything.
        updated_col = self.feature.updated_feature_col_name
        value = result_df.select(updated_col).collect()[0][updated_col]
        expected_value = 3.
        self.assertAlmostEqual(value, expected_value, places=2)