Beispiel #1
0
    def test_partitioned_min_max_scaler(self):
        """Fit a tenant-partitioned LinearScalarScaler and check every tenant.

        The scaler reads 'score', partitions on 'tenant', writes 'new_score'
        and is given 1 and 2 as its two numeric arguments. The test checks
        that the row count is preserved and that each tenant's sorted
        'new_score' values match the expected list.
        """
        scaler = scalers.LinearScalarScaler(
            'score', 'tenant', 'new_score', 1, 2, use_pandas=False)

        df = self.create_sample_dataframe()
        scaled_df = scaler.fit(df).transform(df)

        assert scaled_df.count() == df.count()

        expected_by_tenant = (
            ('t1', [1.0, 2.0]),
            ('t2', [1.0, 2.0]),
            # single-row tenant: this is the average between min and max
            ('t3', [1.5]),
        )
        for tenant, expected_scores in expected_by_tenant:
            rows = scaled_df.filter(
                f.col('tenant') == tenant).orderBy('new_score').collect()
            # Check the length first so zip() below cannot hide missing rows.
            assert len(rows) == len(expected_scores)
            for row, expected in zip(rows, expected_scores):
                assert row['new_score'] == expected
Beispiel #2
0
 def _get_scaled_df(self, df: DataFrame) -> DataFrame:
     """Return ``df`` passed through a LinearScalarScaler when both bounds are set.

     If either ``self.low_value`` or ``self.high_value`` is ``None`` the
     input DataFrame is returned untouched. Otherwise a scaler is built
     from ``self.likelihood_col`` (input), ``self.tenant_col`` (partition
     key) and ``self.scaled_likelihood_col`` (output), fitted on ``df``,
     and the result of transforming that same ``df`` is returned.
     """
     # Guard clause: without both bounds no scaler is built.
     if self.low_value is None or self.high_value is None:
         return df

     scaler = scalers.LinearScalarScaler(
         input_col=self.likelihood_col,
         partition_key=self.tenant_col,
         output_col=self.scaled_likelihood_col,
         min_required_value=self.low_value,
         max_required_value=self.high_value,
     )
     return scaler.fit(df).transform(df)
Beispiel #3
0
    def test_explain(self):
        """Run ``check_explain`` against a LinearScalarScaler and its param names.

        The predicate handed to ``check_explain`` accepts any type other than
        ``str``/``float`` unconditionally and, for those two types, requires
        a positive count.
        NOTE(review): what the count means is defined by ``check_explain``,
        which is not visible here - confirm against that helper.
        """
        counted_types = [str, float]

        def is_acceptable(c: int, tt: Type):
            # Types outside the counted set always pass.
            if tt not in counted_types:
                return True
            return c > 0

        param_names = [
            'inputCol',
            'partitionKey',
            'outputCol',
            'minRequiredValue',
            'maxRequiredValue',
        ]
        scaler = scalers.LinearScalarScaler('input', 'tenant', 'output')
        self.check_explain(scaler, param_names, is_acceptable)
Beispiel #4
0
    def test_unpartitioned_min_max_scaler(self):
        """Fit a LinearScalarScaler with no partition key and validate its output.

        The scaler reads 'score', writes 'new_score' and is given 5 and 9 as
        its two numeric arguments. The test checks that the row count is
        preserved and that, for every row, 'name' cast to integer equals
        'new_score' cast to integer.
        NOTE(review): presumably the sample data stores the expected scaled
        value in 'name' - confirm against ``create_sample_dataframe``.
        """
        scaler = scalers.LinearScalarScaler(
            'score', None, 'new_score', 5, 9, use_pandas=False)

        # Both frames are cached because each is used by more than one action.
        df = self.create_sample_dataframe().cache()
        scaled_df = scaler.fit(df).transform(df).cache()

        assert scaled_df.count() == df.count()

        expected_as_int = f.col('name').cast(t.IntegerType())
        actual_as_int = f.col('new_score').cast(t.IntegerType())
        mismatches = scaled_df.filter(expected_as_int != actual_as_int)
        assert mismatches.count() == 0