Example #1
    def test_partitioned_standard_scaler(self):
        ls = scalers.StandardScalarScaler(
            'score',      # input column
            'tenant',     # partition key: scale each tenant independently
            'new_score',  # output column
            1.0,          # coefficient factor
            use_pandas=False)

        df = self.create_sample_dataframe()
        model = ls.fit(df)
        new_df = model.transform(df)

        # Scaling must not add or drop rows.
        assert new_df.count() == df.count()

        for tenant in ['t1', 't2', 't3']:
            new_scores = new_df.filter(
                f.col('tenant') == tenant).toPandas()['new_score']

            assert new_scores is not None

            the_mean = new_scores.to_numpy().mean()
            the_std = new_scores.to_numpy().std()
            tenant_scores = [s for _, s in new_scores.items()]

            # The mean of each tenant's scaled scores must be 0.
            assert the_mean == 0.0

            if tenant != 't3':
                assert abs(the_std - 1.0) < 0.0001, str(the_std)
                assert len(tenant_scores) == 2
                assert tenant_scores[0] == -1.0
                assert tenant_scores[1] == 1.0
            else:
                # 't3' has a single row, so its std is 0 and the scaler
                # maps its lone score to 0 rather than dividing by zero.
                assert the_std == 0.0
                assert len(tenant_scores) == 1
                assert tenant_scores[0] == 0.0
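
`create_sample_dataframe` is not shown in this example. Judging from the assertions above (two scores each for `t1` and `t2` that standardize to -1 and 1, one score for `t3` that maps to 0), a minimal sketch of what it might produce; the concrete score values are assumptions, since any two distinct values per tenant standardize to -1 and 1:

from pyspark.sql import SparkSession

def create_sample_dataframe():
    # Hypothetical reconstruction for illustration only.
    spark = SparkSession.builder.getOrCreate()
    return spark.createDataFrame(
        [
            ('t1', 1.0), ('t1', 3.0),  # mean 2.0, std 1.0 -> -1.0, 1.0
            ('t2', 5.0), ('t2', 9.0),  # mean 7.0, std 2.0 -> -1.0, 1.0
            ('t3', 42.0),              # single row: std 0 -> mapped to 0.0
        ],
        schema=['tenant', 'score'],
    )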
Example #2
    def test_explain(self):
        types = [str, float]

        def counts(c: int, tt: Type):
            # Parameters typed str or float must be described at least
            # once in the explain output; other types pass regardless.
            return tt not in types or c > 0

        params = ['inputCol', 'partitionKey', 'outputCol', 'coefficientFactor']
        self.check_explain(
            scalers.StandardScalarScaler('input', 'tenant', 'output'), params,
            counts)
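
`check_explain` is a helper on the test base class and is not shown. The `counts` predicate it receives is terse, so here is a short self-contained illustration of what it accepts; the calls below are illustrative, not part of the original test:

from typing import Type

types = [str, float]

def counts(c: int, tt: Type) -> bool:
    # Types listed in `types` must be described at least once;
    # all other types pass regardless of count.
    return tt not in types or c > 0

assert counts(2, str)      # str is tracked and described twice: passes
assert not counts(0, str)  # str is tracked but never described: fails
assert counts(0, int)      # int is not tracked, so any count passes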
Example #3
    def transform(
        self, user_res_cf_df_model: _UserResourceFeatureVectorMapping
    ) -> _UserResourceFeatureVectorMapping:
        likelihood_col_token = '__likelihood__'

        # UDF computing the dot product of a user vector and a resource
        # vector; the result serves as the access-likelihood score.
        dot = _make_dot()

        tenant_col = user_res_cf_df_model.tenant_col
        user_col = user_res_cf_df_model.user_col
        user_vec_col = user_res_cf_df_model.user_vec_col
        res_col = user_res_cf_df_model.res_col
        res_vec_col = user_res_cf_df_model.res_vec_col

        # Join the access events with both feature-vector mappings and
        # score each (user, resource) pair by the dot product of their
        # vectors.
        fixed_df = self.access_df.join(
            user_res_cf_df_model.user_feature_vector_mapping_df,
            [tenant_col, user_col]).join(
                user_res_cf_df_model.res_feature_vector_mapping_df,
                [tenant_col, res_col]).select(
                    tenant_col, user_col, user_vec_col, res_col, res_vec_col,
                    dot(f.col(user_vec_col),
                        f.col(res_vec_col)).alias(likelihood_col_token))

        # Fit a per-tenant standard scaler on the likelihood column; only
        # its per-group statistics (mean and std per tenant) are used below.
        scaler_model = scalers.StandardScalarScaler(likelihood_col_token,
                                                    tenant_col,
                                                    user_vec_col).fit(fixed_df)

        per_group_stats: DataFrame = scaler_model.per_group_stats
        assert isinstance(per_group_stats, DataFrame)

        append2user_bias = self._make_append_bias(user_col, res_col, user_col,
                                                  user_col, self.rank)
        append2res_bias = self._make_append_bias(user_col, res_col, res_col,
                                                 user_col, self.rank)

        # Fold the per-tenant standardization into the user vectors by
        # appending bias terms derived from the group mean and std, guarding
        # against division by zero for groups whose std is 0.
        fixed_user_mapping_df = user_res_cf_df_model.user_feature_vector_mapping_df.join(
            per_group_stats, tenant_col).select(
                tenant_col, user_col,
                append2user_bias(
                    f.col(user_vec_col),
                    f.lit(-1.0) *
                    f.col(scalers.StandardScalarScalerConfig.mean_token),
                    f.lit(-1.0) / f.when(
                        f.col(scalers.StandardScalarScalerConfig.std_token)
                        != 0.0,
                        f.col(scalers.StandardScalarScalerConfig.std_token
                              )).otherwise(f.lit(1.0))).alias(user_vec_col))

        # Append a constant bias of 0 to the resource vectors as the
        # counterpart of the user-vector augmentation above.
        fixed_res_mapping_df = user_res_cf_df_model.res_feature_vector_mapping_df.join(
            per_group_stats, tenant_col).select(
                tenant_col, res_col,
                append2res_bias(f.col(res_vec_col),
                                f.lit(0)).alias(res_vec_col))

        # Return a copy of the mapping model with the standardized
        # mappings swapped in.
        return user_res_cf_df_model.replace_mappings(fixed_user_mapping_df,
                                                     fixed_res_mapping_df)
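
`_make_dot` is not shown here. Given how its result is used (producing a scalar likelihood from two vector columns), a minimal sketch of the kind of UDF it presumably returns, assuming the vector columns hold equal-length arrays of doubles:

from typing import List

from pyspark.sql import functions as f, types as t

def _make_dot():
    # Hypothetical sketch; the real implementation may differ.
    @f.udf(returnType=t.DoubleType())
    def dot(u: List[float], v: List[float]) -> float:
        return float(sum(a * b for a, b in zip(u, v)))
    return dot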
Example #4
    def test_unpartitioned_standard_scaler(self):
        ls = scalers.StandardScalarScaler(
            'score',      # input column
            None,         # no partition key: scale over the whole column
            'new_score',  # output column
            1.0,          # coefficient factor
            use_pandas=False)

        df = self.create_sample_dataframe()
        model = ls.fit(df)
        new_df = model.transform(df)

        assert new_df.count() == df.count()

        # Without a partition key, all scores are standardized together.
        new_scores = new_df.toPandas()['new_score']

        assert new_scores.to_numpy().mean() == 0.0
        assert abs(new_scores.to_numpy().std() - 1.0) < 0.0001
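
With `partitionKey` set to `None`, the mean and standard deviation are computed over the whole column rather than per tenant. A quick numpy check of what the two assertions above expect, using the hypothetical sample scores sketched under Example #1:

import numpy as np

scores = np.array([1.0, 3.0, 5.0, 9.0, 42.0])     # hypothetical sample scores
scaled = (scores - scores.mean()) / scores.std()  # population std (ddof=0)

assert abs(scaled.mean()) < 1e-9
assert abs(scaled.std() - 1.0) < 1e-4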