# Imports assumed by this excerpt (supplied here for completeness):
from typing import Type

import pyspark.sql.functions as f
from pyspark.sql import DataFrame


def test_partitioned_standard_scaler(self):
    ls = scalers.StandardScalarScaler('score',
                                      'tenant',
                                      'new_score',
                                      1.0,
                                      use_pandas=False)

    df = self.create_sample_dataframe()
    model = ls.fit(df)
    new_df = model.transform(df)

    assert new_df.count() == df.count()

    for tenant in ['t1', 't2', 't3']:
        new_scores = new_df.filter(
            f.col('tenant') == tenant).toPandas()['new_score']
        assert new_scores is not None

        the_mean = new_scores.to_numpy().mean()
        the_std = new_scores.to_numpy().std()
        tenant_scores = [s for _, s in new_scores.items()]

        assert the_mean == 0.0

        if tenant != 't3':
            # a two-row partition standardizes to exactly (-1.0, +1.0)
            assert abs(the_std - 1.0) < 0.0001, str(the_std)
            assert len(tenant_scores) == 2
            assert tenant_scores[0] == -1.0
            assert tenant_scores[1] == 1.0
        else:
            # a single-row partition has zero variance and scales to 0.0
            assert the_std == 0.0
            assert len(tenant_scores) == 1
            assert tenant_scores[0] == 0.0
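# The fixture below is a hypothetical sketch of what create_sample_dataframe
# could return, reverse-engineered from the assertions above: 't1' and 't2'
# each need two distinct scores (so they standardize to exactly -1.0 and
# +1.0), while 't3' needs a single row (zero variance, scaled to 0.0).
# The concrete values and 'self.spark' are assumptions; the real fixture in
# the suite may differ.
def create_sample_dataframe(self):
    return self.spark.createDataFrame(
        [
            ('t1', 1.0),
            ('t1', 3.0),   # any two distinct values give std > 0
            ('t2', 10.0),
            ('t2', 20.0),
            ('t3', 7.0),   # lone row: std == 0, scaled score == 0
        ],
        schema=['tenant', 'score'])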
def test_explain(self):
    types = [str, float]

    def counts(c: int, tt: Type):
        # every str- or float-typed param must appear at least once in
        # the explanation text
        return tt not in types or c > 0

    params = ['inputCol', 'partitionKey', 'outputCol', 'coefficientFactor']

    self.check_explain(
        scalers.StandardScalarScaler('input', 'tenant', 'output'),
        params,
        counts)
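# check_explain itself lives in the shared test base class and is not shown
# here. A minimal, self-contained sketch of the check it appears to perform,
# assuming StandardScalarScaler follows the pyspark.ml Params convention
# where explainParams() returns one documented line per declared parameter:
def sketch_check_explain():
    scaler = scalers.StandardScalarScaler('input', 'tenant', 'output')
    explanation = scaler.explainParams()
    for name in ['inputCol', 'partitionKey', 'outputCol', 'coefficientFactor']:
        # every declared parameter should be documented at least once
        assert explanation.count(name) > 0, name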
def transform(
        self,
        user_res_cf_df_model: _UserResourceFeatureVectorMapping
) -> _UserResourceFeatureVectorMapping:
    likelihood_col_token = '__likelihood__'

    dot = _make_dot()

    tenant_col = user_res_cf_df_model.tenant_col
    user_col = user_res_cf_df_model.user_col
    user_vec_col = user_res_cf_df_model.user_vec_col
    res_col = user_res_cf_df_model.res_col
    res_vec_col = user_res_cf_df_model.res_vec_col

    # score every access event with the dot product of its user and
    # resource embeddings
    fixed_df = self.access_df.join(
        user_res_cf_df_model.user_feature_vector_mapping_df,
        [tenant_col, user_col]).join(
            user_res_cf_df_model.res_feature_vector_mapping_df,
            [tenant_col, res_col]).select(
                tenant_col, user_col, user_vec_col, res_col, res_vec_col,
                dot(
                    f.col(user_vec_col),
                    f.col(res_vec_col)).alias(likelihood_col_token))

    # fit per-tenant mean/std of the likelihood scores
    scaler_model = scalers.StandardScalarScaler(likelihood_col_token,
                                                tenant_col,
                                                user_vec_col).fit(fixed_df)

    per_group_stats: DataFrame = scaler_model.per_group_stats
    assert isinstance(per_group_stats, DataFrame)

    append2user_bias = self._make_append_bias(user_col, res_col, user_col,
                                              user_col, self.rank)
    append2res_bias = self._make_append_bias(user_col, res_col, res_col,
                                             user_col, self.rank)

    # fold the per-tenant mean and standard deviation into the user
    # embeddings so downstream dot products yield standardized likelihoods;
    # a zero std (constant likelihood) falls back to 1.0 to avoid division
    # by zero
    fixed_user_mapping_df = user_res_cf_df_model.user_feature_vector_mapping_df.join(
        per_group_stats, tenant_col).select(
            tenant_col, user_col,
            append2user_bias(
                f.col(user_vec_col),
                f.lit(-1.0) *
                f.col(scalers.StandardScalarScalerConfig.mean_token),
                f.lit(-1.0) / f.when(
                    f.col(scalers.StandardScalarScalerConfig.std_token) != 0.0,
                    f.col(scalers.StandardScalarScalerConfig.std_token)
                ).otherwise(f.lit(1.0))).alias(user_vec_col))

    # resource embeddings only receive a neutral bias slot
    fixed_res_mapping_df = user_res_cf_df_model.res_feature_vector_mapping_df.join(
        per_group_stats, tenant_col).select(
            tenant_col, res_col,
            append2res_bias(f.col(res_vec_col), f.lit(0)).alias(res_vec_col))

    return user_res_cf_df_model.replace_mappings(fixed_user_mapping_df,
                                                 fixed_res_mapping_df)
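# Why appending bias terms works: one way to fold (u.r - mean) / std into
# the embeddings themselves is to divide the user vector by std and append
# -mean/std to it, while appending a constant 1 to the resource vector, so
# that the plain dot product of the extended vectors equals the standardized
# likelihood. The numpy sketch below demonstrates that identity;
# _make_append_bias may arrange the extra components differently, but the
# underlying algebra is the same.
import numpy as np

def standardization_via_append_sketch():
    u = np.array([0.5, -1.2, 2.0])   # user embedding
    r = np.array([1.0, 0.3, -0.7])   # resource embedding
    mean, std = 0.4, 2.5             # per-tenant likelihood stats

    u_ext = np.append(u / std, -mean / std)  # stats folded into the user side
    r_ext = np.append(r, 1.0)                # resource side gets a constant slot

    # (u/std).r + (-mean/std) * 1 == (u.r - mean) / std
    assert np.isclose(u_ext @ r_ext, (u @ r - mean) / std)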
def test_unpartitioned_standard_scaler(self):
    ls = scalers.StandardScalarScaler('score',
                                      None,
                                      'new_score',
                                      1.0,
                                      use_pandas=False)

    df = self.create_sample_dataframe()
    model = ls.fit(df)
    new_df = model.transform(df)

    assert new_df.count() == df.count()

    new_scores = new_df.toPandas()['new_score']
    assert new_scores.to_numpy().mean() == 0.0
    assert abs(new_scores.to_numpy().std() - 1.0) < 0.0001
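# A hedged usage sketch outside the test harness: fitting the scaler on an
# ad-hoc DataFrame. The column names and values here are illustrative only,
# and 'spark' is assumed to be an active SparkSession.
def scale_scores_example(spark):
    df = spark.createDataFrame(
        [('t1', 1.0), ('t1', 3.0), ('t2', 5.0), ('t2', 9.0)],
        schema=['tenant', 'score'])

    # partitionKey='tenant' standardizes each tenant independently;
    # passing None instead (as in the test above) standardizes over the
    # whole DataFrame
    scaler = scalers.StandardScalarScaler('score', 'tenant', 'new_score')
    model = scaler.fit(df)
    return model.transform(df)  # adds the standardized 'new_score' column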