def test_multi_indexer_undo_transform(self):
    """Verify that undo_transform recovers the original user/res names."""
    the_indexer = indexers.MultiIndexer([
        indexers.IdIndexer('user', 'tenant', 'actual_uid', True),
        indexers.IdIndexer('res', 'tenant', 'actual_rid', True),
    ])
    df = self.create_sample_dataframe()
    fitted = the_indexer.fit(df)
    indexed_df = fitted.transform(df)

    # generated indices must be strictly positive
    for id_col in ['actual_uid', 'actual_rid']:
        assert indexed_df.filter(f.col(id_col) <= 0).count() == 0

    # map the indices back to names, starting from the index columns only
    recovered_df = fitted.undo_transform(
        indexed_df.select('tenant', 'actual_uid', 'actual_rid'))

    # the distinct (tenant, name) pairs must round-trip exactly
    for name_col in ['user', 'res']:
        expected_rows = df.select(
            'tenant', name_col).distinct().orderBy('tenant', name_col).collect()
        recovered_rows = recovered_df.select(
            'tenant', name_col).distinct().orderBy('tenant', name_col).collect()
        assert recovered_rows == expected_rows
def test_multi_indexer_non_per_tenant(self):
    """With reset_per_partition=False, ids are contiguous across all tenants."""
    the_indexer = indexers.MultiIndexer([
        indexers.IdIndexer('user', 'tenant', 'actual_uid', False),
        indexers.IdIndexer('res', 'tenant', 'actual_rid', False),
    ])
    df = self.create_sample_dataframe()
    fitted = the_indexer.fit(df)
    indexed_df = fitted.transform(df)

    # transform keeps every row and emits only positive ids
    assert indexed_df.count() == df.count()
    assert indexed_df.filter(f.col('actual_uid') <= 0).count() == 0
    assert indexed_df.filter(f.col('actual_rid') <= 0).count() == 0

    user_count = df.select('tenant', 'user').distinct().count()
    res_count = df.select('tenant', 'res').distinct().count()

    # one distinct id per distinct (tenant, name) pair
    assert indexed_df.select('actual_uid').distinct().count() == user_count
    assert indexed_df.select('actual_rid').distinct().count() == res_count

    # ids cover the contiguous range [1, count] for users and resources alike
    stats = indexed_df.select(
        f.min('actual_uid').alias('min_uid'),
        f.max('actual_uid').alias('max_uid'),
        f.min('actual_rid').alias('min_rid'),
        f.max('actual_rid').alias('max_rid'),
    ).first()
    assert stats['min_uid'] == 1 and stats['max_uid'] == user_count
    assert stats['min_rid'] == 1 and stats['max_rid'] == res_count

    # undo_transform restores the original (tenant, user, res) triples
    recovered_df = fitted.undo_transform(indexed_df).select(
        'tenant', 'user', 'res').orderBy('tenant', 'user', 'res')
    expected_rows = df.select(
        'tenant', 'user', 'res').orderBy('tenant', 'user', 'res').collect()
    assert expected_rows == recovered_df.collect()
def test_explain(self):
    """IdIndexer's explain output must cover all of its declared params."""
    checked_types = (str, bool)

    def has_enough(count: int, param_type: Type):
        # only str/bool typed params are required to appear at least once
        return param_type not in checked_types or count > 0

    expected_params = [
        'inputCol', 'partitionKey', 'outputCol', 'resetPerPartition']
    self.check_explain(
        indexers.IdIndexer('input', 'tenant', 'output', True),
        expected_params,
        has_enough)
def test_multi_indexer(self):
    """MultiIndexer must reproduce the precomputed expected id columns."""
    the_indexer = indexers.MultiIndexer([
        indexers.IdIndexer('user', 'tenant', 'actual_uid', True),
        indexers.IdIndexer('res', 'tenant', 'actual_rid', True),
    ])
    df = self.create_sample_dataframe()
    indexed_df = the_indexer.fit(df).transform(df)

    # row count is preserved and all ids are strictly positive
    assert indexed_df.count() == df.count()
    assert indexed_df.filter(f.col('actual_uid') <= 0).count() == 0
    assert indexed_df.filter(f.col('actual_rid') <= 0).count() == 0

    # every generated id must equal the expectation baked into the sample
    for expected_col, actual_col in [('expected_uid', 'actual_uid'),
                                     ('expected_rid', 'actual_rid')]:
        mismatches = indexed_df.filter(
            f.col(expected_col) != f.col(actual_col))
        assert 0 == mismatches.count()
def test_id_indexer(self):
    """A single IdIndexer reproduces the expected per-tenant user ids."""
    df = self.create_sample_dataframe()
    fitted = indexers.IdIndexer('user', 'tenant', 'actual_uid', True).fit(df)
    indexed_df = fitted.transform(df)

    # transform keeps every row
    assert indexed_df.count() == df.count()

    # generated ids must match the expectation column in the sample data
    mismatches = indexed_df.filter(
        f.col('expected_uid') != f.col('actual_uid'))
    assert 0 == mismatches.count()
def _fit(self, df: DataFrame) -> AccessAnomalyModel:
    """Fit an AccessAnomalyModel on the access log *df*.

    Pipeline:
      1. index user/resource name columns to dense integer ids,
      2. enrich and normalize the indexed dataframe,
      3. build user/resource feature-vector mappings (collaborative filtering),
      4. normalize the resulting model vectors,
      5. map indices back to the original names via undo_transform,
      6. compute connected-component mappings over the access history and
         package everything into the returned model.
    """
    # index the user and resource columns to allow running the spark ALS algorithm
    the_indexer = indexers.MultiIndexer(indexers=[
        indexers.IdIndexer(input_col=self.user_col,
                           partition_key=self.tenant_col,
                           output_col=self.indexed_user_col,
                           reset_per_partition=self.separate_tenants),
        indexers.IdIndexer(input_col=self.res_col,
                           partition_key=self.tenant_col,
                           output_col=self.indexed_res_col,
                           reset_per_partition=self.separate_tenants)
    ])
    the_indexer_model = the_indexer.fit(df)

    # indexed_df is the dataframe with the indices for user and resource
    indexed_df = the_indexer_model.transform(df)
    # cached because it is consumed twice below (vectors + normalization)
    enriched_df = self._enrich_and_normalize(indexed_df).cache()

    user_res_feature_vector_mapping_df = self.create_spark_model_vectors_df(
        enriched_df)
    user_res_norm_cf_df_model = ModelNormalizeTransformer(
        enriched_df,
        self.rank_param).transform(user_res_feature_vector_mapping_df)

    # convert user and resource indices back to names
    user_index_model = the_indexer_model.get_model_by_input_col(
        self.user_col)
    res_index_model = the_indexer_model.get_model_by_input_col(
        self.res_col)
    # both input columns were registered on the_indexer above,
    # so the per-column model lookups must succeed
    assert user_index_model is not None and res_index_model is not None

    norm_user_mapping_df = user_res_norm_cf_df_model.user_feature_vector_mapping_df
    norm_res_mapping_df = user_res_norm_cf_df_model.res_feature_vector_mapping_df

    indexed_user_col = self.indexed_user_col
    indexed_res_col = self.indexed_res_col

    # do the actual index to name mapping (using undo_transform);
    # the raw index columns are dropped since the names replace them
    final_user_mapping_df = user_index_model.undo_transform(
        norm_user_mapping_df).drop(indexed_user_col)
    final_res_mapping_df = res_index_model.undo_transform(
        norm_res_mapping_df).drop(indexed_res_col)

    tenant_col, user_col, res_col = self.tenant_col, self.user_col, self.res_col
    history_access_df = self.history_access_df
    # prefer an explicitly supplied history; otherwise derive it from df
    access_df = \
        history_access_df if history_access_df is not None else df.select(
            tenant_col, user_col, res_col).cache()
    user2component_mappings_df, res2component_mappings_df = ConnectedComponents(
        tenant_col, user_col, res_col).transform(access_df)

    return AccessAnomalyModel(
        _UserResourceFeatureVectorMapping(
            tenant_col=self.tenant_col,
            user_col=self.user_col,
            user_vec_col=self.user_vec_col,
            res_col=self.res_col,
            res_vec_col=self.res_vec_col,
            history_access_df=history_access_df,
            user2component_mappings_df=user2component_mappings_df,
            res2component_mappings_df=res2component_mappings_df,
            user_feature_vector_mapping_df=final_user_mapping_df.cache(),
            res_feature_vector_mapping_df=final_res_mapping_df.cache()),
        self.output_col)
def test_enrich_and_normalize(self):
    """End-to-end check of indexing + _enrich_and_normalize invariants.

    Verifies that indexing produces positive non-null ids, that
    undo_transform restores non-null names, and that the enrichment
    assigns scaled likelihoods consistently with departments:
    same-department pairs get a scaled value in [low_value, high_value],
    cross-department pairs get exactly 1.0.
    """
    training = Dataset.create_new_training(1.0).cache()
    access_anomaly = AccessAnomaly(
        tenantCol=AccessAnomalyConfig.default_tenant_col,
        maxIter=10,
        applyImplicitCf=False)

    # pull the configured column names once for readability below
    tenant_col = access_anomaly.tenant_col
    user_col = access_anomaly.user_col
    indexed_user_col = access_anomaly.indexed_user_col
    res_col = access_anomaly.res_col
    indexed_res_col = access_anomaly.indexed_res_col
    scaled_likelihood_col = access_anomaly.scaled_likelihood_col

    # sanity: the generated training data has no null names
    assert training.filter(f.col(user_col).isNull()).count() == 0
    assert training.filter(f.col(res_col).isNull()).count() == 0

    the_indexer = indexers.MultiIndexer(indexers=[
        indexers.IdIndexer(input_col=user_col,
                           partition_key=tenant_col,
                           output_col=indexed_user_col,
                           reset_per_partition=False),
        indexers.IdIndexer(input_col=res_col,
                           partition_key=tenant_col,
                           output_col=indexed_res_col,
                           reset_per_partition=False)
    ])
    the_indexer_model = the_indexer.fit(training)
    indexed_df = materialized_cache(the_indexer_model.transform(training))

    # indexing must yield non-null, strictly positive ids
    assert indexed_df.filter(f.col(indexed_user_col).isNull()).count() == 0
    assert indexed_df.filter(f.col(indexed_res_col).isNull()).count() == 0
    assert indexed_df.filter(f.col(indexed_user_col) <= 0).count() == 0
    assert indexed_df.filter(f.col(indexed_res_col) <= 0).count() == 0

    # undo_transform must restore non-null name columns
    unindexed_df = materialized_cache(
        the_indexer_model.undo_transform(indexed_df))
    assert unindexed_df.filter(f.col(user_col).isNull()).count() == 0
    assert unindexed_df.filter(f.col(res_col).isNull()).count() == 0

    enriched_indexed_df = materialized_cache(
        access_anomaly._enrich_and_normalize(indexed_df))
    enriched_df = materialized_cache(
        without_ffa(the_indexer_model.undo_transform(enriched_indexed_df)))
    assert enriched_df.filter(f.col(user_col).isNull()).count() == 0
    assert enriched_df.filter(f.col(res_col).isNull()).count() == 0

    # same department never has likelihood exactly 1.0,
    # different departments always have likelihood exactly 1.0
    assert enriched_df.filter(
        (get_department(user_col) == get_department(res_col)) &
        (f.col(scaled_likelihood_col) == 1.0)).count() == 0
    assert enriched_df.filter(
        (get_department(user_col) != get_department(res_col)) &
        (f.col(scaled_likelihood_col) != 1.0)).count() == 0
    # the two partitions by department match the two partitions by likelihood
    assert enriched_df.filter(
        (get_department(user_col) != get_department(res_col))).count() == enriched_df.filter(
        f.col(scaled_likelihood_col) == 1.0).count()
    assert enriched_df.filter(
        (get_department(user_col) == get_department(res_col)
         )).count() == enriched_df.filter(
        f.col(scaled_likelihood_col) != 1.0).count()

    low_value = access_anomaly.low_value
    high_value = access_anomaly.high_value
    # enrichment adds rows beyond the raw training data
    assert enriched_df.count() > training.count()
    # every scaled likelihood is either in [low_value, high_value] or exactly 1.0
    assert enriched_df.filter((
        (f.col(scaled_likelihood_col) >= low_value) &
        (f.col(scaled_likelihood_col) <= high_value)) |
        (f.col(scaled_likelihood_col) == 1.0)
    ).count() == enriched_df.count()