def test_correlations(queries_and_models):
    """Check the shape and labels of the ranking correlation matrix.

    Three aggregated result frames (WEAT on the gender queries, WEAT on the
    negative test queries and RND on the gender queries) are ranked with
    ``create_ranking``; the ranking is then passed to
    ``calculate_ranking_correlations`` and the resulting frame must be 3x3
    with the aggregated score names as both columns and index.
    """
    gender_queries, negative_test_queries, models = queries_and_models

    weat_gender = run_queries(
        WEAT,
        gender_queries,
        models,
        queries_set_name='Gender Queries',
        aggregate_results=True,
    )
    weat_negative = run_queries(
        WEAT,
        negative_test_queries,
        models,
        queries_set_name='Negative Test Queries',
        aggregate_results=True,
        return_only_aggregation=True,
    )
    rnd_gender = run_queries(
        RND,
        gender_queries,
        models,
        queries_set_name='Gender Queries',
        generate_subqueries=True,
        aggregate_results=True,
    )

    ranking = create_ranking([weat_gender, weat_negative, rnd_gender])
    assert ranking.shape == (3, 3)

    correlations = calculate_ranking_correlations(ranking)
    assert correlations.shape == (3, 3)

    # The same labels are expected on both axes of the correlation frame.
    expected_labels = np.array([
        'WEAT: Gender Queries average of abs values score',
        'WEAT: Negative Test Queries average of abs values score',
        'RND: Gender Queries average of abs values score',
    ])
    assert np.array_equal(correlations.columns.values, expected_labels)
    assert np.array_equal(correlations.index.values, expected_labels)
def test_ranking_results(queries_and_models):
    """Check ``create_ranking`` input validation and the range of its ranks.

    NOTE(review): another function with this exact name is defined later in
    this module. Python keeps only the last binding, so this version is
    shadowed and will not be collected by pytest unless one of the two is
    renamed.
    """
    gender_queries, negative_test_queries, models = queries_and_models

    weat_gender = run_queries(
        WEAT,
        gender_queries,
        models,
        queries_set_name='Gender Queries',
        aggregate_results=True,
    )
    weat_negative = run_queries(
        WEAT,
        negative_test_queries,
        models,
        queries_set_name='Negative Test Queries',
        aggregate_results=True,
        return_only_aggregation=True,
    )
    rnd_gender = run_queries(
        RND,
        gender_queries,
        models,
        queries_set_name='Gender Queries',
        generate_subqueries=True,
        aggregate_results=True,
    )

    # Any element that is not a DataFrame must be rejected, whatever its
    # position in the list.
    not_a_dataframe_pattern = (
        'All elements of results_dataframes must be a pandas '
        'Dataframe instance*')
    invalid_inputs = (
        [None, weat_gender, weat_negative],
        [weat_gender, weat_negative, 2],
    )
    for invalid_input in invalid_inputs:
        with pytest.raises(TypeError, match=not_a_dataframe_pattern):
            create_ranking(invalid_input)

    ranking = create_ranking([weat_gender, weat_negative, rnd_gender])
    assert ranking.shape == (3, 3)

    # With three models every rank must lie between 1 and 3.
    for rank_row in ranking.values:
        for rank in rank_row:
            assert 1 <= rank <= 3
def test_run_query_input_validation(queries_and_models):
    """Verify that ``run_queries`` rejects malformed arguments.

    Each case calls ``run_queries`` with the WEAT metric and one invalid
    argument, and expects an exception of the given type whose message
    matches the given pattern.
    """
    # -----------------------------------------------------------------
    # Input checks
    # -----------------------------------------------------------------

    # Load the inputs of the fixture
    gender_queries, _, models = queries_and_models

    def expect_rejection(error_type, message_pattern, *args, **kwargs):
        """Call ``run_queries(WEAT, ...)`` and require the given failure."""
        with pytest.raises(error_type, match=message_pattern):
            run_queries(WEAT, *args, **kwargs)

    # queries argument
    expect_rejection(
        TypeError,
        'queries parameter must be a list or a numpy array*',
        None, None)
    expect_rejection(
        Exception,
        'queries list must have at least one query instance*',
        [], None)
    expect_rejection(
        TypeError,
        'item on index 0 must be a Query instance*',
        [None, None], None)
    expect_rejection(
        TypeError,
        'item on index 3 must be a Query instance*',
        gender_queries + [None], None)

    # word embedding models argument
    expect_rejection(
        TypeError,
        'word_embeddings_models parameter must be a list or a '
        'numpy array*',
        gender_queries, None)
    expect_rejection(
        Exception,
        'word_embeddings_models parameter must be a non empty list '
        'or numpy array*',
        gender_queries, [])
    expect_rejection(
        TypeError,
        'item on index 0 must be a WordEmbeddingModel instance*',
        gender_queries, [None])
    expect_rejection(
        TypeError,
        'item on index 3 must be a WordEmbeddingModel instance*',
        gender_queries, models + [None])

    # queries_set_name
    set_name_pattern = (
        'When queries_set_name parameter is provided, it must be a '
        'non-empty string*')
    expect_rejection(TypeError, set_name_pattern,
                     gender_queries, models, queries_set_name=None)
    expect_rejection(TypeError, set_name_pattern,
                     gender_queries, models, queries_set_name="")

    # metric_params
    expect_rejection(
        TypeError,
        'run_experiment_params must be a dict with a params'
        ' for the metric*',
        gender_queries, models, metric_params=None)

    # aggregate_results
    expect_rejection(
        Exception,
        'aggregate_results parameter must be a bool value*',
        gender_queries, models, aggregate_results=None)

    # aggregation_function (pattern text is kept exactly as in the original
    # test, including its quoting)
    aggregation_pattern = (
        'aggregation_function must be one of \'sum\','
        'abs_sum\', \'avg\', \'abs_avg\' or a callable.*')
    expect_rejection(Exception, aggregation_pattern,
                     gender_queries, models, aggregation_function=None)
    expect_rejection(Exception, aggregation_pattern,
                     gender_queries, models, aggregation_function='hello')

    # return_only_aggregation
    expect_rejection(
        Exception,
        'return_only_aggregation param must be boolean.*',
        gender_queries, models, return_only_aggregation=None)
def test_run_query(queries_and_models):
    """Exercise ``run_queries`` across its main execution modes.

    Covers a plain run, runs with ``lost_vocabulary_threshold`` and
    ``metric_params``, every named aggregation function plus a custom
    callable, the ``return_only_aggregation`` / ``queries_set_name``
    combinations, subquery generation with RND, and a run with all of these
    parameters combined.
    """
    # Load the inputs of the fixture
    gender_queries, negative_test_queries, models = queries_and_models

    def check_results_types(results, only_negative=False):
        """Assert every cell is a NumPy float (and <= 0 when requested)."""
        for row in results.values:
            for value in row:
                # np.float64 is used instead of the np.float_ alias, which
                # was removed in NumPy 2.0; both name the same type.
                assert isinstance(value, np.float64)
                if only_negative:
                    assert value <= 0

    def check_aggregation(aggregation_function, expected_aggregation,
                          only_negative=False):
        """Run the negative queries and compare the aggregated column.

        The last column of the result must equal ``expected_aggregation``
        applied to the two per-query score columns.
        """
        results = run_queries(WEAT,
                              negative_test_queries,
                              models,
                              aggregate_results=True,
                              aggregation_function=aggregation_function)
        assert results.shape == (3, 3)
        check_results_types(results, only_negative=only_negative)
        aggregated = results.values[:, 2]
        query_scores = results.values[:, 0:2]
        assert np.array_equal(aggregated, expected_aggregation(query_scores))

    # -----------------------------------------------------------------
    # Basic run_queries execution
    # -----------------------------------------------------------------
    results = run_queries(WEAT, gender_queries, models)

    assert isinstance(results, pd.DataFrame)
    assert results.shape == (3, 3)

    # Check cols
    expected_cols = [
        'Male terms and Female terms wrt Career and Family',
        'Male terms and Female terms wrt Science and Arts',
        'Male terms and Female terms wrt Math and Arts',
    ]
    assert list(results.columns) == expected_cols

    # Check index. The original statement was ``assert given_idx,
    # expected_idx``, which only tested truthiness and never compared the
    # two values; the index is now compared for real.
    expected_index = ['dummy_model_1', 'dummy_model_2', 'dummy_model_3']
    assert list(results.index) == expected_index

    # Check values
    check_results_types(results)

    results = run_queries(WEAT, negative_test_queries, models)
    check_results_types(results, only_negative=True)

    # -----------------------------------------------------------------
    # run_queries with params execution
    # -----------------------------------------------------------------

    # lost_vocabulary_threshold: the extra query uses made-up words, so at
    # least one cell of the result is expected to be null.
    word_sets = load_weat()
    dummy_query_1 = Query(
        [['bla', 'ble', 'bli'], word_sets['insects']],
        [word_sets['pleasant_9'], word_sets['unpleasant_9']])

    results = run_queries(WEAT,
                          gender_queries + [dummy_query_1],
                          models,
                          lost_vocabulary_threshold=0.1)
    assert results.shape == (3, 4)
    assert results.isnull().any().any()
    check_results_types(results)

    # metric param...
    results = run_queries(WEAT,
                          gender_queries,
                          models,
                          metric_params={'return_effect_size': True})
    assert results.shape == (3, 3)
    check_results_types(results)

    # -----------------------------------------------------------------
    # run_queries with aggregation params execution
    # -----------------------------------------------------------------

    # default aggregation: one aggregated column is appended to the three
    # per-query columns.
    results = run_queries(WEAT, gender_queries, models, aggregate_results=True)
    assert results.shape == (3, 4)
    check_results_types(results)

    # avg
    check_aggregation('avg', lambda scores: np.mean(scores, axis=1))

    # abs avg
    check_aggregation('abs_avg',
                      lambda scores: np.mean(np.abs(scores), axis=1))

    # sum
    check_aggregation('sum', lambda scores: np.sum(scores, axis=1))

    # abs_sum
    check_aggregation('abs_sum',
                      lambda scores: np.sum(np.abs(scores), axis=1))

    # custom agg function
    check_aggregation(lambda df: -df.abs().mean(1),
                      lambda scores: -np.mean(np.abs(scores), axis=1),
                      only_negative=True)

    # return only aggregation without query name
    results = run_queries(WEAT,
                          gender_queries,
                          models,
                          aggregate_results=True,
                          aggregation_function='abs_avg',
                          return_only_aggregation=True)
    assert results.shape == (3, 1)
    check_results_types(results)
    assert results.columns[
        -1] == 'WEAT: Unnamed queries set average of abs values score'

    # return only aggregation with query name
    results = run_queries(WEAT,
                          gender_queries,
                          models,
                          aggregate_results=True,
                          aggregation_function='abs_avg',
                          queries_set_name='Gender queries',
                          return_only_aggregation=True)
    assert results.shape == (3, 1)
    check_results_types(results)
    assert results.columns[
        -1] == 'WEAT: Gender queries average of abs values score'

    # return aggregation with query name
    results = run_queries(WEAT,
                          gender_queries,
                          models,
                          aggregate_results=True,
                          aggregation_function='abs_avg',
                          queries_set_name='Gender queries',
                          return_only_aggregation=False)
    assert results.shape == (3, 4)
    check_results_types(results)
    assert results.columns[
        -1] == 'WEAT: Gender queries average of abs values score'

    # -----------------------------------------------------------------
    # run_queries with generate subqueries params execution
    # -----------------------------------------------------------------

    # with this option, the gender queries must be divided in RND template
    # (2,1). with one query replicated (arts), the remaining are only 5.
    results = run_queries(RND,
                          gender_queries,
                          models,
                          generate_subqueries=True)
    assert results.shape == (3, 5)
    check_results_types(results)

    # -----------------------------------------------------------------
    # run_queries full params execution
    # -----------------------------------------------------------------
    results = run_queries(RND,
                          gender_queries,
                          models,
                          queries_set_name='Gender queries',
                          generate_subqueries=True,
                          aggregate_results=True,
                          aggregation_function='abs_avg',
                          return_only_aggregation=False,
                          metric_params={'distance_type': 'cos'})
    assert results.shape == (3, 6)
    check_results_types(results)
    assert results.columns[
        -1] == 'RND: Gender queries average of abs values score'
def test_ranking_results(queries_and_models):
    """Check ``create_ranking`` input validation and its exact output.

    NOTE(review): an earlier function in this module has the same name;
    this later definition replaces it, so only this version is collected by
    pytest.
    """
    gender_queries, negative_test_queries, models = queries_and_models

    weat_gender = run_queries(
        WEAT,
        gender_queries,
        models,
        queries_set_name='Gender Queries',
        aggregate_results=True,
    )
    weat_negative = run_queries(
        WEAT,
        negative_test_queries,
        models,
        queries_set_name='Negative Test Queries',
        aggregate_results=True,
    )
    rnd_gender = run_queries(
        RND,
        gender_queries,
        models,
        queries_set_name='Gender Queries',
        generate_subqueries=True,
        aggregate_results=True,
    )

    # Any element that is not a DataFrame must be rejected, whatever its
    # position in the list.
    not_a_dataframe_pattern = (
        'All elements of results_dataframes must be a pandas '
        'Dataframe instance*')
    invalid_inputs = (
        [None, weat_gender, weat_negative],
        [weat_gender, weat_negative, 2],
    )
    for invalid_input in invalid_inputs:
        with pytest.raises(TypeError, match=not_a_dataframe_pattern):
            create_ranking(invalid_input)

    ranking = create_ranking([weat_gender, weat_negative, rnd_gender])

    # Every score column is expected to rank the dummy models 1, 2, 3.
    score_columns = [
        'WEAT: Gender Queries average of abs values score',
        'WEAT: Negative Test Queries average of abs values score',
        'RND: Gender Queries average of abs values score',
    ]
    expected_ranking = pd.DataFrame({
        column: {
            'dummy_model_1': 1.0,
            'dummy_model_2': 2.0,
            'dummy_model_3': 3.0,
        }
        for column in score_columns
    })

    assert ranking.shape == (3, 3)
    assert expected_ranking.equals(ranking)

    # With three models every rank must lie between 1 and 3.
    for rank_row in ranking.values:
        for rank in rank_row:
            assert 1 <= rank <= 3