def test_load_from_disk(self):
    """Check what `GenderNounDataHandler.load_from_disk` returns for stored JSON.

    Every case writes a JSON document to the scratch file ``test.gdn`` and
    compares the loaded result with the expected in-memory form: values
    stored as a list under a ``"warning"`` key are expected back as a set,
    everything else is expected back unchanged.
    """
    path = "test.gdn"
    cases = [
        # dict with warning: the stored list should be loaded as a set
        ({"wuwu": {"warning": ["warning1", "warning2"]}},
         {"wuwu": {"warning": {"warning1", "warning2"}}}),
        # dict without warning: should be loaded exactly as stored
        ({"wuwu": {"fufu": "wawa"}},
         {"wuwu": {"fufu": "wawa"}}),
        # a mixture of both kinds of entries
        ({"wuwu": {"warning": ["warning1", "warning2"]}, "wawa": {"fufu": "wawa"}},
         {"wuwu": {"warning": {"warning1", "warning2"}}, "wawa": {"fufu": "wawa"}}),
    ]
    try:
        for stored, expected in cases:
            # save a file to disk to test it:
            with open(path, "w") as test_file:
                test_file.write(json.dumps(stored))
            # test if it loads as expected:
            self.assertEqual(GenderNounDataHandler.load_from_disk(path), expected)
    finally:
        # delete the scratch file even if one of the assertions above failed,
        # so a failing run does not leave `test.gdn` behind
        if os.path.exists(path):
            os.remove(path)
def test_make_sure_all_referenced_words_exist(self):
    """Check that words referenced in a gender map get their own entry.

    A dict whose gender maps reference no missing words must come back
    unchanged; a referenced word without an entry is expected to be added
    with a link pointing back to the word that referenced it.
    """
    fill_in_missing = GenderNounDataHandler.make_sure_all_referenced_words_exist

    # nothing references a non-existing word, so nothing should change:
    self_contained = {"carpenter": {"gender": "neutral", "gender_map": {}}}
    self.assertEqual(
        fill_in_missing(self_contained),
        {"carpenter": {"gender": "neutral", "gender_map": {}}},
    )

    # "carpentress" is referenced but has no entry yet, so it should be created:
    with_dangling_link = {
        "carpenter": {"gender": "neutral", "gender_map": {"female": "carpentress"}},
    }
    completed = {
        "carpenter": {"gender": "neutral", "gender_map": {"female": "carpentress"}},
        "carpentress": {"gender": "female", "gender_map": {"neutral": "carpenter"}},
    }
    self.assertEqual(fill_in_missing(with_dangling_link), completed)
def test_create_extra_links_to_gender_ambiguous_words(self):
    """Check the extra links created for words linked under another gender.

    Input whose links already agree with the target's own gender must stay
    as it is. When "carpentress" links to "carpenter" as its male form
    although "carpenter" is tagged neutral, an additional neutral link to
    "carpenter" is expected next to the existing male one.
    """
    add_links = GenderNounDataHandler.create_extra_links_to_gender_ambiguous_words

    # fine input: every link matches the gender of the word it points to
    consistent = {
        "carpenter": {"gender": "neutral", "gender_map": {"female": "carpentress"}},
        "carpentress": {"gender": "female", "gender_map": {"neutral": "carpenter"}},
    }
    self.assertEqual(
        add_links(consistent),
        {
            "carpenter": {"gender": "neutral", "gender_map": {"female": "carpentress"}},
            "carpentress": {"gender": "female", "gender_map": {"neutral": "carpenter"}},
        },
    )

    # improvable input: "carpenter" is linked as male, but is tagged neutral
    improvable = {
        "carpenter": {"gender": "neutral", "gender_map": {"female": "carpentress"}},
        "carpentress": {"gender": "female", "gender_map": {"male": "carpenter"}},
    }
    improved = {
        "carpenter": {"gender": "neutral", "gender_map": {"female": "carpentress"}},
        "carpentress": {"gender": "female",
                        "gender_map": {"neutral": "carpenter", "male": "carpenter"}},
    }
    self.assertEqual(add_links(improvable), improved)
def test_remove_words_that_are_not_nouns(self):
    """Check which entries `remove_words_that_are_not_nouns` drops.

    Expected behaviour: entries for nouns stay, an entry for a non-noun
    ("eat") is dropped, unless its gender map links it to a noun.
    """
    strip_non_nouns = GenderNounDataHandler.remove_words_that_are_not_nouns

    # all words are nouns, so the dict should stay as it is:
    only_nouns = {
        "carpenter": {"gender": "neutral", "gender_map": {}},
        "food": {"gender": "neutral", "gender_map": {}},
    }
    self.assertEqual(
        strip_non_nouns(only_nouns),
        {
            "carpenter": {"gender": "neutral", "gender_map": {}},
            "food": {"gender": "neutral", "gender_map": {}},
        },
    )

    # "eat" is not a noun and links to nothing, so it should be removed:
    with_unlinked_verb = {
        "carpenter": {"gender": "neutral", "gender_map": {}},
        "eat": {"gender": "neutral", "gender_map": {}},
    }
    self.assertEqual(
        strip_non_nouns(with_unlinked_verb),
        {"carpenter": {"gender": "neutral", "gender_map": {}}},
    )

    # "eat" is not a noun, but its gender map links to one, so it should be kept:
    with_linked_verb = {
        "carpenter": {"gender": "neutral", "gender_map": {}},
        "eat": {"gender": "neutral", "gender_map": {"male": "carpenter"}},
    }
    self.assertEqual(
        strip_non_nouns(with_linked_verb),
        {
            "carpenter": {"gender": "neutral", "gender_map": {}},
            "eat": {"gender": "neutral", "gender_map": {"male": "carpenter"}},
        },
    )
def test_save_to_disk(self):
    """Check the JSON that `GenderNounDataHandler.save_to_disk` writes.

    Every case saves an in-memory dict to the scratch file ``test.gdn`` and
    compares the parsed file content with the expected on-disk form: sets
    stored under a ``"warning"`` key are expected as JSON lists, everything
    else is expected unchanged.
    """
    path = "test.gdn"
    cases = [
        # dict with warning: the set should end up as a list in the file
        # NOTE(review): the expected list order presumes that save_to_disk
        # writes the set elements in a deterministic order - confirm.
        ({"wuwu": {"warning": {"warning1", "warning2"}}},
         {"wuwu": {"warning": ["warning1", "warning2"]}}),
        # dict without warning: should be written exactly as given
        ({"wuwu": {"fufu": "wawa"}},
         {"wuwu": {"fufu": "wawa"}}),
        # a mixture of both kinds of entries
        ({"wuwu": {"warning": {"warning1", "warning2"}}, "wawa": {"fufu": "wawa"}},
         {"wuwu": {"warning": ["warning1", "warning2"]}, "wawa": {"fufu": "wawa"}}),
    ]
    try:
        for data_loaded, data_in_file in cases:
            GenderNounDataHandler.save_to_disk(data_loaded, path)
            with open(path, "r") as test_file:
                self.assertEqual(json.loads(test_file.read()), data_in_file)
    finally:
        # delete the scratch file even if one of the assertions above failed,
        # so a failing run does not leave `test.gdn` behind
        if os.path.exists(path):
            os.remove(path)
def test_create_noun_data(self):
    """Check that the noun data is rebuilt identically when its file is missing.

    The test deletes `src/gendered-nouns.gdn` and reloads the
    `src.gender_nouns` module, which is expected to emit a
    `GenderedNounsBuildFromWebWarning`. The dict available before the
    deletion, the output of `create_full_graph_from_web` and the dict
    available after the reload must all be equal.
    """
    # snapshot of the current data and a fresh run of the build pipeline:
    dict_before_rebuild = copy.deepcopy(gn.GENDER_DICT)
    dict_from_pipeline = GenderNounDataHandler.create_full_graph_from_web()

    # remove the data file, then re-import the module so it has to rebuild it:
    os.remove("src/gendered-nouns.gdn")
    with self.assertWarns(ws.GenderedNounsBuildFromWebWarning):
        noun_module = sys.modules["src.gender_nouns"]
        importlib.reload(noun_module)

    # all three versions of the data have to agree:
    self.assertEqual(dict_before_rebuild, dict_from_pipeline)
    self.assertEqual(dict_before_rebuild, gn.GENDER_DICT)
    self.assertEqual(dict_from_pipeline, gn.GENDER_DICT)
def test_make_all_links_two_sided(self):
    """Check that `make_all_links_two_sided` completes one-sided links.

    Most cases use the words "carpenter" (neutral, A), "carpentress"
    (female, B) and "carpenter_man" (male, C) with different sets of links
    between them; the arrows in the comments show which links exist in the
    input. The last cases cover words whose `gender` attribute disagrees
    with how they are linked, and a choice between two candidate words.
    """
    two_sided = GenderNounDataHandler.make_all_links_two_sided

    def trio(carpenter_links, carpentress_links, carpenter_man_links):
        # builds the three-word dict from the gender map of each word
        return {
            "carpenter": {"gender": "neutral", "gender_map": carpenter_links},
            "carpentress": {"gender": "female", "gender_map": carpentress_links},
            "carpenter_man": {"gender": "male", "gender_map": carpenter_man_links},
        }

    # {A<->B C}: all links are two-sided already, so nothing should change
    self.assertEqual(
        two_sided(trio({"female": "carpentress"}, {"neutral": "carpenter"}, {})),
        trio({"female": "carpentress"}, {"neutral": "carpenter"}, {}),
    )

    # {A->B C} should become {A<->B C}
    self.assertEqual(
        two_sided(trio({"female": "carpentress"}, {}, {})),
        trio({"female": "carpentress"}, {"neutral": "carpenter"}, {}),
    )

    # the fully linked triangle {A<->B<->C<->A} that all following triangle cases should produce:
    triangle_result = trio(
        {"female": "carpentress", "male": "carpenter_man"},
        {"male": "carpenter_man", "neutral": "carpenter"},
        {"female": "carpentress", "neutral": "carpenter"},
    )

    # V-shaped inputs: A and B are linked somehow, B and C as well, but not C and A.

    # {A->B<-C}
    self.assertEqual(
        two_sided(trio({"female": "carpentress"}, {}, {"female": "carpentress"})),
        triangle_result,
    )
    # {A<-B->C}
    self.assertEqual(
        two_sided(trio({}, {"male": "carpenter_man", "neutral": "carpenter"}, {})),
        triangle_result,
    )
    # {A<->B<-C}
    self.assertEqual(
        two_sided(trio({"female": "carpentress"},
                       {"neutral": "carpenter"},
                       {"female": "carpentress"})),
        triangle_result,
    )
    # {A<->B->C}
    self.assertEqual(
        two_sided(trio({"female": "carpentress"},
                       {"male": "carpenter_man", "neutral": "carpenter"},
                       {})),
        triangle_result,
    )
    # {A<->B<->C}
    self.assertEqual(
        two_sided(trio({"female": "carpentress"},
                       {"male": "carpenter_man", "neutral": "carpenter"},
                       {"female": "carpentress"})),
        triangle_result,
    )
    # {A->B->C}
    self.assertEqual(
        two_sided(trio({"female": "carpentress"}, {"male": "carpenter_man"}, {})),
        triangle_result,
    )

    # The same again for inputs in which C and A are linked as well (three-sided triangles).

    # {A->B<-C->A}
    self.assertEqual(
        two_sided(trio({"female": "carpentress"},
                       {},
                       {"female": "carpentress", "neutral": "carpenter"})),
        triangle_result,
    )
    # {A<->B<-C->A}
    self.assertEqual(
        two_sided(trio({"female": "carpentress"},
                       {"neutral": "carpenter"},
                       {"female": "carpentress", "neutral": "carpenter"})),
        triangle_result,
    )
    # {A<->B->C->A}
    self.assertEqual(
        two_sided(trio({"female": "carpentress"},
                       {"male": "carpenter_man", "neutral": "carpenter"},
                       {"neutral": "carpenter"})),
        triangle_result,
    )
    # {A<->B<->C->A}
    self.assertEqual(
        two_sided(trio({"female": "carpentress"},
                       {"male": "carpenter_man", "neutral": "carpenter"},
                       {"female": "carpentress", "neutral": "carpenter"})),
        triangle_result,
    )
    # {A->B->C->A}
    self.assertEqual(
        two_sided(trio({"female": "carpentress"},
                       {"male": "carpenter_man"},
                       {"neutral": "carpenter"})),
        triangle_result,
    )

    # indirect gender information (given by links): "carpenter_man" is tagged neutral here,
    # but "carpentress" links to it as its male form.
    linked_as_male = {
        "carpenter": {"gender": "neutral", "gender_map": {"female": "carpentress"}},
        "carpentress": {"gender": "female",
                        "gender_map": {"male": "carpenter_man", "neutral": "carpenter"}},
        "carpenter_man": {"gender": "neutral", "gender_map": {}},
    }
    self.assertEqual(
        two_sided(linked_as_male),
        {
            "carpenter": {"gender": "neutral",
                          "gender_map": {"female": "carpentress", "male": "carpenter_man"}},
            "carpentress": {"gender": "female",
                            "gender_map": {"male": "carpenter_man", "neutral": "carpenter"}},
            "carpenter_man": {"gender": "neutral",
                              "gender_map": {"male": "carpenter_man", "female": "carpentress"}},
        },
    )

    # direct gender information (given by the `gender` attribute) takes precedence over indirect
    # gender information (given by links): "carpenter" lists itself as its own female form, which
    # is unusual, but must not affect the newly built links.
    self.assertEqual(
        two_sided(trio({"female": "carpenter"},
                       {"male": "carpenter_man", "neutral": "carpenter"},
                       {"neutral": "carpenter"})),
        trio({"female": "carpenter", "male": "carpenter_man"},
             {"male": "carpenter_man", "neutral": "carpenter"},
             {"neutral": "carpenter", "female": "carpentress"}),
    )

    # two female words link to "bachelor", so the back link has to be decided between them
    # (see `choose_better_word`); warnings are stripped from the result before comparing.
    two_candidates = {
        "bachelor": {"gender": "male", "gender_map": {}},
        "bachelor_girl": {"gender": "female", "gender_map": {"male": "bachelor"}},
        "bachelorette": {"gender": "female", "gender_map": {"male": "bachelor"}},
    }
    self.assertEqual(
        self.rmv_all_warn(two_sided(two_candidates)),
        {
            "bachelor": {"gender": "male", "gender_map": {"female": "bachelorette"}},
            "bachelor_girl": {"gender": "female", "gender_map": {"male": "bachelor"}},
            "bachelorette": {"gender": "female", "gender_map": {"male": "bachelor"}},
        },
    )
def test_choose_better_word(self):
    """Check which of two words `choose_better_word` prefers.

    `choose_better_word` is merely a helper that decides which of two words
    is the more "precise" one, but it is tested anyway. Each row below
    holds the two words passed in and the word expected back.
    """
    expectations = [
        # all pairs of neighbouring ranks:
        ("long_word_w_underscores", "police_woman", "police_woman"),
        ("police_woman", "police_person", "police_person"),
        ("police_man", "police_person", "police_person"),
        ("police_person", "police_mother", "police_mother"),
        ("police_person", "police_father", "police_father"),
        ("police_parent", "police_mother", "police_parent"),
        ("police_parent", "police_father", "police_parent"),
        # words that are further apart in quality compare correctly as well:
        ("police_person", "police_parent", "police_parent"),
        ("police_mother", "wuwuwu", "wuwuwu"),
        # words that fit none of the hard-coded categories are decided alphabetically:
        ("wuwuwu", "aiaiaiaiaiaiai", "aiaiaiaiaiaiai"),
        ("police_matron", "poioioice_matron", "poioioice_matron"),
    ]
    for first_word, second_word, better_word in expectations:
        self.assertEqual(
            GenderNounDataHandler.choose_better_word(first_word, second_word),
            better_word,
        )
def test_save_and_load(self):
    """Check that data survives a `save_to_disk` / `load_from_disk` round trip.

    Every case is saved to the scratch file ``test.gdn`` and loaded again;
    the loaded result has to equal the data that was saved.
    """
    path = "test.gdn"
    cases = [
        # dict with warning:
        {"wuwu": {"warning": {"warning1", "warning2"}}},
        # dict without warning:
        {"wuwu": {"fufu": "wawa"}},
        # words with underscores:
        {"fu_fu": {"wawa": "wuwu"}},
        # a mixture:
        {"wuwu": {"warning": {"warning1", "warning2"}}, "wa_wa": {"fufu": "wawa"}},
    ]
    try:
        for data in cases:
            GenderNounDataHandler.save_to_disk(data, path)
            self.assertEqual(data, GenderNounDataHandler.load_from_disk(path))
    finally:
        # delete the scratch file even if one of the assertions above failed,
        # so a failing run does not leave `test.gdn` behind
        if os.path.exists(path):
            os.remove(path)
def test_load_from_web(self):
    """Check that `load_from_web` output is derived from the upstream word list.

    The completeness of the upstream data set is not tested, since it is
    known to be incomplete. Instead the test downloads the original JSON,
    checks both the original and the generated data against their typed
    dict definitions, and then verifies entry by entry that the generated
    data has the expected format and content.
    """
    # blueprint; the timeout keeps an unresponsive server from hanging the test run:
    json_original: gn.OriginalGenderNounData = json.loads(requests.get(
        "https://raw.githubusercontent.com/phseiff/gendered_words/master/gendered_words.json",
        timeout=30).text)
    # make sure the type checks written for TypedDict-typing actually work:
    self.assertTrue(check_type.is_instance(json_original, gn.OriginalGenderNounData))

    # the real thing; warnings raised while loading are recorded and discarded:
    with warnings.catch_warnings(record=True):
        json_generated = GenderNounDataHandler.load_from_web()
    # type check:
    self.assertTrue(check_type.is_instance(json_generated, gn.GeneratedGenderNounData))

    # make sure that all other values are there:
    last_index = len(json_original) - 1
    for i, word_data in enumerate(json_original):
        word = word_data["word"]

        if i < last_index and json_original[i + 1]["word"] == word:
            # the word is part of the json data more than once; only the last occurrence is
            # counted, so skip this one:
            continue

        # make sure that words without a senseno are removed - unless the directly preceding entry
        # is the same word with a senseno, in which case they are not removed
        # -> this test might lead to false negatives in the future:
        if "wordnet_senseno" not in word_data:
            preceded_by_senseno_entry = (
                i > 0
                and json_original[i - 1]["word"] == word
                and "wordnet_senseno" in json_original[i - 1]
            )
            if not preceded_by_senseno_entry:
                self.assertNotIn(word, json_generated)
            continue

        if word_data["gender"] == "o":
            # words tagged as "other" are re-gendered as neutral:
            self.assertEqual(json_generated[word]["gender"], "neutral")
        else:
            # other words keep their gender (the original stores only its first letter):
            self.assertEqual(json_generated[word]["gender"][0], word_data["gender"])

        # if word is in the new json data, check further:
        if word in json_generated:
            # check that artificially added nouns that didn't come from wordnet aren't present
            # anymore:
            self.assertIn("wordnet_senseno", word_data)
            # make sure we have an equally sized gender mapping:
            original_gender_map = word_data.get("gender_map", {})
            generated_gender_map = json_generated[word]["gender_map"]
            self.assertEqual(len(generated_gender_map), len(original_gender_map))
            # make sure that both gender mappings are identical:
            for gender, mapped_word in generated_gender_map.items():
                # each linked gender is in the original json as well as the new:
                self.assertIn(gender[0], original_gender_map)
                # their values are identical, except that whitespace became underscores:
                self.assertEqual(
                    mapped_word,
                    original_gender_map[gender[0]][0]["word"].replace(" ", "_"))

    # some simple exemplary tests to show this for individual words:

    # hermaphrodite got re-gendered as neutral:
    self.assertIn({"word": "hermaphrodite", "wordnet_senseno": "hermaphrodite.n.01", "gender": "o"},
                  json_original)
    self.assertEqual(json_generated["hermaphrodite"], {"gender": "neutral", "gender_map": {}})

    # heroine (as a word with a full gender map) was handled correctly:
    self.assertIn({"word": "heroine", "wordnet_senseno": "heroine.n.02", "gender": "f",
                   "gender_map": {"m": [{"parts_of_speech": "*", "word": "hero"}]}},
                  json_original)
    self.assertEqual(json_generated["heroine"], {"gender": "female", "gender_map": {"male": "hero"}})

    # reenactor (as a word with no gender map) is handled correctly (i.e. it gets an empty gender
    # map rather than none):
    self.assertIn({"word": "reenactor", "wordnet_senseno": "reenactor.n.01", "gender": "n"},
                  json_original)
    self.assertEqual(json_generated["reenactor"], {"gender": "neutral", "gender_map": {}})

    # women (as a word with no wordnet equivalent) is handled correctly (i.e. removed):
    self.assertIn({"word": "women", "gender": "f",
                   "gender_map": {"m": [{"parts_of_speech": "*", "word": "men"}]}},
                  json_original)
    self.assertNotIn("women", json_generated)

    # great_grandson has its female version great_granddaughter listed with an underscore rather
    # than whitespace:
    self.assertIn({"word": "great_grandson", "wordnet_senseno": "great_grandson.n.01", "gender": "m",
                   "gender_map": {"f": [{"parts_of_speech": "*", "word": "great granddaughter"}]}},
                  json_original)
    self.assertEqual(json_generated["great_grandson"],
                     {"gender": "male", "gender_map": {"female": "great_granddaughter"}})