def test_dictionary_lookup_standardiser_appends_columns_to_data():
    """Processing a data set widens its header row from 2 to 6 columns."""
    standardiser = EthnicityDictionaryLookup(
        "tests/test_data/test_dictionary_lookup/test_lookup.csv")

    # given a data set with a two-column header and one data row
    data_set = EthnicityDataset(
        data=[["Ethnicity", "Ethnicity type"], ["a", "any ethnicity type"]])

    # when we add_columns
    standardiser.process_data_set(data_set)

    # then 4 columns are appended to the 2 original header columns
    assert len(data_set.get_data()[0]) == 6
def test_dictionary_lookup_standardiser_can_handle_empty_rows():
    """Processing a data set containing an empty row must not raise IndexError."""
    standardiser = EthnicityDictionaryLookup(
        "tests/test_data/test_dictionary_lookup/test_lookup.csv")

    # given a dataset with a blank row
    data = [["Ethnicity", "Ethnicity type"], [" a", "xxx"], []]
    data_set = EthnicityDataset(data=data)

    # when we add_columns, then no IndexError escapes
    try:
        standardiser.process_data_set(data_set)
    except IndexError as err:
        # Raise explicitly instead of `assert False`: assert statements are
        # removed under `python -O`, which would let this test pass silently.
        # Chaining with `from err` keeps the original traceback in the report.
        raise AssertionError(
            "process_data_set raised IndexError for an empty row") from err
def test_dictionary_lookup_standardiser_without_default_values_appends_blanks_when_not_found():
    """Without default values, a row that is not found gets four blank cells."""
    lookup = EthnicityDictionaryLookup(
        "tests/test_data/test_dictionary_lookup/test_lookup.csv")

    # given a dataset with a strange value
    rows = [["Ethnicity", "Ethnicity type"], ["strange", "missing"]]
    dataset = EthnicityDataset(data=rows)

    # when we add_columns
    lookup.process_data_set(dataset)

    # then 4 blank values are appended, one for each of the four new columns
    expected_row = ["strange", "missing", "", "", "", ""]
    assert dataset.get_data()[1] == expected_row
def test_dictionary_lookup_standardiser_appends_columns_using_defaults_for_unknown_ethnicity_type():
    """Rows with an ethnicity type not in the lookup still receive a label."""
    lookup = EthnicityDictionaryLookup(
        "tests/test_data/test_dictionary_lookup/test_lookup.csv")

    # given data from an ethnicity type not in the lookup
    rows = [["Ethnicity", "Ethnicity type"], [" a", "xxx"], ["b ", "xxx"]]
    dataset = EthnicityDataset(data=rows)

    # when we add_columns
    lookup.process_data_set(dataset)

    # then values are added: third column of the header and of each data row
    expected_labels = ["Label", "A", "B"]
    for row_index, expected in enumerate(expected_labels):
        assert dataset.get_data()[row_index][2] == expected
def test_dictionary_lookup_standardiser_appends_columns_trimming_white_space_for_lookup():
    """Leading or trailing white space on the ethnicity does not stop a match."""
    lookup = EthnicityDictionaryLookup(
        "tests/test_data/test_dictionary_lookup/test_lookup.csv")

    # given data where one value has leading white space and the other trailing
    rows = [["Ethnicity", "Ethnicity type"],
            [" a", "phonetic"],
            ["b ", "phonetic"]]
    dataset = EthnicityDataset(data=rows)

    # when we add_columns
    lookup.process_data_set(dataset)

    # then values are added: third column of the header and of each data row
    expected_labels = ["Label", "alpha", "bravo"]
    for row_index, expected in enumerate(expected_labels):
        assert dataset.get_data()[row_index][2] == expected
def test_dictionary_lookup_standardiser_appends_columns_using_case_insensitive_lookup():
    """A capitalised ethnicity value yields the same label as its lower-case form."""
    lookup = EthnicityDictionaryLookup(
        "tests/test_data/test_dictionary_lookup/test_lookup.csv")

    # given data where one value is capitalised
    rows = [["Ethnicity", "Ethnicity type"],
            ["A", "phonetic"],
            ["b", "phonetic"]]
    dataset = EthnicityDataset(data=rows)

    # when we add_columns
    lookup.process_data_set(dataset)

    # then values are added: third column of the header and of each data row
    expected_labels = ["Label", "alpha", "bravo"]
    for row_index, expected in enumerate(expected_labels):
        assert dataset.get_data()[row_index][2] == expected
def test_dictionary_lookup_standardiser_appends_columns_using_specific_ethnicity_type_in_lookup():
    """Rows with the "phonetic" ethnicity type receive the labels expected for it."""
    lookup = EthnicityDictionaryLookup(
        "tests/test_data/test_dictionary_lookup/test_lookup.csv")

    # given data from an ethnicity type in the lookup
    rows = [["Ethnicity", "Ethnicity type"],
            ["a", "phonetic"],
            ["b", "phonetic"]]
    dataset = EthnicityDataset(data=rows)

    # when we add_columns
    lookup.process_data_set(dataset)

    # then the added labels are the ones expected for the "phonetic" type
    expected_labels = ["Label", "alpha", "bravo"]
    for row_index, expected in enumerate(expected_labels):
        assert dataset.get_data()[row_index][2] == expected
def test_dictionary_lookup_standardiser_with_wildcard_values_inserts_custom_defaults_when_not_found():
    """Custom defaults are appended when not found, with "*" replaced by the value."""
    wildcard_defaults = ["*", "two", "Unknown - *", "four"]
    lookup = EthnicityDictionaryLookup(
        "tests/test_data/test_dictionary_lookup/test_lookup.csv",
        default_values=wildcard_defaults)

    # given a dataset with a strange value
    rows = [["Ethnicity", "Ethnicity type"], ["strange", "missing"]]
    dataset = EthnicityDataset(data=rows)

    # when we add_columns
    lookup.process_data_set(dataset)

    # then the default values are appended with * substituted by the
    # ethnicity value of the row
    expected_row = [
        "strange", "missing",
        "strange", "two", "Unknown - strange", "four",
    ]
    assert dataset.get_data()[1] == expected_row
def process_data(self, data):
    """Process raw ``data`` and return the processed result.

    ``data`` is wrapped in an ``EthnicityDataset``, the wrapper is passed to
    ``self.process_data_set``, and the wrapper's ``get_data()`` is returned.
    """
    wrapped = EthnicityDataset(data)
    self.process_data_set(data_set=wrapped)
    return wrapped.get_data()