def test_cleaning_pipeline():
    """Check that the cleaning pipeline removes garbage values and creates ranks.

    The data is read with ``prep.read_then_clean`` and then passed through
    ``remove_garbage_codes`` -> ``extract_ranking`` -> ``remove_garbage_codes``.
    The raw frame is kept so it can be compared against the cleaned frame.
    """
    # Raw frame: the baseline the cleaned frame is compared against.
    df = prep.read_then_clean(FILEPATH, STR_VARS)

    # Pass it through the rest of the cleaning pipeline.
    df_clean = prep.remove_garbage_codes(df, STR_VARS, STR_GARBAGE)
    df_clean = prep.extract_ranking(df_clean, NUM_VARS)
    df_clean = prep.remove_garbage_codes(df_clean, RANK_VARS, RANK_GARBAGE)

    # Rank columns must be absent from the raw frame and present after cleaning.
    for rank_col in RANK_VARS:
        assert rank_col not in df, "rank column present in raw data"
        assert rank_col in df_clean, "rank column was not added by extract_ranking fx"

    # No garbage string may survive in any of the string columns.
    for str_col in STR_VARS:
        # Computed once per column rather than once per garbage value.
        remaining_values = df_clean[str_col].unique()
        for garbage in STR_GARBAGE:
            # Printed so a failing run shows which column/value pair was checked.
            print(str_col, garbage)
            assert garbage not in remaining_values, "garbage values not removed from clean dataframe"
DEP_VAR = "housing_roof"
# The string descriptions are always used to predict the ranking.
PRED_VAR = DEP_VAR + "_rank"

# Filter selecting which surveys to work with.
SVY_FILTER = ['MACRO_DHS']

# Garbage lists.
STR_GARBAGE = ['nan', 'other', 'not a dejure resident', 'not dejure resident']
RANK_GARBAGE = ['4', '5', '6', '7', '8', '9', 'n']

# Dictionaries.
# Map categories back to ranks.
PRED_DICT = {'natural':'1', 'rudimentary':'2', 'finished':'3'}

# Build the cleaned frame once at import time; the tests below read it.
df = prep.read_then_clean(DATA_DIR + "/" + DATA_FILENAME, STR_VARS, SVY_FILTER)
df_clean = prep.remove_garbage_codes(df, STR_VARS, STR_GARBAGE)
df_clean = prep.extract_ranking(df_clean, NUM_VARS)
df_clean = prep.remove_garbage_codes(df_clean, RANK_VARS, RANK_GARBAGE)
# Rows without a value for the dependent variable are dropped.
df_clean = df_clean.dropna(subset=[DEP_VAR])


class FilterOneWordMaterials(unittest.TestCase):
    """Tests for `filter_one_word_materials.py`."""

    # NOTE(review): unittest invokes test methods with only ``self``; confirm
    # how the ``df_clean`` argument of these methods is meant to be supplied
    # (the parameter shadows the module-level frame of the same name).
    def test_expected_number_of_rows(self, df_clean):
        """Has the function successfully filtered out all the materials described with more than one word?"""
        sample = df_clean[0:20]

        actual_rows = sem.filter_one_word_materials(sample, DEP_VAR).shape[0]

        # One indicator column per distinct word; a row total of 1 means the
        # material was described with a single word.
        word_flags = sample[DEP_VAR].str.get_dummies(sep=' ')
        expected_rows = int((word_flags.sum(axis=1) == 1).sum())

        # assertEqual, not assertTrue: assertTrue(a, b) treats ``b`` as the
        # failure message and passes for any non-zero ``a``.
        self.assertEqual(actual_rows, expected_rows)

    def test_raise_error_if_no_material_with_one_word(self, df_clean):
        """Does the function raise an error if there is no material described with one word in the corpus?"""