def test_clean_data_one_field(self):
    """Clean the mock household input requesting only NUM_PEOPLE.

    The record labelled 1 in the cleaned data must contain exactly that
    one field, with the value '2'.
    """
    raw_frame = pandas.DataFrame(self._mock_dirty_household_input())
    source = datasource.PumsData(raw_frame)
    wanted_fields = [inputs.NUM_PEOPLE.name]

    result = source.clean(wanted_fields, Preprocessor())

    # Compare the whole record so that any extra column fails the test.
    record = result.data.loc[1].to_dict()
    self.assertDictEqual(record, {inputs.NUM_PEOPLE.name: '2'})
def test_clean_data(self):
    """Clean the mock household input requesting four household fields.

    The record labelled 1 in the cleaned data must map serial number,
    people, vehicles and income to 'b', '2', '3+' and '40000+'.
    """
    serial_field = inputs.SERIAL_NUMBER.name
    people_field = inputs.NUM_PEOPLE.name
    vehicles_field = inputs.NUM_VEHICLES.name
    income_field = inputs.HOUSEHOLD_INCOME.name

    raw_frame = pandas.DataFrame(self._mock_dirty_household_input())
    source = datasource.PumsData(raw_frame)
    result = source.clean(
        [serial_field, people_field, vehicles_field, income_field],
        Preprocessor(),
    )

    record = result.data.loc[1].to_dict()
    self.assertDictEqual(
        record,
        {
            serial_field: 'b',
            people_field: '2',
            vehicles_field: '3+',
            income_field: '40000+',
        },
    )
def test_clean_data_filter_length(self):
    """Check how many rows survive cleaning under state/puma filters.

    The same mock input is cleaned four times: with no filter, with
    state='06', with puma='00106', and with both. The expected row
    counts are 3, 2, 2 and 1 respectively.
    """
    raw_frame = pandas.DataFrame(
        self._mock_dirty_household_puma_state_input())
    source = datasource.PumsData(raw_frame)
    columns = [
        inputs.SERIAL_NUMBER.name,
        inputs.STATE.name,
        inputs.PUMA.name,
    ]

    # (filter keyword arguments, expected number of remaining rows)
    scenarios = (
        ({}, 3),
        ({'state': '06'}, 2),
        ({'puma': '00106'}, 2),
        ({'state': '06', 'puma': '00106'}, 1),
    )

    # Run every clean() call up front, before the first assertion.
    outcomes = [
        source.clean(columns, Preprocessor(), **filters)
        for filters, _ in scenarios
    ]

    for outcome, (_, expected_rows) in zip(outcomes, scenarios):
        self.assertEqual(len(outcome.data), expected_rows)