def test_single_dim_order(self):
    """from_df should respect the spec's coordinate order for a single
    dimension, and repeated conversions should agree.
    """
    tech_types = ['water_meter', 'electricity_meter', 'other', 'aaa']
    spec = Spec(
        name='test',
        dims=['technology_type'],
        coords={'technology_type': tech_types},
        dtype='float')
    df = pd.DataFrame([
        {'technology_type': 'water_meter', 'test': 5},
        {'technology_type': 'electricity_meter', 'test': 6},
        {'technology_type': 'other', 'test': 7},
        {'technology_type': 'aaa', 'test': 8},
    ])
    expected = DataArray(spec, numpy.array([5., 6., 7., 8.]))
    first = DataArray.from_df(spec, df)
    second = DataArray.from_df(spec, df)
    assert expected == first
    assert expected == second
def test_multi_dim_order(self):
    """from_df/as_df should honour the spec's dimension and coordinate
    ordering even when dataframe rows arrive in a different order.
    """
    spec = Spec(
        name='test',
        coords={'lad': ['c', 'a', 'b'], 'interval': [4, 2]},
        dims=['lad', 'interval'],
        dtype='float')
    expected = DataArray(spec, numpy.array(
        [
            # interval: 4  2
            [1, 2],  # lad 'c'
            [5, 6],  # lad 'a'
            [9, 0],  # lad 'b'
        ],
        dtype='float'))
    rows = [
        {'test': 6.0, 'lad': 'a', 'interval': 2},
        {'test': 0.0, 'lad': 'b', 'interval': 2},
        {'test': 2.0, 'lad': 'c', 'interval': 2},
        {'test': 5.0, 'lad': 'a', 'interval': 4},
        {'test': 9.0, 'lad': 'b', 'interval': 4},
        {'test': 1.0, 'lad': 'c', 'interval': 4},
    ]
    df = pd.DataFrame(rows).set_index(['lad', 'interval'])

    assert DataArray.from_df(spec, df) == expected

    # compare frames in a common (sorted) index order
    pd.testing.assert_frame_equal(
        expected.as_df().sort_index(), df.sort_index())
def test_from_multiindex(self):
    """from_df should accept a dataframe indexed by a single-level MultiIndex.
    """
    coords = ['b', 'a', 'c']
    spec = Spec(
        name='test', dims=['multi'], coords={'multi': coords}, dtype='float')
    index = pd.MultiIndex.from_product([coords], names=['multi'])
    df = pd.DataFrame({'test': [1, 2, 3]}, index=index)
    expected = DataArray(spec, numpy.array([1, 2, 3]))
    assert expected == DataArray.from_df(spec, df)
def test_error_duplicate_rows_multi_index(self):
    """from_df should raise SmifDataMismatchError naming the duplicated
    coordinate combination.
    """
    spec = Spec(
        name='test',
        dims=['a', 'b'],
        coords={'a': [1, 2], 'b': [3, 4]},
        dtype='int')
    rows = [
        {'a': 1, 'b': 3, 'test': 0},
        {'a': 2, 'b': 3, 'test': 1},
        {'a': 1, 'b': 4, 'test': 2},
        {'a': 2, 'b': 4, 'test': 3},
        {'a': 2, 'b': 4, 'test': 4},  # duplicate index of the previous row
    ]
    with raises(SmifDataMismatchError) as ex:
        DataArray.from_df(spec, pd.DataFrame(rows))
    # dict key order in the message is not guaranteed, accept either
    expected_messages = (
        "Data for 'test' contains duplicate values at [{'a': 2, 'b': 4}]",
        "Data for 'test' contains duplicate values at [{'b': 4, 'a': 2}]",
    )
    assert any(msg in str(ex.value) for msg in expected_messages)
def test_scalar(self):
    """A zero-dimensional spec (numpy scalar array) should round-trip
    through a single-row dataframe.
    """
    spec = Spec(name='test', dims=[], coords={}, dtype='float')
    expected = DataArray(spec, numpy.array(2.0))
    df = pd.DataFrame([{'test': 2.0}])

    assert DataArray.from_df(spec, df) == expected
    pd.testing.assert_frame_equal(expected.as_df(), df)
def test_df_round_trip_2d(self):
    """2D data should survive an as_df/from_df round trip, and from_df
    should accept rows in arbitrary order.

    Fix: the shuffled, indexed dataframe was built but never used — the
    round trip only exercised ``da.as_df()``. It is now passed through
    ``from_df`` as evidently intended, in addition to the round trip.
    """
    spec = Spec.from_dict({
        'name': 'two_d',
        'dims': ['a', 'z'],
        'coords': {
            'a': ['q', 'p'],
            'z': ['a', 'c', 'b'],
        },
        'dtype': 'float'
    })
    da = DataArray(spec, numpy.array([
        [5., 6., 7.],
        [8., 9., 0.],
    ]))
    # rows deliberately shuffled relative to spec coordinate order
    df = pd.DataFrame([
        {'z': 'a', 'a': 'p', 'two_d': 8.},
        {'z': 'c', 'a': 'q', 'two_d': 6.},
        {'z': 'a', 'a': 'q', 'two_d': 5.},
        {'z': 'b', 'a': 'q', 'two_d': 7.},
        {'z': 'b', 'a': 'p', 'two_d': 0.},
        {'z': 'c', 'a': 'p', 'two_d': 9.},
    ]).set_index(spec.dims)

    # from_df must reorder shuffled rows into spec order
    da_from_shuffled = DataArray.from_df(spec, df)
    assert_array_equal(da.data, da_from_shuffled.data)

    # full round trip: as_df then from_df recovers the data
    da_from_df = DataArray.from_df(spec, da.as_df())
    assert_array_equal(da.data, da_from_df.data)
def dataframe_to_data_array(dataframe, spec, path):
    """Convert a dataframe to a DataArray described by ``spec``.

    For a zero-dimensional spec the dataframe must hold exactly one value
    in the column named after the spec; otherwise SmifDataMismatchError is
    raised, citing ``path`` as the data source.
    """
    if not spec.dims:
        # zero-dimensional case (scalar)
        series = dataframe[spec.name]
        if series.shape != (1, ):
            msg = "Data for '{}' should contain a single value, instead got {} while " + \
                  "reading from {}"
            raise SmifDataMismatchError(
                msg.format(spec.name, len(series), path))
        return DataArray(spec, series.iloc[0])
    return DataArray.from_df(spec, dataframe)
def test_from_df_partial(self, spec):
    """Should create a DataArray that can handle missing data, returning nan/null
    """
    partial_df = pd.DataFrame({
        'a': ['a1'],
        'b': ['b1'],
        'c': ['c2'],
        'test_data': [1]
    }).set_index(['a', 'b', 'c'])

    # all-nan array with only the single provided cell filled in
    data = numpy.full(spec.shape, numpy.nan)
    data[0, 0, 1] = 1.0
    expected = DataArray(spec, data)

    actual = DataArray.from_df(spec, partial_df)
    assert_array_equal(actual.data, expected.data)
    assert actual == expected
def test_match_metadata(self):
    """from_df should validate the data column name, the index level names
    and the coordinate values against the spec.
    """
    spec = Spec(
        name='test', dims=['region'], coords={'region': ['oxford']},
        dtype='int64')

    # must have a column named the same as the spec.name
    df = pd.DataFrame(
        [{'region': 'oxford', 'other': 'else'}]).set_index(['region'])
    with raises(SmifDataMismatchError) as ex:
        DataArray.from_df(spec, df)
    msg = "Data for 'test' expected a data column called 'test' and index names " + \
          "['region'], instead got data columns ['other'] and index names ['region']"
    assert msg in str(ex.value)

    # may not be indexed, if columns are otherwise all okay
    DataArray.from_df(spec, pd.DataFrame([{'region': 'oxford', 'test': 1}]))

    # must have an index level for each spec dimension
    with raises(SmifDataMismatchError) as ex:
        DataArray.from_df(spec, pd.DataFrame([{'test': 3.14}]))
    msg = "Data for 'test' expected a data column called 'test' and index names " + \
          "['region'], instead got data columns ['test'] and index names [None]"
    assert msg in str(ex.value)

    # must not have dimension labels outside of the spec dimension
    df = pd.DataFrame([
        {'test': 3.14, 'region': 'oxford'},
        {'test': 3.14, 'region': 'extra'},
    ]).set_index(['region'])
    with raises(SmifDataMismatchError) as ex:
        DataArray.from_df(spec, df)
    msg = "Data for 'test' contained unexpected values in the set of coordinates for " + \
          "dimension 'region': ['extra']"
    assert msg in str(ex.value)
def test_to_from_df(self):
    """A single-cell 2D DataArray should round-trip through a dataframe.
    """
    spec = Spec(
        name='test',
        dims=['region', 'interval'],
        coords={'region': ['oxford'], 'interval': [1]},
        dtype='int64')
    df = pd.DataFrame(
        [{'test': 3, 'region': 'oxford', 'interval': 1}]
    ).set_index(['region', 'interval'])
    expected = DataArray(spec, numpy.array([[3.]], dtype='int64'))

    assert DataArray.from_df(spec, df) == expected
    pd.testing.assert_frame_equal(expected.as_df(), df)
def test_df_round_trip(self):
    """1D data should survive an as_df/from_df round trip and match an
    explicitly constructed dataframe.

    Fix: the explicit dataframe was built and indexed but never used — it
    is now also fed through ``from_df`` and checked against the data, as
    evidently intended.
    """
    spec = Spec.from_dict({
        'name': 'multi_savings',
        'description': 'The savings from various technologies',
        'dims': ['technology_type'],
        'coords': {
            'technology_type': ['water_meter', 'electricity_meter', 'other', 'aaa']
        },
        'dtype': 'float',
        'abs_range': (0, 100),
        'exp_range': (3, 10),
        'unit': '%'
    })
    da = DataArray(spec, numpy.array([5., 6., 7., 8.]))
    df = pd.DataFrame([
        {'technology_type': 'water_meter', 'multi_savings': 5.},
        {'technology_type': 'electricity_meter', 'multi_savings': 6.},
        {'technology_type': 'other', 'multi_savings': 7.},
        {'technology_type': 'aaa', 'multi_savings': 8.},
    ]).set_index(spec.dims)

    # the hand-built dataframe should convert to the same data
    da_from_explicit = DataArray.from_df(spec, df)
    assert_array_equal(da.data, da_from_explicit.data)

    # full round trip: as_df then from_df recovers the data
    da_from_df = DataArray.from_df(spec, da.as_df())
    assert_array_equal(da.data, da_from_df.data)
def get_multidimensional_param():
    """Build a 2x2 parameter DataArray (interpolation_params x end_yr) from
    a multi-indexed dataframe.
    """
    spec = Spec.from_dict({
        'name': 'ss_t_base_heating',
        'description': 'Industrial base temperature',
        'default': '../energy_demand/parameters/ss_t_base_heating.csv',
        'unit': '',
        'dims': ['interpolation_params', 'end_yr'],
        'coords': {
            'interpolation_params': ['diffusion_choice', 'value_ey'],
            'end_yr': [2030, 2050]
        },
        'dtype': 'float'
    })
    records = [
        {'interpolation_params': 'diffusion_choice', 'end_yr': 2030,
         'ss_t_base_heating': 0},
        {'interpolation_params': 'diffusion_choice', 'end_yr': 2050,
         'ss_t_base_heating': 0},
        {'interpolation_params': 'value_ey', 'end_yr': 2030,
         'ss_t_base_heating': 15.5},
        {'interpolation_params': 'value_ey', 'end_yr': 2050,
         'ss_t_base_heating': 15.5},
    ]
    indexed = pd.DataFrame(records).set_index(['interpolation_params', 'end_yr'])
    return DataArray.from_df(spec, indexed)
def _df_to_ndarray(self, output_name, dataframe):
    """Convert a results dataframe to an ndarray ordered by the output spec.

    Args:
        output_name: key into ``self.outputs`` identifying the spec to use
        dataframe: data with one column per spec dimension plus the value
            column; the caller's frame is left unmodified

    Returns:
        numpy.ndarray with the spec's shape

    Fix: previously called ``set_index(..., inplace=True)``, silently
    mutating the caller's dataframe; now uses the non-mutating form.
    """
    spec = self.outputs[output_name]
    indexed = dataframe.set_index(spec.dims)
    data_array = DataArray.from_df(spec, indexed)
    return data_array.data
def test_from_df(self, small_da, small_da_df):
    """Should create a DataArray from a pandas.DataFrame
    """
    converted = DataArray.from_df(small_da.spec, small_da_df)
    assert converted == small_da
def main():
    """Fetch NISMOD population data, convert to DataArrays and write to
    CSV and Parquet scenario-variant stores.
    """
    # Read connection details: environment first, dbconfig.ini fallback
    if 'NISMOD_API_USER' in os.environ and 'NISMOD_API_PASSWORD' in os.environ:
        auth = (os.environ['NISMOD_API_USER'], os.environ['NISMOD_API_PASSWORD'])
    else:
        config = configparser.ConfigParser()
        config.read(os.path.join(os.path.dirname(__file__), 'dbconfig.ini'))
        auth = (config['nismod-api']['user'], config['nismod-api']['password'])

    # Ensure the cache directory exists
    try:
        os.mkdir(CACHE_PATH)
    except FileExistsError:
        pass

    # Population: download then derive OA- and LAD-level tables
    get_population(auth)
    process_oa_population()
    process_oa_to_lad_population()

    # Read the cached tables back
    oa_pop = pandas.read_csv(os.path.join(CACHE_PATH, 'oa_population.csv'))
    lad_pop = pandas.read_csv(os.path.join(CACHE_PATH, 'lad_population.csv'))

    # Build specs from the observed coordinate values
    years = list(range(POPULATION_MIN_YEAR, POPULATION_MAX_YEAR + 1))
    oa_spec = Spec(
        name='population',
        dims=['timestep', 'oa'],
        coords={'oa': list(oa_pop.oa.unique()), 'timestep': years},
        dtype='int')
    lad_spec = Spec(
        name='population',
        dims=['timestep', 'lad_gb_2016'],
        coords={
            'lad_gb_2016': list(lad_pop.lad_gb_2016.unique()),
            'timestep': years
        },
        dtype='int')

    # Write to CSV and Parquet stores
    text_store = CSVDataStore(STORE_BASE_PATH)
    binary_store = ParquetDataStore(STORE_BASE_PATH)

    lad_key = 'population_nismod_db.v5_lad16'
    lad_pop_da = DataArray.from_df(lad_spec, lad_pop.set_index(lad_spec.dims))
    text_store.write_scenario_variant_data('{}.csv'.format(lad_key), lad_pop_da)
    binary_store.write_scenario_variant_data('{}.parquet'.format(lad_key), lad_pop_da)

    oa_key = 'population_nismod_db.v5_oa'
    oa_pop_da = DataArray.from_df(oa_spec, oa_pop.set_index(oa_spec.dims))
    text_store.write_scenario_variant_data('{}.csv'.format(oa_key), oa_pop_da)
    binary_store.write_scenario_variant_data('{}.parquet'.format(oa_key), oa_pop_da)