def test_write_codelist():
    """Convert code lists from specimen files to pandas and check them."""
    # Read the common structure specimen and pull out its code lists
    with specimen('common-structure.xml') as f:
        dsd_common = sdmx.read_sdmx(f)
        all_codelists = sdmx.to_pandas(dsd_common)['codelist']

    # The specimen contains 5 code lists
    assert len(all_codelists) == 5

    # CL_FREQ has the expected number of items
    cl_freq = all_codelists['CL_FREQ']
    assert len(cl_freq) == 8

    # An item's name is looked up by its ID
    assert cl_freq['A'] == 'Annual'

    # A non-hierarchical code list carries a string name
    assert cl_freq.name == 'Code list for Frequency (FREQ)'

    # Now a hierarchical code list, converted on its own
    with specimen('codelist_partial.xml') as f:
        msg = sdmx.read_sdmx(f)
    cl_area = sdmx.to_pandas(msg.codelist['CL_AREA'])

    # A hierarchical list has a 'parent' column; the parent of Africa is the
    # World
    assert cl_area.loc['002', 'parent'] == '001'

    # pandas itself can then be used to attach the parents' names
    merge_options = dict(
        how='left',
        left_on='parent',
        right_index=True,
        suffixes=('', '_parent'),
    )
    with_parents = pd.merge(cl_area, cl_area, **merge_options)
    assert with_parents.loc['002', 'name_parent'] == 'World'
def test_read_sdmx(tmp_path):
    """Check how read_sdmx() picks a reader for paths and open files.

    ``tmp_path`` is the pytest fixture providing a temporary directory.
    """
    # Copy the file to a temporary file with an unrecognizable suffix
    target = tmp_path / "foo.badsuffix"
    with specimen("flat.json", opened=False) as original:
        # write_text() opens, writes and closes the file; the previous
        # target.open("w").write(...) left the handle unclosed
        target.write_text(original.read_text())

    # With unknown file extension, read_sdmx() peeks at the file content
    pandasdmx.read_sdmx(target)

    # Format can be inferred from an already-open file without extension
    with specimen("flat.json") as f:
        pandasdmx.read_sdmx(f)

    # Exception raised when the file contents don't allow guessing the format
    bad_file = BytesIO(b"#! neither XML nor JSON")
    exc = (
        "cannot infer SDMX message format from path None, format={}, or content "
        "'#! ne..'"
    )
    with pytest.raises(RuntimeError, match=exc.format("None")):
        pandasdmx.read_sdmx(bad_file)

    # Using the format= argument forces a certain reader to be used.
    # Create a new open file for this call.
    bad_file = BytesIO(b"#! neither XML nor JSON")
    with pytest.raises(json.JSONDecodeError):
        pandasdmx.read_sdmx(bad_file, format="JSON")
def test_freq_in_series_attribute(self, req):
    """Guard against regressions on issues #39 and #41.

    INSEE time series carry the FREQ value as an attribute of the series
    rather than as a dimension; this used to cause a runtime error when
    writing the data as a pandas dataframe.
    """
    data_fp = SERIES["UNEMPLOYMENT_CAT_A_B_C"]["data-fp"]
    message = pandasdmx.read_sdmx(data_fp)
    # Passes as long as the conversion does not raise
    pandasdmx.to_pandas(message)
def test_gh_75(self, req):
    """Test of https://github.com/dr-leo/pandaSDMX/pull/75."""
    df_id = "47_850"

    # The dataflow is read from a stored specimen here instead of being
    # queried with req.dataflow(df_id)
    with specimen("47_850-structure") as f:
        df = pandasdmx.read_sdmx(f).dataflow[df_id]

    # Query key in dict form
    data_key = {
        "FREQ": ["A"],
        "ITTER107": ["001001"],
        "SETTITOLARE": ["1"],
        "TIPO_DATO": ["AUTP"],
        "TIPO_GESTIONE": ["ALL"],
        "TIPSERVSOC": ["ALL"],
    }

    # The DSD lists its dimension components in the same order as the key,
    # followed by TIME_PERIOD
    actual_ids = [dim.id for dim in df.structure.dimensions.components]
    expected_ids = list(data_key.keys()) + ["TIME_PERIOD"]
    assert actual_ids == expected_ids

    # The data query from the report works
    req.data(df_id, key="A.001001+001002.1.AUTP.ALL.ALL")

    # A dict() key forces Request to make a sub-query for the DSD
    req.data(df_id, key=data_key)
def test_write_conceptscheme():
    """Convert a structure message and look up one concept's name."""
    with specimen('common-structure.xml') as f:
        msg = sdmx.read_sdmx(f)
        converted = sdmx.to_pandas(msg)

    # Concept schemes are keyed by ID under 'concept_scheme'
    concepts = converted['concept_scheme']['CROSS_DOMAIN_CONCEPTS']
    assert concepts.loc['UNIT_MEASURE', 'name'] == 'Unit of Measure'
def test_write_dataflow():
    """Convert the INSEE dataflow listing to pandas and check its head."""
    # Read the INSEE dataflow definition
    with specimen('INSEE/dataflow') as f:
        msg = sdmx.read_sdmx(f)

    # Convert only the dataflows to pandas
    converted = sdmx.to_pandas(msg, include='dataflow')
    flows = converted['dataflow']

    # Number of dataflows described in the file
    assert len(flows) == 663

    # IDs and names of the first dataflows
    prefix = 'Monthly Balance of Payments - '
    expected = pd.Series({
        'ACT-TRIM-ANC': 'Activity by sex and age - Quarterly series',
        'BPM6-CCAPITAL': prefix + 'Capital account',
        'BPM6-CFINANCIER': prefix + 'Financial account',
        'BPM6-CTRANSACTION': prefix + 'Current transactions account',
        'BPM6-TOTAL': prefix + 'Overall total and main headings',
    })
    assert_pd_equal(flows.head(), expected)
def test_write_constraint():
    """'constraint' argument to writer.write_dataset."""
    with specimen("ng-ts.xml") as f:
        msg = pandasdmx.read_sdmx(f)

    # The message only references its DSD, so the DSD has to be fetched
    assert msg.structure.is_external_reference

    # NB the specimen included in tests/data has 'ECB_EXR_NG' as the data
    #    structure ID; but a query against the web service gives 'ECB_EXR1'
    #    for the same data structure.
    dsd_id = "ECB_EXR1"
    request = pandasdmx.Request(msg.structure.maintainer.id)
    response = request.get("datastructure", dsd_id)
    dsd = response.structure[dsd_id]

    # Build a ContentConstraint limited to two currencies
    cc = dsd.make_constraint({"CURRENCY": "JPY+USD"})

    # Without the constraint, every observation is written
    unconstrained = pandasdmx.to_pandas(msg)
    assert len(unconstrained) == 12
    currencies = set(unconstrained.index.to_frame()["CURRENCY"])
    assert currencies == {"CHF", "GBP", "JPY", "USD"}

    # With the constraint, fewer items are written: only those matching it
    constrained = pandasdmx.to_pandas(msg, constraint=cc)
    assert len(constrained) == 6
    currencies = set(constrained.index.to_frame()["CURRENCY"])
    assert currencies == {"JPY", "USD"}
def test_write_conceptscheme():
    """Convert a structure message and look up one concept's name."""
    with specimen("common-structure.xml") as f:
        msg = pandasdmx.read_sdmx(f)
        converted = pandasdmx.to_pandas(msg)

    # Concept schemes are keyed by ID under "concept_scheme"
    concepts = converted["concept_scheme"]["CROSS_DOMAIN_CONCEPTS"]
    assert concepts.loc["UNIT_MEASURE", "name"] == "Unit of Measure"
def test_doc_howto_timeseries():
    """Put the TIME_PERIOD labels of a data set on datetime/period indexes."""
    with specimen("sg-ts.xml") as f:
        ds = pandasdmx.read_sdmx(f).data[0]

    # Convert to pd.Series; the time dimension is unstacked below
    series = pandasdmx.to_pandas(ds)

    # Time periods as columns, converted to a DatetimeIndex
    wide = series.unstack("TIME_PERIOD")
    wide.columns = pd.to_datetime(wide.columns)
    assert isinstance(wide.columns, pd.DatetimeIndex)

    # Time periods as the row index, converted to a DatetimeIndex
    tall = series.unstack("TIME_PERIOD").transpose()
    tall.index = pd.to_datetime(tall.index)
    assert isinstance(tall.index, pd.DatetimeIndex)

    # The same two layouts with pd.PeriodIndex
    wide_periods = wide.to_period(axis=1)
    assert isinstance(wide_periods.columns, pd.PeriodIndex)
    assert wide_periods.columns.freqstr == "M"

    tall_periods = tall.to_period(axis=0)
    assert isinstance(tall_periods.index, pd.PeriodIndex)
    assert tall_periods.index.freqstr == "M"
def test_write_constraint():
    """'constraint' argument to writer.write_dataset."""
    with specimen('ng-ts.xml') as f:
        msg = sdmx.read_sdmx(f)

    # The message only references its DSD, so the DSD has to be fetched
    assert msg.structure.is_external_reference

    # NB the specimen included in tests/data has 'ECB_EXR_NG' as the data
    #    structure ID; but a query against the web service gives 'ECB_EXR1'
    #    for the same data structure.
    dsd_id = 'ECB_EXR1'
    request = sdmx.Request(msg.structure.maintainer.id)
    response = request.get('datastructure', dsd_id)
    dsd = response.structure[dsd_id]

    # Build a ContentConstraint limited to two currencies
    cc = dsd.make_constraint({'CURRENCY': 'JPY+USD'})

    # Without the constraint, every observation is written
    unconstrained = sdmx.to_pandas(msg)
    assert len(unconstrained) == 12
    currencies = set(unconstrained.index.to_frame()['CURRENCY'])
    assert currencies == {'CHF', 'GBP', 'JPY', 'USD'}

    # With the constraint, fewer items are written: only those matching it
    constrained = sdmx.to_pandas(msg, constraint=cc)
    assert len(constrained) == 6
    currencies = set(constrained.index.to_frame()['CURRENCY'])
    assert currencies == {'JPY', 'USD'}
def test_message_repr(pattern, expected):
    """Compare repr() of the message read from specimen *pattern*.

    *expected* is either a compiled regular expression, which must match the
    whole repr, or an object compared to the repr for equality.
    """
    with specimen(pattern) as f:
        msg = pandasdmx.read_sdmx(f)

    text = repr(msg)
    if not isinstance(expected, re.Pattern):
        assert expected == text
        return

    assert expected.fullmatch(text)
def test_load_dataset(self, req):
    """Read stored dataflow, structure and data files for one dataset."""
    dataset_code = "IPI-2010-A21"
    dataset = DATASETS[dataset_code]

    # All dataflows
    dataflows = pandasdmx.read_sdmx(DATAFLOW_FP).dataflow
    assert len(dataflows) == 663
    assert dataset_code in dataflows

    # Data structure for the dataset under test
    datastructure_response = pandasdmx.read_sdmx(dataset["datastructure-fp"])
    assert dataset_code in datastructure_response.dataflow
    dsd = datastructure_response.dataflow[dataset_code].structure

    # Dimension IDs, leaving out the time dimension
    time_ids = ["TIME", "TIME_PERIOD"]
    dimensions = OrderedDict(
        (dim.id, dim) for dim in dsd.dimensions if dim.id not in time_ids
    )
    assert list(dimensions.keys()) == ["FREQ", "PRODUIT", "NATURE"]

    # Data for the dataset under test
    data = pandasdmx.read_sdmx(dataset["data-fp"])

    # Number of series
    series = data.data[0].series
    assert len(series) == dataset["series_count"]

    # First and last observation of the first series
    observations = series[0]
    first_obs, last_obs = observations[0], observations[-1]

    assert first_obs.dim == "2015-10"
    assert first_obs.value == "105.61"

    assert last_obs.dim == "1990-01"
    assert last_obs.value == "139.22"
def test_structure_roundtrip(pytestconfig, specimen_id, strict, tmp_path):
    """Test that SDMX-ML StructureMessages can be 'round-tripped'."""
    # Parse the specimen
    with specimen(specimen_id) as f:
        original = pandasdmx.read_sdmx(f)

    # Serialize it to XML in the temporary directory
    out_path = tmp_path / "output.xml"
    xml_bytes = pandasdmx.to_xml(original, pretty_print=True)
    out_path.write_bytes(xml_bytes)

    # Parse what was just written
    reread = pandasdmx.read_sdmx(out_path)

    # Both messages must compare equal. On failure, show the written XML in
    # verbose mode, otherwise only its path.
    assert original.compare(reread, strict), (
        out_path.read_text() if pytestconfig.getoption("verbose") else out_path
    )
def test_write_data_arguments():
    """to_pandas() rejects invalid values of the 'attributes' argument."""
    first_data_file = test_files(kind='data')['argvalues'][0]
    msg = sdmx.read_sdmx(first_data_file)

    # A non-string value is a TypeError
    with raises(TypeError):
        sdmx.to_pandas(msg, attributes=2)

    # A string with characters outside 'dgso' is a ValueError
    with raises(ValueError):
        sdmx.to_pandas(msg, attributes='foobarbaz')
def test_write_data_arguments():
    """to_pandas() rejects invalid values of the 'attributes' argument."""
    first_data_file = test_files(kind="data")["argvalues"][0]
    msg = pandasdmx.read_sdmx(first_data_file)

    # A non-string value is a TypeError
    with raises(TypeError):
        pandasdmx.to_pandas(msg, attributes=2)

    # A string with characters outside 'dgso' is a ValueError
    with raises(ValueError):
        pandasdmx.to_pandas(msg, attributes="foobarbaz")
def test_load_dataset(self, req):
    """Read stored dataflow, structure and data files for one dataset."""
    dataset_code = 'IPI-2010-A21'
    dataset = DATASETS[dataset_code]

    # All dataflows
    dataflows = sdmx.read_sdmx(DATAFLOW_FP).dataflow
    assert len(dataflows) == 663
    assert dataset_code in dataflows

    # Data structure for the dataset under test
    datastructure_response = sdmx.read_sdmx(dataset['datastructure-fp'])
    assert dataset_code in datastructure_response.dataflow
    dsd = datastructure_response.dataflow[dataset_code].structure

    # Dimension IDs, leaving out the time dimension
    time_ids = ['TIME', 'TIME_PERIOD']
    dimensions = OrderedDict(
        (dim.id, dim) for dim in dsd.dimensions if dim.id not in time_ids
    )
    assert list(dimensions.keys()) == ['FREQ', 'PRODUIT', 'NATURE']

    # Data for the dataset under test
    data = sdmx.read_sdmx(dataset['data-fp'])

    # Number of series
    series = data.data[0].series
    assert len(series) == dataset['series_count']

    # First and last observation of the first series
    observations = series[0]
    first_obs, last_obs = observations[0], observations[-1]

    assert first_obs.dim == '2015-10'
    assert first_obs.value == '105.61'

    assert last_obs.dim == '1990-01'
    assert last_obs.value == '139.22'
def test_write_categoryscheme():
    """Convert a category scheme to pandas; check a name and a parent."""
    with specimen("IPI-2010-A21-structure.xml") as f:
        msg = pandasdmx.read_sdmx(f)
        converted = pandasdmx.to_pandas(msg)

    scheme = converted["category_scheme"]["CLASSEMENT_DATAFLOWS"]

    expected_name = "National accounts (GDP, consumption...)"
    assert scheme.loc["COMPTA-NAT", "name"] == expected_name

    # Child categories are present and point to their parent
    assert scheme.loc["CNA-PIB-2005", "parent"] == "CNA-PIB"
def test_read_xml_structure_insee():
    """Read an INSEE structure specimen and check object references."""
    with specimen('IPI-2010-A21-structure.xml') as f:
        msg = sdmx.read_sdmx(f)

    dsd = msg.structure['IPI-2010-A21']

    # The dataflow refers to the very same DSD object, not to a copy
    assert msg.dataflow['IPI-2010-A21'].structure is dsd

    # All dimensions were loaded
    assert len(dsd.dimensions) == 4
def test_read_xml_structure_insee():
    """Read an INSEE data structure file and check object references."""
    path = test_data_path / 'insee' / 'insee-IPI-2010-A21-datastructure.xml'
    msg = sdmx.read_sdmx(path)

    dsd = msg.structure['IPI-2010-A21']

    # The dataflow refers to the very same DSD object, not to a copy
    assert msg.dataflow['IPI-2010-A21'].structure is dsd

    # All dimensions were loaded
    assert len(dsd.dimensions) == 4
def test_fixe_key_names(self, req):
    """Verify key or attribute contains '-' in name."""
    dataset_code = "CNA-2010-CONSO-SI-A17"
    dataset = DATASETS[dataset_code]

    # Structure: dimension IDs other than the time dimension
    datastructure_response = pandasdmx.read_sdmx(dataset["datastructure-fp"])
    assert dataset_code in datastructure_response.dataflow
    dsd = datastructure_response.dataflow[dataset_code].structure

    time_ids = ["TIME", "TIME_PERIOD"]
    dimensions = OrderedDict(
        (dim.id, dim) for dim in dsd.dimensions if dim.id not in time_ids
    )
    expected_dims = ["SECT-INST", "OPERATION", "PRODUIT", "PRIX"]
    assert list(dimensions.keys()) == expected_dims

    # Data: the first series key has the same dimension IDs
    data = pandasdmx.read_sdmx(dataset["data-fp"])
    series = data.data[0].series
    series_key = list(series.keys())[0]

    assert list(series_key.values.keys()) == [
        "SECT-INST",
        "OPERATION",
        "PRODUIT",
        "PRIX",
    ]

    # ... and these attribute IDs, in this order
    expected_attrs = [
        "FREQ",
        "IDBANK",
        "TITLE",
        "LAST_UPDATE",
        "UNIT_MEASURE",
        "UNIT_MULT",
        "REF_AREA",
        "DECIMALS",
        "BASE_PER",
        "TIME_PER_COLLECT",
    ]
    assert list(series_key.attrib.keys()) == expected_attrs
def test_write_data(data_path):
    """Convert the data message at *data_path* to pandas and check it."""
    result = sdmx.to_pandas(sdmx.read_sdmx(data_path))

    # expected_data() may return None; the comparison only runs when it
    # returns something
    expected = expected_data(data_path)
    if expected is not None:
        print(expected, result, sep='\n')
        assert_pd_equal(expected, result)

    # TODO incomplete
    allowed_types = (pd.Series, pd.DataFrame, list)
    assert isinstance(result, allowed_types), type(result)
def _read_structure_message(path):
    """Read the SDMX structure message at *path* with the installed pandasdmx.

    With pandasdmx 0.9 the file is loaded through ``Request.get(fromfile=...)``
    and the result of the response's ``write()`` is returned. With 1.x the
    return value of ``pandasdmx.read_sdmx(path)`` is returned.

    Raises:
        RuntimeError: if the installed pandasdmx version is neither 0.9 nor
            1.x.
    """
    version = pandasdmx.__version__

    # Need to support pandasdmx==0.9 because 1.0 is not available for Python3.6
    if version.startswith("0.9"):
        req = pandasdmx.Request()
        structure = req.get(
            fromfile=str(path), writer="pandasdmx.writer.structure2pd"
        )
        return structure.write()

    if version.startswith("1."):
        return pandasdmx.read_sdmx(path)

    # RuntimeError is a subclass of Exception, so callers that catch
    # Exception are unaffected
    raise RuntimeError(f"pandasdmx version is {version}")
def test_exr_constraints():
    """Check descriptors of ECB_EXR1, then (xfail) constrained code lists."""
    with specimen('1/structure-full.xml') as f:
        m = sdmx.read_sdmx(f)

    ECB_EXR1 = m.structure['ECB_EXR1']

    # Test DimensionDescriptor
    dd = ECB_EXR1.dimensions

    # Correct order
    assert dd[0].id == 'FREQ'

    # Correct number of dimensions
    assert len(dd.components) == 6

    # Dimensions can be retrieved by name; membership can be tested
    assert 'W' in dd.get('FREQ')

    # Similar tests for AttributeDescriptor
    ad = ECB_EXR1.attributes
    assert len(ad.components) == 24
    assert ad[-1].id == 'UNIT_MULT'
    assert '5' in ad.get('UNIT_MULT')

    # xfail() raises, so nothing below runs until it is removed
    pytest.xfail('constrained codes not implemented')

    # TODO
    # Compare with ==; "assert len(...), 14" only tested truthiness
    assert len(m._constrained_codes) == 14

    assert 'W' not in m._constrained_codes.FREQ

    key = {'FREQ': ['W']}

    assert m.in_codes(key)

    assert not m.in_constraints(key, raise_error=False)

    with pytest.raises(ValueError):
        m.in_constraints(key)

    assert m.in_constraints({'CURRENCY': ['CHF']})

    # test with invalid key
    with pytest.raises(TypeError):
        m._in_constraints({'FREQ': 'A'})

    # structure writer with constraints
    out = sdmx.to_pandas(m)
    cl = out.codelist
    assert cl.shape == (3555, 2)

    # unconstrained codelists
    out = sdmx.to_pandas(m, constraint=False)
    cl = out.codelist
    # Compare with ==; "assert cl.shape, (4177, 2)" only tested truthiness
    assert cl.shape == (4177, 2)
def test_exr_constraints():
    """Check descriptors of ECB_EXR1, then (xfail) constrained code lists."""
    with specimen("1/structure-full.xml") as f:
        m = pandasdmx.read_sdmx(f)

    ECB_EXR1 = m.structure["ECB_EXR1"]

    # Test DimensionDescriptor
    dd = ECB_EXR1.dimensions

    # Correct order
    assert dd[0].id == "FREQ"

    # Correct number of dimensions
    assert len(dd.components) == 6

    # Dimensions can be retrieved by name; membership can be tested
    assert "W" in dd.get("FREQ")

    # Similar tests for AttributeDescriptor
    ad = ECB_EXR1.attributes
    assert len(ad.components) == 24
    assert ad[-1].id == "UNIT_MULT"
    assert "5" in ad.get("UNIT_MULT")

    # xfail() raises, so nothing below runs until it is removed
    pytest.xfail("constrained codes not implemented")

    # Compare with ==; "assert len(...), 14" only tested truthiness
    assert len(m._constrained_codes) == 14

    assert "W" not in m._constrained_codes.FREQ

    key = {"FREQ": ["W"]}

    assert m.in_codes(key)

    assert not m.in_constraints(key, raise_error=False)

    with pytest.raises(ValueError):
        m.in_constraints(key)

    assert m.in_constraints({"CURRENCY": ["CHF"]})

    # test with invalid key
    with pytest.raises(TypeError):
        m._in_constraints({"FREQ": "A"})

    # structure writer with constraints
    out = pandasdmx.to_pandas(m)
    cl = out.codelist
    assert cl.shape == (3555, 2)

    # unconstrained codelists
    out = pandasdmx.to_pandas(m, constraint=False)
    cl = out.codelist
    # Compare with ==; "assert cl.shape, (4177, 2)" only tested truthiness
    assert cl.shape == (4177, 2)
def test_write_codelist():
    """Convert code lists from specimen files to pandas and check them."""
    # Read the common structure specimen and pull out its code lists
    with specimen("common-structure.xml") as f:
        dsd_common = pandasdmx.read_sdmx(f)
        all_codelists = pandasdmx.to_pandas(dsd_common)["codelist"]

    # The specimen contains 5 code lists
    assert len(all_codelists) == 5

    # CL_FREQ has the expected number of items
    cl_freq = all_codelists["CL_FREQ"]
    assert len(cl_freq) == 8

    # An item's name is looked up by its ID
    assert cl_freq["A"] == "Annual"

    # A non-hierarchical code list carries a string name
    assert cl_freq.name == "Code list for Frequency (FREQ)"

    # Now a hierarchical code list, converted on its own
    with specimen("codelist_partial.xml") as f:
        msg = pandasdmx.read_sdmx(f)
    cl_area = pandasdmx.to_pandas(msg.codelist["CL_AREA"])

    # A hierarchical list has a 'parent' column; the parent of Africa is the
    # World
    assert cl_area.loc["002", "parent"] == "001"

    # pandas itself can then be used to attach the parents' names
    merge_options = dict(
        how="left",
        left_on="parent",
        right_index=True,
        suffixes=("", "_parent"),
    )
    with_parents = pd.merge(cl_area, cl_area, **merge_options)
    assert with_parents.loc["002", "name_parent"] == "World"
def test_data_roundtrip(pytestconfig, data_id, structure_id, tmp_path):
    """Test that SDMX-ML DataMessages can be 'round-tripped'."""
    # Take the first data structure definition from the structure specimen
    with specimen(structure_id) as f:
        dsd = pandasdmx.read_sdmx(f).structure[0]

    # Parse the data specimen against that DSD
    with specimen(data_id) as f:
        original = pandasdmx.read_sdmx(f, dsd=dsd)

    # Serialize it to XML in the temporary directory
    out_path = tmp_path / "output.xml"
    xml_bytes = pandasdmx.to_xml(original, pretty_print=True)
    out_path.write_bytes(xml_bytes)

    # Parse what was just written, with the same DSD
    reread = pandasdmx.read_sdmx(out_path, dsd=dsd)

    # Both messages must compare equal. On failure, show the written XML in
    # verbose mode, otherwise only its path.
    assert original.compare(reread, strict=True), (
        out_path.read_text() if pytestconfig.getoption("verbose") else out_path
    )
def test_write_categoryscheme():
    """Convert a category scheme to pandas; check a name and a parent."""
    with specimen('IPI-2010-A21-structure.xml') as f:
        msg = sdmx.read_sdmx(f)
        # The debugging print of msg.category_scheme that used to sit here
        # was dropped; it asserted nothing
        data = sdmx.to_pandas(msg)

    cs = data['category_scheme']['CLASSEMENT_DATAFLOWS']

    assert (cs.loc['COMPTA-NAT', 'name']
            == 'National accounts (GDP, consumption...)')

    # Children appear
    assert cs.loc['CNA-PIB-2005', 'parent'] == 'CNA-PIB'
def test_flat():
    """Build a flat DataMessage by hand and compare it with flat.json."""
    # Start from a bare message and recreate the content of exr-flat.json
    msg = DataMessage()
    msg.header = Header(
        id="62b5f19d-f1c9-495d-8446-a3661ed24753",
        prepared="2012-11-29T08:40:26Z",
        sender=model.Agency(id="ECB"),
    )

    ds = DataSet()

    # All four observations share one OBS_STATUS attribute value
    obs_status = DataAttribute(id="OBS_STATUS")
    attr = {"OBS_STATUS": AttributeValue(value_for=obs_status, value="A")}

    def add_obs(obs_key, value):
        """Append one observation with the shared attributes to the data set."""
        ds.obs.append(
            Observation(dimension=obs_key, value=value, attached_attribute=attr)
        )

    # NZD on two dates; each later key is a modified copy of the previous one
    nzd_first = Key(
        FREQ="D",
        CURRENCY="NZD",
        CURRENCY_DENOM="EUR",
        EXR_TYPE="SP00",
        EXR_SUFFIX="A",
        TIME_PERIOD="2013-01-18",
    )
    add_obs(nzd_first, 1.5931)

    nzd_second = nzd_first.copy(TIME_PERIOD="2013-01-21")
    add_obs(nzd_second, 1.5925)

    # RUB on the same two dates
    rub_first = nzd_second.copy(CURRENCY="RUB", TIME_PERIOD="2013-01-18")
    add_obs(rub_first, 40.3426)

    rub_second = rub_first.copy(TIME_PERIOD="2013-01-21")
    add_obs(rub_second, 40.3000)

    msg.data.append(ds)

    # Convert the hand-built message to pandas
    built = pandasdmx.to_pandas(msg)

    # ... and do the same for the message read from the specimen
    with specimen("flat.json") as f:
        ref = pandasdmx.read_sdmx(f)
        reference = pandasdmx.to_pandas(ref)

    assert_pd_equal(built, reference)
def test_read_ss_xml():
    """Read a data message with a DSD and check the objects are linked."""
    exr_dir = test_data_path / 'exr' / '1'

    # Read the DSD
    dsd = sdmx.read_sdmx(exr_dir / 'structure.xml').structure['ECB_EXR1']

    # Read a data message, passing the DSD
    msg = sdmx.read_sdmx(exr_dir / 'M.USD.EUR.SP00.A.xml', dsd=dsd)
    ds = msg.data[0]

    # The data set in the message is structured by the DSD
    assert ds.structured_by is dsd

    # Objects referenced from the data set are the DSD's own objects
    s0_key = list(ds.series.keys())[0]
    first_obs_key = ds.obs[0].key

    # AttributeValue.value_for
    decimals = s0_key.attrib['DECIMALS']
    assert decimals.value_for is dsd.attributes.get('DECIMALS')

    # SeriesKey.described_by
    assert s0_key.described_by is dsd.dimensions

    # Key.described_by
    assert first_obs_key.described_by is dsd.dimensions

    # KeyValue.value_for
    assert first_obs_key.values[0].value_for is dsd.dimensions.get('FREQ')

    # DSD information that is not in the data message can be looked up by
    # navigating object relationships
    time_format = s0_key.attrib['TIME_FORMAT'].value_for
    assert len(time_format.related_to.dimensions) == 5
def test_flat():
    """Build a flat DataMessage by hand and compare it with flat.json."""
    # Start from a bare message and recreate the content of exr-flat.json
    msg = DataMessage()
    msg.header = Header(
        id='62b5f19d-f1c9-495d-8446-a3661ed24753',
        prepared='2012-11-29T08:40:26Z',
        sender='ECB',
    )

    ds = DataSet()

    # All four observations share one OBS_STATUS attribute value
    obs_status = DataAttribute(id='OBS_STATUS')
    attr = {'OBS_STATUS': AttributeValue(value_for=obs_status, value='A')}

    def add_obs(obs_key, value):
        """Append one observation with the shared attributes to the data set."""
        ds.obs.append(
            Observation(dimension=obs_key, value=value, attached_attribute=attr)
        )

    # NZD on two dates; each later key is a modified copy of the previous one
    nzd_first = Key(
        FREQ='D',
        CURRENCY='NZD',
        CURRENCY_DENOM='EUR',
        EXR_TYPE='SP00',
        EXR_SUFFIX='A',
        TIME_PERIOD='2013-01-18',
    )
    add_obs(nzd_first, 1.5931)

    nzd_second = nzd_first.copy(TIME_PERIOD='2013-01-21')
    add_obs(nzd_second, 1.5925)

    # RUB on the same two dates
    rub_first = nzd_second.copy(CURRENCY='RUB', TIME_PERIOD='2013-01-18')
    add_obs(rub_first, 40.3426)

    rub_second = rub_first.copy(TIME_PERIOD='2013-01-21')
    add_obs(rub_second, 40.3000)

    msg.data.append(ds)

    # Convert the hand-built message to pandas
    built = sdmx.to_pandas(msg)

    # ... and do the same for the message read from the specimen
    with specimen('flat.json') as f:
        ref = sdmx.read_sdmx(f)
        reference = sdmx.to_pandas(ref)

    assert_pd_equal(built, reference)