def test_write_conceptscheme():
    """Convert a structure specimen to pandas and check one concept name."""
    with specimen("common-structure.xml") as f:
        converted = sdmx.to_pandas(sdmx.read_sdmx(f))

    concepts = converted["concept_scheme"]["CROSS_DOMAIN_CONCEPTS"]
    assert concepts.loc["UNIT_MEASURE", "name"] == "Unit of Measure"
def test_freq_in_series_attribute(self, req):
    """Regression test for issues #39 and #41.

    INSEE time series provide the FREQ value as an attribute on the series
    instead of a dimension. This caused a runtime error when writing as a
    pandas dataframe.
    """
    data_path = SERIES["UNEMPLOYMENT_CAT_A_B_C"]["data-fp"]
    message = sdmx.read_sdmx(data_path)

    # Passes as long as the conversion does not raise
    sdmx.to_pandas(message)
def test_gh_75(self, req):
    """Test of https://github.com/dr-leo/pandaSDMX/pull/75."""
    df_id = "47_850"

    # The dataflow is read from a stored specimen; the originally reported
    # query was: req.dataflow(df_id).dataflow[df_id]
    with specimen("47_850-structure") as f:
        df = sdmx.read_sdmx(f).dataflow[df_id]

    # Mapping used as the key for the data query
    data_key = {
        "FREQ": ["A"],
        "ITTER107": ["001001"],
        "SETTITOLARE": ["1"],
        "TIPO_DATO": ["AUTP"],
        "TIPO_GESTIONE": ["ALL"],
        "TIPSERVSOC": ["ALL"],
    }

    # Dimension components are in the correct order
    dimension_ids = [dim.id for dim in df.structure.dimensions.components]
    assert dimension_ids == [*data_key, "TIME_PERIOD"]

    # Reported data query works
    req.data(df_id, key="A.001001+001002.1.AUTP.ALL.ALL")

    # Use a dict() key to force Request to make a sub-query for the DSD
    req.data(df_id, key=data_key)
def test_message_repr(specimen, pattern, expected):
    """Check repr() of a specimen message against a regex or a literal string."""
    with specimen(pattern) as f:
        msg = sdmx.read_sdmx(f)

    text = repr(msg)

    if isinstance(expected, re.Pattern):
        # A compiled pattern must match the entire repr()
        assert expected.fullmatch(text)
        return

    # __repr__() and __str__() give the same, expected result
    assert expected == text == str(msg)
def test_load_dataset(self, base_path):
    """Load dataflows, structure and data for one dataset and spot-check them."""
    dataset_code = "IPI-2010-A21"
    dataset_info = DATASETS[dataset_code]

    # Load all dataflows
    all_flows = sdmx.read_sdmx(base_path / DATAFLOW_FP).dataflow
    assert len(all_flows) == 663
    assert dataset_code in all_flows

    # Load the data structure for the current dataset_code
    structure_msg = sdmx.read_sdmx(base_path / dataset_info["datastructure-fp"])
    assert dataset_code in structure_msg.dataflow
    dsd = structure_msg.dataflow[dataset_code].structure

    # Verify the list of dimensions, leaving out the time dimension
    non_time = OrderedDict(
        (dim.id, dim)
        for dim in dsd.dimensions
        if dim.id not in ("TIME", "TIME_PERIOD")
    )
    assert list(non_time.keys()) == ["FREQ", "PRODUIT", "NATURE"]

    # Load the data for the current dataset
    data_msg = sdmx.read_sdmx(base_path / dataset_info["data-fp"])

    # Verify series count and values
    series = data_msg.data[0].series
    assert len(series) == dataset_info["series_count"]

    # The first series is indexed directly to reach its observations
    observations = series[0]
    earliest_listed, latest_listed = observations[0], observations[-1]

    assert earliest_listed.dim == "2015-10"
    assert earliest_listed.value == "105.61"
    assert latest_listed.dim == "1990-01"
    assert latest_listed.value == "139.22"
def test_structure_roundtrip(pytestconfig, specimen_id, strict, tmp_path):
    """Test that SDMX-ML StructureMessages can be 'round-tripped'."""
    # Read a specimen file
    with specimen(specimen_id) as f:
        original = sdmx.read_sdmx(f)

    # Write to file
    path = tmp_path / "output.xml"
    xml_bytes = sdmx.to_xml(original, pretty_print=True)
    path.write_bytes(xml_bytes)

    # Read again
    reread = sdmx.read_sdmx(path)

    # Contents are identical; on failure show the file text (verbose) or its path
    assert original.compare(reread, strict), (
        path.read_text() if pytestconfig.getoption("verbose") else path
    )
def test_write_data_arguments():
    """Check that to_pandas() rejects invalid ``attributes`` arguments."""
    first_data_file = test_files(kind="data")["argvalues"][0]
    msg = sdmx.read_sdmx(first_data_file)

    # Attributes must be a string
    with raises(TypeError):
        sdmx.to_pandas(msg, attributes=2)

    # Attributes must contain only 'dgso'
    with raises(ValueError):
        sdmx.to_pandas(msg, attributes="foobarbaz")
def test_write_categoryscheme(specimen):
    """Convert a category scheme to pandas and check names and parents."""
    with specimen("IPI-2010-A21-structure.xml") as f:
        converted = sdmx.to_pandas(sdmx.read_sdmx(f))

    scheme = converted["category_scheme"]["CLASSEMENT_DATAFLOWS"]

    expected_name = "National accounts (GDP, consumption...)"
    assert scheme.loc["COMPTA-NAT", "name"] == expected_name

    # Children appear
    assert scheme.loc["CNA-PIB-2005", "parent"] == "CNA-PIB"
def test_read_xml_structure_insee(specimen):
    """Check object identity and dimension count in an INSEE structure message."""
    with specimen("IPI-2010-A21-structure.xml") as f:
        msg = sdmx.read_sdmx(f)

    dsd = msg.structure["IPI-2010-A21"]

    # Same objects referenced
    assert msg.dataflow["IPI-2010-A21"].structure is dsd

    # Number of dimensions loaded correctly
    assert len(dsd.dimensions) == 4
def test_fixe_key_names(self, base_path):
    """Verify key or attribute contains '-' in name."""
    dataset_code = "CNA-2010-CONSO-SI-A17"
    dataset_info = DATASETS[dataset_code]
    expected_dims = ["SECT-INST", "OPERATION", "PRODUIT", "PRIX"]

    structure_msg = sdmx.read_sdmx(base_path / dataset_info["datastructure-fp"])
    assert dataset_code in structure_msg.dataflow
    dsd = structure_msg.dataflow[dataset_code].structure

    # Dimensions of the structure, leaving out the time dimension
    non_time = OrderedDict(
        (dim.id, dim)
        for dim in dsd.dimensions
        if dim.id not in ("TIME", "TIME_PERIOD")
    )
    assert list(non_time.keys()) == expected_dims

    data_msg = sdmx.read_sdmx(base_path / dataset_info["data-fp"])
    series = data_msg.data[0].series

    # Inspect the key of the first series only
    first_key = list(series.keys())[0]

    assert list(first_key.values.keys()) == expected_dims
    assert list(first_key.attrib.keys()) == [
        "FREQ",
        "IDBANK",
        "TITLE",
        "LAST_UPDATE",
        "UNIT_MEASURE",
        "UNIT_MULT",
        "REF_AREA",
        "DECIMALS",
        "BASE_PER",
        "TIME_PER_COLLECT",
    ]
def test_write_data(specimen, path):
    """Convert a data message to pandas and compare with expected data, if any."""
    result = sdmx.to_pandas(sdmx.read_sdmx(path))

    reference = specimen.expected_data(path)
    if reference is not None:
        # Show both objects to help diagnose a mismatch
        print(reference, result, sep="\n")
        assert_pd_equal(reference, result)

    # TODO incomplete
    assert isinstance(result, (pd.Series, pd.DataFrame, list)), type(result)
def test_write_data_arguments(specimen):
    """Check that to_pandas() rejects invalid ``attributes`` arguments."""
    # The identity here is not important; any non-empty DataMessage will work
    with specimen("INSEE/CNA-2010-CONSO-SI-A17.xml") as f:
        data_msg = sdmx.read_sdmx(f)

    # Attributes must be a string
    with raises(TypeError):
        sdmx.to_pandas(data_msg, attributes=2)

    # Attributes must contain only 'dgso'
    with raises(ValueError):
        sdmx.to_pandas(data_msg, attributes="foobarbaz")
def test_exr_constraints():
    """Check the descriptors of ECB_EXR1 and, eventually, constrained codes.

    The first part verifies the dimension and attribute descriptors of the
    ``ECB_EXR1`` data structure. The second part, covering constrained codes,
    follows a call to ``pytest.xfail()``, which ends the test at that point, so
    those statements do not run until that call is removed.
    """
    with specimen("1/structure-full.xml") as f:
        m = sdmx.read_sdmx(f)

    ECB_EXR1 = m.structure["ECB_EXR1"]

    # Test DimensionDescriptor
    dd = ECB_EXR1.dimensions

    # Correct order
    assert dd[0].id == "FREQ"

    # Correct number of dimensions
    assert len(dd.components) == 6

    # Dimensions can be retrieved by name; membership can be tested
    assert "W" in dd.get("FREQ")

    # Similar tests for AttributeDescriptor
    ad = ECB_EXR1.attributes
    assert len(ad.components) == 24
    assert ad[-1].id == "UNIT_MULT"
    assert "5" in ad.get("UNIT_MULT")

    pytest.xfail("constrained codes not implemented")

    # Compare with ==; `assert x, 14` would only test the truthiness of x and
    # use 14 as the failure message
    assert len(m._constrained_codes) == 14

    assert "W" not in m._constrained_codes.FREQ

    key = {"FREQ": ["W"]}

    assert m.in_codes(key)

    assert not m.in_constraints(key, raise_error=False)

    with pytest.raises(ValueError):
        m.in_constraints(key)

    assert m.in_constraints({"CURRENCY": ["CHF"]})

    # test with invalid key
    # NOTE(review): this calls `_in_constraints` while the calls above use
    # `in_constraints` — confirm which name is intended
    with pytest.raises(TypeError):
        m._in_constraints({"FREQ": "A"})

    # structure writer with constraints
    out = sdmx.to_pandas(m)
    cl = out.codelist
    assert cl.shape == (3555, 2)

    # unconstrained codelists
    out = sdmx.to_pandas(m, constraint=False)
    cl = out.codelist
    # Compare with ==; `assert cl.shape, (...)` would pass for any non-empty shape
    assert cl.shape == (4177, 2)
def test_write_codelist(specimen):
    """Convert flat and hierarchical code lists to pandas and inspect them."""
    # Retrieve codelists from a test specimen and convert to pandas
    with specimen("common-structure.xml") as f:
        codelists = sdmx.to_pandas(sdmx.read_sdmx(f))["codelist"]

    # File contains 5 code lists
    assert len(codelists) == 5

    # Code lists have expected number of items
    freq = codelists["CL_FREQ"]
    assert len(freq) == 8

    # Item names can be retrieved by ID
    assert freq["A"] == "Annual"

    # Non-hierarchical code list has a string name
    assert freq.name == "Code list for Frequency (FREQ)"

    # Hierarchical code list
    with specimen("codelist_partial.xml") as f:
        partial_msg = sdmx.read_sdmx(f)

    # Convert single codelist
    cl_area = sdmx.to_pandas(partial_msg.codelist["CL_AREA"])

    # Hierarchical list has a 'parent' column; parent of Africa is the World
    assert cl_area.loc["002", "parent"] == "001"

    # Pandas features can be used to merge parent names
    merge_options = {
        "how": "left",
        "left_on": "parent",
        "right_index": True,
        "suffixes": ("", "_parent"),
    }
    with_parent_names = pd.merge(cl_area, cl_area, **merge_options)
    assert with_parent_names.loc["002", "name_parent"] == "World"
def test_data_roundtrip(pytestconfig, data_id, structure_id, tmp_path):
    """Test that SDMX-ML DataMessages can be 'round-tripped'."""
    # Read structure from file
    with specimen(structure_id) as f:
        structure_msg = sdmx.read_sdmx(f)
        dsd = structure_msg.structure[0]

    # Read data from file, using the DSD
    with specimen(data_id) as f:
        original = sdmx.read_sdmx(f, dsd=dsd)

    # Write to file
    path = tmp_path / "output.xml"
    xml_bytes = sdmx.to_xml(original, pretty_print=True)
    path.write_bytes(xml_bytes)

    # Read again, using the same DSD
    reread = sdmx.read_sdmx(path, dsd=dsd)

    # Contents are identical; on failure show the file text (verbose) or its path
    assert original.compare(reread, strict=True), (
        path.read_text() if pytestconfig.getoption("verbose") else path
    )
def test_flat():
    """Build a DataMessage by hand and compare it with the 'flat.json' specimen."""
    # Create a bare Message
    msg = DataMessage()

    # Recreate the content from exr-flat.json
    msg.header = Header(
        id="62b5f19d-f1c9-495d-8446-a3661ed24753",
        prepared="2012-11-29T08:40:26Z",
        sender=model.Agency(id="ECB"),
    )

    ds = DataSet()

    # Create a Key and attributes
    key = Key(
        FREQ="D",
        CURRENCY="NZD",
        CURRENCY_DENOM="EUR",
        EXR_TYPE="SP00",
        EXR_SUFFIX="A",
        TIME_PERIOD="2013-01-18",
    )
    obs_status = DataAttribute(id="OBS_STATUS")
    attr = {"OBS_STATUS": AttributeValue(value_for=obs_status, value="A")}

    # First observation uses the key as created
    ds.obs.append(Observation(dimension=key, value=1.5931, attached_attribute=attr))

    # Each later observation derives its key from the previous one
    later_observations = (
        ({"TIME_PERIOD": "2013-01-21"}, 1.5925),
        ({"CURRENCY": "RUB", "TIME_PERIOD": "2013-01-18"}, 40.3426),
        ({"TIME_PERIOD": "2013-01-21"}, 40.3000),
    )
    for changes, value in later_observations:
        key = key.copy(**changes)
        ds.obs.append(
            Observation(dimension=key, value=value, attached_attribute=attr)
        )

    msg.data.append(ds)

    # Write to pd.DataFrame
    built = sdmx.to_pandas(msg)

    with specimen("flat.json") as f:
        ref = sdmx.read_sdmx(f)
    from_specimen = sdmx.to_pandas(ref)

    assert_pd_equal(built, from_specimen)
def test_read_ss_xml(specimen):
    """Read a data message with an explicit DSD and check object references."""
    # opened=False: the specimen is used as a path, not read from directly
    with specimen("M.USD.EUR.SP00.A.xml", opened=False) as f:
        msg_path = f
        dsd_path = f.parent / "structure.xml"

    # Read the DSD
    dsd = sdmx.read_sdmx(dsd_path).structure["ECB_EXR1"]

    # Read a data message
    msg = sdmx.read_sdmx(msg_path, dsd=dsd)
    ds = msg.data[0]

    # The dataset in the message is structured by the DSD
    assert ds.structured_by is dsd

    # Structures referenced in the dataset are from the dsd
    first_series_key = list(ds.series.keys())[0]

    # AttributeValue.value_for
    decimals = dsd.attributes.get("DECIMALS")
    assert first_series_key.attrib["DECIMALS"].value_for is decimals

    # SeriesKey.described_by
    assert first_series_key.described_by is dsd.dimensions

    # Key.described_by
    assert ds.obs[0].key.described_by is dsd.dimensions

    # KeyValue.value_for
    assert ds.obs[0].key.values[0].value_for is dsd.dimensions.get("FREQ")

    # DSD information that is not in the data message can be looked up through
    # navigating object relationships
    time_format = first_series_key.attrib["TIME_FORMAT"].value_for
    assert len(time_format.related_to.dimensions) == 5
def test_structuremessage(tmp_path, structuremessage):
    """Write a StructureMessage to XML, read it back and compare."""
    xml_bytes = sdmx.to_xml(structuremessage, pretty_print=True)
    print(xml_bytes.decode())

    # Message can be round-tripped to/from file
    path = tmp_path / "output.xml"
    path.write_bytes(xml_bytes)
    reread = sdmx.read_sdmx(path)

    # Contents match the original object
    reread_name = reread.codelist["CL_COLLECTION"]["A"].name["en"]
    original_name = structuremessage.codelist["CL_COLLECTION"]["A"].name["en"]
    assert reread_name == original_name

    # False because `structuremessage` lacks URNs, which are constructed
    # automatically by `to_xml`
    assert not reread.compare(structuremessage, strict=True)

    # Compares equal when allowing this difference
    assert reread.compare(structuremessage, strict=False)
def test_sdmx_roundtrip(tmp_path):
    """Write the generated structure to XML, read it back, check its constraint."""
    path = tmp_path / "structure.xml"

    # Structure can be written
    with open(path, "wb") as out:
        out.write(sdmx.to_xml(generate(), pretty_print=True))

    # Structure can be read
    sm = sdmx.read_sdmx(path)

    # One CubeRegion
    assert len(sm.constraint["PRICE_FUEL"].data_content_region) == 1

    # One dimension with a MemberSelection
    cr = sm.constraint["PRICE_FUEL"].data_content_region[0]
    assert {d.id for d in cr.member.keys()} == {"FUEL"}

    # 3 values in the MemberSelection
    assert len(cr.member["FUEL"].values) == 3
def test_write_agencyscheme(specimen):
    """Convert an agency scheme to pandas; check item and attribute access."""
    # Convert an agency scheme
    with specimen("ECB/orgscheme.xml") as f:
        converted = sdmx.to_pandas(sdmx.read_sdmx(f))

    assert converted["organisation_scheme"]["AGENCIES"]["ESTAT"] == "Eurostat"

    # to_pandas only returns keys for non-empty attributes of StructureMessage
    # https://github.com/dr-leo/pandaSDMX/issues/90
    assert set(converted.keys()) == {"organisation_scheme"}

    # Attribute access works
    assert converted.organisation_scheme.AGENCIES.ESTAT == "Eurostat"

    # Attribute access raises for each of these names
    with pytest.raises(AttributeError):
        converted.codelist
    with pytest.raises(AttributeError):
        converted.dataflow
    with pytest.raises(AttributeError):
        converted.structure
def test_read_sdmx(tmp_path, specimen):
    """Check how read_sdmx() infers, or is told, the message format."""
    # Copy the file to a temporary file with an unrecognizable suffix
    target = tmp_path / "foo.badsuffix"
    with specimen("flat.json", opened=False) as original:
        # write_text() opens, writes and closes the file, so the content is
        # on disk before the file is read back below
        target.write_text(original.read_text())

    # With unknown file extension, read_sdmx() peeks at the file content
    sdmx.read_sdmx(target)

    # Format can be inferred from an already-open file without extension
    with specimen("flat.json") as f:
        sdmx.read_sdmx(f)

    # Exception raised when the file contents don't allow to guess the format
    bad_file = BytesIO(b"#! neither XML nor JSON")
    exc = (
        "cannot infer SDMX message format from path None, format={}, or content "
        "'#! ne..'"
    )
    with pytest.raises(RuntimeError, match=exc.format("None")):
        sdmx.read_sdmx(bad_file)

    # Using the format= argument forces a certain reader to be used
    with pytest.raises(json.JSONDecodeError):
        sdmx.read_sdmx(bad_file, format="JSON")
def test_write_dataflow(specimen):
    """Convert the INSEE dataflow listing to pandas; check size and first rows."""
    # Read the INSEE dataflow definition
    with specimen("INSEE/dataflow") as f:
        msg = sdmx.read_sdmx(f)

    # Convert to pandas
    flows = sdmx.to_pandas(msg, include="dataflow")["dataflow"]

    # Number of Dataflows described in the file
    assert len(flows) == 663

    # ID and names of first Dataflows
    mbop = "Monthly Balance of Payments - "
    expected = pd.Series(
        {
            "ACT-TRIM-ANC": "Activity by sex and age - Quarterly series",
            "BPM6-CCAPITAL": f"{mbop}Capital account",
            "BPM6-CFINANCIER": f"{mbop}Financial account",
            "BPM6-CTRANSACTION": f"{mbop}Current transactions account",
            "BPM6-TOTAL": f"{mbop}Overall total and main headings",
        }
    )
    assert_pd_equal(flows.head(), expected)
def test_msg(self, path, dsd):
    """Check that the message file can be parsed using the given DSD."""
    message_file = path / self.filename

    # The message can be parsed
    sdmx.read_sdmx(message_file, dsd=dsd)
def msg(self, path, dsd):
    """Yield the message read from ``path / self.filename`` using `dsd`.

    Presumably used as a pytest fixture; the decorator is not visible here.
    """
    message_file = path / self.filename
    yield sdmx.read_sdmx(message_file, dsd=dsd)
def dsd(self, path):
    """Yield the first structure in the file ``path / self.dsd_filename``.

    Presumably used as a pytest fixture; the decorator is not visible here.
    """
    structure_msg = sdmx.read_sdmx(path / self.dsd_filename)
    yield structure_msg.structure[0]
def test_bare_series(specimen):
    """Check that the 'ng-ts.xml' specimen can be read without raising."""
    with specimen("ng-ts.xml") as source:
        sdmx.read_sdmx(source)
def get_dsd():
    """Return the first structure found in the file 'SDG_DSD.xml'."""
    # The file name is relative, so it is resolved against the working directory
    return sdmx.read_sdmx('SDG_DSD.xml').structure[0]
def retrieve_dsd(self, dsd):
    """Return the first structure from `dsd`, downloading it first if needed.

    `dsd` is a string. When it starts with 'http' it is treated as a URL and
    saved to 'SDG_DSD.xml', which is then read; otherwise it is passed to
    ``sdmx.read_sdmx()`` unchanged.
    """
    is_remote = dsd.startswith('http')
    if is_remote:
        # Save the remote file under a fixed local name, then read that copy
        urlretrieve(dsd, 'SDG_DSD.xml')
        dsd = 'SDG_DSD.xml'

    return sdmx.read_sdmx(dsd).structure[0]
def test_writer_structure(path):
    """Check that the message at `path` converts to pandas without raising."""
    sdmx.to_pandas(sdmx.read_sdmx(path))
def test_write_data_attributes(path):
    """Convert a data message with attributes="osgd" and check the result type."""
    converted = sdmx.to_pandas(sdmx.read_sdmx(path), attributes="osgd")

    # TODO incomplete
    accepted_types = (pd.Series, pd.DataFrame, list)
    assert isinstance(converted, accepted_types), type(converted)