def test_complete_insert_from_pcaxis(self):
    """Check complete cycle by inserting a pcaxis file into a table."""
    my_conn = MySQL(*self.conn_params)
    Base = declarative_base()
    current_dir = os.path.dirname(os.path.abspath(__file__))
    parsed_pcaxis = pyaxis.parse(current_dir + '/22350.px',
                                 encoding='ISO-8859-2')
    table_data = parsed_pcaxis['DATA']
    table_data = utils.parse_df_columns(table_data)
    table_data.name = 'ipc'

    class Ipc(Base):
        """Auxiliary sqlalchemy table model for the tests."""

        __tablename__ = 'ipc'

        id = Column(Integer, primary_key=True)
        comunidades_y_ciudades_autonomas = Column(String(100))
        grupos_ecoicop = Column(String(50))
        tipo_de_dato = Column(String(50))
        periodo = Column(String(50))
        data = Column(Float)

    Ipc.__table__.create(bind=my_conn.engine)
    my_conn.insert(table_data, if_exists='append')
    actual = my_conn.engine.scalar(
        select([func.count('*')]).select_from(Ipc))
    expected = len(table_data.index)
    self.assertEqual(actual, expected)
    my_conn.drop('ipc')
def _px_from_path(dir_path, encoding='ISO-8859-2', timeout=10,
                  null_values=r'^"\."$', sd_values=r'"\.\."'):
    """Massively read PC-Axis files from a directory.

    Read the files in a directory, convert each one to a dataframe and
    store them in a dict.

    Args:
        dir_path (str): directory containing the data files.
        encoding (str): file encoding of the px files.
        timeout (int): request timeout in seconds; optional.
        null_values (str): regex pattern for the null values in the
            px file. Defaults to '.'.
        sd_values (str): regex pattern for the statistical disclosure
            values in the px file. Defaults to '..'.

    Returns:
        dict: name of each px file as key and its dataframe as value.

    """
    files = {}
    for filename in os.listdir(dir_path):
        if fnmatch.fnmatch(filename, '*.px'):
            px_df = pyaxis.parse(os.path.join(dir_path, filename),
                                 encoding,
                                 timeout=timeout,
                                 null_values=null_values,
                                 sd_values=sd_values)['DATA']
            # strip the '.px' extension to build the dict key
            files[filename[:-3]] = px_df
    return files
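# Usage sketch (not from the original): read every .px file in a local
# directory and report each table's shape. './px_files' is a hypothetical
# path used only for illustration.
dataframes = _px_from_path('./px_files', encoding='ISO-8859-2')
for name, df in dataframes.items():
    print(name, df.shape)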
def _px_from_urls_in_csv(filename, sep=",", csv_encoding='windows-1252',
                         px_encoding='ISO-8859-2', timeout=10,
                         null_values=r'^"\."$', sd_values=r'"\.\."'):
    """Massively read PC-Axis files from a list of URLs in a CSV file.

    Read and convert PC-Axis files to dataframes from URIs listed in a
    CSV file.

    Args:
        filename (str): path to the CSV file that lists the URIs
            (including the file name).
        sep (str): field separator of the CSV file.
        csv_encoding (str): file encoding of the CSV file.
        px_encoding (str): file encoding of the px files.
        timeout (int): request timeout in seconds; optional.
        null_values (str): regex pattern for the null values in the
            px file. Defaults to '.'.
        sd_values (str): regex pattern for the statistical disclosure
            values in the px file. Defaults to '..'.

    Returns:
        dict: table ids (from the CSV 'id' column) as keys and
            dataframes as values.

    """
    uris = pd.read_csv(filename, sep=sep, encoding=csv_encoding)
    uris['data'] = uris.apply(
        lambda row: pyaxis.parse(row['url'],
                                 px_encoding,
                                 timeout=timeout,
                                 null_values=null_values,
                                 sd_values=sd_values)['DATA'],
        axis=1)
    # map each table id to its dataframe
    data = pd.Series(uris['data'].values, index=uris['id']).to_dict()
    return data
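# Usage sketch (not from the original): 'urls.csv' is a hypothetical file
# that must contain at least the 'id' and 'url' columns the helper reads.
tables = _px_from_urls_in_csv('urls.csv', sep=';')
for table_id, df in tables.items():
    print(table_id, len(df))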
def test_statistical_disclosure():
    """Should parse a pc-axis with statistical disclosure into a dataframe.

    Uses convenient NA or NaN representations and a metadata dict.
    """
    parsed_pcaxis = pyaxis.parse(data_path + '27067.px',
                                 encoding='ISO-8859-2')
    assert parsed_pcaxis['DATA'].dtypes['DATA'] == 'object'
    assert isnan(parsed_pcaxis['DATA']['DATA'].iloc[0])
    assert parsed_pcaxis['DATA']['DATA'].iloc[804] == ''
def test_to_json_stat():
    """Should generate a JSON-Stat object."""
    px = pyaxis.parse(data_path + '14001.px', encoding='ISO-8859-15')
    json_obj = json_stat.to_json_stat(px)
    assert json_obj['id'] == \
        ['Comunidad Autónoma de residencia del matrimonio',
         'edad de los cónyuges',
         'sexo',
         'estado civil anterior de los cónyuges',
         'Variables']
    assert json_obj['source'] == ['Instituto Nacional de Estadística']
    assert json_obj['value'][9] == '1'
    assert json_obj['value'][-1] == '1600'
def test_parse():
    """Should parse a pc-axis into a dataframe and a metadata dictionary."""
    parsed_pcaxis = pyaxis.parse(data_path + '14001.px',
                                 encoding='ISO-8859-15')
    assert parsed_pcaxis['DATA'].dtypes['DATA'] == 'object'
    assert len(parsed_pcaxis['DATA']) == 8064
    assert parsed_pcaxis['METADATA'][
        'VALUES(Comunidad Autónoma de residencia del matrimonio)'][0][0] == \
        'Total'
    assert parsed_pcaxis['METADATA'][
        'VALUES(Comunidad Autónoma de residencia del matrimonio)'][0][20] == \
        'Extranjero'
def main(args):
    """Parse a PC-Axis file from a URL and save it as a Parquet file."""
    px = pyaxis.parse(args.url, encoding='windows-1252')
    df = px['DATA']
    del px
    for k in df.keys():
        if k in ['Katsastusvuosi']:
            # the inspection year is numeric
            df[k] = df[k].astype('int64')
        elif k in ['DATA']:
            # leave the DATA column as-is
            df[k] = df[k]
        else:
            # cast everything else to category to save space
            df[k] = df[k].astype('category')
    df.to_parquet(args.file, compression='brotli')
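# Usage sketch (not from the original): wiring main() up with argparse.
# The positional argument names 'url' and 'file' match the attributes
# main() reads; everything else here is an assumption.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Convert a PC-Axis file to a Parquet file.')
    parser.add_argument('url', help='URL or path of the source .px file')
    parser.add_argument('file', help='path of the output .parquet file')
    main(parser.parse_args())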
def test_parse():
    """Should parse a pc-axis into a dataframe and a metadata dictionary."""
    parsed_pcaxis = pyaxis.parse(
        'https://www.ine.es/jaxi/files/_px/es/px/t20/e301/matri/a2000/l0/'
        '14001.px?nocab=1',
        encoding='ISO-8859-15')
    assert parsed_pcaxis['DATA'].dtypes['DATA'] == 'object'
    assert len(parsed_pcaxis['DATA']) == 8064
    assert parsed_pcaxis['METADATA'][
        'VALUES(Comunidad Autónoma de residencia de los cónyuges)'][0][0] == \
        'Total'
    assert parsed_pcaxis['METADATA'][
        'VALUES(Comunidad Autónoma de residencia de los cónyuges)'][0][20] == \
        'Extranjero'
def test_insert_selected_columns(self):
    """Check insert method only with selected columns."""
    my_conn = MySQL(*self.conn_params)
    Base = declarative_base()
    current_dir = os.path.dirname(os.path.abspath(__file__))
    parsed_pcaxis = pyaxis.parse(current_dir + '/22350.px',
                                 encoding='ISO-8859-2')
    table_data = parsed_pcaxis['DATA']
    table_data = utils.parse_df_columns(table_data)
    table_data.name = 'ipc'

    class Ipc(Base):
        """Auxiliary sqlalchemy table model for the tests."""

        __tablename__ = 'ipc'

        id = Column(Integer, primary_key=True)
        comunidades_y_ciudades_autonomas = Column(String(100))
        grupos_ecoicop = Column(String(50))
        tipo_de_dato = Column(String(50))
        periodo = Column(String(50))
        data = Column(Float)

    Ipc.__table__.create(bind=my_conn.engine)
    insert_data = table_data[['grupos_ecoicop', 'data']]
    insert_data.name = 'ipc'
    my_conn.insert(insert_data, if_exists='append',
                   columns=['grupos_ecoicop', 'data'])
    result_data = pd.read_sql_query('select * from ipc',
                                    con=my_conn.engine)
    self.assertTrue(
        result_data['comunidades_y_ciudades_autonomas'].isnull().all())
    self.assertTrue(result_data['periodo'].isnull().all())
    self.assertTrue(result_data['tipo_de_dato'].isnull().all())
    self.assertFalse(result_data['grupos_ecoicop'].isnull().all())
    self.assertFalse(result_data['data'].isnull().all())
    my_conn.drop('ipc')
from collections import defaultdict

import matplotlib.pyplot as plt
from pyaxis import pyaxis

# read table from file
px = pyaxis.parse("2222003S.PX", encoding='windows-1250')

# count the accidents per year
# dictionary = {key: year, value: number of accidents}
accidents_per_year = defaultdict(int)
for x in range(len(px["DATA"]["MERITVE"])):
    # "Prometne nesreče - SKUPAJ" = "traffic accidents - TOTAL"
    if px["DATA"]["MERITVE"][x] == "Prometne nesreče - SKUPAJ":
        accidents_per_year[px["DATA"]["LETO"][x]] += int(px["DATA"]["DATA"][x])

# lists for plotting
years = []
num_of_accidents = []

# fill the lists with data for plotting
for key in sorted(accidents_per_year.keys()):
    years.append(int(key))
    num_of_accidents.append(accidents_per_year[key])

# set the title and axis label
plt.title("Number of accidents for each year 2003 - 2014")
plt.ylabel("Number of accidents")
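# Continuation sketch (the original snippet ends at plt.ylabel); a plain
# line plot is an assumption about the intended chart.
plt.plot(years, num_of_accidents)
plt.xlabel("Year")
plt.show()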
def test_connection_error():
    """Using parse() with a wrong URL should raise an HTTPError."""
    url = 'http://www.ine.net/jaxiT3/files/t/es/px/1001.px'
    with pytest.raises(requests.exceptions.HTTPError):
        pyaxis.parse(url, encoding='windows-1252')
def test_http_error():
    """Using parse() with a non-existent URL should raise a 404 HTTPError."""
    url = 'http://www.ine.es/jaxi'
    with pytest.raises(requests.exceptions.HTTPError):
        pyaxis.parse(url, encoding='windows-1252')
#!/usr/bin/env python
# coding: utf-8

# In[1]:

#!pip install pyaxis

# In[36]:

from pyaxis import pyaxis
import pandas as pd

url = "https://www.eustat.eus/bankupx/Resources/PX/Databases/spanish/PX_3422_cet01tb.px"
px = pyaxis.parse(uri=url, encoding='ISO-8859-2')
data_df = px['DATA']
meta_dict = px['METADATA']

# In[3]:

data_df

# In[4]:

meta_dict

# In[5]:

data_df['periodo']

# In[17]:

# the original cell is cut off after the first condition; the closing
# bracket is added here so the filter is at least syntactically valid
df = data_df[(data_df['tipo de dato'] == 'Nivel')]
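# In[ ]:

# Follow-up sketch (not in the original notebook): inspect the metadata
# dict with plain dict operations.
for key in list(meta_dict)[:10]:
    print(key, '->', meta_dict[key])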
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
from pyaxis import pyaxis

# read the file
data = pyaxis.parse("H057S (1).px", encoding='windows-1250')

per_year_avto = defaultdict(int)
per_year_new = defaultdict(int)

for i in range(len(data["DATA"]["MERITVE"])):
    # "Število osebnih avtomobilov na dan 31. 12." =
    # "number of passenger cars on 31 December"
    if data["DATA"]["MERITVE"][i] == "Število osebnih avtomobilov na dan 31. 12.":
        per_year_avto[data["DATA"]["LETO"][i]] += int(data["DATA"]["DATA"][i])
    # "Število prvih registracij novih osebnih avtomobilov" =
    # "number of first registrations of new passenger cars"
    if data["DATA"]["MERITVE"][i] == "Število prvih registracij novih osebnih avtomobilov":
        per_year_new[data["DATA"]["LETO"][i]] += int(data["DATA"]["DATA"][i])

years = []
num_of_avto = []
num_of_new = []

# insert the data into lists
for key in sorted(per_year_avto.keys()):
    years.append(int(key))
    num_of_avto.append(per_year_avto[key])
for key in sorted(per_year_new.keys()):
    num_of_new.append(per_year_new[key])

plt.figure()
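# Continuation sketch (the original ends at plt.figure()); the plot style
# and the English labels are assumptions.
plt.plot(years, num_of_avto, label="Passenger cars on 31 Dec")
plt.plot(years, num_of_new, label="First registrations of new cars")
plt.xlabel("Year")
plt.legend()
plt.show()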
""" Example use case of pyaxis""" from pyaxis import pyaxis EXAMPLE_URL = 'http://www.ine.es/jaxiT3/files/t/es/px/2184.px' parsed_px = pyaxis.parse(EXAMPLE_URL, encoding='ISO-8859-2') print(parsed_px['DATA']) print(parsed_px['METADATA'])