Ejemplo n.º 1
0
    def test_complete_insert_from_pcaxis(self):
        """Insert a parsed pc-axis file into MySQL and verify the row count."""
        conn = MySQL(*self.conn_params)
        OrmBase = declarative_base()

        # Parse the px fixture that lives next to this test module.
        here = os.path.dirname(os.path.abspath(__file__))
        parsed = pyaxis.parse(here + '/22350.px', encoding='ISO-8859-2')
        df = utils.parse_df_columns(parsed['DATA'])
        df.name = 'ipc'

        class Ipc(OrmBase):
            """Auxiliary sqlalchemy table model for the tests."""

            __tablename__ = 'ipc'

            id = Column(Integer, primary_key=True)
            comunidades_y_ciudades_autonomas = Column(String(100))
            grupos_ecoicop = Column(String(50))
            tipo_de_dato = Column(String(50))
            periodo = Column(String(50))
            data = Column(Float)

        Ipc.__table__.create(bind=conn.engine)
        conn.insert(df, if_exists='append')

        # Every dataframe row must have landed in the table.
        row_count = conn.engine.scalar(
            select([func.count('*')]).select_from(Ipc))
        self.assertEqual(row_count, len(df.index))
        conn.drop('ipc')
Ejemplo n.º 2
0
def _px_from_path(dir_path, encoding='ISO-8859-2', timeout=10,
                  null_values=r'^"\."$', sd_values=r'"\.\."'):
    """Massively read PC-Axis files from a directory.

    Read every ``*.px`` file in *dir_path*, convert it to a dataframe and
    store the result in a dict keyed by the file name without its ``.px``
    extension.

    Args:
        dir_path (str): directory containing data files.
        encoding (str): file encoding for both the px file.
        timeout (int): request timeout in seconds; optional
        null_values(str): regex with the pattern for the null values in the px
                          file. Defaults to '.'.
        sd_values(str): regex with the pattern for the statistical disclosured
                        values in the px file. Defaults to '..'.

    Returns:
        dict: Name of px file as KEY and dataframe as VALUE.

    """
    files = {}
    # Build absolute paths with os.path.join instead of os.chdir():
    # the original changed the process-wide working directory and never
    # restored it, leaking that side effect to the caller. The ExitStack
    # was also removed — nothing was ever registered on it, so
    # pop_all().close() was a no-op.
    for filename in os.listdir(dir_path):
        if fnmatch.fnmatch(filename, '*.px'):
            px_df = pyaxis.parse(os.path.join(dir_path, filename),
                                 encoding,
                                 timeout=timeout,
                                 null_values=null_values,
                                 sd_values=sd_values)['DATA']
            # Strip the trailing '.px' to form the dict key.
            files[filename[:-3]] = px_df
    return files
Ejemplo n.º 3
0
def _px_from_urls_in_csv(filename, sep=",", csv_encoding='windows-1252',
                         px_encoding='ISO-8859-2', timeout=10,
                         null_values=r'^"\."$', sd_values=r'"\.\."'):
    """Massively read PC-Axis files from a list of URLs in a CSV file.

    Read and convert PC-Axis files to dataframes from URIs listed in a CSV
    file. The CSV is expected to contain at least an ``id`` column (used as
    dict key) and a ``url`` column (the px file location).

    Args:
        filename (str): CSV FILE with uris file path (including file name).
        sep (str): field separator for the CSV files with the URLs.
        csv_encoding (str): file encoding for the CSV file.
        px_encoding (str): file encoding for the px file.
        timeout (int): request timeout in seconds; optional
        null_values(str): regex with the pattern for the null values in the px
                          file. Defaults to '.'.
        sd_values(str): regex with the pattern for the statistical disclosured
                        values in the px file. Defaults to '..'.
    Returns:
        dict: file names as keys and dataframes as values.

    """
    uris = pd.read_csv(filename,
                       sep=sep,
                       encoding=csv_encoding)
    # Fetch and parse each px URL row by row; only the DATA frame is kept.
    uris['data'] = uris.apply(lambda row: pyaxis.parse(
        row['url'], px_encoding, timeout=timeout, null_values=null_values,
        sd_values=sd_values)['DATA'], axis=1)
    # (Removed the dead `data = {}` initialisation the original overwrote
    # immediately.)
    return pd.Series(uris['data'].values, index=uris['id']).to_dict()
Ejemplo n.º 4
0
def test_statistical_disclosure():
    """Should parse a pc-axis with statistical disclosure into a dataframe.

    Uses convenient Na or NaN representations and a metadata dict.
    """
    parsed = pyaxis.parse(data_path + '27067.px', encoding='ISO-8859-2')
    data_column = parsed['DATA']['DATA']
    assert parsed['DATA'].dtypes['DATA'] == 'object'
    # Null value maps to NaN; statistical-disclosure value maps to ''.
    assert isnan(data_column.iloc[0])
    assert data_column.iloc[804] == ''
Ejemplo n.º 5
0
def test_to_json_stat():
    """Should generate a JSON-Stat object."""
    parsed = pyaxis.parse(data_path + '14001.px', encoding='ISO-8859-15')
    result = json_stat.to_json_stat(parsed)
    expected_dims = [
        'Comunidad Autónoma de residencia del matrimonio',
        'edad de los cónyuges', 'sexo',
        'estado civil anterior de los cónyuges', 'Variables']
    assert result['id'] == expected_dims
    assert result['source'] == ['Instituto Nacional de Estadística']
    values = result['value']
    assert values[9] == '1'
    assert values[len(values) - 1] == '1600'
Ejemplo n.º 6
0
def test_parse():
    """Should parse a pc-axis into a dataframe and a metadata dictionary."""
    parsed_pcaxis = pyaxis.parse(data_path + '14001.px',
                                 encoding='ISO-8859-15')
    assert parsed_pcaxis['DATA'].dtypes['DATA'] == 'object'
    assert len(parsed_pcaxis['DATA']) == 8064
    # Bug fix: the original put each subscript/comparison on its own line,
    # turning it into a separate expression statement whose result was
    # silently discarded — the two checks below were never asserted.
    # NOTE(review): the [0][0]/[0][20] index chains are preserved verbatim
    # from the original; confirm them against the 14001.px fixture.
    metadata = parsed_pcaxis['METADATA']
    assert metadata[
        'VALUES(Comunidad Autónoma de residencia del matrimonio)'
    ][0][0] == 'Total'
    assert metadata[
        'VALUES(Comunidad Autónoma de residencia del matrimonio)'
    ][0][20] == 'Extranjero'
Ejemplo n.º 7
0
def main(args):
    """Convert a px file fetched from ``args.url`` into a parquet file.

    Args:
        args: namespace with ``url`` (px source location) and ``file``
            (parquet destination path) attributes.
    """
    px = pyaxis.parse(args.url, encoding='windows-1252')
    df = px['DATA']
    del px  # only the dataframe is needed; drop the metadata early
    for column in df.keys():
        if column == 'Katsastusvuosi':
            # The year column is numeric.
            df[column] = df[column].astype('int64')
        elif column == 'DATA':
            # Leave the value column untouched (the original performed a
            # redundant self-assignment here).
            continue
        else:
            # All remaining dimension columns become categoricals.
            df[column] = df[column].astype('category')
    df.to_parquet(args.file, compression='brotli')
Ejemplo n.º 8
0
def test_parse():
    """Should parse a pc-axis into a dataframe and a metadata dictionary."""
    parsed_pcaxis = pyaxis.parse(
        'https://www.ine.es/jaxi/files/_px/es/px/t20/e301/matri/a2000/l0/'
        '14001.px?nocab=1',
        encoding='ISO-8859-15')
    assert parsed_pcaxis['DATA'].dtypes['DATA'] == 'object'
    assert len(parsed_pcaxis['DATA']) == 8064
    # Bug fix: the original split each subscript/comparison onto its own
    # line, making it a separate expression statement whose result was
    # silently discarded — the two checks below were never asserted.
    # NOTE(review): the [0][0]/[0][20] index chains are preserved verbatim
    # from the original; confirm them against the 14001.px data.
    metadata = parsed_pcaxis['METADATA']
    assert metadata[
        'VALUES(Comunidad Autónoma de residencia de los cónyuges)'
    ][0][0] == 'Total'
    assert metadata[
        'VALUES(Comunidad Autónoma de residencia de los cónyuges)'
    ][0][20] == 'Extranjero'
Ejemplo n.º 9
0
    def test_insert_selected_columns(self):
        """Check insert method only with selected columns."""
        conn = MySQL(*self.conn_params)
        OrmBase = declarative_base()

        here = os.path.dirname(os.path.abspath(__file__))
        parsed = pyaxis.parse(here + '/22350.px', encoding='ISO-8859-2')
        full_data = utils.parse_df_columns(parsed['DATA'])
        full_data.name = 'ipc'

        class Ipc(OrmBase):
            """Auxiliary sqlalchemy table model for the tests."""

            __tablename__ = 'ipc'

            id = Column(Integer, primary_key=True)
            comunidades_y_ciudades_autonomas = Column(String(100))
            grupos_ecoicop = Column(String(50))
            tipo_de_dato = Column(String(50))
            periodo = Column(String(50))
            data = Column(Float)

        Ipc.__table__.create(bind=conn.engine)

        # Insert only two of the five data columns.
        subset = full_data[['grupos_ecoicop', 'data']]
        subset.name = 'ipc'
        conn.insert(subset, if_exists='append',
                    columns=['grupos_ecoicop', 'data'])

        result = pd.read_sql_query('select * from ipc', con=conn.engine)
        # Columns that were not inserted must be NULL; the two inserted
        # ones must contain data.
        for empty_col in ('comunidades_y_ciudades_autonomas', 'periodo',
                          'tipo_de_dato'):
            self.assertTrue(result[empty_col].isnull().all())
        for filled_col in ('grupos_ecoicop', 'data'):
            self.assertFalse(result[filled_col].isnull().all())
        conn.drop('ipc')
Ejemplo n.º 10
0
from pyaxis import pyaxis
from collections import defaultdict

import matplotlib.pyplot as plt

# read table from file
px = pyaxis.parse("2222003S.PX", encoding='windows-1250')


# save all the accidents per year
# dictionary = {key: year, value: number of accidents}
accidents_per_year = defaultdict(int)

table = px["DATA"]
for idx in range(len(table["MERITVE"])):
    if table["MERITVE"][idx] == "Prometne nesreče - SKUPAJ":
        accidents_per_year[table["LETO"][idx]] += int(table["DATA"][idx])


# lists for plotting
years = []
num_of_accidents = []

# fill lists with data for plotting, ordered by year
for year in sorted(accidents_per_year):
    years.append(int(year))
    num_of_accidents.append(accidents_per_year[year])

# name the axis
plt.title("Number of accidents for each year 2003 - 2014")
plt.ylabel("Number of accidents")
Ejemplo n.º 11
0
def test_connection_error():
    """Using parse() with a wrong URL should return a 404."""
    with pytest.raises(requests.exceptions.HTTPError):
        pyaxis.parse('http://www.ine.net/jaxiT3/files/t/es/px/1001.px',
                     encoding='windows-1252')
Ejemplo n.º 12
0
def test_http_error():
    """Using parse() with a non-existent URL should return a 404."""
    nonexistent_url = 'http://www.ine.es/jaxi'
    with pytest.raises(requests.exceptions.HTTPError):
        pyaxis.parse(nonexistent_url, encoding='windows-1252')
#!/usr/bin/env python
# coding: utf-8

# In[1]:

#!pip install pyaxis

# In[36]:

from pyaxis import pyaxis
import pandas as pd

# Download and parse the Eustat px cube into data + metadata.
source_url = "https://www.eustat.eus/bankupx/Resources/PX/Databases/spanish/PX_3422_cet01tb.px"
parsed = pyaxis.parse(uri=source_url, encoding='ISO-8859-2')
data_df = parsed['DATA']
meta_dict = parsed['METADATA']

# In[3]:

data_df

# In[4]:

meta_dict

# In[5]:

data_df['periodo']

# In[17]:

df = data_df[(data_df['tipo de dato'] == 'Nivel')
Ejemplo n.º 14
0
import numpy as np
import matplotlib.pyplot as plt
from pyaxis import pyaxis
from collections import defaultdict

# Read the file
data = pyaxis.parse("H057S (1).px", encoding='windows-1250')
per_year_avto = defaultdict(int)
per_year_new = defaultdict(int)

# Accumulate per-year totals for the two measures of interest.
table = data["DATA"]
for idx in range(len(table["MERITVE"])):
    measure = table["MERITVE"][idx]
    if measure == "Število osebnih avtomobilov na dan 31. 12.":
        per_year_avto[table["LETO"][idx]] += int(table["DATA"][idx])
    if measure == "Število prvih registracij novih osebnih avtomobilov":
        per_year_new[table["LETO"][idx]] += int(table["DATA"][idx])

years = []
num_of_avto = []
num_of_new = []

# Insert data into lists, ordered by year
for year in sorted(per_year_avto):
    years.append(int(year))
    num_of_avto.append(per_year_avto[year])

for year in sorted(per_year_new):
    num_of_new.append(per_year_new[year])

plt.figure()
Ejemplo n.º 15
0
"""Example use case of pyaxis."""
from pyaxis import pyaxis

EXAMPLE_URL = 'http://www.ine.es/jaxiT3/files/t/es/px/2184.px'

# Parse the remote px file and show both halves of the result.
result = pyaxis.parse(EXAMPLE_URL, encoding='ISO-8859-2')
print(result['DATA'])
print(result['METADATA'])