def test_xlsx(self): """Should massively read XLSX files from a directory. Should read XLSX files in a directory and generate a dict with file and sheet names as keys and dataframes with the data as values. """ dir_path = self.base_path + '/excel/' data = extractor.xlsx(dir_path, sep=None, encoding=None) self.assertEqual(len(data['prueba_excel.xlsx']), 4) self.assertEqual(len(data['excel_prueba.xlsx']), 4) self.assertEqual(type(data['prueba_excel.xlsx']['Hoja1']), pd.core.frame.DataFrame) self.assertEqual(type(data['prueba_excel.xlsx']['Hoja3']), pd.core.frame.DataFrame) self.assertEqual(type(data['excel_prueba.xlsx']['Hoja1']), pd.core.frame.DataFrame) self.assertEqual(type(data['excel_prueba.xlsx']['Hoja2']), pd.core.frame.DataFrame)
f.truncate() # re-write file and close f.write(text) f.close() def round_avoiding_errors(value, decimals): try: _ = round(value, decimals) return _ except: return value # Read input files data = xlsx(cfg.path.input) # Value and trend files for each indicator for key in cfg.series: # Drop NA rows, if any data[cfg.file][cfg.series[key].sheet].dropna(axis=0, how='all', inplace=True) # Rename variables if key not in ['empresas_afectadas_erte', 'afiliados_afectados_erte']: if cfg.series[key].trend_vars == []: data[cfg.file][cfg.series[key].sheet].rename(columns={ cfg.series[key].value_vars[0]:
from etlstat.extractor import extractor from cfg import cfg from pyjstat import pyjstat import pandas as pd cases = extractor.xlsx(cfg.input.path) municipios = extractor.csv(cfg.input.path, sep=',') cases_df = cases['cruce qlik.xlsx']['cruce_qlik'] municipios_df = municipios['municipios.csv'] cases_df.fillna(value=cfg.fillna, inplace=True) cases_df.replace(to_replace=cfg.replace.codpostal.old, value=cfg.replace.codpostal.new, inplace=True) cases_df['codpostal'] = cases_df['codpostal'].astype(int) cases_df['codpostal'] = cases_df['codpostal'].astype(object) # TODO: extract constants to config - maybe a dict cases_df['localidad'].replace('SANTANDER - P', 'SANTANDER (SANTANDER)', inplace=True) cases_df['localidad'].replace('ENTRAMBASAGUAS - P', 'ENTRAMBASAGUAS (ENTRAMBASAGUAS)', inplace=True) cases_df['localidad'].replace('SANTIURDE DE TORANZO - P', 'SANTIURDE DE TORANZO (SANTIURDE DE TORANZO)', inplace=True) cases_df['localidad'].replace('BARCENA DE CICERO - P', 'BARCENA DE CICERO (BARCENA DE CICERO)', inplace=True) cases_df['localidad'].replace('LAREDO - P', 'LAREDO (LAREDO)', inplace=True)