def _process_partial_data(self, partial_requested_data):
    """
    Convert the requested CSV of one URL into the library's DataFrame layout.

    This function is always executed after managing the response of each
    URL of this Data Source.

    Parameters
    ----------
    partial_requested_data : csv
        the requested CSV of one URL; it is used here as a pandas
        DataFrame with (at least) the columns 'cod_ine_ambito',
        'nombre_sexo', 'nombre_gedad', 'fecha_defuncion' and every name
        in ``self.data_items``.

    Returns
    -------
    pd.DataFrame
        a DataFrame with daily [Date] as row indexer and
        [Region, Data Item] as column multiindexer.
    """
    region_codes_ine = Regions._get_property(
        self.regions, self.__class__.REGION_REPRESENTATION)
    codesine_regions_dict = dict(zip(region_codes_ine, self.regions))

    # Keep only the requested regions and the rows aggregated over every
    # sex and age group ('todos').
    wanted_rows = (
        partial_requested_data['cod_ine_ambito'].isin(region_codes_ine)
        & (partial_requested_data['nombre_sexo'] == 'todos')
        & (partial_requested_data['nombre_gedad'] == 'todos'))
    # Explicit copy: the columns are modified below, and writing into a
    # filtered slice of the caller's frame is ambiguous chained assignment.
    df = partial_requested_data[wanted_rows].copy()

    # Normalise the code to a zero-padded two character string and then
    # translate it into the region name used by this instance.
    df['cod_ine_ambito'] = (
        df['cod_ine_ambito']
        .astype(int)
        .astype(str)
        .str.zfill(2)
        .replace(codesine_regions_dict))
    df = df.rename(columns={'cod_ine_ambito': 'Region'})
    df = df.drop(
        ['ambito', 'cod_ambito', 'nombre_ambito', 'cod_sexo', 'nombre_sexo',
         'cod_gedad', 'nombre_gedad'],
        axis='columns',
        errors='ignore')

    df = df[['Region', 'fecha_defuncion'] + self.data_items]
    df = df.set_index(['Region'])

    # One column per (Region, Item); pivot_table yields (Item, Region), so
    # the two column levels are swapped afterwards.
    df = df.pivot_table(index='fecha_defuncion',
                        columns='Region').swaplevel(i=0, j=1, axis='columns')
    df.columns.rename("Item", level=1, inplace=True)
    df = df.set_index(pd.to_datetime(df.index, format="%Y-%m-%d"))
    return df
def __get_stations_by_regions(self):
    """
    Gets the aemet stations per region.

    Returns
    -------
    dict {str : str}
        a dictionary with instance regions as keys, and a string with the
        aemet stations separated by commas as values. A region without
        stations maps to an empty string.

    Notes
    -----
    * It is used for completing 'idema' argument in the AEMET API function
      (https://opendata.aemet.es/dist/index.html?#!/valores-climatologicos/Climatolog%C3%ADas_diarias)
    """
    aemet_stations = Regions._get_property(
        self.regions, self.__class__.REGION_REPRESENTATION)

    # The n-th group of stations belongs to the n-th region of the instance.
    return {
        self.regions[pos_region]: ",".join(stations)
        for pos_region, stations in enumerate(aemet_stations)
    }
def __get_regions_by_stations(self):
    """
    Gets the region per aemet station.

    Returns
    -------
    dict {str : str}
        a dictionary with aemet stations as keys, and instance regions as
        values. If a station is listed for several regions, the last
        region wins.
    """
    stations_per_region = Regions._get_property(
        self.regions, self.__class__.REGION_REPRESENTATION)

    # The n-th group of stations belongs to the n-th region of the instance.
    return {
        station: self.regions[index]
        for index, station_group in enumerate(stations_per_region)
        for station in station_group
    }
def _process_partial_data(self, partial_requested_data):
    '''
    This function is always executed after managing the response of each
    URL of this Data Source. The data should be processed from the external
    structure to the form of a DataFrame.

    Parameters
    ----------
    partial_requested_data : json
        it is the requested JSON of one URL.

    Returns
    -------
    pd.DataFrame
        a DataFrame with [Region] as row indexer and [Data Item] as column
        indexer.

    Notes
    -----
    NOTE(review): the portion of this method visible here only handles the
    'DATOS_TABLA' case and ends without a ``return`` statement; the rest of
    the body appears to be missing - confirm against the full file.
    '''
    # The data item being processed is selected by position: the number of
    # already processed URLs indexes self.data_items.
    if self.__class__.DATA_ITEMS_INFO[self.data_items[
            self.processed_urls]]['funcion'] == 'DATOS_TABLA':
        # One row per element of each record's 'Data' list, carrying the
        # record's 'Nombre' field along as a column.
        df = pd.json_normalize(partial_requested_data, 'Data', ['Nombre'])
        # Replace every region representation found inside 'Nombre' with
        # the corresponding instance region name (representations are
        # applied as regular expressions).
        df.Nombre = df.Nombre.replace(
            dict(
                zip(
                    Regions._get_property(
                        self.regions,
                        self.__class__.REGION_REPRESENTATION),
                    self.regions)),
            regex=True
        )  # some regions contain commas; this is fixed through the configuration file

        # The JSON field 'Nombre' is split into the Region, SubItem and Item columns.
        # SubItem is a subdivision of the sought Item.
        df[['Region', 'SubItem',
            'Item']] = df['Nombre'].str.split(", ", n=2, expand=True)

        # If the word 'sexo' is found in Region, the Region and SubItem
        # columns should be interchanged.
        if any(df.Region.str.contains(pat='sexo', case=False, regex=True)):
            df[['Region', 'SubItem']] = df[['SubItem', 'Region']]
def _process_partial_data(self, partial_requested_data):
    """
    Convert the requested CSV of one URL into the library's DataFrame layout.

    This function is always executed after managing the response of each
    URL of this Data Source. Three kinds of CSV are told apart by their
    columns: the vaccine dataset (has 'date_pub'), the autonomous
    communities dataset (has 'ccaa') and the provinces dataset (has
    'province'). For the autonomous communities dataset an extra 'España'
    region is derived by adding up every region present in the data.

    Parameters
    ----------
    partial_requested_data : csv
        the requested CSV of one URL; it is used here as a pandas
        DataFrame.

    Returns
    -------
    pd.DataFrame
        a DataFrame with daily [Date] as row indexer and
        [Region, Data Item] as column multiindexer.

    Notes
    -----
    A missing data item while deriving 'España' (KeyError) is reported with
    ``print`` and the remaining 'España' items are skipped; the DataFrame
    built so far is still returned.
    """
    region_representation_dict = Regions._get_property(
        self.regions, self.__class__.REGION_REPRESENTATION)
    region_population_dict = Regions.get_regions_population()

    # Representation (INE code) -> region name, split by kind of region.
    # NOTE(review): `code_ine == 0` compares against the integer 0; confirm
    # the type of the configured representation.
    representation_ccaa_dict = {}
    representation_provinces_dict = {}
    for i, r in enumerate(self.regions):
        code_ine = region_representation_dict[i]
        if 'CA' in r or code_ine == 0:
            representation_ccaa_dict[code_ine] = r
        else:
            representation_provinces_dict[code_ine] = r

    # Adaptation to vaccines dataset
    representation_ccaa_vac_dict = {
        'Totales': 'España',
        'Andalucía': 'CA Andalucía',
        'Aragón': 'CA Aragón',
        'Asturias': 'CA Principado de Asturias',
        'Baleares': 'CA Islas Baleares',
        'Canarias': 'CA Canarias',
        'Cantabria': 'CA Cantabria',
        'Castilla y Leon': 'CA Castilla y León',
        'Castilla La Mancha': 'CA Castilla-La Mancha',
        'Cataluña': 'CA Cataluña',
        'C. Valenciana': 'CA Comunidad Valenciana',
        'Extremadura': 'CA Extremadura',
        'Galicia': 'CA Galicia',
        'Madrid': 'CA Comunidad de Madrid',
        'Murcia': 'CA Región de Murcia',
        'Navarra': 'CA Comunidad Foral de Navarra',
        'País Vasco': 'CA País Vasco',
        'La Rioja': 'CA La Rioja',
        'Ceuta': 'CA Ceuta',
        'Melilla': 'CA Melilla',
    }

    # Fix esCOVID19data error
    if "intensive_care_per_1000000" in partial_requested_data.columns:
        partial_requested_data = partial_requested_data.rename(
            columns={
                'intensive_care_per_1000000': 'intensive_care_per_100000'
            })

    is_vaccine = "date_pub" in partial_requested_data.columns
    has_ccaa = "ccaa" in partial_requested_data.columns
    has_province = "province" in partial_requested_data.columns

    if is_vaccine:
        df = partial_requested_data.rename(
            columns={
                "ccaa": "Region",
                'date_pub': 'date',
                'Dosis entregadas': 'vaccine_provided',
                'Dosis administradas': 'vaccine_supplied',
                '% sobre entregadas': 'vaccine_supplied_inc'
            })
        # Explicit copy: columns are assigned right below, and writing into
        # a filtered slice is ambiguous chained assignment.
        df = df[~df.Region.str.contains('Fuerzas Armadas')].copy()
        df['Region'] = df['Region'].map(representation_ccaa_vac_dict)
        df['date'] = pd.to_datetime(df['date'], dayfirst=True)
    else:
        df = partial_requested_data.rename(columns={"ine_code": "Region"})
        df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

    # zfill is used to change numbers 1, 2, 3... to padded strings
    # "01", "02"... (INE code)
    df.set_index(df.Region.astype(str).str.zfill(2),
                 inplace=True,
                 drop=True)

    df = df.sort_values(['date'])
    df = df.drop([
        'Region', 'CCAA', 'ccaa', 'province', 'source_name', 'source',
        'comments'
    ],
                 axis='columns',
                 errors='ignore')

    # Adaptation of dataitems: translate INE codes in the index into names
    if has_province:
        df.rename(index=representation_provinces_dict, inplace=True)
    elif has_ccaa and not is_vaccine:
        df.rename(index=representation_ccaa_dict, inplace=True)

    # One column per (Region, Item); pivot_table yields (Item, Region), so
    # the two column levels are swapped afterwards.
    df = df.pivot_table(index='date',
                        columns='Region').swaplevel(i=0, j=1, axis='columns')
    df.columns.rename("Item", level=1, inplace=True)
    df.set_index(pd.to_datetime(df.index, format="%Y-%m-%d"), inplace=True)

    if is_vaccine:
        for i in df.columns.levels[0]:
            df[i, 'pob_vaccine_supplied_inc'] = (
                (df[i, 'vaccine_supplied'] * 100) /
                region_population_dict[i]).round(2)
            df[i, 'vaccine_supplied_inc'] = df[i,
                                               'vaccine_supplied_inc'] * 100

    if (has_ccaa or has_province) and not is_vaccine:
        for i in df.columns.levels[0]:
            df[i, 'accumulated_lethality'] = (
                df[i, 'deceased'] / df[i, 'cases_accumulated']).round(5)
            df[i, 'daily_deaths_inc'] = df[i, 'daily_deaths_inc'] * 100

    if has_ccaa and not is_vaccine:

        def trailing_sum(param, days, skip_current=False):
            # Per date, add up España's `param` over the `days` most recent
            # rows (ending at the row itself, or at the previous row when
            # `skip_current`). Fewer rows are used at the start of the
            # series. Values are added newest first.
            values = df['España', param].to_numpy()
            sums = []
            for pos in range(len(values)):
                newest = pos - 1 if skip_current else pos
                oldest = max(newest - days + 1, 0)
                total = 0
                for value in values[oldest:newest + 1][::-1]:
                    total += value
                sums.append(total)
            return pd.Series(sums, index=df.index, dtype=float)

        # Adaptation of Spain region
        try:
            # Sum of every data item across regions. `sum(level=...)` was
            # removed in pandas 2.0, hence the transpose/groupby form.
            sum_dataitems = df.T.groupby(level=1).sum().T

            # Items copied as plain sums; the last two are time windows
            # ('daily_deaths_avg7' is recomputed as an average below).
            for item in ('num_casos', 'num_casos_prueba_pcr',
                         'num_casos_prueba_test_ac', 'num_casos_prueba_ag',
                         'num_casos_prueba_elisa',
                         'num_casos_prueba_desconocida', 'daily_deaths',
                         'cases_accumulated', 'cases_accumulated_PCR',
                         'hospitalized', 'intensive_care', 'deceased',
                         'recovered', 'daily_deaths_avg7', 'cases_14days'):
                df['España', item] = sum_dataitems[item]

            # Averages: the window sum is always divided by the nominal
            # window length, also for the first (incomplete) windows.
            for param, out, days in (
                    ('num_casos', 'daily_cases_avg7', 7),
                    ('num_casos_prueba_pcr', 'num_casos_prueba_pcr_avg7', 7),
                    ('daily_deaths', 'daily_deaths_avg7', 7),
                    ('daily_deaths', 'daily_deaths_avg3', 3)):
                df['España', out] = trailing_sum(param, days) / days

            # Cumulative incidence per 100000 inhabitants over 14 rows
            spain_population = region_population_dict['España']
            df['España', 'ia14'] = (
                (trailing_sum('num_casos', 14) * 100000) /
                spain_population).round(2)

            # Lethality
            df['España', 'accumulated_lethality'] = (
                df['España', 'deceased'] /
                df['España', 'cases_accumulated']).round(2)

            # Per 100000 inhabitants
            df['España', 'cases_per_cienmil'] = (
                (df['España', 'cases_accumulated'] * 100000) /
                spain_population).round(2)
            df['España', 'intensive_care_per_100000'] = (
                (df['España', 'intensive_care'] * 100000) /
                spain_population).round(2)
            df['España', 'hospitalized_per_100000'] = (
                (df['España', 'hospitalized'] * 100000) /
                spain_population).round(2)
            df['España', 'deceassed_per_100000'] = (
                (df['España', 'deceased'] * 100000) /
                spain_population).round(2)

            # Percentage of each day's deaths over the previous row's
            # deaths. The first row has no previous row, so the division is
            # by zero and yields inf/NaN.
            df['España', 'daily_deaths_inc'] = (
                (df['España', 'daily_deaths'] * 100) /
                trailing_sum('daily_deaths', 1, skip_current=True)).round(2)

        except KeyError as e:
            print("Spain dataitems ERROR: ", e)

    return df