def extract_and_store():
    """
    Load every pickled RENAVE report, extract the clinic description and the transmission indicators from
    each of them, and store both lists of documents in the extracted data database.

    A failure while extracting one kind of data from a report is printed and does not stop the processing
    of the remaining data.
    """
    storage = MongoDatabase(MongoDatabase.extracted_db_name)
    reports_folder = PDFRenaveTaskGroup.processed_reports_directory

    # First unpickle all the processed reports, then extract the data from them
    loaded_reports = []
    for file_name in os.listdir(reports_folder):
        with open(reports_folder + '/' + file_name, 'rb') as report_file:
            loaded_reports.append(pickle.load(report_file))

    clinic_documents = []
    indicators_documents = []

    # (report method to call, list where its documents are accumulated, message printed on failure)
    extraction_plan = (
        ('get_clinic_description', clinic_documents,
         "Error trying to extract the clinic description from RENAVE report %i"),
        ('get_transmission_indicators', indicators_documents,
         "Error trying to extract the transmission indicators from RENAVE report %i"),
    )

    for current_report in loaded_reports:
        for method_name, sink, error_message in extraction_plan:
            try:
                extracted = getattr(current_report, method_name)()
                if extracted:
                    sink.extend(extracted)
            except Exception:
                print(error_message % current_report.index)

    storage.store_data('clinic_description', clinic_documents)
    storage.store_data('transmission_indicators', indicators_documents)
class VaccinationData:
    """Vaccination campaign progress in Spain"""

    def __init__(self):
        """Open the database connections and load the general vaccination dataset"""
        # Connection to the extracted data database for reading, and to the analyzed data for writing
        self.db_read = MongoDatabase(MongoDatabase.extracted_db_name)
        self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name)
        self.df_vaccination_general = self.db_read.read_data('vaccination_general')

    def __calculate_vaccinated_percentage__(self):
        """
        Add the `percentage_fully_vaccinated` and `percentage_at_least_single_dose` columns, computed against
        the total population of each Autonomous Region.
        """
        population_df = self.db_read.read_data('population_ar', {'age_range': 'total'},
                                               ['autonomous_region', 'total'])
        df_vaccination_join = pd.merge(self.df_vaccination_general, population_df, on='autonomous_region')
        df_vaccination_join['percentage_fully_vaccinated'] = \
            100 * df_vaccination_join['number_fully_vaccinated_people'] / df_vaccination_join['total']
        df_vaccination_join['percentage_at_least_single_dose'] = \
            100 * df_vaccination_join['number_at_least_single_dose_people'] / df_vaccination_join['total']

        # The population was only needed to compute the percentages
        df_vaccination_join = df_vaccination_join.drop(columns=['total'])
        self.df_vaccination_general = df_vaccination_join.replace({np.nan: None})

    def __calculate_vaccination_deltas__(self):
        """
        Add the `new_vaccinations` column (difference of fully vaccinated people with the previous record of
        the same Autonomous Region) and its 7 days moving average (`new_vaccinations_ma_7d`).
        """
        # The time-based rolling window needs the date as index and numeric NaN instead of None
        df = self.df_vaccination_general.sort_values(['date', 'autonomous_region']).replace({None: np.nan})\
            .set_index('date')
        df['new_vaccinations'] = df.groupby(['autonomous_region'])['number_fully_vaccinated_people'].diff()
        new_vaccinations_ma = df.groupby('autonomous_region')['new_vaccinations'].rolling('7D').mean()

        # Both sides have a `new_vaccinations` column, so the merge suffixes them with _x (raw) and _y (average)
        self.df_vaccination_general = pd.merge(df, new_vaccinations_ma, on=['autonomous_region', 'date'])\
            .rename(columns={'new_vaccinations_x': 'new_vaccinations',
                             'new_vaccinations_y': 'new_vaccinations_ma_7d'})\
            .reset_index()\
            .replace({np.nan: None})

    def __move_ages_data__(self):
        """
        Copy the vaccination by age collections from the extracted to the analyzed database, replacing
        whatever the analyzed collections contained.

        The source documents are read before the target collection is emptied, and the insertion is skipped
        when there is nothing to copy, since a bulk insert without documents is rejected by the driver.
        """
        vaccination_collections_names = ['vaccination_ages_single', 'vaccination_ages_complete']

        for collection_name in vaccination_collections_names:
            documents = list(self.db_read.db.get_collection(collection_name).find({}))
            vaccination_collection = self.db_write.db.get_collection(collection_name)
            vaccination_collection.delete_many({})
            if documents:
                vaccination_collection.insert_many(documents)

    def move_data(self):
        """Calculate the vaccination percentages and deltas, store them, and move the ages data"""
        self.__calculate_vaccinated_percentage__()
        self.__calculate_vaccination_deltas__()
        self.db_write.store_data('vaccination_general', self.df_vaccination_general.to_dict('records'))
        self.__move_ages_data__()
class HospitalsPressure:
    """Hospitals pressure in Spain"""

    def __init__(self):
        """Open the database connections and read the hospitals pressure dataset"""
        # The extracted data database is the source, the analyzed data database is the destination
        self.db_read = MongoDatabase(MongoDatabase.extracted_db_name)
        self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name)

        # Only these fields of the hospitals pressure collection are needed
        wanted_fields = [
            'autonomous_region', 'date', 'hospitalized_patients', 'beds_percentage', 'ic_patients',
            'ic_beds_percentage'
        ]
        self.hospitals_pressure = self.db_read.read_data('hospitals_pressure', projection=wanted_fields)

    def __aggregate_data__(self):
        """
        Append one 'España' row per date: the patients are summed and the beds percentages are averaged
        over all the rows of that date.
        """
        by_date = self.hospitals_pressure.groupby('date')
        patients_sum = by_date[['hospitalized_patients', 'ic_patients']].sum()
        percentages_mean = by_date[['beds_percentage', 'ic_beds_percentage']].mean()

        country_df = pd.merge(patients_sum, percentages_mean, on='date').reset_index()
        country_df = country_df.assign(autonomous_region='España')

        with_country = pd.concat([self.hospitals_pressure, country_df])
        self.hospitals_pressure = with_country.sort_values(by=['date', 'autonomous_region'])

    def __calculate_ma__(self):
        """Add the 14 days moving average of both beds percentages, computed for each Autonomous Region"""
        percentage_columns = ['beds_percentage', 'ic_beds_percentage']

        # The time-based rolling window works on the date index
        indexed_df = self.hospitals_pressure.set_index('date')
        moving_averages = indexed_df.groupby('autonomous_region')[percentage_columns].rolling('14D').mean()

        # The merge suffixes the original columns with _x and the averaged ones with _y
        merged_df = pd.merge(indexed_df, moving_averages, on=['autonomous_region', 'date'])
        renamed_df = merged_df.rename(columns={
            'beds_percentage_x': 'beds_percentage',
            'beds_percentage_y': 'beds_percentage_ma_14d',
            'ic_beds_percentage_x': 'ic_beds_percentage',
            'ic_beds_percentage_y': 'ic_beds_percentage_ma_14d'
        })
        self.hospitals_pressure = renamed_df.reset_index().replace({np.nan: None})

    def transform_and_store(self):
        """Aggregate the country data, calculate the moving averages, and store the result"""
        self.__aggregate_data__()
        self.__calculate_ma__()
        self.__store_data__()

    def __store_data__(self):
        """Store the hospitals pressure records in the analyzed data database"""
        self.db_write.store_data('hospitals_pressure', self.hospitals_pressure.to_dict('records'))
class TransmissionIndicators:
    """
    Transmission indicators in Spain: cases with unknown contact, identified contacts per case and asymptomatic
    cases percentage.
    """

    def __init__(self):
        """Open the database connections and load the transmission indicators dataset"""
        # Connection to the extracted data database for reading, and to the analyzed data for writing
        self.db_read = MongoDatabase(MongoDatabase.extracted_db_name)
        self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name)

        # Load the transmission indicators data
        self.transmission_indicators = self.db_read.read_data('transmission_indicators')

    def __transform_data__(self):
        """
        Flatten the nested `transmission_indicators` column into three plain columns
        (`cases_unknown_contact`, `identified_contacts_per_case`, `asymptomatic_percentage`) and drop it.
        """
        ti_df = self.transmission_indicators
        ti_df['cases_unknown_contact'] = ti_df['transmission_indicators'].apply(
            lambda x: x['cases_unknown_contact']['percentage'])
        ti_df['identified_contacts_per_case'] = ti_df['transmission_indicators'].apply(
            lambda x: x['identified_contacts_per_case']['median'])
        ti_df['asymptomatic_percentage'] = ti_df['transmission_indicators'].apply(
            lambda x: x['asymptomatic_percentage'])
        self.transmission_indicators = ti_df.drop(columns='transmission_indicators')

    def __aggregate_data__(self):
        """Append one 'España' row per date with the mean of every numeric indicator of that date"""
        grouped_data = self.transmission_indicators.groupby('date')

        # Only the numeric columns can be averaged: the frame also holds the region name, and averaging a
        # text column raises a TypeError on recent pandas versions instead of silently dropping it
        grouped_df = grouped_data.mean(numeric_only=True).reset_index()
        grouped_df['autonomous_region'] = 'España'

        self.transmission_indicators = pd.concat([self.transmission_indicators, grouped_df])
        self.transmission_indicators = self.transmission_indicators.sort_values(by=['date', 'autonomous_region'])

    def transform_and_store(self):
        """Transform, aggregate, and store the data"""
        self.__transform_data__()
        self.__aggregate_data__()
        self.__store_data__()

    def __store_data__(self):
        """Store the transmission indicators in the analyzed data database"""
        mongo_data = self.transmission_indicators.to_dict('records')
        collection = 'transmission_indicators'
        self.db_write.store_data(collection, mongo_data)
class SymptomsData:
    """Most common symptoms"""

    # Spanish name of each symptom identifier found in the `symptom` column
    spanish_translation = {
        'aki': 'Infección aguda de riñón',
        'dhiarrea': 'Diarrea',
        'other_respiratory': 'Otras afecciones respiratorias',
        'vomit': 'Vómitos',
        'dyspnoea': 'Disnea',
        'fever': 'Fiebre',
        'ards': 'Síndrome de dificultad respiratoria aguda',
        'cough': 'Tos',
        'sore_throat': 'Dolor de garganta'
    }

    def __init__(self):
        """Open the database connections and read the clinic description of the 29th May 2020"""
        # The extracted data database is the source, the analyzed data database is the destination
        self.db_read = MongoDatabase(MongoDatabase.extracted_db_name)
        self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name)

        report_filter = {'date': dt(2020, 5, 29)}
        wanted_fields = ['symptom', 'patients.total.percentage']
        self.symptoms_df = self.db_read.read_data('clinic_description', report_filter, wanted_fields)

    def move_data(self):
        """Transform the symptoms dataset and save it in the analyzed data database"""
        self.__transform_data__()
        self.__store_data__()

    def __transform_data__(self):
        """Replace the nested `patients` column by its total percentage and translate the symptoms to Spanish"""

        def total_percentage(patients):
            return patients['total']['percentage']

        self.symptoms_df['percentage'] = self.symptoms_df['patients'].apply(total_percentage)
        flattened_df = self.symptoms_df.drop(columns='patients')

        # Symptoms without a translation are left untouched
        flattened_df['symptom'] = flattened_df['symptom'].replace(SymptomsData.spanish_translation)
        self.symptoms_df = flattened_df

    def __store_data__(self):
        """Save the transformed symptoms in the `symptoms` collection"""
        records = self.symptoms_df.to_dict('records')
        self.db_write.store_data('symptoms', records)
class OutbreaksDescription:
    """Outbreaks description in Spain"""

    def __init__(self):
        """Open the database connections and read the outbreaks description dataset"""
        # The extracted data database is the source, the analyzed data database is the destination
        self.db_read = MongoDatabase(MongoDatabase.extracted_db_name)
        self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name)

        self.outbreaks_description_df = self.db_read.read_data('outbreaks_description')

    def move_data(self):
        """Copy the dataset, unchanged, from the extracted to the analyzed database"""
        self.__store_data__()

    def __store_data__(self):
        """Save the outbreaks description records in the analyzed data database"""
        records = self.outbreaks_description_df.to_dict('records')
        self.db_write.store_data('outbreaks_description', records)
def extract_and_store():
    """
    Read the processed Health Ministry PDF reports, extract the diagnostic tests, the hospitals pressure and
    the outbreaks description from each of them, and store the three lists of documents in the extracted
    data database.

    A failure while extracting one kind of data from a report is printed and does not stop the processing
    of the remaining data.
    """
    reports = []
    documents_diagnostic_tests = []
    documents_hospitals_pressure = []
    documents_outbreaks_description = []
    database = MongoDatabase(MongoDatabase.extracted_db_name)

    # Read the processed reports
    # NOTE(review): unpickling runs arbitrary code if a file has been tampered with; this presumably only
    # reads files written by this same pipeline - confirm the directory is not writable by third parties
    processed_reports_files = os.listdir(PDFMhealthTaskGroup.processed_reports_directory)

    for file in processed_reports_files:
        with open(PDFMhealthTaskGroup.processed_reports_directory + '/' + file, 'rb') as f:
            processed_report = pickle.load(f)
            reports.append(processed_report)

    for report in reports:
        try:
            diagnostic_tests = report.get_diagnostic_tests()
            if diagnostic_tests:
                documents_diagnostic_tests.extend(diagnostic_tests)
        except Exception:
            print("Error trying to extract the diagnostic tests data from Health Ministry report %i" % report.index)

        try:
            hospitals_pressure = report.get_hospital_pressure()
            if hospitals_pressure:
                documents_hospitals_pressure.extend(hospitals_pressure)
        except Exception:
            print("Error trying to extract the hospital pressure from Health Ministry report %i" % report.index)

        try:
            outbreaks_description = report.get_outbreaks_description()
            if outbreaks_description:
                documents_outbreaks_description.extend(outbreaks_description)
        except Exception:
            # This message used to blame the transmission indicators of a RENAVE report
            print("Error trying to extract the outbreaks description from Health Ministry report %i" % report.index)

    database.store_data('diagnostic_tests', documents_diagnostic_tests)
    database.store_data('hospitals_pressure', documents_hospitals_pressure)
    database.store_data('outbreaks_description', documents_outbreaks_description)
def _nest_dotted_keys(record):
    """Turn a flat record with 'a.b' keys into a nested one ({'a': {'b': value}}); other keys are kept as is"""
    transformed_record = {}

    for k, v in record.items():
        if '.' not in k:
            transformed_record[k] = v
        else:
            key, subkey = k.split('.')
            if key not in transformed_record:
                transformed_record[key] = {}
            transformed_record[key][subkey] = v

    return transformed_record


def _parse_general_vaccination_sheet(df_basic_data, date_report):
    """Translate the general vaccination sheet of a report and return it as a list of nested records"""
    # Translate the DataFrame columns
    columns_translations = {
        'Unnamed: 0': 'autonomous_region',
        'Dosis entregadas (1)': 'received_doses.total',
        'Total Dosis entregadas (1)': 'received_doses.total',
        'Dosis entregadas Pfizer (1)': 'received_doses.Pfizer',
        'Dosis entregadas Moderna (1)': 'received_doses.Moderna',
        'Dosis entregadas AstraZeneca (1)': 'received_doses.AstraZeneca',
        'Dosis entregadas Janssen (1)': 'received_doses.Janssen',
        'Dosis administradas (2)': 'applied_doses',
        '% sobre entregadas': 'percentage_applied_doses',
        'Nº Personas con al menos 1 dosis': 'number_at_least_single_dose_people',
        'Nº Personas vacunadas(pauta completada)': 'number_fully_vaccinated_people',
        'Fecha de la última vacuna registrada (2)': 'date'
    }
    df_basic_data = df_basic_data.rename(columns=columns_translations)
    df_basic_data['autonomous_region'] = df_basic_data['autonomous_region'].replace({'Totales': 'España'})

    # Transform some columns: every row gets the date of the report, and the applied doses ratio is
    # scaled by 100
    df_basic_data['date'] = date_report
    df_basic_data['percentage_applied_doses'] = 100 * df_basic_data['percentage_applied_doses']

    # Transform the a.b columns into a: {b: ''}
    return [_nest_dotted_keys(record) for record in df_basic_data.to_dict('records')]


def _parse_age_vaccination_sheet(df_doses, date_report):
    """Return one record per Autonomous Region and age range from an age ranges sheet of a report"""
    # Remove useless columns and rename the useful ones. The columns are dropped by position: the region
    # column and the ones named '%', '%.1'... are kept, together with whatever comes after position 23
    columns = df_doses.columns
    df_doses = df_doses.drop(
        columns=[columns[i] for i in [1, 2, 4, 5, 7, 8, 10, 11, 13, 14, 16, 17, 19, 20, 22, 23]])
    columns_translations = {
        'Unnamed: 0': 'autonomous_region',
        '%': '80+',
        '%.1': '70-79',
        '%.2': '60-69',
        '%.3': '50-59',
        '%.4': '25-49',
        '%.5': '18-24',
        '%.6': '16-17',
        columns[-1]: 'total'
    }
    df_doses = df_doses.rename(columns=columns_translations)
    df_doses['autonomous_region'] = df_doses['autonomous_region'].replace({'Total España': 'España'})

    # Remove the row of the Armed Forces, which is not an Autonomous Region
    df_doses = df_doses[df_doses['autonomous_region'] != 'Fuerzas Armadas']

    # Remove invalid data
    df_doses = df_doses.dropna()

    # Trim the autonomous region name (some have a trailing space for unknown reason)
    df_doses['autonomous_region'] = df_doses['autonomous_region'].apply(lambda x: x.strip())

    # Multiply by 100 the percentages (every column but the region one)
    df_doses[df_doses.columns[1:]] = 100 * df_doses[df_doses.columns[1:]]

    # Add the date to the DataFrame
    df_doses['date'] = date_report

    # Melt age range columns
    df_doses = df_doses.melt(id_vars=['autonomous_region', 'date'], var_name='age_range',
                             value_name='percentage')

    return df_doses.to_dict('records')


def store_vaccination_reports():
    """
    Store in the database the downloaded vaccination reports.

    Every file in the reports folder is read as a workbook. Its first sheet feeds the `vaccination_general`
    collection and, for the reports with more than three sheets, the last two sheets feed the
    `vaccination_ages_single` and `vaccination_ages_complete` collections.
    """
    vaccination_data = []
    vaccination_single = []
    vaccination_complete = []

    for file in os.listdir(VaccinationReportsTaskGroup.reports_folder):
        # Read the report (all its sheets, as a dictionary of DataFrames)
        df = pd.read_excel(VaccinationReportsTaskGroup.reports_folder + '/' + file, sheet_name=None)

        # Get the report date: the eight characters before the four characters long file extension
        date_string = file[-12:-4]
        date_report = dt.strptime(date_string, VaccinationReportsTaskGroup.date_filename_format)
        print(f"Reading report of {date_report.isoformat()}")

        sheet_names = list(df.keys())

        # Get the basic vaccination data
        vaccination_data.extend(_parse_general_vaccination_sheet(df[sheet_names[0]], date_report))

        if len(df) > 3:
            # Newer vaccination reports: get the age ranges
            vaccination_single.extend(_parse_age_vaccination_sheet(df[sheet_names[-2]], date_report))
            vaccination_complete.extend(_parse_age_vaccination_sheet(df[sheet_names[-1]], date_report))

    # Store the data in MongoDB
    database = MongoDatabase(MongoDatabase.extracted_db_name)
    database.store_data("vaccination_general", vaccination_data)
    database.store_data("vaccination_ages_single", vaccination_single)
    database.store_data("vaccination_ages_complete", vaccination_complete)
class DiagnosticTests:
    """Dataset with the number of diagnostic tests made each day on each Autonomous Region"""

    def __init__(self):
        """Open the database connections and load the diagnostic tests and population datasets"""
        # Connection to the extracted data database for reading, and to the analyzed data for writing
        self.db_read = MongoDatabase(MongoDatabase.extracted_db_name)
        self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name)

        # Load the diagnostic tests and the total population of each Autonomous Region
        self.diagnostic_tests_df = self.db_read.read_data('diagnostic_tests')
        self.population_df = self.db_read.read_data('population_ar', {'age_range': 'total'},
                                                    ['autonomous_region', 'total'])

    def __process_dataset__(self):
        """
        Get the data for the whole country, the total number of tests, the average positivity, and the number of
        total tests per 100k inhabitants.

        The incoming `total_diagnostic_tests` column ends up renamed to `new_diagnostic_tests`, while
        `total_diagnostic_tests` becomes its running total for each Autonomous Region.
        """
        # Number of tests and average positivity in the whole country
        diagnostics_grouped = self.diagnostic_tests_df.groupby('date')
        diagnostics_total_tests = diagnostics_grouped['total_diagnostic_tests'].sum()
        diagnostics_avg_positivity = diagnostics_grouped['positivity'].mean()
        diagnostics_df_total = pd.merge(diagnostics_total_tests, diagnostics_avg_positivity,
                                        on='date').reset_index()
        diagnostics_df_total['autonomous_region'] = 'España'
        self.diagnostic_tests_df = pd.concat([self.diagnostic_tests_df, diagnostics_df_total])
        self.diagnostic_tests_df = self.diagnostic_tests_df.sort_values(by=['date', 'autonomous_region'])

        # Moving average for positivity (the positivity line is very sharp)
        diagnostics_df = self.diagnostic_tests_df.set_index('date')
        positivity_ma = diagnostics_df.groupby('autonomous_region')['positivity'].rolling('14D').mean()
        self.diagnostic_tests_df = pd.merge(diagnostics_df, positivity_ma, on=['autonomous_region', 'date'])\
            .rename(columns={'positivity_x': 'positivity', 'positivity_y': 'positivity_ma_14d'}) \
            .reset_index() \
            .replace({np.nan: None})

        # Number of total tests: running total of the tests of each Autonomous Region
        diagnostic_tests_df_total = self.diagnostic_tests_df[['date', 'autonomous_region',
                                                              'total_diagnostic_tests']] \
            .groupby(['date', 'autonomous_region']).sum().groupby('autonomous_region').cumsum().reset_index()
        self.diagnostic_tests_df = pd.merge(
            self.diagnostic_tests_df, diagnostic_tests_df_total, on=['date', 'autonomous_region']).rename(
                columns={
                    'total_diagnostic_tests_x': 'new_diagnostic_tests',
                    'total_diagnostic_tests_y': 'total_diagnostic_tests'
                })

        # Moving average for number of new tests
        diagnostics_df = self.diagnostic_tests_df.set_index('date')
        diagnostics_ma = diagnostics_df.groupby('autonomous_region')['new_diagnostic_tests'].rolling('14D').mean()
        self.diagnostic_tests_df = pd.merge(diagnostics_df, diagnostics_ma, on=['autonomous_region', 'date'])\
            .rename(columns={'new_diagnostic_tests_x': 'new_diagnostic_tests',
                             'new_diagnostic_tests_y': 'new_diagnostic_tests_ma_14d'}) \
            .reset_index() \
            .replace({np.nan: None})

        # Average positivity for each Autonomous Region: running sum divided by the number of records so far
        diagnostic_tests_df_avg_positivity = self.diagnostic_tests_df[[
            'date', 'autonomous_region', 'positivity'
        ]].groupby(['date', 'autonomous_region']).sum().groupby('autonomous_region')
        avg_positivity_df = diagnostic_tests_df_avg_positivity.cumsum().rename(columns={'positivity': 'sum'})

        # cumcount() numbers the rows of each group starting at 0, so the number of records included in the
        # running sum is one more. Without the +1 the first record was divided by zero and the following
        # averages were too high
        avg_positivity_df['count'] = diagnostic_tests_df_avg_positivity.cumcount() + 1
        avg_positivity_df['average_positivity'] = avg_positivity_df['sum'] / avg_positivity_df['count']
        avg_positivity_df = avg_positivity_df.drop(columns=['sum', 'count'])
        self.diagnostic_tests_df = pd.merge(self.diagnostic_tests_df, avg_positivity_df,
                                            on=['date', 'autonomous_region'])

        # Total tests / 100 000 inhabitants
        diagnostics_population_df = pd.merge(self.diagnostic_tests_df, self.population_df,
                                             on='autonomous_region') \
            .rename(columns={'total': 'population'})
        diagnostics_population_df['total_tests_per_population'] = \
            100000 * diagnostics_population_df['total_diagnostic_tests'] / diagnostics_population_df['population']
        self.diagnostic_tests_df = diagnostics_population_df.drop(columns='population')

    def __store_data__(self):
        """Store the processed dataset in the database"""
        mongo_data = self.diagnostic_tests_df.replace({np.nan: None}).to_dict('records')
        collection = 'diagnostic_tests'
        self.db_write.store_data(collection, mongo_data)

    def process_and_store(self):
        """Analyze the data, calculate some new variables, and store the results to the database"""
        self.__process_dataset__()
        self.__store_data__()
class PopulationPyramidVariation:
    """Create a table with the population pyramid variation suffered due to COVID"""

    # Maps the age ranges of the population dataset to the decade-wide ranges used in the result
    age_range_translations = {
        '0-1': '0-9',
        '0-4': '0-9',
        '1-4': '0-9',
        '5-9': '0-9',
        '10-14': '10-19',
        '15-19': '10-19',
        '20-24': '20-29',
        '25-29': '20-29',
        '30-34': '30-39',
        '35-39': '30-39',
        '40-44': '40-49',
        '45-49': '40-49',
        '50-54': '50-59',
        '55-59': '50-59',
        '60-64': '60-69',
        '65-69': '60-69',
        '70-74': '70-79',
        '75-79': '70-79',
        '80-84': '80+',
        '85-89': '80+',
        '90-94': '80+',
        '95+': '80+',
        '≥90': '80+',
        'Total': 'total'
    }

    def __init__(self):
        """Open the database connections and read the COVID deaths and the Spanish population"""
        # The population comes from the extracted data database. The analyzed data database is used both
        # for reading the aggregated deaths and for writing the result
        self.db_read = MongoDatabase(MongoDatabase.extracted_db_name)
        self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name)

        deaths_fields = ['gender', 'age_range', 'covid_deaths']
        self.covid_deaths_df = self.db_write.read_data('covid_vs_all_deaths', None, deaths_fields)

        population_fields = ['age_range', 'M', 'F', 'total']
        self.population_df = self.db_read.read_data('population_ar', {'autonomous_region': 'España'},
                                                    population_fields)

    def process_and_store_data(self):
        """Build the population pyramid variation table and save it in the database"""
        self.__transform_data__()
        self.__store_data__()

    def __transform_data__(self):
        """
        Join the population with the COVID deaths by age range and gender, and subtract the deaths from the
        population to get the `alive_population` column.
        """
        # Use the wide age ranges in the population, adding up the ranges which end up with the same name
        translated_ranges = self.population_df['age_range'].replace(
            PopulationPyramidVariation.age_range_translations)
        self.population_df['age_range'] = translated_ranges
        summed_population = self.population_df.groupby('age_range').sum().reset_index()

        # One row per age range and gender instead of one column per gender
        self.population_df = summed_population.melt(id_vars='age_range', var_name='gender')

        # Put the population and the deaths side by side
        joined_df = pd.merge(self.population_df, self.covid_deaths_df, on=['age_range', 'gender'])
        joined_df = joined_df.rename(columns={'value': 'alive_population'})
        joined_df['alive_population'] = joined_df['alive_population'].sub(joined_df['covid_deaths'])
        self.population_pyramid_covid_df = joined_df

    def __store_data__(self):
        """Save the table in the `population_pyramid_variation` collection"""
        records = self.population_pyramid_covid_df.to_dict('records')
        self.db_write.store_data('population_pyramid_variation', records)
class DeathCauses:
    """Death causes in Spain"""

    # Maps the age ranges of the death causes dataset to the decade-wide ranges used in the results
    age_range_translations = {
        '0-1': '0-9',
        '0-4': '0-9',
        '1-4': '0-9',
        '5-9': '0-9',
        '10-14': '10-19',
        '15-19': '10-19',
        '20-24': '20-29',
        '25-29': '20-29',
        '30-34': '30-39',
        '35-39': '30-39',
        '40-44': '40-49',
        '45-49': '40-49',
        '50-54': '50-59',
        '55-59': '50-59',
        '60-64': '60-69',
        '65-69': '60-69',
        '70-74': '70-79',
        '75-79': '70-79',
        '80-84': '80+',
        '85-89': '80+',
        '90-94': '80+',
        '95+': '80+',
        '≥90': '80+',
        'Total': 'total'
    }

    def __init__(self):
        """Open the database connections and load the death causes and the COVID deaths of one year"""
        # Connection to the extracted data database for reading, and to the analyzed data for writing, as well as
        # for reading the aggregated deaths
        self.db_read = MongoDatabase(MongoDatabase.extracted_db_name)
        self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name)

        # Load the death causes
        self.death_causes_df = self.db_read.read_data('death_causes')

        # Load the COVID deaths until now. If it hasn't been yet a year since 15th March 2020, get the deaths until
        # today and calculate the proportional number to 365 days. If it's been more than one year,
        # get the number of deaths of the last 365 days.
        # The today deaths data might not be available yet, so we'll use the data from one week ago
        today = dt.today() - td(days=7)
        today = dt(today.year, today.month, today.day)  # remove the time from the today datetime object

        if today - dt(2020, 3, 15) >= td(days=365):
            # Get the number of deaths of the last 365 days
            deaths_one_year_ago_df = self.__read_spain_deaths__(today - td(days=365))
            deaths_today_df = self.__read_spain_deaths__(today)
            self.covid_deaths_df = DeathCauses.__subtract_previous_deaths__(deaths_today_df,
                                                                            deaths_one_year_ago_df)
        else:
            # Get the number of deaths until today, and calculate the remaining proportion to complete a year
            deaths_today_df = self.__read_spain_deaths__(today)
            proportion = 365 / (today - dt(2020, 3, 15)).days
            self.covid_deaths_df = deaths_today_df
            self.covid_deaths_df['total_deaths'] = self.covid_deaths_df['total_deaths'] * proportion

    def __read_spain_deaths__(self, date):
        """Read from the analyzed database the total deaths in Spain by age range and gender on a given date"""
        return self.db_write.read_data('deaths', {
            'autonomous_region': 'España',
            'date': date
        }, ['age_range', 'total_deaths', 'gender'])

    @staticmethod
    def __subtract_previous_deaths__(deaths_today_df, deaths_one_year_ago_df):
        """
        Return a copy of `deaths_today_df` whose `total_deaths` is the difference with the deaths one year ago.

        The rows are matched by age range and gender instead of by position, so the result does not depend
        on both queries returning their rows in the same order. A row without a match one year ago gets a
        missing value. The combination of age range and gender is assumed to be unique in
        `deaths_one_year_ago_df`.
        """
        keys = ['age_range', 'gender']
        previous_df = deaths_one_year_ago_df[keys + ['total_deaths']] \
            .rename(columns={'total_deaths': 'previous_deaths'})

        # The left join keeps every row of the current deaths, in their original order
        merged_df = pd.merge(deaths_today_df, previous_df, on=keys, how='left')
        merged_df['total_deaths'] = merged_df['total_deaths'] - merged_df['previous_deaths']
        return merged_df.drop(columns='previous_deaths')

    def process_and_store_data(self):
        """Analyze the data, calculate some new variables, and store the results to the database"""
        self.__calculate_top_10_death_causes__()
        self.__store_data__()

    def __calculate_top_10_death_causes__(self):
        """
        Calculate the top 10 death causes for each age range and gender (`self.death_causes_top_10`), and the
        percentage of COVID deaths over the COVID deaths plus the 'Todas las causas' deaths
        (`self.covid_vs_all_deaths`).
        """
        # Use the same age ranges in all the dataframes
        self.death_causes_df['age_range'] = self.death_causes_df['age_range'] \
            .replace(DeathCauses.age_range_translations)
        self.death_causes_df = self.death_causes_df.groupby(
            ['age_range', 'death_cause', 'gender']).sum().reset_index()

        # Get "all causes" death cause and then remove it
        all_causes_sum_df = self.death_causes_df[self.death_causes_df['death_cause'] == 'Todas las causas'].copy()
        death_causes_df = self.death_causes_df[self.death_causes_df['death_cause'] != 'Todas las causas']

        # Group the death causes with the COVID-19 deaths
        self.covid_deaths_df['death_cause'] = 'COVID-19'
        death_causes_total = pd.concat([self.covid_deaths_df, death_causes_df])

        # Get the top 10 death causes for each age range and gender. The copy avoids writing into a slice
        # of the sorted frame
        death_causes_top_10 = death_causes_total.sort_values(
            ['age_range', 'total_deaths', 'gender'],
            ascending=False).groupby(['age_range', 'gender']).head(10).copy()
        death_causes_top_10['total_deaths'] = death_causes_top_10['total_deaths'].round().astype("int")
        self.death_causes_top_10 = death_causes_top_10

        # Calculate the percentage of deaths produced by COVID. After the merge, the _x columns come from
        # the COVID deaths and the _y ones from the "all causes" deaths
        covid_vs_all_deaths = pd.merge(
            self.covid_deaths_df, all_causes_sum_df, on=['age_range', 'gender']).rename(columns={
                'total_deaths_x': 'covid_deaths',
                'total_deaths_y': 'other_deaths'
            }).drop(columns=['death_cause_y', 'death_cause_x'])
        covid_vs_all_deaths['covid_percentage'] = 100 * covid_vs_all_deaths['covid_deaths'] / (
            covid_vs_all_deaths['covid_deaths'] + covid_vs_all_deaths['other_deaths'])
        self.covid_vs_all_deaths = covid_vs_all_deaths

    def __store_data__(self):
        """Store the top death causes and the COVID deaths percentage in the database"""
        mongo_data_top_death_causes = self.death_causes_top_10.to_dict('records')
        collection = 'top_death_causes'
        self.db_write.store_data(collection, mongo_data_top_death_causes)

        mongo_data_covid_vs_all_deaths = self.covid_vs_all_deaths.to_dict('records')
        collection = 'covid_vs_all_deaths'
        self.db_write.store_data(collection, mongo_data_covid_vs_all_deaths)
class DailyCOVIDData:
    """
    Daily data of the COVID pandemic in Spain, with the number of new cases, hospitalizations, and deaths by
    Autonomous Region and age range.
    """

    # Columns identifying each independent time series of the dataset
    _GROUP_KEYS = ['autonomous_region', 'gender', 'age_range']

    @staticmethod
    def calculate_increase_percentage(data):
        """
        Return the percentage increase or decrease between the first and the last element of `data`.

        When the first element is 0 the result is 0, so that no division by zero is attempted.
        """
        if data[0] == 0:
            return 0

        return 100 * ((data[-1] - data[0]) / data[0])

    @staticmethod
    def _increase_within_groups(smoothed, periods):
        """
        Apply `calculate_increase_percentage` over windows of `periods` rows, separately for each time series.

        `smoothed` is a Series whose index contains the `_GROUP_KEYS` levels (the output of a grouped rolling
        aggregation). Rolling over it directly would make the first windows of every series include the last rows
        of the previous series, hence the rolling is performed group by group. The first `periods - 1` rows of each
        group are NaN. The returned Series has the same index as `smoothed`.
        """
        return smoothed.groupby(level=DailyCOVIDData._GROUP_KEYS).transform(
            lambda series: series.rolling(periods).apply(DailyCOVIDData.calculate_increase_percentage, raw=True))

    @staticmethod
    def _ratio_percentage(numerator, denominator):
        """
        Return `100 * numerator / denominator` as a Series, with 0 wherever the ratio is undefined.

        0 / 0 produces NaN and x / 0 produces an infinite value; both are replaced by 0 so that only finite
        numbers are stored.
        """
        ratio = (numerator / denominator).replace([np.inf, -np.inf, np.nan], 0)
        return 100 * ratio

    def __init__(self):
        """Load the data from the database and store it into a Pandas DataFrame"""
        # Connection to the extracted data database for reading, and to the analyzed data for writing
        self.db_read = MongoDatabase(MongoDatabase.extracted_db_name)
        self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name)

        # Load the data from the DB
        self.df = self.db_read.read_data('daily_data')
        self.population_df = self.db_read.read_data('population_ar')

        # Aggregate the data
        self.__merge__population__()

    def __merge__population__(self):
        """
        Merge the COVID daily data dataset with the population dataset.

        After this call `self.df` is indexed by `date` and has a `population` column.
        """
        # Change the population age ranges to the COVID daily data ones
        age_range_translations = {
            '0-4': '0-9',
            '5-9': '0-9',
            '10-14': '10-19',
            '15-19': '10-19',
            '20-24': '20-29',
            '25-29': '20-29',
            '30-34': '30-39',
            '35-39': '30-39',
            '40-44': '40-49',
            '45-49': '40-49',
            '50-54': '50-59',
            '55-59': '50-59',
            '60-64': '60-69',
            '65-69': '60-69',
            '70-74': '70-79',
            '75-79': '70-79',
            '80-84': '80+',
            '85-89': '80+',
            '≥90': '80+',
            'Total': 'total'
        }
        self.population_df['age_range'] = self.population_df['age_range'].replace(age_range_translations)
        # Several original ranges map to the same target range, so their rows are summed
        self.population_df = self.population_df.groupby(['age_range', 'autonomous_region']).sum().reset_index()

        # Replace the M, F, total columns by a single 'gender' column
        self.population_df = self.population_df.melt(
            id_vars=['autonomous_region', 'age_range'],
            value_vars=['M', 'F', 'total'],
            var_name='gender')

        # Merge the COVID dataset with the population data
        covid_population_df = pd.merge(self.df, self.population_df, on=['autonomous_region', 'age_range', 'gender']) \
            .rename(columns={'value': 'population'})
        covid_population_df['date'] = pd.to_datetime(covid_population_df['date'])
        # The date index is required by the time-based rolling windows of the process_and_store_* methods
        self.df = covid_population_df.set_index('date')

    def process_and_store_cases(self):
        """
        Create a DataFrame with all the data related to the cases, and store it in the 'cases' collection.

        The increase percentages are calculated within each (autonomous region, gender, age range) series; the
        first rows of each series, which have no complete window yet, are NaN.
        """
        group_keys = DailyCOVIDData._GROUP_KEYS
        merge_keys = ['autonomous_region', 'date', 'age_range', 'gender']

        # Get only the cases from the dataset
        cases_df = self.df[[
            'gender', 'age_range', 'autonomous_region', 'new_cases', 'total_cases', 'population'
        ]].copy()

        # Calculate the cases per population
        cases_df['new_cases_per_population'] = 100000 * cases_df['new_cases'] / cases_df['population']
        cases_df['total_cases_per_population'] = 100000 * cases_df['total_cases'] / cases_df['population']

        # CI last 14 days
        cases_ci = cases_df.groupby(group_keys)['new_cases_per_population'].rolling('14D', min_periods=1).sum()
        cases_df = pd.merge(cases_df, cases_ci, on=['autonomous_region', 'date', 'gender', 'age_range']).rename(
            columns={
                'new_cases_per_population_x': 'new_cases_per_population',
                'new_cases_per_population_y': 'ci_last_14_days'
            })
        # Inverse of the CI, fixed to 10000 when the CI is not greater than 10
        cases_df['inverted_ci'] = cases_df['ci_last_14_days'].apply(lambda x: 100000 / x if x > 10 else 10000)

        # Daily, weekly and monthly increase of the smoothed new cases. The comparison windows are evaluated
        # inside each series, so they never mix rows of different regions, genders or age ranges.
        grouped_new_cases = cases_df.groupby(group_keys)['new_cases']
        increase_cases_percentages = pd.DataFrame({
            'daily_increase': self._increase_within_groups(grouped_new_cases.rolling('7D').mean(), 2),
            'weekly_increase': self._increase_within_groups(grouped_new_cases.rolling('14D').mean(), 8),
            'monthly_increase': self._increase_within_groups(grouped_new_cases.rolling('60D').mean(), 31)
        })
        cases_df = pd.merge(cases_df, increase_cases_percentages, on=merge_keys)

        # New cases moving average
        grouped_cases_per_population = cases_df.groupby(group_keys)['new_cases_per_population']
        new_cases_ma = pd.DataFrame({
            'new_cases_ma_1w': grouped_cases_per_population.rolling('8D').mean(),
            'new_cases_ma_2w': grouped_cases_per_population.rolling('15D').mean()
        })
        cases_df = pd.merge(cases_df, new_cases_ma, on=merge_keys)

        cases_df = cases_df.drop(columns=['population'])

        # Store the data
        self.db_write.store_data('cases', cases_df.reset_index().to_dict('records'))

    def process_and_store_deaths(self):
        """
        Create a DataFrame with all the data related to the deaths, and store it in the 'deaths' collection.

        The mortality percentages are 0 wherever the number of cases they are divided by is 0.
        """
        group_keys = DailyCOVIDData._GROUP_KEYS
        merge_keys = ['autonomous_region', 'date', 'age_range', 'gender']
        increase = DailyCOVIDData.calculate_increase_percentage

        # Get only the deaths from the dataset
        deaths_df = self.df[[
            'gender', 'age_range', 'autonomous_region', 'new_deaths', 'total_deaths', 'new_cases', 'total_cases',
            'population'
        ]].copy()

        # Calculate the deaths per population
        deaths_df['new_deaths_per_population'] = 100000 * deaths_df['new_deaths'] / deaths_df['population']
        deaths_df['total_deaths_per_population'] = 100000 * deaths_df['total_deaths'] / deaths_df['population']

        # Daily, weekly and monthly increase
        grouped_new_deaths = deaths_df.groupby(group_keys)['new_deaths']
        increase_deaths_percentages = pd.DataFrame({
            'daily_increase': grouped_new_deaths.rolling('2D').apply(increase, raw=True),
            'weekly_increase': grouped_new_deaths.rolling('8D').apply(increase, raw=True),
            'two_weeks_increase': grouped_new_deaths.rolling('15D').apply(increase, raw=True),
            'monthly_increase': grouped_new_deaths.rolling('31D').apply(increase, raw=True)
        })
        deaths_df = pd.merge(deaths_df, increase_deaths_percentages, on=merge_keys)

        # New deaths moving average
        grouped_deaths_per_population = deaths_df.groupby(group_keys)['new_deaths_per_population']
        new_deaths_ma = pd.DataFrame({
            'new_deaths_ma_1w': grouped_deaths_per_population.rolling('8D').mean(),
            'new_deaths_ma_2w': grouped_deaths_per_population.rolling('15D').mean()
        })
        deaths_df = pd.merge(deaths_df, new_deaths_ma, on=merge_keys)

        # Mortality percentage
        deaths_df['new_cases_per_population'] = 100000 * deaths_df['new_cases'] / deaths_df['population']
        new_cases_ma_2w = deaths_df.groupby(group_keys)['new_cases_per_population'].rolling('15D').mean()
        new_cases_ma_2w_df = pd.DataFrame({'new_cases_ma_2w': new_cases_ma_2w})
        deaths_df = pd.merge(deaths_df, new_cases_ma_2w_df, on=merge_keys)

        deaths_df['mortality_2w'] = self._ratio_percentage(deaths_df['new_deaths_ma_2w'],
                                                           deaths_df['new_cases_ma_2w'])
        deaths_df['mortality_total'] = self._ratio_percentage(deaths_df['total_deaths'], deaths_df['total_cases'])

        # The cases columns were only needed to calculate the mortality
        deaths_df = deaths_df.drop(columns=[
            'new_cases_ma_2w', 'new_cases_per_population', 'new_cases', 'total_cases', 'population'
        ])

        # Store the data
        self.db_write.store_data('deaths', deaths_df.reset_index().to_dict('records'))

    def process_and_store_hospitalizations(self):
        """
        Create a DataFrame with all the data related to the hospitalizations, and store it in the
        'hospitalizations' collection.

        The hospitalization ratios are 0 wherever the number of cases they are divided by is 0.
        """
        group_keys = DailyCOVIDData._GROUP_KEYS
        merge_keys = ['autonomous_region', 'date', 'age_range', 'gender']
        increase = DailyCOVIDData.calculate_increase_percentage

        # Get only the hospitalizations from the dataset
        hospitalizations_df = self.df[[
            'gender', 'age_range', 'autonomous_region', 'new_hospitalizations', 'total_hospitalizations',
            'new_ic_hospitalizations', 'total_ic_hospitalizations', 'new_cases', 'total_cases', 'population'
        ]].copy()

        # Calculate the hospitalizations per population
        hospitalizations_df['new_hospitalizations_per_population'] = \
            100000 * hospitalizations_df['new_hospitalizations'] / hospitalizations_df['population']
        hospitalizations_df['total_hospitalizations_per_population'] = \
            100000 * hospitalizations_df['total_hospitalizations'] / hospitalizations_df['population']
        hospitalizations_df['new_ic_hospitalizations_per_population'] = \
            100000 * hospitalizations_df['new_ic_hospitalizations'] / hospitalizations_df['population']
        hospitalizations_df['total_ic_hospitalizations_per_population'] = \
            100000 * hospitalizations_df['total_ic_hospitalizations'] / hospitalizations_df['population']

        # Daily, weekly and monthly increase
        grouped_new = hospitalizations_df.groupby(group_keys)[['new_hospitalizations', 'new_ic_hospitalizations']]
        increase_hospitalizations_df_1d = grouped_new.rolling('2D').apply(increase, raw=True)
        increase_hospitalizations_df_7d = grouped_new.rolling('8D').apply(increase, raw=True)
        increase_hospitalizations_df_14d = grouped_new.rolling('15D').apply(increase, raw=True)
        increase_hospitalizations_df_30d = grouped_new.rolling('31D').apply(increase, raw=True)

        increase_hospitalizations_percentages = pd.DataFrame({
            'hospitalizations_daily_increase': increase_hospitalizations_df_1d['new_hospitalizations'],
            'hospitalizations_weekly_increase': increase_hospitalizations_df_7d['new_hospitalizations'],
            'hospitalizations_two_weeks_increase': increase_hospitalizations_df_14d['new_hospitalizations'],
            'hospitalizations_monthly_increase': increase_hospitalizations_df_30d['new_hospitalizations'],
            'ic_daily_increase': increase_hospitalizations_df_1d['new_ic_hospitalizations'],
            'ic_weekly_increase': increase_hospitalizations_df_7d['new_ic_hospitalizations'],
            'ic_two_weeks_increase': increase_hospitalizations_df_14d['new_ic_hospitalizations'],
            'ic_monthly_increase': increase_hospitalizations_df_30d['new_ic_hospitalizations']
        })
        hospitalizations_df = pd.merge(hospitalizations_df, increase_hospitalizations_percentages, on=merge_keys)

        # New hospitalizations moving average
        grouped_per_population = hospitalizations_df.groupby(group_keys)[[
            'new_hospitalizations_per_population', 'new_ic_hospitalizations_per_population'
        ]]
        new_hospitalizations_ma_1w = grouped_per_population.rolling('8D').mean()
        new_hospitalizations_ma_2w = grouped_per_population.rolling('15D').mean()

        new_hospitalizations_ma = pd.DataFrame({
            'new_hospitalizations_ma_1w': new_hospitalizations_ma_1w['new_hospitalizations_per_population'],
            'new_hospitalizations_ma_2w': new_hospitalizations_ma_2w['new_hospitalizations_per_population'],
            'new_ic_ma_1w': new_hospitalizations_ma_1w['new_ic_hospitalizations_per_population'],
            'new_ic_ma_2w': new_hospitalizations_ma_2w['new_ic_hospitalizations_per_population']
        })
        hospitalizations_df = pd.merge(hospitalizations_df, new_hospitalizations_ma, on=merge_keys)

        # Hospitalization percentage
        hospitalizations_df['new_cases_per_population'] = \
            100000 * hospitalizations_df['new_cases'] / hospitalizations_df['population']
        new_cases_ma_2w = hospitalizations_df.groupby(group_keys)['new_cases_per_population'].rolling('15D').mean()
        new_cases_ma_2w_df = pd.DataFrame({'new_cases_ma_2w': new_cases_ma_2w})
        hospitalizations_df = pd.merge(hospitalizations_df, new_cases_ma_2w_df, on=merge_keys)

        hospitalizations_df['hospitalization_ratio_2w'] = self._ratio_percentage(
            hospitalizations_df['new_hospitalizations_ma_2w'], hospitalizations_df['new_cases_ma_2w'])
        hospitalizations_df['hospitalization_ratio_total'] = self._ratio_percentage(
            hospitalizations_df['total_hospitalizations'], hospitalizations_df['total_cases'])
        hospitalizations_df['hospitalization_ic_ratio_2w'] = self._ratio_percentage(
            hospitalizations_df['new_ic_ma_2w'], hospitalizations_df['new_cases_ma_2w'])
        hospitalizations_df['hospitalization_ic_ratio_total'] = self._ratio_percentage(
            hospitalizations_df['total_ic_hospitalizations'], hospitalizations_df['total_cases'])

        # The cases columns were only needed to calculate the ratios
        hospitalizations_df = hospitalizations_df.drop(columns=[
            'new_cases_ma_2w', 'new_cases_per_population', 'new_cases', 'total_cases', 'population'
        ])

        # Store the data
        self.db_write.store_data('hospitalizations', hospitalizations_df.reset_index().to_dict('records'))