def __init__(self):
    """Fetch the general vaccination data into a DataFrame."""
    # Output target: the analyzed-data database
    self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name)
    # Input source: the extracted-data database
    self.db_read = MongoDatabase(MongoDatabase.extracted_db_name)
    # Whole collection, no filter or projection
    self.df_vaccination_general = self.db_read.read_data('vaccination_general')
def __init__(self):
    """Fetch the transmission indicators data into a DataFrame."""
    # Output target: the analyzed-data database
    self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name)
    # Input source: the extracted-data database
    self.db_read = MongoDatabase(MongoDatabase.extracted_db_name)
    # Whole collection, no filter or projection
    self.transmission_indicators = self.db_read.read_data('transmission_indicators')
def __init__(self):
    """Fetch the outbreaks description data into a DataFrame."""
    # Output target: the analyzed-data database
    self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name)
    # Input source: the extracted-data database
    self.db_read = MongoDatabase(MongoDatabase.extracted_db_name)
    # Whole collection, no filter or projection
    self.outbreaks_description_df = self.db_read.read_data('outbreaks_description')
def __init__(self):
    """Fetch the clinical symptoms data into a DataFrame."""
    # Reader on the extracted data, writer on the analyzed data
    self.db_read = MongoDatabase(MongoDatabase.extracted_db_name)
    self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name)
    # Symptom prevalence is taken from the clinic description report of
    # 2020-05-29; only the symptom name and the overall percentage of
    # patients showing it are needed.
    report_filter = {'date': dt(2020, 5, 29)}
    wanted_fields = ['symptom', 'patients.total.percentage']
    self.symptoms_df = self.db_read.read_data('clinic_description',
                                              report_filter, wanted_fields)
def __init__(self):
    """Fetch the diagnostic tests and the Spanish population datasets."""
    # Reader on the extracted data, writer on the analyzed data
    self.db_read = MongoDatabase(MongoDatabase.extracted_db_name)
    self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name)
    # Full diagnostic tests collection
    self.diagnostic_tests_df = self.db_read.read_data('diagnostic_tests')
    # Population per autonomous region: keep only the all-ages totals
    self.population_df = self.db_read.read_data(
        'population_ar', {'age_range': 'total'},
        ['autonomous_region', 'total'])
def __init__(self):
    """Fetch the daily COVID data and the regional population, then merge them."""
    # Reader on the extracted data, writer on the analyzed data
    self.db_read = MongoDatabase(MongoDatabase.extracted_db_name)
    self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name)
    # Raw daily figures and the population lookup table
    self.df = self.db_read.read_data('daily_data')
    self.population_df = self.db_read.read_data('population_ar')
    # Attach the population figures to the daily data
    self.__merge__population__()
def __init__(self):
    """Fetch the hospitals pressure data into a DataFrame."""
    # Reader on the extracted data, writer on the analyzed data
    self.db_read = MongoDatabase(MongoDatabase.extracted_db_name)
    self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name)
    # Keep only ward and ICU occupancy figures per region and date
    occupancy_fields = [
        'autonomous_region', 'date', 'hospitalized_patients',
        'beds_percentage', 'ic_patients', 'ic_beds_percentage'
    ]
    self.hospitals_pressure = self.db_read.read_data(
        'hospitals_pressure', projection=occupancy_fields)
def __init__(self):
    """Fetch the aggregated COVID deaths and the national population pyramid."""
    # The extracted DB supplies the population data; the analyzed DB is both
    # the write target and the source of the previously aggregated deaths.
    self.db_read = MongoDatabase(MongoDatabase.extracted_db_name)
    self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name)
    # COVID deaths by gender and age range (already aggregated by an earlier step)
    self.covid_deaths_df = self.db_write.read_data(
        'covid_vs_all_deaths', None, ['gender', 'age_range', 'covid_deaths'])
    # National population broken down by age range and gender
    self.population_df = self.db_read.read_data(
        'population_ar', {'autonomous_region': 'España'},
        ['age_range', 'M', 'F', 'total'])
def extract_and_store():
    """Read the processed RENAVE PDF reports, extract the clinic description
    and the transmission indicators, and store both into the database."""
    clinic_docs = []
    transmission_docs = []
    database = MongoDatabase(MongoDatabase.extracted_db_name)
    # Unpickle every previously processed report
    reports_dir = PDFRenaveTaskGroup.processed_reports_directory
    reports = []
    for filename in os.listdir(reports_dir):
        with open(reports_dir + '/' + filename, 'rb') as handle:
            reports.append(pickle.load(handle))
    for report in reports:
        # An extraction failure in one report must not abort the whole run
        try:
            clinic_description = report.get_clinic_description()
            if clinic_description:
                clinic_docs.extend(clinic_description)
        except Exception:
            print("Error trying to extract the clinic description from RENAVE report %i" % report.index)
        try:
            transmission_indicators = report.get_transmission_indicators()
            if transmission_indicators:
                transmission_docs.extend(transmission_indicators)
        except Exception:
            print("Error trying to extract the transmission indicators from RENAVE report %i" % report.index)
    # Persist both document sets
    database.store_data('clinic_description', clinic_docs)
    database.store_data('transmission_indicators', transmission_docs)
def process_and_store_ar_population():
    """Parse the autonomous-regions population CSV and store it in the DB."""
    # The CSV uses the Spanish locale: ';' separator, ',' decimal, '.' thousands
    population_dataset = ARPopulationCSVDataset(
        "csv_data/population_ar.csv", separator=';', decimal=',', thousands='.')
    population_dataset.store_dataset(
        MongoDatabase(MongoDatabase.extracted_db_name), 'population_ar')
def __init__(self): """Load the datasets""" # Connection to the extracted data database for reading, and to the analyzed data for writing, as well as # for reading the aggregated deaths self.db_read = MongoDatabase(MongoDatabase.extracted_db_name) self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name) # Load the death causes self.death_causes_df = self.db_read.read_data('death_causes') # Load the COVID deaths until now. If it hasn't been yet a year since 15th March 2020, get the deaths until # today and calculate the proportional number to 365 days. If it's been more than one year, # get the number of deaths of the last 365 days. today = dt.today() - td( days=7 ) # the today deaths data might not be available yet, so we'll use the data from one week ago today = dt(today.year, today.month, today.day) # remove the time from the today datetime object if today - dt(2020, 3, 15) >= td(days=365): # Get the number of deaths of the last 365 days deaths_one_year_ago_df = self.db_write.read_data( 'deaths', { 'autonomous_region': 'España', 'date': today - td(days=365) }, ['age_range', 'total_deaths', 'gender']) deaths_today_df = self.db_write.read_data('deaths', { 'autonomous_region': 'España', 'date': today }, ['age_range', 'total_deaths', 'gender']) self.covid_deaths_df = deaths_today_df self.covid_deaths_df['total_deaths'] = \ self.covid_deaths_df['total_deaths'] - deaths_one_year_ago_df['total_deaths'] else: # Get the number of deaths until today, and calculate the remaining proportion to complete a year deaths_today_df = self.db_write.read_data('deaths', { 'autonomous_region': 'España', 'date': today }, ['age_range', 'total_deaths', 'gender']) proportion = 365 / (today - dt(2020, 3, 15)).days self.covid_deaths_df = deaths_today_df self.covid_deaths_df['total_deaths'] = self.covid_deaths_df[ 'total_deaths'] * proportion
def extract_and_store():
    """Read the processed Health Ministry PDF reports, extract the diagnostic
    tests, hospitals pressure and outbreaks description data, and store each
    dataset into the extracted-data database.

    An extraction failure in a single report is logged and skipped, so one
    malformed report does not abort the whole run.
    """
    reports = []
    documents_diagnostic_tests = []
    documents_hospitals_pressure = []
    documents_outbreaks_description = []
    database = MongoDatabase(MongoDatabase.extracted_db_name)

    # Unpickle every previously processed report
    processed_reports_files = os.listdir(
        PDFMhealthTaskGroup.processed_reports_directory)
    for file in processed_reports_files:
        with open(
                PDFMhealthTaskGroup.processed_reports_directory + '/' + file,
                'rb') as f:
            processed_report = pickle.load(f)
            reports.append(processed_report)

    for report in reports:
        try:
            diagnostic_tests = report.get_diagnostic_tests()
            if diagnostic_tests:
                documents_diagnostic_tests.extend(diagnostic_tests)
        except Exception:
            print(
                "Error trying to extract the diagnostic tests data from Health Ministry report %i"
                % report.index)
        try:
            hospitals_pressure = report.get_hospital_pressure()
            if hospitals_pressure:
                documents_hospitals_pressure.extend(hospitals_pressure)
        except Exception:
            print(
                "Error trying to extract the hospital pressure from Health Ministry report %i"
                % report.index)
        try:
            outbreaks_description = report.get_outbreaks_description()
            if outbreaks_description:
                documents_outbreaks_description.extend(
                    outbreaks_description)
        except Exception:
            # Fixed copy-paste error: this message previously claimed the
            # failure was in the "transmission indicators" of a "RENAVE"
            # report, which belongs to a different task group.
            print(
                "Error trying to extract the outbreaks description from Health Ministry report %i"
                % report.index)

    database.store_data('diagnostic_tests', documents_diagnostic_tests)
    database.store_data('hospitals_pressure', documents_hospitals_pressure)
    database.store_data('outbreaks_description',
                        documents_outbreaks_description)
def store_vaccination_reports():
    """Store in the database the downloaded reports"""
    # Accumulators for the three MongoDB collections written at the end
    vaccination_data = []
    vaccination_single = []
    vaccination_complete = []
    for file in os.listdir(VaccinationReportsTaskGroup.reports_folder):
        # Read the report (sheet_name=None loads every sheet into a dict of DataFrames)
        df = pd.read_excel(VaccinationReportsTaskGroup.reports_folder + '/' +
                           file, sheet_name=None)
        # Get the report date (encoded in the last 8 characters of the file name, before the extension)
        date_string = file[-12:-4]
        date_report = dt.strptime(
            date_string, VaccinationReportsTaskGroup.date_filename_format)
        print(f"Reading report of {date_report.isoformat()}")
        # Get the basic vaccination data (always the first sheet of the workbook)
        df_basic_data = df[list(df.keys())[0]]
        # Translate the DataFrame columns from the Spanish report headers.
        # Dotted names (a.b) are later expanded into nested documents.
        columns_translations = {
            'Unnamed: 0': 'autonomous_region',
            'Dosis entregadas (1)': 'received_doses.total',
            'Total Dosis entregadas (1)': 'received_doses.total',
            'Dosis entregadas Pfizer (1)': 'received_doses.Pfizer',
            'Dosis entregadas Moderna (1)': 'received_doses.Moderna',
            'Dosis entregadas AstraZeneca (1)': 'received_doses.AstraZeneca',
            'Dosis entregadas Janssen (1)': 'received_doses.Janssen',
            'Dosis administradas (2)': 'applied_doses',
            '% sobre entregadas': 'percentage_applied_doses',
            'Nº Personas con al menos 1 dosis': 'number_at_least_single_dose_people',
            'Nº Personas vacunadas(pauta completada)': 'number_fully_vaccinated_people',
            'Fecha de la última vacuna registrada (2)': 'date'
        }
        df_basic_data = df_basic_data.rename(columns=columns_translations)
        # Normalize the national total row name
        df_basic_data['autonomous_region'] = df_basic_data[
            'autonomous_region'].replace({'Totales': 'España'})
        # Transform some columns: use the report date and express the ratio as a percentage
        df_basic_data['date'] = date_report
        df_basic_data['percentage_applied_doses'] = 100 * df_basic_data[
            'percentage_applied_doses']
        # Save into MongoDB
        df_dict = df_basic_data.to_dict('records')
        mongo_data = []
        # Transform the a.b columns into a: {b: ''} (nested sub-documents)
        for record in df_dict:
            transformed_record = {}
            mongo_data.append(transformed_record)
            for k, v in record.items():
                if '.' not in k:
                    transformed_record[k] = v
                else:
                    key, subkey = k.split('.')
                    if key not in transformed_record:
                        transformed_record[key] = {}
                    transformed_record[key][subkey] = v
        vaccination_data.extend(mongo_data)
        if len(df) > 3:
            # Newer vaccination reports: get the age ranges.
            # The last two sheets hold single-dose and complete-dose coverage.
            df_single_dose = df[list(df.keys())[-2]]
            df_complete_dose = df[list(df.keys())[-1]]
            for number_doses in ['single', 'complete']:
                df_doses = df_single_dose if number_doses == 'single' else df_complete_dose
                # Remove useless columns and rename the useful ones.
                # NOTE(review): the dropped column indices assume the fixed layout of
                # these report sheets — verify against a sample workbook.
                columns = df_doses.columns
                df_doses = df_doses.drop(columns=[
                    columns[i] for i in [
                        1, 2, 4, 5, 7, 8, 10, 11, 13, 14, 16, 17, 19, 20, 22,
                        23
                    ]
                ])
                # Remaining '%' columns map positionally onto the age ranges
                columns_translations = {
                    'Unnamed: 0': 'autonomous_region',
                    '%': '80+',
                    '%.1': '70-79',
                    '%.2': '60-69',
                    '%.3': '50-59',
                    '%.4': '25-49',
                    '%.5': '18-24',
                    '%.6': '16-17',
                    columns[-1]: 'total'
                }
                df_doses = df_doses.rename(columns=columns_translations)
                df_doses['autonomous_region'] = df_doses[
                    'autonomous_region'].replace({'Total España': 'España'})
                # Remove information about the navy
                df_doses = df_doses[
                    df_doses['autonomous_region'] != 'Fuerzas Armadas']
                # Remove invalid data
                df_doses = df_doses.dropna()
                # Trim the autonomous region name (some have a trailing space for unknown reason)
                df_doses['autonomous_region'] = df_doses[
                    'autonomous_region'].apply(lambda x: x.strip())
                # Multiply by 100 the percentages
                df_doses[df_doses.columns[1:]] = 100 * df_doses[
                    df_doses.columns[1:]]
                # Add the date to the DataFrame
                df_doses['date'] = date_report
                # Melt age range columns: one row per (region, date, age_range)
                df_doses = df_doses.melt(
                    id_vars=['autonomous_region', 'date'],
                    var_name='age_range', value_name='percentage')
                # Convert data to dict
                df_dict = df_doses.to_dict('records')
                if number_doses == 'single':
                    vaccination_single.extend(df_dict)
                else:
                    vaccination_complete.extend(df_dict)
    # Store the data in MongoDB
    database = MongoDatabase(MongoDatabase.extracted_db_name)
    database.store_data("vaccination_general", vaccination_data)
    database.store_data("vaccination_ages_single", vaccination_single)
    database.store_data("vaccination_ages_complete", vaccination_complete)
def process_and_store_death_causes():
    """Parse the death-causes CSV and persist it into the extracted-data DB."""
    causes_dataset = DeathCausesDataset('csv_data/death_causes.csv')
    causes_dataset.store_dataset(
        MongoDatabase(MongoDatabase.extracted_db_name), 'death_causes')
def process_and_store_cases_and_deaths():
    """Parse the daily COVID cases/deaths CSV (using the province-to-region
    mapping file) and persist it into the extracted-data database."""
    daily_dataset = DailyCOVIDData('csv_data/daily_covid_data.csv',
                                   'csv_data/provinces_ar.csv')
    daily_dataset.store_dataset(
        MongoDatabase(MongoDatabase.extracted_db_name), 'daily_data')