Example #1
0
 def __init__(self):
     """Open both database connections and read the general vaccination data"""
     # Reading is done from the extracted data database; the analyzed data database is kept for writing
     self.db_read = MongoDatabase(MongoDatabase.extracted_db_name)
     self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name)

     # Keep the 'vaccination_general' data returned by the read database
     source_name = 'vaccination_general'
     self.df_vaccination_general = self.db_read.read_data(source_name)
Example #2
0
    def __init__(self):
        """Open both database connections and read the transmission indicators"""
        # Reading is done from the extracted data database; the analyzed data database is kept for writing
        self.db_read = MongoDatabase(MongoDatabase.extracted_db_name)
        self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name)

        # Keep the 'transmission_indicators' data returned by the read database
        source_name = 'transmission_indicators'
        self.transmission_indicators = self.db_read.read_data(source_name)
Example #3
0
    def __init__(self):
        """Open both database connections and read the outbreaks description"""
        # Reading is done from the extracted data database; the analyzed data database is kept for writing
        self.db_read = MongoDatabase(MongoDatabase.extracted_db_name)
        self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name)

        # Keep the 'outbreaks_description' data returned by the read database
        source_name = 'outbreaks_description'
        self.outbreaks_description_df = self.db_read.read_data(source_name)
Example #4
0
    def __init__(self):
        """Open both database connections and read the symptoms data"""
        # Reading is done from the extracted data database; the analyzed data database is kept for writing
        self.db_read = MongoDatabase(MongoDatabase.extracted_db_name)
        self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name)

        # Presumably a query filter (entries dated 2020-05-29) and the fields to return -
        # TODO confirm against MongoDatabase.read_data
        date_filter = {'date': dt(2020, 5, 29)}
        wanted_fields = ['symptom', 'patients.total.percentage']
        self.symptoms_df = self.db_read.read_data(
            'clinic_description', date_filter, wanted_fields)
Example #5
0
    def __init__(self):
        """Open both database connections and read the diagnostic tests and population data"""
        # Reading is done from the extracted data database; the analyzed data database is kept for writing
        self.db_read = MongoDatabase(MongoDatabase.extracted_db_name)
        self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name)

        # Diagnostic tests are read without any extra arguments
        self.diagnostic_tests_df = self.db_read.read_data('diagnostic_tests')

        # Population: presumably a query filter (the 'total' age range) and the fields to return -
        # TODO confirm against MongoDatabase.read_data
        population_filter = {'age_range': 'total'}
        population_fields = ['autonomous_region', 'total']
        self.population_df = self.db_read.read_data(
            'population_ar', population_filter, population_fields)
Example #6
0
    def __init__(self):
        """Open both database connections, read the daily and population data, and merge them"""
        # Reading is done from the extracted data database; the analyzed data database is kept for writing
        self.db_read = MongoDatabase(MongoDatabase.extracted_db_name)
        self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name)

        # Read both data sets from the read database
        daily_source, population_source = 'daily_data', 'population_ar'
        self.df = self.db_read.read_data(daily_source)
        self.population_df = self.db_read.read_data(population_source)

        # Hand over to the merge helper of this class (defined outside this method); presumably it
        # combines the population with the daily data
        self.__merge__population__()
Example #7
0
    def __init__(self):
        """Open both database connections and read the hospitals pressure data"""
        # Reading is done from the extracted data database; the analyzed data database is kept for writing
        self.db_read = MongoDatabase(MongoDatabase.extracted_db_name)
        self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name)

        # Fields handed to read_data through its `projection` argument
        wanted_fields = [
            'autonomous_region',
            'date',
            'hospitalized_patients',
            'beds_percentage',
            'ic_patients',
            'ic_beds_percentage',
        ]
        self.hospitals_pressure = self.db_read.read_data(
            'hospitals_pressure', projection=wanted_fields)
Example #8
0
    def __init__(self):
        """Open both database connections and read the COVID deaths and population data"""
        # The extracted data database provides the population; the analyzed data database is used for
        # writing and also for reading the aggregated deaths
        self.db_read = MongoDatabase(MongoDatabase.extracted_db_name)
        self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name)

        # COVID deaths come from the analyzed data database; None is passed as the second argument
        deaths_fields = ['gender', 'age_range', 'covid_deaths']
        self.covid_deaths_df = self.db_write.read_data(
            'covid_vs_all_deaths', None, deaths_fields)

        # Population: presumably a query filter (the 'España' region) and the fields to return -
        # TODO confirm against MongoDatabase.read_data
        spain_filter = {'autonomous_region': 'España'}
        population_fields = ['age_range', 'M', 'F', 'total']
        self.population_df = self.db_read.read_data(
            'population_ar', spain_filter, population_fields)
Example #9
0
    def extract_and_store():
        """Load the pickled RENAVE reports, collect their data, and store it in the extracted data database

        A failure while extracting one kind of data from a report is printed and skipped; the data
        gathered from the other reports is still stored.
        """
        documents_clinic_description = []
        documents_transmission_indicators = []
        database = MongoDatabase(MongoDatabase.extracted_db_name)

        # Unpickle every processed report before extracting anything from them
        reports = []
        for file_name in os.listdir(PDFRenaveTaskGroup.processed_reports_directory):
            report_path = PDFRenaveTaskGroup.processed_reports_directory + '/' + file_name
            with open(report_path, 'rb') as report_file:
                reports.append(pickle.load(report_file))

        # One entry per kind of data: report method to call, list receiving the documents, error message
        extractions = (
            ('get_clinic_description', documents_clinic_description,
             "Error trying to extract the clinic description from RENAVE report %i"),
            ('get_transmission_indicators', documents_transmission_indicators,
             "Error trying to extract the transmission indicators from RENAVE report %i"),
        )

        for report in reports:
            for method_name, collected, error_message in extractions:
                try:
                    extracted = getattr(report, method_name)()
                    if extracted:
                        collected.extend(extracted)
                except Exception:
                    print(error_message % report.index)

        database.store_data('clinic_description', documents_clinic_description)
        database.store_data('transmission_indicators', documents_transmission_indicators)
Example #10
0
 def process_and_store_ar_population():
     """Build the dataset from csv_data/population_ar.csv and store it as 'population_ar'"""
     # CSV parsing options handed to the dataset constructor as keyword arguments
     csv_options = {'separator': ';', 'decimal': ',', 'thousands': '.'}
     population_dataset = ARPopulationCSVDataset("csv_data/population_ar.csv",
                                                 **csv_options)

     # The dataset stores itself into the extracted data database
     extracted_db = MongoDatabase(MongoDatabase.extracted_db_name)
     population_dataset.store_dataset(extracted_db, 'population_ar')
Example #11
0
    def __init__(self):
        """Load the death causes and the COVID deaths covering a one-year span

        The COVID deaths are read for the date of one week ago. When at least 365 days separate that date
        from 15th March 2020, the totals read for 365 days earlier are subtracted from them; otherwise the
        totals are multiplied by 365 divided by the number of days elapsed since 15th March 2020.
        """
        # The extracted data database is used for reading; the analyzed data database is used for writing
        # and also for reading the aggregated deaths
        self.db_read = MongoDatabase(MongoDatabase.extracted_db_name)
        self.db_write = MongoDatabase(MongoDatabase.analyzed_db_name)

        self.death_causes_df = self.db_read.read_data('death_causes')

        def read_spain_deaths(date):
            # Same 'deaths' query for any date; new dict and list objects are built on every call
            return self.db_write.read_data(
                'deaths',
                {'autonomous_region': 'España', 'date': date},
                ['age_range', 'total_deaths', 'gender'])

        # The deaths of the current day might not be available yet, so the reference date is one week
        # ago, rebuilt from year/month/day to drop the time part
        week_ago = dt.today() - td(days=7)
        today = dt(week_ago.year, week_ago.month, week_ago.day)
        elapsed = today - dt(2020, 3, 15)

        if elapsed < td(days=365):
            # Less than a year of data: scale the deaths until the reference date up to 365 days
            current_df = read_spain_deaths(today)
            proportion = 365 / elapsed.days
            self.covid_deaths_df = current_df
            scaled_deaths = self.covid_deaths_df['total_deaths'] * proportion
            self.covid_deaths_df['total_deaths'] = scaled_deaths
        else:
            # A year or more of data: keep only the deaths of the last 365 days
            year_ago_df = read_spain_deaths(today - td(days=365))
            current_df = read_spain_deaths(today)
            self.covid_deaths_df = current_df
            last_year_deaths = self.covid_deaths_df['total_deaths'] - year_ago_df['total_deaths']
            self.covid_deaths_df['total_deaths'] = last_year_deaths
Example #12
0
    def extract_and_store():
        """Read the processed PDF files, extract the information, and store it into the database

        Every pickled report found in ``PDFMhealthTaskGroup.processed_reports_directory`` is loaded, and
        its diagnostic tests, hospital pressure and outbreaks description data are collected. A failure
        while extracting one kind of data from a report is printed and skipped, so the data gathered from
        the remaining reports is still stored in the extracted data database.
        """
        reports = []
        documents_diagnostic_tests = []
        documents_hospitals_pressure = []
        documents_outbreaks_description = []
        database = MongoDatabase(MongoDatabase.extracted_db_name)

        # Read the processed reports
        # NOTE(review): pickle.load can run arbitrary code, so this directory must only hold trusted files
        reports_directory = PDFMhealthTaskGroup.processed_reports_directory
        for file in os.listdir(reports_directory):
            with open(os.path.join(reports_directory, file), 'rb') as f:
                reports.append(pickle.load(f))

        for report in reports:
            # Each kind of data is extracted on its own, so one failure does not discard the others
            try:
                diagnostic_tests = report.get_diagnostic_tests()
                if diagnostic_tests:
                    documents_diagnostic_tests.extend(diagnostic_tests)
            except Exception:
                print(
                    "Error trying to extract the diagnostic tests data from Health Ministry report %i"
                    % report.index)

            try:
                hospitals_pressure = report.get_hospital_pressure()
                if hospitals_pressure:
                    documents_hospitals_pressure.extend(hospitals_pressure)
            except Exception:
                print(
                    "Error trying to extract the hospital pressure from Health Ministry report %i"
                    % report.index)

            try:
                outbreaks_description = report.get_outbreaks_description()
                if outbreaks_description:
                    documents_outbreaks_description.extend(
                        outbreaks_description)
            except Exception:
                # The message names the data and report source actually involved in this failure
                print(
                    "Error trying to extract the outbreaks description from Health Ministry report %i"
                    % report.index)

        database.store_data('diagnostic_tests', documents_diagnostic_tests)
        database.store_data('hospitals_pressure', documents_hospitals_pressure)
        database.store_data('outbreaks_description',
                            documents_outbreaks_description)
Example #13
0
    def store_vaccination_reports():
        """Store in the database the downloaded reports

        Every file in ``VaccinationReportsTaskGroup.reports_folder`` is read with pandas. The first sheet
        provides the general vaccination data; when a report has more than three sheets, its last two
        sheets provide the percentages by age range for the single and the complete dose. The records
        gathered from all the reports are stored in the extracted data database as 'vaccination_general',
        'vaccination_ages_single' and 'vaccination_ages_complete'.
        """
        vaccination_data = []
        vaccination_single = []
        vaccination_complete = []

        for file in os.listdir(VaccinationReportsTaskGroup.reports_folder):
            # Read the report. With sheet_name=None, read_excel returns a dict of DataFrames (one per
            # sheet), so `df` is that dict rather than a single DataFrame
            df = pd.read_excel(VaccinationReportsTaskGroup.reports_folder +
                               '/' + file,
                               sheet_name=None)

            # Get the report date from the 8 characters that precede the last 4 characters of the file
            # name (presumably a date followed by a 4-character extension - confirm against
            # VaccinationReportsTaskGroup.date_filename_format)
            date_string = file[-12:-4]
            date_report = dt.strptime(
                date_string, VaccinationReportsTaskGroup.date_filename_format)
            print(f"Reading report of {date_report.isoformat()}")

            # Get the basic vaccination data, which is the first sheet of the report
            df_basic_data = df[list(df.keys())[0]]

            # Translate the DataFrame columns. Two different source headers map to
            # 'received_doses.total' (presumably the header changed between report versions)
            columns_translations = {
                'Unnamed: 0': 'autonomous_region',
                'Dosis entregadas (1)': 'received_doses.total',
                'Total Dosis entregadas (1)': 'received_doses.total',
                'Dosis entregadas Pfizer (1)': 'received_doses.Pfizer',
                'Dosis entregadas Moderna (1)': 'received_doses.Moderna',
                'Dosis entregadas AstraZeneca (1)':
                'received_doses.AstraZeneca',
                'Dosis entregadas Janssen (1)': 'received_doses.Janssen',
                'Dosis administradas (2)': 'applied_doses',
                '% sobre entregadas': 'percentage_applied_doses',
                'Nº Personas con al menos 1 dosis':
                'number_at_least_single_dose_people',
                'Nº Personas vacunadas(pauta completada)':
                'number_fully_vaccinated_people',
                'Fecha de la última vacuna registrada (2)': 'date'
            }
            df_basic_data = df_basic_data.rename(columns=columns_translations)
            # The row holding the totals is labelled as the whole country
            df_basic_data['autonomous_region'] = df_basic_data[
                'autonomous_region'].replace({'Totales': 'España'})

            # Transform some columns: the 'date' column is overwritten with the report date, and the
            # applied doses value is multiplied by 100 (presumably stored as a fraction in the sheet)
            df_basic_data['date'] = date_report
            df_basic_data['percentage_applied_doses'] = 100 * df_basic_data[
                'percentage_applied_doses']

            # Convert the rows into dictionaries to be saved into MongoDB
            df_dict = df_basic_data.to_dict('records')
            mongo_data = []

            # Transform the dotted 'a.b' keys into nested dictionaries {a: {b: value}}
            for record in df_dict:
                transformed_record = {}
                # The dictionary is appended first and filled in place afterwards
                mongo_data.append(transformed_record)
                for k, v in record.items():
                    if '.' not in k:
                        transformed_record[k] = v
                    else:
                        # NOTE(review): a key with more than one '.' would make this unpacking fail
                        key, subkey = k.split('.')
                        if key not in transformed_record:
                            transformed_record[key] = {}

                        transformed_record[key][subkey] = v

            vaccination_data.extend(mongo_data)

            if len(df) > 3:
                # Newer vaccination reports: get the age ranges from the last two sheets
                df_single_dose = df[list(df.keys())[-2]]
                df_complete_dose = df[list(df.keys())[-1]]

                for number_doses in ['single', 'complete']:
                    df_doses = df_single_dose if number_doses == 'single' else df_complete_dose

                    # Remove useless columns and rename the useful ones. The columns are dropped by
                    # position, so this depends on the exact sheet layout (presumably the dropped ones
                    # hold absolute figures and the kept ones the percentages - TODO confirm)
                    columns = df_doses.columns
                    df_doses = df_doses.drop(columns=[
                        columns[i] for i in [
                            1, 2, 4, 5, 7, 8, 10, 11, 13, 14, 16, 17, 19, 20,
                            22, 23
                        ]
                    ])
                    # The last column of the original sheet is used as the total
                    columns_translations = {
                        'Unnamed: 0': 'autonomous_region',
                        '%': '80+',
                        '%.1': '70-79',
                        '%.2': '60-69',
                        '%.3': '50-59',
                        '%.4': '25-49',
                        '%.5': '18-24',
                        '%.6': '16-17',
                        columns[-1]: 'total'
                    }
                    df_doses = df_doses.rename(columns=columns_translations)
                    df_doses['autonomous_region'] = df_doses[
                        'autonomous_region'].replace(
                            {'Total España': 'España'})

                    # Remove the row about the armed forces ('Fuerzas Armadas')
                    df_doses = df_doses[
                        df_doses['autonomous_region'] != 'Fuerzas Armadas']

                    # Remove invalid data (rows with any missing value)
                    df_doses = df_doses.dropna()

                    # Trim the autonomous region name (some have a trailing space for unknown reason)
                    df_doses['autonomous_region'] = df_doses[
                        'autonomous_region'].apply(lambda x: x.strip())

                    # Multiply by 100 the percentages (every column but the first one)
                    df_doses[df_doses.columns[1:]] = 100 * df_doses[
                        df_doses.columns[1:]]

                    # Add the date to the DataFrame
                    df_doses['date'] = date_report

                    # Melt age range columns: one row per region, date and age range
                    df_doses = df_doses.melt(
                        id_vars=['autonomous_region', 'date'],
                        var_name='age_range',
                        value_name='percentage')

                    # Convert data to dict
                    df_dict = df_doses.to_dict('records')

                    if number_doses == 'single':
                        vaccination_single.extend(df_dict)
                    else:
                        vaccination_complete.extend(df_dict)

        # Store the data in MongoDB (extracted data database)
        database = MongoDatabase(MongoDatabase.extracted_db_name)
        database.store_data("vaccination_general", vaccination_data)
        database.store_data("vaccination_ages_single", vaccination_single)
        database.store_data("vaccination_ages_complete", vaccination_complete)
Example #14
0
 def process_and_store_death_causes():
     """Build the dataset from csv_data/death_causes.csv and store it as 'death_causes'"""
     csv_path = 'csv_data/death_causes.csv'
     death_causes_dataset = DeathCausesDataset(csv_path)

     # The dataset stores itself into the extracted data database
     extracted_db = MongoDatabase(MongoDatabase.extracted_db_name)
     death_causes_dataset.store_dataset(extracted_db, 'death_causes')
Example #15
0
 def process_and_store_cases_and_deaths():
     """Build the dataset from the daily COVID data and provinces CSV files and store it as 'daily_data'"""
     # Both CSV paths are handed to the dataset constructor in this order
     csv_paths = ('csv_data/daily_covid_data.csv', 'csv_data/provinces_ar.csv')
     daily_dataset = DailyCOVIDData(*csv_paths)

     # The dataset stores itself into the extracted data database
     extracted_db = MongoDatabase(MongoDatabase.extracted_db_name)
     daily_dataset.store_dataset(extracted_db, 'daily_data')