Example #1
    @classmethod
    def get_data_items(cls,
                       data_items='all',
                       regions='ES',
                       start_date=None,
                       end_date=None,
                       language='ES',
                       errors='ignore'):
        """
        Collects the required Data Items from associated Data Sources

        Parameters
        ----------
        data_items : list of str
            list of data item names, or 'all' (default) to collect every
            implemented data item.
        regions : list of str
            list of region names, or 'ES' (default) to use all Spanish regions.
        start_date : pd.Timestamp
            first day to be considered for TEMPORAL data items. Defaults to None.
        end_date : pd.Timestamp
            last day to be considered for TEMPORAL data items. Defaults to None.
        language : str
            language of the returned data:
                'ES' for Spanish (default),
                'EN' for English.
        errors : str
            action to take when an error occurs:
                'ignore' collects every data item that can be retrieved,
                skipping the ones that fail,
                'raise' raises an exception and aborts execution on the first
                error.

        Returns
        -------
        pd.DataFrame 
            a DataFrame with the required information.

        Notes
        -----
        If dates are passed, then it is assumed that TEMPORAL data items are required. Otherwise, a GEOGRAPHICAL retrieval is assumed.
        A TEMPORAL retrieval produces a DataFrame with daily [Date] as row indexer and [Region, Data Item] as column multiindexer.
        A GEOGRAPHICAL retrieval produces a DataFrame with [Region] as row indexer and [Data Item] as column indexer.
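
        Examples
        --------
        Illustrative call; the 'cases' item and 'Murcia' region below are
        placeholders, not names confirmed by this listing:

        >>> df = COnVIDa.get_data_items(
        ...     data_items=['cases'],
        ...     regions=['Murcia'],
        ...     start_date=pd.to_datetime('2020-07-01'),
        ...     end_date=pd.to_datetime('2020-07-31'))
        >>> # TEMPORAL retrieval: df is indexed by Date,
        >>> # with (Region, Item) as column levels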
        """

        # if the data sources are not initialized, read the configurations
        if not cls.__DATA_SOURCES_INITIALIZED:
            cls.__init_data_sources()

        ##### check of parameters #####

        if data_items == 'all':
            data_items = cls.get_data_items_names(data_type=None,
                                                  language=language)
        else:
            ## check if items are implemented ##

            # get all implemented items
            implemented_data_sources = cls.get_data_items_names(
                data_type=None, language=language)
            implemented_data_items = []
            for implemented_data_source in list(
                    implemented_data_sources.keys()):
                implemented_data_items = implemented_data_items + implemented_data_sources[
                    implemented_data_source]

            # check
            successful_data_items = []
            for data_item in data_items:
                if data_item not in implemented_data_items:
                    print(f'WARNING: Item {data_item} is not implemented')
                else:
                    successful_data_items.append(data_item)

            if not successful_data_items:
                print(
                    'WARNING: No result found for the specified data items and conditions'
                )
                return None

            data_items = successful_data_items

        if regions == 'ES':
            regions = Regions.get_regions('ES')

        if start_date is None or end_date is None:
            assumed_data_type = DataType.GEOGRAPHICAL
        else:
            assumed_data_type = DataType.TEMPORAL

        print(f"Assumed a {assumed_data_type.name} data retrieval...")

        if assumed_data_type is DataType.TEMPORAL:
            if start_date > end_date:
                print('ERROR: start_date (' + str(start_date) +
                      ') should be earlier than or equal to end_date (' +
                      str(end_date) + ')')
                return None

            if end_date > pd.to_datetime('today').date():
                print('ERROR: end_date (' + str(end_date) +
                      ') should not refer to the future')
                return None

        ### change data items (display names) to internal representation ###

        internalname_displayname_dict = cls._get_internal_names_mapping(
            assumed_data_type, data_items, language=language
        )  # get internal name - display name dict (then is used to rename again)
        if internalname_displayname_dict is None:
            return None
        data_items = list(internalname_displayname_dict.keys()
                          )  # change data_items to internal representation

        ### group data items by data source in dictionary ###

        # existing items for assumed data type
        items_by_source = cls.get_data_items_names(
            data_type=assumed_data_type,
            language=language)  # dict with : source -> [item1, item2]
        items_by_assumed_data_type = []
        for items in items_by_source.values():
            items_by_assumed_data_type = items_by_assumed_data_type + items

        # group requested items by data sources
        requested_items_by_source = defaultdict(
            list
        )  # dict  datasource : [requested item 1, requested item 2, ...]
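        # find, for each requested item, the first data source class that
        # implements it, and group the items under that class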
        for data_item in data_items:
            source_class_found = False
            source = 0
            while source < len(
                    cls.__DATA_SOURCE_CLASSES) and not source_class_found:
                source_class_found = cls.__DATA_SOURCE_CLASSES[
                    source].data_item_exists(data_item)
                source = source + 1
            if source_class_found:
                requested_items_by_source[cls.__DATA_SOURCE_CLASSES[
                    source - 1]].append(data_item)
            else:
                # should never get here
                print('WARNING: Data source not found for item \'' +
                      str(data_item) + '\'')

        ##### data retrieval #####
        df_all_data_sources = None

        ## get data by data source ##
        for DATA_SOURCE_CLASS in requested_items_by_source.keys():

            df_data_source = None

            data_items = requested_items_by_source[DATA_SOURCE_CLASS]

            # for temporal data type
            if assumed_data_type is DataType.TEMPORAL:

                df_data_source = DATA_SOURCE_CLASS(data_items, regions,
                                                   start_date,
                                                   end_date).get_data(errors)

                if df_data_source is not None:
                    df_data_source = cls.__complete_dates(
                        df_data_source, start_date, end_date
                    )  # complete with nan values those days without info

            # for geographical data type
            elif assumed_data_type is DataType.GEOGRAPHICAL:
                df_data_source = DATA_SOURCE_CLASS(data_items,
                                                   regions).get_data(errors)
            else:
                # should never get here
                return None

            # continuous joining of data from diverse data sources
            if df_data_source is not None:
                if df_all_data_sources is None:
                    df_all_data_sources = df_data_source.sort_index(axis=1)
                else:
                    df_all_data_sources = pd.concat(
                        [df_all_data_sources, df_data_source],
                        axis='columns').sort_index(axis=1)

        ## END: get data by data source ##

        if df_all_data_sources is None:
            print(
                'WARNING: No result found for the specified data items and conditions'
            )
            return None

        def rename_with_regex(col_name):
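            # Map each internal item name back to its display name. The second
            # regex alternative matches columns of the form 'name (suffix)',
            # e.g. a unit suffix (assumed). Columns matching no internal name
            # return the sentinel 'None' and are dropped further below.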
            for internal_name in list(internalname_displayname_dict.keys()):
                if re.match(rf"^{internal_name}$|^{internal_name} \(",
                            col_name):
                    return re.sub(
                        pattern=internal_name,
                        repl=internalname_displayname_dict[internal_name],
                        string=col_name)
            return 'None'

        df_all_data_sources.rename(columns=rename_with_regex,
                                   level='Item',
                                   inplace=True)

        ### filter retrieved data to match the specific query determined by data_items, regions and dates ###

        # keep only the requested data items (some data sources return more
        # items per query than the ones requested)
        df_all_data_sources = df_all_data_sources.loc[
            :, df_all_data_sources.columns.get_level_values('Item') != 'None']

        if assumed_data_type is DataType.TEMPORAL:
            # filter the requested dates (row index)
            df_all_data_sources = df_all_data_sources[
                (df_all_data_sources.index >= start_date)
                & (df_all_data_sources.index <= end_date)]
            # filter the requested regions (column level)
            df_all_data_sources = df_all_data_sources.loc[
                :, df_all_data_sources.columns.get_level_values('Region').isin(
                    regions)]
        else:
            # filter the requested regions (row index)
            df_all_data_sources = df_all_data_sources[
                df_all_data_sources.index.isin(regions)]

        # drop duplicated columns
        df_all_data_sources = df_all_data_sources.loc[
            :, ~df_all_data_sources.columns.duplicated()]
        return df_all_data_sources

    @classmethod
    def daily_update(cls) -> bool:
        """
        Checks which data sources should be refreshed and updates the Data Cache (which is loaded in memory) accordingly, FROM the last cached day minus the number of days indicated in the class attribute __UPDATE_DAYS UNTIL today. This method removes the outdated file and creates an up-to-date file named `cache_YYYY-MM-DD.h5` (with today's date) in the data path (class attribute __DATA_PATH).

        Returns
        -------
            bool
            True if the update was done, False otherwise.

        Notes
        -----
        * If the cache filename already corresponds to today's date, the Data Cache is assumed to be up-to-date and nothing else is performed.
        * This method updates the Data Cache on disk; load_data() should be called afterwards to perform the update in memory and, in turn, enable up-to-date queries.
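
        Examples
        --------
        Illustrative use; calling load_data() with no arguments is an
        assumption, since its exact signature is not shown in this listing:

        >>> if COnVIDa.daily_update():
        ...     COnVIDa.load_data()  # reload the refreshed cache in memory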
        """
        # today's date, normalized to midnight
        today = pd.to_datetime('today').normalize()

        # check if daily update has been done before
        try:
            for file in os.listdir(cls.__DATA_PATH):
                if re.match(f"cache_{str(today)[0:10]}.h5", file):
                    cls.__LOGGER.info(
                        f"Daily update avoided, the cache is up-to-date (today file cache_{str(today)[0:10]}.h5 already exists)"
                    )
                    return True
        except Exception:
            cls.__LOGGER.exception("Error finding cache file")
            return False

        # check which data sources should be updated
        datasources_to_update = []

        dsi = COnVIDa._get_update_frequencies()
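        # dsi maps each data source to its update frequency in days: a source
        # is refreshed when at least that many days have passed since its
        # last update, or when no timestamp is recorded at all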
        for ds in dsi.keys():
            if cls.__LAST_UPDATE_TIMESTAMPS is None:
                datasources_to_update.append(ds)
            else:
                if cls.__LAST_UPDATE_TIMESTAMPS.loc[ds, 'last_update'] is None:
                    datasources_to_update.append(ds)
                else:
                    days_without_updating = (
                        today -
                        cls.__LAST_UPDATE_TIMESTAMPS.loc[ds,
                                                         'last_update']).days
                    if days_without_updating >= dsi[ds]:
                        datasources_to_update.append(ds)

        if not datasources_to_update:
            cls.__LOGGER.info("No source is out of date")
            return True

        # all regions
        all_regions = Regions.get_regions('ES')

        # new cache file
        new_cache_file = os.path.join(cls.__DATA_PATH,
                                      "cache_{}.h5".format(str(today)[0:10]))

        # last cache file
        last_cache_file = cls.__CACHE_PATH

        ####### GEOGRAPHICAL UPDATE #######

        # all data items
        try:
            datasources = COnVIDa.get_data_items_names(DataType.GEOGRAPHICAL,
                                                       language='internal')
            all_data_items = []

            for datasource in datasources.keys():
                if datasource in datasources_to_update:
                    for data_item in datasources[datasource]:
                        all_data_items.append(data_item)

            if not all_data_items:
                new_geodata = None
            else:
                new_geodata = COnVIDa.get_data_items(regions=all_regions,
                                                     data_items=all_data_items,
                                                     language='internal',
                                                     errors='raise')

        except Exception:
            cls.__LOGGER.exception(
                "Retrieval of geographical data in daily update failed")
            return False

        ####### TEMPORAL UPDATE #######

        # all data items
        try:
            datasources = COnVIDa.get_data_items_names(DataType.TEMPORAL,
                                                       language='internal')
            all_data_items = []

            for datasource in datasources.keys():
                if datasource in datasources_to_update:
                    for data_item in datasources[datasource]:
                        all_data_items.append(data_item)

            if not all_data_items:
                new_tempdata = None
            else:
                last_date = cls.__DATA[DataType.TEMPORAL].index[-1]
                start_date = last_date - pd.DateOffset(days=cls.__UPDATE_DAYS)

                # get updated data of last days and today
                new_data = COnVIDa.get_data_items(regions=all_regions,
                                                  data_items=all_data_items,
                                                  start_date=start_date,
                                                  end_date=today,
                                                  language='internal',
                                                  errors='raise')

                # update cache: append the refreshed rows and keep the most
                # recent value for duplicated dates
                new_tempdata = cls.__DATA[DataType.TEMPORAL]
                new_tempdata = pd.concat([new_tempdata, new_data])
                new_tempdata = new_tempdata.loc[
                    ~new_tempdata.index.duplicated(keep='last')]

        except Exception:
            cls.__LOGGER.exception(
                "Retrieval of temporal data in daily update failed")
            return False

        ####### COMPLETE UPDATE IF NEW DATA IS AVAILABLE ##########

        try:

            if new_geodata is not None:
                new_geodata.to_hdf(path_or_buf=new_cache_file,
                                   key='geographical',
                                   mode='a')
            else:
                cls.__DATA[DataType.GEOGRAPHICAL].to_hdf(
                    path_or_buf=new_cache_file, key='geographical', mode='a')

            if new_tempdata is not None:
                new_tempdata.to_hdf(path_or_buf=new_cache_file,
                                    key='temporal',
                                    mode='a')
            else:
                cls.__DATA[DataType.TEMPORAL].to_hdf(
                    path_or_buf=new_cache_file, key='temporal', mode='a')

            if cls.__LAST_UPDATE_TIMESTAMPS is None:
                # no timestamp table exists yet: create one for the updated
                # sources (assumed intent)
                cls.__LAST_UPDATE_TIMESTAMPS = pd.DataFrame(
                    {'last_update': today}, index=datasources_to_update)
            else:
                cls.__LAST_UPDATE_TIMESTAMPS.loc[
                    cls.__LAST_UPDATE_TIMESTAMPS.index.isin(
                        datasources_to_update), "last_update"] = today

            cls.__LAST_UPDATE_TIMESTAMPS.to_hdf(path_or_buf=new_cache_file,
                                                key='last_updates',
                                                mode='a')

        except Exception:
            if os.path.exists(new_cache_file):
                # remove the new cache file if the daily update fails
                os.remove(new_cache_file)
            cls.__LOGGER.exception(
                "Creation of new cache file in daily update failed")
            return False

        # if the process has completed successfully, remove the old cache;
        # at this point both the old and the new cache file exist on disk
        try:
            if os.path.exists(new_cache_file) and os.path.exists(
                    last_cache_file):
                os.remove(last_cache_file)  # remove the outdated cache file
                cls.__LOGGER.info("Daily update done!")
                return True
        except Exception:
            cls.__LOGGER.exception("Error in removing old cache file")

        ### should never get here ###
        try:
            # if removing the old cache file failed, recover the old status
            if os.path.exists(new_cache_file):
                os.remove(new_cache_file)
            cls.load_data(last_cache_file)
        except Exception:
            cls.__LOGGER.exception(
                "Critical failure in daily update: it was not possible to recover the old status"
            )
        return False