Example #1
0
def process_noaa_data(countries):
    """Returns a dataset for the given countries."""

    stations = load_dataset('stations')
    country_info = load_dataset('countries')

    # The first two characters of a station ID encode its country.
    stations['COUNTRY_CODE'] = stations['ID'].str.slice(0, 2)
    stations = stations.merge(country_info, on=['COUNTRY_CODE'], how='left')

    # Keep only the stations belonging to the requested countries.
    in_requested = stations['COUNTRY_CODE'].isin(countries)
    stations = stations[in_requested]

    daily = load_stations_data(stations['ID'])

    # Attach station metadata to each daily observation.
    return daily.merge(stations, how='left', on=['ID'])
Example #2
0
def get_request_urls(country, start_date, end_date=None, metrics=None):
    """Build the NOAA daily-summaries GET request URL(s) for a country.

    Arguments:
        country(str): FIPS Country code
        start_date(datetime)
        end_date(datetime): Defaults to today
        metrics(list[str]): Optional. List of metrics to retrieve; valid
            values are:
            TMIN: Minimum temperature.
            TMAX: Maximum temperature.
            TAVG: Average of temperature.
            SNOW: Snowfall (mm).
            SNWD: Snow depth (mm).
            PRCP: Precipitation.
            PSUN: Daily percent of possible sunshine (percent)
            TSUN: Daily total sunshine (minutes)

    Returns:
        list[str]: One URL per chunk of at most 50 stations.
    """

    base_url = 'https://www.ncei.noaa.gov/access/services/data/v1?dataset=daily-summaries'
    max_stations_req = 50

    if metrics is None:
        metrics = DEFAULT_METRICS
    if end_date is None:
        end_date = datetime.now()

    request_common_args = (f'&format=json'
                           f'&units=metric'
                           f'&dataTypes={",".join(metrics)}')

    start = start_date.date().isoformat()
    end = end_date.date().isoformat()

    # Restrict the country's stations to those still reporting in the
    # requested period according to the inventory dataset.
    candidates = get_stations_by_country(country)
    inventory = load_dataset('inventory')
    inventory = inventory[inventory.end_date >= start_date.year]
    station_ids = inventory[inventory.ID.isin(candidates)].ID.unique()

    # Small result set: one request covers everything.
    if len(station_ids) < max_stations_req:
        stations = ','.join(station_ids)
        return [
            f'{base_url}&stations={stations}&startDate={start}&endDate={end}{request_common_args}'
        ]

    # Otherwise split the station list into requests of at most 50 stations.
    chunks = [
        station_ids[offset:offset + max_stations_req]
        for offset in range(0, len(station_ids), max_stations_req)
    ]
    return [(f'{base_url}&stations={",".join(chunk)}&startDate={start}'
             f'&endDate={end}{request_common_args}')
            for chunk in chunks]
Example #3
0
def noaa_api_connector(countries, start_date, end_date=None, metrics=None):
    """Download daily weather observations from the NOAA API.

    Arguments:
        countries(list[str]): List of FIPS country codes to retrieve.
        start_date(datetime)
        end_date(datetime)
        metrics(list[str]): Optional. List of metrics to retrieve; valid
            values are:
            TMIN: Minimum temperature.
            TMAX: Maximum temperature.
            TAVG: Average of temperature.
            SNOW: Snowfall (mm).
            SNWD: Snow depth (mm).
            PRCP: Precipitation.
            PSUN: Daily percent of possible sunshine (percent)
            TSUN: Daily total sunshine (minutes)

    Returns:
        pandas.DataFrame: One row per observation, joined with station
        metadata, restricted to the requested metric columns.
    """
    # Station metadata files are required for the final join; fetch them
    # on first use.
    if not os.path.isfile(f'{DATA_DIRECTORY}/stations_metadata.txt'):
        download_noaa_files(large_files=False)

    records = []
    for country in countries:
        logging.info('Requesting data for %s', country)
        request_urls = get_request_urls(country, start_date, end_date, metrics)
        rows, errors = get_parse_response(request_urls)

        if errors:
            logging.info(
                'The following errors where found during the operation:')
            for error in errors:
                logging.info(error)

        records.extend(rows)

    data = pd.DataFrame(records)
    stations = load_dataset('stations')
    data = data.merge(stations, how='left', left_on='STATION', right_on='ID')

    # 'ID' duplicates 'STATION' after the merge; 'STATE' is not exposed.
    del data['ID']
    del data['STATE']

    columns = [
        'DATE', 'STATION', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'NAME',
        'GSN FLAG', 'HCN/CRN FLAG', 'WMO ID'
    ]

    if metrics is None:
        metrics = DEFAULT_METRICS

    # Only keep metric columns the API actually returned.
    columns.extend(metric for metric in metrics if metric in data.columns)

    return data[columns]
Example #4
0
def process_noaa_files(countries):
    """Returns a dataset for the given countries.

    Arguments:
        countries(list[str]): List of countries in ISO-2 format.

    Returns:
        pandas.DataFrame
    """

    stations = load_dataset('stations')
    country_info = load_dataset('countries')

    # A station's country is encoded in the first two characters of its ID;
    # derive it and join in the country metadata.
    stations['COUNTRY_CODE'] = stations['ID'].str.slice(0, 2)
    stations = stations.merge(country_info, on=['COUNTRY_CODE'], how='left')

    # Restrict to the requested countries.
    stations = stations[stations['COUNTRY_CODE'].isin(countries)]

    daily = load_stations_data(stations['ID'])

    # Enrich each daily record with its station's metadata.
    return daily.merge(stations, how='left', on=['ID'])
Example #5
0
def noaa_api_connector(countries, start_date, end_date=None):
    """Get data from NOAA API.

    Arguments:
        countries(list[str]): List of FIPS country codes to retrieve.
        start_date(datetime)
        end_date(datetime)

    Returns:
        pandas.DataFrame: One row per observation joined with station
        metadata, with the TMAX/TAVG/TMIN/PRCP/SNWD metric columns.
    """
    # Station metadata files are required for the final join; fetch them
    # on first use.
    if not os.path.isfile(f'{DATA_DIRECTORY}/stations_metadata.txt'):
        download_noaa_files(large_files=False)

    result = list()
    for country in countries:
        logging.info('Requesting data for %s', country)
        urls = get_request_urls(country, start_date, end_date)
        country_results, errors = get_parse_response(urls)

        if errors:
            # BUG FIX: logging.INFO is the integer level constant (20), not
            # a callable — calling it raised TypeError whenever any errors
            # occurred. Use logging.info(), as everywhere else in this file.
            logging.info(
                'The following errors where found during the operation:')
            for error in errors:
                logging.info(error)

        result.extend(country_results)

    data = pd.DataFrame(result)
    stations = load_dataset('stations')
    data = data.merge(stations, how='left', left_on='STATION', right_on='ID')

    # 'ID' duplicates 'STATION' after the merge; 'STATE' is not exposed.
    del data['ID']
    del data['STATE']

    columns = [
        'DATE', 'STATION', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'NAME',
        'GSN FLAG', 'HCN/CRN FLAG', 'WMO ID', 'TMAX', 'TAVG', 'TMIN', 'PRCP',
        'SNWD'
    ]
    return data[columns]
Example #6
0
def load_and_filter_dataset(dataset_name):
    """Load *dataset_name* and keep only rows from November 2019 onward."""
    dataset = load_dataset(dataset_name)
    from_2020 = dataset.YEAR >= 2020
    late_2019 = (dataset.YEAR == 2019) & (dataset.MONTH >= 11)
    return dataset[from_2020 | late_2019]