Example #1
0
File: core.py Project: ocefpaf/ulmo
def _get_inventory(update=True):
    columns = [
        ("id", 0, 11, None),
        ("latitude", 12, 20, None),
        ("longitude", 21, 30, None),
        ("element", 31, 35, None),
        ("first_year", 36, 40, None),
        ("last_year", 41, 45, None),
    ]

    inventory_file = _get_ghcn_file("ghcnd-inventory.txt", check_modified=update)
    return util.parse_fwf(inventory_file, columns)
Example #2
0
File: core.py Project: wilsaj/ulmo
def _get_inventory(update=True):
    columns = [
        ('id', 0, 11, None),
        ('latitude', 12, 20, None),
        ('longitude', 21, 30, None),
        ('element', 31, 35, None),
        ('first_year', 36, 40, None),
        ('last_year', 41, 45, None),
    ]

    inventory_file = _get_ghcn_file('ghcnd-inventory.txt',
            check_modified=update)
    return util.parse_fwf(inventory_file, columns)
Example #3
0
def _get_inventory(update=True):
    columns = [
        ('id', 0, 11, None),
        ('latitude', 12, 20, None),
        ('longitude', 21, 30, None),
        ('element', 31, 35, None),
        ('first_year', 36, 40, None),
        ('last_year', 41, 45, None),
    ]

    inventory_file = _get_ghcn_file('ghcnd-inventory.txt',
                                    check_modified=update)
    return util.parse_fwf(inventory_file, columns)
Example #4
0
File: core.py Project: wilsaj/ulmo
def _parse_values(file_handle, by_state, location_names, element):
    if by_state:
        if 'sp' in element:
            id_columns = [
                ('location_code', 0, 3, None),
                #('division', 3, 3, None), # ignored in state files
                #('element', 4, 6, None),  # element is redundant
                ('year', 7, 11, None),
            ]
        else:
            id_columns = [
                ('location_code', 0, 3, None),
                #('division', 3, 3, None), # ignored in state files
                #('element', 4, 6, None),  # element is redundant
                ('year', 6, 10, None),
            ]
    else:
        id_columns = [
            ('location_code', 0, 2, None),
            ('division', 2, 4, None),
            #('element', 4, 6, None),  # element is redundant
            ('year', 6, 10, None),
        ]

    year_col_end = id_columns[-1][2]

    month_columns = [
        (str(n), year_col_end - 6 + (7 * n), year_col_end + (7 * n), None)
        for n in range(1, 13)
    ]

    columns = id_columns + month_columns

    na_values = [NO_DATA_VALUES.get(element)]
    parsed = util.parse_fwf(file_handle, columns, na_values=na_values)

    month_columns = [id_column[0] for id_column in id_columns]
    melted = pandas.melt(parsed, id_vars=month_columns)\
        .rename(columns={'variable': 'month'})

    melted.month = melted.month.astype(int)

    # throw away NaNs
    melted = melted[melted['value'].notnull()]

    data = melted.rename(columns={
        'value': element,
    })

    return data
Example #5
0
File: core.py Project: wilsaj/ulmo
def get_data(station_id, elements=None, update=True, as_dataframe=False):
    """Retrieves data for a given station.


    Parameters
    ----------
    station_id : str
        Station ID to retrieve data for.
    elements : ``None``, str, or list of str
        If specified, limits the query to given element code(s).
    update : bool
        If ``True`` (default),  new data files will be downloaded if they are
        newer than any previously cached files. If ``False``, then previously
        downloaded files will be used and new files will only be downloaded if
        there is not a previously downloaded file for a given station.
    as_dataframe : bool
        If ``False`` (default), a dict with element codes mapped to value dicts
        is returned. If ``True``, a dict with element codes mapped to equivalent
        pandas.DataFrame objects will be returned. The pandas dataframe is used
        internally, so setting this to ``True`` is a little bit faster as it
        skips a serialization step.


    Returns
    -------
    site_dict : dict
        A dict with element codes as keys, mapped to collections of values. See
        the ``as_dataframe`` parameter for more.
    """
    if isinstance(elements, basestring):
        elements = [elements]

    start_columns = [
        ('year', 11, 15, int),
        ('month', 15, 17, int),
        ('element', 17, 21, str),
    ]
    value_columns = [
        ('value', 0, 5, float),
        ('mflag', 5, 6, str),
        ('qflag', 6, 7, str),
        ('sflag', 7, 8, str),
    ]
    columns = list(itertools.chain(start_columns, *[
        [(name + str(n), start + 13 + (8 * n), end + 13 + (8 * n), converter)
         for name, start, end, converter in value_columns]
        for n in xrange(1, 32)
    ]))

    station_file_path = _get_ghcn_file(
        station_id + '.dly', check_modified=update)
    station_data = util.parse_fwf(station_file_path, columns, na_values=[-9999])

    dataframes = {}

    for element_name, element_df in station_data.groupby('element'):
        if not elements is None and element_name not in elements:
            continue

        element_df['month_period'] = element_df.apply(
                lambda x: pandas.Period('%s-%s' % (x['year'], x['month'])),
                axis=1)
        element_df = element_df.set_index('month_period')
        monthly_index = element_df.index

        # here we're just using pandas' builtin resample logic to construct a daily
        # index for the timespan
        daily_index = element_df.resample('D').index.copy()

        # XXX: hackish; pandas support for this sort of thing will probably be
        # added soon
        month_starts = (monthly_index - 1).asfreq('D') + 1
        dataframe = pandas.DataFrame(
                columns=['value', 'mflag', 'qflag', 'sflag'], index=daily_index)

        for day_of_month in range(1, 32):
            dates = [date for date in (month_starts + day_of_month - 1)
                    if date.day == day_of_month]
            if not len(dates):
                continue
            months = pandas.PeriodIndex([pandas.Period(date, 'M') for date in dates])
            for column_name in dataframe.columns:
                col = column_name + str(day_of_month)
                dataframe[column_name][dates] = element_df[col][months]

        dataframes[element_name] = dataframe

    if as_dataframe:
        return dataframes
    else:
        return dict([
            (key, util.dict_from_dataframe(dataframe))
            for key, dataframe in dataframes.iteritems()
        ])
Example #6
0
File: core.py Project: wilsaj/ulmo
def get_stations(country=None, state=None, elements=None, start_year=None,
        end_year=None, update=True, as_dataframe=False):
    """Retrieves station information, optionally limited to specific parameters.


    Parameters
    ----------
    country : str
        The country code to use to limit station results. If set to ``None``
        (default), then stations from all countries are returned.
    state : str
        The state code to use to limit station results. If set to ``None``
        (default), then stations from all states are returned.
    elements : ``None``, str, or list of str
        If specified, station results will be limited to the given element codes
        and only stations that have data for any these elements will be
        returned.
    start_year : int
        If specified, station results will be limited to contain only stations
        that have data after this year. Can be combined with the ``end_year``
        argument to get stations with data within a range of years.
    end_year : int
        If specified, station results will be limited to contain only stations
        that have data before this year. Can be combined with the ``start_year``
        argument to get stations with data within a range of years.
    update : bool
        If ``True`` (default),  new data files will be downloaded if they are
        newer than any previously cached files. If ``False``, then previously
        downloaded files will be used and new files will only be downloaded if
        there is not a previously downloaded file for a given station.
    as_dataframe : bool
        If ``False`` (default), a dict with station IDs keyed to station dicts
        is returned. If ``True``, a single pandas.DataFrame object will be
        returned. The pandas dataframe is used internally, so setting this to
        ``True`` is a little bit faster as it skips a serialization step.


    Returns
    -------
    stations_dict : dict or pandas.DataFrame
        A dict or pandas.DataFrame representing station information for stations
        matching the arguments. See the ``as_dataframe`` parameter for more.
    """

    columns = [
        ('country', 0, 2, None),
        ('network', 2, 3, None),
        ('network_id', 3, 11, None),
        ('latitude', 12, 20, None),
        ('longitude', 21, 30, None),
        ('elevation', 31, 37, None),
        ('state', 38, 40, None),
        ('name', 41, 71, None),
        ('gsn_flag', 72, 75, None),
        ('hcn_flag', 76, 79, None),
        ('wm_oid', 80, 85, None),
    ]

    stations_file = _get_ghcn_file('ghcnd-stations.txt', check_modified=update)
    stations = util.parse_fwf(stations_file, columns)

    if not country is None:
        stations = stations[stations['country'] == country]
    if not state is None:
        stations = stations[stations['state'] == state]

    # set station id and index by it
    stations['id'] = stations[['country', 'network', 'network_id']].T.apply(''.join)
    stations = stations.set_index('id', drop=False)

    if not elements is None or not start_year is None or not end_year is None:
        inventory = _get_inventory(update=update)
        if not elements is None:
            if isinstance(elements, basestring):
                elements = [elements]

            mask = np.zeros(len(inventory), dtype=bool)
            for element in elements:
                mask += inventory['element'] == element
            inventory = inventory[mask]
        if not start_year is None:
            inventory = inventory[inventory['last_year'] >= start_year]
        if not end_year is None:
            inventory = inventory[inventory['first_year'] <= end_year]

        uniques = inventory['id'].unique()
        ids = pandas.DataFrame(uniques, index=uniques, columns=['id'])
        stations = pandas.merge(stations, ids).set_index('id', drop=False)

    # wm_oid gets converted as a float, so cast it to str manually
    # pandas versions prior to 0.13.0 could use numpy's fix-width string type
    # to do this but that stopped working in pandas 0.13.0 - fortunately a
    # regex-based helper method was added then, too
    if pandas.__version__ < '0.13.0':
        stations['wm_oid'] = stations['wm_oid'].astype('|S5')
    else:
        stations['wm_oid'] = stations['wm_oid'].astype(str).str.extract('(.{0,5})')

    stations['wm_oid'][stations['wm_oid'] == 'nan'] = np.nan

    if as_dataframe:
        return stations
    else:
        return util.dict_from_dataframe(stations)
Example #7
0
def get_data(station_id, elements=None, update=True, as_dataframe=False):
    """Retrieves data for a given station.


    Parameters
    ----------
    station_id : str
        Station ID to retrieve data for.
    elements : ``None``, str, or list of str
        If specified, limits the query to given element code(s).
    update : bool
        If ``True`` (default),  new data files will be downloaded if they are
        newer than any previously cached files. If ``False``, then previously
        downloaded files will be used and new files will only be downloaded if
        there is not a previously downloaded file for a given station.
    as_dataframe : bool
        If ``False`` (default), a dict with element codes mapped to value dicts
        is returned. If ``True``, a dict with element codes mapped to equivalent
        pandas.DataFrame objects will be returned. The pandas dataframe is used
        internally, so setting this to ``True`` is a little bit faster as it
        skips a serialization step.


    Returns
    -------
    site_dict : dict
        A dict with element codes as keys, mapped to collections of values. See
        the ``as_dataframe`` parameter for more.
    """
    if isinstance(elements, basestring):
        elements = [elements]

    start_columns = [
        ('year', 11, 15, int),
        ('month', 15, 17, int),
        ('element', 17, 21, str),
    ]
    value_columns = [
        ('value', 0, 5, float),
        ('mflag', 5, 6, str),
        ('qflag', 6, 7, str),
        ('sflag', 7, 8, str),
    ]
    columns = list(
        itertools.chain(
            start_columns, *[[(name + str(n), start + 13 + (8 * n),
                               end + 13 + (8 * n), converter)
                              for name, start, end, converter in value_columns]
                             for n in range(1, 32)]))

    station_file_path = _get_ghcn_file(station_id + '.dly',
                                       check_modified=update)
    station_data = util.parse_fwf(station_file_path,
                                  columns,
                                  na_values=[-9999])

    dataframes = {}

    for element_name, element_df in station_data.groupby('element'):
        if not elements is None and element_name not in elements:
            continue

        element_df['month_period'] = element_df.apply(
            lambda x: pandas.Period('%s-%s' % (x['year'], x['month'])), axis=1)
        element_df = element_df.set_index('month_period')
        monthly_index = element_df.index

        # here we're just using pandas' builtin resample logic to construct a daily
        # index for the timespan
        # 2018/11/27 johanneshorak: hotfix to get ncdc ghcn_daily working again
        # new resample syntax requires resample method to generate resampled index.
        daily_index = element_df.resample('D').sum().index.copy()

        # XXX: hackish; pandas support for this sort of thing will probably be
        # added soon
        month_starts = (monthly_index - 1).asfreq('D') + 1
        dataframe = pandas.DataFrame(
            columns=['value', 'mflag', 'qflag', 'sflag'], index=daily_index)

        for day_of_month in range(1, 32):
            dates = [
                date for date in (month_starts + day_of_month - 1)
                if date.day == day_of_month
            ]
            if not len(dates):
                continue
            months = pandas.PeriodIndex(
                [pandas.Period(date, 'M') for date in dates])
            for column_name in dataframe.columns:
                col = column_name + str(day_of_month)
                dataframe[column_name][dates] = element_df[col][months]

        dataframes[element_name] = dataframe

    if as_dataframe:
        return dataframes
    else:
        return dict([(key, util.dict_from_dataframe(dataframe))
                     for key, dataframe in dataframes.items()])
Example #8
0
def get_stations(country=None,
                 state=None,
                 elements=None,
                 start_year=None,
                 end_year=None,
                 update=True,
                 as_dataframe=False):
    """Retrieves station information, optionally limited to specific parameters.


    Parameters
    ----------
    country : str
        The country code to use to limit station results. If set to ``None``
        (default), then stations from all countries are returned.
    state : str
        The state code to use to limit station results. If set to ``None``
        (default), then stations from all states are returned.
    elements : ``None``, str, or list of str
        If specified, station results will be limited to the given element codes
        and only stations that have data for any these elements will be
        returned.
    start_year : int
        If specified, station results will be limited to contain only stations
        that have data after this year. Can be combined with the ``end_year``
        argument to get stations with data within a range of years.
    end_year : int
        If specified, station results will be limited to contain only stations
        that have data before this year. Can be combined with the ``start_year``
        argument to get stations with data within a range of years.
    update : bool
        If ``True`` (default),  new data files will be downloaded if they are
        newer than any previously cached files. If ``False``, then previously
        downloaded files will be used and new files will only be downloaded if
        there is not a previously downloaded file for a given station.
    as_dataframe : bool
        If ``False`` (default), a dict with station IDs keyed to station dicts
        is returned. If ``True``, a single pandas.DataFrame object will be
        returned. The pandas dataframe is used internally, so setting this to
        ``True`` is a little bit faster as it skips a serialization step.


    Returns
    -------
    stations_dict : dict or pandas.DataFrame
        A dict or pandas.DataFrame representing station information for stations
        matching the arguments. See the ``as_dataframe`` parameter for more.
    """

    columns = [
        ('country', 0, 2, None),
        ('network', 2, 3, None),
        ('network_id', 3, 11, None),
        ('latitude', 12, 20, None),
        ('longitude', 21, 30, None),
        ('elevation', 31, 37, None),
        ('state', 38, 40, None),
        ('name', 41, 71, None),
        ('gsn_flag', 72, 75, None),
        ('hcn_flag', 76, 79, None),
        ('wm_oid', 80, 85, None),
    ]

    stations_file = _get_ghcn_file('ghcnd-stations.txt', check_modified=update)
    stations = util.parse_fwf(stations_file, columns)

    if not country is None:
        stations = stations[stations['country'] == country]
    if not state is None:
        stations = stations[stations['state'] == state]

    # set station id and index by it
    stations['id'] = stations[['country', 'network',
                               'network_id']].T.apply(''.join)

    if not elements is None or not start_year is None or not end_year is None:
        inventory = _get_inventory(update=update)
        if not elements is None:
            if isinstance(elements, basestring):
                elements = [elements]

            mask = np.zeros(len(inventory), dtype=bool)
            for element in elements:
                mask += inventory['element'] == element
            inventory = inventory[mask]
        if not start_year is None:
            inventory = inventory[inventory['last_year'] >= start_year]
        if not end_year is None:
            inventory = inventory[inventory['first_year'] <= end_year]

        uniques = inventory['id'].unique()
        ids = pandas.DataFrame(uniques, index=uniques, columns=['id'])
        stations = pandas.merge(stations, ids).set_index('id', drop=False)

    stations = stations.set_index('id', drop=False)
    # wm_oid gets convertidsed as a float, so cast it to str manually
    # pandas versions prior to 0.13.0 could use numpy's fix-width string type
    # to do this but that stopped working in pandas 0.13.0 - fortunately a
    # regex-based helper method was added then, too
    if pandas.__version__ < '0.13.0':
        stations['wm_oid'] = stations['wm_oid'].astype('|U5')
        stations['wm_oid'][stations['wm_oid'] == 'nan'] = np.nan
    else:
        stations['wm_oid'] = stations['wm_oid'].astype('|U5').map(
            lambda x: x[:-2])
        is_nan = stations['wm_oid'] == 'n'
        is_empty = stations['wm_oid'] == ''
        is_invalid = is_nan | is_empty
        stations.loc[is_invalid, 'wm_oid'] = np.nan

    if as_dataframe:
        return stations
    else:
        return util.dict_from_dataframe(stations)