def get_recent_data(site_code, as_dataframe=False):
    """Fetches near real-time instantaneous water quality data for the LCRA
    bay sites.

    Parameters
    ----------
    site_code : str
        The bay site to fetch data for. See ``real_time_sites``.
    as_dataframe : bool
        This determines what format values are returned as. If ``False``
        (default), the values will be a list of value dicts. If ``True``,
        values are returned as a pandas.DataFrame.

    Returns
    -------
    list
        List of values, or a pandas.DataFrame if ``as_dataframe`` is ``True``.
    """
    if site_code not in real_time_sites.keys():
        log.info('%s is not in the list of LCRA real time salinity sites' %
                 site_code)
        return {}
    data_url = 'http://waterquality.lcra.org/salinity.aspx?sNum=%s&name=%s' % (
        site_code, real_time_sites[site_code])
    data = pd.read_html(data_url, header=0)[1]
    data.index = data['Date - Time'].apply(lambda x: util.convert_datetime(x))
    data.drop('Date - Time', axis=1, inplace=True)
    data = data.applymap(_nan_values)
    data.dropna(how='all', axis=0, inplace=True)
    data.dropna(how='all', axis=1, inplace=True)
    columns = dict([(column, _beautify_header(column))
                    for column in data.columns])
    data.rename(columns=columns, inplace=True)
    data = data.astype(float)
    if as_dataframe:
        return data
    else:
        return util.dict_from_dataframe(data)

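
# Hedged usage sketch, not part of the original module: shows how the function
# above might be called. The site code '6996' is a placeholder used only for
# illustration; valid codes are the keys of the module's ``real_time_sites``
# dict.
def _example_recent_lcra_data():
    recent = get_recent_data('6996', as_dataframe=True)
    if len(recent):
        # columns are beautified parameter headers indexed by timestamp
        print(recent.tail())
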
def get_data(station_id, elements=None, update=True, as_dataframe=False):
    """Retrieves data for a given station.

    Parameters
    ----------
    station_id : str
        Station ID to retrieve data for.
    elements : ``None``, str, or list of str
        If specified, limits the query to given element code(s).
    update : bool
        If ``True`` (default), new data files will be downloaded if they are
        newer than any previously cached files. If ``False``, then previously
        downloaded files will be used and new files will only be downloaded
        if there is not a previously downloaded file for a given station.
    as_dataframe : bool
        If ``False`` (default), a dict with element codes mapped to value
        dicts is returned. If ``True``, a dict with element codes mapped to
        equivalent pandas.DataFrame objects will be returned. The pandas
        dataframe is used internally, so setting this to ``True`` is a little
        bit faster as it skips a serialization step.

    Returns
    -------
    site_dict : dict
        A dict with element codes as keys, mapped to collections of values.
        See the ``as_dataframe`` parameter for more.
    """
    if isinstance(elements, basestring):
        elements = [elements]

    start_columns = [
        ('year', 11, 15, int),
        ('month', 15, 17, int),
        ('element', 17, 21, str),
    ]
    value_columns = [
        ('value', 0, 5, float),
        ('mflag', 5, 6, str),
        ('qflag', 6, 7, str),
        ('sflag', 7, 8, str),
    ]
    columns = list(itertools.chain(start_columns, *[
        [(name + str(n), start + 13 + (8 * n), end + 13 + (8 * n), converter)
         for name, start, end, converter in value_columns]
        for n in xrange(1, 32)
    ]))

    station_file_path = _get_ghcn_file(station_id + '.dly',
                                       check_modified=update)
    station_data = util.parse_fwf(station_file_path, columns,
                                  na_values=[-9999])

    dataframes = {}

    for element_name, element_df in station_data.groupby('element'):
        if not elements is None and element_name not in elements:
            continue

        element_df['month_period'] = element_df.apply(
            lambda x: pandas.Period('%s-%s' % (x['year'], x['month'])),
            axis=1)
        element_df = element_df.set_index('month_period')
        monthly_index = element_df.index

        # here we're just using pandas' builtin resample logic to construct a
        # daily index for the timespan
        daily_index = element_df.resample('D').index.copy()

        # XXX: hackish; pandas support for this sort of thing will probably
        # be added soon
        month_starts = (monthly_index - 1).asfreq('D') + 1
        dataframe = pandas.DataFrame(
            columns=['value', 'mflag', 'qflag', 'sflag'], index=daily_index)

        for day_of_month in range(1, 32):
            dates = [date for date in (month_starts + day_of_month - 1)
                     if date.day == day_of_month]
            if not len(dates):
                continue
            months = pandas.PeriodIndex([pandas.Period(date, 'M')
                                         for date in dates])
            for column_name in dataframe.columns:
                col = column_name + str(day_of_month)
                dataframe[column_name][dates] = element_df[col][months]

        dataframes[element_name] = dataframe

    if as_dataframe:
        return dataframes
    else:
        return dict([
            (key, util.dict_from_dataframe(dataframe))
            for key, dataframe in dataframes.iteritems()
        ])

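
# Small illustrative sketch, not part of the original module: checks the
# character offsets produced by the ``start + 13 + (8 * n)`` arithmetic above
# against the standard GHCN-Daily .dly layout (an 11-character station ID,
# then year, month and element, then 31 repeating 8-character
# value/mflag/qflag/sflag groups, for a 269-character record). All names here
# are local to the sketch.
def _example_dly_offsets():
    value_columns = [
        ('value', 0, 5),
        ('mflag', 5, 6),
        ('qflag', 6, 7),
        ('sflag', 7, 8),
    ]
    offsets = dict(
        (name + str(n), (start + 13 + (8 * n), end + 13 + (8 * n)))
        for n in range(1, 32)
        for name, start, end in value_columns)
    # day 1's value starts right after the 21-character header ...
    assert offsets['value1'] == (21, 26)
    # ... and day 31's source flag ends at character 269
    assert offsets['sflag31'] == (268, 269)
    return offsets
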
def get_stations(country=None, state=None, elements=None, start_year=None,
                 end_year=None, update=True, as_dataframe=False):
    """Retrieves station information, optionally limited to specific
    parameters.

    Parameters
    ----------
    country : str
        The country code to use to limit station results. If set to ``None``
        (default), then stations from all countries are returned.
    state : str
        The state code to use to limit station results. If set to ``None``
        (default), then stations from all states are returned.
    elements : ``None``, str, or list of str
        If specified, station results will be limited to the given element
        codes and only stations that have data for any of these elements will
        be returned.
    start_year : int
        If specified, station results will be limited to contain only
        stations that have data after this year. Can be combined with the
        ``end_year`` argument to get stations with data within a range of
        years.
    end_year : int
        If specified, station results will be limited to contain only
        stations that have data before this year. Can be combined with the
        ``start_year`` argument to get stations with data within a range of
        years.
    update : bool
        If ``True`` (default), new data files will be downloaded if they are
        newer than any previously cached files. If ``False``, then previously
        downloaded files will be used and new files will only be downloaded
        if there is not a previously downloaded file for a given station.
    as_dataframe : bool
        If ``False`` (default), a dict with station IDs keyed to station
        dicts is returned. If ``True``, a single pandas.DataFrame object will
        be returned. The pandas dataframe is used internally, so setting this
        to ``True`` is a little bit faster as it skips a serialization step.

    Returns
    -------
    stations_dict : dict or pandas.DataFrame
        A dict or pandas.DataFrame representing station information for
        stations matching the arguments. See the ``as_dataframe`` parameter
        for more.
    """
    columns = [
        ('country', 0, 2, None),
        ('network', 2, 3, None),
        ('network_id', 3, 11, None),
        ('latitude', 12, 20, None),
        ('longitude', 21, 30, None),
        ('elevation', 31, 37, None),
        ('state', 38, 40, None),
        ('name', 41, 71, None),
        ('gsn_flag', 72, 75, None),
        ('hcn_flag', 76, 79, None),
        ('wm_oid', 80, 85, None),
    ]

    stations_file = _get_ghcn_file('ghcnd-stations.txt', check_modified=update)
    stations = util.parse_fwf(stations_file, columns)

    if not country is None:
        stations = stations[stations['country'] == country]
    if not state is None:
        stations = stations[stations['state'] == state]

    # set station id and index by it
    stations['id'] = stations[['country', 'network', 'network_id']].T.apply(
        ''.join)
    stations = stations.set_index('id', drop=False)

    if not elements is None or not start_year is None or not end_year is None:
        inventory = _get_inventory(update=update)
        if not elements is None:
            if isinstance(elements, basestring):
                elements = [elements]

            mask = np.zeros(len(inventory), dtype=bool)
            for element in elements:
                mask += inventory['element'] == element
            inventory = inventory[mask]
        if not start_year is None:
            inventory = inventory[inventory['last_year'] >= start_year]
        if not end_year is None:
            inventory = inventory[inventory['first_year'] <= end_year]

        uniques = inventory['id'].unique()
        ids = pandas.DataFrame(uniques, index=uniques, columns=['id'])

        stations = pandas.merge(stations, ids).set_index('id', drop=False)

    # wm_oid gets converted to a float, so cast it to str manually
    # pandas versions prior to 0.13.0 could use numpy's fixed-width string
    # type to do this but that stopped working in pandas 0.13.0 - fortunately
    # a regex-based helper method was added then, too
    if pandas.__version__ < '0.13.0':
        stations['wm_oid'] = stations['wm_oid'].astype('|S5')
    else:
        stations['wm_oid'] = stations['wm_oid'].astype(str).str.extract(
            '(.{0,5})')
        stations['wm_oid'][stations['wm_oid'] == 'nan'] = np.nan

    if as_dataframe:
        return stations
    else:
        return util.dict_from_dataframe(stations)

def get_station_data(station_code, date=None, as_dataframe=False):
    """Fetches data for a station at a given date.

    Parameters
    ----------
    station_code : str
        The station code to fetch data for. A list of stations can be
        retrieved with ``get_stations()``.
    date : ``None`` or date (see :ref:`dates-and-times`)
        The date of the data to be queried. If date is ``None`` (default),
        then data for the current day is retrieved.
    as_dataframe : bool
        This determines what format values are returned as. If ``False``
        (default), the values dict will be a dict with timestamps as keys
        mapped to a dict of gauge variables and values. If ``True`` then the
        values dict will be a pandas.DataFrame object containing the
        equivalent information.

    Returns
    -------
    data_dict : dict
        A dict containing station information and values.
    """
    station_dict = {}
    if date is None:
        date_str = 'current'
        year = datetime.date.today().year
    else:
        date = util.convert_date(date)
        date_str = date.strftime('%Y%m%d')
        year = date.year

    filename = '%s.%s.html' % (station_code, date_str)
    data_url = 'http://www.swt-wc.usace.army.mil/webdata/gagedata/' + filename
    path = os.path.join(USACE_SWTWC_DIR, filename)

    with util.open_file_for_url(data_url, path) as f:
        soup = BeautifulSoup(f)
        pre = soup.find('pre')
        if pre is None:
            error_msg = 'no data could be found for station code %(station_code)s and date %(date)s (url: %(data_url)s)' % {
                'date': date,
                'data_url': data_url,
                'station_code': station_code,
            }
            raise ValueError(error_msg)
        sio = StringIO.StringIO(str(pre.text.strip()))

    first_line = sio.readline()
    split = first_line[8:].strip().split()
    station_dict['code'] = split[0]
    station_dict['description'] = ' '.join(split[1:])

    second_line = sio.readline()
    station_dict['station_type'] = second_line.strip().split(':')[1].strip()

    notes = []
    while 1:
        next_line = sio.readline()
        if ':' in next_line:
            notes.append(next_line.strip())
        else:
            break
    if len(notes):
        station_dict['notes'] = '\n'.join(notes)

    variable_names = _split_line(sio.readline()[15:], 10)
    variable_units = _split_line(sio.readline()[15:], 10)
    variable_sources = _split_line(sio.readline()[15:], 10)
    station_dict['variables'] = dict([
        (name, {'unit': unit, 'source': source})
        for name, unit, source in zip(variable_names, variable_units,
                                      variable_sources)])
    station_dict['timezone'] = sio.readline().strip().strip('()')

    column_names = ['datetime'] + variable_names
    widths = [15] + ([10] * len(variable_names))
    converters = dict([
        (variable_name, lambda x: float(x) if x != '----' else np.nan)
        for variable_name in variable_names])
    date_parser = lambda x: _convert_datetime(x, year)
    dataframe = pandas.read_fwf(
        sio, names=column_names, widths=widths, index_col=['datetime'],
        na_values=['----'], converters=converters, parse_dates=True,
        date_parser=date_parser)

    # parse out rows that are all nans (e.g. end of "current" page)
    dataframe = dataframe[~np.isnan(dataframe.T.sum())]

    if as_dataframe:
        station_dict['values'] = dataframe
    else:
        station_dict['values'] = util.dict_from_dataframe(dataframe)

    return station_dict

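
# Hedged usage sketch, not part of the original module. The station code
# 'DENI' is a placeholder; real codes come from ``get_stations()``. Fetches
# one archived day of gauge data and keeps the values as a DataFrame.
def _example_swtwc_station_day():
    station = get_station_data('DENI', date=datetime.date(2017, 5, 1),
                               as_dataframe=True)
    print(station['description'], station['timezone'])
    print(station['values'].head())
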
def get_data(station_id, elements=None, update=True, as_dataframe=False):
    """Retrieves data for a given station.

    Parameters
    ----------
    station_id : str
        Station ID to retrieve data for.
    elements : ``None``, str, or list of str
        If specified, limits the query to given element code(s).
    update : bool
        If ``True`` (default), new data files will be downloaded if they are
        newer than any previously cached files. If ``False``, then previously
        downloaded files will be used and new files will only be downloaded
        if there is not a previously downloaded file for a given station.
    as_dataframe : bool
        If ``False`` (default), a dict with element codes mapped to value
        dicts is returned. If ``True``, a dict with element codes mapped to
        equivalent pandas.DataFrame objects will be returned. The pandas
        dataframe is used internally, so setting this to ``True`` is a little
        bit faster as it skips a serialization step.

    Returns
    -------
    site_dict : dict
        A dict with element codes as keys, mapped to collections of values.
        See the ``as_dataframe`` parameter for more.
    """
    if isinstance(elements, basestring):
        elements = [elements]

    start_columns = [
        ('year', 11, 15, int),
        ('month', 15, 17, int),
        ('element', 17, 21, str),
    ]
    value_columns = [
        ('value', 0, 5, float),
        ('mflag', 5, 6, str),
        ('qflag', 6, 7, str),
        ('sflag', 7, 8, str),
    ]
    columns = list(
        itertools.chain(
            start_columns,
            *[[(name + str(n), start + 13 + (8 * n), end + 13 + (8 * n),
                converter)
               for name, start, end, converter in value_columns]
              for n in range(1, 32)]))

    station_file_path = _get_ghcn_file(station_id + '.dly',
                                       check_modified=update)
    station_data = util.parse_fwf(station_file_path, columns,
                                  na_values=[-9999])

    dataframes = {}

    for element_name, element_df in station_data.groupby('element'):
        if not elements is None and element_name not in elements:
            continue

        element_df['month_period'] = element_df.apply(
            lambda x: pandas.Period('%s-%s' % (x['year'], x['month'])),
            axis=1)
        element_df = element_df.set_index('month_period')
        monthly_index = element_df.index

        # here we're just using pandas' builtin resample logic to construct a
        # daily index for the timespan
        # 2018/11/27 johanneshorak: hotfix to get ncdc ghcn_daily working
        # again - new resample syntax requires a resample method to generate
        # the resampled index.
        daily_index = element_df.resample('D').sum().index.copy()

        # XXX: hackish; pandas support for this sort of thing will probably
        # be added soon
        month_starts = (monthly_index - 1).asfreq('D') + 1
        dataframe = pandas.DataFrame(
            columns=['value', 'mflag', 'qflag', 'sflag'], index=daily_index)

        for day_of_month in range(1, 32):
            dates = [
                date for date in (month_starts + day_of_month - 1)
                if date.day == day_of_month
            ]
            if not len(dates):
                continue
            months = pandas.PeriodIndex(
                [pandas.Period(date, 'M') for date in dates])
            for column_name in dataframe.columns:
                col = column_name + str(day_of_month)
                dataframe[column_name][dates] = element_df[col][months]

        dataframes[element_name] = dataframe

    if as_dataframe:
        return dataframes
    else:
        return dict([(key, util.dict_from_dataframe(dataframe))
                     for key, dataframe in dataframes.items()])

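
# Hedged usage sketch, not part of the original module: pulls daily maximum
# temperature for one station. 'USW00013904' is a GHCN-style station ID used
# only as a placeholder; 'TMAX' is a standard GHCN-Daily element code, stored
# in tenths of degrees Celsius.
def _example_ghcn_daily_tmax():
    data = get_data('USW00013904', elements='TMAX', as_dataframe=True)
    tmax = data['TMAX']
    print((tmax['value'] / 10.0).describe())
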
def get_stations(country=None, state=None, elements=None, start_year=None,
                 end_year=None, update=True, as_dataframe=False):
    """Retrieves station information, optionally limited to specific
    parameters.

    Parameters
    ----------
    country : str
        The country code to use to limit station results. If set to ``None``
        (default), then stations from all countries are returned.
    state : str
        The state code to use to limit station results. If set to ``None``
        (default), then stations from all states are returned.
    elements : ``None``, str, or list of str
        If specified, station results will be limited to the given element
        codes and only stations that have data for any of these elements will
        be returned.
    start_year : int
        If specified, station results will be limited to contain only
        stations that have data after this year. Can be combined with the
        ``end_year`` argument to get stations with data within a range of
        years.
    end_year : int
        If specified, station results will be limited to contain only
        stations that have data before this year. Can be combined with the
        ``start_year`` argument to get stations with data within a range of
        years.
    update : bool
        If ``True`` (default), new data files will be downloaded if they are
        newer than any previously cached files. If ``False``, then previously
        downloaded files will be used and new files will only be downloaded
        if there is not a previously downloaded file for a given station.
    as_dataframe : bool
        If ``False`` (default), a dict with station IDs keyed to station
        dicts is returned. If ``True``, a single pandas.DataFrame object will
        be returned. The pandas dataframe is used internally, so setting this
        to ``True`` is a little bit faster as it skips a serialization step.

    Returns
    -------
    stations_dict : dict or pandas.DataFrame
        A dict or pandas.DataFrame representing station information for
        stations matching the arguments. See the ``as_dataframe`` parameter
        for more.
    """
    columns = [
        ('country', 0, 2, None),
        ('network', 2, 3, None),
        ('network_id', 3, 11, None),
        ('latitude', 12, 20, None),
        ('longitude', 21, 30, None),
        ('elevation', 31, 37, None),
        ('state', 38, 40, None),
        ('name', 41, 71, None),
        ('gsn_flag', 72, 75, None),
        ('hcn_flag', 76, 79, None),
        ('wm_oid', 80, 85, None),
    ]

    stations_file = _get_ghcn_file('ghcnd-stations.txt', check_modified=update)
    stations = util.parse_fwf(stations_file, columns)

    if not country is None:
        stations = stations[stations['country'] == country]
    if not state is None:
        stations = stations[stations['state'] == state]

    # set station id and index by it
    stations['id'] = stations[['country', 'network', 'network_id']].T.apply(
        ''.join)

    if not elements is None or not start_year is None or not end_year is None:
        inventory = _get_inventory(update=update)
        if not elements is None:
            if isinstance(elements, basestring):
                elements = [elements]

            mask = np.zeros(len(inventory), dtype=bool)
            for element in elements:
                mask += inventory['element'] == element
            inventory = inventory[mask]
        if not start_year is None:
            inventory = inventory[inventory['last_year'] >= start_year]
        if not end_year is None:
            inventory = inventory[inventory['first_year'] <= end_year]

        uniques = inventory['id'].unique()
        ids = pandas.DataFrame(uniques, index=uniques, columns=['id'])

        stations = pandas.merge(stations, ids).set_index('id', drop=False)

    stations = stations.set_index('id', drop=False)

    # wm_oid gets converted to a float, so cast it to str manually
    # pandas versions prior to 0.13.0 could use numpy's fixed-width string
    # type to do this but that stopped working in pandas 0.13.0 - fortunately
    # a regex-based helper method was added then, too
    if pandas.__version__ < '0.13.0':
        stations['wm_oid'] = stations['wm_oid'].astype('|U5')
        stations['wm_oid'][stations['wm_oid'] == 'nan'] = np.nan
    else:
        stations['wm_oid'] = stations['wm_oid'].astype('|U5').map(
            lambda x: x[:-2])
        is_nan = stations['wm_oid'] == 'n'
        is_empty = stations['wm_oid'] == ''
        is_invalid = is_nan | is_empty
        stations.loc[is_invalid, 'wm_oid'] = np.nan

    if as_dataframe:
        return stations
    else:
        return util.dict_from_dataframe(stations)
