def get_data(county=None, start=None, end=None, as_dataframe=False,
             data_dir=None):
    """Retrieves data.

    Parameters
    ----------
    county : ``None`` or str
        If specified, results will be limited to the county corresponding to
        the given 5-character Texas county FIPS code i.e. 48???.
    start : ``None`` or date (see :ref:`dates-and-times`)
        Results will be limited to data on or after this date. Default is the
        start of the calendar year for the end date.
    end : ``None`` or date (see :ref:`dates-and-times`)
        Results will be limited to data on or before this date. Default is
        the current date.
    as_dataframe : bool
        If ``False`` (default), a dict with a nested set of dicts will be
        returned with data indexed by 5-character Texas county FIPS code. If
        ``True`` then a pandas.DataFrame object will be returned. The pandas
        dataframe is used internally, so setting this to ``True`` is a little
        bit faster as it skips a serialization step.
    data_dir : ``None`` or directory path
        Directory for holding downloaded data files. If no path is provided
        (default), then a user-specific directory for holding application
        data will be used (the directory will depend on the
        platform/operating system).

    Returns
    -------
    data : dict or pandas.DataFrame
        A dict or pandas.DataFrame representing the data. See the
        ``as_dataframe`` parameter for more.
    """
    if end is None:
        end_date = datetime.date.today()
    else:
        end_date = util.convert_date(end)
    if start is None:
        start_date = datetime.date(end_date.year, 1, 1)
    else:
        start_date = util.convert_date(start)

    if data_dir is None:
        data_dir = os.path.join(util.get_ulmo_dir(), 'twc/kbdi')

    df = pandas.concat([
        _date_dataframe(date, data_dir)
        for date in pandas.period_range(start_date, end_date, freq='D')
    ], ignore_index=True)
    fips_df = _fips_dataframe()
    df = pandas.merge(df, fips_df, left_on='county', right_on='name')
    del df['name']

    if county:
        df = df[df['fips'] == county]

    if as_dataframe:
        return df
    else:
        return _as_data_dict(df)
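# Usage sketch for get_data() above. The module path (ulmo.twc.kbdi, implied
# by the 'twc/kbdi' data directory) and the FIPS code '48453' (Travis County)
# are illustrative assumptions, not confirmed values.
from ulmo.twc import kbdi

# nested dict of KBDI data keyed by county FIPS code
data = kbdi.get_data(county='48453', start='2013-01-01', end='2013-06-30')

# same query returned as a pandas.DataFrame (skips the serialization step)
df = kbdi.get_data(county='48453', start='2013-01-01', end='2013-06-30',
                   as_dataframe=True)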
def get_stations(fips=None, country=None, state=None, start=None, end=None,
                 update=True):
    """Retrieves available stations from the NCDC GSOD station history file,
    optionally filtered by FIPS code(s), country code(s), state code(s)
    and/or a date range. Returns a dict of station dicts keyed by station
    code.
    """
    if start:
        start_date = util.convert_date(start)
    else:
        start_date = None
    if end:
        end_date = util.convert_date(end)
    else:
        end_date = None
    if isinstance(fips, basestring):
        fips = [fips]
    if isinstance(country, basestring):
        country = [country]
    if isinstance(state, basestring):
        state = [state]

    stations_url = 'http://www1.ncdc.noaa.gov/pub/data/gsod/ish-history.csv'
    with util.open_file_for_url(stations_url, NCDC_GSOD_STATIONS_FILE) as f:
        reader = csv.DictReader(f)

        if fips is None and country is None and state is None \
                and start is None and end is None:
            rows = reader
        else:
            if start_date is None:
                start_str = None
            else:
                start_str = start_date.strftime('%Y%m%d')
            if end_date is None:
                end_str = None
            else:
                end_str = end_date.strftime('%Y%m%d')
            rows = [
                row for row in reader
                if _passes_row_filter(row, fips=fips, country=country,
                                      state=state, start_str=start_str,
                                      end_str=end_str)
            ]
        stations = dict([
            (_station_code(row), _process_station(row))
            for row in rows
        ])
    return stations
def get_all_files(start=None, end=None):
    """Downloads and reads all GSOD files for the given date range. Defaults
    to the start of the GSOD record through today.
    """
    if start:
        start_date = util.convert_date(start)
    else:
        start_date = NCDC_GSOD_START_DATE
    if end:
        end_date = util.convert_date(end)
    else:
        end_date = datetime.date.today()

    for year in range(start_date.year, end_date.year + 1):
        tar_path = _get_gsod_file(year)
        with _open_tarfile(tar_path, 'r:') as gsod_tar:
            stations_in_file = [
                name.split('./')[-1].rsplit('-', 1)[0]
                for name in gsod_tar.getnames() if len(name) > 1]
            for station in stations_in_file:
                _read_gsod_file(gsod_tar, station, year)
def test_convert_date_from_datetime():
    compare_dates = [
        (datetime.datetime(2011, 12, 31, 20), datetime.date(2011, 12, 31)),
        (datetime.datetime(2011, 12, 31, 0, 0, 0), datetime.date(2011, 12, 31)),
        (datetime.datetime(2011, 12, 31, 23, 59, 59), datetime.date(2011, 12, 31)),
    ]
    for test_datetime, test_date in compare_dates:
        date = util.convert_date(test_datetime)
        assert date == test_date
def test_convert_date_from_date():
    compare_dates = [
        datetime.date(2011, 12, 31),
        datetime.date(2012, 2, 29),
        datetime.date(2013, 1, 1),
    ]
    for test_date in compare_dates:
        date = util.convert_date(test_date)
        assert date == test_date
def get_station_data(station_code, parameter, start=None, end=None,
                     min_value=None, max_value=None):
    # sentinel bounds wide enough to include all values when no explicit
    # min/max filter is given
    if min_value is None:
        min_value = -9000000
    if max_value is None:
        max_value = 9000000
    if start is None:
        start_date = DEFAULT_START_DATE
    else:
        start_date = util.convert_date(start)
    if end is None:
        end_date = datetime.date.today()
    else:
        end_date = util.convert_date(end)

    start_date_str = _format_date(start_date)
    end_date_str = _format_date(end_date)
    form_data = {
        'fld_station': station_code,
        'fld_parameter': parameter,
        'fld_from': min_value,
        'fld_to': max_value,
        'fld_fromdate': start_date_str,
        'fld_todate': end_date_str,
        'hdn_excel': '',
    }

    req = requests.post(URL, params=dict(sid=station_code), data=form_data)
    soup = BeautifulSoup(req.content)
    data_table = soup.find('table').find_all('table')[-1]
    return dict([
        _parse_value(value_tr)
        for value_tr in data_table.find_all('tr')[2:]
    ])
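# Usage sketch for get_station_data() above. The station code and parameter
# name are hypothetical placeholders; valid values depend on the service
# behind the module-level URL constant.
values = get_station_data('A1234', 'stage', start='2015-01-01',
                          end='2015-12-31')
# values maps each parsed date to its float reading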
def test_convert_date_from_string():
    compare_dates = [
        ('2011-12-31', datetime.date(2011, 12, 31)),
        ('12/31/2011', datetime.date(2011, 12, 31)),
        ('2012-02-29', datetime.date(2012, 2, 29)),
        ('2012-2-29', datetime.date(2012, 2, 29)),
        ('2/29/2012', datetime.date(2012, 2, 29)),
        ('02/29/2012', datetime.date(2012, 2, 29)),
        ('2013-01-01', datetime.date(2013, 1, 1)),
    ]
    for test_str, test_date in compare_dates:
        date = util.convert_date(test_str)
        assert date == test_date
def get_station_data(station_code, date=None, as_dataframe=False):
    """Fetches data for a station at a given date.

    Parameters
    ----------
    station_code : str
        The station code to fetch data for. A list of stations can be
        retrieved with ``get_stations()``
    date : ``None`` or date (see :ref:`dates-and-times`)
        The date of the data to be queried. If date is ``None`` (default),
        then data for the current day is retrieved.
    as_dataframe : bool
        This determines what format values are returned as. If ``False``
        (default), the values dict will be a dict with timestamps as keys
        mapped to a dict of gauge variables and values. If ``True`` then
        the values dict will be a pandas.DataFrame object containing the
        equivalent information.

    Returns
    -------
    data_dict : dict
        A dict containing station information and values.
    """
    station_dict = {}
    if date is None:
        date_str = 'current'
        year = datetime.date.today().year
    else:
        date = util.convert_date(date)
        date_str = date.strftime('%Y%m%d')
        year = date.year

    filename = '%s.%s.html' % (station_code, date_str)
    data_url = 'http://www.swt-wc.usace.army.mil/webdata/gagedata/' + filename
    path = os.path.join(USACE_SWTWC_DIR, filename)

    with util.open_file_for_url(data_url, path) as f:
        soup = BeautifulSoup(f)
        pre = soup.find('pre')
        if pre is None:
            error_msg = ('no data could be found for station code '
                         '%(station_code)s and date %(date)s (url: '
                         '%(data_url)s)' % {
                             'date': date,
                             'data_url': data_url,
                             'station_code': station_code,
                         })
            raise ValueError(error_msg)
        sio = StringIO.StringIO(str(pre.text.strip()))

    first_line = sio.readline()
    split = first_line[8:].strip().split()
    station_dict['code'] = split[0]
    station_dict['description'] = ' '.join(split[1:])

    second_line = sio.readline()
    station_dict['station_type'] = second_line.strip().split(':')[1].strip()

    notes = []
    while 1:
        next_line = sio.readline()
        if ':' in next_line:
            notes.append(next_line.strip())
        else:
            break
    if len(notes):
        station_dict['notes'] = '\n'.join(notes)

    variable_names = _split_line(sio.readline()[15:], 10)
    variable_units = _split_line(sio.readline()[15:], 10)
    variable_sources = _split_line(sio.readline()[15:], 10)
    station_dict['variables'] = dict([
        (name, {'unit': unit, 'source': source})
        for name, unit, source in zip(variable_names, variable_units,
                                      variable_sources)
    ])
    station_dict['timezone'] = sio.readline().strip().strip('()')

    column_names = ['datetime'] + variable_names
    widths = [15] + ([10] * len(variable_names))
    converters = dict([
        (variable_name, lambda x: float(x) if x != '----' else np.nan)
        for variable_name in variable_names
    ])
    date_parser = lambda x: _convert_datetime(x, year)
    dataframe = pandas.read_fwf(
        sio, names=column_names, widths=widths, index_col=['datetime'],
        na_values=['----'], converters=converters, parse_dates=True,
        date_parser=date_parser)

    # drop rows that are all nans (e.g. end of "current" page)
    dataframe = dataframe[~np.isnan(dataframe.T.sum())]

    if as_dataframe:
        station_dict['values'] = dataframe
    else:
        station_dict['values'] = util.dict_from_dataframe(dataframe)

    return station_dict
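# Usage sketch for get_station_data() above. The module path ulmo.usace.swtwc
# is inferred from USACE_SWTWC_DIR and the swt-wc.usace.army.mil URL; the
# station code 'DENI2' is illustrative -- use get_stations() for real codes.
from ulmo.usace import swtwc

station = swtwc.get_station_data('DENI2', date='2015-06-01')
values = station['values']        # dict keyed by timestamp (or a DataFrame
                                  # when as_dataframe=True)
variables = station['variables']  # per-variable unit and source info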
def get_data(station_ids=None, sensor_ids=None, resolutions=None, start=None,
             end=None):
    """Downloads data for a set of CDEC station and sensor ids. If either is
    not provided, all available data will be downloaded. Be careful when
    choosing hourly resolution: the data sets are big and CDEC's servers are
    slow.

    Usage example::

        from ulmo import cdec

        dat = cdec.historical.get_data(['PRA'], resolutions=['daily'])

    Parameters
    ----------
    station_ids : iterable of strings or ``None``
    sensor_ids : iterable of integers or ``None``
        Use the ``get_sensors()`` function to see a list of available sensor
        numbers.
    resolutions : iterable of strings or ``None``
        Possible values are 'event', 'hourly', 'daily', and 'monthly' but
        not all of these time resolutions are available at every station.
    start : ``None`` or date (see :ref:`dates-and-times`)
        If specified, data are limited to values on or after this date.
    end : ``None`` or date (see :ref:`dates-and-times`)
        If specified, data are limited to values on or before this date.

    Returns
    -------
    dict : a python dict
        A python dict with site codes as keys. Values will be nested dicts
        containing all of the sensor/resolution combinations.
    """
    if start is None:
        start_date = util.convert_date(DEFAULT_START_DATE)
    else:
        start_date = util.convert_date(start)
    if end is None:
        end_date = util.convert_date(DEFAULT_END_DATE)
    else:
        end_date = util.convert_date(end)

    start_date_str = _format_date(start_date)
    end_date_str = _format_date(end_date)

    if station_ids is None:
        station_ids = get_stations().index

    sensors = get_station_sensors(station_ids, sensor_ids, resolutions)

    d = {}
    for station_id, sensor_list in list(sensors.items()):
        station_data = {}
        for index, row in sensor_list.iterrows():
            res = row.ix['resolution']
            var = row.ix['variable']
            sensor_id = row.ix['sensor_id']
            station_data[var] = _download_raw(station_id, sensor_id,
                                              _res_to_dur_code(res),
                                              start_date_str, end_date_str)
        d[station_id] = station_data

    return d
def get_data(station_codes, start=None, end=None, parameters=None):
    """Retrieves data for a set of stations.

    Parameters
    ----------
    station_codes : str or list
        Single station code or iterable of station codes to retrieve data
        for.
    start : ``None`` or date (see :ref:`dates-and-times`)
        If specified, data are limited to values after this date.
    end : ``None`` or date (see :ref:`dates-and-times`)
        If specified, data are limited to values before this date.
    parameters : ``None``, str or list
        If specified, data are limited to this set of parameter codes.

    Returns
    -------
    data_dict : dict
        Dict with station codes keyed to lists of value dicts.
    """
    if start:
        start_date = util.convert_date(start)
    else:
        start_date = NCDC_GSOD_START_DATE
    if end:
        end_date = util.convert_date(end)
    else:
        end_date = datetime.date.today()
    if isinstance(parameters, basestring):
        parameters = [parameters]
    if parameters and 'date' not in parameters:
        # add date to list of parameters if it's not there already
        parameters.insert(0, 'date')
    if isinstance(station_codes, basestring):
        station_codes = [station_codes]

    # note: opening tar files and parsing the headers and such is a
    # relatively lengthy operation so you don't want to do it too often,
    # hence try to grab all stations at the same time per tarfile
    data_dict = dict([(station_code, None)
                      for station_code in station_codes])

    for year in range(start_date.year, end_date.year + 1):
        tar_path = _get_gsod_file(year)
        with tarfile.open(tar_path, 'r:') as gsod_tar:
            stations_in_file = [
                name.split('./')[-1].rsplit('-', 1)[0]
                for name in gsod_tar.getnames() if len(name) > 1]
            if station_codes:
                stations = list(set(station_codes) & set(stations_in_file))
            else:
                stations = stations_in_file
            for station in stations:
                year_data = _read_gsod_file(gsod_tar, station, year)
                if parameters:
                    year_data = _subset_record_array(year_data, parameters)
                if year_data is not None:
                    # apply date ranges if they exist
                    if start_date or end_date:
                        mask = np.ones(len(year_data), dtype=bool)
                        if start_date:
                            mask = mask & (year_data['date'] >= start_date)
                        if end_date:
                            mask = mask & (year_data['date'] <= end_date)
                        year_data = year_data[mask]
                    if data_dict[station] is not None:
                        # XXX: this could be more efficient for large
                        # numbers of years with a list comprehension or
                        # generator
                        data_dict[station] = np.append(data_dict[station],
                                                       year_data)
                    else:
                        data_dict[station] = year_data

    for station, data_array in data_dict.items():
        if data_dict[station] is not None:
            data_dict[station] = _record_array_to_value_dicts(data_array)

    return data_dict
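# Usage sketch for get_data() above. The module path ulmo.ncdc.gsod and the
# USAF-WBAN station code and parameter names below are illustrative; use
# get_stations() to find real codes.
from ulmo.ncdc import gsod

data = gsod.get_data('999999-14895', start='2012-01-01', end='2012-12-31',
                     parameters=['mean_temp', 'precip'])
# data maps the station code to a list of value dicts, one per day, each
# containing 'date' plus the requested parameters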
def get_data(state=None, climate_division=None, start=None, end=None,
             as_dataframe=False):
    """Retrieves data.

    Parameters
    ----------
    state : ``None`` or str
        If specified, results will be limited to the state corresponding to
        the given 2-character state code.
    climate_division : ``None`` or int
        If specified, results will be limited to the climate division.
    start : ``None`` or date (see :ref:`dates-and-times`)
        Results will be limited to those after the given date. Default is the
        start of the current calendar year.
    end : ``None`` or date (see :ref:`dates-and-times`)
        If specified, results will be limited to data before this date.
    as_dataframe : bool
        If ``False`` (default), a dict with a nested set of dicts will be
        returned with data indexed by state, then climate division. If
        ``True`` then a pandas.DataFrame object will be returned. The pandas
        dataframe is used internally, so setting this to ``True`` is a little
        bit faster as it skips a serialization step.

    Returns
    -------
    data : dict or pandas.DataFrame
        A dict or pandas.DataFrame representing the data. See the
        ``as_dataframe`` parameter for more.
    """
    if start is not None:
        start_date = util.convert_date(start)
    else:
        start_date = None
    if end is not None:
        end_date = util.convert_date(end)
    else:
        end_date = None

    if not end_date:
        end_date = datetime.date.today()
    if not start_date:
        start_date = datetime.date(end_date.year, 1, 1)

    start_year, start_week = _week_number(start_date)
    end_year, end_week = _week_number(end_date)

    if state:
        state_code = STATE_CODES.get(state.upper())
    else:
        state_code = None

    data = None
    for year in range(start_year, end_year + 1):
        url = _get_data_url(year)
        format_type = _get_data_format(year)
        with _open_data_file(url) as data_file:
            year_data = _parse_data_file(data_file, format_type, year)

        if state_code:
            year_data = year_data[year_data['state_code'] == state_code]
        if climate_division:
            year_data = year_data[
                year_data['climate_division'] == climate_division]

        year_data = _reindex_data(year_data)

        if data is None:
            data = year_data
        else:
            # some data are duplicated (e.g. final data from 2011 stretches
            # into prelim data of 2012), so just take those that are new
            append_index = year_data.index - data.index
            if len(append_index):
                data = data.append(year_data.ix[append_index])

    # restrict results to date range
    period_index = pandas.PeriodIndex(data['period'])
    periods_in_range = ((period_index >= start_date) &
                        (period_index <= end_date))
    data = data[periods_in_range]

    # this does what data.reset_index() should do, but at least as of pandas
    # 0.10.1, reset_index() will cast period objects to ints
    data.index = np.arange(len(data))

    if as_dataframe:
        return data
    else:
        return _as_data_dict(data)
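# Usage sketch for get_data() above. The module path ulmo.cpc.drought is an
# assumption; the state and climate division below are illustrative.
from ulmo.cpc import drought

# weekly drought data for Texas climate division 5 over calendar year 2012
data = drought.get_data(state='TX', climate_division=5,
                        start='2012-01-01', end='2012-12-31')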
def get_historical_data(site_code, start=None, end=None, as_dataframe=False):
    """Fetches historical data for a site, optionally within a given date
    range.

    Parameters
    ----------
    site_code : str
        The site code to fetch data for. A list of sites can be retrieved
        with ``get_sites()``
    start : ``None`` or date (see :ref:`dates-and-times`)
        If specified, data are limited to values on or after this date. If
        ``None`` (default), all available data are returned.
    end : ``None`` or date (see :ref:`dates-and-times`)
        If specified, data are limited to values on or before this date.
    as_dataframe : bool
        This determines what format values are returned as. If ``False``
        (default), values are returned as a list of record dicts. If
        ``True`` then a pandas.DataFrame object containing the equivalent
        information will be returned.

    Returns
    -------
    data : list of dicts or pandas.DataFrame
        Site values. See the ``as_dataframe`` parameter for more.
    """
    if isinstance(site_code, str):
        pass
    elif isinstance(site_code, int):
        site_code = str(site_code)
    else:
        log.error("Unsure of the site_code parameter type. "
                  "Try string or int")
        raise TypeError("site_code must be a str or an int")

    waterquality_url = (
        "http://waterquality.lcra.org/parameter.aspx?qrySite=%s" % site_code)
    waterquality_url2 = 'http://waterquality.lcra.org/events.aspx'

    initial_request = requests.get(waterquality_url)
    initialsoup = BeautifulSoup(initial_request.content, 'html.parser')

    sitevals = [
        statag.get('value', None)
        for statag in initialsoup.findAll(id="multiple")
        if statag.get('value', None)
    ]

    result = _make_next_request(waterquality_url2, initial_request,
                                {'multiple': sitevals,
                                 'site': site_code})

    soup = BeautifulSoup(result.content, 'html.parser')
    gridview = soup.find(id="GridView1")

    results = []
    # the table uses \xa0 for blank cells
    headers = [head.text for head in gridview.findAll('th')]
    for row in gridview.findAll('tr'):
        vals = [_parse_val(aux.text) for aux in row.findAll('td')]
        if len(vals) == 0:
            continue
        results.append(dict(zip(headers, vals)))

    data = _create_dataframe(results)

    if start and not data.empty:
        data = data.ix[util.convert_date(start):]
    if end and not data.empty:
        data = data.ix[:util.convert_date(end)]

    if as_dataframe:
        return data
    else:
        return data.to_dict(orient='records')
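# Usage sketch for get_historical_data() above. The module path
# ulmo.lcra.waterquality and the site code 12147 are illustrative
# assumptions; use get_sites() to find valid site codes.
from ulmo.lcra import waterquality

records = waterquality.get_historical_data(12147, start='2014-01-01')
df = waterquality.get_historical_data(12147, as_dataframe=True)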
def get_stations(country=None, state=None, start=None, end=None, update=True):
    """Retrieve information on the set of available stations.

    Parameters
    ----------
    country : {``None``, str, or iterable}
        If specified, results will be limited to stations with matching
        country codes.
    state : {``None``, str, or iterable}
        If specified, results will be limited to stations with matching
        state codes.
    start : ``None`` or date (see :ref:`dates-and-times`)
        If specified, results will be limited to stations which have data
        after this start date.
    end : ``None`` or date (see :ref:`dates-and-times`)
        If specified, results will be limited to stations which have data
        before this end date.
    update : bool
        If ``True`` (default), check for a newer copy of the stations file
        and download it if it is newer than the previously downloaded copy.
        If ``False``, then a new stations file will only be downloaded if a
        previously downloaded file cannot be found.

    Returns
    -------
    stations_dict : dict
        A dict with USAF-WBAN codes keyed to station information dicts.
    """
    if start:
        start_date = util.convert_date(start)
    else:
        start_date = None
    if end:
        end_date = util.convert_date(end)
    else:
        end_date = None
    if isinstance(country, basestring):
        country = [country]
    if isinstance(state, basestring):
        state = [state]

    stations_url = 'http://www1.ncdc.noaa.gov/pub/data/noaa/isd-history.csv'
    with util.open_file_for_url(stations_url, NCDC_GSOD_STATIONS_FILE) as f:
        reader = csv.DictReader(f)

        if country is None and state is None and start is None and end is None:
            rows = reader
        else:
            if start_date is None:
                start_str = None
            else:
                start_str = start_date.strftime('%Y%m%d')
            if end_date is None:
                end_str = None
            else:
                end_str = end_date.strftime('%Y%m%d')
            rows = [
                row for row in reader
                if _passes_row_filter(row, country=country, state=state,
                                      start_str=start_str, end_str=end_str)
            ]
        stations = dict([
            (_station_code(row), _process_station(row))
            for row in rows
        ])
    return stations
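# Usage sketch for get_stations() above (module path ulmo.ncdc.gsod assumed
# from the NCDC_GSOD_STATIONS_FILE constant; country/state codes are
# illustrative).
from ulmo.ncdc import gsod

# stations in Texas, US, with data available during 2010
stations = gsod.get_stations(country='US', state='TX',
                             start='2010-01-01', end='2010-12-31')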
def _parse_value(value_tr):
    # each value row contains a date cell and a value cell
    date_td, value_td = value_tr.find_all('td')
    return (util.convert_date(date_td.text), float(value_td.text))