class PhilDBSink(BufferedSink):
    """
    A buffered sink using the PhilDB timeseries database.
    """

    def __init__(self, dbfile, *args, **kwargs):
        super(PhilDBSink, self).__init__(*args, **kwargs)
        try:
            create(dbfile)
        except AlreadyExistsError:
            # The database was created on a previous run; nothing to do.
            pass
        self.db = PhilDB(dbfile)
        # Fallback frequency for single-sample writes (see write_buffer).
        self.last_known_freq = None
        try:
            self.db.add_source('SENSOR', 'Data from hardware sensor')
        except DuplicateError:
            # Source record already registered; safe to ignore.
            pass

    def write_buffer(self, param_name, series):
        """Write buffer of data to database."""
        if not len(series):
            return

        # Idempotently register the measurand and timeseries records.
        registrations = (
            lambda: self.db.add_measurand(param_name, param_name, param_name),
            lambda: self.db.add_timeseries(param_name),
        )
        for register in registrations:
            try:
                register()
            except DuplicateError:
                # Already present from an earlier write; safe to ignore.
                pass

        frequency = series.index.inferred_freq
        if len(series) == 1:
            # A single value is not enough to infer a frequency, so reuse
            # the most recently observed one, which should generally match.
            frequency = self.last_known_freq
        elif frequency is not None:
            self.last_known_freq = frequency

        if frequency is None:
            raise ValueError('Unable to determine sensor frequency')

        try:
            self.db.add_timeseries_instance(
                param_name, frequency, 'None',
                measurand=param_name, source='SENSOR')
        except DuplicateError:
            # Instance already exists; proceed straight to the write.
            pass

        self.db.write(param_name, frequency, series,
                      measurand=param_name, source='SENSOR')
def test_ts_list_sorted(self):
    """ Test that the list of IDs is sorted. """
    db = PhilDB(self.test_tsdb)
    db.add_measurand('P', 'PRECIPITATION', 'Precipitation')
    db.add_timeseries_instance('410730', 'D', 'Foo',
                               measurand='P', source='DATA_SOURCE')
    db.add_timeseries('410731')
    # Give the new ID instances under two measurands.
    for short_id in ('P', 'Q'):
        db.add_timeseries_instance('410731', 'D', 'Foo',
                                   measurand=short_id, source='DATA_SOURCE')
    self.assertEqual(['123456', '410730', '410731'], db.ts_list())
def test_read_non_unique(self):
    """Reading with qualifiers that match multiple instances raises.

    Fix: dropped the unused ``results`` assignment and the unused
    ``as context`` binding — neither was ever referenced.
    """
    db = PhilDB(self.test_tsdb)
    db.add_measurand('P', 'PRECIPITATION', 'Precipitation')
    db.add_timeseries_instance('410730', 'D', 'Foo',
                               measurand='P', source='DATA_SOURCE')
    # '410730'/'D' now matches more than one timeseries instance.
    with self.assertRaises(MultipleResultsFound):
        db.read('410730', 'D')
def test_ts_list_ids(self):
    """list_ids reports every known timeseries identifier."""
    db = PhilDB(self.test_tsdb)
    db.add_timeseries('410731')
    db.add_measurand('P', 'PRECIPITATION', 'Precipitation')
    db.add_timeseries_instance('410731', 'D', 'Foo',
                               measurand='P', source='DATA_SOURCE')
    expected = ['123456', '410730', '410731']
    self.assertEqual(expected, db.list_ids())
def test_measurand_list_sorted(self):
    """ Test that the list of measurand short IDs is sorted. """
    db = PhilDB(self.test_tsdb)
    db.add_measurand('P', 'PRECIPITATION', 'Precipitation')
    # 'P' sorts before the pre-existing 'Q'.
    self.assertEqual(['P', 'Q'], db.list_measurands())
def test_ts_list_measurand_and_source(self):
    """Filtering ts_list by source and measurand returns only matching IDs."""
    db = PhilDB(self.test_tsdb)
    db.add_timeseries('410731')
    db.add_source('EXAMPLE_SOURCE', 'Example source, i.e. a dataset')
    db.add_measurand('P', 'PRECIPITATION', 'Precipitation')
    db.add_timeseries_instance('410731', 'D', 'Foo',
                               measurand='P', source='EXAMPLE_SOURCE')
    matches = db.ts_list(source='EXAMPLE_SOURCE', measurand='P')
    self.assertEqual(['410731'], matches)
def test_ts_list_unique_ids(self):
    """ Test that IDs don't appear multiple times due to different combinations. """
    db = PhilDB(self.test_tsdb)
    db.add_measurand('P', 'PRECIPITATION', 'Precipitation')
    # '410730' now has instances under more than one measurand.
    db.add_timeseries_instance('410730', 'D', 'Foo',
                               measurand='P', source='DATA_SOURCE')
    self.assertEqual(['123456', '410730'], db.ts_list())
def test_get_ts_instance(self):
    """__get_ts_instance resolves a known instance and raises on unknowns."""
    db = PhilDB(self.test_tsdb)
    instance = db._PhilDB__get_ts_instance('410730', 'D',
                                           measurand='Q', source='DATA_SOURCE')
    self.assertEqual('410730', instance.timeseries.primary_id)
    self.assertEqual('Q', instance.measurand.short_id)
    # Unknown timeseries ID.
    with self.assertRaises(MissingDataError):
        db._PhilDB__get_ts_instance('410731', 'D',
                                    measurand='Q', source='DATA_SOURCE')
    # Known measurand, but no instance recorded for this ID.
    db.add_measurand('P', 'PRECIPITATION', 'Precipitation')
    with self.assertRaises(MissingDataError):
        db._PhilDB__get_ts_instance('410730', 'D',
                                    measurand='P', source='DATA_SOURCE')
def test_ts_list_ids(self):
    """Every registered timeseries ID appears in list_ids."""
    db = PhilDB(self.test_tsdb)
    db.add_timeseries('410731')
    db.add_measurand('P', 'PRECIPITATION', 'Precipitation')
    db.add_timeseries_instance('410731', 'D', 'Foo',
                               measurand='P', source='DATA_SOURCE')
    listed = db.list_ids()
    self.assertEqual(['123456', '410730', '410731'], listed)
def test_read_non_unique(self):
    """An ambiguous read (multiple matching instances) must raise.

    Fix: removed the dead ``results =`` assignment (never reached, since
    ``read`` raises) and the unused ``as context`` binding.
    """
    db = PhilDB(self.test_tsdb)
    db.add_measurand('P', 'PRECIPITATION', 'Precipitation')
    db.add_timeseries_instance('410730', 'D', 'Foo',
                               measurand='P', source='DATA_SOURCE')
    with self.assertRaises(MultipleResultsFound):
        db.read('410730', 'D')
def test_ts_list_measurand_and_source(self):
    """Only IDs with an instance under the given source/measurand are listed."""
    db = PhilDB(self.test_tsdb)
    db.add_timeseries('410731')
    db.add_source('EXAMPLE_SOURCE', 'Example source, i.e. a dataset')
    db.add_measurand('P', 'PRECIPITATION', 'Precipitation')
    db.add_timeseries_instance(
        '410731', 'D', 'Foo', measurand='P', source='EXAMPLE_SOURCE')
    self.assertEqual(
        ['410731'],
        db.ts_list(source='EXAMPLE_SOURCE', measurand='P'))
def test_add_duplicates(self):
    """Re-adding existing records raises DuplicateError for each entity type.

    Fix: dropped the four unused ``as context`` bindings — the captured
    exception was never inspected.
    """
    db = PhilDB(self.test_tsdb)
    with self.assertRaises(DuplicateError):
        db.add_source('DATA_SOURCE', 'Duplicate source')
    with self.assertRaises(DuplicateError):
        db.add_measurand('Q', 'STREAMFLOW', 'Duplicate measurand')
    with self.assertRaises(DuplicateError):
        db.add_timeseries('410730')
    with self.assertRaises(DuplicateError):
        db.add_timeseries_instance('410730', 'D', '',
                                   source='DATA_SOURCE', measurand='Q')
def test_add_measurand_entry(self):
    """add_measurand persists a row in the metadata sqlite database.

    Fix: the sqlite connection was never closed (resource leak) and a
    stray semicolon followed ``fetchone()``; the handle is now released
    in a ``finally`` block.
    """
    create(self.temp_dir)
    db = PhilDB(self.temp_dir)
    db.add_measurand('P', 'PRECIPITATION', 'Precipitation')
    # Inspect the backing metadata store directly.
    conn = sqlite3.connect(db._PhilDB__meta_data_db())
    try:
        row = conn.execute("SELECT * FROM measurand;").fetchone()
    finally:
        conn.close()
    _pk, measurand_short_id, measurand_long_id, measurand_description = row
    self.assertEqual(measurand_short_id, 'P')
    self.assertEqual(measurand_long_id, 'PRECIPITATION')
    self.assertEqual(measurand_description, 'Precipitation')
def test_add_measurand_entry(self):
    """Verify add_measurand writes the expected metadata row.

    Fix: close the sqlite connection when done — the original leaked the
    handle.
    """
    create(self.temp_dir)
    db = PhilDB(self.temp_dir)
    db.add_measurand('P', 'PRECIPITATION', 'Precipitation')
    conn = sqlite3.connect(db._PhilDB__meta_data_db())
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT * FROM measurand;")
        _pk, short_id, long_id, description = cursor.fetchone()
    finally:
        # Release the handle even if the fetch or unpack fails.
        conn.close()
    self.assertEqual(short_id, 'P')
    self.assertEqual(long_id, 'PRECIPITATION')
    self.assertEqual(description, 'Precipitation')
def test_ts_list_unique_ids(self):
    """ Test that IDs don't appear multiple times due to different combinations. """
    db = PhilDB(self.test_tsdb)
    db.add_measurand('P', 'PRECIPITATION', 'Precipitation')
    db.add_timeseries_instance(
        '410730', 'D', 'Foo', measurand='P', source='DATA_SOURCE')
    # Even with two measurands for '410730', each ID is listed once.
    listed = db.ts_list()
    self.assertEqual(['123456', '410730'], listed)
def test_ts_list_sorted(self):
    """ Test that the list of IDs is sorted. """
    db = PhilDB(self.test_tsdb)
    db.add_measurand('P', 'PRECIPITATION', 'Precipitation')
    db.add_timeseries_instance(
        '410730', 'D', 'Foo', measurand='P', source='DATA_SOURCE')
    db.add_timeseries('410731')
    db.add_timeseries_instance(
        '410731', 'D', 'Foo', measurand='P', source='DATA_SOURCE')
    db.add_timeseries_instance(
        '410731', 'D', 'Foo', measurand='Q', source='DATA_SOURCE')
    expected = ['123456', '410730', '410731']
    self.assertEqual(expected, db.ts_list())
def test_get_ts_instance(self):
    """Resolve an existing instance; unknown ID/measurand combinations raise."""
    db = PhilDB(self.test_tsdb)
    found = db._PhilDB__get_ts_instance(
        '410730', 'D', measurand='Q', source='DATA_SOURCE')
    self.assertEqual('410730', found.timeseries.primary_id)
    self.assertEqual('Q', found.measurand.short_id)
    self.assertRaises(
        MissingDataError, db._PhilDB__get_ts_instance,
        '410731', 'D', measurand='Q', source='DATA_SOURCE')
    # The measurand exists after this, but has no instance for '410730'.
    db.add_measurand('P', 'PRECIPITATION', 'Precipitation')
    self.assertRaises(
        MissingDataError, db._PhilDB__get_ts_instance,
        '410730', 'D', measurand='P', source='DATA_SOURCE')
def write_phildb(file_list, results_file, first_run = False):
    """
    Load HRS streamflow CSVs into the 'hrs_phildb' database, timing each write.

    :param file_list: iterable of CSV file paths to load.
    :param results_file: path the per-file write times are saved to via
        ``np.savetxt``.
    :param first_run: when True, create the database and register the
        measurand, source, timeseries, and instance metadata before writing.
    """
    if first_run:
        create('hrs_phildb')
    db = PhilDB('hrs_phildb')
    if first_run:
        db.add_measurand('Q', 'STREAMFLOW', 'Streamflow')
        db.add_source('BOM_HRS', 'Bureau of Meteorology; Hydrological Reference Stations dataset.')
    write_times = []
    for filename in file_list:
        print("Processing file: ", filename, '...')
        # Station ID is the leading underscore-separated token of the file name.
        station_id = os.path.basename(filename).split('_')[0]
        print("Using station ID: ", station_id, '...')
        streamflow = pd.read_csv(filename, parse_dates=True, index_col=0, header = None)
        if first_run:
            db.add_timeseries(station_id)
            # NOTE(review): `freq` is not defined anywhere in this function;
            # presumably it is a module-level constant (e.g. 'D') — confirm
            # before relying on this code path.
            db.add_timeseries_instance(station_id, freq, '', measurand = 'Q', source = 'BOM_HRS')
        # Time only the database write itself, not the CSV parsing.
        start = time.time()
        db.write(station_id, freq, streamflow, measurand = 'Q', source = 'BOM_HRS')
        write_times.append(time.time() - start)
    np.savetxt(results_file, np.array(write_times))
import pandas as pd from phildb.database import PhilDB from phildb.create import create test_tsdb_path = os.path.join(os.path.dirname(__file__), 'test_tsdb') try: shutil.rmtree(test_tsdb_path) except OSError as e: if e.errno != 2: # Code 2: No such file or directory. raise create(test_tsdb_path) db = PhilDB(test_tsdb_path) db.add_measurand('Q', 'STREAMFLOW', 'Streamflow') db.add_source('DATA_SOURCE', '') db.add_timeseries('410730') db.add_timeseries_instance('410730', 'D', '', measurand = 'Q', source = 'DATA_SOURCE') db.write('410730', 'D', pd.Series( index = [datetime.date(2014, 1, 1), datetime.date(2014, 1, 2), datetime.date(2014, 1, 3)], data = [1,2,3]), source = 'DATA_SOURCE', measurand = 'Q' ) db.add_timeseries('123456') db.add_timeseries_instance('123456', 'D', '', measurand = 'Q', source = 'DATA_SOURCE') db.write('123456', 'D', pd.Series(index = [datetime.date(2014, 1, 1),
import pandas as pd from phildb.database import PhilDB from phildb.create import create test_tsdb_path = os.path.join(os.path.dirname(__file__), 'test_tsdb') try: shutil.rmtree(test_tsdb_path) except OSError as e: if e.errno != 2: # Code 2: No such file or directory. raise create(test_tsdb_path) db = PhilDB(test_tsdb_path) db.add_measurand('Q', 'STREAMFLOW', 'Streamflow') db.add_source('DATA_SOURCE', '') db.add_timeseries('410730') db.add_timeseries_instance('410730', 'D', '', measurand='Q', source='DATA_SOURCE') db.write('410730', 'D', pd.Series(index=[ datetime.date(2014, 1, 1), datetime.date(2014, 1, 2), datetime.date(2014, 1, 3) ],
dates.append(the_date) data.append(ob[measurand]) dates.reverse() data.reverse() station_id = station_json['observations']['header'][0]['ID'] return station_id, pd.Series(data, dates) measurand = 'air_temp' source = 'BOM_OBS' freq = '30min' station_id, data = parse(json.load(open(sys.argv[2])), measurand) db.add_measurand(measurand, measurand, 'Air Temperature') db.add_source('BOM_OBS', 'Australian Bureau of Meteorology Observations') db.add_timeseries(station_id) db.add_timeseries_instance(station_id, freq, 'None', measurand = measurand, source = source) db.write(station_id, freq, data, measurand = measurand, source = source) for i in range(3, len(sys.argv)): print("Processing file: ", sys.argv[i], '...') try: x = parse(json.load(open(sys.argv[i])), measurand) db.write(station_id, freq, x, measurand = measurand, source = source) except ValueError as e: print('Could not parse: {0}'.format(sys.argv[i]))
from datetime import date from phildb.create import create from phildb.database import PhilDB create('pypi_downloads') from count import write_downloads db = PhilDB('pypi_downloads') db.add_source('pypi', 'pypi.python.org') db.add_measurand('last_day', 'last_day', 'Downloads in the last day') db.add_measurand('last_week', 'last_week', 'Downloads in the last week') db.add_measurand('last_month', 'last_month', 'Downloads in the last month') db.add_measurand('total', 'total', 'Total downloads') # Write some download information I had manually collected over the last few days write_downloads( { 'info': { 'name': 'PhilDB', 'downloads': {'last_day': 6, 'last_month': 572, 'last_week': 74} } }, date(2015, 11, 12) ) write_downloads( { 'info': { 'name': 'PhilDB', 'downloads': {'last_day': 20, 'last_month': 596, 'last_week': 92} }
data.append(ob[measurand]) dates.reverse() data.reverse() station_id = station_json['observations']['header'][0]['ID'] return station_id, pd.Series(data, dates) measurand = 'air_temp' source = 'BOM_OBS' freq = '30min' station_id, data = parse(json.load(open(sys.argv[2])), measurand) db.add_measurand(measurand, measurand, 'Air Temperature') db.add_source('BOM_OBS', 'Australian Bureau of Meteorology Observations') db.add_timeseries(station_id) db.add_timeseries_instance(station_id, freq, 'None', measurand=measurand, source=source) db.write(station_id, freq, data, measurand=measurand, source=source) for i in range(3, len(sys.argv)): print("Processing file: ", sys.argv[i], '...') try: x = parse(json.load(open(sys.argv[i])), measurand)
import os
import sys
import datetime
import pandas as pd
from phildb.database import PhilDB

# Load ACORN-SAT daily min/max temperature files into an existing PhilDB.
print("Writing to PhilDB({0})".format(sys.argv[1]))
db = PhilDB(sys.argv[1])

# Register the two temperature measurands and the data source.
db.add_measurand('maxT', 'MAXIMUM_TEMPERATURE', 'Maximum Temperature')
db.add_measurand('minT', 'MINIMUM_TEMPERATURE', 'Minimum Temperature')
# NOTE(review): this description reads like the HRS dataset, not ACORN-SAT —
# looks copy-pasted; confirm before changing the stored metadata.
db.add_source('BOM_ACORN_SAT', 'Bureau of Meteorology; Hydrological Reference Stations dataset.')

freq = 'D'

for raw_station in sys.argv[2:]:
    print("Processing file: ", raw_station, '...')
    # Zero-pad station numbers to the six-digit ACORN-SAT form.
    station_id = "{0:06d}".format(int(os.path.basename(raw_station)))
    print("Using station ID: ", station_id, '...')
    db.add_timeseries(station_id)
    for variable in ('minT', 'maxT'):
        input_file = 'data/acorn.sat.{0}.{1}.daily.txt'.format(variable, station_id)
        df = pd.read_csv(input_file, parse_dates=[0], index_col=0, header=None,
                         skiprows=1, sep=r"\s+", na_values='99999.9',
                         names=['Date', variable])
        db.add_timeseries_instance(station_id, freq, 'ACORN-SAT',
                                   measurand=variable, source='BOM_ACORN_SAT')
        db.write(station_id, freq, df[variable],
                 measurand=variable, source='BOM_ACORN_SAT')