def add_to_database(**kwargs):
    # Locate the object list produced by the upstream task via XCom.
    objs = kwargs['ti'].xcom_pull(key='object_location',
                                  task_ids='generate_object_list')
    logging.info(f'Processing object list from {objs}')
    with open(objs, 'r') as f:
        wl = read_object_list(f)

    # Date-window filtering is currently disabled; all objects are processed.
    # execution_date = kwargs['execution_date'].strftime('%Y-%m-%dT%H-%M')
    # previous_run = kwargs['prev_execution_date'].strftime('%Y-%m-%dT%H-%M')
    # filtered = list(filter_objects(all_objects=wl, start_date=previous_run,
    #                                end_date=execution_date))
    filtered = list(wl)

    station_dao, series_dao, mes_dao = setup_daos()

    records = 0
    for obj in filtered:
        for record in get_jsons_from_object(bucket=FETCHES_BUCKET,
                                            object_name=obj['Name']):
            station, measurement, _ = split_record(record)
            add_to_db(station_dao=station_dao, series_dao=series_dao,
                      mes_dao=mes_dao, station=station,
                      measurement=measurement)
            records += 1

    print_db_stats(station_dao, series_dao, mes_dao)
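# The commented-out block above preserves the disabled date-window filter.
# A minimal sketch of re-enabling it, assuming filter_objects keeps the
# signature shown in those comments (the helper name is hypothetical):
def filter_by_run_window(wl, **kwargs):
    execution_date = kwargs['execution_date'].strftime('%Y-%m-%dT%H-%M')
    previous_run = kwargs['prev_execution_date'].strftime('%Y-%m-%dT%H-%M')
    return list(filter_objects(all_objects=wl,
                               start_date=previous_run,
                               end_date=execution_date))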
def test_counter(self):
    with open('./tests/series.ndjson', 'rb') as f:
        ll = list(
            get_jsons_from_stream(stream=f, object_name='series.ndjson'))
    self.assertEqual(15, len(ll))
    for rec in ll:
        _, measurement, _ = split_record(rec)
        station_id = self.smdao.store_from_json(rec)
        series_id = self.dao.store(
            station_id=station_id,
            parameter=measurement['parameter'],
            unit=measurement['unit'],
            averagingPeriod=f"{measurement['averagingPeriod']}")
        mes_id = self.mdao.store(series_id=series_id,
                                 value=measurement['value'],
                                 date=measurement['date']['utc'])
    mes = self.mdao.get_all()
    self.assertEqual(15, len(mes))
    self.assertEqual(self.mdao.count(), 15)
    self.assertEqual(4, self.smdao.count())
    self.assertEqual(6, self.dao.count())
    self.assertEqual(4, self.mdao.count(series_id=1))
    self.assertEqual(0, self.mdao.count(series_id=838232))
    self.assertEqual(0, self.dao.count(station_id=1212))
def update_last(**kwargs):
    prefix = get_prefix(**kwargs)
    target_dir = os.path.join(Variable.get('target_dir'), prefix)
    logging.info(f'Will be processing [{target_dir}]')
    flist = list_directory(target_dir)
    logging.info(f'Files detected: {len(flist)}')

    # Only look at files that belong to the current scheduling window.
    previous_run = kwargs['prev_execution_date']
    next_run = kwargs['next_execution_date']
    filtered_list = filter_file_list(flist=flist,
                                     previous_run=previous_run,
                                     next_run=next_run)
    logging.info(f'Previous run was @{previous_run}, next will be '
                 f'@{next_run}. File list reduced to: {len(filtered_list)}')

    station_dao, series_dao, mes_dao = setup_daos()

    m = 0
    for fname in filtered_list:
        logging.info(f'Analyzing {fname}')
        with open(fname, 'rb') as f:
            for record in get_jsons_from_stream(stream=f, object_name=fname):
                station, measurement, _ = split_record(record)
                m += 1
                add_to_db(station_dao, series_dao, mes_dao,
                          station=station, measurement=measurement)

    logging.info(f'Number of measurements added to DB: {m}')
    print_db_stats(station_dao, series_dao, mes_dao)
    return True
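# A minimal sketch of the filtering step above, assuming filter_file_list
# keeps files whose modification time falls between the previous and the
# next scheduled run. Both the name suffix and the mtime criterion are
# assumptions, not confirmed by the codebase.
def filter_file_list_sketch(flist, previous_run, next_run):
    import datetime

    selected = []
    for fname in flist:
        mtime = datetime.datetime.fromtimestamp(
            os.path.getmtime(fname), tz=datetime.timezone.utc)
        # Keep files touched since the previous run but before the next one.
        if previous_run <= mtime < next_run:
            selected.append(fname)
    return selected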
def test_store_unique_series(self):
    asj = json.loads(jseg)
    id1 = self.smdao.store_from_json(asj)
    self.assertIsNotNone(id1)
    station, measurement, _ = split_record(json.loads(jseg))
    res1 = self.dao.store(station_id=id1,
                          parameter=measurement['parameter'],
                          unit=measurement['unit'],
                          averagingPeriod="")
    res2 = self.dao.store(station_id=id1,
                          parameter=measurement['parameter'],
                          unit=measurement['unit'],
                          averagingPeriod="")
    print(f'{res1} == {res2}')
    self.assertIsNotNone(res2)
    self.assertIsNotNone(res1)
    self.assertEqual(res1, res2)
    result = self.dao.get_all()
    self.assertEqual(1, len(result))
def test_get_forstation(self):
    with open('./tests/series.ndjson', 'rb') as f:
        ll = list(
            get_jsons_from_stream(stream=f, object_name='series.ndjson'))
    self.assertEqual(15, len(ll))
    for rec in ll:
        _, measurement, _ = split_record(rec)
        station_id = self.smdao.store_from_json(rec)
        series_id = self.dao.store(
            station_id=station_id,
            parameter=measurement['parameter'],
            unit=measurement['unit'],
            averagingPeriod=f"{measurement['averagingPeriod']}")
        mes_id = self.mdao.store(series_id=series_id,
                                 value=measurement['value'],
                                 date=measurement['date']['utc'])
    mes = self.mdao.get_all()
    self.assertEqual(15, len(mes))

    only_one = self.dao.get_all_for_station(station_id="Nisekh")
    self.assertEqual(1, len(only_one))
    more_than_one = self.dao.get_all_for_station(
        station_id="Sankt Eriksgatan")
    self.assertEqual(3, len(more_than_one))
    my_series = self.dao.get_for_id(series_id=1)
    self.assertEqual(my_series[2], 'pm10')
def test_multiple_inserts(self):
    with open('./tests/series.ndjson', 'rb') as f:
        ll = list(
            get_jsons_from_stream(stream=f, object_name='series.ndjson'))
    self.assertEqual(len(ll), 15)
    for rec in ll:
        station, measurement, _ = split_record(rec)
        station_id = self.smdao.store_from_json(rec)
        series_id = self.dao.store(
            station_id=station_id,
            parameter=measurement['parameter'],
            unit=measurement['unit'],
            averagingPeriod=f"{measurement['averagingPeriod']}")
        mes_id = self.mdao.store(series_id=series_id,
                                 value=measurement['value'],
                                 date=measurement['date']['utc'])
    stations = self.smdao.get_all()
    self.assertEqual(len(stations), 4)
    series = self.dao.get_all()
    self.assertEqual(len(series), 6)
    mes = self.mdao.get_all()
    self.assertEqual(len(mes), 15)
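# test_counter, test_get_forstation and test_multiple_inserts all repeat
# the same load-and-store loop over ./tests/series.ndjson. A hypothetical
# helper could factor it out; `ingest_ndjson` is an assumed name, not part
# of the codebase, and it relies only on calls the tests already use.
def ingest_ndjson(path, smdao, series_dao, mdao):
    with open(path, 'rb') as f:
        records = list(get_jsons_from_stream(stream=f, object_name=path))
    for rec in records:
        _, measurement, _ = split_record(rec)
        station_id = smdao.store_from_json(rec)
        series_id = series_dao.store(
            station_id=station_id,
            parameter=measurement['parameter'],
            unit=measurement['unit'],
            averagingPeriod=f"{measurement['averagingPeriod']}")
        mdao.store(series_id=series_id,
                   value=measurement['value'],
                   date=measurement['date']['utc'])
    return records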
def local_process_file(object_name):
    ret = []
    for record in get_jsons_from_object(bucket=FETCHES_BUCKET,
                                        object_name=object_name):
        station, measurement, _ = split_record(record)
        ret.append([station, measurement])
    return ret
def test_split_single(self):
    strr = ('{"date":{"utc":"2015-12-10T01:00:00.000Z","local":"2015-12-10T08:00:00+07:00"},'
            '"parameter":"no2","location":"Pak Nam, Mueang","value":0.058,"unit":"ppm",'
            '"city":"Samut Prakan","attribution":[{"name":"Pollution Control Department",'
            '"url":"http://www.aqmthai.com/index.php"}],'
            '"averagingPeriod":{"value":1,"unit":"hours"},"coordinates":null,'
            '"country":"TH","sourceName":"Thailand"}')
    asj = json.loads(strr)
    print(f'got {asj}')
    self.assertIsNotNone(asj)
    station, measurement, ext = split_record(asj)
    self.assertIn('coordinates', station)
    # The source record has "coordinates":null; split_record is expected
    # to backfill missing coordinates with zeros.
    self.assertEqual(station['coordinates']['latitude'], 0)
def test_foreignkey_violation(self):
    station, measurement, _ = split_record(json.loads(jseg))
    res = self.dao.store(station_id='non-existing name',
                         parameter=measurement['parameter'],
                         unit=measurement['unit'],
                         averagingPeriod="")
    result = self.dao.get_all()
    assert len(result) == 1
    # SQLite does not enforce the foreign key here, so the store above
    # succeeds; raise IntegrityError manually so the test still reports
    # the violation it expects on that backend.
    if self.engine.get_engine().name == 'sqlite':
        raise IntegrityError(statement='SQLITE',
                             params='does not support',
                             orig='foreign key violation',
                             connection_invalidated=False,
                             code=None)
def test_insert_all(self):
    station, measurement, _ = split_record(json.loads(jseg))
    station_id = self.smdao.store_from_json(station)
    series_id = self.dao.store(station_id=station_id,
                               parameter=measurement['parameter'],
                               unit=measurement['unit'],
                               averagingPeriod="")
    result = self.dao.get_all()
    self.assertEqual(len(result), 1)
    mes_id = self.mdao.store(series_id=series_id,
                             value=measurement['value'],
                             date=measurement['date']['utc'])
    self.assertTrue(mes_id)
    res = self.mdao.get_all()
    self.assertEqual(len(res), 1)
def test_massive_split(self):
    client = Mock()
    with open('./tests/exobj.ndjson', 'rb') as f:
        client.get_object = MagicMock(return_value={'Body': f})
        for jl in get_jsons_from_object(bucket='bucket', object_name='obj',
                                        client=client):
            station, measurement, ext = split_record(jl)
            assert 'location' in station
            assert 'value' in measurement
            assert 'location' not in measurement
            assert 'city' in station
            assert 'city' not in ext
            assert 'location' not in ext
            assert 'attribution' in ext
            assert 'sourceName' in station
            assert 'sourceName' in measurement
def go_through(**kwargs):
    prefix = get_prefix(**kwargs)
    target_dir = os.path.join(Variable.get('target_dir'), prefix)
    logging.info(f'Will be processing [{target_dir}]')
    flist = glob.glob(os.path.join(target_dir, '*'))
    logging.info(f'Files detected: {len(flist)}')

    station_dao, series_dao, mes_dao = setup_daos()

    for fname in flist:
        logging.info(f'Processing {fname}')
        with open(fname, 'rb') as f:
            for record in get_jsons_from_stream(stream=f, object_name=fname):
                station, measurement, _ = split_record(record)
                add_to_db(station_dao, series_dao, mes_dao,
                          station=station, measurement=measurement)

    print_db_stats(station_dao, series_dao, mes_dao)
    return True
def test_split_record(self):
    rec = """{"date":{"utc":"2018-06-06T23:00:00.000Z","local":"2018-06-07T05:00:00+06:00"},
    "parameter":"pm25",
    "location":"US Diplomatic Post: Dhaka",
    "value":27,
    "unit":"µg/m³",
    "city":"Dhaka",
    "attribution":[{"name":"EPA AirNow DOS","url":"http://airnow.gov/index.cfm?action=airnow.global_summary"}],
    "averagingPeriod":{"value":1,"unit":"hours"},
    "coordinates":{"latitude":23.796373,"longitude":90.424614},
    "country":"BD",
    "sourceName":"StateAir_Dhaka",
    "sourceType":"government",
    "mobile":false}"""
    jl = json.loads(rec)
    station, measurement, ext = split_record(jl)
    assert 'location' in station
    assert 'value' in measurement
    assert 'location' not in measurement
    assert 'city' in station
    assert 'city' not in ext
    assert 'location' not in ext
    assert 'attribution' in ext
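# Taken together, test_split_record, test_massive_split and
# test_split_single pin down the contract of split_record: station-level
# fields go to `station`, per-observation fields to `measurement`, the
# rest (e.g. "attribution") to `ext`, "sourceName" is duplicated into
# station and measurement, and null coordinates are backfilled with
# zeros. A hypothetical implementation consistent with those assertions
# (the exact field lists are assumptions):
def split_record_sketch(record):
    station_keys = ('location', 'city', 'country', 'coordinates',
                    'sourceName', 'sourceType', 'mobile')
    measurement_keys = ('parameter', 'value', 'unit', 'date',
                        'averagingPeriod', 'sourceName')
    station = {k: record[k] for k in station_keys if k in record}
    measurement = {k: record[k] for k in measurement_keys if k in record}
    ext = {k: v for k, v in record.items()
           if k not in station_keys and k not in measurement_keys}
    if station.get('coordinates') is None:
        # test_split_single expects null coordinates replaced by zeros.
        station['coordinates'] = {'latitude': 0, 'longitude': 0}
    return station, measurement, ext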
def local_process_file(object_name):
    # Generator variant of local_process_file: yields (station, measurement)
    # pairs lazily instead of materializing the whole list.
    for record in get_jsons_from_object(bucket=FETCHES_BUCKET,
                                        object_name=object_name):
        station, measurement, _ = split_record(record)
        yield [station, measurement]
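# A usage sketch for the generator variant above: iterating consumes the
# object record by record, keeping memory flat for large objects. The
# object name below is hypothetical.
if __name__ == '__main__':
    for station, measurement in local_process_file('example.ndjson'):
        print(station.get('location'), measurement.get('value'))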