Example #1
def add_to_database(**kwargs):
    objs = kwargs['ti'].xcom_pull(key='object_location',
                                  task_ids='generate_object_list')
    logging.info(f'Processing object list from {objs}')
    with open(objs, 'r') as f:
        wl = read_object_list(f)

    # execution_date = kwargs['execution_date'].strftime('%Y-%m-%dT%H-%M')
    # previous_run = kwargs['prev_execution_date'].strftime('%Y-%m-%dT%H-%M')
    # filtered = list(filter_objects(all_objects=wl, start_date=previous_run, end_date=execution_date))
    filtered = list(wl)

    station_dao, series_dao, mes_dao = setup_daos()
    records = 0

    for obj in filtered:
        for record in get_jsons_from_object(bucket=FETCHES_BUCKET,
                                            object_name=obj['Name']):
            station, measurement, _ = split_record(record)
            add_to_db(station_dao=station_dao,
                      series_dao=series_dao,
                      mes_dao=mes_dao,
                      station=station,
                      measurement=measurement)
            records += 1

    print_db_stats(station_dao, series_dao, mes_dao)
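
A minimal sketch of how this callable could be wired into a DAG, assuming Airflow 1.x-style context passing; the DAG id, schedule, and start date below are illustrative, not taken from the project:

from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

# Hypothetical DAG wiring; provide_context=True is what makes 'ti' and the
# execution-date kwargs available inside add_to_database above. The
# xcom_pull call implies a 'generate_object_list' task runs upstream.
dag = DAG(dag_id='openaq_import',
          start_date=datetime(2018, 1, 1),
          schedule_interval='@daily')

add_task = PythonOperator(task_id='add_to_database',
                          python_callable=add_to_database,
                          provide_context=True,
                          dag=dag)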
Example #2
    def test_counter(self):
        with open('./tests/series.ndjson', 'rb') as f:
            ll = list(
                get_jsons_from_stream(stream=f, object_name='series.ndjson'))
            self.assertEqual(15, len(ll))

        for rec in ll:
            _, measurement, _ = split_record(rec)
            station_id = self.smdao.store_from_json(rec)
            series_id = self.dao.store(
                station_id=station_id,
                parameter=measurement['parameter'],
                unit=measurement['unit'],
                averagingPeriod=f"{measurement['averagingPeriod']}")

            mes_id = self.mdao.store(series_id=series_id,
                                     value=measurement['value'],
                                     date=measurement['date']['utc'])
        mes = self.mdao.get_all()
        self.assertEqual(15, len(mes))
        self.assertEqual(self.mdao.count(), 15)

        self.assertEqual(4, self.smdao.count())
        self.assertEqual(6, self.dao.count())

        self.assertEqual(4, self.mdao.count(series_id=1))
        self.assertEqual(0, self.mdao.count(series_id=838232))

        self.assertEqual(0, self.dao.count(station_id=1212))
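
The fixture these tests load is newline-delimited JSON, one record per line in the shape shown in Examples #8 and #13. Stripped of the project's helpers, reading it boils down to roughly this (a sketch; get_jsons_from_stream's actual error handling is not shown on this page):

import json

with open('./tests/series.ndjson', 'rb') as f:
    # One JSON document per line; the tests expect 15 records that
    # collapse to 4 distinct stations and 6 distinct series.
    records = [json.loads(line) for line in f if line.strip()]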
Example #3
def update_last(**kwargs):
    prefix = get_prefix(**kwargs)
    target_dir = os.path.join(Variable.get('target_dir'), prefix)
    logging.info(f'Will be processing [{target_dir}]')

    flist = list_directory(target_dir)
    logging.info(f'Files detected: {len(flist)}')

    previous_run = kwargs['prev_execution_date']
    next_run = kwargs['next_execution_date']
    filtered_list = filter_file_list(
        flist=flist, previous_run=previous_run, next_run=next_run)
    logging.info(f'Previous run was @{previous_run}, next will be @{next_run}. File list reduced to: {len(filtered_list)}')

    station_dao, series_dao, mes_dao = setup_daos()
    m = 0

    for fname in filtered_list:
        logging.info(f'Analyzing {fname}')

        with open(fname, 'rb') as f:
            for record in get_jsons_from_stream(stream=f, object_name=fname):
                station, measurement, _ = split_record(record)
                m += 1
                add_to_db(station_dao, series_dao, mes_dao, station=station,
                          measurement=measurement)

    logging.info(f'Number of measurements added to DB: {m}')
    print_db_stats(station_dao, series_dao, mes_dao)
    return True
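
filter_file_list is not shown on this page; judging by its arguments, a modification-time window over the two run dates is the likely shape. A sketch under that assumption (the real helper may key on file names instead):

import os
from datetime import datetime

def filter_file_list(flist, previous_run, next_run):
    # Hypothetical: keep files last modified within [previous_run, next_run).
    # Airflow passes timezone-aware dates, hence the tzinfo reuse.
    filtered = []
    for fname in flist:
        mtime = datetime.fromtimestamp(os.path.getmtime(fname),
                                       tz=previous_run.tzinfo)
        if previous_run <= mtime < next_run:
            filtered.append(fname)
    return filtered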
Example #4
    def test_store_unique_series(self):

        asj = json.loads(jseg)
        id1 = self.smdao.store_from_json(asj)
        self.assertIsNotNone(id1)

        station, measurement, _ = split_record(json.loads(jseg))

        res1 = self.dao.store(station_id=id1,
                              parameter=measurement['parameter'],
                              unit=measurement['unit'],
                              averagingPeriod="")

        res2 = self.dao.store(station_id=id1,
                              parameter=measurement['parameter'],
                              unit=measurement['unit'],
                              averagingPeriod="")

        print(f'{res1} == {res2}')
        self.assertIsNotNone(res2)
        self.assertIsNotNone(res1)
        self.assertEqual(res1, res2)

        result = self.dao.get_all()
        self.assertEqual(1, len(result))
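
Both store calls returning the same id with only one stored row means the DAO deduplicates on the (station_id, parameter, unit, averagingPeriod) tuple. A minimal select-before-insert sketch of that behaviour (table and column names are invented; the project's actual schema is not shown on this page):

import sqlite3

def store_series(conn, station_id, parameter, unit, averaging_period):
    # Hypothetical idempotent insert: return the existing row's id if the
    # same series was stored before, otherwise insert and return the new id.
    row = conn.execute(
        'SELECT id FROM series WHERE station_id=? AND parameter=? '
        'AND unit=? AND averaging_period=?',
        (station_id, parameter, unit, averaging_period)).fetchone()
    if row:
        return row[0]
    cur = conn.execute(
        'INSERT INTO series (station_id, parameter, unit, averaging_period) '
        'VALUES (?, ?, ?, ?)',
        (station_id, parameter, unit, averaging_period))
    conn.commit()
    return cur.lastrowid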
Example #5
    def test_get_forstation(self):
        with open('./tests/series.ndjson', 'rb') as f:
            ll = list(
                get_jsons_from_stream(stream=f, object_name='series.ndjson'))
            self.assertEqual(15, len(ll))

        for rec in ll:
            _, measurement, _ = split_record(rec)
            station_id = self.smdao.store_from_json(rec)
            series_id = self.dao.store(
                station_id=station_id,
                parameter=measurement['parameter'],
                unit=measurement['unit'],
                averagingPeriod=f"{measurement['averagingPeriod']}")

            mes_id = self.mdao.store(series_id=series_id,
                                     value=measurement['value'],
                                     date=measurement['date']['utc'])
        mes = self.mdao.get_all()
        self.assertEqual(15, len(mes))

        only_one = self.dao.get_all_for_station(station_id="Nisekh")

        self.assertEqual(1, len(only_one))

        more_than_one = self.dao.get_all_for_station(
            station_id="Sankt Eriksgatan")

        self.assertEqual(3, len(more_than_one))
        my_series = self.dao.get_for_id(series_id=1)

        self.assertEqual(my_series[2], 'pm10')
Example #6
    def test_multiple_inserts(self):

        with open('./tests/series.ndjson', 'rb') as f:
            ll = list(
                get_jsons_from_stream(stream=f, object_name='series.ndjson'))
            self.assertEqual(len(ll), 15)

        for rec in ll:
            station, measurement, _ = split_record(rec)
            station_id = self.smdao.store_from_json(rec)
            series_id = self.dao.store(
                station_id=station_id,
                parameter=measurement['parameter'],
                unit=measurement['unit'],
                averagingPeriod=f"{measurement['averagingPeriod']}")

            mes_id = self.mdao.store(series_id=series_id,
                                     value=measurement['value'],
                                     date=measurement['date']['utc'])

        stations = self.smdao.get_all()
        self.assertEqual(len(stations), 4)

        series = self.dao.get_all()
        self.assertEqual(len(series), 6)

        mes = self.mdao.get_all()
        self.assertEqual(len(mes), 15)
Example #7
def local_process_file(object_name):
    ret = []
    for record in get_jsons_from_object(bucket=FETCHES_BUCKET,
                                        object_name=object_name):
        station, measurement, _ = split_record(record)

        ret.append([station, measurement])
    return ret
Example #8
    def test_split_single(self):
        strr = '{"date":{"utc":"2015-12-10T01:00:00.000Z","local":"2015-12-10T08:00:00+07:00"},"parameter":"no2","location":"Pak Nam, Mueang","value":0.058,"unit":"ppm","city":"Samut Prakan","attribution":[{"name":"Pollution Control Department","url":"http://www.aqmthai.com/index.php"}],"averagingPeriod":{"value":1,"unit":"hours"},"coordinates":null,"country":"TH","sourceName":"Thailand"}'
        asj = json.loads(strr)
        print(f'got {asj}')
        self.assertIsNotNone(asj)
        station, measurement, ext = split_record(asj)
        self.assertIn('coordinates', station)
        self.assertEqual(station['coordinates']['latitude'], 0)
Example #9
    def test_foreignkey_violation(self):

        station, measurement, _ = split_record(json.loads(jseg))

        res = self.dao.store(station_id='non-existing name',
                             parameter=measurement['parameter'],
                             unit=measurement['unit'],
                             averagingPeriod="")
        result = self.dao.get_all()
        assert len(result) == 1

        if self.engine.get_engine().name == 'sqlite':
            # SQLite does not enforce foreign keys by default, so raise the
            # error manually to match what other engines would report.
            raise IntegrityError(statement='SQLite',
                                 params='does not support',
                                 orig='foreign key violation',
                                 connection_invalidated=False,
                                 code=None)
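
SQLite ships with foreign-key enforcement switched off, which is why the test fakes the IntegrityError above. Enforcement can be enabled per connection; the standard SQLAlchemy recipe looks like this (the in-memory URL is illustrative):

from sqlalchemy import create_engine, event

engine = create_engine('sqlite://')

@event.listens_for(engine, 'connect')
def _enable_fks(dbapi_connection, connection_record):
    # SQLite only enforces foreign keys when this PRAGMA is set
    # on each new connection.
    cursor = dbapi_connection.cursor()
    cursor.execute('PRAGMA foreign_keys=ON')
    cursor.close()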
Example #10
    def test_insert_all(self):

        station, measurement, _ = split_record(json.loads(jseg))
        station_id = self.smdao.store_from_json(station)

        series_id = self.dao.store(station_id=station_id,
                                   parameter=measurement['parameter'],
                                   unit=measurement['unit'],
                                   averagingPeriod="")

        result = self.dao.get_all()
        self.assertEqual(len(result), 1)

        mes_id = self.mdao.store(series_id=series_id,
                                 value=measurement['value'],
                                 date=measurement['date']['utc'])
        self.assertTrue(mes_id)
        res = self.mdao.get_all()
        self.assertEqual(len(res), 1)
Example #11
    def test_massive_split(self):
        client = Mock()
        with open('./tests/exobj.ndjson', 'rb') as f:
            client.get_object = MagicMock(return_value={'Body': f})

            for jl in get_jsons_from_object(bucket='bucket',
                                            object_name='obj',
                                            client=client):
                station, measurement, ext = split_record(jl)
                assert 'location' in station
                assert 'value' in measurement
                assert 'location' not in measurement
                assert 'city' in station
                assert 'city' not in ext
                assert 'location' not in ext
                assert 'attribution' in ext
                assert 'sourceName' in station
                assert 'sourceName' in measurement
Example #12
def go_through(**kwargs):
    prefix = get_prefix(**kwargs)
    target_dir = os.path.join(Variable.get('target_dir'), prefix)
    logging.info(f'Will be processing [{target_dir}]')

    flist = glob.glob(os.path.join(target_dir, '*'))
    logging.info(f'Files detected: {len(flist)}')

    station_dao, series_dao, mes_dao = setup_daos()

    for fname in flist:
        logging.info(f'Processing {fname}')
        with open(fname, 'rb') as f:
            for record in get_jsons_from_stream(stream=f, object_name=fname):
                station, measurement, _ = split_record(record)
                add_to_db(station_dao,
                          series_dao,
                          mes_dao,
                          station=station,
                          measurement=measurement)

    print_db_stats(station_dao, series_dao, mes_dao)
    return True
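
get_jsons_from_stream appears throughout but is never shown; since the fixtures are ndjson, a line-by-line decoder is the natural reading. A sketch under that assumption (object_name presumably only matters for logging):

import json
import logging

def get_jsons_from_stream(stream, object_name):
    # Hypothetical reconstruction: one parsed JSON object per input line.
    for n, line in enumerate(stream):
        if not line.strip():
            continue
        try:
            yield json.loads(line)
        except ValueError:
            logging.warning(f'Skipping malformed line {n} of {object_name}')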
Example #13
    def test_split_record(self):
        rec = """{"date":{"utc":"2018-06-06T23:00:00.000Z","local":"2018-06-07T05:00:00+06:00"},
            "parameter":"pm25",
            "location":"US Diplomatic Post: Dhaka",
            "value":27,
            "unit":"µg/m³",
            "city":"Dhaka",
            "attribution":[{"name":"EPA AirNow DOS","url":"http://airnow.gov/index.cfm?action=airnow.global_summary"}],
            "averagingPeriod":{"value":1,"unit":"hours"},
            "coordinates":{"latitude":23.796373,"longitude":90.424614},
            "country":"BD",
            "sourceName":"StateAir_Dhaka",
            "sourceType":"government",
            "mobile":false}"""
        jl = json.loads(rec)

        station, measurement, ext = split_record(jl)
        assert 'location' in station
        assert 'value' in measurement
        assert 'location' not in measurement
        assert 'city' in station
        assert 'city' not in ext
        assert 'location' not in ext
        assert 'attribution' in ext
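
Taken together, these assertions pin down split_record's contract: station-level keys (location, city, country, coordinates, sourceName) go to the first dict, measurement-level keys (parameter, value, unit, averagingPeriod, date, plus sourceName again) to the second, and leftovers such as attribution to the third. A sketch consistent with the assertions on this page (the real key lists may differ):

def split_record(record):
    # Hypothetical reconstruction from the test assertions only.
    station_keys = {'location', 'city', 'country', 'coordinates',
                    'sourceName', 'sourceType', 'mobile'}
    measurement_keys = {'parameter', 'value', 'unit', 'averagingPeriod',
                        'date', 'sourceName'}
    station = {k: v for k, v in record.items() if k in station_keys}
    measurement = {k: v for k, v in record.items() if k in measurement_keys}
    ext = {k: v for k, v in record.items()
           if k not in station_keys and k not in measurement_keys}
    # Example #8 expects null coordinates to come back as zeros.
    if station.get('coordinates') is None:
        station['coordinates'] = {'latitude': 0, 'longitude': 0}
    return station, measurement, ext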
Example #14
def local_process_file(object_name):
    for record in get_jsons_from_object(bucket=FETCHES_BUCKET,
                                        object_name=object_name):
        station, measurement, _ = split_record(record)

        yield [station, measurement]
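
Unlike the list-building variant in Example #7, this version yields pairs lazily, so callers can stream large objects without holding every record in memory. An illustrative use (the object name is a placeholder):

for station, measurement in local_process_file('2018-06-06/objects.ndjson'):
    print(station.get('location'), measurement.get('value'))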