def test_general_exception_inside_yield(self):
    # supply an empty string, otherwise urllib read does not stop:
    data = [b'none', b''] * 10000
    self.config_urlopen(data)  # , sleep_time=1)
    # self.urls has a valid url (which should execute onsuccess) and an invalid
    # one (which should execute onerror):
    with pytest.raises(KeyboardInterrupt):
        self.read_async_raise_exc_in_called_func(self.urls)
    assert self.progress == 0
    # set the total counts of mock_urlread: 2 * len(urls):
    totalcounts = 2 * len(self.urls)
    # assert we stopped before reading all url(s). Relax the condition with <=:
    # self.mock_urlread.call_count == totalcounts does not necessarily mean the test
    # failed, it can be due to the fact that we mock io-bound operations in urlread
    # with non-io-bound operations
    assert self.mock_urlread.call_count <= totalcounts

    # same, regardless of the value returned by urllib2:
    self.config_urlopen([URLError("")], sleep_time=None)
    # self.urls has a valid url (which should execute onsuccess) and an invalid
    # one (which should execute onerror):
    with pytest.raises(KeyboardInterrupt):
        self.read_async_raise_exc_in_called_func(self.urls)
    assert self.progress == 0
def test_urlerrors(self):
    """Tests onerror. We mock urllib2's urlopen.read to raise an expected Exception"""
    self.config_urlopen([URLError("")])
    # self.urls has a valid url (which should execute onsuccess) and an invalid
    # one (which should execute onerror):
    self.read_async(self.urls)
    assert len(self.errors) == 2
    assert self.mock_urlread.call_count == len(self.urls)
    assert self.progress == 2
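# Both tests above exercise the callback contract of read_async: each fetched URL
# ends up in either an "onsuccess" or an "onerror" call, and an exception raised in
# the consuming code must abort the remaining reads. The standalone sketch below
# illustrates that dispatch pattern only; the function and callback names are
# hypothetical and this is NOT the package's actual API or implementation:
from urllib.error import URLError
from urllib.request import urlopen

def read_async_sketch(urls, onsuccess, onerror):
    """Illustrative, sequential version of the success/error dispatch the tests
    above rely on (the real code uses a thread pool)."""
    for url in urls:
        try:
            data = urlopen(url).read()
        except (URLError, OSError) as exc:
            onerror(url, exc)      # the test counts these in self.errors
        else:
            onsuccess(url, data)   # the test counts these in self.progress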
def test_download_save_segments(self, mock_updatedf, mock_insertdf, mseed_unpack, db, tt_ak135_tts): # prepare: # mseed unpack takes no starttime and endtime arguments, so that # we do not discard any correct chunk mseed_unpack.side_effect = lambda *a, **v: unpack(a[0]) mock_insertdf.side_effect = lambda *a, **v: insertdf(*a, **v) mock_updatedf.side_effect = lambda *a, **v: updatedf(*a, **v) urlread_sideeffect = None # use defaults from class events_df = self.get_events_df(urlread_sideeffect, db.session) net, sta, loc, cha = [], [], [], [] datacenters_df, eidavalidator = \ self.get_datacenters_df(urlread_sideeffect, db.session, self.service, self.routing_service, net, sta, loc, cha, db_bufsize=self.db_buf_size) channels_df = self.get_channels_df(urlread_sideeffect, db.session, datacenters_df, eidavalidator, net, sta, loc, cha, None, None, 10, False, None, None, -1, self.db_buf_size) assert len(channels_df) == 12 # just to be sure. If failing, we might have changed the class default # events_df # id magnitude latitude longitude depth_km time # 0 20160508_0000129 3.0 1.0 1.0 60.0 2016-05-08 05:17:11.500 # 1 20160508_0000004 4.0 2.0 2.0 2.0 2016-05-08 01:45:30.300 # channels_df (index not shown): # columns: # id station_id latitude longitude datacenter_id start_time end_time network station location channel # data (not aligned with columns): # 1 1 1.0 1.0 1 2003-01-01 NaT GE FLT1 HHE # 2 1 1.0 1.0 1 2003-01-01 NaT GE FLT1 HHN # 3 1 1.0 1.0 1 2003-01-01 NaT GE FLT1 HHZ # 4 2 90.0 90.0 1 2009-01-01 NaT n1 s c1 # 5 2 90.0 90.0 1 2009-01-01 NaT n1 s c2 # 6 2 90.0 90.0 1 2009-01-01 NaT n1 s c3 # 7 3 1.0 1.0 2 2003-01-01 NaT IA BAKI BHE # 8 3 1.0 1.0 2 2003-01-01 NaT IA BAKI BHN # 9 3 1.0 1.0 2 2003-01-01 NaT IA BAKI BHZ # 10 4 90.0 90.0 2 2009-01-01 NaT n2 s c1 # 11 4 90.0 90.0 2 2009-01-01 NaT n2 s c2 # 12 4 90.0 90.0 2 2009-01-01 NaT n2 s c3 assert all(_ in channels_df.columns for _ in [Station.network.key, Station.station.key, Channel.location.key, Channel.channel.key]) chaid2mseedid = chaid2mseedid_dict(channels_df) # check that we removed the columns: assert not any(_ in channels_df.columns for _ in [Station.network.key, Station.station.key, Channel.location.key, Channel.channel.key]) # take all segments: # use minmag and maxmag ttable = tt_ak135_tts segments_df = merge_events_stations(events_df, channels_df, dict(minmag=10, maxmag=10, minmag_radius=10, maxmag_radius=10), tttable=ttable) assert len(pd.unique(segments_df['arrival_time'])) == 2 h = 9 # segments_df (index not shown). 
Note that # cid sid did n s l c ed event_id depth_km time <- LAST TWO ARE Event related columns that will be removed after arrival_time calculations # 1 1 1 GE FLT1 HHE 0.0 20160508_0000129 60.0 2016-05-08 05:17:11.500 # 2 1 1 GE FLT1 HHN 0.0 20160508_0000129 60.0 2016-05-08 05:17:11.500 # 3 1 1 GE FLT1 HHZ 0.0 20160508_0000129 60.0 2016-05-08 05:17:11.500 # 7 3 2 IA BAKI BHE 0.0 20160508_0000129 60.0 2016-05-08 05:17:11.500 # 8 3 2 IA BAKI BHN 0.0 20160508_0000129 60.0 2016-05-08 05:17:11.500 # 9 3 2 IA BAKI BHZ 0.0 20160508_0000129 60.0 2016-05-08 05:17:11.500 # 4 2 1 n1 s c1 0.0 20160508_0000004 2.0 2016-05-08 01:45:30.300 # 5 2 1 n1 s c2 0.0 20160508_0000004 2.0 2016-05-08 01:45:30.300 # 6 2 1 n1 s c3 0.0 20160508_0000004 2.0 2016-05-08 01:45:30.300 # 10 4 2 n2 s c1 0.0 20160508_0000004 2.0 2016-05-08 01:45:30.300 # 11 4 2 n2 s c2 0.0 20160508_0000004 2.0 2016-05-08 01:45:30.300 # 12 4 2 n2 s c3 0.0 20160508_0000004 2.0 2016-05-08 01:45:30.300 # LEGEND: # cid = channel_id # sid = station_id # scid = datacenter_id # n, s, l, c = network, station, location, channel # ed = event_distance_deg # define a dc_dataselect_manager for open data only: dc_dataselect_manager = DcDataselectManager(datacenters_df, Authorizer(None), False) wtimespan = [1,2] expected = len(segments_df) # no segment on db, we should have all segments to download orig_segments_df = segments_df.copy() segments_df, request_timebounds_need_update = \ prepare_for_download(db.session, orig_segments_df, dc_dataselect_manager, wtimespan, retry_seg_not_found=True, retry_url_err=True, retry_mseed_err=True, retry_client_err=True, retry_server_err=True, retry_timespan_err=True, retry_timespan_warn=True) # segments_df # COLUMNS: # channel_id datacenter_id network station location channel event_distance_deg event_id arrival_time start_time end_time id download_status_code run_id # DATA (not aligned with columns): # channel_id datacenter_id network station location channel event_distance_deg event_id arrival_time start_time end_time id download_status_code run_id # GE.FLT1..HHE 1 1 GE FLT1 HHE 0.0 1 2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12 None None 1 # GE.FLT1..HHN 2 1 GE FLT1 HHN 0.0 1 2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12 None None 1 # GE.FLT1..HHZ 3 1 GE FLT1 HHZ 0.0 1 2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12 None None 1 # IA.BAKI..BHE 7 2 IA BAKI BHE 0.0 1 2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12 None None 1 # IA.BAKI..BHN 8 2 IA BAKI BHN 0.0 1 2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12 None None 1 # IA.BAKI..BHZ 9 2 IA BAKI BHZ 0.0 1 2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12 None None 1 # n1.s..c1 4 1 n1 s c1 0.0 2 2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31 None None 1 # n1.s..c2 5 1 n1 s c2 0.0 2 2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31 None None 1 # n1.s..c3 6 1 n1 s c3 0.0 2 2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31 None None 1 # n2.s..c1 10 2 n2 s c1 0.0 2 2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31 None None 1 # n2.s..c2 11 2 n2 s c2 0.0 2 2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31 None None 1 # n2.s..c3 12 2 n2 s c3 0.0 2 2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31 None None 1 # self._segdata is the folder file of a "valid" 3-channel miniseed # The channels are: # Thus, no match will be found and all segments will be written with a None # download status code # 
setup urlread: first three rows: ok # rows[3:6]: 413, retry them # rows[6:9]: malformed_data # rows[9:12] 413, retry them # then retry: # rows[3]: empty_data # rows[4]: data_with_gaps (but seed_id should notmatch) # rows[5]: data_with_gaps (seed_id should notmatch) # rows[9]: URLError # rows[10]: Http 500 error # rows[11]: 413 # NOTE THAT THIS RELIES ON THE FACT THAT THREADS ARE EXECUTED IN THE ORDER OF THE DATAFRAME # WHICH SEEMS TO BE THE CASE AS THERE IS ONE SINGLE PROCESS # self._seg_data[:2] is a way to mock data corrupted urlread_sideeffect = [self._seg_data, 413, self._seg_data[:2], 413, '', self._seg_data_gaps, self._seg_data_gaps, URLError("++urlerror++"), 500, 413] # Let's go: ztatz = self.download_save_segments(urlread_sideeffect, db.session, segments_df, dc_dataselect_manager, chaid2mseedid, self.run.id, False, request_timebounds_need_update, 1, 2, 3, db_bufsize=self.db_buf_size) # get columns from db which we are interested on to check cols = [Segment.id, Segment.channel_id, Segment.datacenter_id, Segment.download_code, Segment.maxgap_numsamples, \ Segment.sample_rate, Segment.data_seed_id, Segment.data, Segment.download_id, Segment.request_start, Segment.request_end, Segment.start_time, Segment.end_time ] db_segments_df = dbquery2df(db.session.query(*cols)) assert Segment.download_id.key in db_segments_df.columns # change data column otherwise we cannot display db_segments_df. # When there is data just print "data" db_segments_df.loc[(~pd.isnull(db_segments_df[Segment.data.key])) & (db_segments_df[Segment.data.key].str.len() > 0), Segment.data.key] = b'data' # assert we have 4 segments with "data" properly set: assert len(db_segments_df.loc[(~pd.isnull(db_segments_df[Segment.data.key])) & (db_segments_df[Segment.data.key].str.len() > 0), Segment.data.key]) == 4 # re-sort db_segments_df to match the segments_df: ret = [] for cha in segments_df[Segment.channel_id.key]: ret.append(db_segments_df[db_segments_df[Segment.channel_id.key] == cha]) db_segments_df = pd.concat(ret, axis=0) # db_segments_df: # id channel_id datacenter_id download_status_code max_gap_ovlap_ratio sample_rate data_seed_id data run_id start_time end_time # 0 1 1 1 200.0 0.0001 100.0 GE.FLT1..HHE data 1 2016-05-08 05:16:12 2016-05-08 05:19:12 # 1 2 2 1 200.0 0.0001 100.0 GE.FLT1..HHN data 1 2016-05-08 05:16:12 2016-05-08 05:19:12 # 2 3 3 1 200.0 0.0001 100.0 GE.FLT1..HHZ data 1 2016-05-08 05:16:12 2016-05-08 05:19:12 # 6 7 7 2 200.0 NaN NaN None 1 2016-05-08 05:16:12 2016-05-08 05:19:12 # 7 8 8 2 NaN NaN NaN None None 1 2016-05-08 05:16:12 2016-05-08 05:19:12 # 8 9 9 2 200.0 20.0 20.0 IA.BAKI..BHZ data 1 2016-05-08 05:16:12 2016-05-08 05:19:12 # 3 4 4 1 -2.0 NaN NaN None None 1 2016-05-08 01:44:31 2016-05-08 01:47:31 # 4 5 5 1 -2.0 NaN NaN None None 1 2016-05-08 01:44:31 2016-05-08 01:47:31 # 5 6 6 1 -2.0 NaN NaN None None 1 2016-05-08 01:44:31 2016-05-08 01:47:31 # 9 10 10 2 -1.0 NaN NaN None None 1 2016-05-08 01:44:31 2016-05-08 01:47:31 # 10 11 11 2 500.0 NaN NaN None None 1 2016-05-08 01:44:31 2016-05-08 01:47:31 # 11 12 12 2 413.0 NaN NaN None None 1 2016-05-08 01:44:31 2016-05-08 01:47:31 assert len(ztatz) == len(datacenters_df) assert len(db_segments_df) == len(segments_df) assert mock_updatedf.call_count == 0 dsc = db_segments_df[Segment.download_code.key] exp_dsc = np.array([200, 200, 200, 200, np.nan, 200, -2, -2, -2, -1, 500, 413]) assert ((dsc == exp_dsc) | (np.isnan(dsc) & np.isnan(exp_dsc))).all() # as we have 12 segments and a buf size of self.db_buf_size(=1, but it might change in the # 
future), this below is two # it might change if we changed the buf size in the future # test that we correctly called mock_insertdf. Note that we assume that the # latter is called ONLY inside DbManager. To test that, as the number of stuff # to be added (length of the dataframes) varies, we need to implement a counter here: mock_insertdf_call_count = 0 _bufzise = 0 for c in mock_insertdf.call_args_list: c_args = c[0] df_ = c_args[0] _bufzise += len(df_) if _bufzise >= self.db_buf_size: mock_insertdf_call_count += 1 _bufzise = 0 assert mock_insertdf.call_count == mock_insertdf_call_count # assert data is consistent COL = Segment.data.key assert (db_segments_df.iloc[:3][COL] == b'data').all() assert (db_segments_df.iloc[3:4][COL] == b'').all() assert pd.isnull(db_segments_df.iloc[4:5][COL]).all() assert (db_segments_df.iloc[5:6][COL] == b'data').all() assert pd.isnull(db_segments_df.iloc[6:][COL]).all() # assert downdload status code is consistent URLERR_CODE, MSEEDERR_CODE = s2scodes.url_err, s2scodes.mseed_err # also this asserts that we grouped for dc starttime endtime COL = Segment.download_code.key assert (db_segments_df.iloc[:4][COL] == 200).all() assert pd.isnull(db_segments_df.iloc[4:5][COL]).all() assert (db_segments_df.iloc[5:6][COL] == 200).all() assert (db_segments_df.iloc[6:9][COL] == MSEEDERR_CODE).all() assert (db_segments_df.iloc[9][COL] == URLERR_CODE).all() assert (db_segments_df.iloc[10][COL] == 500).all() assert (db_segments_df.iloc[11][COL] == 413).all() # assert gaps are only in the given position COL = Segment.maxgap_numsamples.key assert (db_segments_df.iloc[:3][COL] < 0.01).all() assert pd.isnull(db_segments_df.iloc[3:5][COL]).all() assert (db_segments_df.iloc[5][COL] == 20).all() assert pd.isnull(db_segments_df.iloc[6:][COL]).all() # now mock retry: segments_df, request_timebounds_need_update = \ prepare_for_download(db.session, orig_segments_df, dc_dataselect_manager, wtimespan, retry_seg_not_found=True, retry_url_err=True, retry_mseed_err=True, retry_client_err=True, retry_server_err=True, retry_timespan_err=True, retry_timespan_warn=True) assert request_timebounds_need_update is False COL = Segment.download_code.key mask = (db_segments_df[COL] >= 400) | pd.isnull(db_segments_df[COL]) \ | (db_segments_df[COL].isin([URLERR_CODE, MSEEDERR_CODE])) assert len(segments_df) == len(db_segments_df[mask]) urlread_sideeffect = [413] mock_updatedf.reset_mock() mock_insertdf.reset_mock() # define a dc_dataselect_manager for open data only: dc_dataselect_manager = DcDataselectManager(datacenters_df, Authorizer(None), False) # Let's go: ztatz = self.download_save_segments(urlread_sideeffect, db.session, segments_df, dc_dataselect_manager, chaid2mseedid, self.run.id, False, request_timebounds_need_update, 1, 2, 3, db_bufsize=self.db_buf_size) # get columns from db which we are interested on to check cols = [Segment.download_code, Segment.channel_id] db_segments_df = dbquery2df(db.session.query(*cols)) # change data column otherwise we cannot display db_segments_df. 
    # When there is data, just print "data":
    # db_segments_df.loc[(~pd.isnull(db_segments_df[Segment.data.key])) &
    #                    (db_segments_df[Segment.data.key].str.len() > 0),
    #                    Segment.data.key] = b'data'
    # re-sort db_segments_df to match segments_df:
    ret = []
    for cha in segments_df[Segment.channel_id.key]:
        ret.append(db_segments_df[db_segments_df[Segment.channel_id.key] == cha])
    db_segments_df = pd.concat(ret, axis=0)
    assert (db_segments_df[COL] == 413).all()
    assert len(ztatz) == len(datacenters_df)
    assert len(db_segments_df) == len(segments_df)

    # same as above, but with updatedf: test that we correctly called mock_updatedf.
    # Note that we assume that the latter is called ONLY inside download.main.DbManager.
    # To test that, as the number of rows to be added (length of the dataframes) varies,
    # we need to implement a counter here:
    mock_updatedf_call_count = 0
    _bufsize = 0
    for c in mock_updatedf.call_args_list:
        c_args = c[0]
        df_ = c_args[0]
        _bufsize += len(df_)
        if _bufsize >= self.db_buf_size:
            mock_updatedf_call_count += 1
            _bufsize = 0
    assert mock_updatedf.call_count == mock_updatedf_call_count
    assert mock_insertdf.call_count == 0
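# The buffered-write counting above is duplicated for mock_insertdf and mock_updatedf;
# it could be factored into a small helper. This is a sketch under the same assumption
# as the test (the dataframe is the first positional argument of each recorded call);
# the helper name is hypothetical and not part of the tested package:
def expected_flush_count(call_args_list, buf_size):
    """Return how many buffered flushes are expected given the dataframes recorded
    in a mock's call_args_list (illustrative helper only)."""
    flushes, buffered = 0, 0
    for call in call_args_list:
        dframe = call[0][0]        # first positional argument of the call
        buffered += len(dframe)
        if buffered >= buf_size:
            flushes += 1
            buffered = 0
    return flushes

# e.g.: assert mock_updatedf.call_count == expected_flush_count(
#           mock_updatedf.call_args_list, self.db_buf_size)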
def test_utils_url_read(mock_urlopen):

    def side_effect(argss):
        return StringIO(argss)

    mockread = mock.Mock()

    class mybytesio(object):

        def __init__(self, url, **kwargs):
            mockread.reset_mock()
            if isinstance(url, Exception):
                self.a = url
            else:
                self.code = 200
                self.msg = 'Ok'
                self.a = BytesIO(url)

        def read(self, *a, **kw):
            if isinstance(self.a, Exception):
                raise self.a  # pylint: disable=raising-non-exception
            mockread(*a, **kw)
            return self.a.read(*a, **kw)

        def close(self, *a, **kw):
            if not isinstance(self.a, Exception):
                self.a.close(*a, **kw)

    mock_urlopen.side_effect = lambda url, **kw: mybytesio(url, **kw)
    with pytest.raises(TypeError):
        urlread('', "name")

    val = b'url'
    blockSize = 1024 * 1024
    assert urlread(val, blockSize)[0] == val
    mock_urlopen.assert_called_with(val)  # , timeout=DEFAULT_TIMEOUT)
    assert mockread.call_count == 2
    mockread.assert_called_with(blockSize)

    mock_urlopen.side_effect = lambda url, **kw: mybytesio(url, **kw)
    assert urlread(val, arg_to_read=56)[0] == val
    mock_urlopen.assert_called_with(val, arg_to_read=56)
    assert mockread.call_count == 1  # because blocksize is -1

    mock_urlopen.side_effect = lambda url, **kw: mybytesio(URLError('wat?'))
    with pytest.raises(URLError):
        urlread(val, wrap_exceptions=False)  # note urlexc
    with pytest.raises(URLException):
        urlread(val, wrap_exceptions=True)  # note urlexc

    mock_urlopen.side_effect = lambda url, **kw: mybytesio(URLError('wat?'))
    with pytest.raises(URLException):
        urlread(val)  # note urlexc

    mock_urlopen.side_effect = lambda url, **kw: mybytesio(socket.timeout())
    with pytest.raises(URLException):
        urlread(val)  # note urlexc

    mock_urlopen.side_effect = \
        lambda url, **kw: mybytesio(HTTPError('url', 500, '?', None, None))
    with pytest.raises(URLException):
        urlread(val)  # note urlexc

    mock_urlopen.side_effect = \
        lambda url, **kw: mybytesio(HTTPError('url', 500, '?', None, None))
    assert urlread(val, raise_http_err=False) == (None, 500, '?')  # note urlexc
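# The two mockread.call_count assertions above imply that urlread reads the response
# in blocks until an empty chunk is returned when a positive block size is given, and
# issues a single read() when the block size is negative (the default). The sketch
# below only illustrates that loop; it is an assumption drawn from the assertions,
# not the package's actual implementation:
from io import BytesIO

def blockwise_read(fobj, blocksize=-1):
    # negative blocksize: one single read() call (call_count == 1 above)
    if blocksize < 0:
        return fobj.read()
    # positive blocksize: read until an empty chunk is returned, hence two
    # read() calls for the 3-byte payload b'url' (call_count == 2 above)
    chunks = []
    while True:
        chunk = fobj.read(blocksize)
        if not chunk:
            break
        chunks.append(chunk)
    return b''.join(chunks)

assert blockwise_read(BytesIO(b'url'), 1024 * 1024) == b'url'
assert blockwise_read(BytesIO(b'url')) == b'url'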
def test_get_channels_df(self, db): urlread_sideeffect = """1|2|3|4|5|6|7|8|9|10|11|12|13 20160508_0000129|2016-05-08 05:17:11.500000|40.57|52.23|60.0|AZER|EMSC-RTS|AZER|505483|ml|3.1|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN 20160508_0000004|2016-05-08 01:45:30.300000|44.96|15.35|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|3.6|EMSC|CROATIA 20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA 20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA --- ERRROR --- THIS IS MALFORMED 20160508_abc0113|2016-05-08 22:37:20.100000| --- ERROR --- |26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA """ events_df = self.get_events_df(urlread_sideeffect, db.session) urlread_sideeffect = """http://geofon.gfz-potsdam.de/fdsnws/dataselect/1/query ZZ * * * 2002-09-01T00:00:00 2005-10-20T00:00:00 UP ARJ * * 2013-08-01T00:00:00 2017-04-25 http://ws.resif.fr/fdsnws/dataselect/1/query ZU * * HHZ 2015-01-01T00:00:00 2016-12-31T23:59:59.999999 """ # we tried to add two events with the same id, check we printed out the msg: assert "Duplicated instances violate db constraint" in self.log_msg() net, sta, loc, cha = [], [], [], [] datacenters_df, eidavalidator = \ self.get_datacenters_df(urlread_sideeffect, db.session, None, self.routing_service, net, sta, loc, cha, db_bufsize=self.db_buf_size) # first we mock url errors in all queries. We still did not write anything in the db # so we should quit: with pytest.raises(FailedDownload) as qd: _ = self.get_channels_df(URLError('urlerror_wat'), db.session, datacenters_df, eidavalidator, net, sta, loc, cha, None, None, 100, False, None, None, -1, self.db_buf_size) assert 'urlerror_wat' in self.log_msg() assert "Unable to fetch stations" in self.log_msg() assert "Fetching stations from database for 2 (of 2) data-center(s)" in self.log_msg( ) # Test that the exception message is correct # note that this message is in the log if we run the method from the main # function (which is not the case here): assert ("Unable to fetch stations from all data-centers, " "no data to fetch from the database. 
" "Check config and log for details") in str(qd.value) # now get channels with a mocked custom urlread_sideeffect below: # IMPORTANT: url read for channels: Note: first response data raises, second has an # error and that error is skipped (the other channels are added) urlread_sideeffect = [ """#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime --- ERROR --- MALFORMED|12T00:00:00| HT|AGG||HHZ|39.0211|22.336|622.0|0.0|0.0|-90.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|50.0|2008-02-12T00:00:00| HT|LKD2||HHE|38.7889|20.6578|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00| """, # NOTE THAT THE CHANNELS ABOVE WILL BE OVERRIDDEN BY THE ONES BELOW (MULTIPLE NAMES< WE # SHOULD NOT HAVE THIS CASE WITH THE EDIAWS ROUTING SERVICE BUT WE TEST HERE THE CASE) # NOTE THE USE OF HTß as SensorDescription (to check non-asci characters do not raise) """#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime HT|AGG||HHE|--- ERROR --- NONNUMERIC |22.336|622.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|70.0|2008-02-12T00:00:00| HT|AGG||HLE|95.6|22.336|622.0|0.0|90.0|0.0|GFZ:HTß1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2008-02-12T00:00:00| HT|AGG||HLZ|39.0211|22.336|622.0|0.0|0.0|-90.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2008-02-12T00:00:00| HT|LKD2||HHE|38.7889|20.6578|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|90.0|2009-01-01T00:00:00| HT|LKD2||HHZ|38.7889|20.6578|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|90.0|2009-01-01T00:00:00| BLA|BLA||HHZ|38.7889|20.6578|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00|2019-01-01T00:00:00 BLA|BLA||HHZ|38.7889|20.6578|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2018-01-01T00:00:00| """ ] cha_df = self.get_channels_df(urlread_sideeffect, db.session, datacenters_df, eidavalidator, net, sta, loc, cha, None, None, 90, False, None, None, -1, self.db_buf_size) # assert we have a message for discarding the response data # (first arg of urlread): assert "Discarding response data" in self.log_msg() # we should have called mock_urlopen_in_async times the datacenters assert self.mock_urlopen.call_count == len(datacenters_df) assert len(db.session.query(Station.id).all()) == 4 # the last two channels of the second item of `urlread_sideeffect` are from two # stations (BLA|BLA|...) with only different start time. 
Thus they should both be added: assert len(db.session.query(Channel.id).all()) == 6 # as net, sta, loc, cha are all empty lists and start = end = None (all default=>no filter), # this is the post data passed to urlread for the 1st datacenter: assert self.mock_urlopen.call_args_list[0][0][ 0].data == b"""format=text level=channel * * * * * *""" # as net, sta, loc, cha are all empty lists and start = end = None (all default=>no filter), # this is the post data passed to urlread for the 2nd datacenter: assert self.mock_urlopen.call_args_list[1][0][ 0].data == b"""format=text level=channel * * * * * *""" assert self.mock_urlopen.call_args_list[0][0][0].get_full_url() == \ "http://geofon.gfz-potsdam.de/fdsnws/station/1/query" assert self.mock_urlopen.call_args_list[1][0][0].get_full_url() == \ "http://ws.resif.fr/fdsnws/station/1/query" # assert all downloaded stations have datacenter_id of the second datacenter: dcid = datacenters_df.iloc[1].id assert all(sid[0] == dcid for sid in db.session.query(Station.datacenter_id).all()) # assert all downloaded channels have station_id in the set of downloaded stations only: sta_ids = [x[0] for x in db.session.query(Station.id).all()] assert all(c_staid[0] in sta_ids for c_staid in db.session.query(Channel.station_id).all()) # now mock again url errors in all queries. As we wrote something in the db # so we should NOT quit cha_df2 = self.get_channels_df(URLError('urlerror_wat'), db.session, datacenters_df, eidavalidator, net, sta, loc, cha, datetime(2020, 1, 1), None, 100, False, None, None, -1, self.db_buf_size) # Note above that min sample rate = 100 and a starttime proivded should return 3 channels: assert len(cha_df2) == 3 assert "Fetching stations from database for 2 (of 2) data-center(s)" in self.log_msg( ) # now test again with a socket timeout cha_df2 = self.get_channels_df(socket.timeout(), db.session, datacenters_df, eidavalidator, net, sta, loc, cha, None, None, 100, False, None, None, -1, self.db_buf_size) assert 'timeout' in self.log_msg() assert "Fetching stations from database for 2 (of 2) data-center(s)" in self.log_msg( ) # now mixed case: # now change min sampling rate and see that we should get one channel less cha_df3 = self.get_channels_df(urlread_sideeffect, db.session, datacenters_df, eidavalidator, net, sta, loc, cha, None, None, 100, False, None, None, -1, self.db_buf_size) assert len(cha_df3) == len(cha_df) - 2 assert "2 channel(s) discarded according to current config. filters" in self.log_msg( ) # now change this: urlread_sideeffect = [ URLError('wat'), """#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime A|B|10|HBE|39.0211|22.336|622.0|0.0|0.0|-90.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2003-02-12T00:00:00|2010-02-12T00:00:00 E|F|11|HHZ|38.7889|20.6578|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2019-01-01T00:00:00| """, URLError('wat'), socket.timeout() ] # now change channels=['B??']. 
In the urlread_sideeffect above, for the 1st, 3rd and 4th # case we fallback to a db query, but we do not have such a channel, so nothing is returned # The dataframe currently saved on db is: # id channel start_time end_time sample_rate datacenter_id # 0 1 HLE 2008-02-12 NaT 100.0 2 # 1 2 HLZ 2008-02-12 NaT 100.0 2 # 2 3 HHE 2009-01-01 NaT 90.0 2 # 3 4 HHZ 2009-01-01 NaT 90.0 2 # 4 5 HHZ 2009-01-01 2019-01-01 100.0 2 # 5 6 HHZ 2018-01-01 NaT 100.0 2 # for the second case, the mocked response returns two channels and in this case # we might put whatever filter here below. Assert that the number of channels returned is 2 cha_df = self.get_channels_df(urlread_sideeffect, db.session, datacenters_df, eidavalidator, net, sta, loc, ['B??'], None, None, 10, False, None, None, -1, self.db_buf_size) assert len(cha_df) == 2 # test channels and startime + entimes provided when querying the db (postdata None) # by iussuing the command: # dbquery2df(db.session.query(Channel.id, Station.network, Station.station, # Channel.location, Channel.channel, Station.start_time,Station.end_time, # Channel.sample_rate, Station.datacenter_id).join(Channel.station)) # This is the actual state of the db: # ---------------------------------------------- # channel_id network station location channel start_time end_time sample_rate datacenter_id # 1 HT AGG HLE 2008-02-12 NaT 100.0 2 # 2 HT AGG HLZ 2008-02-12 NaT 100.0 2 # 3 HT LKD2 HHE 2009-01-01 NaT 90.0 2 # 4 HT LKD2 HHZ 2009-01-01 NaT 90.0 2 # 5 BLA BLA HHZ 2009-01-01 2019-01-01 100.0 2 # 6 BLA BLA HHZ 2018-01-01 NaT 100.0 2 # 7 A B 10 HBE 2003-02-12 2010-02-12 100.0 2 # 8 E F 11 HHZ 2019-01-01 NaT 100.0 2 # ---------------------------------------------- # Now according to the table above set a list of arguments: # Each key is: the argument, each value IS A LIST OF BOOLEAN MAPPED TO EACH ROW OF THE # DATAFRAME ABOVE, telling if the row matches according to the argument: nets = { ('*', ): [1, 1, 1, 1, 1, 1, 1, 1], # ('HT', 'BLA'): [1, 1, 1, 1, 1, 1, 0, 0], ( '*A*', ): [0, 0, 0, 0, 1, 1, 1, 0] } stas = { ('B*', ): [0, 0, 0, 0, 1, 1, 1, 0], ('B??', ): [0, 0, 0, 0, 1, 1, 0, 0] } # note that we do NOT assume '--' can be given, as this should be the parsed # output of `nslc_lists`: locs = { ('', ): [1, 1, 1, 1, 1, 1, 0, 0], ('1?', ): [0, 0, 0, 0, 0, 0, 1, 1] } chans = { ('?B?', ): [0, 0, 0, 0, 0, 0, 1, 0], ('HL?', '?B?'): [1, 1, 0, 0, 0, 0, 1, 0], ('HHZ', ): [0, 0, 0, 1, 1, 1, 0, 1] } stimes = { None: [1, 1, 1, 1, 1, 1, 1, 1], datetime(2002, 1, 1): [1, 1, 1, 1, 1, 1, 1, 1], datetime(2099, 1, 1): [1, 1, 1, 1, 0, 1, 0, 1] } etimes = { None: [1, 1, 1, 1, 1, 1, 1, 1], datetime(2002, 1, 1): [0, 0, 0, 0, 0, 0, 0, 0], datetime(2011, 1, 1): [1, 1, 1, 1, 1, 0, 1, 0], datetime(2099, 1, 1): [1, 1, 1, 1, 1, 1, 1, 1] } minsr = { 90: [1, 1, 1, 1, 1, 1, 1, 1], # 95: [1, 1, 0, 0, 1, 1, 1, 1], 100: [1, 1, 0, 0, 1, 1, 1, 1], 105: [0, 0, 0, 0, 0, 0, 0, 0] } # no url read: set socket.tiomeout as urlread side effect. 
This will force # querying the database to test that the filtering works as expected: for n, s, l, c, st, e, m in product(nets, stas, locs, chans, stimes, etimes, minsr): matches = np.array(nets[n]) * np.array(stas[s]) * np.array(locs[l]) * \ np.array(chans[c]) * np.array(stimes[st]) * np.array(etimes[e]) * \ np.array(minsr[m]) expected_length = matches.sum() # Now: if expected length is zero, it means we do not have data matches on the db # This raises a quitdownload (avoiding pytest.raises cause in this # case it's easier like done below): try: __dc_df = datacenters_df.loc[datacenters_df[DataCenter.id.key] == 2] cha_df = self.get_channels_df(socket.timeout(), db.session, __dc_df, eidavalidator, n, s, l, c, st, e, m, False, None, None, -1, self.db_buf_size) assert len(cha_df) == expected_length except FailedDownload as qd: assert expected_length == 0 assert "Unable to fetch stations from all data-centers" in str( qd) # Same test as above, but test negative assertions with "!". Reminder: data on db is: # ---------------------------------------------- # channel_id network station location channel start_time end_time sample_rate datacenter_id # 1 HT AGG HLE 2008-02-12 NaT 100.0 2 # 2 HT AGG HLZ 2008-02-12 NaT 100.0 2 # 3 HT LKD2 HHE 2009-01-01 NaT 90.0 2 # 4 HT LKD2 HHZ 2009-01-01 NaT 90.0 2 # 5 BLA BLA HHZ 2009-01-01 2019-01-01 100.0 2 # 6 BLA BLA HHZ 2018-01-01 NaT 100.0 2 # 7 A B 10 HBE 2003-02-12 2010-02-12 100.0 2 # 8 E F 11 HHZ 2019-01-01 NaT 100.0 2 # ---------------------------------------------- # Now according to the table above set a list of arguments: # Each key is: the argument, each value IS A LIST OF BOOLEAN MAPPED TO EACH ROW OF THE # DATAFRAME ABOVE, telling if the row matches according to the argument: nets = { ('!*A*', 'A'): [1, 1, 1, 1, 0, 0, 1, 1], ('E', 'A'): [0, 0, 0, 0, 0, 0, 1, 1] } stas = { ('!*B*', 'B'): [1, 1, 1, 1, 0, 0, 1, 1], ('!???2', ): [1, 1, 0, 0, 1, 1, 1, 1] } # note that we do NOT assume '--' can be given, as this should be the parsed # output of `nslc_lists`: locs = { ('', ): [1, 1, 1, 1, 1, 1, 0, 0], ('!', ): [0, 0, 0, 0, 0, 0, 1, 1] } chans = { ('HHZ', '!*E'): [0, 1, 0, 1, 1, 1, 0, 1], ('!?H?', ): [1, 1, 0, 0, 0, 0, 1, 0] } stimes = {None: [1, 1, 1, 1, 1, 1, 1, 1]} etimes = {None: [1, 1, 1, 1, 1, 1, 1, 1]} minsr = {-1: [1, 1, 1, 1, 1, 1, 1, 1]} # no url read: set socket.tiomeout as urlread side effect. 
This will force # querying the database to test that the filtering works as expected: for n, s, l, c, st, e, m in product(nets, stas, locs, chans, stimes, etimes, minsr): matches = np.array(nets[n]) * np.array(stas[s]) * np.array(locs[l]) * \ np.array(chans[c]) * np.array(stimes[st]) * np.array(etimes[e]) * np.array(minsr[m]) expected_length = matches.sum() # Now: if expected length is zero, it means we do not have data matches on the db # This raises a quitdownload (avoiding pytest.raises cause in this # case it's easier like done below): try: __dc_df = datacenters_df.loc[datacenters_df[DataCenter.id.key] == 2] cha_df = self.get_channels_df(socket.timeout(), db.session, __dc_df, eidavalidator, n, s, l, c, st, e, m, False, None, None, -1, self.db_buf_size) assert len(cha_df) == expected_length except FailedDownload as qd: assert expected_length == 0 assert "Unable to fetch stations from all data-centers" in str( qd) # now make the second url_side_effect raise => force query from db, and the first good # => fetch from the web # We want to test the mixed case: some fetched from db, some from the web # --------------------------------------------------- # first we query the db to check what we have: cha_df = dbquery2df( db.session.query(Channel.id, Station.datacenter_id, Station.network).join(Station)) # build a new network: newnetwork = 'U' while newnetwork in cha_df[Station.network.key]: newnetwork += 'U' urlread_sideeffect2 = [ """#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime %s|W||HBE|39.0211|22.336|622.0|0.0|0.0|-90.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|50.0|2008-02-12T00:00:00|2010-02-12T00:00:00 """ % newnetwork, socket.timeout() ] # now note: the first url read raised, now it does not: write the channel above with # network = newnetwork (surely non existing to the db) # The second url read did not raise, now it does (socket.timeout): fetch from the db # we issue a ['???'] as 'channel' argument in order to fetch everything from the db # (we would have got the same by passing None as 'channel' argument) # The three [] before ['???'] are net, sta, loc and mean: no filter on those params cha_df_ = self.get_channels_df(urlread_sideeffect2, db.session, datacenters_df, eidavalidator, [], [], [], ['???'], None, None, 10, False, None, None, -1, self.db_buf_size) # we should have the channel with network 'U' to the first datacenter dcid = datacenters_df.iloc[0][DataCenter.id.key] assert len(cha_df_[cha_df_[Station.datacenter_id.key] == dcid]) == 1 assert cha_df_[cha_df_[Station.datacenter_id.key] == dcid][Station.network.key][0] == \ newnetwork # but we did not query other channels for datacenter id = dcid, as the web response # was successful, we rely on that. Conversely, for the other datacenter we should have all # channels fetched from db dcid = datacenters_df.iloc[1][DataCenter.id.key] chaids_of_dcid = \ cha_df_[cha_df_[Station.datacenter_id.key] == dcid][Channel.id.key].tolist() db_chaids_of_dcid = \ cha_df[cha_df[Station.datacenter_id.key] == dcid][Channel.id.key].tolist() assert chaids_of_dcid == db_chaids_of_dcid
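# The loops above compute the expected number of matching channels by element-wise
# multiplying the 0/1 masks of each filter argument, i.e. an AND across filters.
# A tiny standalone illustration of the technique (the mask values below are made up):
import numpy as np

nets_ok = np.array([1, 1, 0, 1])    # rows matching the network filter
stas_ok = np.array([1, 0, 0, 1])    # rows matching the station filter
srate_ok = np.array([1, 1, 1, 0])   # rows matching the min sample rate filter
# multiplying 0/1 masks is an element-wise AND; the sum is the expected match count:
assert (nets_ok * stas_ok * srate_ok).sum() == 1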
def test_merge_event_stations_mag_independent_circle(self, db, tt_ak135_tts): # get events with lat lon (1,1), (2,2,) ... (n, n) urlread_sideeffect = """#EventID | Time | Latitude | Longitude | Depth/km | Author | Catalog | Contributor | ContributorID | MagType | Magnitude | MagAuthor | EventLocationName 20160508_0000129|2016-05-08 05:17:11.500000|1|1|60.0|AZER|EMSC-RTS|AZER|505483|ml|3|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN 20160508_0000004|2016-05-08 01:45:30.300000|90|90|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|4|EMSC|CROATIA """ events_df = self.get_events_df(urlread_sideeffect, db.session) net, sta, loc, cha = [], [], [], [] datacenters_df, eidavalidator = \ self.get_datacenters_df(None, db.session, None, self.routing_service, net, sta, loc, cha, db_bufsize=self.db_buf_size) # url read for channels: Note: first response data raises, second has an error and # that error is skipped (other channels are added), and last two channels are from two # stations (BLA|BLA|...) with only different start time (thus stations should both be # added) urlread_sideeffect = ["""#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime A|a||HHZ|1|1|622.0|0.0|0.0|-90.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|50.0|2008-02-12T00:00:00| A|b||HHE|2|2|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00| """, """#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime A|c||HHZ|3|3|622.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2008-02-12T00:00:00| BLA|e||HHZ|7|7|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00|2019-01-01T00:00:00 BLA|e||HHZ|8|8|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2019-01-01T00:00:00| """, URLError('wat'), socket.timeout()] channels_df = self.get_channels_df(urlread_sideeffect, db.session, datacenters_df, eidavalidator, net, sta, loc, cha, None, None, 10, False, None, None, -1, self.db_buf_size) assert len(channels_df) == 5 # events_df # id magnitude latitude longitude depth_km time # 0 1 3.0 1.0 1.0 60.0 2016-05-08 05:17:11.500 # 1 2 4.0 90.0 90.0 2.0 2016-05-08 01:45:30.300 # channels_df: # id station_id latitude longitude datacenter_id start_time end_time # 0 1 1 1.0 1.0 1 2008-02-12 NaT # 1 2 2 2.0 2.0 1 2009-01-01 NaT # 2 3 3 3.0 3.0 2 2008-02-12 NaT # 3 4 4 7.0 7.0 2 2009-01-01 2019-01-01 # 4 5 5 8.0 8.0 2 2019-01-01 NaT tt_table = tt_ak135_tts # for magnitude <10, max_radius is 0. For magnitude >10, max_radius is 200 # we have only magnitudes <10, we have two events exactly on a station (=> dist=0) # which will be taken (the others dropped out) df = merge_events_stations(events_df, channels_df, dict(min=0, max=10), tttable=tt_table) # the first event results in 4 potential segments # (the last channel has been opened too late), # the second event results in 0 potential segments # (too far away): assert len(df) == 4 # now let's see: the channel with id = 4 is 8.48 degrees far away # from the first event. 
    # By issuing a max=8:
    df = merge_events_stations(events_df, channels_df, dict(min=0, max=8),
                               tttable=tt_table)
    # we should get:
    assert len(df) == 3
    # now let's restrict again: search_radius min is increased to 1.414, meaning
    # that we skip the first two channels (distances = 0 and 1.413, respectively),
    # leaving us with 3 - 2 = 1 potential segment only:
    df = merge_events_stations(events_df, channels_df, dict(min=1.414, max=8),
                               tttable=tt_table)
    # we should get:
    assert len(df) == 1
    # now let's take all combinations (2 events x 4 channels = 8 potential segments):
    df = merge_events_stations(events_df, channels_df, dict(min=0, max=90),
                               tttable=tt_table)
    # we should get:
    assert len(df) == 8
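# The "8.48 degrees" quoted above is the great-circle distance between the first
# event (lat=1, lon=1) and the channel with id 4 (lat=7, lon=7). A quick standalone
# check with the spherical law of cosines; the package may use a slightly different
# formula, so only an approximate comparison is made here:
from math import acos, cos, degrees, radians, sin

def great_circle_deg(lat1, lon1, lat2, lon2):
    lat1, lon1, lat2, lon2 = map(radians, (lat1, lon1, lat2, lon2))
    return degrees(acos(sin(lat1) * sin(lat2) +
                        cos(lat1) * cos(lat2) * cos(lon1 - lon2)))

# event (1, 1) vs. channel id=4 at (7, 7): roughly 8.5 degrees
assert abs(great_circle_deg(1, 1, 7, 7) - 8.48) < 0.05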
def test_merge_event_stations(self, db, tt_ak135_tts): # get events with lat lon (1,1), (2,2,) ... (n, n) urlread_sideeffect = """#EventID | Time | Latitude | Longitude | Depth/km | Author | Catalog | Contributor | ContributorID | MagType | Magnitude | MagAuthor | EventLocationName 20160508_0000129|2016-05-08 05:17:11.500000|1|1|60.0|AZER|EMSC-RTS|AZER|505483|ml|3|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN 20160508_0000004|2016-05-08 01:45:30.300000|2|2|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|4|EMSC|CROATIA """ events_df = self.get_events_df(urlread_sideeffect, db.session) net, sta, loc, cha = [], [], [], [] datacenters_df, eidavalidator = \ self.get_datacenters_df(None, db.session, None, self.routing_service, net, sta, loc, cha, db_bufsize=self.db_buf_size) # url read for channels: Note: first response data raises, second has an error and # that error is skipped (other channels are added), and last two channels are from two # stations (BLA|BLA|...) with only different start time (thus stations should both be # added) urlread_sideeffect = ["""#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime A|a||HHZ|1|1|622.0|0.0|0.0|-90.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|50.0|2008-02-12T00:00:00| A|b||HHE|2|2|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00| """, """#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime A|c||HHZ|3|3|622.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2008-02-12T00:00:00| BLA|e||HHZ|7|7|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00|2019-01-01T00:00:00 BLA|e||HHZ|8|8|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2019-01-01T00:00:00| """, URLError('wat'), socket.timeout()] channels_df = self.get_channels_df(urlread_sideeffect, db.session, datacenters_df, eidavalidator, net, sta, loc, cha, None, None, 10, False, None, None, -1, self.db_buf_size) assert len(channels_df) == 5 # events_df # id magnitude latitude longitude depth_km time # 0 1 3.0 1.0 1.0 60.0 2016-05-08 05:17:11.500 # 1 2 4.0 2.0 2.0 2.0 2016-05-08 01:45:30.300 # channels_df: # id station_id latitude longitude datacenter_id start_time end_time # 0 1 1 1.0 1.0 1 2008-02-12 NaT # 1 2 2 2.0 2.0 1 2009-01-01 NaT # 2 3 3 3.0 3.0 2 2008-02-12 NaT # 3 4 4 7.0 7.0 2 2009-01-01 2019-01-01 # 4 5 5 8.0 8.0 2 2019-01-01 NaT tt_table = tt_ak135_tts # for magnitude <10, max_radius is 0. For magnitude >10, max_radius is 200 # we have only magnitudes <10, we have two events exactly on a station (=> dist=0) # which will be taken (the others dropped out) df = merge_events_stations(events_df, channels_df, dict(minmag=10, maxmag=10, minmag_radius=0, maxmag_radius=200), tttable=tt_table) assert len(df) == 2 # for magnitude <1, max_radius is 100. For magnitude >1, max_radius is 200 # we have only magnitudes <10, we have all event-stations closer than 100 deg # So we might have ALL channels taken BUT: one station start time is in 2019, thus # it will not fall into the case above! 
df = merge_events_stations(events_df, channels_df, dict(minmag=1, maxmag=1, minmag_radius=100, maxmag_radius=2000), tttable=tt_table) assert len(df) == (len(channels_df)-1) * len(events_df) # assert channel outside time bounds was in: assert not channels_df[channels_df[Station.start_time.key] == datetime(2019, 1, 1)].empty # we need to get the channel id from channels_df cause in df we removed unnecessary # columns (including start end time) ch_id = channels_df[channels_df[Station.start_time.key] == datetime(2019, 1, 1)][Channel.id.key].iloc[0] # old Channel.id.key is Segment.channel_id.key in df: assert df[df[Segment.channel_id.key] == ch_id].empty # this is a more complex case, we want to drop the first event by setting a very low # threshold (sraidus_minradius=1) for magnitudes <=3 (the first event magnitude) # and maxradius very high for the other event (magnitude=4) df = merge_events_stations(events_df, channels_df, dict(minmag=3, maxmag=4, minmag_radius=1, maxmag_radius=40), tttable=tt_table) # assert we have only the second event except the first channel which is from the 1st event. # The first event is retrievable by its latitude (2) # FIXME: more fine grained tests based on distance? evid = events_df[events_df[Event.latitude.key] == 2][Event.id.key].iloc[0] assert np.array_equal((df[Segment.event_id.key] == evid), [False, True, True, True, True]) # test arrival times are properly set: Set all event locations to [0,0] as well # as stations locations. This should result in all arrival times equal to event time # _events_df = events_df _channels_df = channels_df events_df = events_df.copy() events_df.loc[:, Event.latitude.key] = 0 events_df.loc[:, Event.longitude.key] = 0 event_ids = pd.unique(events_df[Event.id.key]) # We have two events, set the depth of the first one to zero the other to 60 evtid1, evtid2 = event_ids[0], event_ids[1] evttime1 = events_df[events_df[Event.id.key] == evtid1][Event.time.key].iloc[0] evttime2 = events_df[events_df[Event.id.key] == evtid2][Event.time.key].iloc[0] events_df.loc[events_df[Event.id.key] == evtid1, Event.depth_km.key] = 0 events_df.loc[events_df[Event.id.key] == evtid2, Event.depth_km.key] = 60 channels_df = channels_df.copy() channels_df.loc[:, Station.latitude.key] = 0 channels_df.loc[:, Station.longitude.key] = 0 df = merge_events_stations(events_df, channels_df, dict(minmag=3, maxmag=4, minmag_radius=1, maxmag_radius=40), tttable=tt_table) # assert for events of depth 0 arrival times are queal to event times assert (df[df[Segment.event_id.key] == evtid1][Segment.arrival_time.key] == evttime1).all() # assert for events of depth > 0 arrival times are GREATER than event times assert (df[df[Segment.event_id.key] == evtid2][Segment.arrival_time.key] > evttime2).all() # now set the first event time out-of bounds: events_df.loc[events_df[Event.id.key] == evtid1, Event.depth_km.key] = 600000 df = merge_events_stations(events_df, channels_df, dict(minmag=3, maxmag=4, minmag_radius=1, maxmag_radius=40), tttable=tt_table) # assert for events of depth 0 arrival times are queal to event times # as nans are dropped from the returned dataframe, assert we do not have segments with # event_id == evtid1: assert df[df[Segment.event_id.key] == evtid1][Segment.arrival_time.key].empty # still assert for events of depth > 0 arrival times are GREATER than event times assert (df[df[Segment.event_id.key] == evtid2][Segment.arrival_time.key] > evttime2).all()
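# The minmag/maxmag/minmag_radius/maxmag_radius arguments used throughout this test
# describe a magnitude-dependent search radius: the comments above imply the radius
# equals minmag_radius below minmag, maxmag_radius above maxmag, and lies in between
# otherwise. The sketch below is an assumption drawn from those comments, not the
# package's actual code:
def search_radius_deg(mag, minmag, maxmag, minmag_radius, maxmag_radius):
    if mag <= minmag:
        return minmag_radius
    if mag >= maxmag:
        return maxmag_radius
    # linear interpolation between the two radii:
    frac = (mag - minmag) / (maxmag - minmag)
    return minmag_radius + frac * (maxmag_radius - minmag_radius)

# first call of this test (minmag=maxmag=10, radii 0 and 200):
assert search_radius_deg(3, 10, 10, 0, 200) == 0     # magnitude < 10 -> radius 0
assert search_radius_deg(11, 10, 10, 0, 200) == 200  # magnitude > 10 -> radius 200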
def init(self, request, db, data, pytestdir): # re-init a sqlite database (no-op if the db is not sqlite): db.create(to_file=False) self.logout = StringIO() self.handler = StreamHandler(stream=self.logout) # THIS IS A HACK: # s2s_download_logger.setLevel(logging.INFO) # necessary to forward to handlers # if we called closing (we are testing the whole chain) the level will be reset # (to level.INFO) otherwise it stays what we set two lines above. Problems might arise # if closing sets a different level, but for the moment who cares # s2s_download_logger.addHandler(self.handler) # setup a run_id: r = Download() db.session.add(r) db.session.commit() self.run = r # side effects: self._evt_urlread_sideeffect = """#EventID | Time | Latitude | Longitude | Depth/km | Author | Catalog | Contributor | ContributorID | MagType | Magnitude | MagAuthor | EventLocationName 20160508_0000129|2016-05-08 05:17:11.500000|1|1|60.0|AZER|EMSC-RTS|AZER|505483|ml|3|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN 20160508_0000004|2016-05-08 01:45:30.300000|90|90|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|4|EMSC|CROATIA """ self._dc_urlread_sideeffect = """http://geofon.gfz-potsdam.de/fdsnws/dataselect/1/query ZZ * * * 2002-09-01T00:00:00 2005-10-20T00:00:00 UP ARJ * * 2013-08-01T00:00:00 2017-04-25 http://ws.resif.fr/fdsnws/dataselect/1/query ZU * * HHZ 2015-01-01T00:00:00 2016-12-31T23:59:59.999999 """ # Note: by default we set sta_urlsideeffect to return such a channels which result in 12 # segments (see lat and lon of channels vs lat and lon of events above) self._sta_urlread_sideeffect = ["""#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime GE|FLT1||HHE|1|1|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2003-01-01T00:00:00| GE|FLT1||HHN|1|1|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2003-01-01T00:00:00| GE|FLT1||HHZ|1|1|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2003-01-01T00:00:00| n1|s||c1|90|90|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00| n1|s||c2|90|90|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00| n1|s||c3|90|90|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00| """, """#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime IA|BAKI||BHE|1|1|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2003-01-01T00:00:00| IA|BAKI||BHN|1|1|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2003-01-01T00:00:00| IA|BAKI||BHZ|1|1|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2003-01-01T00:00:00| n2|s||c1|90|90|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00| n2|s||c2|90|90|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00| n2|s||c3|90|90|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00| """] # self._sta_urlread_sideeffect = cycle([partial_valid, '', invalid, '', '', URLError('wat'), socket.timeout()]) self._mintraveltime_sideeffect = cycle([1]) self._seg_data = data.read("GE.FLT1..HH?.mseed") self._seg_data_gaps = data.read("IA.BAKI..BHZ.D.2016.004.head") self._seg_data_empty = b'' 
self._seg_urlread_sideeffect = [self._seg_data, self._seg_data_gaps, 413, 500, self._seg_data[:2], self._seg_data_empty, 413, URLError("++urlerror++"), socket.timeout()] self._inv_data = data.read("inventory_GE.APE.xml") self.service = '' # so get_datacenters_df accepts any row by default # store DcDataselectManager method here: self.dc_get_data_open = DcDataselectManager._get_data_open self.dc_get_data_from_userpass = DcDataselectManager._get_data_from_userpass # get data from token accepts a custom urlread side effect: _get_data_from_token = DcDataselectManager._get_data_from_token def dc_get_data_from_token_func(url_read_side_effect=None, *a, **kw): if url_read_side_effect is not None: self.setup_urlopen(url_read_side_effect) return _get_data_from_token(*a, **kw) self.dc_get_data_from_token = dc_get_data_from_token_func # class-level patchers: with patch('stream2segment.utils.url.urlopen') as mock_urlopen: self.mock_urlopen = mock_urlopen with patch('stream2segment.utils.inputargs.get_session', return_value=db.session): # this mocks yaml_load and sets inventory to False, as tests rely on that with patch('stream2segment.main.closesession'): # no-op (do not close session) # mock ThreadPool (tp) to run one instance at a time, so we # get deterministic results: class MockThreadPool(object): def __init__(self, *a, **kw): pass def imap(self, func, iterable, *args): # make imap deterministic: same as standard python map: # everything is executed in a single thread the right input order return map(func, iterable) def imap_unordered(self, func, iterable, *args): # make imap_unordered deterministic: same as standard python map: # everything is executed in a single thread in the right input order return map(func, iterable) def close(self, *a, **kw): pass # assign patches and mocks: with patch('stream2segment.utils.url.ThreadPool', side_effect=MockThreadPool) as mock_thread_pool: def c4d(logger, logfilebasepath, verbose): # config logger as usual, but redirects to a temp file # that will be deleted by pytest, instead of polluting the program # package: ret = configlog4download(logger, pytestdir.newfile('.log'), verbose) logger.addHandler(self.handler) return ret with patch('stream2segment.main.configlog4download', side_effect=c4d) as mock_config4download: self.mock_config4download = mock_config4download yield
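# Why the fixture above swaps ThreadPool for MockThreadPool: the real pool's
# imap_unordered yields results in completion order, while plain map preserves the
# input order, which the assertions in these tests depend on. A short standalone
# contrast (illustrative only, not package code):
from multiprocessing.pool import ThreadPool

def _demo_deterministic_map():
    urls = ['a', 'b', 'c']
    with ThreadPool(2) as pool:
        threaded = list(pool.imap_unordered(str.upper, urls))  # completion order, not guaranteed
    sequential = list(map(str.upper, urls))                    # always the input order
    assert sorted(threaded) == sequential == ['A', 'B', 'C']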
def init(self, request, db, data): # re-init a sqlite database (no-op if the db is not sqlite): db.create(to_file=False) # setup a run_id: rdw = Download() db.session.add(rdw) db.session.commit() self.run = rdw # side effects: self._evt_urlread_sideeffect = """#EventID | Time | Latitude | Longitude | Depth/km | Author | Catalog | Contributor | ContributorID | MagType | Magnitude | MagAuthor | EventLocationName 20160508_0000129|2016-05-08 05:17:11.500000|1|1|60.0|AZER|EMSC-RTS|AZER|505483|ml|3|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN 20160508_0000004|2016-05-08 01:45:30.300000|90|90|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|4|EMSC|CROATIA """ self._mintraveltime_sideeffect = cycle([1]) self._seg_data = data.read("GE.FLT1..HH?.mseed") self._seg_data_gaps = data.read("IA.BAKI..BHZ.D.2016.004.head") self._seg_data_empty = b'' self._seg_urlread_sideeffect = [ self._seg_data, self._seg_data_gaps, 413, 500, self._seg_data[:2], self._seg_data_empty, 413, URLError("++urlerror++"), socket.timeout() ] self.service = '' # so get_datacenters_df accepts any row by default self.db_buf_size = 1 self.routing_service = yaml_load(get_templates_fpath("download.yaml"))\ ['advanced_settings']['routing_service_url'] # NON db stuff (logging, patchers, pandas...): self.loghandler = StreamHandler(stream=StringIO()) # THIS IS A HACK: query_logger.setLevel(logging.INFO) # necessary to forward to handlers # if we called closing (we are testing the whole chain) the level will be reset # (to level.INFO) otherwise it stays what we set two lines above. Problems might arise # if closing sets a different level, but for the moment who cares query_logger.addHandler(self.loghandler) # when debugging, I want the full dataframe with to_string(), not truncated # NOTE: this messes up right alignment of numbers in DownloadStats (see utils.py) # FIRST, remember current settings and restore them in cleanup: _pd_display_maxcolwidth = pd.get_option('display.max_colwidth') pd.set_option('display.max_colwidth', -1) # define class level patchers (we do not use a yiled as we need to do more stuff in the # finalizer, see below patchers = [] patchers.append(patch('stream2segment.utils.url.urlopen')) self.mock_urlopen = patchers[-1].start() # mock ThreadPool (tp) to run one instance at a time, so we get deterministic results: class MockThreadPool(object): def __init__(self, *a, **kw): pass def imap(self, func, iterable, *args): # make imap deterministic: same as standard python map: # everything is executed in a single thread the right input order return map(func, iterable) def imap_unordered(self, func_, iterable, *args): # make imap_unordered deterministic: same as standard python map: # everything is executed in a single thread in the right input order return map(func_, iterable) def close(self, *a, **kw): pass # assign patches and mocks: patchers.append(patch('stream2segment.utils.url.ThreadPool')) self.mock_tpool = patchers[-1].start() self.mock_tpool.side_effect = MockThreadPool # add finalizer: def delete(): pd.set_option('display.max_colwidth', _pd_display_maxcolwidth) for patcher in patchers: patcher.stop() hndls = query_logger.handlers[:] for h in hndls: if h is self.loghandler: self.loghandler.close() query_logger.removeHandler(h) request.addfinalizer(delete)
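# The fixture above starts its patchers manually and stops them in a
# request.addfinalizer callback instead of using yield, because more teardown work
# (pandas options, log handlers) has to happen in the same place. The generic form
# of that pattern is sketched below with illustrative names only:
import pytest
from unittest.mock import patch

@pytest.fixture
def patched_urlopen(request):
    patcher = patch('urllib.request.urlopen')
    mocked = patcher.start()
    # guaranteed cleanup even if the test (or another finalizer) fails:
    request.addfinalizer(patcher.stop)
    return mocked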
def test_retry(self, mock_get_opener, mock_get_data_from_token, mock_get_data_from_userpass, mock_get_data_open, mock_updatedf, mock_insertdf, mock_mseed_unpack, mock_download_save_segments, mock_save_inventories, mock_get_channels_df, mock_get_datacenters_df, mock_get_events_df, # fixtures: db, clirunner, pytestdir, yamlfile): mock_get_events_df.side_effect = lambda *a, **v: self.get_events_df(None, *a, **v) mock_get_datacenters_df.side_effect = \ lambda *a, **v: self.get_datacenters_df(None, *a, **v) mock_get_channels_df.side_effect = lambda *a, **v: self.get_channels_df(None, *a, **v) mock_save_inventories.side_effect = lambda *a, **v: self.save_inventories(None, *a, **v) mock_download_save_segments.side_effect = \ lambda *a, **v: self.download_save_segments([URLError('abc')], *a, **v) # mseed unpack is mocked by accepting only first arg (so that time bounds are # not considered) mock_mseed_unpack.side_effect = lambda *a, **v: unpack(a[0]) mock_insertdf.side_effect = lambda *a, **v: insertdf(*a, **v) mock_updatedf.side_effect = lambda *a, **v: updatedf(*a, **v) # prevlen = len(db.session.query(Segment).all()) # mock our opener m = Mock() mockopen = Mock() mockopen.read = lambda *a, **v: b'' mockopen.msg = 'abc' mockopen.code = 204 m.open = lambda *a, **v: mockopen # m.read = lambda *a, **v: '' mock_get_opener.side_effect = lambda *a, **v: m # patching class methods while preserving the original call requires storing once # the original methods (as class attributes). Sets the side effect of the mocked method # to those class attributes as to preserve the original functionality # and be able to assert mock_* functions are called and so on # For info see: # https://stackoverflow.com/a/29563665 mock_get_data_open.side_effect = self.dc_get_data_open mock_get_data_from_userpass.side_effect = self.dc_get_data_from_userpass mock_get_data_from_token.side_effect = \ lambda *a, **kw: self.dc_get_data_from_token([URLError('a'), 'abc'], *a, **kw) # TEST 1: provide a file with valid token: tokenfile = pytestdir.newfile(create=True) with open(tokenfile, 'w') as fh: fh.write('BEGIN PGP MESSAGE') # mock yaml_load to override restricted_data: # launch two download runs with different responses for token auth query: for tokenquery_mocked_return_values, dc_token_failed in \ ([[URLError('a'), 'uzer:pazzword'], "http://geofon.gfz-potsdam.de"], [['uzer:pazzword', URLError('a')], 'http://ws.resif.fr']): # set how many times self.mock_urlopen has been called: mock_urlopen_call_count = self.mock_urlopen.call_count # TEST 2: USERPASS good for just one datacenter: mock_get_data_open.reset_mock() mock_get_data_from_token.reset_mock() mock_get_data_from_userpass.reset_mock() mock_get_opener.reset_mock() mock_get_data_from_token.side_effect = \ lambda *a, **kw: self.dc_get_data_from_token(tokenquery_mocked_return_values, *a, **kw) yaml_file = yamlfile(restricted_data=os.path.abspath(tokenfile), retry_client_err=False) result = clirunner.invoke(cli, ['download', '-c', yaml_file, '--dburl', db.dburl, '--start', '2016-05-08T00:00:00', '--end', '2016-05-08T9:00:00']) assert clirunner.ok(result) assert 'restricted_data: %s' % os.path.abspath(tokenfile) in result.output assert 'STEP 5 of 8: Acquiring credentials from token' in result.output # assert we print that we are downloading open and restricted data: assert re.search(r'STEP 7 of 8\: Downloading \d+ segments and saving to db', result.output) assert not mock_get_data_open.called assert mock_get_data_from_token.called assert not mock_get_data_from_userpass.called 
assert "Downloading open data only from: %s" % dc_token_failed dc_token_ok = 'http://ws.resif.fr' \ if dc_token_failed == "http://geofon.gfz-potsdam.de" else \ "http://geofon.gfz-potsdam.de" assert mock_get_opener.call_count == 1 assert mock_get_opener.call_args_list[0][0][:] == (dc_token_ok, 'uzer', 'pazzword') dc_id = {Fdsnws(i[1]).site: i[0] for i in db.session.query(DataCenter.id, DataCenter.dataselect_url)} # assert urlopen has been called only once with query and not queryauth: # get the segments dataframe we (re)downloaded: segments_df_to_download = mock_download_save_segments.call_args_list[-1][0][1] dc2download = pd.unique(segments_df_to_download['datacenter_id']).tolist() # set the expected call count based on the datacenters of (re)downloaded segments: if dc_id[dc_token_failed] not in dc2download: assert self.mock_urlopen.call_count == 0 else: assert self.mock_urlopen.call_count >= 1 for i in range(self.mock_urlopen.call_count): i+=1 assert self.mock_urlopen.call_args_list[-i][0][0].get_full_url() == \ dc_token_failed + "/fdsnws/dataselect/1/query"
def test_restricted(self, mock_get_opener, mock_get_data_from_token, mock_get_data_from_userpass, mock_get_data_open, mock_updatedf, mock_insertdf, mock_mseed_unpack, mock_download_save_segments, mock_save_inventories, mock_get_channels_df, mock_get_datacenters_df, mock_get_events_df, # fixtures: db, clirunner, pytestdir, yamlfile): mock_get_events_df.side_effect = lambda *a, **v: self.get_events_df(None, *a, **v) mock_get_datacenters_df.side_effect = \ lambda *a, **v: self.get_datacenters_df(None, *a, **v) mock_get_channels_df.side_effect = lambda *a, **v: self.get_channels_df(None, *a, **v) mock_save_inventories.side_effect = lambda *a, **v: self.save_inventories(None, *a, **v) mock_download_save_segments.side_effect = \ lambda *a, **v: self.download_save_segments(None, *a, **v) # mseed unpack is mocked by accepting only first arg # (so that time bounds are not considered) mock_mseed_unpack.side_effect = lambda *a, **v: unpack(a[0]) mock_insertdf.side_effect = lambda *a, **v: insertdf(*a, **v) mock_updatedf.side_effect = lambda *a, **v: updatedf(*a, **v) # prevlen = len(db.session.query(Segment).all()) # patching class methods while preserving the original call requires storing once # the original methods (as class attributes). Sets the side effect of the mocked method # to those class attributes as to preserve the original functionality # and be able to assert mock_* functions are called and so on # For info see: # https://stackoverflow.com/a/29563665 mock_get_data_open.side_effect = self.dc_get_data_open mock_get_data_from_userpass.side_effect = self.dc_get_data_from_userpass mock_get_data_from_token.side_effect = \ lambda *a, **kw: self.dc_get_data_from_token([URLError('a'), 'abc'], *a, **kw) # TEST 1: provide a file with valid token: tokenfile = pytestdir.newfile(create=True) with open(tokenfile, 'w') as fh: fh.write('BEGIN PGP MESSAGE') # mock yaml_load to override restricted_data: yaml_file = yamlfile(restricted_data=os.path.abspath(tokenfile)) # The run table is populated with a run_id in the constructor of this class # for checking run_ids, store here the number of runs we have in the table: runs = len(db.session.query(Download.id).all()) result = clirunner.invoke(cli, ['download', '-c', yaml_file, '--dburl', db.dburl, '--start', '2016-05-08T00:00:00', '--end', '2016-05-08T9:00:00']) assert clirunner.ok(result) assert 'Downloading 12 segments (open data only)' in result.output assert 'STEP 5 of 8: Acquiring credentials from token' in result.output # note that due to (probably) dict order in py2-3 we need to test both of these: if not ('Downloading open data only from: http://geofon.gfz-potsdam.de, ' 'http://ws.resif.fr (Unable to acquire credentials for restricted data)') in \ result.output: assert ('Downloading open data only from: http://ws.resif.fr, ' 'http://geofon.gfz-potsdam.de (Unable to acquire credentials for restricted data)') in \ result.output # assert we print that we are downloading open data only (all errors): assert 'STEP 7 of 8: Downloading 12 segments (open data only)' in result.output assert not mock_get_data_open.called assert mock_get_data_from_token.called assert not mock_get_data_from_userpass.called assert not mock_get_opener.called # some assertions to check data properly written # These are important because they confirm that data has been downloaded anyway # (the test does not differentiate between restricted or open data) assert len(db.session.query(Download.id).all()) == runs + 1 runs += 1 segments = db.session.query(Segment).all() assert 
len(segments) == 12 segments = db.session.query(Segment).filter(Segment.has_data).all() assert len(segments) == 4 assert len(db.session.query(Station).filter(Station.has_inventory).all()) == 2 assert mock_updatedf.called # called while saving inventories assert mock_insertdf.called
def init(self, request, db, data): # re-init a sqlite database (no-op if the db is not sqlite): db.create(to_file=False) # setup a run_id: rdw = Download() db.session.add(rdw) db.session.commit() self.run = rdw # side effects: self._dc_urlread_sideeffect = """http://geofon.gfz-potsdam.de/fdsnws/dataselect/1/query ZZ * * * 2002-09-01T00:00:00 2005-10-20T00:00:00 UP ARJ * * 2013-08-01T00:00:00 2017-04-25 http://ws.resif.fr/fdsnws/dataselect/1/query ZU * * HHZ 2015-01-01T00:00:00 2016-12-31T23:59:59.999999 """ self._mintraveltime_sideeffect = cycle([1]) self._seg_data = data.read("GE.FLT1..HH?.mseed") self._seg_data_gaps = data.read("IA.BAKI..BHZ.D.2016.004.head") self._seg_data_empty = b'' self._seg_urlread_sideeffect = [ self._seg_data, self._seg_data_gaps, 413, 500, self._seg_data[:2], self._seg_data_empty, 413, URLError("++urlerror++"), socket.timeout() ] self.service = '' # so get_datacenters_df accepts any row by default self.db_buf_size = 1 self.routing_service = yaml_load(get_templates_fpath("download.yaml"))\ ['advanced_settings']['routing_service_url'] # NON db stuff (logging, patchers, pandas...): self.logout = StringIO() handler = StreamHandler(stream=self.logout) self._logout_cache = "" # THIS IS A HACK: query_logger.setLevel(logging.INFO) # necessary to forward to handlers # if we called closing() (we are testing the whole chain) the level will be reset # (to logging.INFO), otherwise it stays at what we set two lines above. Problems might arise # if closing sets a different level, but for the moment who cares query_logger.addHandler(handler) # when debugging, I want the full dataframe with to_string(), not truncated # NOTE: this messes up right alignment of numbers in DownloadStats (see utils.py) # FIRST, remember current settings and restore them in cleanup: _pd_display_maxcolwidth = pd.get_option('display.max_colwidth') pd.set_option('display.max_colwidth', -1) # define class level patchers (we do not use a yield as we need to do more stuff in the # finalizer, see below): patchers = [] patchers.append(patch('stream2segment.utils.url.urlopen')) self.mock_urlopen = patchers[-1].start() # mock ThreadPool (tp) to run one instance at a time, so we get deterministic results: class MockThreadPool(object): def __init__(self, *a, **kw): pass def imap(self, func, iterable, *args): # make imap deterministic: same as standard python map: # everything is executed in a single thread in the right input order return map(func, iterable) def imap_unordered(self, func_, iterable, *args): # make imap_unordered deterministic: same as standard python map: # everything is executed in a single thread in the right input order return map(func_, iterable) def close(self, *a, **kw): pass # assign patches and mocks: patchers.append(patch('stream2segment.utils.url.ThreadPool')) self.mock_tpool = patchers[-1].start() self.mock_tpool.side_effect = MockThreadPool # add finalizer: def delete(): pd.set_option('display.max_colwidth', _pd_display_maxcolwidth) for patcher in patchers: patcher.stop() hndls = query_logger.handlers[:] handler.close() for h in hndls: if h is handler: query_logger.removeHandler(h) request.addfinalizer(delete)
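# [Illustrative sketch] self._dc_urlread_sideeffect above mimics the text format returned
# by the EIDA routing service: a dataselect URL line followed by "NET STA LOC CHA START
# [END]" route lines, with a blank line separating datacenters. parse_routing_response is
# a hypothetical parser of that format (not the one used by get_datacenters_df):
def parse_routing_response(text):
    """Return a dict mapping each datacenter URL to a list of tokenized route lines."""
    routes, current = {}, None
    for line in text.splitlines():
        line = line.strip()
        if not line:
            current = None  # a blank line closes the current datacenter block
        elif current is None:
            current = line  # the first non-blank line of a block is the URL
            routes[current] = []
        else:
            routes[current].append(line.split())
    return routes


if __name__ == "__main__":
    sample = ("http://geofon.gfz-potsdam.de/fdsnws/dataselect/1/query\n"
              "ZZ * * * 2002-09-01T00:00:00 2005-10-20T00:00:00\n"
              "\n"
              "http://ws.resif.fr/fdsnws/dataselect/1/query\n"
              "ZU * * HHZ 2015-01-01T00:00:00 2016-12-31T23:59:59.999999\n")
    parsed = parse_routing_response(sample)
    assert list(parsed) == ["http://geofon.gfz-potsdam.de/fdsnws/dataselect/1/query",
                            "http://ws.resif.fr/fdsnws/dataselect/1/query"]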
def test_get_dcs_routingerror( self, mock_fileopen, # fixtures: db): '''test fetching datacenters eida, iris, custom url''' # this is the output when using eida as service: urlread_sideeffect = [URLError('wat?')] # we might set the following params as defaults because not used, let's provide anyway # something meaningful: net, sta, loc, cha = ['*'], [], [], ['HH?', 'BH?'] starttime = datetime.utcnow() endtime = starttime + timedelta(minutes=1.1) # normal fdsn service ("https://mocked_domain/fdsnws/station/1/query") # we should not call self.mock_urlopen and not mock_fileopen (no eida) dcdf, eidavalidator = self.get_datacenters_df( urlread_sideeffect, db.session, "https://mock/fdsnws/station/1/query", self.routing_service, net, sta, loc, cha, starttime, endtime, db_bufsize=self.db_buf_size) assert not self.mock_urlopen.called assert not mock_fileopen.called assert eidavalidator is None assert len(dcdf) == 1 assert db.session.query(DataCenter).count() == 1 # iris: # we should not call self.mock_urlopen and not mock_fileopen (no eida) dcdf, eidavalidator = self.get_datacenters_df( urlread_sideeffect, db.session, "iris", self.routing_service, net, sta, loc, cha, starttime, endtime, db_bufsize=self.db_buf_size) assert not self.mock_urlopen.called assert not mock_fileopen.called assert eidavalidator is None assert len(dcdf) == 1 assert db.session.query(DataCenter).\ filter(DataCenter.organization_name == 'iris').count() == 1 # eida: # we should call self.mock_urlopen and mock_fileopen (eida error => read from file) dcdf, eidavalidator = self.get_datacenters_df( urlread_sideeffect, db.session, "eida", self.routing_service, net, sta, loc, cha, starttime, endtime, db_bufsize=self.db_buf_size) assert self.mock_urlopen.called assert mock_fileopen.called msg = self.log_msg() _, last_mod_time = _get_local_routing_service() expected_str = ("Eida routing service error, reading routes from file " "(last updated: %s") % last_mod_time assert expected_str in msg assert eidavalidator is not None assert db.session.query(DataCenter).\ filter(DataCenter.organization_name == 'eida').count() == 10 assert len(dcdf) == 10 # with pytest.raises(FailedDownload) as qdown: # data, _ = self.get_datacenters_df(urlread_sideeffect, db.session, "eida", # self.routing_service, # net, sta, loc, cha, starttime, endtime, # db_bufsize=self.db_buf_size) # assert self.mock_urlopen.called # assert "Eida routing service error, no eida data-center saved in database" \ # in str(qdown.value) # now let's mock a valid response from the eida routing service self.mock_urlopen.reset_mock() mock_fileopen.reset_mock() urlread_sideeffect = [ """http://ws.resif.fr/fdsnws/station/1/query http://geofon.gfz-potsdam.de/fdsnws/station/1/query http://geofon.gfz-potsdam.de/fdsnws/station/1/query ZZ * * * 2002-09-01T00:00:00 2005-10-20T00:00:00 UP ARJ * BHW 2013-08-01T00:00:00 2017-04-25""" ] dcdf, eidavalidator = self.get_datacenters_df( urlread_sideeffect, db.session, "eida", self.routing_service, net, sta, loc, cha, starttime, endtime, db_bufsize=self.db_buf_size) assert self.mock_urlopen.called assert not mock_fileopen.called assert db.session.query(DataCenter).\ filter(DataCenter.organization_name == 'eida').count() == 10 assert len(dcdf) == 2 assert "Eida routing service error, reading from file (last updated: " \ not in self.log_msg()[len(msg):] # write two new eida data centers self.mock_urlopen.reset_mock() mock_fileopen.reset_mock() urlread_sideeffect = [ """http://ws.NEWDC1.fr/fdsnws/station/1/query 
http://geofon.gfz-potsdam.de/fdsnws/station/1/query http://NEWDC2.gfz-potsdam.de/fdsnws/station/1/query ZZ * * * 2002-09-01T00:00:00 2005-10-20T00:00:00 UP ARJ * BHW 2013-08-01T00:00:00 2017-04-25""" ] dcdf, eidavalidator = self.get_datacenters_df( urlread_sideeffect, db.session, "eida", self.routing_service, net, sta, loc, cha, starttime, endtime, db_bufsize=self.db_buf_size) assert self.mock_urlopen.called assert not mock_fileopen.called assert db.session.query(DataCenter).\ filter(DataCenter.organization_name == 'eida').count() == 12 assert len(dcdf) == 2
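# [Illustrative sketch] test_get_dcs_routingerror above exercises a fallback strategy:
# query the EIDA routing service and, on a URL error, read a locally shipped copy of the
# routing table (logging its last modification time). get_routes below is a minimal
# standalone version of that control flow; fetch_remote and read_local_copy are
# hypothetical callables, not library functions:
from urllib.error import URLError as _URLError

def get_routes(fetch_remote, read_local_copy, log):
    """Return routing text, preferring the remote service over the local copy."""
    try:
        return fetch_remote()
    except _URLError as exc:
        text, last_updated = read_local_copy()
        log("Eida routing service error, reading routes from file "
            "(last updated: %s). Error: %s" % (last_updated, exc))
        return text


if __name__ == "__main__":
    def failing_remote():
        raise _URLError("wat?")
    messages = []
    routes = get_routes(failing_remote, lambda: ("<local routes>", "2019-01-01"),
                        messages.append)
    assert routes == "<local routes>" and "last updated: 2019-01-01" in messages[0]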
def test_download_save_segments_timebounds(self, mock_updatedf, mock_insertdf, mseed_unpack, db, tt_ak135_tts): # prepare: # mseed unpack takes no starttime and endtime arguments, so that mseed_unpack.side_effect = lambda *a, **v: unpack(*a, **v) mock_insertdf.side_effect = lambda *a, **v: insertdf(*a, **v) mock_updatedf.side_effect = lambda *a, **v: updatedf(*a, **v) # mock event response: it's the same as self._evt_urlread_sideeffect but modify the dates # as NOW. This means, any segment downloaded later will # be out-of-bound utcnow = datetime.utcnow() utcnow_iso = utcnow.isoformat().replace("T", " ") urlread_sideeffect = """#EventID | Time | Latitude | Longitude | Depth/km | Author | Catalog | Contributor | ContributorID | MagType | Magnitude | MagAuthor | EventLocationName 20160508_0000129|%s|1|1|60.0|AZER|EMSC-RTS|AZER|505483|ml|3|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN 20160508_0000004|%s|90|90|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|4|EMSC|CROATIA """ % (utcnow_iso, utcnow_iso) events_df = self.get_events_df(urlread_sideeffect, db.session) # restore urlread_side_effect: urlread_sideeffect = None net, sta, loc, cha = [], [], [], [] datacenters_df, eidavalidator = \ self.get_datacenters_df(urlread_sideeffect, db.session, self.service, self.routing_service, net, sta, loc, cha, db_bufsize=self.db_buf_size) channels_df = self.get_channels_df(urlread_sideeffect, db.session, datacenters_df, eidavalidator, net, sta, loc, cha, None, None, 10, False, None, None, -1, self.db_buf_size) # just to be sure. If failing, we might have changed the class default: assert len(channels_df) == 12 # events_df # id magnitude latitude longitude depth_km time # 0 20160508_0000129 3.0 1.0 1.0 60.0 2016-05-08 05:17:11.500 # 1 20160508_0000004 4.0 2.0 2.0 2.0 2016-05-08 01:45:30.300 # channels_df (index not shown): # columns: # id station_id latitude longitude datacenter_id start_time end_time network station location channel # data (not aligned with columns): # 1 1 1.0 1.0 1 2003-01-01 NaT GE FLT1 HHE # 2 1 1.0 1.0 1 2003-01-01 NaT GE FLT1 HHN # 3 1 1.0 1.0 1 2003-01-01 NaT GE FLT1 HHZ # 4 2 90.0 90.0 1 2009-01-01 NaT n1 s c1 # 5 2 90.0 90.0 1 2009-01-01 NaT n1 s c2 # 6 2 90.0 90.0 1 2009-01-01 NaT n1 s c3 # 7 3 1.0 1.0 2 2003-01-01 NaT IA BAKI BHE # 8 3 1.0 1.0 2 2003-01-01 NaT IA BAKI BHN # 9 3 1.0 1.0 2 2003-01-01 NaT IA BAKI BHZ # 10 4 90.0 90.0 2 2009-01-01 NaT n2 s c1 # 11 4 90.0 90.0 2 2009-01-01 NaT n2 s c2 # 12 4 90.0 90.0 2 2009-01-01 NaT n2 s c3 assert all(_ in channels_df.columns for _ in [Station.network.key, Station.station.key, Channel.location.key, Channel.channel.key]) chaid2mseedid = chaid2mseedid_dict(channels_df) # check that we removed the columns: assert not any(_ in channels_df.columns for _ in [Station.network.key, Station.station.key, Channel.location.key, Channel.channel.key]) # take all segments: # use minmag and maxmag ttable = tt_ak135_tts segments_df = merge_events_stations(events_df, channels_df, dict(minmag=10, maxmag=10, minmag_radius=10, maxmag_radius=10), tttable=ttable) assert len(pd.unique(segments_df['arrival_time'])) == 2 h = 9 # segments_df (index not shown). 
Note that # cid sid did n s l c ed event_id depth_km time <- LAST TWO ARE Event related columns that will be removed after arrival_time calculations # 1 1 1 GE FLT1 HHE 0.0 20160508_0000129 60.0 2016-05-08 05:17:11.500 # 2 1 1 GE FLT1 HHN 0.0 20160508_0000129 60.0 2016-05-08 05:17:11.500 # 3 1 1 GE FLT1 HHZ 0.0 20160508_0000129 60.0 2016-05-08 05:17:11.500 # 7 3 2 IA BAKI BHE 0.0 20160508_0000129 60.0 2016-05-08 05:17:11.500 # 8 3 2 IA BAKI BHN 0.0 20160508_0000129 60.0 2016-05-08 05:17:11.500 # 9 3 2 IA BAKI BHZ 0.0 20160508_0000129 60.0 2016-05-08 05:17:11.500 # 4 2 1 n1 s c1 0.0 20160508_0000004 2.0 2016-05-08 01:45:30.300 # 5 2 1 n1 s c2 0.0 20160508_0000004 2.0 2016-05-08 01:45:30.300 # 6 2 1 n1 s c3 0.0 20160508_0000004 2.0 2016-05-08 01:45:30.300 # 10 4 2 n2 s c1 0.0 20160508_0000004 2.0 2016-05-08 01:45:30.300 # 11 4 2 n2 s c2 0.0 20160508_0000004 2.0 2016-05-08 01:45:30.300 # 12 4 2 n2 s c3 0.0 20160508_0000004 2.0 2016-05-08 01:45:30.300 # LEGEND: # cid = channel_id # sid = station_id # scid = datacenter_id # n, s, l, c = network, station, location, channel # ed = event_distance_deg # define a dc_dataselect_manager for open data only: dc_dataselect_manager = DcDataselectManager(datacenters_df, Authorizer(None), False) wtimespan = [1, 2] # in minutes expected = len(segments_df) # no segment on db, we should have all segments to download orig_segments_df = segments_df.copy() segments_df, request_timebounds_need_update = \ prepare_for_download(db.session, orig_segments_df, dc_dataselect_manager, wtimespan, retry_seg_not_found=True, retry_url_err=True, retry_mseed_err=True, retry_client_err=True, retry_server_err=True, retry_timespan_err=True, retry_timespan_warn=True) # segments_df # COLUMNS: # channel_id datacenter_id network station location channel event_distance_deg event_id arrival_time start_time end_time id download_status_code run_id # DATA (not aligned with columns): # channel_id datacenter_id network station location channel event_distance_deg event_id arrival_time start_time end_time id download_status_code run_id # GE.FLT1..HHE 1 1 GE FLT1 HHE 0.0 1 2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12 None None 1 # GE.FLT1..HHN 2 1 GE FLT1 HHN 0.0 1 2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12 None None 1 # GE.FLT1..HHZ 3 1 GE FLT1 HHZ 0.0 1 2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12 None None 1 # IA.BAKI..BHE 7 2 IA BAKI BHE 0.0 1 2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12 None None 1 # IA.BAKI..BHN 8 2 IA BAKI BHN 0.0 1 2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12 None None 1 # IA.BAKI..BHZ 9 2 IA BAKI BHZ 0.0 1 2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12 None None 1 # n1.s..c1 4 1 n1 s c1 0.0 2 2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31 None None 1 # n1.s..c2 5 1 n1 s c2 0.0 2 2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31 None None 1 # n1.s..c3 6 1 n1 s c3 0.0 2 2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31 None None 1 # n2.s..c1 10 2 n2 s c1 0.0 2 2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31 None None 1 # n2.s..c2 11 2 n2 s c2 0.0 2 2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31 None None 1 # n2.s..c3 12 2 n2 s c3 0.0 2 2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31 None None 1 # self._segdata is the folder file of a "valid" 3-channel miniseed # The channels are: # Thus, no match will be found and all segments will be written with a None # download 
status code # setup urlread: first three rows: ok # rows[3:6]: 413, retry them # rows[6:9]: malformed_data # rows[9:12] 413, retry them # then retry: # rows[3]: empty_data # rows[4]: data_with_gaps (but seed_id should not match) # rows[5]: data_with_gaps (seed_id should not match) # rows[9]: URLError # rows[10]: HTTP 500 error # rows[11]: 413 # NOTE THAT THIS RELIES ON THE FACT THAT THREADS ARE EXECUTED IN THE ORDER OF THE DATAFRAME # WHICH SEEMS TO BE THE CASE AS THERE IS ONE SINGLE PROCESS # self._seg_data[:2] is a way to mock corrupted data urlread_sideeffect = [self._seg_data, 413, self._seg_data[:2], 413, '', self._seg_data_gaps, self._seg_data_gaps, URLError("++urlerror++"), 500, 413] # Let's go: ztatz = self.download_save_segments(urlread_sideeffect, db.session, segments_df, dc_dataselect_manager, chaid2mseedid, self.run.id, False, request_timebounds_need_update, 1, 2, 3, db_bufsize=self.db_buf_size) # get the columns from db which we are interested in checking cols = [Segment.id, Segment.channel_id, Segment.datacenter_id, Segment.download_code, Segment.maxgap_numsamples, Segment.sample_rate, Segment.data_seed_id, Segment.data, Segment.download_id, Segment.request_start, Segment.request_end, Segment.start_time, Segment.end_time ] db_segments_df = dbquery2df(db.session.query(*cols)) assert Segment.download_id.key in db_segments_df.columns OUTTIME_ERR, OUTTIME_WARN = s2scodes.timespan_err, s2scodes.timespan_warn # assert no segment has data (time out of bounds): assert len(db_segments_df.loc[(~pd.isnull(db_segments_df[Segment.data.key])) & (db_segments_df[Segment.data.key].str.len() > 0), Segment.data.key]) == 0 # assert the number of "correctly" downloaded segments, i.e. with data (4), now has # code = OUTTIME_ERR assert len(db_segments_df[db_segments_df[Segment.download_code.key] == OUTTIME_ERR]) == 4 # re-sort db_segments_df to match the segments_df: ret = [] for cha in segments_df[Segment.channel_id.key]: ret.append(db_segments_df[db_segments_df[Segment.channel_id.key] == cha]) db_segments_df = pd.concat(ret, axis=0) # db_segments_df: # id channel_id datacenter_id download_status_code max_gap_ovlap_ratio sample_rate data_seed_id data run_id start_time end_time # 0 1 1 1 -3 0.0001 100.0 GE.FLT1..HHE b'' 1 2016-05-08 05:16:12 2016-05-08 05:19:12 # 1 2 2 1 -3 0.0001 100.0 GE.FLT1..HHN b'' 1 2016-05-08 05:16:12 2016-05-08 05:19:12 # 2 3 3 1 -3 0.0001 100.0 GE.FLT1..HHZ b'' 1 2016-05-08 05:16:12 2016-05-08 05:19:12 # 6 7 7 2 200.0 NaN NaN None 1 2016-05-08 05:16:12 2016-05-08 05:19:12 # 7 8 8 2 NaN NaN NaN None None 1 2016-05-08 05:16:12 2016-05-08 05:19:12 # 8 9 9 2 -3 20.0 20.0 IA.BAKI..BHZ b'' 1 2016-05-08 05:16:12 2016-05-08 05:19:12 # 3 4 4 1 -2.0 NaN NaN None None 1 2016-05-08 01:44:31 2016-05-08 01:47:31 # 4 5 5 1 -2.0 NaN NaN None None 1 2016-05-08 01:44:31 2016-05-08 01:47:31 # 5 6 6 1 -2.0 NaN NaN None None 1 2016-05-08 01:44:31 2016-05-08 01:47:31 # 9 10 10 2 -1.0 NaN NaN None None 1 2016-05-08 01:44:31 2016-05-08 01:47:31 # 10 11 11 2 500.0 NaN NaN None None 1 2016-05-08 01:44:31 2016-05-08 01:47:31 # 11 12 12 2 413.0 NaN NaN None None 1 2016-05-08 01:44:31 2016-05-08 01:47:31 # now modify the first row time bounds: # first we need to assign the database id to our segments_df, to prevent # db constraint errors when writing to db: # `download_save_segments` below needs to UPDATE the segments and it does it by # checking if an id is present.
# check that the channel_ids align: assert (segments_df[Segment.channel_id.key].values == db_segments_df[Segment.channel_id.key].values).all() # so that we can simply do this: segments_df[Segment.id.key] = db_segments_df[Segment.id.key] # first read the miniseed: stream = read(BytesIO(self._seg_data)) tstart = stream[0].stats.starttime.datetime tend = stream[0].stats.endtime.datetime segments_df.loc[segments_df[Segment.channel_id.key] == 1, Segment.request_start.key] = tstart segments_df.loc[segments_df[Segment.channel_id.key] == 1, Segment.request_end.key] = tstart + (tend-tstart)/2 segments_df.loc[segments_df[Segment.channel_id.key] == 2, Segment.request_start.key] = tstart segments_df.loc[segments_df[Segment.channel_id.key] == 2, Segment.request_end.key] = tend # build a segments_df of the three segments belonging to the same station (channels 1, 2, 3) # copy at the end to avoid pandas SettingWithCopyWarning new_segments_df = \ segments_df.loc[segments_df[Segment.channel_id.key].isin([1, 2, 3]), :].copy() # change urlread_side_effect to provide, for the first three segments, the same # sequence of bytes. The sequence actually is OK, but in the first case it will be # PARTIALLY saved, in the second case TOTALLY, and in the third case NOT AT ALL: urlread_sideeffect = [self._seg_data, self._seg_data, self._seg_data] # define a dc_dataselect_manager for open data only: dc_dataselect_manager = DcDataselectManager(datacenters_df, Authorizer(None), False) ztatz = self.download_save_segments(urlread_sideeffect, db.session, new_segments_df, dc_dataselect_manager, chaid2mseedid, self.run.id, False, request_timebounds_need_update, 1, 2, 3, db_bufsize=self.db_buf_size) db_segments_df = dbquery2df(db.session.query(*cols)) # re-sort db_segments_df to match the segments_df: ret = [db_segments_df[db_segments_df[Segment.channel_id.key] == cha] for cha in segments_df[Segment.channel_id.key]] db_segments_df = pd.concat(ret, axis=0) # assert the 1st segment whose time range has been modified has data, BUT # download_status_code is OUTTIME_WARN (data exceeds the requested time window) df__ = db_segments_df.loc[db_segments_df[Segment.channel_id.key] == 1, :] assert len(df__) == 1 row__ = df__.iloc[0] assert row__[Segment.download_code.key] == OUTTIME_WARN assert len(row__[Segment.data.key]) > 0 # assert the 2nd segment whose time range has been modified has data, AND # download_status_code 200 (ok) df__ = db_segments_df.loc[db_segments_df[Segment.channel_id.key] == 2, :] assert len(df__) == 1 row__ = df__.iloc[0] assert row__[Segment.download_code.key] == 200 assert len(row__[Segment.data.key]) > 0 # assert the 3rd segment whose time range has NOT been modified has no data, # AND download_status_code is still OUTTIME_ERR df__ = db_segments_df.loc[db_segments_df[Segment.channel_id.key] == 3, :] assert len(df__) == 1 row__ = df__.iloc[0] assert row__[Segment.download_code.key] == OUTTIME_ERR assert len(row__[Segment.data.key]) == 0
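# [Illustrative sketch] The assertions above distinguish three outcomes when comparing the
# received waveform span with the requested time window: plain 200 (data within the
# request), a time span warning (data saved but spilling outside the request) and a time
# span error (data entirely outside the request, hence discarded). classify_timebounds is
# a simplified classifier consistent with those assertions, not the library's actual
# implementation (the real codes live in s2scodes):
def classify_timebounds(data_start, data_end, req_start, req_end):
    """Return 'err', 'warn' or 'ok' for data [data_start, data_end] vs. the request."""
    if data_end <= req_start or data_start >= req_end:
        return 'err'   # no overlap at all (analogous to s2scodes.timespan_err)
    if req_start <= data_start and data_end <= req_end:
        return 'ok'    # fully inside the requested window (plain 200)
    return 'warn'      # overlapping but exceeding the window (analogous to timespan_warn)


if __name__ == "__main__":
    from datetime import datetime as dt
    req = (dt(2016, 5, 8, 5, 16, 12), dt(2016, 5, 8, 5, 19, 12))
    assert classify_timebounds(dt(2016, 1, 4), dt(2016, 1, 4, 1), *req) == 'err'
    assert classify_timebounds(dt(2016, 5, 8, 5, 17), dt(2016, 5, 8, 5, 18), *req) == 'ok'
    assert classify_timebounds(dt(2016, 5, 8, 5, 15), dt(2016, 5, 8, 5, 18), *req) == 'warn'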
def test_retry2(self, mock_get_opener, mock_get_data_from_token, mock_get_data_from_userpass, mock_get_data_open, mock_updatedf, mock_insertdf, mock_mseed_unpack, mock_download_save_segments, mock_save_inventories, mock_get_channels_df, mock_get_datacenters_df, mock_get_events_df, # fixtures: db, clirunner, pytestdir, yamlfile): mock_get_events_df.side_effect = lambda *a, **v: self.get_events_df(None, *a, **v) mock_get_datacenters_df.side_effect = \ lambda *a, **v: self.get_datacenters_df(None, *a, **v) mock_get_channels_df.side_effect = lambda *a, **v: self.get_channels_df(None, *a, **v) mock_save_inventories.side_effect = lambda *a, **v: self.save_inventories(None, *a, **v) RESPONSES = [URLError('abc')] mock_download_save_segments.side_effect = \ lambda *a, **v: self.download_save_segments(RESPONSES, *a, **v) # mseed unpack is mocked by accepting only first arg (so that time bounds are not # considered) mock_mseed_unpack.side_effect = lambda *a, **v: unpack(a[0]) mock_insertdf.side_effect = lambda *a, **v: insertdf(*a, **v) mock_updatedf.side_effect = lambda *a, **v: updatedf(*a, **v) # prevlen = len(db.session.query(Segment).all()) # patching class methods while preserving the original call requires storing once # the original methods (as class attributes). Sets the side effect of the mocked method # to those class attributes as to preserve the original functionality # and be able to assert mock_* functions are called and so on # For info see: # https://stackoverflow.com/a/29563665 mock_get_data_open.side_effect = self.dc_get_data_open mock_get_data_from_userpass.side_effect = self.dc_get_data_from_userpass mock_get_data_from_token.side_effect = \ lambda *a, **kw: self.dc_get_data_from_token(['a:b', 'c:d'], *a, **kw) # TEST 1: provide a file with valid token: tokenfile = pytestdir.newfile(create=True) with open(tokenfile, 'w') as fh: fh.write('BEGIN PGP MESSAGE') # mock yaml_load to override restricted_data: # USERPASS good for both datacenter: mock_get_data_open.reset_mock() mock_get_data_from_token.reset_mock() mock_get_data_from_userpass.reset_mock() mock_get_opener.reset_mock() mock_get_data_from_token.side_effect = \ lambda *a, **kw: self.dc_get_data_from_token(['uzer:pazzword', 'uzer:pazzword'], *a, **kw) yaml_file = yamlfile(restricted_data=os.path.abspath(tokenfile), retry_client_err=False) result = clirunner.invoke(cli, ['download', '-c', yaml_file, '--dburl', db.dburl, '--start', '2016-05-08T00:00:00', '--end', '2016-05-08T9:00:00']) assert clirunner.ok(result) # get db data, sort by index and reset index to assure comparison across data frames: seg_df = dbquery2df(db.session.query(Segment.id, Segment.download_code, Segment.queryauth, Segment.download_id))\ .sort_values(by=[Segment.id.key]).reset_index(drop=True) # seg_df: # id download_code queryauth download_id # 1 -1 True 2 # 2 -1 True 2 # 3 -1 True 2 # 4 -1 True 2 # 5 -1 True 2 # 6 -1 True 2 # 7 -1 True 2 # 8 -1 True 2 # 9 -1 True 2 # 10 -1 True 2 # 11 -1 True 2 # 12 -1 True 2 urlerr, mseederr = s2scodes.url_err, s2scodes.mseed_err # according to our mock, we should have all urlerr codes: assert (seg_df[Segment.download_code.key] == urlerr).all() assert (seg_df[Segment.queryauth.key] == True).all() DOWNLOADID = 2 assert (seg_df[Segment.download_id.key] == DOWNLOADID).all() # other assertions: assert 'restricted_data: %s' % os.path.abspath(tokenfile) in result.output assert 'STEP 5 of 8: Acquiring credentials from token' in result.output # assert we print that we are downloading open and restricted data: assert 
re.search(r'STEP 7 of 8\: Downloading \d+ segments and saving to db', result.output) assert not mock_get_data_open.called assert mock_get_data_from_token.called assert not mock_get_data_from_userpass.called # no credentials failed: assert "Downloading open data only from: " not in result.output # Ok, test retry: new_seg_df = seg_df.copy() # first get run id # we have 12 segments, change the download codes. The second boolean # value denotes queryauth (True or False): code_queryauth = [(204, False), (204, True), (404, False), (404, True), (401, False), (401, True), (403, False), (403, True), (400, True), (400, False), (None, False), (None, True)] for id_, (dc_, qa_) in zip(seg_df[Segment.id.key].tolist(), code_queryauth): seg = db.session.query(Segment).filter(Segment.id == id_).first() seg.download_code = dc_ seg.queryauth = qa_ # set expected values (see also yamlfile below) # remember that any segment download will give urlerr as code expected_new_download_code = dc_ expected_download_id = DOWNLOADID if dc_ in (204, 404, 401, 403) and qa_ is False: # to retry becaue they failed due to authorization problems # (or most likely they did) expected_new_download_code = urlerr expected_download_id = DOWNLOADID + 1 elif dc_ is None or (dc_ < 400 and dc_ >= 500): # to retry because of the flags (see yamlfile below) expected_new_download_code = urlerr expected_download_id = DOWNLOADID + 1 expected_query_auth = qa_ if dc_ == 400 else True new_seg_df.loc[new_seg_df[Segment.id.key] == id_, :] = \ (id_, expected_new_download_code, expected_query_auth, expected_download_id) db.session.commit() # re-download and check what we have retried: yaml_file = yamlfile(restricted_data=os.path.abspath(tokenfile), retry_seg_not_found=True, retry_client_err=False) result = clirunner.invoke(cli, ['download', '-c', yaml_file, '--dburl', db.dburl, '--start', '2016-05-08T00:00:00', '--end', '2016-05-08T9:00:00']) DOWNLOADID += 1 assert clirunner.ok(result) # get db data, sort by index and reset index to assure comparison across data frames: seg_df2 = dbquery2df(db.session.query(Segment.id, Segment.download_code, Segment.queryauth, Segment.download_id))\ .sort_values(by=[Segment.id.key]).reset_index(drop=True) # seg_df2: # id download_code queryauth download_id # 1 -1 True 3 # 2 204 True 2 # 3 -1 True 3 # 4 404 True 2 # 5 -1 True 3 # 6 401 True 2 # 7 -1 True 3 # 8 403 True 2 # 9 400 True 2 # 10 400 False 2 # 11 -1 True 3 # 12 -1 True 3 pd.testing.assert_frame_equal(seg_df2, new_seg_df) # Another retry without modifyiung the segments but setting retry_client_err to True # re-download and check what we have retried: yaml_file = yamlfile(restricted_data=os.path.abspath(tokenfile), retry_seg_not_found=True, retry_client_err=True) result = clirunner.invoke(cli, ['download', '-c', yaml_file, '--dburl', db.dburl, '--start', '2016-05-08T00:00:00', '--end', '2016-05-08T9:00:00']) DOWNLOADID += 1 assert clirunner.ok(result) # get db data, sort by index and reset index to assure comparison across data frames: seg_df3 = dbquery2df(db.session.query(Segment.id, Segment.download_code, Segment.queryauth, Segment.download_id))\ .sort_values(by=[Segment.id.key]).reset_index(drop=True) expected_df = seg_df2.copy() # modify all 4xx codes as they are updated. 
Note that old urlerr codes have the old # download id (do not override) old_4xx = expected_df[Segment.download_code.key].between(400, 499.999) expected_df.loc[old_4xx, Segment.download_id.key] = DOWNLOADID expected_df.loc[old_4xx, Segment.queryauth.key] = True expected_df.loc[old_4xx, Segment.download_code.key] = urlerr # seg_df3: # id download_code queryauth download_id # 1 -1 True 3 # 2 204 True 2 # 3 -1 True 3 # 4 -1 True 4 # 5 -1 True 3 # 6 -1 True 4 # 7 -1 True 3 # 8 -1 True 4 # 9 -1 True 4 # 10 -1 True 4 # 11 -1 True 3 # 12 -1 True 3 pd.testing.assert_frame_equal(seg_df3, expected_df) old_urlerr_segids = seg_df2[seg_df2[Segment.download_code.key] == urlerr][Segment.id.key] new_urlerr_df = expected_df[expected_df[Segment.id.key].isin(old_urlerr_segids)] assert (new_urlerr_df[Segment.download_id.key] == 3).all()
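# [Illustrative sketch] test_retry2 above asserts which stored segments get re-downloaded
# given their previous download code, whether they were fetched with authentication, and
# the retry_* flags passed in the configuration. should_retry is a simplified decision
# function covering only the cases exercised above; it is not the library's actual
# implementation (that logic lives in prepare_for_download):
def should_retry(download_code, was_queryauth, has_credentials,
                 retry_seg_not_found=False, retry_client_err=False,
                 retry_server_err=False):
    if download_code is None:
        return retry_seg_not_found  # segment never (successfully) requested
    if has_credentials and not was_queryauth and download_code in (204, 401, 403, 404):
        return True                 # previously fetched as open data: retry with credentials
    if 400 <= download_code < 500:
        return retry_client_err
    if download_code >= 500:
        return retry_server_err
    return False


if __name__ == "__main__":
    # mirrors two of the expectations encoded in code_queryauth above:
    assert should_retry(404, was_queryauth=False, has_credentials=True) is True
    assert should_retry(404, was_queryauth=True, has_credentials=True,
                        retry_client_err=False) is False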
def init(self, request, db, data): # re-init a sqlite database (no-op if the db is not sqlite): db.create(to_file=False) # setup a run_id: rdw = Download() db.session.add(rdw) db.session.commit() self.run = rdw # side effects: self._evt_urlread_sideeffect = """#EventID | Time | Latitude | Longitude | Depth/km | Author | Catalog | Contributor | ContributorID | MagType | Magnitude | MagAuthor | EventLocationName 20160508_0000129|2016-05-08 05:17:11.500000|1|1|60.0|AZER|EMSC-RTS|AZER|505483|ml|3|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN 20160508_0000004|2016-05-08 01:45:30.300000|90|90|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|4|EMSC|CROATIA """ self._dc_urlread_sideeffect = """http://geofon.gfz-potsdam.de/fdsnws/dataselect/1/query ZZ * * * 2002-09-01T00:00:00 2005-10-20T00:00:00 UP ARJ * * 2013-08-01T00:00:00 2017-04-25 http://ws.resif.fr/fdsnws/dataselect/1/query ZU * * HHZ 2015-01-01T00:00:00 2016-12-31T23:59:59.999999 """ # Note: by default we set sta_urlsideeffect to return such a channels which result in 12 # segments (see lat and lon of channels vs lat and lon of events above) self._sta_urlread_sideeffect = [ """#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime GE|FLT1||HHE|1|1|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2003-01-01T00:00:00| GE|FLT1||HHN|1|1|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2003-01-01T00:00:00| GE|FLT1||HHZ|1|1|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2003-01-01T00:00:00| n1|s||c1|90|90|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00| n1|s||c2|90|90|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00| n1|s||c3|90|90|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00| """, """#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime IA|BAKI||BHE|1|1|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2003-01-01T00:00:00| IA|BAKI||BHN|1|1|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2003-01-01T00:00:00| IA|BAKI||BHZ|1|1|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2003-01-01T00:00:00| n2|s||c1|90|90|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00| n2|s||c2|90|90|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00| n2|s||c3|90|90|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00| """ ] self._mintraveltime_sideeffect = cycle([1]) self._seg_data = data.read("GE.FLT1..HH?.mseed") self._seg_data_gaps = data.read("IA.BAKI..BHZ.D.2016.004.head") self._seg_data_empty = b'' self._seg_urlread_sideeffect = [ self._seg_data, self._seg_data_gaps, 413, 500, self._seg_data[:2], self._seg_data_empty, 413, URLError("++urlerror++"), socket.timeout() ] self.service = '' # so get_datacenters_df accepts any row by default self.db_buf_size = 1 self.routing_service = yaml_load(get_templates_fpath("download.yaml"))\ ['advanced_settings']['routing_service_url'] # NON db stuff (logging, patchers, pandas...): self.logout = StringIO() handler = StreamHandler(stream=self.logout) self._logout_cache = "" # THIS IS A HACK: query_logger.setLevel(logging.INFO) # 
necessary to forward to handlers # if we called closing() (we are testing the whole chain) the level will be reset # (to logging.INFO), otherwise it stays at what we set two lines above. Problems might arise # if closing sets a different level, but for the moment who cares query_logger.addHandler(handler) # when debugging, I want the full dataframe with to_string(), not truncated # NOTE: this messes up right alignment of numbers in DownloadStats (see utils.py) # FIRST, remember current settings and restore them in cleanup: _pd_display_maxcolwidth = pd.get_option('display.max_colwidth') pd.set_option('display.max_colwidth', -1) # define class level patchers (we do not use a yield as we need to do more stuff in the # finalizer, see below): patchers = [] patchers.append(patch('stream2segment.utils.url.urlopen')) self.mock_urlopen = patchers[-1].start() # mock ThreadPool (tp) to run one instance at a time, so we get deterministic results: class MockThreadPool(object): def __init__(self, *a, **kw): pass def imap(self, func, iterable, *args): # make imap deterministic: same as standard python map: # everything is executed in a single thread in the right input order return map(func, iterable) def imap_unordered(self, func_, iterable, *args): # make imap_unordered deterministic: same as standard python map: # everything is executed in a single thread in the right input order return map(func_, iterable) def close(self, *a, **kw): pass # assign patches and mocks: patchers.append(patch('stream2segment.utils.url.ThreadPool')) self.mock_tpool = patchers[-1].start() self.mock_tpool.side_effect = MockThreadPool # add finalizer: def delete(): pd.set_option('display.max_colwidth', _pd_display_maxcolwidth) for patcher in patchers: patcher.stop() hndls = query_logger.handlers[:] handler.close() for h in hndls: if h is handler: query_logger.removeHandler(h) request.addfinalizer(delete)
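# [Illustrative sketch] The fixture above attaches a StreamHandler writing to a StringIO
# (self.logout) and keeps a _logout_cache string so that tests can inspect what
# query_logger emitted (via the self.log_msg() helper used in the tests). LogCapture is a
# self-contained version of that pattern; the real helper may differ:
import logging as _logging
from io import StringIO as _StringIO
from logging import StreamHandler as _StreamHandler

class LogCapture(object):
    def __init__(self, logger):
        self.buffer = _StringIO()
        self.handler = _StreamHandler(stream=self.buffer)
        logger.addHandler(self.handler)
        logger.setLevel(_logging.INFO)  # ensure records reach the handler

    def log_msg(self):
        """Return everything logged so far."""
        return self.buffer.getvalue()


if __name__ == "__main__":
    logger = _logging.getLogger("log_capture_sketch")
    capture = LogCapture(logger)
    logger.info("1 database row(s) not inserted")
    assert "not inserted" in capture.log_msg()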
def test_get_events(self, mock_urljoin, db): urlread_sideeffect = [ """#1|2|3|4|5|6|7|8|9|10|11|12|13 20160508_0000129|2016-05-08 05:17:11.500000|40.57|52.23|60.0|AZER|EMSC-RTS|AZER|505483|ml|3.1|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN 20160508_0000004|2016-05-08 01:45:30.300000|44.96|15.35|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|3.6|EMSC|CROATIA 20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA 20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA --- ERRROR --- THIS IS MALFORMED 20160508_abc0113|2016-05-08 22:37:20.100000| --- ERROR --- |26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA """ ] data = self.get_events_df(urlread_sideeffect, db.session, "http://eventws", {}, datetime.utcnow() - timedelta(seconds=1), datetime.utcnow(), db_bufsize=self.db_buf_size) # assert only the first three (unique, well-formed) events were successfully saved assert len(db.session.query(Event).all()) == len(pd.unique(data['id'])) == \ len(data) == 3 # check that log has notified: log1 = self.log_msg() assert "20160508_0000113" in log1 assert "1 database row(s) not inserted" in log1 assert mock_urljoin.call_count == 1 mock_urljoin.reset_mock() # now download again, with a URL error: urlread_sideeffect = [ 504, """1|2|3|4|5|6|7|8|9|10|11|12|13 20160508_0000129|2016-05-08 05:17:11.500000|40.57|52.23|60.0|AZER|EMSC-RTS|AZER|505483|ml|3.1|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN 20160508_0000004|2016-05-08 01:45:30.300000|44.96|15.35|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|3.6|EMSC|CROATIA 20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA 20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA --- ERRROR --- THIS IS MALFORMED 20160508_abc0113|2016-05-08 22:37:20.100000| --- ERROR --- |26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA """, URLError('blabla23___') ] with pytest.raises(FailedDownload) as fld: data = self.get_events_df(urlread_sideeffect, db.session, "http://eventws", {}, datetime.utcnow() - timedelta(seconds=1), datetime.utcnow(), db_bufsize=self.db_buf_size) # assert we got the same result as above: assert len(db.session.query(Event).all()) == len(pd.unique(data['id'])) == \ len(data) == 3 log2 = self.log_msg() # the log text has the message about the second (successful) download, with the # two rows discarded: assert "2 row(s) discarded" in log2 # test that the exception has the expected message: assert "Unable to fetch events" in str(fld) # check that we split once, thus we called mock_urljoin two more times # (plus the first call): assert mock_urljoin.call_count == 3 mock_urljoin.reset_mock() # now download again, with a recursion error (max iterations reached): urlread_sideeffect = [413] with pytest.raises(FailedDownload) as fld: data = self.get_events_df(urlread_sideeffect, db.session, "http://eventws", {}, datetime.utcnow() - timedelta(seconds=1), datetime.utcnow(), db_bufsize=self.db_buf_size) # assert we got the same result as above: assert len(db.session.query(Event).all()) == len(pd.unique(data['id'])) == \ len(data) == 3 log2 = self.log_msg() # check the log notification: assert "Request seems to be too large" in log2 # assertion on exception: assert "Unable to fetch events" in str(fld) assert "maximum recursion depth reached" in str(fld)
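# [Illustrative sketch] The last sub-test above exercises the strategy of halving the
# requested time window and retrying when the event web service replies 413 (request too
# large), giving up after too many splits. fetch_events below is a minimal standalone
# version of that recursion; `fetch` is a hypothetical callable returning either the
# response text or the integer 413, and the error message merely echoes the phrases
# asserted above:
def fetch_events(fetch, start, end, max_depth=2, _depth=0):
    response = fetch(start, end)
    if response != 413:
        return [response]
    if _depth >= max_depth:
        raise ValueError("Request seems to be too large: maximum recursion depth reached")
    mid = start + (end - start) / 2
    # split the time window in two halves and recurse on each:
    return (fetch_events(fetch, start, mid, max_depth, _depth + 1) +
            fetch_events(fetch, mid, end, max_depth, _depth + 1))


if __name__ == "__main__":
    from datetime import datetime as dt
    t0, t1 = dt(2016, 5, 8), dt(2016, 5, 9)
    # one 413 followed by successes: a single split, i.e. two further requests:
    responses = iter([413, "events A", "events B"])
    assert fetch_events(lambda s, e: next(responses), t0, t1) == ["events A", "events B"]
    # always answering 413 exhausts the allowed splits, as in the last sub-test above:
    try:
        fetch_events(lambda s, e: 413, t0, t1)
        assert False, "expected ValueError"
    except ValueError as exc:
        assert "too large" in str(exc)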