Example 1
    def test_general_exception_inside_yield(self):
        data = [
            b'none', b''
        ] * 10000  # supply an empty string otherwise urllib.read does not stop
        self.config_urlopen(data)  # , sleep_time=1)

        # self.urls has a valid url (which should execute onsuccess) and an invalid one
        # (which should execute onerror)
        with pytest.raises(KeyboardInterrupt):
            self.read_async_raise_exc_in_called_func(self.urls)
        assert self.progress == 0
        # set the totalcounts of mock_urlread: 2 * len(url):
        totalcounts = 2 * len(self.urls)
        # assert we stopped before reading all url(s). Relax the condition with <=:
        # self.mock_urlread.call_count == totalcounts does not necessarily mean the test
        # failed, since we mock the io-bound urlread with non-io-bound operations
        assert self.mock_urlread.call_count <= totalcounts
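        # (the factor 2 comes from the mocked `data` above: each url would need two
        # urlread calls, one returning b'none' and one returning the empty bytes that
        # stop the read loop)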

        # same regardless of urllib2 returned value:
        self.config_urlopen([URLError("")], sleep_time=None)
        # self.urls has a valid url (which should execute onsuccess) and an invalid one
        # (which should execute onerror)
        with pytest.raises(KeyboardInterrupt):
            self.read_async_raise_exc_in_called_func(self.urls)
        assert self.progress == 0
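
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the test above): the behaviour exercised
# here -- an exception raised inside the per-url callback aborts the whole
# asynchronous read loop, so that only a fraction of the urls is actually
# read -- can be emulated with concurrent.futures. All names below are
# hypothetical; how the original read_async maps the callback exception to a
# KeyboardInterrupt is specific to the library under test and not reproduced.
from concurrent.futures import ThreadPoolExecutor, as_completed


def read_async_sketch(urls, urlread, ondone):
    """Read `urls` concurrently and call `ondone(url, data)` for each of them,
    stopping as soon as a callback raises."""
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(urlread, url): url for url in urls}
        for future in as_completed(futures):
            # an exception raised by `ondone` (or by `future.result()`)
            # propagates here; the `with` block then shuts the executor down,
            # so the remaining urls are never delivered to the callback
            ondone(futures[future], future.result())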
Example 2
    def test_urlerrors(self):
        """Tests onerror. WE mock urllib2urlopen.read to raise an excpected Exception"""

        self.config_urlopen([URLError("")])

        # self.urls has a valid url (which should execute onsuccess) and an invalid one
        # (which should execute onerror)
        self.read_async(self.urls)

        assert len(self.errors) == 2
        assert self.mock_urlread.call_count == len(self.urls)

        assert self.progress == 2
    def test_download_save_segments(self, mock_updatedf, mock_insertdf, mseed_unpack, db,
                                    tt_ak135_tts):
        # prepare:
        # mseed unpack takes no starttime and endtime arguments, so that
        # we do not discard any correct chunk
        mseed_unpack.side_effect = lambda *a, **v: unpack(a[0])
        mock_insertdf.side_effect = lambda *a, **v: insertdf(*a, **v)
        mock_updatedf.side_effect = lambda *a, **v: updatedf(*a, **v)
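        # (mock_insertdf / mock_updatedf delegate to the real insertdf / updatedf, so the
        # db is actually written, while still recording the calls for the assertions below)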

        urlread_sideeffect = None  # use defaults from class
        events_df = self.get_events_df(urlread_sideeffect, db.session)
        net, sta, loc, cha = [], [], [], []
        datacenters_df, eidavalidator = \
            self.get_datacenters_df(urlread_sideeffect, db.session, self.service,
                                    self.routing_service, net, sta, loc, cha,
                                    db_bufsize=self.db_buf_size)
        channels_df = self.get_channels_df(urlread_sideeffect, db.session,
                                           datacenters_df,
                                           eidavalidator,
                                           net, sta, loc, cha, None, None, 10,
                                           False, None, None, -1, self.db_buf_size)
        assert len(channels_df) == 12  # just to be sure. If failing, we might have changed the class default
    # events_df
#                  id  magnitude  latitude  longitude  depth_km  time
# 0  20160508_0000129        3.0       1.0        1.0      60.0  2016-05-08 05:17:11.500
# 1  20160508_0000004        4.0       2.0        2.0       2.0  2016-05-08 01:45:30.300

# channels_df (index not shown):
# columns:
# id  station_id  latitude  longitude  datacenter_id start_time end_time network station location channel
# data (not aligned with columns):
# 1   1  1.0   1.0   1 2003-01-01 NaT  GE  FLT1    HHE
# 2   1  1.0   1.0   1 2003-01-01 NaT  GE  FLT1    HHN
# 3   1  1.0   1.0   1 2003-01-01 NaT  GE  FLT1    HHZ
# 4   2  90.0  90.0  1 2009-01-01 NaT  n1  s       c1
# 5   2  90.0  90.0  1 2009-01-01 NaT  n1  s       c2
# 6   2  90.0  90.0  1 2009-01-01 NaT  n1  s       c3
# 7   3  1.0   1.0   2 2003-01-01 NaT  IA  BAKI    BHE
# 8   3  1.0   1.0   2 2003-01-01 NaT  IA  BAKI    BHN
# 9   3  1.0   1.0   2 2003-01-01 NaT  IA  BAKI    BHZ
# 10  4  90.0  90.0  2 2009-01-01 NaT  n2  s       c1
# 11  4  90.0  90.0  2 2009-01-01 NaT  n2  s       c2
# 12  4  90.0  90.0  2 2009-01-01 NaT  n2  s       c3

        assert all(_ in channels_df.columns for _ in [Station.network.key, Station.station.key,
                                                      Channel.location.key, Channel.channel.key])
        chaid2mseedid = chaid2mseedid_dict(channels_df)
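        # chaid2mseedid_dict presumably builds a channel_id -> "NET.STA.LOC.CHA" mapping
        # (cf. the seed ids used as index in the comment tables further below); this is an
        # assumption based on how the mapping is passed to download_save_segments later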
        # check that we removed the columns:
        assert not any(_ in channels_df.columns for _ in
                       [Station.network.key, Station.station.key,
                        Channel.location.key, Channel.channel.key])

        # take all segments:
        # use minmag and maxmag
        ttable = tt_ak135_tts
        segments_df = merge_events_stations(events_df, channels_df, dict(minmag=10, maxmag=10,
                                            minmag_radius=10, maxmag_radius=10), tttable=ttable)

        assert len(pd.unique(segments_df['arrival_time'])) == 2

        h = 9

# segments_df (index not shown). Note that
# cid sid did n   s    l  c    ed   event_id          depth_km                time  <- LAST TWO ARE Event related columns that will be removed after arrival_time calculations
# 1   1   1   GE  FLT1    HHE  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 2   1   1   GE  FLT1    HHN  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 3   1   1   GE  FLT1    HHZ  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 7   3   2   IA  BAKI    BHE  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 8   3   2   IA  BAKI    BHN  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 9   3   2   IA  BAKI    BHZ  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 4   2   1   n1  s       c1   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 5   2   1   n1  s       c2   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 6   2   1   n1  s       c3   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 10  4   2   n2  s       c1   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 11  4   2   n2  s       c2   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 12  4   2   n2  s       c3   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300

# LEGEND:
# cid = channel_id
# sid = station_id
# did = datacenter_id
# n, s, l, c = network, station, location, channel
# ed = event_distance_deg

        # define a dc_dataselect_manager for open data only:
        dc_dataselect_manager = DcDataselectManager(datacenters_df, Authorizer(None), False)

        wtimespan = [1,2]
        expected = len(segments_df)  # no segment on db, we should have all segments to download
        orig_segments_df = segments_df.copy()
        segments_df, request_timebounds_need_update = \
            prepare_for_download(db.session, orig_segments_df, dc_dataselect_manager, wtimespan,
                                 retry_seg_not_found=True,
                                 retry_url_err=True,
                                 retry_mseed_err=True,
                                 retry_client_err=True,
                                 retry_server_err=True,
                                 retry_timespan_err=True,
                                 retry_timespan_warn=True)

# segments_df
# COLUMNS:
# channel_id  datacenter_id network station location channel event_distance_deg event_id arrival_time start_time end_time id download_status_code run_id
# DATA (not aligned with columns):
#               channel_id  datacenter_id network station location channel  event_distance_deg  event_id            arrival_time          start_time            end_time    id download_status_code  run_id
# GE.FLT1..HHE  1           1              GE      FLT1             HHE     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# GE.FLT1..HHN  2           1              GE      FLT1             HHN     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# GE.FLT1..HHZ  3           1              GE      FLT1             HHZ     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# IA.BAKI..BHE  7           2              IA      BAKI             BHE     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# IA.BAKI..BHN  8           2              IA      BAKI             BHN     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# IA.BAKI..BHZ  9           2              IA      BAKI             BHZ     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# n1.s..c1      4           1              n1      s                c1      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n1.s..c2      5           1              n1      s                c2      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n1.s..c3      6           1              n1      s                c3      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n2.s..c1      10          2              n2      s                c1      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n2.s..c2      11          2              n2      s                c2      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n2.s..c3      12          2              n2      s                c3      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1

        # self._seg_data is the content of a "valid" 3-channel miniseed file
        # The channels are:
        # Thus, no match will be found and all segments will be written with a None
        # download status code

        # setup urlread: first three rows: ok
        # rows[3:6]: 413, retry them
        # rows[6:9]: malformed_data
        # rows[9:12] 413, retry them
        # then retry:
        # rows[3]: empty_data
        # rows[4]: data_with_gaps (but seed_id should not match)
        # rows[5]: data_with_gaps (seed_id should not match)
        # rows[9]: URLError
        # rows[10]: Http 500 error
        # rows[11]: 413

        # NOTE THAT THIS RELIES ON THE FACT THAT THREADS ARE EXECUTED IN THE ORDER OF THE DATAFRAME
        # WHICH SEEMS TO BE THE CASE AS THERE IS ONE SINGLE PROCESS
        # self._seg_data[:2] is a way to mock data corrupted
        urlread_sideeffect = [self._seg_data, 413, self._seg_data[:2], 413,
                              '', self._seg_data_gaps, self._seg_data_gaps,
                              URLError("++urlerror++"), 500, 413]
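        # (assumption about the mocked urlread: each element of the list above is consumed
        # as the response of one request: bytes/str are returned as the response payload,
        # ints become the corresponding HTTP error code (413, 500), and exceptions are raised)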
        # Let's go:
        ztatz = self.download_save_segments(urlread_sideeffect, db.session, segments_df,
                                            dc_dataselect_manager,
                                            chaid2mseedid,
                                            self.run.id, False,
                                            request_timebounds_need_update,
                                            1, 2, 3, db_bufsize=self.db_buf_size)
        # get the columns from the db that we are interested in checking
        cols = [Segment.id, Segment.channel_id, Segment.datacenter_id,
                Segment.download_code, Segment.maxgap_numsamples,
                Segment.sample_rate, Segment.data_seed_id, Segment.data, Segment.download_id,
                Segment.request_start, Segment.request_end, Segment.start_time, Segment.end_time
                ]
        db_segments_df = dbquery2df(db.session.query(*cols))
        assert Segment.download_id.key in db_segments_df.columns

        # change data column otherwise we cannot display db_segments_df.
        # When there is data just print "data"
        db_segments_df.loc[(~pd.isnull(db_segments_df[Segment.data.key])) &
                           (db_segments_df[Segment.data.key].str.len() > 0),
                           Segment.data.key] = b'data'

        # assert we have 4 segments with "data" properly set:
        assert len(db_segments_df.loc[(~pd.isnull(db_segments_df[Segment.data.key])) &
                                      (db_segments_df[Segment.data.key].str.len() > 0),
                                      Segment.data.key]) == 4

        # re-sort db_segments_df to match the segments_df:
        ret = []
        for cha in segments_df[Segment.channel_id.key]:
            ret.append(db_segments_df[db_segments_df[Segment.channel_id.key] == cha])
        db_segments_df = pd.concat(ret, axis=0)
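        # (re-ordering by channel id keeps the db rows in the same order as segments_df,
        # so that the positional iloc-based assertions below line up with the comment table)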

# db_segments_df:
#    id  channel_id  datacenter_id  download_status_code  max_gap_ovlap_ratio  sample_rate data_seed_id     data  run_id          start_time            end_time
# 0  1   1           1              200.0                 0.0001               100.0        GE.FLT1..HHE    data  1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 1  2   2           1              200.0                 0.0001               100.0        GE.FLT1..HHN    data  1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 2  3   3           1              200.0                 0.0001               100.0        GE.FLT1..HHZ    data  1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 6  7   7           2              200.0                 NaN                  NaN          None                  1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 7  8   8           2              NaN                   NaN                  NaN          None            None  1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 8  9   9           2              200.0                 20.0                 20.0         IA.BAKI..BHZ    data  1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 3  4   4           1             -2.0                   NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 4  5   5           1             -2.0                   NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 5  6   6           1             -2.0                   NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 9  10  10          2              -1.0                  NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 10 11  11          2              500.0                 NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 11 12  12          2              413.0                 NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31

        assert len(ztatz) == len(datacenters_df)
        assert len(db_segments_df) == len(segments_df)
        assert mock_updatedf.call_count == 0

        dsc = db_segments_df[Segment.download_code.key]
        exp_dsc = np.array([200, 200, 200, 200, np.nan, 200, -2, -2, -2, -1, 500, 413])
        assert ((dsc == exp_dsc) | (np.isnan(dsc) & np.isnan(exp_dsc))).all()
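        # (NaN never compares equal to NaN, hence the explicit isnan term above to treat
        # two missing download codes as equal)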
        # we have 12 segments and a buffer size of self.db_buf_size (=1, but it might
        # change in the future), so the number of insertdf calls depends on that buffer size

        # test that we correctly called mock_insertdf. Note that we assume that the
        # latter is called ONLY inside DbManager. To test that, as the number of rows
        # to be added (the length of the dataframes) varies, we need to implement a counter here:
        mock_insertdf_call_count = 0
        _bufzise = 0
        for c in mock_insertdf.call_args_list:
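            # each element of call_args_list is a mock.call: c[0] is the tuple of
            # positional arguments, whose first element is the dataframe passed to insertdf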
            c_args = c[0]
            df_ = c_args[0]
            _bufzise += len(df_)
            if _bufzise >= self.db_buf_size:
                mock_insertdf_call_count += 1
                _bufzise = 0

        assert mock_insertdf.call_count == mock_insertdf_call_count

        # assert data is consistent
        COL = Segment.data.key
        assert (db_segments_df.iloc[:3][COL] == b'data').all()
        assert (db_segments_df.iloc[3:4][COL] == b'').all()
        assert pd.isnull(db_segments_df.iloc[4:5][COL]).all()
        assert (db_segments_df.iloc[5:6][COL] == b'data').all()
        assert pd.isnull(db_segments_df.iloc[6:][COL]).all()

        # assert download status code is consistent
        URLERR_CODE, MSEEDERR_CODE = s2scodes.url_err, s2scodes.mseed_err

        # this also asserts that we grouped by datacenter, start time and end time
        COL = Segment.download_code.key
        assert (db_segments_df.iloc[:4][COL] == 200).all()
        assert pd.isnull(db_segments_df.iloc[4:5][COL]).all()
        assert (db_segments_df.iloc[5:6][COL] == 200).all()
        assert (db_segments_df.iloc[6:9][COL] == MSEEDERR_CODE).all()
        assert (db_segments_df.iloc[9][COL] == URLERR_CODE).all()
        assert (db_segments_df.iloc[10][COL] == 500).all()
        assert (db_segments_df.iloc[11][COL] == 413).all()

        # assert gaps are only in the given position
        COL = Segment.maxgap_numsamples.key
        assert (db_segments_df.iloc[:3][COL] < 0.01).all()
        assert pd.isnull(db_segments_df.iloc[3:5][COL]).all()
        assert (db_segments_df.iloc[5][COL] == 20).all()
        assert pd.isnull(db_segments_df.iloc[6:][COL]).all()

        # now mock retry:
        segments_df, request_timebounds_need_update = \
            prepare_for_download(db.session, orig_segments_df, dc_dataselect_manager, wtimespan,
                                 retry_seg_not_found=True,
                                 retry_url_err=True,
                                 retry_mseed_err=True,
                                 retry_client_err=True,
                                 retry_server_err=True,
                                 retry_timespan_err=True,
                                 retry_timespan_warn=True)

        assert request_timebounds_need_update is False

        COL = Segment.download_code.key
        mask = (db_segments_df[COL] >= 400) | pd.isnull(db_segments_df[COL]) \
            | (db_segments_df[COL].isin([URLERR_CODE, MSEEDERR_CODE]))
        assert len(segments_df) == len(db_segments_df[mask])
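        # (i.e. prepare_for_download returned exactly the segments whose previous download
        # failed: code >= 400, url/mseed error or missing code, which are the ones eligible
        # for retry given the retry_* flags above)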

        urlread_sideeffect = [413]
        mock_updatedf.reset_mock()
        mock_insertdf.reset_mock()
        # define a dc_dataselect_manager for open data only:
        dc_dataselect_manager = DcDataselectManager(datacenters_df, Authorizer(None), False)
        # Let's go:
        ztatz = self.download_save_segments(urlread_sideeffect, db.session, segments_df,
                                            dc_dataselect_manager,
                                            chaid2mseedid,
                                            self.run.id, False,
                                            request_timebounds_need_update,
                                            1, 2, 3, db_bufsize=self.db_buf_size)
        # get the columns from the db that we are interested in checking
        cols = [Segment.download_code, Segment.channel_id]
        db_segments_df = dbquery2df(db.session.query(*cols))

        # change data column otherwise we cannot display db_segments_df. When there is data
        # just print "data"
        # db_segments_df.loc[(~pd.isnull(db_segments_df[Segment.data.key])) &
        # (db_segments_df[Segment.data.key].str.len() > 0), Segment.data.key] = b'data'

        # re-sort db_segments_df to match the segments_df:
        ret = []
        for cha in segments_df[Segment.channel_id.key]:
            ret.append(db_segments_df[db_segments_df[Segment.channel_id.key] == cha])
        db_segments_df = pd.concat(ret, axis=0)

        assert (db_segments_df[COL] == 413).all()
        assert len(ztatz) == len(datacenters_df)
        assert len(db_segments_df) == len(segments_df)

        # same as above, but with updatedf: test that we correctly called mock_updatedf.
        # Note that we assume that the latter is called ONLY inside download.main.DbManager.
        # To test that, as the number of rows to be added (the length of the dataframes)
        # varies, we need to implement a counter here:
        mock_updatedf_call_count = 0
        _bufzise = 0
        for c in mock_updatedf.call_args_list:
            c_args = c[0]
            df_ = c_args[0]
            _bufzise += len(df_)
            if _bufzise >= self.db_buf_size:
                mock_updatedf_call_count += 1
                _bufzise = 0

        assert mock_updatedf.call_count == mock_updatedf_call_count

        assert mock_insertdf.call_count == 0
Example 4
def test_utils_url_read(mock_urlopen):

    def side_effect(argss):
        return StringIO(argss)

    mockread = mock.Mock()
    class mybytesio(object):

        def __init__(self, url, **kwargs):
            mockread.reset_mock()
            if isinstance(url, Exception):
                self.a = url
            else:
                self.code = 200
                self.msg = 'Ok'
                self.a = BytesIO(url)

        def read(self, *a, **kw):
            if isinstance(self.a, Exception):
                raise self.a  # pylint: disable=raising-non-exception
            mockread(*a, **kw)
            return self.a.read(*a, **kw)

        def close(self, *a, **kw):
            if not isinstance(self.a, Exception):
                self.a.close(*a, **kw)

    mock_urlopen.side_effect = lambda url, **kw: mybytesio(url, **kw)
    with pytest.raises(TypeError):
        urlread('', "name")

    val = b'url'
    blockSize = 1024 * 1024
    assert urlread(val, blockSize)[0] == val
    mock_urlopen.assert_called_with(val)  # , timeout=DEFAULT_TIMEOUT)
    assert mockread.call_count == 2
    mockread.assert_called_with(blockSize)
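    # (two read calls are expected: the first returns the 3 payload bytes, the second
    # returns b'' signalling EOF, assuming urlread keeps reading blocks until an empty chunk)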

    mock_urlopen.side_effect = lambda url, **kw: mybytesio(url, **kw)

    assert urlread(val, arg_to_read=56)[0] == val
    mock_urlopen.assert_called_with(val, arg_to_read=56)
    assert mockread.call_count == 1  # because blocksize is -1

    mock_urlopen.side_effect = lambda url, **kw: mybytesio(URLError('wat?'))
    with pytest.raises(URLError):
        urlread(val, wrap_exceptions=False)  # note urlexc
    with pytest.raises(URLException):
        urlread(val, wrap_exceptions=True)  # note urlexc

    mock_urlopen.side_effect = lambda url, **kw: mybytesio(URLError('wat?'))
    with pytest.raises(URLException):
        urlread(val)  # note urlexc

    mock_urlopen.side_effect = lambda url, **kw: mybytesio(socket.timeout())
    with pytest.raises(URLException):
        urlread(val)  # note urlexc

    mock_urlopen.side_effect = lambda url, **kw: mybytesio(HTTPError('url', 500, '?', None, None))
    with pytest.raises(URLException):
        urlread(val)  # note urlexc

    mock_urlopen.side_effect = lambda url, **kw: mybytesio(HTTPError('url', 500, '?', None, None))
    assert urlread(val, raise_http_err=False) == (None, 500, '?')  # note urlexc
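

# ---------------------------------------------------------------------------
# Illustrative sketch, not the library implementation: an `urlread` consistent
# with the assertions above would read the response in blocks (or all at once
# for the default blocksize=-1), optionally wrap network errors into the
# library's URLException, and return a (data, code, msg) tuple.
# `URLExceptionSketch` below is a stand-in for that exception class.
import socket
from urllib.request import urlopen
from urllib.error import URLError, HTTPError


class URLExceptionSketch(Exception):
    """Stand-in for the library's URLException."""


def urlread_sketch(url, blocksize=-1, wrap_exceptions=True,
                   raise_http_err=True, **kwargs):
    try:
        conn = urlopen(url, **kwargs)
        try:
            if blocksize < 0:
                data = conn.read()  # read everything in one shot
            else:
                chunks = []
                while True:  # block-wise read until an empty chunk (EOF)
                    chunk = conn.read(blocksize)
                    if not chunk:
                        break
                    chunks.append(chunk)
                data = b''.join(chunks)
        finally:
            conn.close()
        return data, conn.code, conn.msg
    except HTTPError as exc:
        if not raise_http_err:
            return None, exc.code, exc.msg
        if wrap_exceptions:
            raise URLExceptionSketch(exc)
        raise
    except (URLError, socket.timeout) as exc:
        if wrap_exceptions:
            raise URLExceptionSketch(exc)
        raise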
    def test_get_channels_df(self, db):
        urlread_sideeffect = """1|2|3|4|5|6|7|8|9|10|11|12|13
20160508_0000129|2016-05-08 05:17:11.500000|40.57|52.23|60.0|AZER|EMSC-RTS|AZER|505483|ml|3.1|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN
20160508_0000004|2016-05-08 01:45:30.300000|44.96|15.35|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|3.6|EMSC|CROATIA
20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
--- ERRROR --- THIS IS MALFORMED 20160508_abc0113|2016-05-08 22:37:20.100000| --- ERROR --- |26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
"""
        events_df = self.get_events_df(urlread_sideeffect, db.session)

        urlread_sideeffect = """http://geofon.gfz-potsdam.de/fdsnws/dataselect/1/query
ZZ * * * 2002-09-01T00:00:00 2005-10-20T00:00:00
UP ARJ * * 2013-08-01T00:00:00 2017-04-25

http://ws.resif.fr/fdsnws/dataselect/1/query
ZU * * HHZ 2015-01-01T00:00:00 2016-12-31T23:59:59.999999
"""
        # we tried to add two events with the same id, check we printed out the msg:
        assert "Duplicated instances violate db constraint" in self.log_msg()

        net, sta, loc, cha = [], [], [], []
        datacenters_df, eidavalidator = \
            self.get_datacenters_df(urlread_sideeffect, db.session, None, self.routing_service,
                                    net, sta, loc, cha, db_bufsize=self.db_buf_size)
        # first we mock url errors in all queries. We still did not write anything in the db
        # so we should quit:
        with pytest.raises(FailedDownload) as qd:
            _ = self.get_channels_df(URLError('urlerror_wat'), db.session,
                                     datacenters_df, eidavalidator, net, sta,
                                     loc, cha, None, None, 100, False, None,
                                     None, -1, self.db_buf_size)
        assert 'urlerror_wat' in self.log_msg()
        assert "Unable to fetch stations" in self.log_msg()
        assert "Fetching stations from database for 2 (of 2) data-center(s)" in self.log_msg()
        # Test that the exception message is correct
        # note that this message is in the log if we run the method from the main
        # function (which is not the case here):
        assert ("Unable to fetch stations from all data-centers, "
                "no data to fetch from the database. "
                "Check config and log for details") in str(qd.value)

        # now get channels with a mocked custom urlread_sideeffect below:
        # IMPORTANT: url read for channels: Note: the first response is malformed and is
        # discarded; the second has an error row which is skipped (the other channels are added)
        urlread_sideeffect = [
            """#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime
--- ERROR --- MALFORMED|12T00:00:00|
HT|AGG||HHZ|39.0211|22.336|622.0|0.0|0.0|-90.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|50.0|2008-02-12T00:00:00|
HT|LKD2||HHE|38.7889|20.6578|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00|
""",
            # NOTE THAT THE CHANNELS ABOVE WILL BE OVERRIDDEN BY THE ONES BELOW (MULTIPLE NAMES; WE
            # SHOULD NOT HAVE THIS CASE WITH THE EIDAWS ROUTING SERVICE BUT WE TEST THE CASE HERE)
            # NOTE THE USE OF HTß AS SensorDescription (to check non-ascii characters do not raise)
            """#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime
HT|AGG||HHE|--- ERROR --- NONNUMERIC |22.336|622.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|70.0|2008-02-12T00:00:00|
HT|AGG||HLE|95.6|22.336|622.0|0.0|90.0|0.0|GFZ:HTß1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2008-02-12T00:00:00|
HT|AGG||HLZ|39.0211|22.336|622.0|0.0|0.0|-90.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2008-02-12T00:00:00|
HT|LKD2||HHE|38.7889|20.6578|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|90.0|2009-01-01T00:00:00|
HT|LKD2||HHZ|38.7889|20.6578|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|90.0|2009-01-01T00:00:00|
BLA|BLA||HHZ|38.7889|20.6578|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00|2019-01-01T00:00:00
BLA|BLA||HHZ|38.7889|20.6578|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2018-01-01T00:00:00|
"""
        ]

        cha_df = self.get_channels_df(urlread_sideeffect, db.session,
                                      datacenters_df, eidavalidator, net, sta,
                                      loc, cha, None, None, 90, False, None,
                                      None, -1, self.db_buf_size)
        # assert we have a message for discarding the response data
        # (first arg of urlread):
        assert "Discarding response data" in self.log_msg()
        # we should have called mock_urlopen_in_async times the datacenters
        assert self.mock_urlopen.call_count == len(datacenters_df)
        assert len(db.session.query(Station.id).all()) == 4
        # the last two channels of the second item of `urlread_sideeffect` are from two
        # stations (BLA|BLA|...) with only different start time. Thus they should both be added:
        assert len(db.session.query(Channel.id).all()) == 6
        # as net, sta, loc, cha are all empty lists and start = end = None (all default=>no filter),
        # this is the post data passed to urlread for the 1st datacenter:
        assert self.mock_urlopen.call_args_list[0][0][
            0].data == b"""format=text
level=channel
* * * * * *"""
        # as net, sta, loc, cha are all empty lists and start = end = None (all default=>no filter),
        # this is the post data passed to urlread for the 2nd datacenter:
        assert self.mock_urlopen.call_args_list[1][0][
            0].data == b"""format=text
level=channel
* * * * * *"""
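        # (the POST body follows the FDSN station web service bulk format: "key=value"
        # option lines followed by one "net sta loc cha starttime endtime" row per
        # selection; a single all-wildcard row here because no filters were given)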
        assert self.mock_urlopen.call_args_list[0][0][0].get_full_url() == \
            "http://geofon.gfz-potsdam.de/fdsnws/station/1/query"
        assert self.mock_urlopen.call_args_list[1][0][0].get_full_url() == \
            "http://ws.resif.fr/fdsnws/station/1/query"
        # assert all downloaded stations have datacenter_id of the second datacenter:
        dcid = datacenters_df.iloc[1].id
        assert all(sid[0] == dcid
                   for sid in db.session.query(Station.datacenter_id).all())
        # assert all downloaded channels have station_id in the set of downloaded stations only:
        sta_ids = [x[0] for x in db.session.query(Station.id).all()]
        assert all(c_staid[0] in sta_ids
                   for c_staid in db.session.query(Channel.station_id).all())

        # now mock again url errors in all queries. As we wrote something in the db,
        # we should NOT quit
        cha_df2 = self.get_channels_df(URLError('urlerror_wat'), db.session,
                                       datacenters_df,
                                       eidavalidator, net, sta, loc, cha,
                                       datetime(2020, 1, 1), None, 100, False,
                                       None, None, -1, self.db_buf_size)

        # Note above that min sample rate = 100 and a start time provided should return 3 channels:
        assert len(cha_df2) == 3
        assert "Fetching stations from database for 2 (of 2) data-center(s)" in self.log_msg()

        # now test again with a socket timeout
        cha_df2 = self.get_channels_df(socket.timeout(), db.session,
                                       datacenters_df, eidavalidator, net, sta,
                                       loc, cha, None, None, 100, False, None,
                                       None, -1, self.db_buf_size)
        assert 'timeout' in self.log_msg()
        assert "Fetching stations from database for 2 (of 2) data-center(s)" in self.log_msg()

        # now mixed case:

        # now change the min sampling rate and see that we should get two channels fewer
        cha_df3 = self.get_channels_df(urlread_sideeffect, db.session,
                                       datacenters_df, eidavalidator, net, sta,
                                       loc, cha, None, None, 100, False, None,
                                       None, -1, self.db_buf_size)
        assert len(cha_df3) == len(cha_df) - 2
        assert "2 channel(s) discarded according to current config. filters" in self.log_msg()

        # now change this:

        urlread_sideeffect = [
            URLError('wat'),
            """#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime
A|B|10|HBE|39.0211|22.336|622.0|0.0|0.0|-90.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2003-02-12T00:00:00|2010-02-12T00:00:00
E|F|11|HHZ|38.7889|20.6578|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2019-01-01T00:00:00|
""",
            URLError('wat'),
            socket.timeout()
        ]

        # now change channels=['B??']. In the urlread_sideeffect above, for the 1st, 3rd and 4th
        # case we fall back to a db query, but we do not have such a channel, so nothing is returned
        # The dataframe currently saved on db is:
        #    id channel start_time   end_time  sample_rate  datacenter_id
        # 0  1   HLE    2008-02-12 NaT         100.0        2
        # 1  2   HLZ    2008-02-12 NaT         100.0        2
        # 2  3   HHE    2009-01-01 NaT         90.0         2
        # 3  4   HHZ    2009-01-01 NaT         90.0         2
        # 4  5   HHZ    2009-01-01 2019-01-01  100.0        2
        # 5  6   HHZ    2018-01-01 NaT         100.0        2
        # for the second case, the mocked response returns two channels and in this case
        # we might put whatever filter here below. Assert that the number of channels returned is 2
        cha_df = self.get_channels_df(urlread_sideeffect, db.session,
                                      datacenters_df, eidavalidator, net, sta,
                                      loc, ['B??'], None, None, 10, False,
                                      None, None, -1, self.db_buf_size)
        assert len(cha_df) == 2

        # test channels and start time + end times provided when querying the db (postdata None)
        # by issuing the command:
        # dbquery2df(db.session.query(Channel.id, Station.network, Station.station,
        #  Channel.location, Channel.channel, Station.start_time,Station.end_time,
        #  Channel.sample_rate, Station.datacenter_id).join(Channel.station))
        # This is the actual state of the db:
        # ----------------------------------------------
        # channel_id network station location channel start_time    end_time   sample_rate  datacenter_id
        #          1      HT     AGG              HLE    2008-02-12 NaT              100.0              2
        #          2      HT     AGG              HLZ    2008-02-12 NaT              100.0              2
        #          3      HT     LKD2             HHE    2009-01-01 NaT              90.0               2
        #          4      HT     LKD2             HHZ    2009-01-01 NaT              90.0               2
        #          5      BLA    BLA              HHZ    2009-01-01 2019-01-01       100.0              2
        #          6      BLA    BLA              HHZ    2018-01-01 NaT              100.0              2
        #          7      A      B         10     HBE    2003-02-12 2010-02-12       100.0              2
        #          8      E      F         11     HHZ    2019-01-01 NaT              100.0              2
        # ----------------------------------------------
        # Now according to the table above set a list of arguments:
        # Each key is the argument; each value IS A LIST OF BOOLEANS MAPPED TO EACH ROW OF THE
        # DATAFRAME ABOVE, telling whether the row matches according to the argument:
        nets = {
            ('*', ): [1, 1, 1, 1, 1, 1, 1, 1],
            # ('HT', 'BLA'): [1, 1, 1, 1, 1, 1, 0, 0],
            (
                '*A*', ): [0, 0, 0, 0, 1, 1, 1, 0]
        }
        stas = {
            ('B*', ): [0, 0, 0, 0, 1, 1, 1, 0],
            ('B??', ): [0, 0, 0, 0, 1, 1, 0, 0]
        }
        # note that we do NOT assume '--' can be given, as this should be the parsed
        # output of `nslc_lists`:
        locs = {
            ('', ): [1, 1, 1, 1, 1, 1, 0, 0],
            ('1?', ): [0, 0, 0, 0, 0, 0, 1, 1]
        }
        chans = {
            ('?B?', ): [0, 0, 0, 0, 0, 0, 1, 0],
            ('HL?', '?B?'): [1, 1, 0, 0, 0, 0, 1, 0],
            ('HHZ', ): [0, 0, 0, 1, 1, 1, 0, 1]
        }
        stimes = {
            None: [1, 1, 1, 1, 1, 1, 1, 1],
            datetime(2002, 1, 1): [1, 1, 1, 1, 1, 1, 1, 1],
            datetime(2099, 1, 1): [1, 1, 1, 1, 0, 1, 0, 1]
        }
        etimes = {
            None: [1, 1, 1, 1, 1, 1, 1, 1],
            datetime(2002, 1, 1): [0, 0, 0, 0, 0, 0, 0, 0],
            datetime(2011, 1, 1): [1, 1, 1, 1, 1, 0, 1, 0],
            datetime(2099, 1, 1): [1, 1, 1, 1, 1, 1, 1, 1]
        }
        minsr = {
            90: [1, 1, 1, 1, 1, 1, 1, 1],
            # 95: [1, 1, 0, 0, 1, 1, 1, 1],
            100: [1, 1, 0, 0, 1, 1, 1, 1],
            105: [0, 0, 0, 0, 0, 0, 0, 0]
        }
        # no url read: set socket.timeout as urlread side effect. This will force
        # querying the database to test that the filtering works as expected:
        for n, s, l, c, st, e, m in product(nets, stas, locs, chans, stimes,
                                            etimes, minsr):
            matches = np.array(nets[n]) * np.array(stas[s]) * np.array(locs[l]) * \
                np.array(chans[c]) * np.array(stimes[st]) * np.array(etimes[e]) * \
                np.array(minsr[m])
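            # (the element-wise product of the 0/1 arrays above is a logical AND across all
            # criteria; e.g. for n=('*',), s=('B*',), l=('',), c=('HHZ',), st=None, e=None,
            # m=90 the row-wise AND is [0, 0, 0, 0, 1, 1, 0, 0], i.e. expected_length == 2)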
            expected_length = matches.sum()
            # Now: if expected length is zero, it means we do not have data matches on the db.
            # This raises a FailedDownload (we avoid pytest.raises because in this
            # case it's easier as done below):
            try:
                __dc_df = datacenters_df.loc[datacenters_df[DataCenter.id.key]
                                             == 2]
                cha_df = self.get_channels_df(socket.timeout(), db.session,
                                              __dc_df, eidavalidator, n, s, l,
                                              c, st, e, m, False, None, None,
                                              -1, self.db_buf_size)
                assert len(cha_df) == expected_length
            except FailedDownload as qd:
                assert expected_length == 0
                assert "Unable to fetch stations from all data-centers" in str(qd)

        # Same test as above, but test negative assertions with "!". Reminder: data on db is:
        # ----------------------------------------------
        # channel_id network station location channel start_time    end_time   sample_rate  datacenter_id
        #          1      HT     AGG              HLE    2008-02-12 NaT              100.0              2
        #          2      HT     AGG              HLZ    2008-02-12 NaT              100.0              2
        #          3      HT     LKD2             HHE    2009-01-01 NaT              90.0               2
        #          4      HT     LKD2             HHZ    2009-01-01 NaT              90.0               2
        #          5      BLA    BLA              HHZ    2009-01-01 2019-01-01       100.0              2
        #          6      BLA    BLA              HHZ    2018-01-01 NaT              100.0              2
        #          7      A      B         10     HBE    2003-02-12 2010-02-12       100.0              2
        #          8      E      F         11     HHZ    2019-01-01 NaT              100.0              2
        # ----------------------------------------------
        # Now according to the table above set a list of arguments:
        # Each key is the argument; each value IS A LIST OF BOOLEANS MAPPED TO EACH ROW OF THE
        # DATAFRAME ABOVE, telling whether the row matches according to the argument:
        nets = {
            ('!*A*', 'A'): [1, 1, 1, 1, 0, 0, 1, 1],
            ('E', 'A'): [0, 0, 0, 0, 0, 0, 1, 1]
        }
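        # (within each tuple the expressions are OR-ed and a leading '!' negates the match:
        # e.g. ('!*A*', 'A') keeps networks not containing 'A', plus the literal network 'A')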
        stas = {
            ('!*B*', 'B'): [1, 1, 1, 1, 0, 0, 1, 1],
            ('!???2', ): [1, 1, 0, 0, 1, 1, 1, 1]
        }
        # note that we do NOT assume '--' can be given, as this should be the parsed
        # output of `nslc_lists`:
        locs = {
            ('', ): [1, 1, 1, 1, 1, 1, 0, 0],
            ('!', ): [0, 0, 0, 0, 0, 0, 1, 1]
        }
        chans = {
            ('HHZ', '!*E'): [0, 1, 0, 1, 1, 1, 0, 1],
            ('!?H?', ): [1, 1, 0, 0, 0, 0, 1, 0]
        }
        stimes = {None: [1, 1, 1, 1, 1, 1, 1, 1]}
        etimes = {None: [1, 1, 1, 1, 1, 1, 1, 1]}
        minsr = {-1: [1, 1, 1, 1, 1, 1, 1, 1]}
        # no url read: set socket.timeout as urlread side effect. This will force
        # querying the database to test that the filtering works as expected:
        for n, s, l, c, st, e, m in product(nets, stas, locs, chans, stimes,
                                            etimes, minsr):
            matches = np.array(nets[n]) * np.array(stas[s]) * np.array(locs[l]) * \
                np.array(chans[c]) * np.array(stimes[st]) * np.array(etimes[e]) * np.array(minsr[m])
            expected_length = matches.sum()
            # Now: if expected length is zero, it means we do not have data matches on the db.
            # This raises a FailedDownload (we avoid pytest.raises because in this
            # case it's easier as done below):
            try:
                __dc_df = datacenters_df.loc[datacenters_df[DataCenter.id.key]
                                             == 2]
                cha_df = self.get_channels_df(socket.timeout(), db.session,
                                              __dc_df, eidavalidator, n, s, l,
                                              c, st, e, m, False, None, None,
                                              -1, self.db_buf_size)
                assert len(cha_df) == expected_length
            except FailedDownload as qd:
                assert expected_length == 0
                assert "Unable to fetch stations from all data-centers" in str(qd)

        # now make the second url_side_effect raise => force query from db, and the first good
        # => fetch from the web
        # We want to test the mixed case: some fetched from db, some from the web
        # ---------------------------------------------------
        # first we query the db to check what we have:
        cha_df = dbquery2df(
            db.session.query(Channel.id, Station.datacenter_id,
                             Station.network).join(Station))
        # build a new network:
        newnetwork = 'U'
        while newnetwork in cha_df[Station.network.key]:
            newnetwork += 'U'
        urlread_sideeffect2 = [
            """#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime
%s|W||HBE|39.0211|22.336|622.0|0.0|0.0|-90.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|50.0|2008-02-12T00:00:00|2010-02-12T00:00:00
""" % newnetwork,
            socket.timeout()
        ]
        # now note: the first url read raised before, now it does not: write the channel above
        # with network = newnetwork (surely not existing in the db)
        #  The second url read did not raise, now it does (socket.timeout): fetch from the db
        # we issue a ['???'] as 'channel' argument in order to fetch everything from the db
        # (we would have got the same by passing None as 'channel' argument)
        # The three [] before ['???'] are net, sta, loc and mean: no filter on those params
        cha_df_ = self.get_channels_df(urlread_sideeffect2, db.session,
                                       datacenters_df, eidavalidator, [], [],
                                       [], ['???'], None, None, 10, False,
                                       None, None, -1, self.db_buf_size)

        # we should have the channel with network 'U' associated with the first datacenter
        dcid = datacenters_df.iloc[0][DataCenter.id.key]
        assert len(cha_df_[cha_df_[Station.datacenter_id.key] == dcid]) == 1
        assert cha_df_[cha_df_[Station.datacenter_id.key] == dcid][Station.network.key][0] == \
            newnetwork
        # but we did not query other channels for datacenter id = dcid: as the web response
        # was successful, we rely on it. Conversely, for the other datacenter we should have
        # all channels fetched from the db
        dcid = datacenters_df.iloc[1][DataCenter.id.key]
        chaids_of_dcid = \
            cha_df_[cha_df_[Station.datacenter_id.key] == dcid][Channel.id.key].tolist()
        db_chaids_of_dcid = \
            cha_df[cha_df[Station.datacenter_id.key] == dcid][Channel.id.key].tolist()
        assert chaids_of_dcid == db_chaids_of_dcid
Example 6
    def test_merge_event_stations_mag_independent_circle(self, db, tt_ak135_tts):
        # get events with lat lon (1, 1) and (90, 90)
        urlread_sideeffect = """#EventID | Time | Latitude | Longitude | Depth/km | Author | Catalog | Contributor | ContributorID | MagType | Magnitude | MagAuthor | EventLocationName
20160508_0000129|2016-05-08 05:17:11.500000|1|1|60.0|AZER|EMSC-RTS|AZER|505483|ml|3|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN
20160508_0000004|2016-05-08 01:45:30.300000|90|90|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|4|EMSC|CROATIA
"""
        events_df = self.get_events_df(urlread_sideeffect, db.session)

        net, sta, loc, cha = [], [], [], []
        datacenters_df, eidavalidator = \
            self.get_datacenters_df(None, db.session, None, self.routing_service,
                                    net, sta, loc, cha, db_bufsize=self.db_buf_size)

        # url read for channels: Note: first response data raises, second has an error and
        # that error is skipped (other channels are added), and last two channels are from two
        # stations (BLA|BLA|...) with only different start time (thus stations should both be
        # added)
        urlread_sideeffect  = ["""#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime
A|a||HHZ|1|1|622.0|0.0|0.0|-90.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|50.0|2008-02-12T00:00:00|
A|b||HHE|2|2|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00|
""",
"""#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime
A|c||HHZ|3|3|622.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2008-02-12T00:00:00|
BLA|e||HHZ|7|7|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00|2019-01-01T00:00:00
BLA|e||HHZ|8|8|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2019-01-01T00:00:00|
""",  URLError('wat'), socket.timeout()]

        channels_df = self.get_channels_df(urlread_sideeffect, db.session,
                                           datacenters_df,
                                           eidavalidator,
                                           net, sta, loc, cha, None, None, 10,
                                           False, None, None, -1, self.db_buf_size)
        assert len(channels_df) == 5
        # events_df
#    id  magnitude  latitude  longitude  depth_km                    time
# 0  1   3.0        1.0       1.0        60.0     2016-05-08 05:17:11.500
# 1  2   4.0        90.0      90.0       2.0      2016-05-08 01:45:30.300

    # channels_df:
#     id station_id  latitude  longitude  datacenter_id start_time   end_time
# 0   1           1       1.0        1.0              1 2008-02-12        NaT
# 1   2           2       2.0        2.0              1 2009-01-01        NaT
# 2   3           3       3.0        3.0              2 2008-02-12        NaT
# 3   4           4       7.0        7.0              2 2009-01-01 2019-01-01
# 4   5           5       8.0        8.0              2 2019-01-01        NaT


        tt_table = tt_ak135_tts
        # magnitude-independent circle: every channel within [min, max] degrees of an
        # event is kept, regardless of the event magnitude. The first event lies exactly
        # on a station (=> dist 0), the second event is far away from all stations
        df = merge_events_stations(events_df, channels_df, dict(min=0, max=10),
                                   tttable=tt_table)
        # the first event results in 4 potential segments
        # (the last channel has been opened too late),
        # the second event results in 0 potential segments
        # (too far away):
        assert len(df) == 4

        # now let's see: the channel with id = 4 is 8.48 degrees away
        # from the first event. By setting max=8:
        df = merge_events_stations(events_df, channels_df, dict(min=0, max=8),
                                   tttable=tt_table)
        # we should get:
        assert len(df) == 3
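        # (channel id 4 sits at lat/lon (7, 7) and the first event at (1, 1): their
        # great-circle separation is ~8.48 degrees, e.g. with obspy
        # locations2degrees(1, 1, 7, 7), hence max=8 excludes it)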

        # now let's restrict again: search_radius min is increased to 1.414, meaning that
        # we skip the first two channels (distances = 0 and 1.413, respectively),
        # leaving us with 3 - 2 = 1 potential segment only:
        df = merge_events_stations(events_df, channels_df, dict(min=1.414, max=8),
                                   tttable=tt_table)
        # we should get:
        assert len(df) == 1

        # now let's take all combinations (2 events x 4 channels = 8 potential segments).
        df = merge_events_stations(events_df, channels_df, dict(min=0, max=90),
                                   tttable=tt_table)
        # we should get:
        assert len(df) == 8
Example 7
    def test_merge_event_stations(self, db, tt_ak135_tts):
        # get events with lat lon (1, 1), (2, 2), ... (n, n)
        urlread_sideeffect = """#EventID | Time | Latitude | Longitude | Depth/km | Author | Catalog | Contributor | ContributorID | MagType | Magnitude | MagAuthor | EventLocationName
20160508_0000129|2016-05-08 05:17:11.500000|1|1|60.0|AZER|EMSC-RTS|AZER|505483|ml|3|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN
20160508_0000004|2016-05-08 01:45:30.300000|2|2|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|4|EMSC|CROATIA
"""
        events_df = self.get_events_df(urlread_sideeffect, db.session)

        net, sta, loc, cha = [], [], [], []
        datacenters_df, eidavalidator = \
            self.get_datacenters_df(None, db.session, None, self.routing_service,
                                    net, sta, loc, cha, db_bufsize=self.db_buf_size)

        # url read for channels: Note: first response data raises, second has an error and
        # that error is skipped (other channels are added), and last two channels are from two
        # stations (BLA|BLA|...) with only different start time (thus stations should both be
        # added)
        urlread_sideeffect  = ["""#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime
A|a||HHZ|1|1|622.0|0.0|0.0|-90.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|50.0|2008-02-12T00:00:00|
A|b||HHE|2|2|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00|
""",
"""#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime
A|c||HHZ|3|3|622.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2008-02-12T00:00:00|
BLA|e||HHZ|7|7|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00|2019-01-01T00:00:00
BLA|e||HHZ|8|8|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2019-01-01T00:00:00|
""",  URLError('wat'), socket.timeout()]

        channels_df = self.get_channels_df(urlread_sideeffect, db.session,
                                           datacenters_df,
                                           eidavalidator,
                                           net, sta, loc, cha, None, None, 10,
                                           False, None, None, -1, self.db_buf_size)
        assert len(channels_df) == 5

    # events_df
#    id  magnitude  latitude  longitude  depth_km                    time
# 0  1   3.0        1.0       1.0        60.0     2016-05-08 05:17:11.500
# 1  2   4.0        2.0       2.0       2.0      2016-05-08 01:45:30.300

    # channels_df:
#     id station_id  latitude  longitude  datacenter_id start_time   end_time
# 0   1           1       1.0        1.0              1 2008-02-12        NaT
# 1   2           2       2.0        2.0              1 2009-01-01        NaT
# 2   3           3       3.0        3.0              2 2008-02-12        NaT
# 3   4           4       7.0        7.0              2 2009-01-01 2019-01-01
# 4   5           5       8.0        8.0              2 2019-01-01        NaT

        tt_table = tt_ak135_tts
        # for magnitude <10, max_radius is 0. For magnitude >10, max_radius is 200
        # we have only magnitudes <10 and two events lying exactly on a station (=> dist=0),
        # which will be taken (the others dropped out)
        df = merge_events_stations(events_df, channels_df, dict(minmag=10, maxmag=10,
                                   minmag_radius=0, maxmag_radius=200), tttable=tt_table)

        assert len(df) == 2
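
        # Illustrative sketch (an assumption, not the library code): the magnitude-
        # dependent search radius suggested by the comments above behaves like a
        # clamped linear interpolation between minmag_radius and maxmag_radius:
        def _search_radius_sketch(mag, minmag, maxmag, minmag_radius, maxmag_radius):
            if minmag == maxmag:
                return minmag_radius if mag < minmag else maxmag_radius
            if mag <= minmag:
                return minmag_radius
            if mag >= maxmag:
                return maxmag_radius
            return minmag_radius + (mag - minmag) / (maxmag - minmag) * \
                (maxmag_radius - minmag_radius)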

        # for magnitude <1, max_radius is 100. For magnitude >1, max_radius is 2000
        # we have only magnitudes <10, we have all event-stations closer than 100 deg
        # So we might have ALL channels taken BUT: one station start time is in 2019, thus
        # it will not fall into the case above!
        df = merge_events_stations(events_df, channels_df, dict(minmag=1, maxmag=1,
                                   minmag_radius=100, maxmag_radius=2000), tttable=tt_table)

        assert len(df) == (len(channels_df)-1) * len(events_df)
        # assert the channel outside the time bounds is in channels_df (but, as checked
        # below, not in df):
        assert not channels_df[channels_df[Station.start_time.key] ==
                               datetime(2019, 1, 1)].empty
        # we need to get the channel id from channels_df because in df we removed
        # unnecessary columns (including start and end time)
        ch_id = channels_df[channels_df[Station.start_time.key] ==
                            datetime(2019, 1, 1)][Channel.id.key].iloc[0]
        # old Channel.id.key is Segment.channel_id.key in df:
        assert df[df[Segment.channel_id.key] == ch_id].empty

        # this is a more complex case: we want to drop the first event by setting a very low
        # radius (minmag_radius=1) for magnitudes <= 3 (the first event magnitude)
        # and maxmag_radius very high for the other event (magnitude=4)
        df = merge_events_stations(events_df, channels_df, dict(minmag=3, maxmag=4,
                                   minmag_radius=1, maxmag_radius=40), tttable=tt_table)

        # assert we have only the second event, except the first channel which is from
        # the 1st event. The second event is retrievable by its latitude (2)
        # FIXME: more fine grained tests based on distance?
        evid = events_df[events_df[Event.latitude.key] == 2][Event.id.key].iloc[0]
        assert np.array_equal((df[Segment.event_id.key] == evid),
                              [False, True, True, True, True])

        # test arrival times are properly set: set all event locations to [0, 0] as well
        # as station locations. This should result in arrival times equal to the event time
        # for zero-depth events (see below)
        #
        _events_df = events_df
        _channels_df = channels_df
        events_df = events_df.copy()
        events_df.loc[:, Event.latitude.key] = 0
        events_df.loc[:, Event.longitude.key] = 0
        event_ids = pd.unique(events_df[Event.id.key])
        # We have two events; set the depth of the first one to zero and the other to 60
        evtid1, evtid2 = event_ids[0], event_ids[1]
        evttime1 = events_df[events_df[Event.id.key] == evtid1][Event.time.key].iloc[0]
        evttime2 = events_df[events_df[Event.id.key] == evtid2][Event.time.key].iloc[0]
        events_df.loc[events_df[Event.id.key] == evtid1, Event.depth_km.key] = 0
        events_df.loc[events_df[Event.id.key] == evtid2, Event.depth_km.key] = 60

        channels_df = channels_df.copy()
        channels_df.loc[:, Station.latitude.key] = 0
        channels_df.loc[:, Station.longitude.key] = 0
        df = merge_events_stations(events_df, channels_df, dict(minmag=3, maxmag=4,
                                   minmag_radius=1, maxmag_radius=40), tttable=tt_table)
        # assert for events of depth 0 arrival times are equal to event times
        assert (df[df[Segment.event_id.key] == evtid1][Segment.arrival_time.key]
                == evttime1).all()
        # assert for events of depth > 0 arrival times are GREATER than event times
        assert (df[df[Segment.event_id.key] == evtid2][Segment.arrival_time.key]
                > evttime2).all()
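        # Hedged sketch of the expectation just asserted (hypothetical helper; the real
        # computation lives in merge_events_stations / the travel-time table): arrival time
        # is presumably event time plus the (non-negative) travel time, which is ~0 for
        # distance 0 and depth 0, and strictly positive for deeper events:
        from datetime import timedelta as _td
        def _expected_arrival(event_time, travel_time_sec):
            return event_time + _td(seconds=travel_time_sec)
        assert _expected_arrival(evttime1, 0) == evttime1   # zero depth and distance
        assert _expected_arrival(evttime2, 8.5) > evttime2  # 8.5 s is just an example value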

        # now set the first event depth out of the travel-time table bounds:
        events_df.loc[events_df[Event.id.key] == evtid1, Event.depth_km.key] = 600000
        df = merge_events_stations(events_df, channels_df, dict(minmag=3, maxmag=4,
                                   minmag_radius=1, maxmag_radius=40), tttable=tt_table)
        # the first event's arrival times are now NaN (depth out of bounds); as NaNs are
        # dropped from the returned dataframe, assert we do not have segments with
        # event_id == evtid1:
        assert df[df[Segment.event_id.key] == evtid1][Segment.arrival_time.key].empty
        # still assert for events of depth > 0 arrival times are GREATER than event times
        assert (df[df[Segment.event_id.key] == evtid2][Segment.arrival_time.key] > evttime2).all()
    def init(self, request, db, data, pytestdir):
        # re-init a sqlite database (no-op if the db is not sqlite):
        db.create(to_file=False)

        self.logout = StringIO()
        self.handler = StreamHandler(stream=self.logout)
        # THIS IS A HACK:
        # s2s_download_logger.setLevel(logging.INFO)  # necessary to forward to handlers
        # if we call the closing function (we are testing the whole chain) the level will be
        # reset (to logging.INFO), otherwise it stays at what we set two lines above. Problems
        # might arise if closing sets a different level, but for the moment who cares
        # s2s_download_logger.addHandler(self.handler)

        # setup a run_id:
        r = Download()
        db.session.add(r)
        db.session.commit()
        self.run = r

        # side effects:

        self._evt_urlread_sideeffect =  """#EventID | Time | Latitude | Longitude | Depth/km | Author | Catalog | Contributor | ContributorID | MagType | Magnitude | MagAuthor | EventLocationName
20160508_0000129|2016-05-08 05:17:11.500000|1|1|60.0|AZER|EMSC-RTS|AZER|505483|ml|3|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN
20160508_0000004|2016-05-08 01:45:30.300000|90|90|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|4|EMSC|CROATIA
"""
        self._dc_urlread_sideeffect = """http://geofon.gfz-potsdam.de/fdsnws/dataselect/1/query
ZZ * * * 2002-09-01T00:00:00 2005-10-20T00:00:00
UP ARJ * * 2013-08-01T00:00:00 2017-04-25

http://ws.resif.fr/fdsnws/dataselect/1/query
ZU * * HHZ 2015-01-01T00:00:00 2016-12-31T23:59:59.999999

"""

        # Note: by default we set _sta_urlread_sideeffect to return channels which result in
        # 12 segments (see lat and lon of channels vs lat and lon of events above)
        self._sta_urlread_sideeffect = ["""#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime
GE|FLT1||HHE|1|1|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2003-01-01T00:00:00|
GE|FLT1||HHN|1|1|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2003-01-01T00:00:00|
GE|FLT1||HHZ|1|1|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2003-01-01T00:00:00|
n1|s||c1|90|90|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00|
n1|s||c2|90|90|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00|
n1|s||c3|90|90|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00|
""", 
"""#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime
IA|BAKI||BHE|1|1|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2003-01-01T00:00:00|
IA|BAKI||BHN|1|1|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2003-01-01T00:00:00|
IA|BAKI||BHZ|1|1|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2003-01-01T00:00:00|
n2|s||c1|90|90|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00|
n2|s||c2|90|90|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00|
n2|s||c3|90|90|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00|
"""]
        # self._sta_urlread_sideeffect = cycle([partial_valid, '', invalid, '', '', URLError('wat'), socket.timeout()])

        self._mintraveltime_sideeffect = cycle([1])

        self._seg_data = data.read("GE.FLT1..HH?.mseed")
        self._seg_data_gaps = data.read("IA.BAKI..BHZ.D.2016.004.head")
        self._seg_data_empty = b''

        self._seg_urlread_sideeffect = [self._seg_data, self._seg_data_gaps, 413, 500,
                                        self._seg_data[:2],
                                        self._seg_data_empty,  413, URLError("++urlerror++"),
                                        socket.timeout()]
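        # Hedged sketch of how such a mixed side-effect list is presumably consumed by the
        # url-mocking helpers of this test suite (the helper name below is hypothetical,
        # for illustration only): bytes -> successful response body, int -> HTTP error
        # status, Exception instance -> raised by the mocked urlread.
        def _classify_side_effect(item):
            if isinstance(item, Exception):
                return 'raise'
            if isinstance(item, int):
                return 'http_error'
            return 'body'
        assert _classify_side_effect(413) == 'http_error'
        assert _classify_side_effect(b'') == 'body'
        assert _classify_side_effect(URLError('x')) == 'raise'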

        self._inv_data = data.read("inventory_GE.APE.xml")

        self.service = ''  # so get_datacenters_df accepts any row by default

        # store DcDataselectManager method here:
        self.dc_get_data_open = DcDataselectManager._get_data_open
        self.dc_get_data_from_userpass = DcDataselectManager._get_data_from_userpass
        # get data from token accepts a custom urlread side effect:
        _get_data_from_token = DcDataselectManager._get_data_from_token

        def dc_get_data_from_token_func(url_read_side_effect=None, *a, **kw):
            if url_read_side_effect is not None:
                self.setup_urlopen(url_read_side_effect)
            return _get_data_from_token(*a, **kw)
        self.dc_get_data_from_token = dc_get_data_from_token_func

        # class-level patchers:
        with patch('stream2segment.utils.url.urlopen') as mock_urlopen:
            self.mock_urlopen = mock_urlopen
            with patch('stream2segment.utils.inputargs.get_session', return_value=db.session):
                # return the test db session instead of creating a new one from the db url
                with patch('stream2segment.main.closesession'):  # no-op (do not close session)

                    # mock ThreadPool (tp) to run one instance at a time, so we
                    # get deterministic results:
                    class MockThreadPool(object):

                        def __init__(self, *a, **kw):
                            pass

                        def imap(self, func, iterable, *args):
                            # make imap deterministic: same as standard python map:
                            # everything is executed in a single thread in the right input order
                            return map(func, iterable)

                        def imap_unordered(self, func, iterable, *args):
                            # make imap_unordered deterministic: same as standard python map:
                            # everything is executed in a single thread in the right input order
                            return map(func, iterable)

                        def close(self, *a, **kw):
                            pass
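                    # Standalone illustration (not part of the test logic): plain map
                    # preserves input order, which is exactly what the mock above relies on
                    # to make imap / imap_unordered deterministic:
                    assert list(map(lambda x: x * 2, [3, 1, 2])) == [6, 2, 4]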
                    # assign patches and mocks:
                    with patch('stream2segment.utils.url.ThreadPool',
                               side_effect=MockThreadPool) as mock_thread_pool:

                        def c4d(logger, logfilebasepath, verbose):
                            # configure the logger as usual, but redirect to a temp file
                            # that will be deleted by pytest, instead of polluting the
                            # package directory:
                            ret = configlog4download(logger, pytestdir.newfile('.log'),
                                                     verbose)
                            logger.addHandler(self.handler)
                            return ret
                        with patch('stream2segment.main.configlog4download',
                                   side_effect=c4d) as mock_config4download:
                            self.mock_config4download = mock_config4download

                            yield
    def init(self, request, db, data):
        # re-init a sqlite database (no-op if the db is not sqlite):
        db.create(to_file=False)
        # setup a run_id:
        rdw = Download()
        db.session.add(rdw)
        db.session.commit()
        self.run = rdw

        # side effects:
        self._evt_urlread_sideeffect = """#EventID | Time | Latitude | Longitude | Depth/km | Author | Catalog | Contributor | ContributorID | MagType | Magnitude | MagAuthor | EventLocationName
20160508_0000129|2016-05-08 05:17:11.500000|1|1|60.0|AZER|EMSC-RTS|AZER|505483|ml|3|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN
20160508_0000004|2016-05-08 01:45:30.300000|90|90|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|4|EMSC|CROATIA
"""
        self._mintraveltime_sideeffect = cycle([1])
        self._seg_data = data.read("GE.FLT1..HH?.mseed")
        self._seg_data_gaps = data.read("IA.BAKI..BHZ.D.2016.004.head")
        self._seg_data_empty = b''
        self._seg_urlread_sideeffect = [
            self._seg_data, self._seg_data_gaps, 413, 500, self._seg_data[:2],
            self._seg_data_empty, 413,
            URLError("++urlerror++"),
            socket.timeout()
        ]
        self.service = ''  # so get_datacenters_df accepts any row by default
        self.db_buf_size = 1
        self.routing_service = yaml_load(get_templates_fpath("download.yaml"))\
            ['advanced_settings']['routing_service_url']

        # NON db stuff (logging, patchers, pandas...):
        self.loghandler = StreamHandler(stream=StringIO())

        # THIS IS A HACK:
        query_logger.setLevel(logging.INFO)  # necessary to forward to handlers
        # if we call the closing function (we are testing the whole chain) the level will be
        # reset (to logging.INFO), otherwise it stays at what we set two lines above. Problems
        # might arise if closing sets a different level, but for the moment who cares
        query_logger.addHandler(self.loghandler)

        # when debugging, I want the full dataframe with to_string(), not truncated
        # NOTE: this messes up right alignment of numbers in DownloadStats (see utils.py)
        # FIRST, remember current settings and restore them in cleanup:
        _pd_display_maxcolwidth = pd.get_option('display.max_colwidth')
        pd.set_option('display.max_colwidth', -1)
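        # (Side note / assumption: pandas also offers the context manager
        # pd.option_context('display.max_colwidth', ...) for this, but here the option is
        # set and restored manually because the restore must happen in the pytest
        # finalizer defined below.)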

        # define class-level patchers (we do not use a yield because we need to do more
        # in the finalizer, see below):
        patchers = []

        patchers.append(patch('stream2segment.utils.url.urlopen'))
        self.mock_urlopen = patchers[-1].start()

        # mock ThreadPool (tp) to run one instance at a time, so we get deterministic results:
        class MockThreadPool(object):
            def __init__(self, *a, **kw):
                pass

            def imap(self, func, iterable, *args):
                # make imap deterministic: same as standard python map:
                # everything is executed in a single thread in the right input order
                return map(func, iterable)

            def imap_unordered(self, func_, iterable, *args):
                # make imap_unordered deterministic: same as standard python map:
                # everything is executed in a single thread in the right input order
                return map(func_, iterable)

            def close(self, *a, **kw):
                pass

        # assign patches and mocks:
        patchers.append(patch('stream2segment.utils.url.ThreadPool'))
        self.mock_tpool = patchers[-1].start()
        self.mock_tpool.side_effect = MockThreadPool

        # add finalizer:
        def delete():
            pd.set_option('display.max_colwidth', _pd_display_maxcolwidth)

            for patcher in patchers:
                patcher.stop()

            hndls = query_logger.handlers[:]
            for h in hndls:
                if h is self.loghandler:
                    self.loghandler.close()
                    query_logger.removeHandler(h)

        request.addfinalizer(delete)
    def test_retry(self, mock_get_opener, mock_get_data_from_token,
                   mock_get_data_from_userpass,
                   mock_get_data_open, mock_updatedf, mock_insertdf, mock_mseed_unpack,
                   mock_download_save_segments, mock_save_inventories, mock_get_channels_df,
                   mock_get_datacenters_df, mock_get_events_df,
                   # fixtures:
                   db, clirunner, pytestdir, yamlfile):

        mock_get_events_df.side_effect = lambda *a, **v: self.get_events_df(None, *a, **v)
        mock_get_datacenters_df.side_effect = \
            lambda *a, **v: self.get_datacenters_df(None, *a, **v)
        mock_get_channels_df.side_effect = lambda *a, **v: self.get_channels_df(None, *a, **v)
        mock_save_inventories.side_effect = lambda *a, **v: self.save_inventories(None, *a, **v)
        mock_download_save_segments.side_effect = \
            lambda *a, **v: self.download_save_segments([URLError('abc')], *a, **v)
        # mseed unpack is mocked by accepting only first arg (so that time bounds are
        # not considered)
        mock_mseed_unpack.side_effect = lambda *a, **v: unpack(a[0])
        mock_insertdf.side_effect = lambda *a, **v: insertdf(*a, **v)
        mock_updatedf.side_effect = lambda *a, **v: updatedf(*a, **v)
        # prevlen = len(db.session.query(Segment).all())

        # mock our opener
        m = Mock()
        mockopen = Mock()
        mockopen.read = lambda *a, **v: b''
        mockopen.msg = 'abc'
        mockopen.code = 204
        m.open = lambda *a, **v: mockopen
        # m.read = lambda *a, **v: ''
        mock_get_opener.side_effect = lambda *a, **v: m

        # patching class methods while preserving the original call requires storing the
        # original methods once (as class attributes), then setting the side effect of each
        # mocked method to those attributes, so as to preserve the original functionality
        # while still being able to assert that the mock_* functions are called, and so on.
        # For info see:
        # https://stackoverflow.com/a/29563665
        mock_get_data_open.side_effect = self.dc_get_data_open
        mock_get_data_from_userpass.side_effect = self.dc_get_data_from_userpass
        mock_get_data_from_token.side_effect = \
            lambda *a, **kw: self.dc_get_data_from_token([URLError('a'), 'abc'], *a, **kw)
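        # Minimal standalone sketch of that pattern with a dummy class (illustration only,
        # nothing stream2segment-specific; assumes `patch` is the mock patcher already
        # imported by this module):
        class _Dummy(object):
            def greet(self):
                return 'hi'
        _original_greet = _Dummy.greet  # store the original once
        with patch.object(_Dummy, 'greet', autospec=True) as _mock_greet:
            # forward to the original so the behavior is preserved...
            _mock_greet.side_effect = lambda *a, **kw: _original_greet(*a, **kw)
            assert _Dummy().greet() == 'hi'
            assert _mock_greet.called  # ...while calls can still be asserted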

        # TEST 1: provide a file with valid token:
        tokenfile = pytestdir.newfile(create=True)
        with open(tokenfile, 'w') as fh:
            fh.write('BEGIN PGP MESSAGE')
        # mock yaml_load to override restricted_data:

        # launch two download runs with different responses for token auth query:
        for tokenquery_mocked_return_values, dc_token_failed in \
            ([[URLError('a'), 'uzer:pazzword'], "http://geofon.gfz-potsdam.de"],
             [['uzer:pazzword', URLError('a')], 'http://ws.resif.fr']):
            # set how many times self.mock_urlopen has been called:
            mock_urlopen_call_count = self.mock_urlopen.call_count
            # TEST 2: USERPASS good for just one datacenter:
            mock_get_data_open.reset_mock()
            mock_get_data_from_token.reset_mock()
            mock_get_data_from_userpass.reset_mock()
            mock_get_opener.reset_mock()
            mock_get_data_from_token.side_effect = \
                lambda *a, **kw: self.dc_get_data_from_token(tokenquery_mocked_return_values,
                                                             *a, **kw)
            yaml_file = yamlfile(restricted_data=os.path.abspath(tokenfile),
                                 retry_client_err=False)
            result = clirunner.invoke(cli, ['download',
                                            '-c', yaml_file,
                                            '--dburl', db.dburl,
                                            '--start', '2016-05-08T00:00:00',
                                            '--end', '2016-05-08T9:00:00'])
            assert clirunner.ok(result)
            assert 'restricted_data: %s' % os.path.abspath(tokenfile) in result.output
            assert 'STEP 5 of 8: Acquiring credentials from token' in result.output
            # assert we print that we are downloading open and restricted data:
            assert re.search(r'STEP 7 of 8\: Downloading \d+ segments and saving to db',
                             result.output)
            assert not mock_get_data_open.called
            assert mock_get_data_from_token.called
            assert not mock_get_data_from_userpass.called

            assert "Downloading open data only from: %s" % dc_token_failed
            dc_token_ok = 'http://ws.resif.fr' \
                if dc_token_failed == "http://geofon.gfz-potsdam.de" else \
                "http://geofon.gfz-potsdam.de"
            assert mock_get_opener.call_count == 1
            assert mock_get_opener.call_args_list[0][0][:] == (dc_token_ok, 'uzer', 'pazzword')

            dc_id = {Fdsnws(i[1]).site: i[0] for i in
                     db.session.query(DataCenter.id, DataCenter.dataselect_url)}
            # assert urlopen has been called only once with query and not queryauth:
            # get the segments dataframe we (re)downloaded:
            segments_df_to_download = mock_download_save_segments.call_args_list[-1][0][1]
            dc2download = pd.unique(segments_df_to_download['datacenter_id']).tolist()
            # set the expected call count based on the datacenters of (re)downloaded segments:
            if dc_id[dc_token_failed] not in dc2download:
                assert self.mock_urlopen.call_count == 0
            else:
                assert self.mock_urlopen.call_count >= 1
                for i in range(1, self.mock_urlopen.call_count + 1):
                    assert self.mock_urlopen.call_args_list[-i][0][0].get_full_url() == \
                        dc_token_failed + "/fdsnws/dataselect/1/query"
    def test_restricted(self, mock_get_opener, mock_get_data_from_token,
                        mock_get_data_from_userpass,
                        mock_get_data_open, mock_updatedf, mock_insertdf, mock_mseed_unpack,
                        mock_download_save_segments, mock_save_inventories, mock_get_channels_df,
                        mock_get_datacenters_df, mock_get_events_df,
                        # fixtures:
                        db, clirunner, pytestdir, yamlfile):

        mock_get_events_df.side_effect = lambda *a, **v: self.get_events_df(None, *a, **v)
        mock_get_datacenters_df.side_effect = \
            lambda *a, **v: self.get_datacenters_df(None, *a, **v) 
        mock_get_channels_df.side_effect = lambda *a, **v: self.get_channels_df(None, *a, **v)
        mock_save_inventories.side_effect = lambda *a, **v: self.save_inventories(None, *a, **v)
        mock_download_save_segments.side_effect = \
            lambda *a, **v: self.download_save_segments(None, *a, **v)
        # mseed unpack is mocked by accepting only first arg
        # (so that time bounds are not considered)
        mock_mseed_unpack.side_effect = lambda *a, **v: unpack(a[0])
        mock_insertdf.side_effect = lambda *a, **v: insertdf(*a, **v)
        mock_updatedf.side_effect = lambda *a, **v: updatedf(*a, **v)
        # prevlen = len(db.session.query(Segment).all())

        # patching class methods while preserving the original call requires storing the
        # original methods once (as class attributes), then setting the side effect of each
        # mocked method to those attributes, so as to preserve the original functionality
        # while still being able to assert that the mock_* functions are called, and so on.
        # For info see:
        # https://stackoverflow.com/a/29563665
        mock_get_data_open.side_effect = self.dc_get_data_open
        mock_get_data_from_userpass.side_effect = self.dc_get_data_from_userpass
        mock_get_data_from_token.side_effect = \
            lambda *a, **kw: self.dc_get_data_from_token([URLError('a'), 'abc'], *a, **kw)

        # TEST 1: provide a file with valid token:
        tokenfile = pytestdir.newfile(create=True)
        with open(tokenfile, 'w') as fh:
            fh.write('BEGIN PGP MESSAGE')
        # mock yaml_load to override restricted_data:
        yaml_file = yamlfile(restricted_data=os.path.abspath(tokenfile))
        # The run table is populated with a run_id in this class's init; to check run_ids
        # later, store here the current number of runs in the table:
        runs = len(db.session.query(Download.id).all())
        result = clirunner.invoke(cli, ['download',
                                        '-c', yaml_file,
                                        '--dburl', db.dburl,
                                        '--start', '2016-05-08T00:00:00',
                                        '--end', '2016-05-08T9:00:00'])
        assert clirunner.ok(result)
        assert 'Downloading 12 segments (open data only)' in result.output
        assert 'STEP 5 of 8: Acquiring credentials from token' in result.output
        # note that, probably due to dict ordering differences between py2 and py3, we need
        # to test both orderings:
        if not ('Downloading open data only from: http://geofon.gfz-potsdam.de, '
                'http://ws.resif.fr (Unable to acquire credentials for restricted data)') in \
                result.output:
            assert ('Downloading open data only from: http://ws.resif.fr, '
                    'http://geofon.gfz-potsdam.de (Unable to acquire credentials for restricted data)') in \
                    result.output
        # assert we print that we are downloading open data only (all errors):
        assert 'STEP 7 of 8: Downloading 12 segments (open data only)' in result.output
        assert not mock_get_data_open.called
        assert mock_get_data_from_token.called
        assert not mock_get_data_from_userpass.called
        assert not mock_get_opener.called
        # some assertions to check data properly written
        # These are important because they confirm that data has been downloaded anyway
        # (the test does not differentiate between restricted or open data)
        assert len(db.session.query(Download.id).all()) == runs + 1
        runs += 1
        segments = db.session.query(Segment).all()
        assert len(segments) == 12
        segments = db.session.query(Segment).filter(Segment.has_data).all()
        assert len(segments) == 4
        assert len(db.session.query(Station).filter(Station.has_inventory).all()) == 2
        assert mock_updatedf.called  # called while saving inventories
        assert mock_insertdf.called
    def init(self, request, db, data):
        # re-init a sqlite database (no-op if the db is not sqlite):
        db.create(to_file=False)
        # setup a run_id:
        rdw = Download()
        db.session.add(rdw)
        db.session.commit()
        self.run = rdw

        # side effects:
        self._dc_urlread_sideeffect = """http://geofon.gfz-potsdam.de/fdsnws/dataselect/1/query
ZZ * * * 2002-09-01T00:00:00 2005-10-20T00:00:00
UP ARJ * * 2013-08-01T00:00:00 2017-04-25

http://ws.resif.fr/fdsnws/dataselect/1/query
ZU * * HHZ 2015-01-01T00:00:00 2016-12-31T23:59:59.999999

"""
        self._mintraveltime_sideeffect = cycle([1])
        self._seg_data = data.read("GE.FLT1..HH?.mseed")
        self._seg_data_gaps = data.read("IA.BAKI..BHZ.D.2016.004.head")
        self._seg_data_empty = b''
        self._seg_urlread_sideeffect = [
            self._seg_data, self._seg_data_gaps, 413, 500, self._seg_data[:2],
            self._seg_data_empty, 413,
            URLError("++urlerror++"),
            socket.timeout()
        ]
        self.service = ''  # so get_datacenters_df accepts any row by default
        self.db_buf_size = 1
        self.routing_service = yaml_load(get_templates_fpath("download.yaml"))\
            ['advanced_settings']['routing_service_url']

        # NON db stuff (logging, patchers, pandas...):
        self.logout = StringIO()
        handler = StreamHandler(stream=self.logout)
        self._logout_cache = ""
        # THIS IS A HACK:
        query_logger.setLevel(logging.INFO)  # necessary to forward to handlers
        # if we call the closing function (we are testing the whole chain) the level will be
        # reset (to logging.INFO), otherwise it stays at what we set two lines above. Problems
        # might arise if closing sets a different level, but for the moment who cares
        query_logger.addHandler(handler)

        # when debugging, I want the full dataframe with to_string(), not truncated
        # NOTE: this messes up right alignment of numbers in DownloadStats (see utils.py)
        # FIRST, remember current settings and restore them in cleanup:
        _pd_display_maxcolwidth = pd.get_option('display.max_colwidth')
        pd.set_option('display.max_colwidth', -1)

        # define class-level patchers (we do not use a yield because we need to do more
        # in the finalizer, see below):
        patchers = []

        patchers.append(patch('stream2segment.utils.url.urlopen'))
        self.mock_urlopen = patchers[-1].start()

        # mock ThreadPool (tp) to run one instance at a time, so we get deterministic results:
        class MockThreadPool(object):
            def __init__(self, *a, **kw):
                pass

            def imap(self, func, iterable, *args):
                # make imap deterministic: same as standard python map:
                # everything is executed in a single thread in the right input order
                return map(func, iterable)

            def imap_unordered(self, func_, iterable, *args):
                # make imap_unordered deterministic: same as standard python map:
                # everything is executed in a single thread in the right input order
                return map(func_, iterable)

            def close(self, *a, **kw):
                pass

        # assign patches and mocks:
        patchers.append(patch('stream2segment.utils.url.ThreadPool'))
        self.mock_tpool = patchers[-1].start()
        self.mock_tpool.side_effect = MockThreadPool

        # add finalizer:
        def delete():
            pd.set_option('display.max_colwidth', _pd_display_maxcolwidth)

            for patcher in patchers:
                patcher.stop()

            hndls = query_logger.handlers[:]
            handler.close()
            for h in hndls:
                if h is handler:
                    query_logger.removeHandler(h)

        request.addfinalizer(delete)
    def test_get_dcs_routingerror(
            self,
            mock_fileopen,
            # fixtures:
            db):
        '''test fetching datacenters eida, iris, custom url'''
        # this is the output when using eida as service:
        urlread_sideeffect = [URLError('wat?')]

        # the following params are not used here and could be left as defaults; provide
        # something meaningful anyway:
        net, sta, loc, cha = ['*'], [], [], ['HH?', 'BH?']
        starttime = datetime.utcnow()
        endtime = starttime + timedelta(minutes=1.1)

        # normal fdsn service ("https://mocked_domain/fdsnws/station/1/query")
        # we should not call self.mock_urlopen and not mock_fileopen (no eida)
        dcdf, eidavalidator = self.get_datacenters_df(
            urlread_sideeffect,
            db.session,
            "https://mock/fdsnws/station/1/query",
            self.routing_service,
            net,
            sta,
            loc,
            cha,
            starttime,
            endtime,
            db_bufsize=self.db_buf_size)
        assert not self.mock_urlopen.called
        assert not mock_fileopen.called
        assert eidavalidator is None
        assert len(dcdf) == 1
        assert db.session.query(DataCenter).count() == 1

        # iris:
        # we should not call self.mock_urlopen and not mock_fileopen (no eida)
        dcdf, eidavalidator = self.get_datacenters_df(
            urlread_sideeffect,
            db.session,
            "iris",
            self.routing_service,
            net,
            sta,
            loc,
            cha,
            starttime,
            endtime,
            db_bufsize=self.db_buf_size)
        assert not self.mock_urlopen.called
        assert not mock_fileopen.called
        assert eidavalidator is None
        assert len(dcdf) == 1
        assert db.session.query(DataCenter).\
            filter(DataCenter.organization_name == 'iris').count() == 1

        # eida:
        # we should call self.mock_urlopen and mock_fileopen (eida error => read from file)
        dcdf, eidavalidator = self.get_datacenters_df(
            urlread_sideeffect,
            db.session,
            "eida",
            self.routing_service,
            net,
            sta,
            loc,
            cha,
            starttime,
            endtime,
            db_bufsize=self.db_buf_size)
        assert self.mock_urlopen.called
        assert mock_fileopen.called
        msg = self.log_msg()
        _, last_mod_time = _get_local_routing_service()
        expected_str = ("Eida routing service error, reading routes from file "
                        "(last updated: %s") % last_mod_time
        assert expected_str in msg
        assert eidavalidator is not None
        assert db.session.query(DataCenter).\
            filter(DataCenter.organization_name == 'eida').count() == 10
        assert len(dcdf) == 10

        #         with pytest.raises(FailedDownload) as qdown:
        #             data, _ = self.get_datacenters_df(urlread_sideeffect, db.session, "eida",
        #                                               self.routing_service,
        #                                               net, sta, loc, cha, starttime, endtime,
        #                                               db_bufsize=self.db_buf_size)
        #         assert self.mock_urlopen.called
        #         assert "Eida routing service error, no eida data-center saved in database" \
        #             in str(qdown.value)

        # now let's mock a valid response from the eida routing service
        self.mock_urlopen.reset_mock()
        mock_fileopen.reset_mock()
        urlread_sideeffect = [
            """http://ws.resif.fr/fdsnws/station/1/query
http://geofon.gfz-potsdam.de/fdsnws/station/1/query

http://geofon.gfz-potsdam.de/fdsnws/station/1/query
ZZ * * * 2002-09-01T00:00:00 2005-10-20T00:00:00
UP ARJ * BHW 2013-08-01T00:00:00 2017-04-25"""
        ]
        dcdf, eidavalidator = self.get_datacenters_df(
            urlread_sideeffect,
            db.session,
            "eida",
            self.routing_service,
            net,
            sta,
            loc,
            cha,
            starttime,
            endtime,
            db_bufsize=self.db_buf_size)
        assert self.mock_urlopen.called
        assert not mock_fileopen.called
        assert db.session.query(DataCenter).\
            filter(DataCenter.organization_name == 'eida').count() == 10
        assert len(dcdf) == 2
        assert "Eida routing service error, reading from file (last updated: " \
            not in self.log_msg()[len(msg):]

        # write two new eida data centers
        self.mock_urlopen.reset_mock()
        mock_fileopen.reset_mock()
        urlread_sideeffect = [
            """http://ws.NEWDC1.fr/fdsnws/station/1/query
http://geofon.gfz-potsdam.de/fdsnws/station/1/query

http://NEWDC2.gfz-potsdam.de/fdsnws/station/1/query
ZZ * * * 2002-09-01T00:00:00 2005-10-20T00:00:00
UP ARJ * BHW 2013-08-01T00:00:00 2017-04-25"""
        ]
        dcdf, eidavalidator = self.get_datacenters_df(
            urlread_sideeffect,
            db.session,
            "eida",
            self.routing_service,
            net,
            sta,
            loc,
            cha,
            starttime,
            endtime,
            db_bufsize=self.db_buf_size)
        assert self.mock_urlopen.called
        assert not mock_fileopen.called
        assert db.session.query(DataCenter).\
            filter(DataCenter.organization_name == 'eida').count() == 12
        assert len(dcdf) == 2
    def test_download_save_segments_timebounds(self, mock_updatedf, mock_insertdf, mseed_unpack,
                                               db, tt_ak135_tts):
        # prepare:
        # here mseed unpack forwards all its arguments (including the request time bounds),
        # so that out-of-bound data is actually detected:
        mseed_unpack.side_effect = lambda *a, **v: unpack(*a, **v)
        mock_insertdf.side_effect = lambda *a, **v: insertdf(*a, **v)
        mock_updatedf.side_effect = lambda *a, **v: updatedf(*a, **v)

        # mock event response: same as self._evt_urlread_sideeffect but with the event times
        # set to NOW. This means any segment downloaded later will be out of the requested
        # time bounds
        utcnow = datetime.utcnow()
        utcnow_iso = utcnow.isoformat().replace("T", " ")
        urlread_sideeffect = """#EventID | Time | Latitude | Longitude | Depth/km | Author | Catalog | Contributor | ContributorID | MagType | Magnitude | MagAuthor | EventLocationName
20160508_0000129|%s|1|1|60.0|AZER|EMSC-RTS|AZER|505483|ml|3|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN
20160508_0000004|%s|90|90|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|4|EMSC|CROATIA
""" % (utcnow_iso, utcnow_iso)
        events_df = self.get_events_df(urlread_sideeffect, db.session)
        # restore urlread_side_effect:
        urlread_sideeffect = None
        net, sta, loc, cha = [], [], [], []
        datacenters_df, eidavalidator = \
            self.get_datacenters_df(urlread_sideeffect, db.session, self.service,
                                    self.routing_service, net, sta, loc, cha,
                                    db_bufsize=self.db_buf_size)
        channels_df = self.get_channels_df(urlread_sideeffect, db.session,
                                           datacenters_df,
                                           eidavalidator,
                                           net, sta, loc, cha, None, None, 10,
                                           False, None, None, -1, self.db_buf_size)
        # just to be sure. If failing, we might have changed the class default:
        assert len(channels_df) == 12
    # events_df
#                  id  magnitude  latitude  longitude  depth_km  time
# 0  20160508_0000129        3.0       1.0        1.0      60.0  2016-05-08 05:17:11.500
# 1  20160508_0000004        4.0       2.0        2.0       2.0  2016-05-08 01:45:30.300

# channels_df (index not shown):
# columns:
# id  station_id  latitude  longitude  datacenter_id start_time end_time network station location channel
# data (not aligned with columns):
# 1   1  1.0   1.0   1 2003-01-01 NaT  GE  FLT1    HHE
# 2   1  1.0   1.0   1 2003-01-01 NaT  GE  FLT1    HHN
# 3   1  1.0   1.0   1 2003-01-01 NaT  GE  FLT1    HHZ
# 4   2  90.0  90.0  1 2009-01-01 NaT  n1  s       c1
# 5   2  90.0  90.0  1 2009-01-01 NaT  n1  s       c2
# 6   2  90.0  90.0  1 2009-01-01 NaT  n1  s       c3
# 7   3  1.0   1.0   2 2003-01-01 NaT  IA  BAKI    BHE
# 8   3  1.0   1.0   2 2003-01-01 NaT  IA  BAKI    BHN
# 9   3  1.0   1.0   2 2003-01-01 NaT  IA  BAKI    BHZ
# 10  4  90.0  90.0  2 2009-01-01 NaT  n2  s       c1
# 11  4  90.0  90.0  2 2009-01-01 NaT  n2  s       c2
# 12  4  90.0  90.0  2 2009-01-01 NaT  n2  s       c3

        assert all(_ in channels_df.columns for _ in [Station.network.key, Station.station.key,
                                                      Channel.location.key, Channel.channel.key])
        chaid2mseedid = chaid2mseedid_dict(channels_df)
        # check that we removed the columns:
        assert not any(_ in channels_df.columns for _ in
                       [Station.network.key, Station.station.key,
                        Channel.location.key, Channel.channel.key])

        # take all segments:
        # use minmag and maxmag
        ttable = tt_ak135_tts
        segments_df = merge_events_stations(events_df, channels_df, dict(minmag=10, maxmag=10,
                                            minmag_radius=10, maxmag_radius=10), tttable=ttable)

        assert len(pd.unique(segments_df['arrival_time'])) == 2


# segments_df (index not shown). Note:
# cid sid did n   s    l  c    ed   event_id          depth_km                time  <- LAST TWO ARE Event related columns that will be removed after arrival_time calculations
# 1   1   1   GE  FLT1    HHE  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 2   1   1   GE  FLT1    HHN  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 3   1   1   GE  FLT1    HHZ  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 7   3   2   IA  BAKI    BHE  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 8   3   2   IA  BAKI    BHN  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 9   3   2   IA  BAKI    BHZ  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 4   2   1   n1  s       c1   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 5   2   1   n1  s       c2   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 6   2   1   n1  s       c3   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 10  4   2   n2  s       c1   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 11  4   2   n2  s       c2   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 12  4   2   n2  s       c3   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300

# LEGEND:
# cid = channel_id
# sid = station_id
# did = datacenter_id
# n, s, l, c = network, station, location, channel
# ed = event_distance_deg

        # define a dc_dataselect_manager for open data only:
        dc_dataselect_manager = DcDataselectManager(datacenters_df, Authorizer(None), False)

        wtimespan = [1, 2]  # in minutes
        expected = len(segments_df)  # no segment on db, we should have all segments to download
        orig_segments_df = segments_df.copy()
        segments_df, request_timebounds_need_update = \
            prepare_for_download(db.session, orig_segments_df, dc_dataselect_manager, wtimespan,
                                 retry_seg_not_found=True,
                                 retry_url_err=True,
                                 retry_mseed_err=True,
                                 retry_client_err=True,
                                 retry_server_err=True,
                                 retry_timespan_err=True,
                                 retry_timespan_warn=True)
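        # (with wtimespan = [1, 2] minutes, each segment's request window is presumably
        # arrival_time - 1 minute to arrival_time + 2 minutes, modulo some rounding, as
        # visible in the start_time / end_time columns sketched below)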

# segments_df
# COLUMNS:
# channel_id  datacenter_id network station location channel event_distance_deg event_id arrival_time start_time end_time id download_status_code run_id
# DATA (not aligned with columns):
#               channel_id  datacenter_id network station location channel  event_distance_deg  event_id            arrival_time          start_time            end_time    id download_status_code  run_id
# GE.FLT1..HHE  1           1              GE      FLT1             HHE     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# GE.FLT1..HHN  2           1              GE      FLT1             HHN     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# GE.FLT1..HHZ  3           1              GE      FLT1             HHZ     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# IA.BAKI..BHE  7           2              IA      BAKI             BHE     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# IA.BAKI..BHN  8           2              IA      BAKI             BHN     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# IA.BAKI..BHZ  9           2              IA      BAKI             BHZ     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# n1.s..c1      4           1              n1      s                c1      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n1.s..c2      5           1              n1      s                c2      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n1.s..c3      6           1              n1      s                c3      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n2.s..c1      10          2              n2      s                c1      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n2.s..c2      11          2              n2      s                c2      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n2.s..c3      12          2              n2      s                c3      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1

        # self._seg_data is the full content of a "valid" 3-channel miniseed file
        # (GE.FLT1..HHE, HHN, HHZ). Since the request time bounds are set around NOW,
        # any data actually downloaded will fall outside the requested window

        # setup urlread: first three rows: ok
        # rows[3:6]: 413, retry them
        # rows[6:9]: malformed_data
        # rows[9:12] 413, retry them
        # then retry:
        # rows[3]: empty_data
        # rows[4]: data_with_gaps (but the seed_id should not match)
        # rows[5]: data_with_gaps (the seed_id should not match)
        # rows[9]: URLError
        # rows[10]: Http 500 error
        # rows[11]: 413

        # NOTE: this relies on requests being executed in the order of the dataframe, which
        # holds here because ThreadPool is mocked to run sequentially (see MockThreadPool above)
        # self._seg_data[:2] is a way to mock data corrupted
        urlread_sideeffect = [self._seg_data, 413, self._seg_data[:2], 413,
                              '', self._seg_data_gaps, self._seg_data_gaps,
                              URLError("++urlerror++"), 500, 413]
        # Let's go:
        ztatz = self.download_save_segments(urlread_sideeffect, db.session, segments_df,
                                            dc_dataselect_manager,
                                            chaid2mseedid,
                                            self.run.id, False,
                                            request_timebounds_need_update,
                                            1, 2, 3, db_bufsize=self.db_buf_size)
        # get columns from db which we are interested on to check
        cols = [Segment.id, Segment.channel_id, Segment.datacenter_id,
                Segment.download_code, Segment.maxgap_numsamples,
                Segment.sample_rate, Segment.data_seed_id, Segment.data, Segment.download_id,
                Segment.request_start, Segment.request_end, Segment.start_time, Segment.end_time
                ]
        db_segments_df = dbquery2df(db.session.query(*cols))
        assert Segment.download_id.key in db_segments_df.columns

        OUTTIME_ERR, OUTTIME_WARN = s2scodes.timespan_err, s2scodes.timespan_warn
        # assert no segment has data (time out of bounds):
        assert len(db_segments_df.loc[(~pd.isnull(db_segments_df[Segment.data.key])) &
                                      (db_segments_df[Segment.data.key].str.len() > 0),
                                      Segment.data.key]) == 0
        # assert the number of "correctly" downloaded segments, i.e. with data (4) has now
        # code = TIMEBOUND_ERR
        assert len(db_segments_df[db_segments_df[Segment.download_code.key] == OUTTIME_ERR]) == 4

        # re-sort db_segments_df to match the segments_df:
        ret = []
        for cha in segments_df[Segment.channel_id.key]:
            ret.append(db_segments_df[db_segments_df[Segment.channel_id.key] == cha])
        db_segments_df = pd.concat(ret, axis=0)

# db_segments_df:
#    id  channel_id  datacenter_id  download_status_code  max_gap_ovlap_ratio  sample_rate data_seed_id     data  run_id          start_time            end_time
# 0  1   1           1              -3                    0.0001               100.0        GE.FLT1..HHE    b''   1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 1  2   2           1              -3                    0.0001               100.0        GE.FLT1..HHN    b''   1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 2  3   3           1              -3                    0.0001               100.0        GE.FLT1..HHZ    b''   1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 6  7   7           2              200.0                 NaN                  NaN          None                  1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 7  8   8           2              NaN                   NaN                  NaN          None            None  1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 8  9   9           2              -3                 20.0                 20.0         IA.BAKI..BHZ    b''   1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 3  4   4           1             -2.0                   NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 4  5   5           1             -2.0                   NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 5  6   6           1             -2.0                   NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 9  10  10          2              -1.0                  NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 10 11  11          2              500.0                 NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 11 12  12          2              413.0                 NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
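# Hedged legend of the download codes above (inferred from the s2scodes constants used in
# these tests; treat it as an assumption rather than an authoritative mapping):
# 200 / 413 / 500 -> HTTP status codes as received; -1 -> URL/connection error;
# -2 -> MiniSEED read error; -3 -> requested time span not covered (OUTTIME_ERR);
# None / NaN -> no response code saved.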

        # now modify the first row's time bounds:
        # first we need to assign the database id to our segments_df, to prevent a db
        # constraint error when writing to the db:
        # `download_save_segments` below needs to UPDATE the segments, and it does so by
        # checking whether an id is present.
        # check that the channel_ids align:
        assert (segments_df[Segment.channel_id.key].values ==
                db_segments_df[Segment.channel_id.key].values).all()
        # so that we can simply do this:
        segments_df[Segment.id.key] = db_segments_df[Segment.id.key]

        # first read the miniseed:
        stream = read(BytesIO(self._seg_data))
        tstart = stream[0].stats.starttime.datetime
        tend = stream[0].stats.endtime.datetime
        segments_df.loc[segments_df[Segment.channel_id.key] == 1,
                        Segment.request_start.key] = tstart
        segments_df.loc[segments_df[Segment.channel_id.key] == 1,
                        Segment.request_end.key] = tstart + (tend-tstart)/2

        segments_df.loc[segments_df[Segment.channel_id.key] == 2,
                        Segment.request_start.key] = tstart
        segments_df.loc[segments_df[Segment.channel_id.key] == 2,
                        Segment.request_end.key] = tend
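        # (in other words: channel 1's request window now covers only the first half of the
        # available waveform -> expect a partial save, flagged with a time-span warning;
        # channel 2's window covers the whole waveform -> expect a complete save; channel 3
        # is left unchanged -> still entirely out of bounds)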

        # build a segments_df of the three segments belonging to the same channel
        # copy at the end to avoid pandas settingwithcopy warning
        new_segments_df = \
            segments_df.loc[segments_df[Segment.channel_id.key].isin([1, 2, 3]), :].copy()
        # change urlread_side_effect to provide, for the first three segments, the same
        # sequence of bytes. The data itself is OK, but it will be PARTIALLY saved in the
        # first case, TOTALLY in the second, and NOT AT ALL in the third:
        urlread_sideeffect = [self._seg_data, self._seg_data, self._seg_data]
        # define a dc_dataselect_manager for open data only:
        dc_dataselect_manager = DcDataselectManager(datacenters_df, Authorizer(None), False)
        ztatz = self.download_save_segments(urlread_sideeffect, db.session, new_segments_df,
                                            dc_dataselect_manager,
                                            chaid2mseedid,
                                            self.run.id, False,
                                            request_timebounds_need_update,
                                            1, 2, 3, db_bufsize=self.db_buf_size)
        db_segments_df = dbquery2df(db.session.query(*cols))
        # re-sort db_segments_df to match the segments_df:
        ret = [db_segments_df[db_segments_df[Segment.channel_id.key] == cha]
               for cha in segments_df[Segment.channel_id.key]]
        db_segments_df = pd.concat(ret, axis=0)

        # assert the 1st segment, whose time range has been modified, has data, BUT its
        # download code is now the time-span WARNING (only part of the window is covered)
        df__ = db_segments_df.loc[db_segments_df[Segment.channel_id.key] == 1, :]
        assert len(df__) == 1
        row__ = df__.iloc[0]
        assert row__[Segment.download_code.key] == OUTTIME_WARN
        assert len(row__[Segment.data.key]) > 0

        # assert the 2nd segment whose time range has been modified has data, AND
        # download_status_code 200 (ok)
        df__ = db_segments_df.loc[db_segments_df[Segment.channel_id.key] == 2, :]
        assert len(df__) == 1
        row__ = df__.iloc[0]
        assert row__[Segment.download_code.key] == 200
        assert len(row__[Segment.data.key]) > 0

        # assert the 3rd segment whose time range has NOT been modified has no data,
        # AND download_status_code is still TIMEBOUNDS_ERROR
        df__ = db_segments_df.loc[db_segments_df[Segment.channel_id.key] == 3, :]
        assert len(df__) == 1
        row__ = df__.iloc[0]
        assert row__[Segment.download_code.key] == OUTTIME_ERR
        assert len(row__[Segment.data.key]) == 0
    def test_retry2(self, mock_get_opener, mock_get_data_from_token,
                    mock_get_data_from_userpass,
                    mock_get_data_open, mock_updatedf, mock_insertdf, mock_mseed_unpack,
                    mock_download_save_segments, mock_save_inventories, mock_get_channels_df,
                    mock_get_datacenters_df, mock_get_events_df,
                    # fixtures:
                    db, clirunner, pytestdir, yamlfile):

        mock_get_events_df.side_effect = lambda *a, **v: self.get_events_df(None, *a, **v)
        mock_get_datacenters_df.side_effect = \
            lambda *a, **v: self.get_datacenters_df(None, *a, **v)
        mock_get_channels_df.side_effect = lambda *a, **v: self.get_channels_df(None, *a, **v)
        mock_save_inventories.side_effect = lambda *a, **v: self.save_inventories(None, *a, **v)
        RESPONSES = [URLError('abc')]
        mock_download_save_segments.side_effect = \
            lambda *a, **v: self.download_save_segments(RESPONSES, *a, **v)
        # mseed unpack is mocked by accepting only first arg (so that time bounds are not
        # considered)
        mock_mseed_unpack.side_effect = lambda *a, **v: unpack(a[0])
        mock_insertdf.side_effect = lambda *a, **v: insertdf(*a, **v)
        mock_updatedf.side_effect = lambda *a, **v: updatedf(*a, **v)
        # prevlen = len(db.session.query(Segment).all())

        # patching class methods while preserving the original call requires storing the
        # original methods once (as class attributes), then setting the side effect of each
        # mocked method to those attributes, so as to preserve the original functionality
        # while still being able to assert that the mock_* functions are called, and so on.
        # For info see:
        # https://stackoverflow.com/a/29563665
        mock_get_data_open.side_effect = self.dc_get_data_open
        mock_get_data_from_userpass.side_effect = self.dc_get_data_from_userpass
        mock_get_data_from_token.side_effect = \
            lambda *a, **kw: self.dc_get_data_from_token(['a:b', 'c:d'], *a, **kw)

        # TEST 1: provide a file with valid token:
        tokenfile = pytestdir.newfile(create=True)
        with open(tokenfile, 'w') as fh:
            fh.write('BEGIN PGP MESSAGE')
        # mock yaml_load to override restricted_data:

        # USERPASS good for both datacenters:
        mock_get_data_open.reset_mock()
        mock_get_data_from_token.reset_mock()
        mock_get_data_from_userpass.reset_mock()
        mock_get_opener.reset_mock()
        mock_get_data_from_token.side_effect = \
            lambda *a, **kw: self.dc_get_data_from_token(['uzer:pazzword', 'uzer:pazzword'],
                                                         *a, **kw)
        yaml_file = yamlfile(restricted_data=os.path.abspath(tokenfile),
                             retry_client_err=False)
        result = clirunner.invoke(cli, ['download',
                                        '-c', yaml_file,
                                        '--dburl', db.dburl,
                                        '--start', '2016-05-08T00:00:00',
                                        '--end', '2016-05-08T9:00:00'])
        assert clirunner.ok(result)
        # get db data, sort by index and reset index to assure comparison across data frames:
        seg_df = dbquery2df(db.session.query(Segment.id, Segment.download_code,
                                             Segment.queryauth, Segment.download_id))\
            .sort_values(by=[Segment.id.key]).reset_index(drop=True)
        # seg_df:
        # id  download_code  queryauth  download_id
        # 1  -1              True       2
        # 2  -1              True       2
        # 3  -1              True       2
        # 4  -1              True       2
        # 5  -1              True       2
        # 6  -1              True       2
        # 7  -1              True       2
        # 8  -1              True       2
        # 9  -1              True       2
        # 10 -1              True       2
        # 11 -1              True       2
        # 12 -1              True       2
        urlerr, mseederr = s2scodes.url_err, s2scodes.mseed_err
        # according to our mock, we should have all urlerr codes:
        assert (seg_df[Segment.download_code.key] == urlerr).all()
        assert (seg_df[Segment.queryauth.key] == True).all()
        DOWNLOADID = 2
        assert (seg_df[Segment.download_id.key] == DOWNLOADID).all()
        # other assertions:
        assert 'restricted_data: %s' % os.path.abspath(tokenfile) in result.output
        assert 'STEP 5 of 8: Acquiring credentials from token' in result.output
        # assert we print that we are downloading open and restricted data:
        assert re.search(r'STEP 7 of 8\: Downloading \d+ segments and saving to db',
                         result.output)
        assert not mock_get_data_open.called
        assert mock_get_data_from_token.called
        assert not mock_get_data_from_userpass.called
        # no credentials failed:
        assert "Downloading open data only from: " not in result.output

        # Ok, test retry:
        new_seg_df = seg_df.copy()
        # we have 12 segments; change their download codes. The second element of each
        # tuple denotes queryauth (True or False):
        code_queryauth = [(204, False), (204, True), (404, False), (404, True),
                          (401, False), (401, True), (403, False), (403, True),
                          (400, True), (400, False), (None, False), (None, True)]
        for id_, (dc_, qa_) in zip(seg_df[Segment.id.key].tolist(), code_queryauth):
            seg = db.session.query(Segment).filter(Segment.id == id_).first()
            seg.download_code = dc_
            seg.queryauth = qa_
            # set expected values (see also yamlfile below)
            # remember that any segment download will give urlerr as code
            expected_new_download_code = dc_
            expected_download_id = DOWNLOADID
            if dc_ in (204, 404, 401, 403) and qa_ is False:
                # to retry because they failed due to authorization problems
                # (or most likely they did)
                expected_new_download_code = urlerr
                expected_download_id = DOWNLOADID + 1
            elif dc_ is None or (dc_ < 400 and dc_ >= 500):
                # to retry because of the flags (see yamlfile below)
                expected_new_download_code = urlerr
                expected_download_id = DOWNLOADID + 1
            expected_query_auth = qa_ if dc_ == 400 else True

            new_seg_df.loc[new_seg_df[Segment.id.key] == id_, :] = \
                (id_, expected_new_download_code, expected_query_auth, expected_download_id)
            db.session.commit()

        # re-download and check what we have retried:
        yaml_file = yamlfile(restricted_data=os.path.abspath(tokenfile),
                             retry_seg_not_found=True,
                             retry_client_err=False)
        result = clirunner.invoke(cli, ['download',
                                        '-c', yaml_file,
                                        '--dburl', db.dburl,
                                        '--start', '2016-05-08T00:00:00',
                                        '--end', '2016-05-08T9:00:00'])
        DOWNLOADID += 1
        assert clirunner.ok(result)
        # get db data, sort by segment id and reset the index to ensure comparison across data frames:
        seg_df2 = dbquery2df(db.session.query(Segment.id, Segment.download_code, Segment.queryauth,
                                              Segment.download_id))\
            .sort_values(by=[Segment.id.key]).reset_index(drop=True)
        # seg_df2:
        # id  download_code  queryauth  download_id
        # 1  -1              True       3
        # 2   204            True       2
        # 3  -1              True       3
        # 4   404            True       2
        # 5  -1              True       3
        # 6   401            True       2
        # 7  -1              True       3
        # 8   403            True       2
        # 9   400            True       2
        # 10  400            False      2
        # 11 -1              True       3
        # 12 -1              True       3
        pd.testing.assert_frame_equal(seg_df2, new_seg_df)

        # Another retry, without modifying the segments but setting retry_client_err to True.
        # Re-download and check what we have retried:
        yaml_file = yamlfile(restricted_data=os.path.abspath(tokenfile),
                             retry_seg_not_found=True,
                             retry_client_err=True)
        result = clirunner.invoke(cli, ['download',
                                        '-c', yaml_file,
                                        '--dburl', db.dburl,
                                        '--start', '2016-05-08T00:00:00',
                                        '--end', '2016-05-08T9:00:00'])
        DOWNLOADID += 1
        assert clirunner.ok(result)
        # get db data, sort by segment id and reset the index to ensure comparison across data frames:
        seg_df3 = dbquery2df(db.session.query(Segment.id, Segment.download_code, Segment.queryauth,
                                              Segment.download_id))\
            .sort_values(by=[Segment.id.key]).reset_index(drop=True)
        expected_df = seg_df2.copy()
        # all 4xx codes are expected to be retried and thus updated. Note that segments
        # already marked with urlerr keep their old download id (do not override it):
        old_4xx = expected_df[Segment.download_code.key].between(400, 499.999)
        expected_df.loc[old_4xx, Segment.download_id.key] = DOWNLOADID
        expected_df.loc[old_4xx, Segment.queryauth.key] = True
        expected_df.loc[old_4xx, Segment.download_code.key] = urlerr
        # seg_df3:
        # id  download_code  queryauth  download_id
        # 1  -1              True       3
        # 2   204            True       2
        # 3  -1              True       3
        # 4  -1              True       4
        # 5  -1              True       3
        # 6  -1              True       4
        # 7  -1              True       3
        # 8  -1              True       4
        # 9  -1              True       4
        # 10 -1              True       4
        # 11 -1              True       3
        # 12 -1              True       3
        pd.testing.assert_frame_equal(seg_df3, expected_df)
        old_urlerr_segids = seg_df2[seg_df2[Segment.download_code.key] == urlerr][Segment.id.key]
        new_urlerr_df = expected_df[expected_df[Segment.id.key].isin(old_urlerr_segids)]
        assert (new_urlerr_df[Segment.download_id.key] == 3).all()
    def init(self, request, db, data):
        # re-init a sqlite database (no-op if the db is not sqlite):
        db.create(to_file=False)
        # setup a run_id:
        rdw = Download()
        db.session.add(rdw)
        db.session.commit()
        self.run = rdw

        # side effects:
        self._evt_urlread_sideeffect = """#EventID | Time | Latitude | Longitude | Depth/km | Author | Catalog | Contributor | ContributorID | MagType | Magnitude | MagAuthor | EventLocationName
20160508_0000129|2016-05-08 05:17:11.500000|1|1|60.0|AZER|EMSC-RTS|AZER|505483|ml|3|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN
20160508_0000004|2016-05-08 01:45:30.300000|90|90|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|4|EMSC|CROATIA
"""
        self._dc_urlread_sideeffect = """http://geofon.gfz-potsdam.de/fdsnws/dataselect/1/query
ZZ * * * 2002-09-01T00:00:00 2005-10-20T00:00:00
UP ARJ * * 2013-08-01T00:00:00 2017-04-25

http://ws.resif.fr/fdsnws/dataselect/1/query
ZU * * HHZ 2015-01-01T00:00:00 2016-12-31T23:59:59.999999

"""

        # Note: by default we set _sta_urlread_sideeffect to return channels which result
        # in 12 segments (see lat and lon of channels vs lat and lon of events above)
        self._sta_urlread_sideeffect = [
            """#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime
GE|FLT1||HHE|1|1|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2003-01-01T00:00:00|
GE|FLT1||HHN|1|1|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2003-01-01T00:00:00|
GE|FLT1||HHZ|1|1|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2003-01-01T00:00:00|
n1|s||c1|90|90|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00|
n1|s||c2|90|90|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00|
n1|s||c3|90|90|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00|
""", """#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime
IA|BAKI||BHE|1|1|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2003-01-01T00:00:00|
IA|BAKI||BHN|1|1|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2003-01-01T00:00:00|
IA|BAKI||BHZ|1|1|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2003-01-01T00:00:00|
n2|s||c1|90|90|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00|
n2|s||c2|90|90|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00|
n2|s||c3|90|90|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00|
"""
        ]
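        # Two station-ws responses (FDSN station text format, channel level), one per data
        # center above: each returns 3 channels close to the first event (lat/lon = 1) and
        # 3 close to the second one (lat/lon = 90), i.e. 12 channels in total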

        self._mintraveltime_sideeffect = cycle([1])
        self._seg_data = data.read("GE.FLT1..HH?.mseed")
        self._seg_data_gaps = data.read("IA.BAKI..BHZ.D.2016.004.head")
        self._seg_data_empty = b''
        self._seg_urlread_sideeffect = [
            self._seg_data, self._seg_data_gaps, 413, 500, self._seg_data[:2],
            self._seg_data_empty, 413,
            URLError("++urlerror++"),
            socket.timeout()
        ]
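        # Each entry above is one urlread response for the segment downloads: valid
        # miniSEED data, data with gaps, HTTP 413, HTTP 500, truncated (2-byte) data,
        # empty data, another 413, a URLError and a socket timeout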
        self.service = ''  # so get_datacenters_df accepts any row by default
        self.db_buf_size = 1
        self.routing_service = yaml_load(get_templates_fpath("download.yaml"))\
            ['advanced_settings']['routing_service_url']

        # NON db stuff (logging, patchers, pandas...):
        self.logout = StringIO()
        handler = StreamHandler(stream=self.logout)
        self._logout_cache = ""
        # THIS IS A HACK:
        query_logger.setLevel(logging.INFO)  # necessary to forward to handlers
        # If we call `closing` (i.e., we test the whole chain) the level will be reset to
        # logging.INFO anyway; otherwise it stays at the value set two lines above.
        # Problems might arise if `closing` sets a different level, but for the moment
        # this is good enough
        query_logger.addHandler(handler)

        # when debugging, I want the full dataframe with to_string(), not truncated
        # NOTE: this messes up right alignment of numbers in DownloadStats (see utils.py)
        # FIRST, remember current settings and restore them in cleanup:
        _pd_display_maxcolwidth = pd.get_option('display.max_colwidth')
        pd.set_option('display.max_colwidth', -1)
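        # (note: -1 means "no truncation" here; newer pandas versions deprecate -1 in
        # favour of None for 'display.max_colwidth')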

        # define class-level patchers (we do not use a yield fixture, as we need to do
        # more stuff in the finalizer, see below):
        patchers = []

        patchers.append(patch('stream2segment.utils.url.urlopen'))
        self.mock_urlopen = patchers[-1].start()

        # mock ThreadPool (tp) to run one instance at a time, so we get deterministic results:
        class MockThreadPool(object):
            def __init__(self, *a, **kw):
                pass

            def imap(self, func, iterable, *args):
                # make imap deterministic: same as standard python map:
                # everything is executed in a single thread in the right input order
                return map(func, iterable)

            def imap_unordered(self, func_, iterable, *args):
                # make imap_unordered deterministic: same as standard python map:
                # everything is executed in a single thread in the right input order
                return map(func_, iterable)

            def close(self, *a, **kw):
                pass

        # assign patches and mocks:
        patchers.append(patch('stream2segment.utils.url.ThreadPool'))
        self.mock_tpool = patchers[-1].start()
        self.mock_tpool.side_effect = MockThreadPool

        # add finalizer:
        def delete():
            pd.set_option('display.max_colwidth', _pd_display_maxcolwidth)

            for patcher in patchers:
                patcher.stop()

            hndls = query_logger.handlers[:]
            handler.close()
            for h in hndls:
                if h is handler:
                    query_logger.removeHandler(h)

        request.addfinalizer(delete)
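    # Note: in the full test class, `init` is presumably registered as an autouse pytest
    # fixture (e.g. decorated with @pytest.fixture(autouse=True)), so that it runs before
    # each test method below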
    def test_get_events(self, mock_urljoin, db):
        urlread_sideeffect = [
            """#1|2|3|4|5|6|7|8|9|10|11|12|13
20160508_0000129|2016-05-08 05:17:11.500000|40.57|52.23|60.0|AZER|EMSC-RTS|AZER|505483|ml|3.1|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN
20160508_0000004|2016-05-08 01:45:30.300000|44.96|15.35|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|3.6|EMSC|CROATIA
20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
--- ERRROR --- THIS IS MALFORMED 20160508_abc0113|2016-05-08 22:37:20.100000| --- ERROR --- |26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
"""
        ]
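        # Note: the response above contains the same event twice (20160508_0000113) plus
        # a deliberately malformed last row, so only 3 unique events should end up in the db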
        data = self.get_events_df(urlread_sideeffect,
                                  db.session,
                                  "http://eventws", {},
                                  datetime.utcnow() - timedelta(seconds=1),
                                  datetime.utcnow(),
                                  db_bufsize=self.db_buf_size)
        # assert only the three valid, unique events were successfully saved:
        assert len(db.session.query(Event).all()) == len(pd.unique(data['id'])) == \
            len(data) == 3
        # check that log has notified:
        log1 = self.log_msg()
        assert "20160508_0000113" in log1
        assert "1 database row(s) not inserted" in log1
        assert mock_urljoin.call_count == 1
        mock_urljoin.reset_mock()

        # now download again, with an url error:
        urlread_sideeffect = [
            504, """1|2|3|4|5|6|7|8|9|10|11|12|13
20160508_0000129|2016-05-08 05:17:11.500000|40.57|52.23|60.0|AZER|EMSC-RTS|AZER|505483|ml|3.1|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN
20160508_0000004|2016-05-08 01:45:30.300000|44.96|15.35|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|3.6|EMSC|CROATIA
20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
--- ERRROR --- THIS IS MALFORMED 20160508_abc0113|2016-05-08 22:37:20.100000| --- ERROR --- |26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
""",
            URLError('blabla23___')
        ]
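        # Here the first response is an HTTP 504, which presumably makes get_events_df
        # split the requested time window and issue two sub-requests: the first one
        # succeeds, the second one raises a URLError, causing the FailedDownload below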
        with pytest.raises(FailedDownload) as fld:
            data = self.get_events_df(urlread_sideeffect,
                                      db.session,
                                      "http://eventws", {},
                                      datetime.utcnow() - timedelta(seconds=1),
                                      datetime.utcnow(),
                                      db_bufsize=self.db_buf_size)
        # assert we got the same result as above:
        assert len(db.session.query(Event).all()) == len(pd.unique(data['id'])) == \
            len(data) == 3
        log2 = self.log_msg()

        # the log text has the message about the second (successful) download, with the
        # two rows discarded:
        assert "2 row(s) discarded" in log2
        # test that the exception has the expected message:
        assert "Unable to fetch events" in str(fld)
        # check that we split the request once, thus mock_urljoin was called two more
        # times (plus the first call, 3 calls in total):
        assert mock_urljoin.call_count == 3
        mock_urljoin.reset_mock()

        # now download again, with a recursion error (max iterations reached):
        urlread_sideeffect = [413]
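        # A single 413 ("Payload Too Large") response means every split sub-request is
        # still too large, so the splitting presumably recurses until the maximum depth
        # is reached and a FailedDownload is raised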
        with pytest.raises(FailedDownload) as fld:
            data = self.get_events_df(urlread_sideeffect,
                                      db.session,
                                      "http://eventws", {},
                                      datetime.utcnow() - timedelta(seconds=1),
                                      datetime.utcnow(),
                                      db_bufsize=self.db_buf_size)
        # assert we got the same result as above:
        assert len(db.session.query(Event).all()) == len(pd.unique(data['id'])) == \
            len(data) == 3
        log2 = self.log_msg()

        # check the log message about the too-large request:
        assert "Request seems to be too large" in log2
        # assertion on exception:
        assert "Unable to fetch events" in str(fld)
        assert "maximum recursion depth reached" in str(fld)