def test_standard_timebounds(mock_response_inbytes):
    bytez = mock_response_inbytes()
    # g= get_stream(bytez)  # _read_mseed(BytesIO(bytez))
    # get our dicts of trace_id: trace_bytes
    dic = unpack(bytez, None, None)
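    # Note (inferred from the assertions in this module, not from any formal spec of
    # `unpack`): each dict value seems to be a tuple where element 0 is the error (or
    # None), 1 the data bytes, 4 and 5 the computed start and end times, and 6 a flag
    # telling whether chunks were discarded as out of the given time bounds.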
    assert not haserr(dic)
    # assert we have data and the error field is empty:
    assert all(v[1] and v[0] is None for v in dic.values())
    s2s_stream = get_s2s_stream(dic)

    start = s2s_stream[0].stats.starttime.datetime
    end = s2s_stream[0].stats.endtime.datetime

    # assert that the last flag (discarded chunks) is False (we did not provide time bounds)
    assert all(v[6] is False for v in dic.values())

    # assert start (and end) times from our miniseedlite3 routine and from the obspy
    # routine are (almost) the same. From the current run, the difference is at most
    # 0.000001 seconds (1 microsecond), which might be due to floating point rounding errors
    timediffs = list(
        max(abs(v[4] - s2s_stream[i].stats.starttime.datetime),
            abs(v[5] - s2s_stream[i].stats.endtime.datetime))
        for i, v in enumerate(dic.values()))
    assert all(t <= timedelta(seconds=0.000001) for t in timediffs)

    tdelta = (end - start) / 2
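    # split the full time span into two halves (via tdelta) and unpack each half separately: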

    for times in [(start, start + tdelta), (start + tdelta, end)]:
        dic = unpack(bytez, *times)
        # check that everything is read (v[1] is truthy)
        # AND no errors are found (v[0] is None)
        # AND that the last element v[6] (chunks out of bounds) is True
        assert all(v[1] and v[0] is None and v[6] is True
                   for v in dic.values())
        # check the time differences between the data unpacked here (with time bounds) and
        # the original miniseed unpacked without time bounds: the new time diffs should be
        # greater than the old time diffs:
        timediffs2 = list(
            max(abs(v[4] - s2s_stream[i].stats.starttime.datetime),
                abs(v[5] - s2s_stream[i].stats.endtime.datetime))
            for i, v in enumerate(dic.values()))
        assert all(t2 > t1 for t2, t1 in zip(timediffs2, timediffs))

    # now test for complete out of bounds:
    d1 = datetime.utcnow()
    start = d1 + timedelta(days=365)
    end = d1 + timedelta(days=366)
    dic = unpack(bytez, start, end)
    # check that nothing is read (b'')
    # AND no errors are found (v[0] is None)
    # AND that the last element v[6] (chunks out of bounds) is True
    assert all(v[1] == b'' and v[0] is None and v[6] is True
               for v in dic.values())
def test_standard(mock_response_inbytes):
    bytez = mock_response_inbytes()
    # g= get_stream(bytez)  # _read_mseed(BytesIO(bytez))
    # get our dicts of trace_id: trace_bytes
    dic = unpack(bytez)
    assert not haserr(dic)
    # assert all max gap ratios are below a certain threshold
    # (we should get 0, some rounding errors might occur):
    assert all(abs(v[3]) < 0.00011 for v in dic.values())
    # get the same dict by calling obspy.read:
    obspy_stream = get_stream(bytez)
    s2s_stream = get_s2s_stream(dic)
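    # streamequal with deep=True also compares the trace data, not only ids, channels and
    # time ranges (compare with test_change_last_byte below, where deep=False holds but
    # deep=True does not):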
    assert streamequal(obspy_stream, s2s_stream, deep=True)

    # assert the times we read and the times from the obspy routine coincide. Probably due
    # to rounding errors, end times are not strictly equal; however, they are really close
    # (within 1 microsecond):
    tdelta = timedelta(microseconds=1)
    # compare traces, but note that we must match them by id for comparison:
    for t1 in obspy_stream:
        dic_values = dic[t1.get_id()]
        mseedlite_starttime, mseedlite_endtime = dic_values[4], dic_values[5]
        assert abs(t1.stats.starttime.datetime - mseedlite_starttime) <= tdelta
        assert abs(t1.stats.endtime.datetime - mseedlite_endtime) <= tdelta
    # assert also same number of traces:
    assert len(obspy_stream) == len(dic)
def test_unexpected_end_of_header(mock_response_inbytes):
    '''test unexpected end of header, i.e. when unpack raises'''
    bytez = mock_response_inbytes()
    # this raises 'unexpected end of header':
    bytez2 = bytez[:100] + b'abc' + bytez[101:]
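    # (note: one byte is replaced by three, so the following record boundaries shift and
    # the last record is most likely truncated, hence the error below)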
    with pytest.raises(MSeedError):
        _ = unpack(bytez2)
def test_fsamp_mismatchs(mock_response_inbytes):
    bytez = mock_response_inbytes()
    dic = unpack(bytez, None, None)
    # assert all sample rates are 100:
    assert all(v[2] == 100 for v in dic.values())
    # build a fake bytez string
    ret_dic = dict()
    key = None
    records = []
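    # collect the records of the first trace id encountered and, on the second record,
    # change the sample rate fields to force a sample rate mismatch: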
    for rec in Input(bytez):
        is_exc = rec.error
        if is_exc:
            continue
        if key is None:
            key = rec.record_id
        elif rec.record_id != key:
            continue
        records.append(rec)
        if len(records) > 1:
            # change fsamp
            rec.sr_factor *= 2
            rec.sr_mult = -rec.sr_mult
            break
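    # write the collected (and modified) records back to bytes; the second argument of
    # `write` is, presumably, the record length expressed as a power of two: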
    bytesio = BytesIO()
    for record in records:
        record.write(bytesio, int(log(record.size) / log(2)))
    bytez = bytesio.getvalue()
    bytesio.close()
    # g= get_stream(bytez)  # _read_mseed(BytesIO(bytez))
    # get our dicts of trace_id: trace_bytes
    dic = unpack(bytez, None, None)
    assert haserr(dic)
    assert len(dic) == 1
    # assert we have only one item, whose first element is not None (the exception)
    # and whose second element (the data) is None
    values = list(dic.values())[0]
    assert str(values[0]) == "records sample rate mismatch"
    assert values[1] is None
def test_struct_unpack_error(mock_struct_unpack, struct_unpack_arg, raises,
                             mock_response_inbytes):
    '''test errors raised by the (mocked) struct.unpack'''
    def sunpack(what, bytez):
        if what == struct_unpack_arg:
            bytez = bytez[:-1]
        return original_unpack(what, bytez)

    mock_struct_unpack.side_effect = sunpack
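    # sunpack truncates the bytes passed to struct.unpack only for the given format
    # string, so that struct.unpack raises. Depending on where the error occurs,
    # `unpack` either raises (the `raises` parameter) or stores the error per trace: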
    bytez = mock_response_inbytes()

    if raises:
        with pytest.raises(MSeedError):
            unpack(bytez)
        return

    dic = unpack(bytez)

    # errors is not empty: all traces are marked with the error raised by the
    # (mocked) struct.unpack above
    assert haserr(dic)
    assert len(dic) == 3
    assert mseed_with_error(dic) == len(dic)
def test_with_gaps_overlaps(mock_response_inbytes):
    bytez = mock_response_inbytes(True)

    # get our dicts of trace_id: trace_bytes
    dic = unpack(bytez)
    assert not haserr(dic)
    assert len(dic) == 1
    values = list(dic.values())[0]

    # get the same dict by calling obspy.read:
    obspy_stream = get_stream(bytez)
    obspygaps = obspy_stream.get_gaps()
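    # the last item of each element returned by get_gaps() should be the gap extent
    # expressed in (missing) samples; take the maximum over all gaps: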
    max_obspy_gap_ratio = max((_[-1] for _ in obspygaps))
    assert values[2] == max_obspy_gap_ratio  # gaps

    s2s_stream = get_s2s_stream(dic)
    assert streamequal(obspy_stream, s2s_stream, deep=True)
def test_change_last_byte(mock_response_inbytes):
    '''test when the data is corrupted: since the headers are still ok, unpack returns normally'''
    bytez = mock_response_inbytes()
    # get our dicts of trace_id: trace_bytes
    dic = unpack(bytez[:-1] + b'a')
    # this should not have errors as we changed the data, which is not read
    assert not haserr(dic)
    assert len(dic) == 3
    # assert all max gap ratios are below a certain threshold
    # (we should get 0, some rounding errors might occur):
    assert all(abs(v[3]) < 0.00011 for v in dic.values())

    obspy_stream = get_stream(bytez)
    s2s_stream = get_s2s_stream(dic)
    # assert same num of channels and traces and time ranges:
    assert streamequal(obspy_stream, s2s_stream, deep=False)
    # BUT NOT same data:
    assert not streamequal(obspy_stream, s2s_stream, deep=True)
def test_change_header_change_id(mock_response_inbytes):
    bytez = mock_response_inbytes()
    # get our dicts of trace_id: trace_bytes
    dic = unpack(b'a' * _FIXHEAD_LEN + bytez[_FIXHEAD_LEN:])
    # errors is not empty and contains the trace id 'aa.aaaaa.aa.aaa': that is
    # the id we created by modifying the bytes above
    assert haserr(dic)
    # curiously, the returned "traces" are 4 and not 3, the first one being the "error" trace
    assert len(dic) == 4
    # assert first one is erroneous (actually, different python versions might not store it in
    # the first item, so use 'any'):
    assert any(
        str(list(dic.values())[i][0]) == 'non-data record'
        for i in range(len(dic)))
    # assert all max gap ratios are below a certain threshold
    # (we should get 0, some rounding errors might occur)
    assert all(abs(v[3]) < 0.00011 for v in dic.values() if v[3] is not None)
    obspy_stream = get_stream(bytez)
    s2s_stream = get_s2s_stream(dic)
    # assert same num of channels and traces and time ranges:
    assert streamequal(obspy_stream, s2s_stream, deep=False)
def test_invalid_pointers(mock_response_inbytes):
    '''test invalid pointers error'''
    bytez = mock_response_inbytes()
    # get our dicts of trace_id: trace_bytes
    dic = unpack(bytez[:_FIXHEAD_LEN - 8] + (b'a' * 8) + bytez[_FIXHEAD_LEN:])
    # errors is not empty: at least one trace has the 'invalid pointers' error caused
    # by the bytes we modified above
    assert haserr(dic)
    assert len(dic) == 3
    # assert first one is erroneous (actually, different python versions might not store it in
    # the first item, so use 'any'):
    assert any(
        str(list(dic.values())[i][0]) == 'invalid pointers'
        for i in range(len(dic)))
    # assert all max gap ratios are below a certain threshold
    # (we should get 0, some rounding errors might occur)
    assert all(abs(v[3]) < 0.00011 for v in dic.values() if v[3] is not None)

    obspy_stream = get_stream(bytez)
    s2s_stream = get_s2s_stream(dic)
    # assert not same num of channels and traces and time ranges:
    assert not streamequal(obspy_stream, s2s_stream, deep=False)
    def test_restricted(self, mock_get_opener, mock_get_data_from_token,
                        mock_get_data_from_userpass,
                        mock_get_data_open, mock_updatedf, mock_insertdf, mock_mseed_unpack,
                        mock_download_save_segments, mock_save_inventories, mock_get_channels_df,
                        mock_get_datacenters_df, mock_get_events_df,
                        # fixtures:
                        db, clirunner, pytestdir, yamlfile):

        mock_get_events_df.side_effect = lambda *a, **v: self.get_events_df(None, *a, **v)
        mock_get_datacenters_df.side_effect = \
            lambda *a, **v: self.get_datacenters_df(None, *a, **v) 
        mock_get_channels_df.side_effect = lambda *a, **v: self.get_channels_df(None, *a, **v)
        mock_save_inventories.side_effect = lambda *a, **v: self.save_inventories(None, *a, **v)
        mock_download_save_segments.side_effect = \
            lambda *a, **v: self.download_save_segments(None, *a, **v)
        # mseed unpack is mocked by accepting only first arg
        # (so that time bounds are not considered)
        mock_mseed_unpack.side_effect = lambda *a, **v: unpack(a[0])
        mock_insertdf.side_effect = lambda *a, **v: insertdf(*a, **v)
        mock_updatedf.side_effect = lambda *a, **v: updatedf(*a, **v)
        # prevlen = len(db.session.query(Segment).all())

        # patching class methods while preserving the original call requires storing the
        # original methods once (as class attributes). Set the side effect of each mocked
        # method to those class attributes so as to preserve the original functionality
        # and still be able to assert that the mock_* functions were called, and so on.
        # For info see:
        # https://stackoverflow.com/a/29563665
        mock_get_data_open.side_effect = self.dc_get_data_open
        mock_get_data_from_userpass.side_effect = self.dc_get_data_from_userpass
        mock_get_data_from_token.side_effect = \
            lambda *a, **kw: self.dc_get_data_from_token([URLError('a'), 'abc'], *a, **kw)
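        # the token query above is mocked to fail for both data centers (a URLError for
        # one, an unparsable 'abc' response for the other), hence the download below
        # should fall back to open data only: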

        # TEST 1: provide a file with valid token:
        tokenfile = pytestdir.newfile(create=True)
        with open(tokenfile, 'w') as fh:
            fh.write('BEGIN PGP MESSAGE')
        # mock yaml_load to override restricted_data:
        yaml_file = yamlfile(restricted_data=os.path.abspath(tokenfile))
        # The run table is populated with a run_id in the constructor of this class
        # for checking run_ids, store here the number of runs we have in the table:
        runs = len(db.session.query(Download.id).all())
        result = clirunner.invoke(cli, ['download',
                                        '-c', yaml_file,
                                        '--dburl', db.dburl,
                                        '--start', '2016-05-08T00:00:00',
                                        '--end', '2016-05-08T9:00:00'])
        assert clirunner.ok(result)
        assert 'Downloading 12 segments (open data only)' in result.output
        assert 'STEP 5 of 8: Acquiring credentials from token' in result.output
        # note that, probably due to dict ordering differences between py2 and py3, we need
        # to test both of these orderings:
        if not ('Downloading open data only from: http://geofon.gfz-potsdam.de, '
                'http://ws.resif.fr (Unable to acquire credentials for restricted data)') in \
                result.output:
            assert ('Downloading open data only from: http://ws.resif.fr, '
                    'http://geofon.gfz-potsdam.de (Unable to acquire credentials for restricted data)') in \
                    result.output
        # assert we print that we are downloading open data only (all errors):
        assert 'STEP 7 of 8: Downloading 12 segments (open data only)' in result.output
        assert not mock_get_data_open.called
        assert mock_get_data_from_token.called
        assert not mock_get_data_from_userpass.called
        assert not mock_get_opener.called
        # some assertions to check data properly written
        # These are important because they confirm that data has been downloaded anyway
        # (the test does not differentiate between restricted or open data)
        assert len(db.session.query(Download.id).all()) == runs + 1
        runs += 1
        segments = db.session.query(Segment).all()
        assert len(segments) == 12
        segments = db.session.query(Segment).filter(Segment.has_data).all()
        assert len(segments) == 4
        assert len(db.session.query(Station).filter(Station.has_inventory).all()) == 2
        assert mock_updatedf.called  # called while saving inventories
        assert mock_insertdf.called
def test_empty_data(mock_response_inbytes):
    '''test empty data'''
    bytez = b''
    assert not unpack(bytez)
    def test_retry2(self, mock_get_opener, mock_get_data_from_token,
                    mock_get_data_from_userpass,
                    mock_get_data_open, mock_updatedf, mock_insertdf, mock_mseed_unpack,
                    mock_download_save_segments, mock_save_inventories, mock_get_channels_df,
                    mock_get_datacenters_df, mock_get_events_df,
                    # fixtures:
                    db, clirunner, pytestdir, yamlfile):

        mock_get_events_df.side_effect = lambda *a, **v: self.get_events_df(None, *a, **v)
        mock_get_datacenters_df.side_effect = \
            lambda *a, **v: self.get_datacenters_df(None, *a, **v)
        mock_get_channels_df.side_effect = lambda *a, **v: self.get_channels_df(None, *a, **v)
        mock_save_inventories.side_effect = lambda *a, **v: self.save_inventories(None, *a, **v)
        RESPONSES = [URLError('abc')]
        mock_download_save_segments.side_effect = \
            lambda *a, **v: self.download_save_segments(RESPONSES, *a, **v)
        # mseed unpack is mocked by accepting only first arg (so that time bounds are not
        # considered)
        mock_mseed_unpack.side_effect = lambda *a, **v: unpack(a[0])
        mock_insertdf.side_effect = lambda *a, **v: insertdf(*a, **v)
        mock_updatedf.side_effect = lambda *a, **v: updatedf(*a, **v)
        # prevlen = len(db.session.query(Segment).all())

        # patching class methods while preserving the original call requires storing the
        # original methods once (as class attributes). Set the side effect of each mocked
        # method to those class attributes so as to preserve the original functionality
        # and still be able to assert that the mock_* functions were called, and so on.
        # For info see:
        # https://stackoverflow.com/a/29563665
        mock_get_data_open.side_effect = self.dc_get_data_open
        mock_get_data_from_userpass.side_effect = self.dc_get_data_from_userpass
        mock_get_data_from_token.side_effect = \
            lambda *a, **kw: self.dc_get_data_from_token(['a:b', 'c:d'], *a, **kw)

        # TEST 1: provide a file with valid token:
        tokenfile = pytestdir.newfile(create=True)
        with open(tokenfile, 'w') as fh:
            fh.write('BEGIN PGP MESSAGE')
        # mock yaml_load to override restricted_data:

        # USERPASS good for both datacenters:
        mock_get_data_open.reset_mock()
        mock_get_data_from_token.reset_mock()
        mock_get_data_from_userpass.reset_mock()
        mock_get_opener.reset_mock()
        mock_get_data_from_token.side_effect = \
            lambda *a, **kw: self.dc_get_data_from_token(['uzer:pazzword', 'uzer:pazzword'],
                                                         *a, **kw)
        yaml_file = yamlfile(restricted_data=os.path.abspath(tokenfile),
                             retry_client_err=False)
        result = clirunner.invoke(cli, ['download',
                                        '-c', yaml_file,
                                        '--dburl', db.dburl,
                                        '--start', '2016-05-08T00:00:00',
                                        '--end', '2016-05-08T9:00:00'])
        assert clirunner.ok(result)
        # get db data, sort by index and reset index to assure comparison across data frames:
        seg_df = dbquery2df(db.session.query(Segment.id, Segment.download_code,
                                             Segment.queryauth, Segment.download_id))\
            .sort_values(by=[Segment.id.key]).reset_index(drop=True)
        # seg_df:
        # id  download_code  queryauth  download_id
        # 1  -1              True       2
        # 2  -1              True       2
        # 3  -1              True       2
        # 4  -1              True       2
        # 5  -1              True       2
        # 6  -1              True       2
        # 7  -1              True       2
        # 8  -1              True       2
        # 9  -1              True       2
        # 10 -1              True       2
        # 11 -1              True       2
        # 12 -1              True       2
        urlerr, mseederr = s2scodes.url_err, s2scodes.mseed_err
        # according to our mock, we should have all urlerr codes:
        assert (seg_df[Segment.download_code.key] == urlerr).all()
        assert (seg_df[Segment.queryauth.key] == True).all()
        DOWNLOADID = 2
        assert (seg_df[Segment.download_id.key] == DOWNLOADID).all()
        # other assertions:
        assert 'restricted_data: %s' % os.path.abspath(tokenfile) in result.output
        assert 'STEP 5 of 8: Acquiring credentials from token' in result.output
        # assert we print that we are downloading open and restricted data:
        assert re.search(r'STEP 7 of 8\: Downloading \d+ segments and saving to db',
                         result.output)
        assert not mock_get_data_open.called
        assert mock_get_data_from_token.called
        assert not mock_get_data_from_userpass.called
        # no credentials failed:
        assert "Downloading open data only from: " not in result.output

        # Ok, test retry:
        new_seg_df = seg_df.copy()
        # first get run id
        # we have 12 segments, change the download codes. The second boolean
        # value denotes queryauth (True or False):
        code_queryauth = [(204, False), (204, True), (404, False), (404, True),
                          (401, False), (401, True), (403, False), (403, True),
                          (400, True), (400, False), (None, False), (None, True)]
        for id_, (dc_, qa_) in zip(seg_df[Segment.id.key].tolist(), code_queryauth):
            seg = db.session.query(Segment).filter(Segment.id == id_).first()
            seg.download_code = dc_
            seg.queryauth = qa_
            # set expected values (see also yamlfile below)
            # remember that any segment download will give urlerr as code
            expected_new_download_code = dc_
            expected_download_id = DOWNLOADID
            if dc_ in (204, 404, 401, 403) and qa_ is False:
                # to retry because they failed (or most likely failed)
                # due to authorization problems
                expected_new_download_code = urlerr
                expected_download_id = DOWNLOADID + 1
            elif dc_ is None:
                # to retry because of the retry_seg_not_found flag (see yamlfile below)
                expected_new_download_code = urlerr
                expected_download_id = DOWNLOADID + 1
            expected_query_auth = qa_ if dc_ == 400 else True

            new_seg_df.loc[new_seg_df[Segment.id.key] == id_, :] = \
                (id_, expected_new_download_code, expected_query_auth, expected_download_id)
            db.session.commit()

        # re-download and check what we have retried:
        yaml_file = yamlfile(restricted_data=os.path.abspath(tokenfile),
                             retry_seg_not_found=True,
                             retry_client_err=False)
        result = clirunner.invoke(cli, ['download',
                                        '-c', yaml_file,
                                        '--dburl', db.dburl,
                                        '--start', '2016-05-08T00:00:00',
                                        '--end', '2016-05-08T9:00:00'])
        DOWNLOADID += 1
        assert clirunner.ok(result)
        # get db data, sort by index and reset index to assure comparison across data frames:
        seg_df2 = dbquery2df(db.session.query(Segment.id, Segment.download_code, Segment.queryauth,
                                              Segment.download_id))\
            .sort_values(by=[Segment.id.key]).reset_index(drop=True)
        # seg_df2:
        # id  download_code  queryauth  download_id
        # 1  -1              True       3
        # 2   204            True       2
        # 3  -1              True       3
        # 4   404            True       2
        # 5  -1              True       3
        # 6   401            True       2
        # 7  -1              True       3
        # 8   403            True       2
        # 9   400            True       2
        # 10  400            False      2
        # 11 -1              True       3
        # 12 -1              True       3
        pd.testing.assert_frame_equal(seg_df2, new_seg_df)

        # Another retry without modifying the segments, but setting retry_client_err to True:
        # re-download and check what we have retried:
        yaml_file = yamlfile(restricted_data=os.path.abspath(tokenfile),
                             retry_seg_not_found=True,
                             retry_client_err=True)
        result = clirunner.invoke(cli, ['download',
                                        '-c', yaml_file,
                                        '--dburl', db.dburl,
                                        '--start', '2016-05-08T00:00:00',
                                        '--end', '2016-05-08T9:00:00'])
        DOWNLOADID += 1
        assert clirunner.ok(result)
        # get db data, sort by index and reset index to assure comparison across data frames:
        seg_df3 = dbquery2df(db.session.query(Segment.id, Segment.download_code, Segment.queryauth,
                                              Segment.download_id))\
            .sort_values(by=[Segment.id.key]).reset_index(drop=True)
        expected_df = seg_df2.copy()
        # modify all 4xx codes, as they will be updated. Note that old urlerr codes keep the
        # old download id (do not override them)
        old_4xx = expected_df[Segment.download_code.key].between(400, 499.999)
        expected_df.loc[old_4xx, Segment.download_id.key] = DOWNLOADID
        expected_df.loc[old_4xx, Segment.queryauth.key] = True
        expected_df.loc[old_4xx, Segment.download_code.key] = urlerr
        # seg_df3:
        # id  download_code  queryauth  download_id
        # 1  -1              True       3
        # 2   204            True       2
        # 3  -1              True       3
        # 4  -1              True       4
        # 5  -1              True       3
        # 6  -1              True       4
        # 7  -1              True       3
        # 8  -1              True       4
        # 9  -1              True       4
        # 10 -1              True       4
        # 11 -1              True       3
        # 12 -1              True       3
        pd.testing.assert_frame_equal(seg_df3, expected_df)
        old_urlerr_segids = seg_df2[seg_df2[Segment.download_code.key] == urlerr][Segment.id.key]
        new_urlerr_df = expected_df[expected_df[Segment.id.key].isin(old_urlerr_segids)]
        assert (new_urlerr_df[Segment.download_id.key] == 3).all()
    def test_retry(self, mock_get_opener, mock_get_data_from_token,
                   mock_get_data_from_userpass,
                   mock_get_data_open, mock_updatedf, mock_insertdf, mock_mseed_unpack,
                   mock_download_save_segments, mock_save_inventories, mock_get_channels_df,
                   mock_get_datacenters_df, mock_get_events_df,
                   # fixtures:
                   db, clirunner, pytestdir, yamlfile):

        mock_get_events_df.side_effect = lambda *a, **v: self.get_events_df(None, *a, **v)
        mock_get_datacenters_df.side_effect = \
            lambda *a, **v: self.get_datacenters_df(None, *a, **v)
        mock_get_channels_df.side_effect = lambda *a, **v: self.get_channels_df(None, *a, **v)
        mock_save_inventories.side_effect = lambda *a, **v: self.save_inventories(None, *a, **v)
        mock_download_save_segments.side_effect = \
            lambda *a, **v: self.download_save_segments([URLError('abc')], *a, **v)
        # mseed unpack is mocked by accepting only first arg (so that time bounds are
        # not considered)
        mock_mseed_unpack.side_effect = lambda *a, **v: unpack(a[0])
        mock_insertdf.side_effect = lambda *a, **v: insertdf(*a, **v)
        mock_updatedf.side_effect = lambda *a, **v: updatedf(*a, **v)
        # prevlen = len(db.session.query(Segment).all())

        # mock our opener
        m = Mock()
        mockopen = Mock()
        mockopen.read = lambda *a, **v: b''
        mockopen.msg = 'abc'
        mockopen.code = 204
        m.open = lambda *a, **v: mockopen
        # m.read = lambda *a, **v: ''
        mock_get_opener.side_effect = lambda *a, **v: m
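        # the mocked opener returns, for every URL, a response object with empty body
        # and HTTP code 204 (no content):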

        # patching class methods while preserving the original call requires storing the
        # original methods once (as class attributes). Set the side effect of each mocked
        # method to those class attributes so as to preserve the original functionality
        # and still be able to assert that the mock_* functions were called, and so on.
        # For info see:
        # https://stackoverflow.com/a/29563665
        mock_get_data_open.side_effect = self.dc_get_data_open
        mock_get_data_from_userpass.side_effect = self.dc_get_data_from_userpass
        mock_get_data_from_token.side_effect = \
            lambda *a, **kw: self.dc_get_data_from_token([URLError('a'), 'abc'], *a, **kw)

        # TEST 1: provide a file with valid token:
        tokenfile = pytestdir.newfile(create=True)
        with open(tokenfile, 'w') as fh:
            fh.write('BEGIN PGP MESSAGE')
        # mock yaml_load to override restricted_data:

        # launch two download runs with different responses for token auth query:
        for tokenquery_mocked_return_values, dc_token_failed in \
            ([[URLError('a'), 'uzer:pazzword'], "http://geofon.gfz-potsdam.de"],
             [['uzer:pazzword', URLError('a')], 'http://ws.resif.fr']):
            # set how many times self.mock_urlopen has been called:
            mock_urlopen_call_count = self.mock_urlopen.call_count
            # TEST 2: USERPASS good for just one datacenter:
            mock_get_data_open.reset_mock()
            mock_get_data_from_token.reset_mock()
            mock_get_data_from_userpass.reset_mock()
            mock_get_opener.reset_mock()
            mock_get_data_from_token.side_effect = \
                lambda *a, **kw: self.dc_get_data_from_token(tokenquery_mocked_return_values,
                                                             *a, **kw)
            yaml_file = yamlfile(restricted_data=os.path.abspath(tokenfile),
                                 retry_client_err=False)
            result = clirunner.invoke(cli, ['download',
                                            '-c', yaml_file,
                                            '--dburl', db.dburl,
                                            '--start', '2016-05-08T00:00:00',
                                            '--end', '2016-05-08T9:00:00'])
            assert clirunner.ok(result)
            assert 'restricted_data: %s' % os.path.abspath(tokenfile) in result.output
            assert 'STEP 5 of 8: Acquiring credentials from token' in result.output
            # assert we print that we are downloading open and restricted data:
            assert re.search(r'STEP 7 of 8\: Downloading \d+ segments and saving to db',
                             result.output)
            assert not mock_get_data_open.called
            assert mock_get_data_from_token.called
            assert not mock_get_data_from_userpass.called

            assert "Downloading open data only from: %s" % dc_token_failed
            dc_token_ok = 'http://ws.resif.fr' \
                if dc_token_failed == "http://geofon.gfz-potsdam.de" else \
                "http://geofon.gfz-potsdam.de"
            assert mock_get_opener.call_count == 1
            assert mock_get_opener.call_args_list[0][0][:] == (dc_token_ok, 'uzer', 'pazzword')

            dc_id = {Fdsnws(i[1]).site: i[0] for i in
                     db.session.query(DataCenter.id, DataCenter.dataselect_url)}
            # assert urlopen has been called only with 'query' and not 'queryauth' (see the
            # assertions below). First, get the segments dataframe we (re)downloaded:
            segments_df_to_download = mock_download_save_segments.call_args_list[-1][0][1]
            dc2download = pd.unique(segments_df_to_download['datacenter_id']).tolist()
            # set the expected call count based on the datacenters of (re)downloaded segments:
            if dc_id[dc_token_failed] not in dc2download:
                assert self.mock_urlopen.call_count == 0
            else:
                assert self.mock_urlopen.call_count >= 1
                for i in range(1, self.mock_urlopen.call_count + 1):
                    assert self.mock_urlopen.call_args_list[-i][0][0].get_full_url() == \
                        dc_token_failed + "/fdsnws/dataselect/1/query"
    def test_download_save_segments(self, mock_updatedf, mock_insertdf, mseed_unpack, db,
                                    tt_ak135_tts):
        # prepare:
        # mseed unpack is mocked to ignore the starttime and endtime arguments, so that
        # we do not discard any chunk:
        mseed_unpack.side_effect = lambda *a, **v: unpack(a[0])
        mock_insertdf.side_effect = lambda *a, **v: insertdf(*a, **v)
        mock_updatedf.side_effect = lambda *a, **v: updatedf(*a, **v)

        urlread_sideeffect = None  # use defaults from class
        events_df = self.get_events_df(urlread_sideeffect, db.session)
        net, sta, loc, cha = [], [], [], []
        datacenters_df, eidavalidator = \
            self.get_datacenters_df(urlread_sideeffect, db.session, self.service,
                                    self.routing_service, net, sta, loc, cha,
                                    db_bufsize=self.db_buf_size)
        channels_df = self.get_channels_df(urlread_sideeffect, db.session,
                                           datacenters_df,
                                           eidavalidator,
                                           net, sta, loc, cha, None, None, 10,
                                           False, None, None, -1, self.db_buf_size)
        # just to be sure. If failing, we might have changed the class default:
        assert len(channels_df) == 12
    # events_df
#                  id  magnitude  latitude  longitude  depth_km  time
# 0  20160508_0000129        3.0       1.0        1.0      60.0  2016-05-08 05:17:11.500
# 1  20160508_0000004        4.0       2.0        2.0       2.0  2016-05-08 01:45:30.300

# channels_df (index not shown):
# columns:
# id  station_id  latitude  longitude  datacenter_id start_time end_time network station location channel
# data (not aligned with columns):
# 1   1  1.0   1.0   1 2003-01-01 NaT  GE  FLT1    HHE
# 2   1  1.0   1.0   1 2003-01-01 NaT  GE  FLT1    HHN
# 3   1  1.0   1.0   1 2003-01-01 NaT  GE  FLT1    HHZ
# 4   2  90.0  90.0  1 2009-01-01 NaT  n1  s       c1
# 5   2  90.0  90.0  1 2009-01-01 NaT  n1  s       c2
# 6   2  90.0  90.0  1 2009-01-01 NaT  n1  s       c3
# 7   3  1.0   1.0   2 2003-01-01 NaT  IA  BAKI    BHE
# 8   3  1.0   1.0   2 2003-01-01 NaT  IA  BAKI    BHN
# 9   3  1.0   1.0   2 2003-01-01 NaT  IA  BAKI    BHZ
# 10  4  90.0  90.0  2 2009-01-01 NaT  n2  s       c1
# 11  4  90.0  90.0  2 2009-01-01 NaT  n2  s       c2
# 12  4  90.0  90.0  2 2009-01-01 NaT  n2  s       c3

        assert all(_ in channels_df.columns for _ in [Station.network.key, Station.station.key,
                                                      Channel.location.key, Channel.channel.key])
        chaid2mseedid = chaid2mseedid_dict(channels_df)
        # check that we removed the columns:
        assert not any(_ in channels_df.columns for _ in
                       [Station.network.key, Station.station.key,
                        Channel.location.key, Channel.channel.key])

        # take all segments:
        # use minmag and maxmag
        ttable = tt_ak135_tts
        segments_df = merge_events_stations(events_df, channels_df, dict(minmag=10, maxmag=10,
                                            minmag_radius=10, maxmag_radius=10), tttable=ttable)

        assert len(pd.unique(segments_df['arrival_time'])) == 2

        h = 9

# segments_df (index not shown). Note that
# cid sid did n   s    l  c    ed   event_id          depth_km                time  <- LAST TWO ARE Event related columns that will be removed after arrival_time calculations
# 1   1   1   GE  FLT1    HHE  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 2   1   1   GE  FLT1    HHN  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 3   1   1   GE  FLT1    HHZ  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 7   3   2   IA  BAKI    BHE  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 8   3   2   IA  BAKI    BHN  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 9   3   2   IA  BAKI    BHZ  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 4   2   1   n1  s       c1   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 5   2   1   n1  s       c2   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 6   2   1   n1  s       c3   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 10  4   2   n2  s       c1   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 11  4   2   n2  s       c2   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 12  4   2   n2  s       c3   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300

# LEGEND:
# cid = channel_id
# sid = station_id
# scid = datacenter_id
# n, s, l, c = network, station, location, channel
# ed = event_distance_deg

        # define a dc_dataselect_manager for open data only:
        dc_dataselect_manager = DcDataselectManager(datacenters_df, Authorizer(None), False)

        wtimespan = [1, 2]  # in minutes
        expected = len(segments_df)  # no segment on db, we should have all segments to download
        orig_segments_df = segments_df.copy()
        segments_df, request_timebounds_need_update = \
            prepare_for_download(db.session, orig_segments_df, dc_dataselect_manager, wtimespan,
                                 retry_seg_not_found=True,
                                 retry_url_err=True,
                                 retry_mseed_err=True,
                                 retry_client_err=True,
                                 retry_server_err=True,
                                 retry_timespan_err=True,
                                 retry_timespan_warn=True)

# segments_df
# COLUMNS:
# channel_id  datacenter_id network station location channel event_distance_deg event_id arrival_time start_time end_time id download_status_code run_id
# DATA (not aligned with columns):
#               channel_id  datacenter_id network station location channel  event_distance_deg  event_id            arrival_time          start_time            end_time    id download_status_code  run_id
# GE.FLT1..HHE  1           1              GE      FLT1             HHE     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# GE.FLT1..HHN  2           1              GE      FLT1             HHN     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# GE.FLT1..HHZ  3           1              GE      FLT1             HHZ     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# IA.BAKI..BHE  7           2              IA      BAKI             BHE     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# IA.BAKI..BHN  8           2              IA      BAKI             BHN     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# IA.BAKI..BHZ  9           2              IA      BAKI             BHZ     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# n1.s..c1      4           1              n1      s                c1      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n1.s..c2      5           1              n1      s                c2      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n1.s..c3      6           1              n1      s                c3      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n2.s..c1      10          2              n2      s                c1      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n2.s..c2      11          2              n2      s                c2      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n2.s..c3      12          2              n2      s                c3      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1

        # self._seg_data holds the content of a "valid" 3-channel miniseed file

        # setup urlread: first three rows: ok
        # rows[3:6]: 413, retry them
        # rows[6:9]: malformed_data
        # rows[9:12] 413, retry them
        # then retry:
        # rows[3]: empty_data
        # rows[4]: data_with_gaps (but seed_id should not match)
        # rows[5]: data_with_gaps (seed_id should not match)
        # rows[9]: URLError
        # rows[10]: Http 500 error
        # rows[11]: 413

        # NOTE THAT THIS RELIES ON THE FACT THAT THREADS ARE EXECUTED IN THE ORDER OF THE DATAFRAME
        # WHICH SEEMS TO BE THE CASE AS THERE IS ONE SINGLE PROCESS
        # self._seg_data[:2] is a way to mock data corrupted
        urlread_sideeffect = [self._seg_data, 413, self._seg_data[:2], 413,
                              '', self._seg_data_gaps, self._seg_data_gaps,
                              URLError("++urlerror++"), 500, 413]
        # Let's go:
        ztatz = self.download_save_segments(urlread_sideeffect, db.session, segments_df,
                                            dc_dataselect_manager,
                                            chaid2mseedid,
                                            self.run.id, False,
                                            request_timebounds_need_update,
                                            1, 2, 3, db_bufsize=self.db_buf_size)
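        # ztatz should hold the download statistics, one item per data center (see the
        # assertion on its length below):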
        # get the db columns we are interested in checking:
        cols = [Segment.id, Segment.channel_id, Segment.datacenter_id,
                Segment.download_code, Segment.maxgap_numsamples,
                Segment.sample_rate, Segment.data_seed_id, Segment.data, Segment.download_id,
                Segment.request_start, Segment.request_end, Segment.start_time,
                Segment.end_time]
        db_segments_df = dbquery2df(db.session.query(*cols))
        assert Segment.download_id.key in db_segments_df.columns

        # change data column otherwise we cannot display db_segments_df.
        # When there is data just print "data"
        db_segments_df.loc[(~pd.isnull(db_segments_df[Segment.data.key])) &
                           (db_segments_df[Segment.data.key].str.len() > 0),
                           Segment.data.key] = b'data'

        # assert we have 4 segments with "data" properly set:
        assert len(db_segments_df.loc[(~pd.isnull(db_segments_df[Segment.data.key])) &
                                      (db_segments_df[Segment.data.key].str.len() > 0),
                                      Segment.data.key]) == 4

        # re-sort db_segments_df to match the segments_df:
        ret = []
        for cha in segments_df[Segment.channel_id.key]:
            ret.append(db_segments_df[db_segments_df[Segment.channel_id.key] == cha])
        db_segments_df = pd.concat(ret, axis=0)

# db_segments_df:
#    id  channel_id  datacenter_id  download_status_code  max_gap_ovlap_ratio  sample_rate data_seed_id     data  run_id          start_time            end_time
# 0  1   1           1              200.0                 0.0001               100.0        GE.FLT1..HHE    data  1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 1  2   2           1              200.0                 0.0001               100.0        GE.FLT1..HHN    data  1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 2  3   3           1              200.0                 0.0001               100.0        GE.FLT1..HHZ    data  1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 6  7   7           2              200.0                 NaN                  NaN          None                  1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 7  8   8           2              NaN                   NaN                  NaN          None            None  1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 8  9   9           2              200.0                 20.0                 20.0         IA.BAKI..BHZ    data  1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 3  4   4           1             -2.0                   NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 4  5   5           1             -2.0                   NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 5  6   6           1             -2.0                   NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 9  10  10          2              -1.0                  NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 10 11  11          2              500.0                 NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 11 12  12          2              413.0                 NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31

        assert len(ztatz) == len(datacenters_df)
        assert len(db_segments_df) == len(segments_df)
        assert mock_updatedf.call_count == 0

        dsc = db_segments_df[Segment.download_code.key]
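        # expected download codes, in the re-sorted order of segments_df (NaN marks the
        # segment for which no download code was stored):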
        exp_dsc = np.array([200, 200, 200, 200, np.nan, 200, -2, -2, -2, -1, 500, 413])
        assert ((dsc == exp_dsc) | (np.isnan(dsc) & np.isnan(exp_dsc))).all()
        # we have 12 segments and a db buf size of self.db_buf_size (=1 currently, but it
        # might change in the future), so the expected number of insertdf calls is not
        # hard-coded but computed below.

        # test that we correctly called mock_insertdf. Note that we assume that the
        # latter is called ONLY inside DbManager. As the number of rows to be added
        # (length of the dataframes) varies, we need to implement a counter here:
        mock_insertdf_call_count = 0
        _bufsize = 0
        for c in mock_insertdf.call_args_list:
            c_args = c[0]
            df_ = c_args[0]
            _bufsize += len(df_)
            if _bufsize >= self.db_buf_size:
                mock_insertdf_call_count += 1
                _bufsize = 0

        assert mock_insertdf.call_count == mock_insertdf_call_count

        # assert data is consistent
        COL = Segment.data.key
        assert (db_segments_df.iloc[:3][COL] == b'data').all()
        assert (db_segments_df.iloc[3:4][COL] == b'').all()
        assert pd.isnull(db_segments_df.iloc[4:5][COL]).all()
        assert (db_segments_df.iloc[5:6][COL] == b'data').all()
        assert pd.isnull(db_segments_df.iloc[6:][COL]).all()

        # assert download status code is consistent
        URLERR_CODE, MSEEDERR_CODE = s2scodes.url_err, s2scodes.mseed_err

        # this also asserts that we grouped by data center, start time and end time
        COL = Segment.download_code.key
        assert (db_segments_df.iloc[:4][COL] == 200).all()
        assert pd.isnull(db_segments_df.iloc[4:5][COL]).all()
        assert (db_segments_df.iloc[5:6][COL] == 200).all()
        assert (db_segments_df.iloc[6:9][COL] == MSEEDERR_CODE).all()
        assert (db_segments_df.iloc[9][COL] == URLERR_CODE).all()
        assert (db_segments_df.iloc[10][COL] == 500).all()
        assert (db_segments_df.iloc[11][COL] == 413).all()

        # assert gaps are only in the given position
        COL = Segment.maxgap_numsamples.key
        assert (db_segments_df.iloc[:3][COL] < 0.01).all()
        assert pd.isnull(db_segments_df.iloc[3:5][COL]).all()
        assert (db_segments_df.iloc[5][COL] == 20).all()
        assert pd.isnull(db_segments_df.iloc[6:][COL]).all()

        # now mock retry:
        segments_df, request_timebounds_need_update = \
            prepare_for_download(db.session, orig_segments_df, dc_dataselect_manager, wtimespan,
                                 retry_seg_not_found=True,
                                 retry_url_err=True,
                                 retry_mseed_err=True,
                                 retry_client_err=True,
                                 retry_server_err=True,
                                 retry_timespan_err=True,
                                 retry_timespan_warn=True)

        assert request_timebounds_need_update is False

        COL = Segment.download_code.key
        mask = (db_segments_df[COL] >= 400) | pd.isnull(db_segments_df[COL]) \
            | (db_segments_df[COL].isin([URLERR_CODE, MSEEDERR_CODE]))
        assert len(segments_df) == len(db_segments_df[mask])

        urlread_sideeffect = [413]
        mock_updatedf.reset_mock()
        mock_insertdf.reset_mock()
        # define a dc_dataselect_manager for open data only:
        dc_dataselect_manager = DcDataselectManager(datacenters_df, Authorizer(None), False)
        # Let's go:
        ztatz = self.download_save_segments(urlread_sideeffect, db.session, segments_df,
                                            dc_dataselect_manager,
                                            chaid2mseedid,
                                            self.run.id, False,
                                            request_timebounds_need_update,
                                            1, 2, 3, db_bufsize=self.db_buf_size)
        # get the db columns we are interested in checking:
        cols = [Segment.download_code, Segment.channel_id]
        db_segments_df = dbquery2df(db.session.query(*cols))

        # change data column otherwise we cannot display db_segments_df. When there is data
        # just print "data"
        # db_segments_df.loc[(~pd.isnull(db_segments_df[Segment.data.key])) &
        # (db_segments_df[Segment.data.key].str.len() > 0), Segment.data.key] = b'data'

        # re-sort db_segments_df to match the segments_df:
        ret = []
        for cha in segments_df[Segment.channel_id.key]:
            ret.append(db_segments_df[db_segments_df[Segment.channel_id.key] == cha])
        db_segments_df = pd.concat(ret, axis=0)

        assert (db_segments_df[COL] == 413).all()
        assert len(ztatz) == len(datacenters_df)
        assert len(db_segments_df) == len(segments_df)

        # same as above, but with updatedf: test that we correctly called mock_updatedf.
        # Note that we assume that the latter is called ONLY inside download.main.DbManager.
        # As the number of rows to be added (length of the dataframes) varies,
        # we need to implement a counter here:
        mock_updatedf_call_count = 0
        _bufsize = 0
        for c in mock_updatedf.call_args_list:
            c_args = c[0]
            df_ = c_args[0]
            _bufsize += len(df_)
            if _bufsize >= self.db_buf_size:
                mock_updatedf_call_count += 1
                _bufsize = 0

        assert mock_updatedf.call_count == mock_updatedf_call_count

        assert mock_insertdf.call_count == 0
def test_outofbounds_data(mock_response_inbytes):
    '''test out-of-bounds time bounds (here with empty data)'''
    bytez = b''
    data = unpack(bytez, datetime.utcnow(), datetime.utcnow() + timedelta(5))
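    # with empty input the returned dict should be empty (see test_empty_data above),
    # so the assertions below hold trivially: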
    assert all(_[1] == b'' for _ in data.values())
    assert all(_[-1] is True for _ in data.values())
    def test_download_save_segments_timebounds(self, mock_updatedf, mock_insertdf, mseed_unpack,
                                               db, tt_ak135_tts):
        # prepare:
        # unlike in test_download_save_segments above, mseed unpack is called with all
        # its arguments (time bounds included):
        mseed_unpack.side_effect = lambda *a, **v: unpack(*a, **v)
        mock_insertdf.side_effect = lambda *a, **v: insertdf(*a, **v)
        mock_updatedf.side_effect = lambda *a, **v: updatedf(*a, **v)

        # mock event response: it's the same as self._evt_urlread_sideeffect, but with the
        # dates modified to NOW. This means that any segment downloaded later will
        # be out of bounds
        utcnow = datetime.utcnow()
        utcnow_iso = utcnow.isoformat().replace("T", " ")
        urlread_sideeffect = """#EventID | Time | Latitude | Longitude | Depth/km | Author | Catalog | Contributor | ContributorID | MagType | Magnitude | MagAuthor | EventLocationName
20160508_0000129|%s|1|1|60.0|AZER|EMSC-RTS|AZER|505483|ml|3|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN
20160508_0000004|%s|90|90|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|4|EMSC|CROATIA
""" % (utcnow_iso, utcnow_iso)
        events_df = self.get_events_df(urlread_sideeffect, db.session)
        # restore urlread_side_effect:
        urlread_sideeffect = None
        net, sta, loc, cha = [], [], [], []
        datacenters_df, eidavalidator = \
            self.get_datacenters_df(urlread_sideeffect, db.session, self.service,
                                    self.routing_service, net, sta, loc, cha,
                                    db_bufsize=self.db_buf_size)
        channels_df = self.get_channels_df(urlread_sideeffect, db.session,
                                           datacenters_df,
                                           eidavalidator,
                                           net, sta, loc, cha, None, None, 10,
                                           False, None, None, -1, self.db_buf_size)
        # just to be sure. If failing, we might have changed the class default:
        assert len(channels_df) == 12
    # events_df
#                  id  magnitude  latitude  longitude  depth_km  time
# 0  20160508_0000129        3.0       1.0        1.0      60.0  2016-05-08 05:17:11.500
# 1  20160508_0000004        4.0       2.0        2.0       2.0  2016-05-08 01:45:30.300

# channels_df (index not shown):
# columns:
# id  station_id  latitude  longitude  datacenter_id start_time end_time network station location channel
# data (not aligned with columns):
# 1   1  1.0   1.0   1 2003-01-01 NaT  GE  FLT1    HHE
# 2   1  1.0   1.0   1 2003-01-01 NaT  GE  FLT1    HHN
# 3   1  1.0   1.0   1 2003-01-01 NaT  GE  FLT1    HHZ
# 4   2  90.0  90.0  1 2009-01-01 NaT  n1  s       c1
# 5   2  90.0  90.0  1 2009-01-01 NaT  n1  s       c2
# 6   2  90.0  90.0  1 2009-01-01 NaT  n1  s       c3
# 7   3  1.0   1.0   2 2003-01-01 NaT  IA  BAKI    BHE
# 8   3  1.0   1.0   2 2003-01-01 NaT  IA  BAKI    BHN
# 9   3  1.0   1.0   2 2003-01-01 NaT  IA  BAKI    BHZ
# 10  4  90.0  90.0  2 2009-01-01 NaT  n2  s       c1
# 11  4  90.0  90.0  2 2009-01-01 NaT  n2  s       c2
# 12  4  90.0  90.0  2 2009-01-01 NaT  n2  s       c3

        assert all(_ in channels_df.columns for _ in [Station.network.key, Station.station.key,
                                                      Channel.location.key, Channel.channel.key])
        chaid2mseedid = chaid2mseedid_dict(channels_df)
        # check that we removed the columns:
        assert not any(_ in channels_df.columns for _ in
                       [Station.network.key, Station.station.key,
                        Channel.location.key, Channel.channel.key])

        # take all segments:
        # use minmag and maxmag
        ttable = tt_ak135_tts
        segments_df = merge_events_stations(events_df, channels_df, dict(minmag=10, maxmag=10,
                                            minmag_radius=10, maxmag_radius=10), tttable=ttable)

        assert len(pd.unique(segments_df['arrival_time'])) == 2

        h = 9

# segments_df (index not shown). Note that
# cid sid did n   s    l  c    ed   event_id          depth_km                time  <- LAST TWO ARE Event related columns that will be removed after arrival_time calculations
# 1   1   1   GE  FLT1    HHE  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 2   1   1   GE  FLT1    HHN  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 3   1   1   GE  FLT1    HHZ  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 7   3   2   IA  BAKI    BHE  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 8   3   2   IA  BAKI    BHN  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 9   3   2   IA  BAKI    BHZ  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 4   2   1   n1  s       c1   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 5   2   1   n1  s       c2   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 6   2   1   n1  s       c3   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 10  4   2   n2  s       c1   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 11  4   2   n2  s       c2   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 12  4   2   n2  s       c3   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300

# LEGEND:
# cid = channel_id
# sid = station_id
# scid = datacenter_id
# n, s, l, c = network, station, location, channel
# ed = event_distance_deg

        # define a dc_dataselect_manager for open data only:
        dc_dataselect_manager = DcDataselectManager(datacenters_df, Authorizer(None), False)

        wtimespan = [1, 2]  # in minutes
        expected = len(segments_df)  # no segment on db, we should have all segments to download
        orig_segments_df = segments_df.copy()
        segments_df, request_timebounds_need_update = \
            prepare_for_download(db.session, orig_segments_df, dc_dataselect_manager, wtimespan,
                                 retry_seg_not_found=True,
                                 retry_url_err=True,
                                 retry_mseed_err=True,
                                 retry_client_err=True,
                                 retry_server_err=True,
                                 retry_timespan_err=True,
                                 retry_timespan_warn=True)

# segments_df
# COLUMNS:
# channel_id  datacenter_id network station location channel event_distance_deg event_id arrival_time start_time end_time id download_status_code run_id
# DATA (not aligned with columns):
#               channel_id  datacenter_id network station location channel  event_distance_deg  event_id            arrival_time          start_time            end_time    id download_status_code  run_id
# GE.FLT1..HHE  1           1              GE      FLT1             HHE     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# GE.FLT1..HHN  2           1              GE      FLT1             HHN     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# GE.FLT1..HHZ  3           1              GE      FLT1             HHZ     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# IA.BAKI..BHE  7           2              IA      BAKI             BHE     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# IA.BAKI..BHN  8           2              IA      BAKI             BHN     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# IA.BAKI..BHZ  9           2              IA      BAKI             BHZ     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# n1.s..c1      4           1              n1      s                c1      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n1.s..c2      5           1              n1      s                c2      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n1.s..c3      6           1              n1      s                c3      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n2.s..c1      10          2              n2      s                c1      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n2.s..c2      11          2              n2      s                c2      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n2.s..c3      12          2              n2      s                c3      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1

        # self._seg_data is the content (bytes) of a "valid" 3-channel miniseed file
        # The channels are:
        # Thus, no match will be found and all segments will be written with a None
        # download status code

        # setup urlread: first three rows: ok
        # rows[3:6]: 413, retry them
        # rows[6:9]: malformed_data
        # rows[9:12] 413, retry them
        # then retry:
        # rows[3]: empty_data
        # rows[4]: data_with_gaps (but seed_id should not match)
        # rows[5]: data_with_gaps (seed_id should not match)
        # rows[9]: URLError
        # rows[10]: Http 500 error
        # rows[11]: 413

        # NOTE THAT THIS RELIES ON THE FACT THAT THREADS ARE EXECUTED IN THE ORDER OF THE DATAFRAME
        # WHICH SEEMS TO BE THE CASE AS THERE IS ONE SINGLE PROCESS
        # self._seg_data[:2] is a way to mock corrupted data
        urlread_sideeffect = [self._seg_data, 413, self._seg_data[:2], 413,
                              '', self._seg_data_gaps, self._seg_data_gaps,
                              URLError("++urlerror++"), 500, 413]
        # Let's go:
        ztatz = self.download_save_segments(urlread_sideeffect, db.session, segments_df,
                                            dc_dataselect_manager,
                                            chaid2mseedid,
                                            self.run.id, False,
                                            request_timebounds_need_update,
                                            1, 2, 3, db_bufsize=self.db_buf_size)
        # get the db columns we are interested in checking:
        cols = [Segment.id, Segment.channel_id, Segment.datacenter_id,
                Segment.download_code, Segment.maxgap_numsamples,
                Segment.sample_rate, Segment.data_seed_id, Segment.data, Segment.download_id,
                Segment.request_start, Segment.request_end, Segment.start_time, Segment.end_time
                ]
        db_segments_df = dbquery2df(db.session.query(*cols))
        assert Segment.download_id.key in db_segments_df.columns

        OUTTIME_ERR, OUTTIME_WARN = s2scodes.timespan_err, s2scodes.timespan_warn
        # assert no segment has data (time out of bounds):
        assert len(db_segments_df.loc[(~pd.isnull(db_segments_df[Segment.data.key])) &
                                      (db_segments_df[Segment.data.key].str.len() > 0),
                                      Segment.data.key]) == 0
        # assert that the 4 segments which were "correctly" downloaded (i.e. whose
        # response had data) now have code = OUTTIME_ERR (timespan error)
        assert len(db_segments_df[db_segments_df[Segment.download_code.key] == OUTTIME_ERR]) == 4

        # re-sort db_segments_df to match the segments_df:
        ret = []
        for cha in segments_df[Segment.channel_id.key]:
            ret.append(db_segments_df[db_segments_df[Segment.channel_id.key] == cha])
        db_segments_df = pd.concat(ret, axis=0)

# db_segments_df:
#    id  channel_id  datacenter_id  download_status_code  max_gap_ovlap_ratio  sample_rate data_seed_id     data  run_id          start_time            end_time
# 0  1   1           1              -3                    0.0001               100.0        GE.FLT1..HHE    b''   1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 1  2   2           1              -3                    0.0001               100.0        GE.FLT1..HHN    b''   1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 2  3   3           1              -3                    0.0001               100.0        GE.FLT1..HHZ    b''   1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 6  7   7           2              200.0                 NaN                  NaN          None                  1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 7  8   8           2              NaN                   NaN                  NaN          None            None  1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 8  9   9           2              -3                    20.0                 20.0         IA.BAKI..BHZ    b''   1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 3  4   4           1             -2.0                   NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 4  5   5           1             -2.0                   NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 5  6   6           1             -2.0                   NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 9  10  10          2              -1.0                  NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 10 11  11          2              500.0                 NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 11 12  12          2              413.0                 NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31

        # now modify the first row time bounds:
        # first we need to assign the database id to our segments_df, to prevent
        # db constraint error when writing to db:
        # `download_save_segments` below needs to UPDATE the segments and it does so by
        # checking if an id is present.
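        # An illustrative sketch of the assumed insert-vs-update logic (not executed,
        # and not necessarily the real implementation) may help clarify why the ids
        # are copied from db_segments_df below:
        #
        #     if pd.isnull(segment_row[Segment.id.key]):
        #         ...  # no db id -> the segment would be INSERTed as a new row,
        #              #  violating the unique constraint for already-saved segments
        #     else:
        #         ...  # db id present -> the segment is UPDATEd in place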
        # check that the channel_ids align:
        assert (segments_df[Segment.channel_id.key].values ==
                db_segments_df[Segment.channel_id.key].values).all()
        # so that we can simply do this:
        segments_df[Segment.id.key] = db_segments_df[Segment.id.key]

        # first read the miniseed:
        stream = read(BytesIO(self._seg_data))
        tstart = stream[0].stats.starttime.datetime
        tend = stream[0].stats.endtime.datetime
        segments_df.loc[segments_df[Segment.channel_id.key] == 1,
                        Segment.request_start.key] = tstart
        segments_df.loc[segments_df[Segment.channel_id.key] == 1,
                        Segment.request_end.key] = tstart + (tend-tstart)/2

        segments_df.loc[segments_df[Segment.channel_id.key] == 2,
                        Segment.request_start.key] = tstart
        segments_df.loc[segments_df[Segment.channel_id.key] == 2,
                        Segment.request_end.key] = tend

        # build a segments_df of the three segments belonging to the same channel
        # copy at the end to avoid pandas settingwithcopy warning
        new_segments_df = \
            segments_df.loc[segments_df[Segment.channel_id.key].isin([1, 2, 3]), :].copy()
        # change urlread_side_effect to provide, for the first three segments, the same
        # sequence of bytes. The sequence is actually OK, but in the first case it will be
        # PARTIALLY saved, in the second case TOTALLY, and in the third case NOT AT ALL:
        urlread_sideeffect = [self._seg_data, self._seg_data, self._seg_data]
        # define a dc_dataselect_manager for open data only:
        dc_dataselect_manager = DcDataselectManager(datacenters_df, Authorizer(None), False)
        ztatz = self.download_save_segments(urlread_sideeffect, db.session, new_segments_df,
                                            dc_dataselect_manager,
                                            chaid2mseedid,
                                            self.run.id, False,
                                            request_timebounds_need_update,
                                            1, 2, 3, db_bufsize=self.db_buf_size)
        db_segments_df = dbquery2df(db.session.query(*cols))
        # re-sort db_segments_df to match the segments_df:
        ret = [db_segments_df[db_segments_df[Segment.channel_id.key] == cha]
               for cha in segments_df[Segment.channel_id.key]]
        db_segments_df = pd.concat(ret, axis=0)

        # assert the 1st segment, whose time range has been modified, has data BUT
        # download_status_code = OUTTIME_WARN (timespan warning):
        df__ = db_segments_df.loc[db_segments_df[Segment.channel_id.key] == 1, :]
        assert len(df__) == 1
        row__ = df__.iloc[0]
        assert row__[Segment.download_code.key] == OUTTIME_WARN
        assert len(row__[Segment.data.key]) > 0

        # assert the 2nd segment whose time range has been modified has data, AND
        # download_status_code 200 (ok)
        df__ = db_segments_df.loc[db_segments_df[Segment.channel_id.key] == 2, :]
        assert len(df__) == 1
        row__ = df__.iloc[0]
        assert row__[Segment.download_code.key] == 200
        assert len(row__[Segment.data.key]) > 0

        # assert the 3rd segment, whose time range has NOT been modified, has no data
        # AND download_status_code is still OUTTIME_ERR (timespan error):
        df__ = db_segments_df.loc[db_segments_df[Segment.channel_id.key] == 3, :]
        assert len(df__) == 1
        row__ = df__.iloc[0]
        assert row__[Segment.download_code.key] == OUTTIME_ERR
        assert len(row__[Segment.data.key]) == 0

    def test_opendata_and_errors(self, mock_get_data_from_token, mock_get_data_from_userpass,
                                 mock_get_data_open, mock_updatedf, mock_insertdf,
                                 mock_mseed_unpack, mock_download_save_segments,
                                 mock_save_inventories, mock_get_channels_df,
                                 mock_get_datacenters_df, mock_get_events_df,
                                 # fixtures:
                                 db, clirunner, pytestdir, yamlfile):

        mock_get_events_df.side_effect = lambda *a, **v: self.get_events_df(None, *a, **v)
        mock_get_datacenters_df.side_effect = \
            lambda *a, **v: self.get_datacenters_df(None, *a, **v)
        mock_get_channels_df.side_effect = lambda *a, **v: self.get_channels_df(None, *a, **v)
        mock_save_inventories.side_effect = lambda *a, **v: self.save_inventories(None, *a, **v)
        mock_download_save_segments.side_effect = \
            lambda *a, **v: self.download_save_segments(None, *a, **v)
        # mseed unpack is mocked by accepting only first arg
        # (so that time bounds are not considered)
        mock_mseed_unpack.side_effect = lambda *a, **v: unpack(a[0])
        mock_insertdf.side_effect = lambda *a, **v: insertdf(*a, **v)
        mock_updatedf.side_effect = lambda *a, **v: updatedf(*a, **v)
        # prevlen = len(db.session.query(Segment).all())

        # Patching class methods while preserving the original call requires storing
        # the original methods once (as class attributes). The side effect of each
        # mocked method is then set to the corresponding class attribute, so as to
        # preserve the original functionality while still being able to assert that
        # the mock_* functions are called, and so on. For info see:
        # https://stackoverflow.com/a/29563665
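        # Illustrative sketch of the pattern described above (not executed here;
        # `DcDataselectManager._get_data_open` and the surrounding names are assumed
        # for illustration only, they are not necessarily the real attributes):
        #
        #     class TestDownload:
        #         # store the original, unpatched method once, at class level:
        #         dc_get_data_open = staticmethod(DcDataselectManager._get_data_open)
        #
        #         def test_xyz(self, mock_get_data_open, ...):
        #             # route the mock back to the stored original: the real logic
        #             # runs, while call assertions on the mock stay possible:
        #             mock_get_data_open.side_effect = self.dc_get_data_open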
        mock_get_data_open.side_effect = self.dc_get_data_open
        mock_get_data_from_userpass.side_effect = self.dc_get_data_from_userpass
        mock_get_data_from_token.side_effect = self.dc_get_data_from_token

        # TEST 1: NORMAL CASE (NO AUTH):
        # mock yaml_load to override restricted_data:
        yaml_file = yamlfile(restricted_data='')
        # The run table is populated with a run_id in the constructor of this class.
        # To check run_ids, store here the number of runs currently in the table:
        runs = len(db.session.query(Download.id).all())
        result = clirunner.invoke(cli, ['download',
                                        '-c', yaml_file,
                                        '--dburl', db.dburl,
                                        '--start', '2016-05-08T00:00:00',
                                        '--end', '2016-05-08T9:00:00'])
        assert clirunner.ok(result)
        assert 'Downloading 12 segments (open data only)' in result.output
        assert mock_get_data_open.called
        assert not mock_get_data_from_token.called
        assert not mock_get_data_from_userpass.called
        # some assertions to check that the data was properly written
        assert len(db.session.query(Download.id).all()) == runs + 1
        runs += 1
        segments = db.session.query(Segment).all()
        assert len(segments) == 12
        segments = db.session.query(Segment).filter(Segment.has_data).all()
        assert len(segments) == 4
        assert len(db.session.query(Station).filter(Station.has_inventory).all()) == 2
        assert mock_updatedf.called  # called while saving inventories
        assert mock_insertdf.called

        # TEST 2: USERPASS AND EIDA (PROBLEM):
        # test that we provide userpass and eida: error:
        # mock yaml_load to override restricted_data:
        mock_get_data_open.reset_mock()
        mock_get_data_from_token.reset_mock()
        mock_get_data_from_userpass.reset_mock()
        yaml_file = yamlfile(restricted_data=['user', 'password'], dataws='eida')
        result = clirunner.invoke(cli, ['download',
                                        '-c', yaml_file,
                                        '--dburl', db.dburl,
                                        '--start', '2016-05-08T00:00:00',
                                        '--end', '2016-05-08T9:00:00'])
        assert not clirunner.ok(result)
        assert ('Error: Invalid value for "restricted_data": '
                'downloading from EIDA requires a token') in result.output

        # TEST 3: TOKEN FILE NOT EXISTING
        mock_get_data_open.reset_mock()
        mock_get_data_from_token.reset_mock()
        mock_get_data_from_userpass.reset_mock()
        yaml_file = yamlfile(restricted_data='abcdg465du97_Sdr4fvssgflero',
                             dataws='eida')
        result = clirunner.invoke(cli, ['download',
                                        '-c', yaml_file,
                                        '--dburl', db.dburl,
                                        '--start', '2016-05-08T00:00:00',
                                        '--end', '2016-05-08T9:00:00'])
        assert not clirunner.ok(result)
        assert ('Invalid token. If you passed a file path') in result.output

        # TEST 4: TOKEN FILE EXISTS, INVALID (e.g. empty)
        filepath = pytestdir.newfile(create=True)
        mock_get_data_open.reset_mock()
        mock_get_data_from_token.reset_mock()
        mock_get_data_from_userpass.reset_mock()
        yaml_file = yamlfile(restricted_data=os.path.abspath(filepath),
                             dataws='eida')
        result = clirunner.invoke(cli, ['download',
                                        '-c', yaml_file,
                                        '--dburl', db.dburl,
                                        '--start', '2016-05-08T00:00:00',
                                        '--end', '2016-05-08T9:00:00'])
        assert not clirunner.ok(result)
        assert ('Invalid token. If you passed a file path') in result.output