Example 1
def get_eida_datacenters_df(responsetext):
    """Returns the tuple (datacenters_df, eidavalidator) from eidars or from the db (in this
    latter case eidavalidator is None)
    """
    # For convenience and readability, define once the mapped column names representing the
    # dataframe columns that we need:
    DC_SURL = DataCenter.station_url.key  # pylint: disable=invalid-name
    DC_DURL = DataCenter.dataselect_url.key  # pylint: disable=invalid-name
    DC_ORG = DataCenter.organization_name.key  # pylint: disable=invalid-name

    dclist = []

    for url, _ in eidarsiter(responsetext):  # the POST payload is not needed here
        try:
            fdsn = Fdsnws(url)
            dclist.append({
                DC_SURL: fdsn.url(Fdsnws.STATION),
                DC_DURL: fdsn.url(Fdsnws.DATASEL),
                DC_ORG: 'eida'
            })
        except ValueError:
            logger.warning(
                "Discarding data center (non-FDSN URL '%s' "
                "returned from the routing service)", url)
    if not dclist:
        raise FailedDownload(
            Exception("No datacenters found in response text / file"))
    datacenters_df = pd.DataFrame(dclist)
    return datacenters_df
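
The function relies on `eidarsiter`, which is not shown here. Below is a minimal, self-contained sketch of what such an iterator might do, assuming the EIDA routing service `format=post` layout (blocks separated by blank lines, each starting with a service URL followed by request lines); `iter_routes` and the example text are illustrative only:

def iter_routes(responsetext):
    """Yield (url, postdata) tuples from a routing service 'format=post' response"""
    for block in responsetext.strip().split("\n\n"):
        lines = block.strip().splitlines()
        if lines:
            # first line: the service URL; remaining lines: the POST payload
            yield lines[0], "\n".join(lines[1:])

example = ("http://geofon.gfz-potsdam.de/fdsnws/dataselect/1/query\n"
           "GE * * BH? 2019-01-01T00:00:00 2019-02-01T00:00:00\n"
           "\n"
           "http://webservices.ingv.it/fdsnws/dataselect/1/query\n"
           "IV * * HH? 2019-01-01T00:00:00 2019-02-01T00:00:00\n")

for url, postdata in iter_routes(example):
    print(url, "->", postdata)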
Example 2
def get_eidars_response_text(routing_service_url):
    """Returns the tuple (datacenters_df, eidavalidator) from eidars or from the db (in this
    latter case eidavalidator is None)
    """
    # IMPORTANT NOTE:
    # We issue a "basic" query to the EIDA routing service, with no parameters
    # other than 'service' and 'format'. The reason is that, as of Jan 2019, the
    # service is buggy when supplied with some arguments (e.g., a long list of
    # channels). Also, this way we can save a local file (independent of the
    # custom query) and read from that file in case of request failure.
    # The drawback is that we might later ask some data centers for data they do
    # not have: this is information that the routing service would provide us,
    # if queried with all parameters (net, sta, start, etcetera)... too bad
    query_args = {'service': 'dataselect', 'format': 'post'}
    url = urljoin(routing_service_url, **query_args)

    try:
        responsetext, status, msg = urlread(url,
                                            decode='utf8',
                                            raise_http_err=True)
        if not responsetext:
            raise URLException(Exception("Empty data response"))  # handled in the except clause below
    except URLException as urlexc:
        responsetext, last_mod_time_str = _get_local_routing_service()
        msg = ("Eida routing service error, reading routes from file "
               "(last updated: %s)" % last_mod_time_str)
        logger.info(formatmsg(msg, "eida routing service error"))
        logger.warning(formatmsg("Eida routing service error", urlexc.exc,
                                 url))

    return responsetext
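
Note that `urljoin` here is a project helper accepting keyword arguments, not `urllib.parse.urljoin`. A sketch of what the resulting URL presumably looks like, built with the standard library only (the routing service URL is illustrative, and the helper is assumed to append the kwargs as a query string):

from urllib.parse import urlencode

routing_service_url = "http://www.orfeus-eu.org/eidaws/routing/1/query"
query_args = {'service': 'dataselect', 'format': 'post'}
url = routing_service_url + '?' + urlencode(query_args)
print(url)  # http://www.orfeus-eu.org/eidaws/routing/1/query?service=dataselect&format=post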
Example 3
def logwarn_dataframe(dataframe, msg, cols_to_print_on_err, max_row_count=30):
    '''Log the given dataframe using logger.warning. Does not check whether the
    dataframe is empty'''
    len_df = len(dataframe)
    if len_df > max_row_count:
        footer = "\n... (showing first %d rows only)" % max_row_count
        dataframe = dataframe.iloc[:max_row_count]
    else:
        footer = ""
    msg = "{}:\n{}{}".format(
        msg, dataframe.to_string(columns=cols_to_print_on_err, index=False),
        footer)
    logger.warning(msg)
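
A minimal usage sketch (the logger setup and the dataframe content are illustrative; `logwarn_dataframe` is the function above):

import logging
import pandas as pd

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

df = pd.DataFrame({'id': range(100), 'reason': ['malformed'] * 100})
logwarn_dataframe(df, "Discarded rows", ['id', 'reason'], max_row_count=5)
# logs the message, the first 5 rows, and "... (showing first 5 rows only)"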
Example 4
    def warn(self, request, exc):
        '''Issue a logger.warning if the given error is not already reported

        :param request: the Request object
        :param exc: the reported Exception or string message
        '''
        url = get_host(request)
        item = (url, err2str(exc))  # use err2str to uniquely identify exc
        if item not in self:
            if not self:
                logger.warning(
                    'Detailed inventory download errors '
                    '(showing only first of each type per data center):')
            self.add(item)
            request_str = url2str(request)
            logger.warning(
                formatmsg("Inventory download error", exc, request_str))
Example 5
    def __init__(self, datacenters_df, authorizer, show_progress=False):
        '''initializes a new DcDataselectManager'''
        DC_ID = DataCenter.id.key  # pylint: disable=invalid-name
        DC_DSURL = DataCenter.dataselect_url.key  # pylint: disable=invalid-name

        # there is a handy function datacenters_df.set_index(keys_col)[values_col].to_dict(),
        # but we use iterrows because we convert each data center URL to its Fdsnws object
        dcid2fdsn = {
            int(row[DC_ID]): Fdsnws(row[DC_DSURL])
            for _, row in datacenters_df.iterrows()
        }
        # Note: Fdsnws might raise, but at this point datacenters_df is assumed to be well
        # formed
        errors = {}  # urls mapped to their exception
        if authorizer.token:
            token = authorizer.token
            self._data, errors = self._get_data_from_token(
                dcid2fdsn, token, show_progress)
            self._restricted_id = [
                did for did in self._data if did not in errors
            ]
        elif authorizer.userpass:
            user, password = authorizer.userpass
            self._data, errors = self._get_data_from_userpass(
                dcid2fdsn, user, password)
            self._restricted_id = list(dcid2fdsn.keys())
        else:  # no authorization required
            self._data, errors = self._get_data_open(dcid2fdsn)
            self._restricted_id = []

        if errors:
            # map site URLs (not data center ids) to their errors:
            errors = {
                dcid2fdsn[dcid].site: err
                for dcid, err in errors.items()
            }
            logger.info(
                formatmsg(
                    'Downloading open data only from: %s' % ", ".join(errors),
                    'Unable to acquire credentials for restricted data'))
            for url, exc in errors.items():
                logger.warning(
                    formatmsg(
                        "Downloading open data only: "
                        "unable to acquire credentials for restricted data",
                        str(exc), url))
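
As a side note on the comment above: `set_index(keys_col)[values_col].to_dict()` builds the plain dict in one shot, whereas `iterrows` allows a per-row conversion. A toy comparison (column names and the `str.upper` conversion are illustrative stand-ins for the `Fdsnws` wrapping):

import pandas as pd

df = pd.DataFrame({'id': [1, 2],
                   'dataselect_url': ['http://a/fdsnws/dataselect/1/query',
                                      'http://b/fdsnws/dataselect/1/query']})
# handy one-liner, values taken as-is:
plain = df.set_index('id')['dataselect_url'].to_dict()
# iterrows, converting each value (here trivially with str.upper):
mapped = {int(row['id']): row['dataselect_url'].upper()
          for _, row in df.iterrows()}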
Example 6
def response2normalizeddf(url, raw_data, dbmodel_key):
    """Returns a normalized and harmonized dataframe from raw_data. dbmodel_key can be 'event'
    'station' or 'channel'. Raises ValueError if the resulting dataframe is empty or if
    a ValueError is raised from sub-functions

    :param url: url (string) or `Request` object. Used only to log the specified
        url in case of wranings
    :param raw_data: valid FDSN data in text format. For info see:
        https://www.fdsn.org/webservices/FDSN-WS-Specifications-1.1.pdf#page=12
    """

    dframe = response2df(raw_data)
    oldlen, dframe = len(dframe), normalize_fdsn_dframe(dframe, dbmodel_key)
    # dframe is surely not empty at this point:
    if oldlen > len(dframe):
        logger.warning(
            formatmsg("%d row(s) discarded", "malformed text data", url),
            oldlen - len(dframe))
    return dframe
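
`response2df` is not shown here; below is a minimal sketch of parsing FDSN "text" output (pipe-separated values with a '#'-prefixed header, see the spec linked in the docstring) with pandas. The sample data is illustrative:

import io
import pandas as pd

raw_data = ("#Network|Station|Latitude|Longitude\n"
            "GE|APE|37.07|25.53\n"
            "GE|BANI|40.26|19.49\n")
# drop the leading '#' so the first line is parsed as the header:
dframe = pd.read_csv(io.StringIO(raw_data.lstrip('#')), sep='|')
print(dframe)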
Example 7
    def warn(self, request, url, code, exc):
        '''Issue a logger.warning if the given error is not already reported

        :param request: the Request object
        :param url: string, usually the request's url host, to identify same data centers
        :param code: the error code
        :param exc: the reported Exception
        '''
        item = (url, code, str(exc.__class__.__name__))
        if item not in self:
            if not self:
                logger.warning(
                    'Detailed segment download errors '
                    '(showing only first of each type per data center):')
            self.add(item)
            request_str = url2str(request)
            logger.warning(
                formatmsg("Segment download error, code %s" % str(code), exc,
                          request_str))
Example 8
def get_channels_df(
        session,
        datacenters_df,
        eidavalidator,  # <- can be None
        net,
        sta,
        loc,
        cha,
        starttime,
        endtime,
        min_sample_rate,
        update,
        max_thread_workers,
        timeout,
        blocksize,
        db_bufsize,
        show_progress=False):
    """Returns a dataframe representing a query to the eida services (or the internal db
    if `post_data` is None) with the given argument.  The
    dataframe will have as columns the `key` attribute of any of the following db columns:
    ```
    [Channel.id, Station.latitude, Station.longitude, Station.datacenter_id]
    ```
    :param datacenters_df: the first item resulting from `get_datacenters_df` (pandas DataFrame)
    :param post_data: the second item resulting from `get_datacenters_df` (string)
    :param channels: a list of string denoting the channels, or None for no filtering
        (all channels). Each string follows FDSN specifications (e.g. 'BHZ', 'H??'). This argument
        is not used if `post_data` is given (not None)
    :param min_sample_rate: minimum sampling rate, set to negative value for no-filtering
        (all channels)
    """
    postdata = get_post_data(net, sta, loc, cha, starttime, endtime)

    ret = []
    url_failed_dc_ids = []
    iterable = ((id_, Request(url,
                              data=('format=text\nlevel=channel\n' +
                                    post_data_str).encode('utf8')))
                for url, id_, post_data_str in zip(
                    datacenters_df[DataCenter.station_url.key],
                    datacenters_df[DataCenter.id.key],
                    cycle([postdata])))

    with get_progressbar(show_progress, length=len(datacenters_df)) as pbar:
        for obj, result, exc, url in read_async(iterable,
                                                urlkey=lambda obj: obj[-1],
                                                blocksize=blocksize,
                                                max_workers=max_thread_workers,
                                                decode='utf8',
                                                timeout=timeout):
            pbar.update(1)
            dcen_id = obj[0]
            if exc:
                url_failed_dc_ids.append(dcen_id)
                logger.warning(formatmsg("Unable to fetch stations", exc, url))
            else:
                try:
                    dframe = response2normalizeddf(url, result[0], "channel")
                    if not dframe.empty:
                        dframe[Station.datacenter_id.key] = dcen_id
                        ret.append(dframe)
                except ValueError as verr:
                    logger.warning(
                        formatmsg("Discarding response data", verr, url))

    db_cha_df = pd.DataFrame()
    if url_failed_dc_ids:  # some data centers failed: fetch their stations from the db (log at INFO level)
        dc_df_fromdb = \
            datacenters_df.loc[datacenters_df[DataCenter.id.key].isin(url_failed_dc_ids)]
        logger.info(
            formatmsg(
                "Fetching stations from database for %d (of %d) data-center(s)",
                "download errors occurred"), len(dc_df_fromdb),
            len(datacenters_df))
        logger.info(
            dc_df_fromdb[DataCenter.dataselect_url.key].to_string(index=False))
        db_cha_df = get_channels_df_from_db(session, dc_df_fromdb, net, sta,
                                            loc, cha, starttime, endtime,
                                            min_sample_rate)

    # build two dataframes which we will concatenate afterwards
    web_cha_df = pd.DataFrame()
    if ret:  # pd.concat complains for empty list
        try:
            web_cha_df = filter_channels_df(
                pd.concat(ret, axis=0, ignore_index=True, copy=False), net,
                sta, loc, cha, min_sample_rate)

            # this raises FailedDownload if we cannot save any element:
            web_cha_df = save_stations_and_channels(session, web_cha_df,
                                                    eidavalidator, update,
                                                    db_bufsize)
        except FailedDownload as qexc:
            if db_cha_df.empty:
                raise
            else:
                logger.warning(qexc)

    if db_cha_df.empty and web_cha_df.empty:
        # no channels fetched from the web services and none found in the database:
        raise FailedDownload(
            formatmsg("No station found",
                      ("Unable to fetch stations from all data-centers, "
                       "no data to fetch from the database. "
                       "Check config and log for details")))
    ret = None
    if db_cha_df.empty:
        ret = web_cha_df
    elif web_cha_df.empty:
        ret = db_cha_df
    else:
        ret = pd.concat((web_cha_df, db_cha_df),
                        axis=0,
                        ignore_index=True,
                        sort=False)
    # the columns for the channels dataframe that will be returned
    return ret[[
        c.key for c in (Channel.id, Channel.station_id, Station.latitude,
                        Station.longitude, Station.datacenter_id,
                        Station.start_time, Station.end_time, Station.network,
                        Station.station, Channel.location, Channel.channel)
    ]].copy()
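
The `iterable` generator above builds one POST request per data center. A sketch of a single request body, assuming the usual FDSN station service POST syntax (the URL and the request line are illustrative):

from urllib.request import Request

post_data_str = "GE APE * BH? 2019-01-01T00:00:00 2019-02-01T00:00:00"
body = ('format=text\nlevel=channel\n' + post_data_str).encode('utf8')
request = Request("http://geofon.gfz-potsdam.de/fdsnws/station/1/query", data=body)
# body: b'format=text\nlevel=channel\nGE APE * BH? 2019-01-01T00:00:00 ...'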
Example 9
def filter_channels_df(channels_df, net, sta, loc, cha, min_sample_rate):
    '''Filters out rows of `channels_df` according to the given parameters. Raises
    `FailedDownload` if the returned filtered data frame would be empty

    Note that the `net, sta, loc, cha` filters are considered only if they are
    negations (i.e., with leading exclamation mark: "!A*"), because the 'positive'
    filters are FDSN standard and are supposed to have been used already when
    producing `channels_df`

    Example:
        filter_channels_df(d, [], ['ABC'], [''], ['!A*', 'HH?', 'HN?'], 0)

        basically takes the dataframe `d`, finds the column related to the `channels` key and
        removes all rows whose channel starts with 'A', returning the new filtered data frame

    Arguments are usually the output of :func:`stream2segment.download.utils.nslc_lists`

    :param net: an iterable of strings denoting networks.
    :param sta: an iterable of strings denoting stations.
    :param loc: an iterable of strings denoting locations.
    :param cha: an iterable of strings denoting channels.
    :param min_sample_rate: numeric, minimum sample rate. If negative or zero, this parameter
        is ignored
    '''
    # Build a regexp for each column to be filtered on. FDSN web services do not
    # support NOT operators, thus negation expressions are handled here, with the
    # expressions of each column concatenated with OR:
    dffilter = None
    sa_cols = (Station.network, Station.station, Channel.location,
               Channel.channel)

    for lst, sa_col in zip((net, sta, loc, cha), sa_cols):
        if not lst:
            continue
        lst = [_ for _ in lst
               if _[0:1] == '!']  # take only negation expression
        if not lst:
            continue
        condition = ("^%s$" if len(lst) == 1 else "^(?:%s)$") % \
            "|".join(strconvert.wild2re(x[1:]) for x in lst)
        flt = channels_df[sa_col.key].str.match(re.compile(condition))
        if dffilter is None:
            dffilter = flt
        else:
            dffilter |= flt  # a row is discarded if ANY negation filter matches it

    if min_sample_rate > 0:
        # account for Nones, thus negate the predicate below:
        flt = ~(channels_df[Channel.sample_rate.key] >= min_sample_rate)
        if dffilter is None:
            dffilter = flt
        else:
            dffilter |= flt  # also discard rows below the minimum sample rate

    ret = channels_df if dffilter is None else \
        channels_df[~dffilter].copy()  # pylint: disable=invalid-unary-operand-type

    if ret.empty:
        raise FailedDownload("No channel matches user defined filters "
                             "(network, channel, sample rate, ...)")

    discarded_sr = len(channels_df) - len(ret)
    if discarded_sr:
        logger.warning(
            ("%d channel(s) discarded according to current config. filters "
             "(network, channel, sample rate, ...)"), discarded_sr)

    return ret
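
`strconvert.wild2re` is not shown here; `fnmatch.translate` from the standard library is a reasonable stand-in to illustrate how a negation filter such as '!A*' presumably becomes a discarding regexp (the data is illustrative):

import fnmatch
import re

import pandas as pd

cha = ['!A*', 'HH?', 'HN?']  # only '!A*' is a negation; the rest is ignored here
negations = [x[1:] for x in cha if x.startswith('!')]
condition = "|".join(fnmatch.translate(x) for x in negations)
channels_df = pd.DataFrame({'channel': ['AHZ', 'HHZ', 'HNE']})
discard = channels_df['channel'].str.match(re.compile(condition))
print(channels_df[~discard])  # 'AHZ' dropped: it matches the negated pattern 'A*'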
Example 10
def download_save_segments(session,
                           segments_df,
                           dc_dataselect_manager,
                           chaid2mseedid,
                           download_id,
                           update_datacenters,
                           update_request_timebounds,
                           max_thread_workers,
                           timeout,
                           download_blocksize,
                           db_bufsize,
                           show_progress=False):
    """Downloads and saves the segments. segments_df MUST not be empty (this is not checked for)

    :param segments_df: the dataframe resulting from `prepare_for_download`. The dataframe
        might or might not have the column 'download_code'. If it does, segments whose
        code did not change will not be written to the db: in this case, NaNs stored
        under 'download_code' in segments_df denote new segments, or segments for which
        the update has to be forced, whatever code is obtained (e.g., queryauth when
        previously a simple query was used)
    :param chaid2mseedid: dict of channel ids (int) mapped to mseed ids
        (strings in "Network.station.location.channel" format)
    """
    # set queryauth column here, outside the loop:
    restricted_enable_dcids = dc_dataselect_manager.restricted_enabled_ids
    if restricted_enable_dcids:
        segments_df[SEG.QAUTH] = \
            segments_df[SEG.DCID].isin(restricted_enable_dcids)
    else:
        segments_df[SEG.QAUTH] = False

    segmanager = get_dbmanager(session, update_datacenters,
                               update_request_timebounds, db_bufsize)
    stats = DownloadStats()

    # Define the groupby columns. We first group by (dc_id, start_time, end_time),
    # meaning that rows with equal values there are requested in a single query.
    # If a 413 (request too large) error occurs, the rows of the failed group are
    # re-grouped segment by segment, i.e. by (dc_id, start_time, end_time, channel_id).
    # For performance reasons we do not have (net, sta, loc, cha) columns, but
    # channel_id basically comprises them.
    # NOTE: SEG.START and SEG.END MUST ALWAYS BE IN THE SECOND AND THIRD POSITION!
    groupsby = [[SEG.DCID, SEG.START, SEG.END],
                [SEG.DCID, SEG.START, SEG.END, SEG.CHAID]]

    # These are the column names to be set on a dataframe from a received response,
    # mapped to their default value. NaN lets pandas infer numeric columns; None is
    # used for string/object columns. For numpy dtypes, see
    # https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html#specifying-and-constructing-data-types
    defaultvalues = {
        SEG.DATA: None,
        SEG.SRATE: np.nan,
        SEG.MGAP: np.nan,
        SEG.DATAID: None,
        SEG.DSCODE: np.nan,
        SEG.STIME: pd.NaT,
        SEG.ETIME: pd.NaT
    }
    defaultvalues[SEG.DOWNLID] = download_id
    defaultvalues_nodata = dict(defaultvalues)  # copy
    col_dscode, col_data = SEG.DSCODE, SEG.DATA
    toupdate = SEG.DSCODE in segments_df.columns
    code_not_found = s2scodes.seg_not_found
    skipped_same_code = 0
    # report segment errors only once per error type and data center:
    seg_logger = SegmentLogger()
    with get_progressbar(show_progress, length=len(segments_df)) as pbar:

        # store dataframes that got a 413 error, to retry them later:
        skipped_dataframes = []
        for group_ in groupsby:

            if segments_df.empty:  # for safety (if this is the second loop or greater)
                break

            is_last_iteration = group_ == groupsby[-1]
            seg_groups = segments_df.groupby(group_, sort=False)
            for data, exc, code, request, dframe in \
                    get_responses(seg_groups, dc_dataselect_manager, chaid2mseedid,
                                  max_thread_workers, timeout, download_blocksize):

                num_segments = len(dframe)
                if code == 413 and not is_last_iteration and num_segments > 1:
                    skipped_dataframes.append(dframe)
                    continue

                pbar.update(num_segments)
                url = get_host(request)
                url_stats = stats[url]

                if exc is None and data != b'':
                    # set default values on the dataframe:
                    dframe = dframe.assign(**defaultvalues)  # assign returns a copy
                    populate_dataframe(data, code, dframe, chaid2mseedid)
                    # group by download code, count them, and add the counts to stats:
                    for kode, kount in get_counts(dframe, col_dscode,
                                                  code_not_found):
                        url_stats[kode] += kount
                else:
                    # here we are if: exc is not None OR data = b''
                    url_stats[code] += num_segments
                    if toupdate and code is not None and (dframe[col_dscode]
                                                          == code).sum():
                        # If there are rows to update, discard those whose code is
                        # the same as in the database: if we requested a different
                        # time window we should update the time bounds, but there
                        # is no point in that overhead. The condition
                        # `code is not None` should always be True, but we keep it
                        # for safety: the download code column of `dframe` was set
                        # to None/nan to mark segments to update regardless of the
                        # code obtained, on the assumption that we never get a
                        # response code of None (see comment L.94). Thus, if for
                        # some weird reason the response code is None, the segment
                        # is updated anyway (as we wanted to)
                        dframe = dframe[dframe[col_dscode] != code]
                        skipped_same_code += num_segments - len(dframe)
                        if dframe.empty:  # nothing to update on the db
                            continue
                    # update dict of default values, and set it to the dataframe:
                    defaultvalues_nodata.update({
                        col_dscode: code,
                        col_data: data
                    })
                    dframe = dframe.assign(**defaultvalues_nodata)  # assign returns a copy

                    if exc is not None:
                        # log segment errors only once per error type and data center,
                        # otherwise the log is hundreds of Mb and it's unreadable:
                        seg_logger.warn(request, url, code, exc)

                segmanager.add(dframe)

            segmanager.flush()  # flush remaining inserts/updates, if any

            if skipped_dataframes:
                segments_df = pd.concat(skipped_dataframes,
                                        axis=0,
                                        ignore_index=True,
                                        copy=True,
                                        verify_integrity=False)
                skipped_dataframes = []
            else:
                # break the next loop, if any
                segments_df = pd.DataFrame()

    segmanager.close()  # flush remaining stuff to insert / update

    if skipped_same_code:
        logger.warning(
            formatmsg(
                ("%d already saved segment(s) with no waveform data skipped "
                 "with no messages, only their count is reported "
                 "in statistics") % skipped_same_code,
                "Still receiving the same download code"))
    return stats
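
The two-pass retry logic above in miniature: request coarse groups first, then re-group any 413 ("request too large") response at the finest granularity. All names and the fake `send` function are illustrative:

import pandas as pd

def send(dframe):  # stand-in for the real download; fails on large requests
    return 413 if len(dframe) > 2 else 200

segments_df = pd.DataFrame({'dcid': [1, 1, 1, 2], 'chaid': [10, 11, 12, 20]})
for group_cols in (['dcid'], ['dcid', 'chaid']):
    is_last_iteration = group_cols == ['dcid', 'chaid']
    skipped = []
    for _, dframe in segments_df.groupby(group_cols, sort=False):
        if send(dframe) == 413 and not is_last_iteration:
            skipped.append(dframe)  # too large: retry segment by segment
    if not skipped:
        break
    segments_df = pd.concat(skipped, ignore_index=True)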