def get_eidars_response_text(routing_service_url):
    """Return the text of the EIDA routing service response.

    The service is queried with the 'service' and 'format' parameters only.
    If the request raises a `URLException`, or the response body is empty,
    the routes are obtained from `_get_local_routing_service` instead and
    the failure is logged (info + warning).

    :param routing_service_url: the routing service URL, passed to `urljoin`
        together with the query parameters
    :return: the routing text, either fetched from the web or obtained from
        the local fallback
    """
    # IMPORTANT NOTE:
    # We issue a "basic" query to the EIDA rs, with no params other than 'service' and 'format'.
    # The reason is that as of Jan 2019 the service is buggy if supplying some arguments
    # (e.g., with long list of channels).
    # Also, this way we can save a local file (independent from the custom query)
    # and read from that file in case of request failure.
    # The drawback is that we might ask later some data centers for data they do not have:
    # this is an information the routing service would provide us
    # if queried with all parameters (net, sta, start, etcetera) ... too bad
    query_args = {'service': 'dataselect', 'format': 'post'}
    url = urljoin(routing_service_url, **query_args)
    try:
        # the status and message items of the response are not needed here
        responsetext, _, _ = urlread(url, decode='utf8', raise_http_err=True)
        if not responsetext:
            # treat an empty body like any other request failure (handled below)
            raise URLException(Exception("Empty data response"))
    except URLException as urlexc:
        responsetext, last_mod_time_str = _get_local_routing_service()
        msg = ("Eida routing service error, reading routes from file "
               "(last updated: %s)" % last_mod_time_str)
        logger.info(formatmsg(msg, "eida routing service error"))
        logger.warning(formatmsg("Eida routing service error", urlexc.exc, url))
    return responsetext
def save_stations_and_channels(session, channels_df, eidavalidator, update, db_bufsize):
    """Write stations and channels to the database, returning the saved channels.

    Stations are derived from `channels_df` (one row per distinct network,
    station, start time and data center), de-duplicated via
    `drop_station_duplicates` and synchronized with the db. Their ids are then
    copied onto the channels under the column 'station_id'
    (`Channel.station_id`), and the channels are synchronized with the db in
    turn, which yields the channel 'id' column (`Channel.id`).

    :param session: the database session, forwarded to `dbsyncdf`
    :param channels_df: pandas DataFrame resulting from `get_channels_df`
    :param eidavalidator: forwarded to `drop_station_duplicates`
    :param update: update flag forwarded to `dbsyncdf` for the channels. For the
        stations, when truthy it is replaced by the list of shared station
        columns excluding the inventory column
    :param db_bufsize: buffer size forwarded to `dbsyncdf`
    :return: the channels dataframe as returned by `dbsyncdf`
    """
    # one row per station: collapse the channels sharing the same station
    stations = channels_df.drop_duplicates(
        subset=[ST.NET, ST.STA, ST.STIME, ST.DCID]).copy()
    stations = drop_station_duplicates(session, stations, eidavalidator)

    # remember: dbsyncdf raises a FailedDownload, so there is no need to check for
    # empty dataframes. Also, station inventories must NOT be updated here
    # (handled later), hence the inventory column is left out when updating
    if update:
        stations_update = [
            colname
            for colname in shared_colnames(Station, stations, pkey=False)
            if colname != Station.inventory_xml.key
        ]
    else:
        stations_update = update

    stations = dbsyncdf(stations, session,
                        [Station.network, Station.station, Station.start_time],
                        Station.id, stations_update,
                        buf_size=db_bufsize, keep_duplicates=True,
                        cols_to_print_on_err=ST.ERRCOLS)

    # the saved stations now carry their db id: propagate it to the channels...
    channels_df = mergeupdate(channels_df, stations,
                              [ST.NET, ST.STA, ST.STIME, ST.DCID], [ST.ID])
    # ...and store it as 'station_id' before writing the channels to db
    channels_df.rename(columns={ST.ID: CH.STAID}, inplace=True)

    # channels left without a station id belong to discarded stations: warn and drop
    orphan_channels = channels_df[channels_df[CH.STAID].isnull()]
    if not orphan_channels.empty:
        exc_msg = (
            "Found %d duplicated channel(s) to be discarded "
            "as a result of duplicated stations") % len(orphan_channels)
        logger.info(exc_msg)
        # The discarded channels are not printed (redundant with what
        # `drop_station_duplicates` already logs). To print them anyway, e.g.:
        # DbExcLogger(columns_to_print).failed_insert(orphan_channels, Exception(exc_msg))
        channels_df.dropna(axis=0, subset=[CH.STAID], inplace=True)

    # finally, add the channels to db
    return dbsyncdf(channels_df, session,
                    [Channel.station_id, Channel.location, Channel.channel],
                    Channel.id, update,
                    buf_size=db_bufsize, keep_duplicates=True,
                    cols_to_print_on_err=CH.ERRCOLS)
def dolog(ok, notok, okstr, nookstr):
    """Log one summary line for a database write operation.

    Nothing is logged when both counts are falsy.

    NOTE(review): this function reads a name `_header` that is not defined
    here; it must be provided by an enclosing or module scope - confirm.

    :param ok: number of rows successfully written
    :param notok: number of rows not written
    :param okstr: format string receiving `ok` and the word "row"/"rows"
    :param nookstr: format string receiving `notok`, appended only if `notok`
        is truthy
    """
    if not (ok or notok):
        return

    summary = okstr % (ok, "row" if ok == 1 else "rows")
    if notok:
        summary += nookstr % notok
        outcome = "sql errors"
    else:
        outcome = "no sql error"
    logger.info(formatmsg("%s: %s" % (_header, summary), outcome))
def drop_station_duplicates(session, sta_df, eidavalidator):
    """Discard the stations of `sta_df` that are duplicated across data centers.

    Stations sharing network, station and start time are candidates. For each
    candidate the expected data center id is obtained from `eidavalidator` or,
    if that is None, from the stations stored in the database (queried via
    `session`). Candidates whose own data center id differs from the expected
    one are logged and removed.

    :return: `sta_df` itself if nothing had to be removed, otherwise `sta_df`
        without the discarded rows
    """
    # same network, station, starttime but different datacenter.
    # keep=False => mark ALL members of a duplicated group as True
    dupes_mask = sta_df.duplicated(subset=[ST.NET, ST.STA, ST.STIME], keep=False)
    if not dupes_mask.any():
        return sta_df

    # move the original datacenter id to ST.DCID2: ST.DCID will hold the expected one
    candidates = sta_df[dupes_mask].copy()
    candidates.rename(columns={ST.DCID: ST.DCID2}, inplace=True)
    candidates[ST.DCID] = np.nan

    if eidavalidator is None:
        db_stations = dbquery2df(
            session.query(Station.network, Station.station, Station.start_time,
                          Station.datacenter_id))
        # NOTE(review): the return value is not used, which relies on `mergeupdate`
        # updating its first argument in place - confirm
        mergeupdate(candidates, db_stations, [ST.NET, ST.STA, ST.STIME], [ST.DCID])
    else:
        rows = zip(candidates.index, candidates[ST.NET], candidates[ST.STA],
                   candidates[CH.LOC], candidates[CH.CHA],
                   candidates[ST.STIME], candidates[ST.ETIME])
        for idx, net, sta, loc, cha, stime, etime in rows:
            candidates.at[idx, ST.DCID] = eidavalidator.get_dc_id(
                net, sta, loc, cha,
                None if pd.isnull(stime) else stime,
                None if pd.isnull(etime) else etime)

    # the rows to discard: expected datacenter id differs from the original one
    discarded = candidates[candidates[ST.DCID] != candidates[ST.DCID2]]
    if not discarded.empty:
        checked_against = ("already saved stations" if eidavalidator is None
                           else "eida routing service")
        exc_msg = "Found %d duplicated station(s) to be discarded (checked against %s)" % \
            (len(discarded), checked_against)
        logger.info(exc_msg)
        # print the removed rows to log.warning (showing the network, station,
        # start time and original datacenter id columns only):
        db_exc_logger = DbExcLogger([ST.NET, ST.STA, ST.STIME, ST.DCID2])
        db_exc_logger.failed_insert(
            discarded.sort_values(by=[ST.NET, ST.STA, ST.STIME]),
            '',
        )
        # keep only the rows whose index is not among the discarded ones, see
        # https://stackoverflow.com/questions/28901683/pandas-get-rows-which-are-not-in-other-dataframe
        sta_df = sta_df.loc[~sta_df.index.isin(discarded.index)]

    return sta_df
def __init__(self, datacenters_df, authorizer, show_progress=False):
    """Initialize a new DcDataselectManager.

    Sets `self._data` and `self._restricted_id` according to the credentials
    found in `authorizer`: a token (checked first), a (user, password) pair,
    or none of them (open data). Data centers for which credentials could
    not be acquired are logged.

    :param datacenters_df: dataframe of data centers, with the columns
        `DataCenter.id` and `DataCenter.dataselect_url`
    :param authorizer: object exposing the attributes `token` and `userpass`
    :param show_progress: forwarded to `_get_data_from_token`
    """
    DC_ID = DataCenter.id.key  # pylint: disable=invalid-name
    DC_DSURL = DataCenter.dataselect_url.key  # pylint: disable=invalid-name

    # datacenters_df.set_index(keys_col)[values_col].to_dict would be handy, but
    # iterrows lets us convert each data center url to its Fdsnws object.
    # Note: Fdsnws might raise, but at this point datacenters_df is assumed to be
    # well formed
    dcid2fdsn = {
        int(dc_row[DC_ID]): Fdsnws(dc_row[DC_DSURL])
        for _, dc_row in datacenters_df.iterrows()
    }

    # `errors` maps data center ids to the exception raised for them
    if authorizer.token:
        self._data, errors = self._get_data_from_token(
            dcid2fdsn, authorizer.token, show_progress)
        self._restricted_id = [
            dc_id for dc_id in self._data if dc_id not in errors
        ]
    elif authorizer.userpass:
        user, password = authorizer.userpass
        self._data, errors = self._get_data_from_userpass(
            dcid2fdsn, user, password)
        self._restricted_id = list(dcid2fdsn.keys())
    else:
        # no authorization required
        self._data, errors = self._get_data_open(dcid2fdsn)
        self._restricted_id = []

    if not errors:
        return

    # report the errors by data center site rather than by data center id
    site_errors = {
        dcid2fdsn[dc_id].site: err for dc_id, err in errors.items()
    }
    logger.info(
        formatmsg(
            'Downloading open data only from: %s' % ", ".join(site_errors),
            'Unable to acquire credentials for restricted data'))
    for url, exc in site_errors.items():
        logger.warning(
            formatmsg(
                "Downloading open data only, "
                "Unable to acquire credentials for restricted data",
                str(exc), url))
def check_suspiciously_duplicated_segment(segments_df):
    """Log the segments of `segments_df` that look suspiciously duplicated.

    A segment is suspicious when it shares channel id, request start and
    request end with at least another row. Such segments stem from distinct
    events with very close spatio-temporal coordinates. This function only
    logs: `segments_df` is NOT modified.
    """
    # keep=False: flag every member of a duplicated group
    suspicious_mask = segments_df.duplicated(
        subset=[SEG.CHAID, SEG.REQSTIME, SEG.REQETIME], keep=False)
    if not suspicious_mask.any():
        return

    suspicious = segments_df[suspicious_mask]
    logger.info(
        formatmsg(
            "%d suspiciously duplicated segments found: this is most likely\n"
            "due to events with different ids\n"
            "but same (or very close) latitude, longitude, depth and time."
        ), len(suspicious))
    # print the affected rows (at most 100), grouped by their duplicated values
    ordered = suspicious.sort_values(by=[SEG.CHAID, SEG.REQSTIME, SEG.REQETIME])
    logwarn_dataframe(
        ordered,
        "Suspicious duplicated segments",
        [SEG.CHAID, SEG.REQSTIME, SEG.REQETIME, SEG.EVID],
        max_row_count=100)
def dblog(table, inserted, not_inserted, updated=0, not_updated=0):
    """Log the outcome of a database write operation.

    Use this function to harmonize the message format and make it more
    readable in log or terminal.

    :param table: object whose `__tablename__` attribute is shown in the message
    :param inserted: number of rows inserted
    :param not_inserted: number of rows that could not be inserted
    :param updated: number of rows updated
    :param not_updated: number of rows that could not be updated
    """
    _header = "Db table '%s'" % table.__tablename__

    if not any((inserted, not_inserted, updated, not_updated)):
        logger.info("%s: no new row to insert, no row to update", _header)
        return

    # one log line for the insertions and one for the updates, each skipped
    # when both of its counts are zero
    reports = (
        (inserted, not_inserted, "%d new %s inserted", ", %d discarded"),
        (updated, not_updated, "%d %s updated", ", %d discarded"),
    )
    for ok, notok, okstr, nookstr in reports:
        if not ok and not notok:
            continue
        msg = okstr % (ok, "row" if ok == 1 else "rows")
        if notok:
            msg += nookstr % notok
            infomsg = "sql errors"
        else:
            infomsg = "no sql error"
        logger.info(formatmsg("%s: %s" % (_header, msg), infomsg))
def merge_events_stations(events_df, channels_df, search_radius, tttable,
                          show_progress=False):
    """Pair each event with the channels of the stations within its search radius.

    *Each row of the returned data frame is basically a segment to be
    potentially downloaded*. A channel is repeated once for every event it is
    paired with. The returned dataframe has the columns `Segment.channel_id`
    (renamed from `Channel.id`), `Segment.event_id`, `Segment.datacenter_id`,
    `Segment.event_distance_deg` and `Segment.arrival_time` (event time plus
    the travel time computed via `tttable`). Rows whose arrival time could not
    be computed are dropped.

    :param events_df: pandas DataFrame resulting from `get_events_df`
    :param channels_df: pandas DataFrame resulting from `get_channels_df`
    :param search_radius: forwarded to `get_serarch_radia` together with the
        event magnitudes
    :param tttable: callable invoked as `tttable(source_depths, 0, distances)`,
        returning the travel times
    :param show_progress: forwarded to `get_progressbar`
    :raise FailedDownload: if no station lies within the search radia, or if
        all travel times are NaN
    """
    # For convenience and readability, define once the mapped column names
    # representing the dataframe columns that we need:
    EVT_ID = Event.id.key  # pylint: disable=invalid-name
    EVT_MAG = Event.magnitude.key  # pylint: disable=invalid-name
    EVT_LAT = Event.latitude.key  # pylint: disable=invalid-name
    EVT_LON = Event.longitude.key  # pylint: disable=invalid-name
    EVT_TIME = Event.time.key  # pylint: disable=invalid-name
    EVT_DEPTH = Event.depth_km.key  # pylint: disable=invalid-name
    STA_LAT = Station.latitude.key  # pylint: disable=invalid-name
    STA_LON = Station.longitude.key  # pylint: disable=invalid-name
    STA_STIME = Station.start_time.key  # pylint: disable=invalid-name
    STA_ETIME = Station.end_time.key  # pylint: disable=invalid-name
    CHA_ID = Channel.id.key  # pylint: disable=invalid-name
    CHA_STAID = Channel.station_id.key  # pylint: disable=invalid-name
    SEG_EVID = Segment.event_id.key  # pylint: disable=invalid-name
    SEG_EVDIST = Segment.event_distance_deg.key  # pylint: disable=invalid-name
    SEG_ATIME = Segment.arrival_time.key  # pylint: disable=invalid-name
    SEG_DCID = Segment.datacenter_id.key  # pylint: disable=invalid-name
    SEG_CHAID = Segment.channel_id.key  # pylint: disable=invalid-name

    # rename Channel.id into Segment.channel_id now so we do not bother later
    channels_df = channels_df.rename(columns={CHA_ID: SEG_CHAID})
    # unique stations (one row per station id)
    stations_df = channels_df.drop_duplicates(subset=[CHA_STAID]).copy()

    chunks = []  # one dataframe per event with at least one station in range
    sourcedepths, eventtimes = [], []

    with get_progressbar(show_progress, length=len(events_df)) as pbar:
        min_radia, max_radia = get_serarch_radia(search_radius,
                                                 events_df[EVT_MAG].values)
        events = dfrowiter(events_df,
                           [EVT_ID, EVT_LAT, EVT_LON, EVT_TIME, EVT_DEPTH])
        for min_radius, max_radius, evt_dic in zip(min_radia, max_radia, events):
            l2d = locations2degrees(stations_df[STA_LAT], stations_df[STA_LON],
                                    evt_dic[EVT_LAT], evt_dic[EVT_LON])
            # stations started at or before the event time and either have no
            # end time or end at least one day after the event time
            started = stations_df[STA_STIME] <= evt_dic[EVT_TIME]
            not_ended = pd.isnull(stations_df[STA_ETIME]) | \
                (stations_df[STA_ETIME] >= evt_dic[EVT_TIME] + timedelta(days=1))
            condition = started & not_ended
            # l2d is a distance, thus non negative: the min radius condition is
            # relevant only if min_radius is > 0. A None min_radius evaluates to
            # False as well (legacy code)
            if min_radius:
                condition &= (l2d >= min_radius)
            # for max_radius, None means: skip the check
            if max_radius is not None:
                condition &= (l2d <= max_radius)

            pbar.update(1)
            if not np.any(condition):
                continue

            # (Re)set the distances as NaN. This is important because from the
            # second iteration on some elements might be not-NaN but should be
            # NaN now
            channels_df[SEG_EVDIST] = np.nan
            # set the distances (in degrees) on the stations
            stations_df[SEG_EVDIST] = l2d
            # Copy the distances computed on the stations to their channels
            # (matching along the station id column shared by the two
            # dataframes), for the stations within radius only
            # (stations_df[condition]). Station dupes were already dropped
            cha_df = mergeupdate(channels_df, stations_df[condition],
                                 [CHA_STAID], [SEG_EVDIST],
                                 drop_other_df_duplicates=False)
            # drop the channels not related to a station within radius
            cha_df = cha_df.dropna(subset=[SEG_EVDIST], inplace=False).copy()
            cha_df[SEG_EVID] = evt_dic[EVT_ID]  # ...and add "safely" the event id
            # accumulate depths and times (arrival times are computed in one
            # shot at the end, it's faster)
            n_channels = len(cha_df)
            sourcedepths += [evt_dic[EVT_DEPTH]] * n_channels
            eventtimes += [np.datetime64(evt_dic[EVT_TIME])] * n_channels
            # append only the relevant columns
            chunks.append(cha_df[[SEG_CHAID, SEG_EVID, SEG_DCID, SEG_EVDIST]])

    # create the total segments dataframe, after checking that we have data
    if not chunks:
        raise FailedDownload(formatmsg("No segments to process",
                                       "No station within search radia"))
    ret = pd.concat(chunks, axis=0, ignore_index=True, copy=True)

    # compute the travel times. Doing it on a single array is much faster
    sourcedepths = np.array(sourcedepths)
    distances = ret[SEG_EVDIST].values
    traveltimes = tttable(sourcedepths, 0, distances)
    eventtimes = np.array(eventtimes)  # some datetime dtype, e.g. '<M8[us]'
    # eventtimes + traveltimes does not work (np.datetime64 and floats cannot be
    # summed): convert the travel times to np.timedelta. They are multiplied by
    # 1000000 first and then cast to "m8[us]" (8 bytes timedelta with
    # microsecond resolution, 10^-6), so that the sub-second part is preserved.
    # Side note: all numpy timedelta constructors (as well as "astype") round
    # their argument to int, at least in numpy13
    ret[SEG_ATIME] = eventtimes + (traveltimes*1000000).astype("m8[us]")

    # drop the segments whose arrival time is NaT
    oldlen = len(ret)
    ret.dropna(subset=[SEG_ATIME], inplace=True)
    if oldlen > len(ret):
        logger.info(formatmsg("%d of %d segments discarded", "Travel times NaN"),
                    oldlen-len(ret), oldlen)
    if ret.empty:
        raise FailedDownload(formatmsg("No segments to process",
                                       "All travel times NaN"))
    return ret
def get_channels_df(session, datacenters_df, eidavalidator,  # <- can be none
                    net, sta, loc, cha, starttime, endtime,
                    min_sample_rate, update,
                    max_thread_workers, timeout, blocksize, db_bufsize,
                    show_progress=False):
    """Return the dataframe of the channels matching the given parameters.

    The station service of each data center in `datacenters_df` is queried
    (at channel level). The channels fetched from the web are filtered, then
    saved to the database together with their stations. For the data centers
    whose request failed, the channels are fetched from the database instead.

    The returned dataframe has as columns the `key` attribute of the
    following db columns:
    ```
    [Channel.id, Channel.station_id, Station.latitude, Station.longitude,
     Station.datacenter_id, Station.start_time, Station.end_time,
     Station.network, Station.station, Channel.location, Channel.channel]
    ```

    :param session: the database session
    :param datacenters_df: pandas DataFrame of the data centers, with the
        columns `DataCenter.id`, `DataCenter.station_url` and
        `DataCenter.dataselect_url`
    :param eidavalidator: forwarded to `save_stations_and_channels`. Can be
        None
    :param net, sta, loc, cha, starttime, endtime: the selection parameters,
        forwarded to `get_post_data`, `filter_channels_df` and
        `get_channels_df_from_db`
    :param min_sample_rate: forwarded to `filter_channels_df` and
        `get_channels_df_from_db`
    :param update: forwarded to `save_stations_and_channels`
    :param max_thread_workers, timeout, blocksize: forwarded to `read_async`
    :param db_bufsize: forwarded to `save_stations_and_channels`
    :param show_progress: forwarded to `get_progressbar`
    :raise FailedDownload: if no channel could be obtained, neither from the
        web nor from the database
    """
    postdata = get_post_data(net, sta, loc, cha, starttime, endtime)

    ret = []  # the (non-empty) dataframes successfully fetched from the web
    url_failed_dc_ids = []  # ids of the data centers whose request failed

    # pairs of (datacenter id, request), the latter is what `read_async` fetches
    iterable = (
        (id_, Request(url,
                      data=('format=text\nlevel=channel\n' + postdata).encode('utf8')))
        for url, id_ in zip(datacenters_df[DataCenter.station_url.key],
                            datacenters_df[DataCenter.id.key])
    )

    with get_progressbar(show_progress, length=len(datacenters_df)) as pbar:
        for obj, result, exc, url in read_async(iterable,
                                                urlkey=lambda obj: obj[-1],
                                                blocksize=blocksize,
                                                max_workers=max_thread_workers,
                                                decode='utf8', timeout=timeout):
            pbar.update(1)
            dcen_id = obj[0]
            if exc:
                url_failed_dc_ids.append(dcen_id)
                logger.warning(formatmsg("Unable to fetch stations", exc, url))
            else:
                try:
                    dframe = response2normalizeddf(url, result[0], "channel")
                    if not dframe.empty:
                        dframe[Station.datacenter_id.key] = dcen_id
                        ret.append(dframe)
                except ValueError as verr:
                    logger.warning(
                        formatmsg("Discarding response data", verr, url))

    # build two dataframes (channels from db and from web) to be concatenated
    # afterwards
    db_cha_df = pd.DataFrame()
    if url_failed_dc_ids:
        # some data center did not return stations: warn with INFO and fetch
        # their channels from the database
        dc_df_fromdb = \
            datacenters_df.loc[datacenters_df[DataCenter.id.key].isin(url_failed_dc_ids)]
        logger.info(
            formatmsg(
                "Fetching stations from database for %d (of %d) data-center(s)",
                "download errors occurred"),
            len(dc_df_fromdb), len(datacenters_df))
        logger.info(
            dc_df_fromdb[DataCenter.dataselect_url.key].to_string(index=False))
        db_cha_df = get_channels_df_from_db(session, dc_df_fromdb,
                                            net, sta, loc, cha,
                                            starttime, endtime, min_sample_rate)

    web_cha_df = pd.DataFrame()
    if ret:  # pd.concat complains for empty list
        try:
            filtered_df = filter_channels_df(
                pd.concat(ret, axis=0, ignore_index=True, copy=False),
                net, sta, loc, cha, min_sample_rate)
            # this raises FailedDownload if we cannot save any element. Assign
            # `web_cha_df` only on success: on failure it must stay empty, so
            # that unsaved channels are never merged with the database ones
            web_cha_df = save_stations_and_channels(session, filtered_df,
                                                    eidavalidator, update,
                                                    db_bufsize)
        except FailedDownload as qexc:
            if db_cha_df.empty:
                raise
            # we still have the channels from the database: warn and go on
            logger.warning(qexc)

    if db_cha_df.empty and web_cha_df.empty:
        raise FailedDownload(
            formatmsg("No station found",
                      ("Unable to fetch stations from all data-centers, "
                       "no data to fetch from the database. "
                       "Check config and log for details")))

    if db_cha_df.empty:
        ret = web_cha_df
    elif web_cha_df.empty:
        ret = db_cha_df
    else:
        ret = pd.concat((web_cha_df, db_cha_df), axis=0, ignore_index=True,
                        sort=False)

    # the columns of the channels dataframe that will be returned
    return ret[[
        c.key for c in (Channel.id, Channel.station_id,
                        Station.latitude, Station.longitude,
                        Station.datacenter_id,
                        Station.start_time, Station.end_time,
                        Station.network, Station.station,
                        Channel.location, Channel.channel)
    ]].copy()
def prepare_for_download(session, segments_df, dc_dataselect_manager, timespan,
                         retry_seg_not_found, retry_url_err, retry_mseed_err,
                         retry_client_err, retry_server_err, retry_timespan_err,
                         retry_timespan_warn=False):
    """Select the segments of `segments_df` that have to be downloaded.

    The segments already on the database are fetched and merged into
    `segments_df`; those which must not be retried according to the given
    `retry_*` flags are discarded. Note that new columns are set on the
    passed `segments_df`.

    :param session: the sql-alchemy session bound to an existing database
    :param segments_df: pandas DataFrame resulting from `get_arrivaltimes`
    :param dc_dataselect_manager: object whose `opendataonly` attribute tells
        whether open data only is to be downloaded
    :param timespan: forwarded to `set_requested_timebounds`
    :return: the tuple (segments dataframe, value returned by
        `set_requested_timebounds`)
    :raise NothingToDownload: if no segment is left to download
    """
    opendataonly = dc_dataselect_manager.opendataonly
    # Fetch the already downloaded segments. The returned dataframe will have
    # also the boolean column SEG.RETRY, which is True for suspiciously
    # restricted (SR) segments, i.e. segments whose download code MIGHT denote
    # that they are restricted (see `s2scodes.restricted_data`)
    db_seg_df = fetch_already_downloaded_segments_df(session, segments_df,
                                                     opendataonly)

    # Store now the ids of the SR segments, they are needed later. If open
    # data, `db_seg_df` does not have the column SEG.RETRY: use an (empty)
    # DataFrame for consistency
    if opendataonly:
        force_retry_ids = pd.DataFrame()
    else:
        force_retry_ids = db_seg_df[SEG.ID][db_seg_df[SEG.RETRY]]

    # Now update the SEG.RETRY column (or create it) according to the flags set
    set_segments_to_retry(db_seg_df, opendataonly, retry_seg_not_found,
                          retry_url_err, retry_mseed_err, retry_client_err,
                          retry_server_err, retry_timespan_err,
                          retry_timespan_warn)

    # Merge/update `segments_df` with the db values (`db_seg_df`) in two steps:
    # 1) set columns and defaults (for int types, np.nan is set).
    # If there is something to retry (db_seg_df[SEG.RETRY].any()), a column
    # SEG.DSCODE with None/nan as default is added too: the existence of that
    # column tells later whether rows need to be updated or only inserted
    col_defaults = [(SEG.ID, np.nan), (SEG.RETRY, True),
                    (SEG.REQSTIME, pd.NaT), (SEG.REQETIME, pd.NaT)]
    if db_seg_df[SEG.RETRY].any():
        col_defaults.append((SEG.DSCODE, np.nan))
    cols2set = OrderedDict(col_defaults)
    for colname, default_ in cols2set.items():
        segments_df[colname] = default_
    # 2) assign/override the values of those columns from db_seg_df to
    # segments_df, matching rows via the channel id and event id columns
    segments_df = mergeupdate(segments_df, db_seg_df, [SEG.CHAID, SEG.EVID],
                              list(cols2set.keys()))

    request_timebounds_need_update = set_requested_timebounds(segments_df,
                                                              timespan)

    # Keep the segments to retry only. The copy avoids SettingWithCopyWarning;
    # moreover, it should re-allocate contiguous arrays which might be faster
    # (and less memory consuming after unused memory is released)
    oldlen = len(segments_df)
    segments_df = segments_df[segments_df[SEG.RETRY]].copy()
    discarded = oldlen - len(segments_df)
    if discarded != 0:
        reason = "already downloaded, no retry"
        logger.info(formatmsg("%d segments discarded", reason), discarded)

    if segments_df.empty:
        raise NothingToDownload(
            "Nothing to download: all segments already downloaded "
            "according to the current configuration")

    check_suspiciously_duplicated_segment(segments_df)

    # Last step: the policy later will be to UPDATE (=overwrite existing
    # segments on the database) only the segments whose download code changed,
    # because it might save a lot of time. E.g., suppose retry_server_error=true
    # and a segment on the db with download code=500 => update it only if the
    # server returns some code != 500.
    # However, when downloading with credentials, the SR segments which were
    # downloaded with no credentials must be updated anyway, by definition of SR
    # (suspiciously restricted). Thus, if we have those segments
    # (`not force_retry_ids.empty`) and the download is performed on an already
    # existing database (`SEG.DSCODE in segments_df.columns`), their SEG.DSCODE
    # is set to None/nan: as the server never returns a response code = None,
    # those SR segments will always be updated
    if not force_retry_ids.empty and SEG.DSCODE in segments_df.columns:
        is_forced = segments_df[SEG.ID].isin(force_retry_ids)
        segments_df.loc[is_forced, SEG.DSCODE] = np.nan

    segments_df.drop([SEG.RETRY], axis=1, inplace=True)

    return segments_df, request_timebounds_need_update