def get_eidars_response_text(routing_service_url):
    """Returns the routing table (as text) of the EIDA routing service.

    The service at `routing_service_url` is queried with the parameters 'service' and
    'format' only. If the request raises an `URLException`, or the response is empty,
    the routes are read via `_get_local_routing_service` and the failure is logged.

    :param routing_service_url: the url of the EIDA routing service
    :return: the response text, either fetched from the url or read from the local routes
    """
    # IMPORTANT NOTE:
    # The query to the EIDA rs is deliberately "basic", i.e. with no parameter other
    # than 'service' and 'format'. As of Jan 2019 the service is buggy when supplying
    # some arguments (e.g., a long list of channels). A parameter-free query also lets us
    # keep a local file which does not depend on the custom query, to be read in case
    # of request failure.
    # The drawback is that we might later ask some data centers for data they do not
    # have: the routing service would tell us, if queried with all parameters
    # (net, sta, start, etcetera) ... too bad
    url = urljoin(routing_service_url, service='dataselect', format='post')

    try:
        text, _status, _message = urlread(url, decode='utf8', raise_http_err=True)
        if not text:
            # an empty response is handled as any other url error (see below):
            raise URLException(Exception("Empty data response"))
    except URLException as url_error:
        text, last_modified = _get_local_routing_service()
        logger.info(
            formatmsg(
                "Eida routing service error, reading routes from file "
                "(last updated: %s)" % last_modified,
                "eida routing service error"))
        logger.warning(
            formatmsg("Eida routing service error", url_error.exc, url))

    return text
def __init__(self, datacenters_df, authorizer, show_progress=False):
    '''Initializes a new DcDataselectManager.

    Sets `self._data` and `self._restricted_id` according to the credentials found
    in `authorizer`: a token (checked first), a (user, password) pair, or none
    (open data only). Data centers for which credentials could not be acquired
    are logged.

    :param datacenters_df: pandas DataFrame with (at least) the data center id and
        dataselect url columns
    :param authorizer: object exposing the attributes `token` and `userpass`
    :param show_progress: passed to the token based data fetch
    '''
    id_col = DataCenter.id.key
    url_col = DataCenter.dataselect_url.key

    # `datacenters_df.set_index(keys_col)[values_col].to_dict` would be handy here,
    # but rows are iterated because each dc url is converted to its Fdsnws object.
    # Note: Fdsnws might raise, but at this point datacenters_df is assumed to be
    # well formed
    dcid2fdsn = {
        int(row[id_col]): Fdsnws(row[url_col])
        for _, row in datacenters_df.iterrows()
    }

    # In any branch below, `errors` is a dict of data center ids mapped to
    # their exception
    if authorizer.token:
        self._data, errors = self._get_data_from_token(
            dcid2fdsn, authorizer.token, show_progress)
        self._restricted_id = [
            dc_id for dc_id in self._data if dc_id not in errors
        ]
    elif authorizer.userpass:
        username, passwd = authorizer.userpass
        self._data, errors = self._get_data_from_userpass(
            dcid2fdsn, username, passwd)
        self._restricted_id = list(dcid2fdsn.keys())
    else:
        # no authorization required
        self._data, errors = self._get_data_open(dcid2fdsn)
        self._restricted_id = []

    if not errors:
        return

    # report the errors by url site, not by data center id:
    site_errors = {
        dcid2fdsn[dc_id].site: error for dc_id, error in errors.items()
    }
    logger.info(
        formatmsg(
            'Downloading open data only from: %s' % ", ".join(site_errors),
            'Unable to acquire credentials for restricted data'))
    for site, error in site_errors.items():
        logger.warning(
            formatmsg(
                "Downloading open data only, "
                "Unable to acquire credentials for restricted data",
                str(error), site))
def test_formatmsg():
    """Tests `formatmsg` with a Request whose POST data is, in turn: a long string
    (expected to be truncated in the message), a short multi-line string and
    a bytes sequence
    """
    cases = [
        # long string: only the first 200 characters are expected in the message
        (
            'a'*1000,
            ("action (errmsg). url: http://mysite/query, POST data:\n%s\n"
             "...(showing first 200 characters only)") % ('a' * 200)
        ),
        # short string: expected in full, with no trailing newline
        (
            'a\n'*5,
            (("action (errmsg). url: http://mysite/query, POST data:\n%s")
             % ('a\n' * 5)).strip()
        ),
        # bytes: the expected message holds their string representation
        (
            b'a\n'*5,
            ("action (errmsg). url: http://mysite/query, POST data:\n"
             "b'a\\na\\na\\na\\na\\n'").strip()
        ),
    ]
    for post_data, expected in cases:
        request = Request('http://mysite/query', data=post_data)
        assert formatmsg("action", "errmsg", request) == expected
def warn(self, request, exc):
    '''Issues a logger.warning, unless the same error was already reported
    for the same host. The first warning ever issued is preceded by a header
    message

    :param request: the Request object
    :param exc: the reported Exception or string message
    '''
    # err2str is used to uniquely identify `exc`:
    key = (get_host(request), err2str(exc))
    if key in self:
        return  # already reported

    if not self:
        # nothing reported so far, log the header first:
        logger.warning(
            'Detailed inventory download errors '
            '(showing only first of each type per data center):')
    self.add(key)
    logger.warning(
        formatmsg("Inventory download error", exc, url2str(request)))
def warn(self, request, url, code, exc):
    '''Issues a logger.warning, unless an error with the same url, code and
    exception class name was already reported. The first warning ever issued
    is preceded by a header message

    :param request: the Request object
    :param url: string, usually the request's url host, to identify same data centers
    :param code: the error code
    :param exc: the reported Exception
    '''
    key = (url, code, str(exc.__class__.__name__))
    if key in self:
        return  # already reported

    if not self:
        # nothing reported so far, log the header first:
        logger.warning(
            'Detailed segment download errors '
            '(showing only first of each type per data center):')
    self.add(key)
    logger.warning(
        formatmsg("Segment download error, code %s" % str(code), exc,
                  url2str(request)))
def check_suspiciously_duplicated_segment(segments_df):
    '''Logs a message if `segments_df` has suspiciously duplicated segments, i.e.
    rows sharing the same (channel_id, request_start, request_end). According to
    the logged message, these most likely stem from events with different ids but
    same (or very close) spatio-temporal coordinates.
    `segments_df` is NOT modified, and nothing is returned
    '''
    key_cols = (SEG.CHAID, SEG.REQSTIME, SEG.REQETIME)

    # keep=False: flag all rows of any duplicated group, not only the repetitions
    dupes_mask = segments_df.duplicated(subset=list(key_cols), keep=False)
    if not dupes_mask.any():
        return

    dupes_df = segments_df[dupes_mask]
    logger.info(
        formatmsg(
            "%d suspiciously duplicated segments found: this is most likely\n"
            "due to events with different ids\n"
            "but same (or very close) latitude, longitude, depth and time."
        ),
        len(dupes_df))
    logwarn_dataframe(
        dupes_df.sort_values(by=list(key_cols)),
        "Suspicious duplicated segments",
        list(key_cols) + [SEG.EVID],
        max_row_count=100)
def get_datacenters_df(session, service, routing_service_url,
                       network, station, location, channel,
                       starttime=None, endtime=None,
                       db_bufsize=None):
    """Returns a 2 elements tuple: the dataframe of the datacenter(s) matching `service`
    (after syncing it with the database), and an EidaValidator built on the eida routing
    service response. The second element is None if `service` is not 'eida'.

    WARNING: the parameters network, station, location, channel, starttime and endtime
    are NOT used in this function. Due to bugs in the eida routing service they are
    here for legacy code and potential future development only; in any case, they
    would matter only if service = 'eida'

    :param service: the string denoting the dataselect *or* station url in fdsn format,
        or 'eida', or 'iris' (case insensitive). In case of 'eida', `routing_service_url`
        must denote an url for the eida routing service. If falsy (e.g., empty string
        or None), `service` defaults to 'eida'
    :param db_bufsize: the buffer size passed to the database sync. If None, the number
        of datacenters found is used

    :raise: FailedDownload if `service` is a custom url raising ValueError when parsed
        as fdsn url
    """
    # Names of the dataframe columns needed below:
    station_url_col = DataCenter.station_url.key
    dataselect_url_col = DataCenter.dataselect_url.key
    organization_col = DataCenter.organization_name.key

    eidars_response_text = None
    service = service or 'eida'
    service_lcase = service.lower()

    if service_lcase == 'eida':
        eidars_response_text = get_eidars_response_text(routing_service_url)
        dc_df = get_eida_datacenters_df(eidars_response_text)
    elif service_lcase == 'iris':
        iris_netloc = 'https://service.iris.edu'
        dc_df = pd.DataFrame(
            data={
                dataselect_url_col: '%s/fdsnws/dataselect/1/query' % iris_netloc,
                station_url_col: '%s/fdsnws/station/1/query' % iris_netloc,
                organization_col: 'iris'
            },
            index=[0])
    else:
        # custom url, to be validated:
        try:
            fdsn = Fdsnws(service)
            dc_df = pd.DataFrame(
                data={
                    dataselect_url_col: fdsn.url(Fdsnws.DATASEL),
                    station_url_col: fdsn.url(Fdsnws.STATION),
                    organization_col: None
                },
                index=[0])
        except ValueError:
            raise FailedDownload(
                formatmsg("Unable to use datacenter",
                          "Url does not seem to be a valid fdsn url", service))

    # sync with the db (datacenters are identified by their station url):
    buf_size = len(dc_df) if db_bufsize is None else db_bufsize
    dc_df = dbsyncdf(dc_df, session, [DataCenter.station_url], DataCenter.id,
                     buf_size=buf_size, keep_duplicates='first')

    if eidars_response_text is None:
        eidavalidator = None
    else:
        eidavalidator = EidaValidator(dc_df, eidars_response_text)
    return dc_df, eidavalidator
def merge_events_stations(events_df, channels_df, search_radius, tttable,
                          show_progress=False):
    """Merges `events_df` and `channels_df` by returning a new dataframe representing all
    channels within a specific search radius of each event.
    *Each row of the returned data frame is basically a segment to be potentially
    downloaded*.
    The returned dataframe has the columns `Segment.channel_id`, `Segment.event_id`,
    `Segment.datacenter_id`, `Segment.event_distance_deg` and `Segment.arrival_time`:
    a channel is repeated for each event whose search radius includes the channel
    station (and whose time is within the station operational time). Rows whose
    arrival time could not be computed are dropped

    :param events_df: pandas DataFrame resulting from `get_events_df`
    :param channels_df: pandas DataFrame resulting from `get_channels_df`
    :param search_radius: passed, with the event magnitudes, to `get_serarch_radia`
        to get the minimum and maximum search radius of each event
    :param tttable: callable returning the travel times, called as
        `tttable(source_depths, 0, distances)`
    :param show_progress: whether to show a progress bar

    :raise: FailedDownload if no station is within the search radius of any event,
        or all computed arrival times are NaT
    """
    # Names of the dataframe columns needed below:
    evt_id_col = Event.id.key
    evt_mag_col = Event.magnitude.key
    evt_lat_col = Event.latitude.key
    evt_lon_col = Event.longitude.key
    evt_time_col = Event.time.key
    evt_depth_col = Event.depth_km.key
    sta_lat_col = Station.latitude.key
    sta_lon_col = Station.longitude.key
    sta_stime_col = Station.start_time.key
    sta_etime_col = Station.end_time.key
    cha_id_col = Channel.id.key
    cha_staid_col = Channel.station_id.key
    seg_evid_col = Segment.event_id.key
    seg_evdist_col = Segment.event_distance_deg.key
    seg_atime_col = Segment.arrival_time.key
    seg_dcid_col = Segment.datacenter_id.key
    seg_chaid_col = Segment.channel_id.key

    # rename Channel.id into Segment.channel_id now so we do not bother later,
    # then get the unique stations:
    channels_df = channels_df.rename(columns={cha_id_col: seg_chaid_col})
    stations_df = channels_df.drop_duplicates(subset=[cha_staid_col]).copy()

    chunks = []
    sourcedepths, eventtimes = [], []
    evt_cols = [evt_id_col, evt_lat_col, evt_lon_col, evt_time_col, evt_depth_col]

    with get_progressbar(show_progress, length=len(events_df)) as pbar:
        min_radia, max_radia = get_serarch_radia(search_radius,
                                                 events_df[evt_mag_col].values)
        for min_radius, max_radius, event in zip(min_radia, max_radia,
                                                 dfrowiter(events_df, evt_cols)):
            distances_deg = locations2degrees(stations_df[sta_lat_col],
                                              stations_df[sta_lon_col],
                                              event[evt_lat_col],
                                              event[evt_lon_col])
            # the station must be operating at the event time:
            selected = (stations_df[sta_stime_col] <= event[evt_time_col]) & \
                (pd.isnull(stations_df[sta_etime_col]) |
                 (stations_df[sta_etime_col] >=
                  event[evt_time_col] + timedelta(days=1)))
            # distances are non negative, so the min radius condition is needed
            # only if min_radius > 0. The check below is False also when
            # min_radius is None (legacy code):
            if min_radius:
                selected &= (distances_deg >= min_radius)
            # a max_radius of None means: no upper bound
            if max_radius is not None:
                selected &= (distances_deg <= max_radius)

            pbar.update(1)
            if not np.any(selected):
                continue

            # Reset the distances on the channels to NaN: from the second iteration
            # on, they still hold the values set for the previous event
            channels_df[seg_evdist_col] = np.nan
            stations_df[seg_evdist_col] = distances_deg
            # Copy the distances of the selected stations to their channels (rows are
            # matched via the station id column, shared by the two dataframes).
            # Stations duplicates were already dropped above:
            evt_channels_df = mergeupdate(channels_df, stations_df[selected],
                                          [cha_staid_col], [seg_evdist_col],
                                          drop_other_df_duplicates=False)
            # channels with no distance set belong to stations outside the radius:
            evt_channels_df = evt_channels_df.dropna(subset=[seg_evdist_col],
                                                     inplace=False).copy()
            evt_channels_df[seg_evid_col] = event[evt_id_col]
            # store event depths and times, arrival times will be calculated
            # in one shot at the end (it's faster):
            num_channels = len(evt_channels_df)
            sourcedepths.extend([event[evt_depth_col]] * num_channels)
            eventtimes.extend([np.datetime64(event[evt_time_col])] * num_channels)
            # keep the relevant columns only:
            chunks.append(evt_channels_df[[seg_chaid_col, seg_evid_col,
                                           seg_dcid_col, seg_evdist_col]])

    if not chunks:
        raise FailedDownload(formatmsg("No segments to process",
                                       "No station within search radia"))

    # build the total segments dataframe:
    ret = pd.concat(chunks, axis=0, ignore_index=True, copy=True)

    # compute the travel times. Doing it on a single array is much faster
    sourcedepths = np.array(sourcedepths)
    traveltimes = tttable(sourcedepths, 0, ret[seg_evdist_col].values)
    eventtimes = np.array(eventtimes)  # array of some numpy datetime dtype
    # Arrival times = event times + travel times. The sum of np.datetime64 and
    # float does not work, so travel times are first converted to timedelta:
    # they are multiplied by 1000000 and casted to "m8[us]" (timedelta with
    # microsecond resolution), presumably because they are in seconds.
    # Side note: all numpy timedelta constructors (as well as "astype") round
    # their argument to int (at least in numpy13)
    ret[seg_atime_col] = eventtimes + (traveltimes*1000000).astype("m8[us]")

    # drop segments with no arrival time (NaT):
    oldlen = len(ret)
    ret.dropna(subset=[seg_atime_col], inplace=True)
    if oldlen > len(ret):
        logger.info(formatmsg("%d of %d segments discarded", "Travel times NaN"),
                    oldlen-len(ret), oldlen)
    if ret.empty:
        raise FailedDownload(formatmsg("No segments to process",
                                       "All travel times NaN"))
    return ret
def get_channels_df(session, datacenters_df, eidavalidator,  # <- can be none
                    net, sta, loc, cha, starttime, endtime,
                    min_sample_rate, update,
                    max_thread_workers, timeout, blocksize, db_bufsize,
                    show_progress=False):
    """Returns a dataframe of the channels fetched from the station url of each
    datacenter in `datacenters_df`. Fetched channels are filtered and saved to the
    database; for datacenters whose request failed, channels are fetched from the
    database. The returned dataframe has as columns the `key` attribute of the
    following db columns:
    ```
    [Channel.id, Channel.station_id, Station.latitude, Station.longitude,
     Station.datacenter_id, Station.start_time, Station.end_time, Station.network,
     Station.station, Channel.location, Channel.channel]
    ```

    :param datacenters_df: the first item resulting from `get_datacenters_df`
        (pandas DataFrame)
    :param eidavalidator: the second item resulting from `get_datacenters_df`,
        can be None
    :param net, sta, loc, cha, starttime, endtime: used to build the POST data of
        each request, and to select the channels (fetched or from the database)
    :param min_sample_rate: minimum sampling rate of the channels to select

    :raise: FailedDownload if no channel is found, neither from the web
        nor from the database
    """
    postdata = get_post_data(net, sta, loc, cha, starttime, endtime)

    # the requests are built lazily, each one paired with its datacenter id:
    url_id_pairs = zip(datacenters_df[DataCenter.station_url.key],
                       datacenters_df[DataCenter.id.key])
    requests = (
        (dc_id,
         Request(dc_url,
                 data=('format=text\nlevel=channel\n' + postdata).encode('utf8')))
        for dc_url, dc_id in url_id_pairs
    )

    fetched_dfs = []
    failed_dc_ids = []
    with get_progressbar(show_progress, length=len(datacenters_df)) as pbar:
        responses = read_async(requests, urlkey=lambda item: item[-1],
                               blocksize=blocksize,
                               max_workers=max_thread_workers,
                               decode='utf8', timeout=timeout)
        for obj, result, exc, url in responses:
            pbar.update(1)
            dcen_id = obj[0]
            if exc:
                failed_dc_ids.append(dcen_id)
                logger.warning(formatmsg("Unable to fetch stations", exc, url))
                continue
            try:
                dframe = response2normalizeddf(url, result[0], "channel")
                if not dframe.empty:
                    dframe[Station.datacenter_id.key] = dcen_id
                    fetched_dfs.append(dframe)
            except ValueError as verr:
                logger.warning(formatmsg("Discarding response data", verr, url))

    # Two dataframes are built (channels from the db and from the web)
    # and concatenated afterwards.
    # Channels from the db, for datacenters whose request failed:
    db_cha_df = pd.DataFrame()
    if failed_dc_ids:
        failed_mask = datacenters_df[DataCenter.id.key].isin(failed_dc_ids)
        dc_df_fromdb = datacenters_df.loc[failed_mask]
        logger.info(
            formatmsg(
                "Fetching stations from database for %d (of %d) data-center(s)",
                "download errors occurred"),
            len(dc_df_fromdb), len(datacenters_df))
        logger.info(
            dc_df_fromdb[DataCenter.dataselect_url.key].to_string(index=False))
        db_cha_df = get_channels_df_from_db(session, dc_df_fromdb,
                                            net, sta, loc, cha,
                                            starttime, endtime, min_sample_rate)

    # Channels from the web:
    web_cha_df = pd.DataFrame()
    if fetched_dfs:  # pd.concat complains for empty list
        try:
            web_cha_df = filter_channels_df(
                pd.concat(fetched_dfs, axis=0, ignore_index=True, copy=False),
                net, sta, loc, cha, min_sample_rate)
            # this raises FailedDownload if we cannot save any element:
            web_cha_df = save_stations_and_channels(session, web_cha_df,
                                                    eidavalidator, update,
                                                    db_bufsize)
        except FailedDownload as qexc:
            # go on only if we have something from the db:
            if db_cha_df.empty:
                raise
            logger.warning(qexc)

    if db_cha_df.empty and web_cha_df.empty:
        raise FailedDownload(
            formatmsg("No station found",
                      ("Unable to fetch stations from all data-centers, "
                       "no data to fetch from the database. "
                       "Check config and log for details")))

    if db_cha_df.empty:
        channels_df = web_cha_df
    elif web_cha_df.empty:
        channels_df = db_cha_df
    else:
        channels_df = pd.concat((web_cha_df, db_cha_df), axis=0,
                                ignore_index=True, sort=False)

    # the columns of the channels dataframe to be returned:
    columns = [
        col.key for col in (Channel.id, Channel.station_id, Station.latitude,
                            Station.longitude, Station.datacenter_id,
                            Station.start_time, Station.end_time,
                            Station.network, Station.station,
                            Channel.location, Channel.channel)
    ]
    return channels_df[columns].copy()
def prepare_for_download(session, segments_df, dc_dataselect_manager, timespan,
                         retry_seg_not_found, retry_url_err, retry_mseed_err,
                         retry_client_err, retry_server_err, retry_timespan_err,
                         retry_timespan_warn=False):
    """Drops the segments which are already present on the database and do not need
    to be retried according to the given flags, and sets on the remaining segments
    the values stored on the database, if any. New columns are added to the returned
    Data frame

    :param session: the sql-alchemy session bound to an existing database
    :param segments_df: pandas DataFrame resulting from `get_arrivaltimes`
    :param dc_dataselect_manager: object whose attribute `opendataonly` is read
    :param timespan: passed to `set_requested_timebounds`
    :param retry_seg_not_found, retry_url_err, retry_mseed_err, retry_client_err,
        retry_server_err, retry_timespan_err, retry_timespan_warn: flags passed to
        `set_segments_to_retry`

    :return: the tuple (segments_df, request_timebounds_need_update), where the
        second element is the value returned by `set_requested_timebounds`

    :raise: NothingToDownload if there is no segment left to download
    """
    opendataonly = dc_dataselect_manager.opendataonly
    # Fetch the already downloaded segments. The returned dataframe will have also
    # the boolean column SEG.RETRY, which is True for suspiciously restricted (SR)
    # segments, i.e. segments whose download code MIGHT denote that they are
    # restricted (see `s2scodes.restricted_data`):
    db_seg_df = fetch_already_downloaded_segments_df(session, segments_df,
                                                     opendataonly)
    # Store now the ids of the SR segments, to be used later. If open data,
    # `db_seg_df` does not have the column SEG.RETRY, so for consistency
    # set the ids to an (empty) DataFrame:
    if opendataonly:
        force_retry_ids = pd.DataFrame()
    else:
        force_retry_ids = db_seg_df[SEG.ID][db_seg_df[SEG.RETRY]]

    # Update the SEG.RETRY column (or create it) according to the flags set:
    set_segments_to_retry(db_seg_df, opendataonly, retry_seg_not_found,
                          retry_url_err, retry_mseed_err, retry_client_err,
                          retry_server_err, retry_timespan_err,
                          retry_timespan_warn)

    # Merge/update `segments_df` with the db values (`db_seg_df`), in two steps:
    # 1) set columns and defaults (for int types, np.nan is set). If there is
    # something to retry, the column SEG.DSCODE is added too, with None/nan
    # as default: checking if that column exists will be the way later to know
    # if we need to update rows or only insert new rows.
    col_defaults = [(SEG.ID, np.nan),
                    (SEG.RETRY, True),
                    (SEG.REQSTIME, pd.NaT),
                    (SEG.REQETIME, pd.NaT)]
    if db_seg_df[SEG.RETRY].any():
        col_defaults.append((SEG.DSCODE, np.nan))
    cols2set = OrderedDict(col_defaults)
    for colname, default_ in cols2set.items():
        segments_df[colname] = default_
    # 2) override those values in `segments_df` with those of `db_seg_df`,
    # matching rows via the [SEG.CHAID, SEG.EVID] columns:
    segments_df = mergeupdate(segments_df, db_seg_df, [SEG.CHAID, SEG.EVID],
                              list(cols2set))

    request_timebounds_need_update = set_requested_timebounds(segments_df,
                                                              timespan)

    # Keep only the segments to retry. Do a copy to avoid SettingWithCopyWarning.
    # Moreover, a copy should re-allocate contiguous arrays which might be faster
    # (and less memory consuming after unused memory is released)
    oldlen = len(segments_df)
    segments_df = segments_df[segments_df[SEG.RETRY]].copy()
    discarded = oldlen - len(segments_df)
    if discarded != 0:
        logger.info(formatmsg("%d segments discarded",
                              "already downloaded, no retry"),
                    discarded)

    if segments_df.empty:
        raise NothingToDownload(
            "Nothing to download: all segments already downloaded "
            "according to the current configuration")

    check_suspiciously_duplicated_segment(segments_df)

    # Last step: the policy later will be to UPDATE (=overwrite existing segments
    # on the database) only segments whose download code changed, because it might
    # save a lot of time. E.g., suppose retry_server_error=true and a segment on
    # the db with download code=500 => update it only if the server returns some
    # code != 500. However, if we are downloading with credentials, we need to
    # force updating SR segments which were downloaded with no credentials, by
    # definition of SR (suspiciously restricted).
    # Thus, if we have those segments (`not force_retry_ids.empty`) and we are
    # performing a download on an already existing database
    # (`SEG.DSCODE in segments_df.columns`), for those SR segments the value of
    # the column `SEG.DSCODE` is set to None/nan: as we will never get any
    # response code = None from the server, those SR segments will always be updated
    if not force_retry_ids.empty and SEG.DSCODE in segments_df.columns:
        is_forced = segments_df[SEG.ID].isin(force_retry_ids)
        segments_df.loc[is_forced, SEG.DSCODE] = np.nan

    segments_df.drop([SEG.RETRY], axis=1, inplace=True)
    return segments_df, request_timebounds_need_update
def download_save_segments(session, segments_df, dc_dataselect_manager,
                           chaid2mseedid, download_id, update_datacenters,
                           update_request_timebounds, max_thread_workers,
                           timeout, download_blocksize, db_bufsize,
                           show_progress=False):
    """Downloads and saves the segments. segments_df MUST not be empty (this is not
    checked for)

    :param segments_df: the dataframe resulting from `prepare_for_download`. The
        Dataframe might or might not have the column 'download_code'. If it has, it
        will skip writing to db segments whose code did not change: in this case, nans
        stored under 'download_code' in segments_df indicate new segments, or
        segments for which the update has to be forced, whatever code is obtained
        (e.g., queryauth when previously a simple query was used)
    :param chaid2mseedid: dict of channel ids (int) mapped to mseed ids
        (strings in "Network.station.location.channel" format)
    :param download_id: the value set in the download id column of all segments

    :return: the `DownloadStats` object with the counts of the download codes
        received, per data center host
    """
    # set the queryauth column here, outside the loop:
    restricted_dcids = dc_dataselect_manager.restricted_enabled_ids
    if restricted_dcids:
        segments_df[SEG.QAUTH] = segments_df[SEG.DCID].isin(restricted_dcids)
    else:
        segments_df[SEG.QAUTH] = False

    segmanager = get_dbmanager(session, update_datacenters,
                               update_request_timebounds, db_bufsize)
    stats = DownloadStats()

    # Define the columns to group by. Ideally, we should first group by
    # (net, sta, loc, stime, etime), so that rows with those values equal are given
    # in the same sub-dataframe, and in case of a 413 error put the sub-dataframe
    # aside and then retry it segment by segment, i.e. grouping by
    # (net, sta, loc, cha, stime, etime).
    # Unfortunately, for performance reasons we do not have the first 4 columns,
    # but we do have the channel id, which basically comprises (net, sta, loc, cha)
    # NOTE: SEG.START and SEG.END MUST ALWAYS BE IN THE SECOND AND THIRD POSITION!
    groupsby = [[SEG.DCID, SEG.START, SEG.END],
                [SEG.DCID, SEG.START, SEG.END, SEG.CHAID]]

    # Columns to be set on a dataframe from a received response, mapped to their
    # default value. nan lets pandas understand that the column is numeric. None is
    # for string types (how it is converted should be checked). For numpy types, see
    # https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html#specifying-and-constructing-data-types
    defaultvalues = {
        SEG.DATA: None,
        SEG.SRATE: np.nan,
        SEG.MGAP: np.nan,
        SEG.DATAID: None,
        SEG.DSCODE: np.nan,
        SEG.STIME: pd.NaT,
        SEG.ETIME: pd.NaT,
        SEG.DOWNLID: download_id
    }
    # defaults for responses with no data (code and data are set in the loop):
    defaultvalues_nodata = dict(defaultvalues)

    toupdate = SEG.DSCODE in segments_df.columns
    code_not_found = s2scodes.seg_not_found
    skipped_same_code = 0
    # report segment errors only once per error type and data center:
    seg_logger = SegmentLogger()

    with get_progressbar(show_progress, length=len(segments_df)) as pbar:
        last_index = len(groupsby) - 1
        for index, group_cols in enumerate(groupsby):
            if segments_df.empty:
                # for safety (if this is the second loop or greater)
                break

            is_last_iteration = index == last_index
            # sub-dataframes with a 413 error, to be retried in the next iteration:
            skipped_dataframes = []
            seg_groups = segments_df.groupby(group_cols, sort=False)

            for data, exc, code, request, dframe in \
                    get_responses(seg_groups, dc_dataselect_manager,
                                  chaid2mseedid, max_thread_workers, timeout,
                                  download_blocksize):

                num_segments = len(dframe)
                if code == 413 and not is_last_iteration and num_segments > 1:
                    skipped_dataframes.append(dframe)
                    continue

                pbar.update(num_segments)
                url = get_host(request)
                url_stats = stats[url]

                if exc is None and data != b'':
                    # set the default values (assign returns a copy) and then
                    # the values from the response data:
                    dframe = dframe.assign(**defaultvalues)
                    populate_dataframe(data, code, dframe, chaid2mseedid)
                    # add to the stats the counts of each download code:
                    for dcode, count in get_counts(dframe, SEG.DSCODE,
                                                   code_not_found):
                        url_stats[dcode] += count
                else:
                    # here we are if: exc is not None OR data = b''
                    url_stats[code] += num_segments

                    if toupdate and code is not None and \
                            (dframe[SEG.DSCODE] == code).sum():
                        # There are rows with the same code as in the database:
                        # discard them. If we requested a different time window,
                        # we should update the time windows but there is no point
                        # in this overhead. A response code of None should never
                        # happen: the check is there for safety, because the
                        # download code of the segments to be updated regardless
                        # is None/nan in `dframe`, on the assumption that we
                        # never get a response code = None. Thus, if for some
                        # weird reason the response code is None, the segments
                        # are updated anyway (as we wanted to)
                        dframe = dframe[dframe[SEG.DSCODE] != code]
                        skipped_same_code += num_segments - len(dframe)
                        if dframe.empty:  # nothing to update on the db
                            continue

                    # set the default values with the current code and data
                    # (assign returns a copy):
                    defaultvalues_nodata[SEG.DSCODE] = code
                    defaultvalues_nodata[SEG.DATA] = data
                    dframe = dframe.assign(**defaultvalues_nodata)
                    if exc is not None:
                        # log segment errors only once per error type and data
                        # center, otherwise the log is hundreds of Mb and
                        # it's unreadable:
                        seg_logger.warn(request, url, code, exc)

                segmanager.add(dframe)

            # flush remaining stuff to insert / update, if any:
            segmanager.flush()

            if skipped_dataframes:
                segments_df = pd.concat(skipped_dataframes, axis=0,
                                        ignore_index=True, copy=True,
                                        verify_integrity=False)
            else:
                # break the next loop, if any
                segments_df = pd.DataFrame()

    segmanager.close()  # flush remaining stuff to insert / update

    if skipped_same_code:
        logger.warning(
            formatmsg(
                ("%d already saved segment(s) with no waveform data skipped "
                 "with no messages, only their count is reported "
                 "in statistics") % skipped_same_code,
                "Still receiving the same download code"))

    return stats