def _check_missing_data(df):
    """There should be no missing data in the required columns."""
    # verify every required column exists before inspecting values
    required = set(BULK_WAVEFORM_COLUMNS)
    current = set(df.columns)
    if not required.issubset(current):
        msg = f"Dataframe is missing the following columns: {required - current}"
        raise DataFrameContentError(msg)
    # find required columns that contain any null values
    null_flags = df[list(BULK_WAVEFORM_COLUMNS)].isnull().any()
    bad_columns = null_flags[null_flags].index
    if not bad_columns.empty:
        msg = f"dataframe is missing values in columns: {list(bad_columns)}"
        raise DataFrameContentError(msg)
    return df
def order_columns(
    df: pd.DataFrame, required_columns: Sequence, drop_columns=False, fill_missing=True
):
    """
    Order a dataframe's columns and ensure it has required columns.

    Parameters
    ----------
    df
        The input dataframe.
    required_columns
        A sequence that contains the column names.
    drop_columns
        If True drop columns not in required_columns.
    fill_missing
        If True, create missing required columns and fill with nullish values.

    Returns
    -------
    pd.DataFrame
    """
    present = set(df.columns)
    needed = set(required_columns)
    absent = needed - present
    # missing required columns are an error unless we are allowed to create them
    if absent and not fill_missing:
        msg = f"dataframe is missing required columns: {absent}"
        raise DataFrameContentError(msg)
    # extra columns (sorted by name) are kept unless drop_columns is set
    extras = [] if drop_columns else sorted(present - needed, key=lambda x: str(x))
    # reindex both orders the columns and creates any missing ones (NaN-filled)
    return df.reindex(columns=list(required_columns) + extras)
def _check_starttime_endtime(df): """Ensure all starttimes are less than endtimes.""" # starttimes must be <= endtime invalid_time_range = df["starttime"] >= df["endtime"] if invalid_time_range.any(): msg = "all values in starttime must be <= endtime" raise DataFrameContentError(msg) return df
def _check_nslc_codes(df):
    """Ensure there are no wildcards in NSLC columns."""
    for column in NSLC:
        # literal (non-regex) search for fnmatch-style wildcard characters
        accessor = df[column].str
        has_wildcard = (
            accessor.contains("?", regex=False).any()
            or accessor.contains("*", regex=False).any()
        )
        if has_wildcard:
            msg = f"columns {NSLC} cannot contain * or ?, column {column} does"
            raise DataFrameContentError(msg)
    return df
def _validate_dataframe(self, df) -> pd.DataFrame:
    """
    Ensure all the parameters of the dataframe are reasonable.

    Selects/converts the location columns, de-duplicates the index, and
    sanity-checks latitude/longitude ranges.

    Raises
    ------
    DataFrameContentError
        If latitude is outside [-90, 90] or longitude outside [-180, 180].
    """
    # first cull out columns that aren't needed and de-dup index
    if ("depth" in df.columns) and ("elevation" not in df.columns):
        # Make sure that the df has an elevation column (elevation = -depth)
        out = df[list(ALT_DISTANCE_COLUMN_DTYPES)].astype(
            ALT_DISTANCE_COLUMN_DTYPES)
        out["elevation"] = -1 * out["depth"]
        out.drop("depth", inplace=True, axis=1)
        # BUG FIX: previously the return value of _de_duplicate_df_index was
        # discarded here (unlike the .pipe() usage below); keep the result.
        out = self._de_duplicate_df_index(out)
    else:
        out = (df[list(LOCATION_DTYPE)].astype(LOCATION_DTYPE).pipe(
            self._de_duplicate_df_index))
    # sanity checks on lat/lon
    lat_valid = abs(df["latitude"]) <= 90.0
    lons_valid = abs(df["longitude"]) <= 180.0
    if not (lat_valid.all() & lons_valid.all()):
        msg = f"invalid lat/lon values found in {df}"
        raise DataFrameContentError(msg)
    return out
def _get_dataframe(self, obj) -> pd.DataFrame:
    """
    Return a dataframe with latitude, longitude, elevation, and id.
    """
    location_cols = list(LOCATION_DTYPE)
    alt_cols = list(ALT_DISTANCE_COLUMN_DTYPES)
    # dataframes are validated directly; they must carry one column set
    if isinstance(obj, pd.DataFrame):
        has_location = set(location_cols).issubset(obj.columns)
        has_alt = set(alt_cols).issubset(obj.columns)
        if not (has_location or has_alt):
            raise DataFrameContentError(
                "SpatialCalculator input dataframe must have the following "
                f"columns: {location_cols} or {alt_cols}")
        return self._validate_dataframe(obj)
    # otherwise try extractors in order: events, stations, generic sequences
    for extractor in (self._df_from_events, self._df_from_stations):
        try:
            df = extractor(obj)
            break
        except self.expected_exceptions:
            continue
    else:
        # last resort; any exception here propagates to the caller
        df = self._df_from_sequences(obj)
    return self._validate_dataframe(df)