Code Example #1
File: pd.py  Project: niosh-mining/obsplus
def _check_missing_data(df):
    """There should be no missing data in the required columns."""
    # first check that all required columns exist
    if not set(BULK_WAVEFORM_COLUMNS).issubset(set(df.columns)):
        missing_cols = set(BULK_WAVEFORM_COLUMNS) - set(df.columns)
        msg = f"dataframe is missing the following columns: {missing_cols}"
        raise DataFrameContentError(msg)
    # then check that no required column contains null values
    missing_data = df[list(BULK_WAVEFORM_COLUMNS)].isnull().any()
    no_data_cols = missing_data[missing_data].index
    if not no_data_cols.empty:
        msg = f"dataframe is missing values in columns: {list(no_data_cols)}"
        raise DataFrameContentError(msg)
    return df
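To see the check in action, here is a minimal sketch. BULK_WAVEFORM_COLUMNS and DataFrameContentError are defined elsewhere in obsplus, so stand-in definitions are assumed below:

import pandas as pd

# stand-ins for the obsplus names used above (assumed for illustration only)
BULK_WAVEFORM_COLUMNS = ("network", "station", "location", "channel", "starttime", "endtime")

class DataFrameContentError(ValueError):
    """Raised when a dataframe's contents fail validation."""

df = pd.DataFrame(
    {
        "network": ["UU"],
        "station": ["TMU"],
        "location": [""],
        "channel": ["HHZ"],
        "starttime": [pd.Timestamp("2020-01-01")],
        "endtime": [pd.NaT],  # a null in a required column
    }
)
_check_missing_data(df)  # raises DataFrameContentError mentioning ['endtime']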
Code Example #2
from typing import Sequence

import pandas as pd


def order_columns(
    df: pd.DataFrame, required_columns: Sequence, drop_columns=False, fill_missing=True
):
    """
    Order a dataframe's columns and ensure it has required columns.

    Parameters
    ----------
    df
        The input dataframe.
    required_columns
        A sequence of required column names, in the desired order.
    drop_columns
        If True drop columns not in required_columns.
    fill_missing
        If True, create missing required columns and fill with nullish values.

    Returns
    -------
    pd.DataFrame
    """
    # make sure required columns are there
    column_set = set(df.columns)
    missing_cols = set(required_columns) - column_set
    extra_cols = sorted(column_set - set(required_columns), key=lambda x: str(x))
    if drop_columns:  # don't include extras if drop_columns is set
        extra_cols = []
    # raise a DataFrameContentError if required columns are not there
    if missing_cols and not fill_missing:
        msg = f"dataframe is missing required columns: {missing_cols}"
        raise DataFrameContentError(msg)
    new_cols = list(required_columns) + extra_cols
    # add any extra (blank) columns if needed and sort
    df = df.reindex(columns=new_cols)
    return df
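A quick sketch of the three modes, assuming the imports above and the DataFrameContentError stand-in from Example #1:

df = pd.DataFrame({"b": [1], "extra": [2]})

out = order_columns(df, required_columns=("a", "b"))
print(list(out.columns))  # ['a', 'b', 'extra']; 'a' is created and filled with NaN

out = order_columns(df, required_columns=("a", "b"), drop_columns=True)
print(list(out.columns))  # ['a', 'b']; the extra column is dropped

order_columns(df, required_columns=("a", "b"), fill_missing=False)
# raises DataFrameContentError: dataframe is missing required columns: {'a'}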
Code Example #3
File: pd.py  Project: niosh-mining/obsplus
def _check_starttime_endtime(df):
    """Ensure all starttimes are less than endtimes."""
    # starttime must be strictly less than endtime
    invalid_time_range = df["starttime"] >= df["endtime"]
    if invalid_time_range.any():
        msg = "all values in starttime must be less than endtime"
        raise DataFrameContentError(msg)
    return df
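A one-row failure case, assuming the start/end values are comparable timestamps and the DataFrameContentError stand-in from Example #1:

df = pd.DataFrame(
    {
        "starttime": [pd.Timestamp("2020-01-02")],
        "endtime": [pd.Timestamp("2020-01-01")],  # ends before it starts
    }
)
_check_starttime_endtime(df)  # raises DataFrameContentError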
Code Example #4
File: pd.py  Project: niosh-mining/obsplus
def _check_nslc_codes(df):
    """Ensure there are no wildcards in NSLC columns."""
    for code in NSLC:
        has_qmark = df[code].str.contains("?", regex=False).any()
        has_star = df[code].str.contains("*", regex=False).any()
        if has_qmark or has_star:
            msg = f"columns {NSLC} cannot contain * or ?, column {code} does"
            raise DataFrameContentError(msg)
    return df
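In obsplus, NSLC refers to the (network, station, location, channel) codes; with a stand-in tuple, a wildcard in any of those columns trips the check:

NSLC = ("network", "station", "location", "channel")  # stand-in for the obsplus constant

df = pd.DataFrame(
    {"network": ["UU"], "station": ["TM*"], "location": [""], "channel": ["HHZ"]}
)
_check_nslc_codes(df)  # raises DataFrameContentError because station contains "*"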
Code Example #5
def _validate_dataframe(self, df) -> pd.DataFrame:
    """Ensure all the parameters of the dataframe are reasonable."""
    # first cull out columns that aren't needed and de-dup the index
    if ("depth" in df.columns) and ("elevation" not in df.columns):
        # convert depth to elevation so the df has an elevation column
        out = df[list(ALT_DISTANCE_COLUMN_DTYPES)].astype(ALT_DISTANCE_COLUMN_DTYPES)
        out["elevation"] = -1 * out["depth"]
        out = out.drop("depth", axis=1)
        # assign the de-duplicated frame (the method returns it, as the .pipe below shows)
        out = self._de_duplicate_df_index(out)
    else:
        out = (
            df[list(LOCATION_DTYPE)]
            .astype(LOCATION_DTYPE)
            .pipe(self._de_duplicate_df_index)
        )
    # sanity checks on lat/lon
    lats_valid = abs(df["latitude"]) <= 90.0
    lons_valid = abs(df["longitude"]) <= 180.0
    if not (lats_valid.all() & lons_valid.all()):
        msg = f"invalid lat/lon values found in {df}"
        raise DataFrameContentError(msg)
    return out
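The depth-to-elevation branch can be sketched in isolation. ALT_DISTANCE_COLUMN_DTYPES is assumed here to map column names to dtypes, which is what the .astype call implies (this is an illustrative mapping, not the obsplus definition):

import pandas as pd

ALT_DISTANCE_COLUMN_DTYPES = {"latitude": float, "longitude": float, "depth": float}

df = pd.DataFrame({"latitude": [40.8], "longitude": [-111.9], "depth": [1500.0]})
out = df[list(ALT_DISTANCE_COLUMN_DTYPES)].astype(ALT_DISTANCE_COLUMN_DTYPES)
out["elevation"] = -1 * out["depth"]  # depth below the surface becomes negative elevation
out = out.drop("depth", axis=1)
print(out)  # latitude 40.8, longitude -111.9, elevation -1500.0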
Code Example #6
def _get_dataframe(self, obj) -> pd.DataFrame:
    """Return a dataframe with latitude, longitude, elevation, and id."""
    cols = list(LOCATION_DTYPE)
    cols1 = list(ALT_DISTANCE_COLUMN_DTYPES)
    # if a dataframe is used, validate it directly
    if isinstance(obj, pd.DataFrame):
        if not (set(cols).issubset(obj.columns) or set(cols1).issubset(obj.columns)):
            raise DataFrameContentError(
                "SpatialCalculator input dataframe must have the following "
                f"columns: {cols} or {cols1}"
            )
        return self._validate_dataframe(obj)
    try:  # first try events
        df = self._df_from_events(obj)
    except self.expected_exceptions:  # then stations
        try:
            df = self._df_from_stations(obj)
        except self.expected_exceptions:
            # and lastly any sequence
            df = self._df_from_sequences(obj)
    return self._validate_dataframe(df)
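The nested try/except blocks are EAFP-style dispatch: try the most specific converter first and fall back on failure, letting the last converter's exception propagate. A generic sketch of the same pattern, with hypothetical converter names and assumed exception types:

def _coerce_to_df(obj, converters, expected_exceptions=(TypeError, AttributeError)):
    """Try each converter in order; return the result of the first that succeeds."""
    for convert in converters[:-1]:
        try:
            return convert(obj)
        except expected_exceptions:
            continue
    # let the last converter raise if it also fails, matching the original behavior
    return converters[-1](obj)

# usage (hypothetical converter functions):
# df = _coerce_to_df(obj, (df_from_events, df_from_stations, df_from_sequences))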