def _write_update(self, df: pd.DataFrame, update_time=None):
    """
    Write new or changed rows to the index table and refresh bookkeeping.

    Rows whose ``event_id`` is already stored are dropped from the index
    table before ``df`` is appended. The meta table is created when it is
    missing, and the time table is replaced with a single-row timestamp.
    Finally the cached index on this instance is cleared.

    Parameters
    ----------
    df
        Rows to store. Must contain an ``event_id`` column and must not
        contain fully duplicated rows.
    update_time
        Timestamp to record for this update; ``time.time()`` is used when
        this is falsy.
    """
    assert not df.duplicated().any(), "update index has duplicate entries"
    # key the incoming rows by event_id so they can be matched to stored rows
    df = df.set_index("event_id")
    # get current events, but don't allow the read to trigger another update
    current = self.read_index(event_id=set(df.index), _allow_update=False)
    indicies_to_update = set(current["event_id"]) & set(df.index)
    # Stays None when the meta table already exists. Without this default
    # the assignment to self._metadata below raised UnboundLocalError on
    # every update after the first one.
    meta = None
    # populate index store and update metadata
    with sql_connection(self.index_path) as con:
        if indicies_to_update:
            # delete rows that will be re-entered
            _drop_rows(self._index_node, con, event_id=indicies_to_update)
        node = self._index_node
        df.to_sql(node, con, if_exists="append", index_label="event_id")
        tables = _get_tables(con)
        if self._meta_node not in tables:
            meta = self._make_meta_table()
            meta.to_sql(self._meta_node, con, if_exists="replace")
        # update timestamp
        with warnings.catch_warnings():
            # ignore pandas collection warning
            timestamp = update_time or time.time()
            warnings.simplefilter("ignore")
            dft = pd.DataFrame(timestamp, index=[0], columns=["time"])
            dft.to_sql(self._time_node, con, if_exists="replace", index=False)
    # only refresh the cached metadata when a new meta table was written
    if meta is not None:
        self._metadata = meta
    self._index = None
def last_updated(self):
    """
    Return the last modified time stored in the index, else 0.0.

    The value is read from row ``0`` of the ``time`` column of the time
    table. ``0.0`` is returned when the read raises a pandas
    ``DatabaseError`` or a ``KeyError``.
    """
    with sql_connection(self.index_path) as connection:
        try:
            time_table = _read_table(self._time_node, connection)
            stamp = time_table.loc[0, "time"]
        except (pd.io.sql.DatabaseError, KeyError):
            # table is empty
            stamp = 0.0
    return stamp
def read_index(self, **kwargs) -> pd.DataFrame:
    """
    Read the index and return a dataframe containing the event info.

    The returned dataframe is indexed by ``event_id``. Columns named in
    ``COLUMN_TYPES`` are cast to the dtypes given there, and the literal
    string "None" is replaced with ``None`` in the string columns.

    Parameters
    ----------
    {get_events_params}

    Raises
    ------
    ValueError
        If any keyword is listed in ``UNSUPPORTED_QUERY_OPTIONS``.
    """
    self.ensure_bank_path_exists()
    unsupported_options = set(kwargs) & UNSUPPORTED_QUERY_OPTIONS
    if unsupported_options:
        msg = f"Query parameters {unsupported_options} are not supported"
        raise ValueError(msg)
    with sql_connection(self.index_path) as con:
        try:
            df = _read_table(self._index_node, con, **kwargs).set_index("event_id")
        except pd.io.sql.DatabaseError:
            # empty or no db, return empty index
            df = pd.DataFrame(columns=list(COLUMN_TYPES)).set_index("event_id")
    # coerce datatypes
    dtype = {col: COLUMN_TYPES[col] for col in set(COLUMN_TYPES) & set(df.columns)}
    df = df.astype(dtype=dtype)
    # replace "None" with None on str columns; the selection is passed as a
    # list because pandas no longer accepts a set as a .loc indexer
    str_cols = list(STR_COLUMNS & set(df.columns))
    if str_cols:
        df.loc[:, str_cols] = df.loc[:, str_cols].replace(["None"], [None])
    return df
def read_index(self, **kwargs) -> pd.DataFrame:
    """
    Read the index and return a dataframe containing the event info.

    If reading the index table fails with a database error and the index
    has never been updated, the index is updated once and the read is
    retried. Otherwise an empty dataframe with the output columns is used.

    Parameters
    ----------
    {get_events_params}

    Raises
    ------
    ValueError
        If any keyword is listed in ``UNSUPPORTED_QUERY_OPTIONS``.
    """
    self.ensure_bank_path_exists()
    unsupported_options = set(kwargs) & UNSUPPORTED_QUERY_OPTIONS
    if unsupported_options:
        msg = f"Query parameters {unsupported_options} are not supported"
        raise ValueError(msg)
    # Make sure all times are numpy datetime64
    kwargs = dict_times_to_npdatetimes(kwargs)
    # a simple switch to prevent infinite recursion
    allow_update = kwargs.pop("_allow_update", True)
    # Circular search requires work to be done on the dataframe - we need
    # to get the whole dataframe then calculate the distances and search in
    # that
    circular_kwargs, kwargs = _sanitize_circular_search(**kwargs)
    # True when df came from the recursive call and is already prepared
    already_prepared = False
    with sql_connection(self.index_path) as con:
        try:
            df = _read_table(self._index_node, con, **kwargs)
        except pd.io.sql.DatabaseError:
            # NOTE(review): last_updated is compared without being called,
            # so it is presumably exposed as a property - confirm.
            if allow_update and self.last_updated < 1:
                # if this database has never been updated, update now
                self.update_index()
                # Only the remaining kwargs are forwarded, so the result
                # must still go through the circular filter below rather
                # than being returned directly.
                df = self.read_index(_allow_update=False, **kwargs)
                already_prepared = True
            else:
                # else return empty index
                df = pd.DataFrame(columns=list(EVENT_TYPES_OUTPUT))
    if not already_prepared:
        df = self._prepare_dataframe(df, dtypes=EVENT_TYPES_OUTPUT)
    if len(circular_kwargs) >= 3:
        # Requires at least latitude, longitude and min or max radius
        circular_ids = _get_ids(df, circular_kwargs)
        df = df[df.event_id.isin(circular_ids)]
    return df
def _read_metadata(self):
    """
    Return the meta table as a dataframe.

    Ensures the bank path exists, then selects every row of the meta table
    through a connection opened on the index path.
    """
    self.ensure_bank_path_exists()
    with sql_connection(self.index_path) as connection:
        query = f'SELECT * FROM "{self._meta_node}";'
        meta_table = pd.read_sql(query, connection)
    return meta_table