def get_metadata(self,
                 keys: Union[Sequence[str], Mapping[str, str]]) -> Dict[str, Any]:
    """Return decoded metadata for the given keys, computing it on demand (lazy loading)."""
    keys = tuple(self._key_dict_to_sequence(keys))
    if len(keys) != len(self.key_names):
        raise exceptions.InvalidKeyError(
            f'Got wrong number of keys (available keys: {self.key_names})')

    cursor = self._cursor

    where_string = ' AND '.join([f'{key}=%s' for key in self.key_names])
    cursor.execute(f'SELECT * FROM metadata WHERE {where_string}', keys)
    row = cursor.fetchone()

    if not row:  # support lazy loading
        filepath = self.get_datasets(dict(zip(self.key_names, keys)))
        if not filepath:
            raise exceptions.DatasetNotFoundError(
                f'No dataset found for given keys {keys}')
        assert len(filepath) == 1

        # compute metadata and try again
        self.insert(keys, filepath[keys], skip_metadata=False)
        cursor.execute(f'SELECT * FROM metadata WHERE {where_string}', keys)
        row = cursor.fetchone()

    assert row

    data_columns, _ = zip(*self._METADATA_COLUMNS)
    encoded_data = {col: row[col] for col in self.key_names + data_columns}
    return self._decode_data(encoded_data)
def get_datasets(self, where: Mapping[str, str] = None,
                 page: int = 0, limit: int = None) -> Dict[Tuple[str, ...], str]:
    """Return a mapping from key tuples to file paths, optionally filtered and paginated."""
    cursor = self._cursor

    if limit is not None:
        # explicitly cast to int to prevent SQL injection
        page_fragment = f'LIMIT {int(limit)} OFFSET {int(page) * int(limit)}'
    else:
        page_fragment = ''

    # sort by keys to ensure deterministic results
    order_fragment = f'ORDER BY {", ".join(self.key_names)}'

    if where is None:
        cursor.execute(f'SELECT * FROM datasets {order_fragment} {page_fragment}')
    else:
        if not all(key in self.key_names for key in where.keys()):
            raise exceptions.InvalidKeyError(
                'Encountered unrecognized keys in where clause')
        where_fragment = ' AND '.join([f'{key}=%s' for key in where.keys()])
        cursor.execute(
            f'SELECT * FROM datasets WHERE {where_fragment} '
            f'{order_fragment} {page_fragment}',
            list(where.values()))

    def keytuple(row: Dict[str, Any]) -> Tuple[str, ...]:
        return tuple(row[key] for key in self.key_names)

    return {keytuple(row): row['filepath'] for row in cursor}
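# Usage sketch (an assumption, not part of the driver): listing and filtering
# datasets through the public API. The connection string and the key values
# are hypothetical; `terracotta.get_driver` is the documented entry point for
# obtaining a driver instance.
from terracotta import get_driver

driver = get_driver('mysql://user:password@hostname/mydb')  # hypothetical DSN
with driver.connect():
    all_datasets = driver.get_datasets()  # {key_tuple: filepath} for everything
    # second page of 10 results, restricted to a single key value
    some_datasets = driver.get_datasets(where={'type': 'ndvi'}, page=1, limit=10)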
def insert(self,
           keys: Union[Sequence[str], Mapping[str, str]],
           filepath: str, *,
           metadata: Mapping[str, Any] = None,
           skip_metadata: bool = False,
           override_path: str = None) -> None:
    """Register a dataset and (unless skipped) its metadata in the database."""
    conn = self._connection

    if len(keys) != len(self.key_names):
        raise exceptions.InvalidKeyError(
            f'Got wrong number of keys (available keys: {self.key_names})')

    if override_path is None:
        override_path = filepath

    keys = self._key_dict_to_sequence(keys)
    template_string = ', '.join(['?'] * (len(keys) + 1))
    conn.execute(f'INSERT OR REPLACE INTO datasets VALUES ({template_string})',
                 [*keys, override_path])

    if metadata is None and not skip_metadata:
        metadata = self.compute_metadata(filepath)

    if metadata is not None:
        encoded_data = self._encode_data(metadata)
        row_keys, row_values = zip(*encoded_data.items())
        template_string = ', '.join(['?'] * (len(keys) + len(row_values)))
        conn.execute(
            f'INSERT OR REPLACE INTO metadata ({", ".join(self.key_names)}, '
            f'{", ".join(row_keys)}) VALUES ({template_string})',
            [*keys, *row_values])
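# Usage sketch (assumption): registering rasters, reusing `driver` from the
# sketch above; paths and key values are hypothetical. skip_metadata=True
# defers metadata computation to the first get_metadata call, and
# override_path stores a different path (e.g. a remote URL) than the local
# file metadata is computed from.
with driver.connect():
    driver.insert({'type': 'ndvi', 'date': '20180101'}, 'rasters/ndvi.tif')
    driver.insert(('ndvi', '20180102'), '/local/copy.tif',
                  skip_metadata=True,
                  override_path='https://example.com/remote/copy.tif')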
def get_metadata(self,
                 keys: Union[Sequence[str], Mapping[str, str]]) -> Dict[str, Any]:
    """Return decoded metadata for the given keys, computing it on demand (lazy loading)."""
    keys = tuple(self._key_dict_to_sequence(keys))
    if len(keys) != len(self.key_names):
        raise exceptions.InvalidKeyError(
            f'Got wrong number of keys (available keys: {self.key_names})')

    conn = self._connection

    where_string = ' AND '.join([f'{key}=?' for key in self.key_names])
    row = conn.execute(f'SELECT * FROM metadata WHERE {where_string}',
                       keys).fetchone()

    if not row:  # support lazy loading
        filepath = self.get_datasets(dict(zip(self.key_names, keys)),
                                     page=0, limit=1)
        if not filepath:
            raise exceptions.DatasetNotFoundError(
                f'No dataset found for given keys {keys}')

        # compute metadata and try again
        metadata = self.compute_metadata(filepath[keys],
                                         max_shape=self.LAZY_LOADING_MAX_SHAPE)
        self.insert(keys, filepath[keys], metadata=metadata)
        row = conn.execute(f'SELECT * FROM metadata WHERE {where_string}',
                           keys).fetchone()

    assert row

    data_columns, _ = zip(*self._METADATA_COLUMNS)
    encoded_data = {col: row[col] for col in self.key_names + data_columns}
    return self._decode_data(encoded_data)
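# Usage sketch (assumption): metadata lookup, reusing `driver` from above. If
# the dataset was inserted with skip_metadata=True, this call computes and
# stores the metadata first, as implemented above. Treat the exact set of
# returned fields (e.g. 'bounds') as an assumption here.
with driver.connect():
    metadata = driver.get_metadata({'type': 'ndvi', 'date': '20180102'})
    print(metadata['bounds'])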
def _key_dict_to_sequence(self,
                          keys: Union[Mapping[str, Any], Sequence[Any]]) -> List[Any]:
    """Convert {key_name: key_value} to [key_value] with the correct key order."""
    try:
        keys_as_mapping = cast(Mapping[str, Any], keys)
        return [keys_as_mapping[key] for key in self.key_names]
    except TypeError:  # not a mapping
        return list(keys)
    except KeyError as exc:
        raise exceptions.InvalidKeyError('Encountered unknown key') from exc
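# Illustration (assumption: key_names == ('type', 'date')): both call forms
# normalize to the same ordered list, which is why the public methods above
# accept either a mapping or a plain sequence of key values.
#
#   self._key_dict_to_sequence({'date': '20180101', 'type': 'ndvi'})
#   # -> ['ndvi', '20180101']
#   self._key_dict_to_sequence(('ndvi', '20180101'))
#   # -> ['ndvi', '20180101']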
def delete(self, keys: Union[Sequence[str], Mapping[str, str]]) -> None:
    """Remove a dataset and its metadata from the database."""
    cursor = self._cursor

    if len(keys) != len(self.key_names):
        raise exceptions.InvalidKeyError(
            f'Got wrong number of keys (available keys: {self.key_names})')

    keys = self._key_dict_to_sequence(keys)
    key_dict = dict(zip(self.key_names, keys))

    if not self.get_datasets(key_dict):
        raise exceptions.DatasetNotFoundError(
            f'No dataset found with keys {keys}')

    where_string = ' AND '.join([f'{key}=%s' for key in self.key_names])
    cursor.execute(f'DELETE FROM datasets WHERE {where_string}', keys)
    cursor.execute(f'DELETE FROM metadata WHERE {where_string}', keys)
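# Usage sketch (assumption): removing a dataset and handling the error raised
# for unknown keys. `driver` and the key values are hypothetical; the
# exception class is the one referenced by the driver code above.
from terracotta import exceptions

with driver.connect():
    try:
        driver.delete({'type': 'ndvi', 'date': '20180101'})
    except exceptions.DatasetNotFoundError:
        pass  # nothing stored under these keys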
def create(self, keys: Sequence[str],
           key_descriptions: Mapping[str, str] = None) -> None:
    """Create and initialize database with empty tables.

    This must be called before opening the first connection. The MySQL database
    must not exist already.

    Arguments:

        keys: Key names to use throughout the Terracotta database.
        key_descriptions: Optional (but recommended) full-text description for
            some keys, in the form of ``{key_name: description}``.

    """
    if key_descriptions is None:
        key_descriptions = {}
    else:
        key_descriptions = dict(key_descriptions)

    if not all(k in keys for k in key_descriptions.keys()):
        raise exceptions.InvalidKeyError(
            'key description dict contains unknown keys')

    if not all(re.match(r'^\w+$', key) for key in keys):
        raise exceptions.InvalidKeyError('key names must be alphanumeric')

    if any(key in self._RESERVED_KEYS for key in keys):
        raise exceptions.InvalidKeyError(
            f'key names cannot be one of {self._RESERVED_KEYS!s}')

    for key in keys:
        if key not in key_descriptions:
            key_descriptions[key] = ''

    # total primary key length has an upper limit in MySQL
    key_size = self._MAX_PRIMARY_KEY_LENGTH // len(keys)
    key_type = f'VARCHAR({key_size})'

    with pymysql.connect(host=self._db_args.host, user=self._db_args.user,
                         password=self._db_args.password, port=self._db_args.port,
                         read_timeout=self.DB_CONNECTION_TIMEOUT,
                         write_timeout=self.DB_CONNECTION_TIMEOUT,
                         binary_prefix=True, charset='utf8mb4') as con:
        con.execute(f'CREATE DATABASE {self._db_args.db}')

    with self._connect(check=False):
        cursor = self._cursor
        cursor.execute(f'CREATE TABLE terracotta (version VARCHAR(255)) '
                       f'CHARACTER SET {self._CHARSET}')
        cursor.execute('INSERT INTO terracotta VALUES (%s)', [str(__version__)])

        cursor.execute(f'CREATE TABLE key_names (key_name {key_type}, '
                       f'description VARCHAR(8000)) CHARACTER SET {self._CHARSET}')
        key_rows = [(key, key_descriptions[key]) for key in keys]
        cursor.executemany('INSERT INTO key_names VALUES (%s, %s)', key_rows)

        key_string = ', '.join([f'{key} {key_type}' for key in keys])
        cursor.execute(
            f'CREATE TABLE datasets ({key_string}, filepath VARCHAR(8000), '
            f'PRIMARY KEY({", ".join(keys)})) CHARACTER SET {self._CHARSET}')

        column_string = ', '.join(f'{col} {col_type}' for col, col_type
                                  in self._METADATA_COLUMNS)
        cursor.execute(
            f'CREATE TABLE metadata ({key_string}, {column_string}, '
            f'PRIMARY KEY ({", ".join(keys)})) CHARACTER SET {self._CHARSET}')

    # invalidate key cache
    self._db_keys = None
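# Usage sketch (assumption): initializing a fresh MySQL database before first
# use. The DSN and key names are hypothetical; per the docstring above,
# create() must run before the first connection and the database must not
# exist yet.
from terracotta import get_driver

driver = get_driver('mysql://user:password@hostname/mydb')  # hypothetical DSN
driver.create(
    keys=('type', 'date'),
    key_descriptions={'type': 'Type of raster data',
                      'date': 'Acquisition date'},
)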