Example #1
    def get_metadata(
        self, keys: Union[Sequence[str], Mapping[str, str]]
    ) -> Dict[str, Any]:
        keys = tuple(self._key_dict_to_sequence(keys))

        if len(keys) != len(self.key_names):
            raise exceptions.InvalidKeyError('Got wrong number of keys')

        cursor = self._cursor

        where_string = ' AND '.join([f'{key}=%s' for key in self.key_names])
        cursor.execute(f'SELECT * FROM metadata WHERE {where_string}', keys)
        row = cursor.fetchone()

        if not row:  # support lazy loading
            filepath = self.get_datasets(dict(zip(self.key_names, keys)))
            if not filepath:
                raise exceptions.DatasetNotFoundError(
                    f'No dataset found for given keys {keys}')
            assert len(filepath) == 1

            # compute metadata and try again
            self.insert(keys, filepath[keys], skip_metadata=False)
            cursor.execute(f'SELECT * FROM metadata WHERE {where_string}',
                           keys)
            row = cursor.fetchone()

        assert row

        data_columns, _ = zip(*self._METADATA_COLUMNS)
        encoded_data = {col: row[col] for col in self.key_names + data_columns}
        return self._decode_data(encoded_data)
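
A minimal usage sketch for this method; the driver instance `driver`, its `connect()` context manager, and the key names `('sensor', 'date')` are assumptions for illustration, not part of the example:

    with driver.connect():
        # positional keys must follow the order of driver.key_names ...
        meta = driver.get_metadata(['sentinel2', '20180101'])
        # ... while a mapping is re-ordered internally (see Example #5)
        meta = driver.get_metadata({'date': '20180101', 'sensor': 'sentinel2'})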
Example #2
    def get_datasets(self,
                     where: Mapping[str, str] = None,
                     page: int = 0,
                     limit: int = None) -> Dict[Tuple[str, ...], str]:
        cursor = self._cursor

        if limit is not None:
            # explicitly cast to int to prevent SQL injection
            page_fragment = f'LIMIT {int(limit)} OFFSET {int(page) * int(limit)}'
        else:
            page_fragment = ''

        # sort by keys to ensure deterministic results
        order_fragment = f'ORDER BY {", ".join(self.key_names)}'

        if where is None:
            cursor.execute(
                f'SELECT * FROM datasets {order_fragment} {page_fragment}')
        else:
            if not all(key in self.key_names for key in where.keys()):
                raise exceptions.InvalidKeyError(
                    'Encountered unrecognized keys in '
                    'where clause')
            where_fragment = ' AND '.join(
                [f'{key}=%s' for key in where.keys()])
            cursor.execute(
                f'SELECT * FROM datasets WHERE {where_fragment} {order_fragment} {page_fragment}',
                list(where.values()))

        def keytuple(row: Dict[str, Any]) -> Tuple[str, ...]:
            return tuple(row[key] for key in self.key_names)

        return {keytuple(row): row['filepath'] for row in cursor}
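
Since `page` and `limit` translate directly into a LIMIT/OFFSET clause and results are ordered by the key columns, the method supports stable pagination. A hedged sketch, with `driver` and the key name `sensor` invented as above:

    page = 0
    while True:
        batch = driver.get_datasets(where={'sensor': 'sentinel2'}, page=page, limit=100)
        if not batch:
            break
        for key_tuple, filepath in batch.items():
            print(key_tuple, filepath)
        page += 1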
Example #3
    def insert(self,
               keys: Union[Sequence[str], Mapping[str, str]],
               filepath: str,
               *,
               metadata: Mapping[str, Any] = None,
               skip_metadata: bool = False,
               override_path: str = None) -> None:
        conn = self._connection

        if len(keys) != len(self.key_names):
            raise exceptions.InvalidKeyError(
                f'Got wrong number of keys (available keys: {self.key_names})')

        if override_path is None:
            override_path = filepath

        keys = self._key_dict_to_sequence(keys)
        template_string = ', '.join(['?'] * (len(keys) + 1))
        conn.execute(
            f'INSERT OR REPLACE INTO datasets VALUES ({template_string})',
            [*keys, override_path])

        if metadata is None and not skip_metadata:
            metadata = self.compute_metadata(filepath)

        if metadata is not None:
            encoded_data = self._encode_data(metadata)
            row_keys, row_values = zip(*encoded_data.items())
            template_string = ', '.join(['?'] * (len(keys) + len(row_values)))
            conn.execute(
                f'INSERT OR REPLACE INTO metadata ({", ".join(self.key_names)}, '
                f'{", ".join(row_keys)}) VALUES ({template_string})',
                [*keys, *row_values])
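
The keyword-only arguments control how metadata is handled. A hedged sketch; all paths and key values are invented for illustration:

    with driver.connect():
        # default: metadata is computed eagerly via compute_metadata
        driver.insert(('sentinel2', '20180101'), '/data/s2_20180101.tif')
        # defer metadata computation to lazy loading (Examples #1 and #4)
        driver.insert(('sentinel2', '20180102'), '/data/s2_20180102.tif',
                      skip_metadata=True)
        # read the file locally, but register a different path in the database
        driver.insert(('sentinel2', '20180103'), '/local/s2_20180103.tif',
                      override_path='s3://bucket/s2_20180103.tif')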
Example #4
    def get_metadata(
        self, keys: Union[Sequence[str], Mapping[str, str]]
    ) -> Dict[str, Any]:
        keys = tuple(self._key_dict_to_sequence(keys))

        if len(keys) != len(self.key_names):
            raise exceptions.InvalidKeyError(
                f'Got wrong number of keys (available keys: {self.key_names})')

        conn = self._connection

        where_string = ' AND '.join([f'{key}=?' for key in self.key_names])
        row = conn.execute(f'SELECT * FROM metadata WHERE {where_string}',
                           keys).fetchone()

        if not row:  # support lazy loading
            filepath = self.get_datasets(dict(zip(self.key_names, keys)),
                                         page=0,
                                         limit=1)
            if not filepath:
                raise exceptions.DatasetNotFoundError(
                    f'No dataset found for given keys {keys}')

            # compute metadata and try again
            metadata = self.compute_metadata(
                filepath[keys], max_shape=self.LAZY_LOADING_MAX_SHAPE)
            self.insert(keys, filepath[keys], metadata=metadata)
            row = conn.execute(f'SELECT * FROM metadata WHERE {where_string}',
                               keys).fetchone()

        assert row

        data_columns, _ = zip(*self._METADATA_COLUMNS)
        encoded_data = {col: row[col] for col in self.key_names + data_columns}
        return self._decode_data(encoded_data)
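
Note the difference to Example #1: this variant computes the metadata explicitly via `compute_metadata` (bounded by `LAZY_LOADING_MAX_SHAPE`) and passes the result to `insert`, whereas Example #1 delegates the computation to `insert(..., skip_metadata=False)`.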
Example #5
    def _key_dict_to_sequence(
        self, keys: Union[Mapping[str, Any], Sequence[Any]]
    ) -> List[Any]:
        """Convert {key_name: key_value} to [key_value] with the correct key order."""
        try:
            keys_as_mapping = cast(Mapping[str, Any], keys)
            return [keys_as_mapping[key] for key in self.key_names]
        except TypeError:  # not a mapping
            return list(keys)
        except KeyError as exc:
            raise exceptions.InvalidKeyError('Encountered unknown key') from exc
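
Illustrative behavior, assuming `self.key_names == ('sensor', 'date')`:

    self._key_dict_to_sequence({'date': '20180101', 'sensor': 's2'})  # -> ['s2', '20180101']
    self._key_dict_to_sequence(['s2', '20180101'])                    # -> ['s2', '20180101']
    self._key_dict_to_sequence({'sensor': 's2'})                      # InvalidKeyError: 'date' is missing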
Example #6
    def delete(self, keys: Union[Sequence[str], Mapping[str, str]]) -> None:
        cursor = self._cursor

        if len(keys) != len(self.key_names):
            raise exceptions.InvalidKeyError(
                f'Got wrong number of keys (available keys: {self.key_names})')

        keys = self._key_dict_to_sequence(keys)
        key_dict = dict(zip(self.key_names, keys))

        if not self.get_datasets(key_dict):
            raise exceptions.DatasetNotFoundError(
                f'No dataset found with keys {keys}')

        where_string = ' AND '.join([f'{key}=%s' for key in self.key_names])
        cursor.execute(f'DELETE FROM datasets WHERE {where_string}', keys)
        cursor.execute(f'DELETE FROM metadata WHERE {where_string}', keys)
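
A hedged usage sketch; `driver` and the key values are invented as before:

    with driver.connect():
        # removes the matching rows from both the datasets and metadata tables
        driver.delete({'sensor': 'sentinel2', 'date': '20180101'})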
Example #7
    def create(self,
               keys: Sequence[str],
               key_descriptions: Mapping[str, str] = None) -> None:
        """Create and initialize database with empty tables.

        This must be called before opening the first connection. The MySQL database must not
        already exist.

        Arguments:

            keys: Key names to use throughout the Terracotta database.
            key_descriptions: Optional (but recommended) full-text description for some keys,
                in the form of ``{key_name: description}``.

        """
        if key_descriptions is None:
            key_descriptions = {}
        else:
            key_descriptions = dict(key_descriptions)

        if not all(k in keys for k in key_descriptions.keys()):
            raise exceptions.InvalidKeyError(
                'key description dict contains unknown keys')

        if not all(re.match(r'^\w+$', key) for key in keys):
            raise exceptions.InvalidKeyError(
                'key names must only contain letters, digits and underscores')

        if any(key in self._RESERVED_KEYS for key in keys):
            raise exceptions.InvalidKeyError(
                f'key names cannot be one of {self._RESERVED_KEYS!s}')

        for key in keys:
            if key not in key_descriptions:
                key_descriptions[key] = ''

        # total primary key length has an upper limit in MySQL
        key_size = self._MAX_PRIMARY_KEY_LENGTH // len(keys)
        key_type = f'VARCHAR({key_size})'

        with pymysql.connect(host=self._db_args.host,
                             user=self._db_args.user,
                             password=self._db_args.password,
                             port=self._db_args.port,
                             read_timeout=self.DB_CONNECTION_TIMEOUT,
                             write_timeout=self.DB_CONNECTION_TIMEOUT,
                             binary_prefix=True,
                             charset='utf8mb4') as con:
            con.execute(f'CREATE DATABASE {self._db_args.db}')

        with self._connect(check=False):
            cursor = self._cursor
            cursor.execute(f'CREATE TABLE terracotta (version VARCHAR(255)) '
                           f'CHARACTER SET {self._CHARSET}')
            cursor.execute('INSERT INTO terracotta VALUES (%s)',
                           [str(__version__)])

            cursor.execute(
                f'CREATE TABLE key_names (key_name {key_type}, '
                f'description VARCHAR(8000)) CHARACTER SET {self._CHARSET}')
            key_rows = [(key, key_descriptions[key]) for key in keys]
            cursor.executemany('INSERT INTO key_names VALUES (%s, %s)',
                               key_rows)

            key_string = ', '.join([f'{key} {key_type}' for key in keys])
            cursor.execute(
                f'CREATE TABLE datasets ({key_string}, filepath VARCHAR(8000), '
                f'PRIMARY KEY({", ".join(keys)})) CHARACTER SET {self._CHARSET}'
            )

            column_string = ', '.join(
                f'{col} {col_type}'
                for col, col_type in self._METADATA_COLUMNS)
            cursor.execute(
                f'CREATE TABLE metadata ({key_string}, {column_string}, '
                f'PRIMARY KEY ({", ".join(keys)})) CHARACTER SET {self._CHARSET}'
            )

        # invalidate key cache
        self._db_keys = None
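
A hedged sketch of initializing a fresh database with this method; the key names and descriptions are invented:

    driver.create(
        keys=('sensor', 'date'),
        key_descriptions={
            'sensor': 'Satellite platform',
            'date': 'Acquisition date (YYYYMMDD)',
        },
    )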