def process_nonpatient_table(table: Table, engine: Engine,
                             progargs: Any) -> None:
    if progargs.rcep:
        return
    log.info("Preprocessing non-patient table {}".format(repr(table.name)))
    pk_col = get_effective_int_pk_col(table)
    other_pk_col = pk_col if pk_col != CRATE_COL_PK else None
    if other_pk_col:  # table has a primary key already
        crate_pk_col = Column(CRATE_COL_PK, BigInteger, nullable=True)
    else:
        crate_pk_col = make_bigint_autoincrement_column(
            CRATE_COL_PK, engine.dialect)
    table.append_column(crate_pk_col)  # must be Table-bound, as above
    add_columns(engine, table, [crate_pk_col])
    if not progargs.print:
        ensure_columns_present(engine, tablename=table.name,
                               column_names=[CRATE_COL_PK])
    if other_pk_col:
        execute(
            engine,
            """
            UPDATE {tablename} SET {crate_pk} = {rio_pk}
            WHERE {crate_pk} IS NULL
            """.format(tablename=table.name,
                       crate_pk=CRATE_COL_PK,
                       rio_pk=other_pk_col))
    add_indexes(engine, table, [{
        'index_name': CRATE_IDX_PK,
        'column': CRATE_COL_PK,
        'unique': True
    }])
def process_master_patient_table(table: Table, engine: Engine,
                                 progargs: Any) -> None:
    crate_col_nhs_number = Column(CRATE_COL_NHS_NUMBER, BigInteger,
                                  nullable=True)
    table.append_column(crate_col_nhs_number)
    add_columns(engine, table, [crate_col_nhs_number])
    if progargs.rcep:
        nhscol = RCEP_COL_NHS_NUMBER
    else:
        nhscol = RIO_COL_NHS_NUMBER
    log.info("Table {}: updating column {}".format(repr(table.name),
                                                   repr(nhscol)))
    ensure_columns_present(engine, tablename=table.name,
                           column_names=[nhscol])
    if not progargs.print:
        ensure_columns_present(engine, tablename=table.name,
                               column_names=[CRATE_COL_NHS_NUMBER])
    execute(
        engine,
        """
        UPDATE {tablename} SET {nhs_number_int} = CAST({nhscol} AS BIGINT)
        WHERE {nhs_number_int} IS NULL
        """.format(
            tablename=table.name,
            nhs_number_int=CRATE_COL_NHS_NUMBER,
            nhscol=nhscol,
        ))
def create_view(name: str,
                selectable: FromClause,
                metadata: MetaData,
                materialized: bool = False) -> Table:
    """
    Args:
        name => name of materialized view to create
        selectable => query to create view as
        metadata => metadata to listen for events on
        materialized => whether to create standard or materialized view
    Returns:
        Table object bound to temporary MetaData object with columns returned
        from selectable (essentially creates table as view).
    NOTE:
        For non-postgresql backends, creating a materialized view will result
        in a standard view, which cannot be indexed.
    Preconditions:
        N/A
    Raises:
        N/A
    """
    _tmp_mt = MetaData()
    tbl = Table(name, _tmp_mt)
    for column in selectable.c:
        tbl.append_column(
            Column(column.name, column.type, primary_key=column.primary_key))
    listen(metadata,
           "after_create",
           (CreateMaterializedViewExpression(name, selectable)
            if materialized else CreateViewExpression(name, selectable)))
    listen(metadata,
           "before_drop",
           DropMaterializedViewExpression(name)
           if materialized else DropViewExpression(name))
    return tbl
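# --- Hedged usage sketch for the create_view() helper above; not part of the original
# source. It assumes create_view() and its Create*/Drop*ViewExpression DDL elements are
# importable from the same module, and it uses a throwaway "users" table and SQLite URL
# purely as placeholders. The 1.x-style select([...]) call matches the era of the helper
# itself (which reads selectable.c directly).
from sqlalchemy import Column, Integer, MetaData, Table, Unicode, create_engine, select

metadata = MetaData()
users = Table('users', metadata,
              Column('id', Integer, primary_key=True),
              Column('name', Unicode(255)))

# Because the listeners are registered on the same MetaData, create_all()/drop_all()
# emit the CREATE VIEW / DROP VIEW statements alongside the table DDL.
active_users = create_view('active_users',
                           select([users.c.id, users.c.name]),
                           metadata)

engine = create_engine('sqlite://')   # placeholder engine
metadata.create_all(engine)           # fires the "after_create" listener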
def _exclude_columns_table(table):
    new_table = Table(table.name, MetaData())
    for c in table.columns:
        if c.name not in columns:
            new_table.append_column(c.copy())
    return new_table
def get_sa_new_table(metadata, table_name, new_table_name,
                     smallest_int_types=False):
    ' create new table with data from table '
    assert metadata.is_bound(), 'Metadata is not bound'
    table = Table(table_name, metadata, autoload=True)
    # get smallest int types for all data in the table
    if smallest_int_types:
        min_max_types = get_sa_table_int_min_max(table)
        if min_max_types:
            smallest_int_types = {}
            for presto_col, (min_val, max_val) in min_max_types.items():
                smallest_int_type = get_presto_smallest_int_type_min_max(
                    min_val, max_val)
                smallest_int_types.update({presto_col: smallest_int_type})
    new_table = Table(new_table_name, metadata)
    for column in table.columns:
        if smallest_int_types:
            smallest_int_type = smallest_int_types.get(column.name, None)
            if smallest_int_type:
                new_table.append_column(
                    sa.Column(column.name, smallest_int_type()))
                continue
        new_table.append_column(sa.Column(column.name, column.type))
    return new_table
def create_table(self, table_name, primary_id='id', primary_type='Integer'): """ Creates a new table. The new table will automatically have an `id` column unless specified via optional parameter primary_id, which will be used as the primary key of the table. Automatic id is set to be an auto-incrementing integer, while the type of custom primary_id can be a String or an Integer as specified with primary_type flag. The default length of String is 255. The caller can specify the length. The caller will be responsible for the uniqueness of manual primary_id. This custom id feature is only available via direct create_table call. Returns a :py:class:`Table <dataset.Table>` instance. :: table = db.create_table('population') # custom id and type table2 = db.create_table('population2', 'age') table3 = db.create_table('population3', primary_id='race', primary_type='String') # custom length of String table4 = db.create_table('population4', primary_id='race', primary_type='String(50)') """ table_name = self._valid_table_name(table_name) self._acquire() try: log.debug("Creating table: %s on %r" % (table_name, self.engine)) match = re.match(r'^(Integer)$|^(String)(\(\d+\))?$', primary_type) if match: if match.group(1) == 'Integer': auto_flag = False if primary_id == 'id': auto_flag = True col = Column(primary_id, Integer, primary_key=True, autoincrement=auto_flag) elif not match.group(3): col = Column(primary_id, String(255), primary_key=True) else: len_string = int(match.group(3)[1:-1]) len_string = min(len_string, 255) col = Column(primary_id, String(len_string), primary_key=True) else: raise DatasetException( "The primary_type has to be either 'Integer' or 'String'.") table = SQLATable(table_name, self.metadata, schema=self.schema) table.append_column(col) table.create(self.engine) self._tables[table_name] = table return Table(self, table) finally: self._release()
def wrap(fn):
    table_definition = TableDefinition()
    fn(table_definition)
    table = Table(name, self.meta)
    for attrname in table_definition.fields.keys():
        args, kw = table_definition.fields[attrname]
        table.append_column(Column(attrname, *args, **kw))
    table.create()
def create_table(engine, table_name):
    log.debug("Creating table: %s on %r" % (table_name, engine))
    table = Table(table_name, engine._metadata)
    col = Column('id', Integer, primary_key=True)
    table.append_column(col)
    table.create(engine)
    TABLES[engine][table_name] = table
    return table
def _create_table(self, table_name):
    table_name = validate_name(table_name)
    log.debug("Creating table: %s on %r" % (table_name, self.engine))
    table = Table(table_name, self.meta)
    col = Column(ID_COLUMN, Integer, primary_key=True)
    table.append_column(col)
    table.create(self.engine)
    return table
def _table(table):
    src_table = table
    new_table = Table(src_table.name, MetaData())
    for c in src_table.columns:
        if c.name not in columns:
            new_table.append_column(c.copy())
    return new_table
def copy_table(table):
    """ Copy the given table. """
    ret_table = Table(table.name, MetaData())
    for c in table.columns:
        ret_table.append_column(copy_column(c))
    return ret_table
def create_table(engine, table_name):
    with lock:
        log.debug("Creating table: %s on %r" % (table_name, engine))
        table = Table(table_name, engine._metadata)
        col = Column('id', Integer, primary_key=True)
        table.append_column(col)
        table.create(engine)
        engine._tables[table_name] = table
        return table
def create_table(engine, table_name):
    with lock:
        log.debug("Creating table: %s on %r" % (table_name, engine))
        table = Table(table_name, engine._metadata)
        col = Column("id", Integer, primary_key=True)
        table.append_column(col)
        table.create(engine)
        engine._tables[table_name] = table
        return table
def _rename_columns_table(table):
    new_table = Table(table.name, MetaData())
    for c in table.columns:
        renamed_column = c.copy()
        if c.name in src_columns:
            renamed_column.name = maps[c.name]
        new_table.append_column(renamed_column)
    return new_table
def wrap(fn):
    table_definition = OrderedProperties()
    fn(table_definition)
    table = Table(table_name, g.db_meta)
    for attrname in table_definition.keys():
        value = table_definition[attrname]
        if isinstance(value, Column):
            table.append_column(value)
        elif isinstance(value, Constraint):
            table.append_constraint(value)
    table.create(g.db_engine)
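# --- Hedged usage sketch for the wrap() decorator above; not part of the original
# source. The enclosing factory that returns wrap() is not shown, so "define_table" is a
# guessed name for it. The body relies only on what wrap() itself reads back: Column and
# Constraint attributes set on the OrderedProperties definition object, in order.
from sqlalchemy import Column, Integer, Unicode, UniqueConstraint

@define_table('users')                       # hypothetical outer decorator factory
def users(t):
    t.id = Column('id', Integer, primary_key=True)
    t.name = Column('name', Unicode(255), nullable=False)
    t.uq_name = UniqueConstraint('name')     # handled via table.append_constraint()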
def create_table(self, table_name, primary_id='id', primary_type='Integer'): """ Create a new table. The new table will automatically have an `id` column unless specified via optional parameter primary_id, which will be used as the primary key of the table. Automatic id is set to be an auto-incrementing integer, while the type of custom primary_id can be a String or an Integer as specified with primary_type flag. The default length of String is 255. The caller can specify the length. The caller will be responsible for the uniqueness of manual primary_id. This custom id feature is only available via direct create_table call. Returns a :py:class:`Table <dataset.Table>` instance. :: table = db.create_table('population') # custom id and type table2 = db.create_table('population2', 'age') table3 = db.create_table('population3', primary_id='race', primary_type='String') # custom length of String table4 = db.create_table('population4', primary_id='race', primary_type='String(50)') """ table_name = self._valid_table_name(table_name) self._acquire() try: log.debug("Creating table: %s on %r" % (table_name, self.engine)) match = re.match(r'^(Integer)$|^(String)(\(\d+\))?$', primary_type) if match: if match.group(1) == 'Integer': auto_flag = False if primary_id == 'id': auto_flag = True col = Column(primary_id, Integer, primary_key=True, autoincrement=auto_flag) elif not match.group(3): col = Column(primary_id, String(255), primary_key=True) else: len_string = int(match.group(3)[1:-1]) len_string = min(len_string, 255) col = Column(primary_id, String(len_string), primary_key=True) else: raise DatasetException( "The primary_type has to be either 'Integer' or 'String'.") table = SQLATable(table_name, self.metadata, schema=self.schema) table.append_column(col) table.create(self.engine) self._tables[table_name] = table return Table(self, table) finally: self._release()
def _table(table):
    left_table = table
    right_table = tables[right_table_name]
    new_table = Table(left_table.name, MetaData())
    for c in left_table.columns:
        new_table.append_column(_copy_column(c))
    for c in right_table.columns:
        new_table.append_column(_copy_column(c))
    return new_table
class TableHandler(object): """ Used by automatically generated objects such as datasets and dimensions to generate, write and clear the table under its management. """ def _init_table(self, meta, namespace, name, id_type=Integer): """ Create the given table if it does not exist, otherwise reflect the current table schema from the database. """ name = namespace + '__' + name self.table = Table(name, meta) if id_type is not None: col = Column('id', id_type, primary_key=True) self.table.append_column(col) def _generate_table(self): """ Create the given table if it does not exist. """ # TODO: make this support some kind of migration? if not db.engine.has_table(self.table.name): self.table.create(db.engine) def _upsert(self, bind, data, unique_columns): """ Upsert a set of values into the table. This will query for the set of unique columns and either update an existing row or create a new one. In both cases, the ID of the changed row will be returned. """ key = and_(*[self.table.c[c] == data.get(c) for c in unique_columns]) q = self.table.update(key, data) if bind.execute(q).rowcount == 0: q = self.table.insert(data) rs = bind.execute(q) return rs.inserted_primary_key[0] else: q = self.table.select(key) row = bind.execute(q).fetchone() return row['id'] def _flush(self, bind): """ Delete all rows in the table. """ q = self.table.delete() bind.execute(q) def _drop(self, bind): """ Drop the table and the local reference to it. """ if db.engine.has_table(self.table.name): self.table.drop() del self.table
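# --- Hedged usage sketch for the TableHandler mixin above; not part of the original
# source. DimensionTable, the namespace/name strings, and the extra "label" column are
# illustrative only; TableHandler and the module-level `db` binding come from the code
# above.
from sqlalchemy import Column, MetaData, Unicode

class DimensionTable(TableHandler):
    def __init__(self, namespace, name, meta):
        self._init_table(meta, namespace, name)          # builds "<namespace>__<name>"
        self.table.append_column(Column('label', Unicode(255)))

dim = DimensionTable('budget2024', 'region', MetaData())
dim._generate_table()                                    # CREATE TABLE if missing
# Insert or update on the unique column, returning the affected row's id:
row_id = dim._upsert(db.engine, {'label': 'North'}, ['label'])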
def process_table(table: Table, engine: Engine, configoptions: PcmisConfigOptions) -> None: """ Processes a PCMIS table by checking it has appropriate columns, perhaps adding a CRATE integer PK, and indexing it. Args: table: an SQLAlchemy Table to process engine: an SQLAlchemy Engine configoptions: an instance of :class:`PcmisConfigOptions` """ tablename = table.name column_names = table.columns.keys() log.debug(f"TABLE: {tablename}; COLUMNS: {column_names}") existing_pk_cols = get_pk_colnames(table) assert len(existing_pk_cols) < 2, ( f"Table {tablename} has >1 PK column; don't know what to do") if existing_pk_cols and not get_effective_int_pk_col(table): raise ValueError(f"Table {table!r} has a non-integer PK") adding_crate_pk = not existing_pk_cols required_cols = [CRATE_COL_PK] if not configoptions.print_sql_only else [] if configoptions.drop_not_create: # --------------------------------------------------------------------- # DROP STUFF! Opposite order to creation (below) # --------------------------------------------------------------------- drop_indexes(engine, table, [CRATE_IDX_PK]) drop_columns(engine, table, [CRATE_COL_PK]) else: # --------------------------------------------------------------------- # CREATE STUFF! # --------------------------------------------------------------------- # SQL Server requires Table-bound columns in order to generate DDL: if adding_crate_pk: crate_pk_col = make_bigint_autoincrement_column( CRATE_COL_PK, engine.dialect) table.append_column(crate_pk_col) add_columns(engine, table, [crate_pk_col]) ensure_columns_present(engine, tablename=table.name, column_names=required_cols) add_indexes(engine, table, [{ 'index_name': CRATE_IDX_PK, 'column': CRATE_COL_PK, 'unique': True }])
def create_table(self, table_name, primary_id='id', primary_type='Integer'):
    """
    Creates a new table. The new table will automatically have an `id` column
    unless specified via optional parameter primary_id, which will be used as
    the primary key of the table. Automatic id is set to be an
    auto-incrementing integer, while the type of custom primary_id can be a
    Text or an Integer as specified with primary_type flag. The caller will be
    responsible for the uniqueness of manual primary_id.

    This custom id feature is only available via direct create_table call.

    Returns a :py:class:`Table <dataset.Table>` instance.
    ::

        table = db.create_table('population')

        # custom id and type
        table2 = db.create_table('population2', 'age')
        table3 = db.create_table('population3', primary_id='race',
                                 primary_type='Text')
    """
    self._acquire()
    try:
        log.debug("Creating table: %s on %r" % (table_name, self.engine))
        table = SQLATable(table_name, self.metadata)
        # Compare strings with ==, not the identity operator `is`:
        if primary_type == 'Integer':
            auto_flag = False
            if primary_id == 'id':
                auto_flag = True
            col = Column(primary_id, Integer, primary_key=True,
                         autoincrement=auto_flag)
        elif primary_type == 'Text':
            col = Column(primary_id, Text, primary_key=True)
        else:
            raise DatasetException(
                "The primary_type has to be either 'Integer' or 'Text'.")
        table.append_column(col)
        table.create(self.engine)
        self._tables[table_name] = table
        return Table(self, table)
    finally:
        self._release()
def create_table(self, table_name):
    """
    Creates a new table. The new table will automatically have an `id` column,
    which is set to be an auto-incrementing integer as the primary key of the
    table.

    Returns a :py:class:`Table <dataset.Table>` instance.
    ::

        table = db.create_table('population')
    """
    with self.lock:
        log.debug("Creating table: %s on %r" % (table_name, self.engine))
        table = SQLATable(table_name, self.metadata)
        col = Column('id', Integer, primary_key=True)
        table.append_column(col)
        table.create(self.engine)
        self._tables[table_name] = table
        return Table(self, table)
def unit_tests() -> None: from sqlalchemy.dialects.mssql.base import MSDialect from sqlalchemy.dialects.mysql.base import MySQLDialect d_mssql = MSDialect() d_mysql = MySQLDialect() col1 = Column('hello', BigInteger, nullable=True) col2 = Column('world', BigInteger, autoincrement=True) # does NOT generate IDENTITY col3 = make_bigint_autoincrement_column('you', d_mssql) metadata = MetaData() t = Table('mytable', metadata) t.append_column(col1) t.append_column(col2) t.append_column(col3) print("Checking Column -> DDL: SQL Server (mssql)") test_assert(column_creation_ddl(col1, d_mssql), "hello BIGINT NULL") test_assert(column_creation_ddl(col2, d_mssql), "world BIGINT NULL") test_assert(column_creation_ddl(col3, d_mssql), "you BIGINT NOT NULL IDENTITY(1,1)") print("Checking Column -> DDL: MySQL (mysql)") test_assert(column_creation_ddl(col1, d_mysql), "hello BIGINT") test_assert(column_creation_ddl(col2, d_mysql), "world BIGINT") # not col3; unsupported print("Checking SQL type -> SQL Alchemy type") to_check = [ # mssql ("BIGINT", d_mssql), ("NVARCHAR(32)", d_mssql), ("NVARCHAR(MAX)", d_mssql), ('NVARCHAR(160) COLLATE "Latin1_General_CI_AS"', d_mssql), # mysql ("BIGINT", d_mssql), ("LONGTEXT", d_mysql), ] for coltype, dialect in to_check: print("... {} -> dialect {} -> {}".format( repr(coltype), repr(dialect.name), repr(get_sqla_coltype_from_dialect_str(coltype, dialect))))
def create_table(self, table_name):
    """
    Creates a new table. The new table will automatically have an `id` column,
    which is set to be an auto-incrementing integer as the primary key of the
    table.

    Returns a :py:class:`Table <dataset.Table>` instance.
    ::

        table = db.create_table('population')
    """
    self._acquire()
    try:
        log.debug("Creating table: %s on %r" % (table_name, self.engine))
        table = SQLATable(table_name, self.metadata)
        col = Column('id', Integer, primary_key=True)
        table.append_column(col)
        table.create(self.engine)
        self._tables[table_name] = table
        return Table(self, table)
    finally:
        self._release()
def create_view(name, selectable, metadata, materialized=False): ''' Args: name: String => name of materialized view to create selectable: FromClause => query to create view as metadata: MetaData => metadata to listen for events on materialized: Boolean => whether to create standard or materialized view Returns: Table Table object bound to temporary MetaData object with columns as columns returned from selectable (essentially creates table as view) NOTE: For non-postgresql backends, creating a materialized view will result in a standard view, which cannot be indexed Preconditions: name is of type String selectable is of type FromClause metadata is of type Metadata materialized is of type Boolean ''' assert isinstance(name, str), 'Name is not of type String' assert isinstance(selectable, FromClause), 'Selectable is not of type FromClause' assert isinstance(metadata, MetaData), 'Metadata is not of type MetaData' assert isinstance(materialized, bool), 'Materialized is not of type Boolean' _tmp_mt = MetaData() tbl = Table(name, _tmp_mt) for c in selectable.c: tbl.append_column(Column(c.name, c.type, primary_key=c.primary_key)) listen(\ metadata,\ 'after_create',\ CreateMaterializedViewExpression(name, selectable) if materialized else CreateViewExpression(name, selectable)) listen(\ metadata,\ 'before_drop',\ DropMaterializedViewExpression(name) if materialized else DropViewExpression(name)) return tbl
def process_table(table: Table, engine: Engine, progargs: Any) -> None: tablename = table.name column_names = table.columns.keys() log.debug("TABLE: {}; COLUMNS: {}".format(tablename, column_names)) existing_pk_cols = get_pk_colnames(table) assert len(existing_pk_cols) < 2, ( "Table {} has >1 PK column; don't know what to do".format(tablename)) if existing_pk_cols and not get_effective_int_pk_col(table): raise ValueError("Table {} has a non-integer PK".format(repr(table))) adding_crate_pk = not existing_pk_cols required_cols = [CRATE_COL_PK] if not progargs.print else [] if progargs.drop_danger_drop: # --------------------------------------------------------------------- # DROP STUFF! Opposite order to creation (below) # --------------------------------------------------------------------- drop_indexes(engine, table, [CRATE_IDX_PK]) drop_columns(engine, table, [CRATE_COL_PK]) else: # --------------------------------------------------------------------- # CREATE STUFF! # --------------------------------------------------------------------- # SQL Server requires Table-bound columns in order to generate DDL: if adding_crate_pk: crate_pk_col = make_bigint_autoincrement_column( CRATE_COL_PK, engine.dialect) table.append_column(crate_pk_col) add_columns(engine, table, [crate_pk_col]) ensure_columns_present(engine, tablename=table.name, column_names=required_cols) add_indexes(engine, table, [{ 'index_name': CRATE_IDX_PK, 'column': CRATE_COL_PK, 'unique': True }])
def linktab(self):
    if not hasattr(self, '_linktab'):
        if self.engine.has_table(self.linktab_name):
            self._linktab = Table(self.linktab_name, self.meta, autoload=True)
        else:
            table = Table(self.linktab_name, self.meta)
            col = Column('view', Unicode, index=True)
            table.append_column(col)
            col = Column('serial', Unicode(40))
            table.append_column(col)
            col = Column('key', Unicode, index=True)
            table.append_column(col)
            col = Column('fingerprint', Unicode(255), index=True)
            table.append_column(col)
            table.create(self.engine)
            self._linktab = table
    return self._linktab
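# --- Hedged usage sketch for the lazily built linktab above; not part of the original
# source. `obj` stands for an instance of the owning class (not named here) and the row
# values are examples only, but the column names mirror those the property creates.
stmt = obj.linktab.insert().values(
    view='people_current',
    serial='0' * 40,                 # e.g. a SHA-1 hex digest
    key='person:42',
    fingerprint='jane doe, 1981',
)
obj.engine.execute(stmt)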
def create_table(self, schema_name, table_name, columns_configuration, drop_first): metadata = MetaData() table = Table(table_name, metadata, schema=schema_name) for column_configuration in columns_configuration: table.append_column( self.create_column(column_configuration["destination"])) table.append_column( Column( Providers.AuditColumnsNames.TIMESTAMP, DateTime(timezone=True), server_default=func.now(), )) table.append_column( Column( Providers.AuditColumnsNames.IS_DELETED, Boolean, server_default="f", default=False, )) table.append_column( Column(Providers.AuditColumnsNames.CHANGE_VERSION, BigInteger)) if drop_first: self.logger.debug(f"Dropping table {schema_name}.{table_name}") table.drop(self.target_db, checkfirst=True) self.logger.debug(f"Dropped table {schema_name}.{table_name}") self.logger.debug(f"Creating table {schema_name}.{table_name}") table.create(self.target_db, checkfirst=False) self.logger.debug(f"Created table {schema_name}.{table_name}") return
class FactTable(object): """ The ``FactTable`` serves as a controller object for a given ``Model``, handling the creation, filling and migration of the table schema associated with the dataset. """ def __init__(self, dataset): self.dataset = dataset self.bind = db.engine self.table_name = '%s__facts' % dataset.name self.meta = MetaData() self.meta.bind = self.bind self._table = None @property def table(self): """ Generate an appropriate table representation to mirror the fields known for this table. """ if self._table is None: self._table = Table(self.table_name, self.meta) id_col = Column('_id', Unicode(42), primary_key=True) self._table.append_column(id_col) json_col = Column('_json', Unicode()) self._table.append_column(json_col) self._fields_columns(self._table) return self._table @property def alias(self): """ An alias used for queries. """ if not hasattr(self, '_alias'): self._alias = self.table.alias('entry') return self._alias @property def mapping(self): if not hasattr(self, '_mapping'): self._mapping = {} for attribute in self.dataset.model.attributes: if attribute.column in self.alias.columns: col = self.alias.c[attribute.column] self._mapping[attribute.path] = col return self._mapping @property def exists(self): return db.engine.has_table(self.table.name) def _fields_columns(self, table): """ Transform the (auto-detected) fields into a set of column specifications. """ for field in self.dataset.fields: data_type = TYPES.get(field.get('type'), Unicode) col = Column(field.get('name'), data_type, nullable=True) table.append_column(col) def load_iter(self, iterable, chunk_size=1000): """ Bulk load all the data in an artifact to a matching database table. """ chunk = [] conn = self.bind.connect() tx = conn.begin() try: for i, record in enumerate(iterable): record['_id'] = i record['_json'] = json.dumps(record, default=json_default) chunk.append(record) if len(chunk) >= chunk_size: stmt = self.table.insert() conn.execute(stmt, chunk) chunk = [] if len(chunk): stmt = self.table.insert() conn.execute(stmt, chunk) tx.commit() except: tx.rollback() raise def create(self): """ Create the fact table if it does not exist. """ if not self.exists: self.table.create(self.bind) def drop(self): """ Drop the fact table if it does exist. """ if self.exists: self.table.drop() self._table = None def __repr__(self): return "<FactTable(%r)>" % (self.dataset)
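# --- Hedged usage sketch for FactTable.load_iter() above; not part of the original
# source. The `dataset` object is a placeholder, and the row keys ('amount', 'region')
# would need to match the dataset's detected fields; load_iter() itself fills in the
# bookkeeping _id and _json columns before each chunked bulk insert.
facts = FactTable(dataset)
facts.create()                                   # create "<name>__facts" if missing
rows = ({'amount': i * 10.0, 'region': 'north'} for i in range(5000))
facts.load_iter(rows, chunk_size=1000)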
def gen_sqla_info(cls, cls_bases=()): """Return SQLAlchemy table object corresponding to the passed Spyne object. Also maps given class to the returned table. """ metadata = cls.Attributes.sqla_metadata table_name = cls.Attributes.table_name inc = [] # include_properties # check inheritance inheritance = None base_class = getattr(cls, '__extends__', None) if base_class is None: for b in cls_bases: if getattr(b, '_type_info', None) is not None and b.__mixin__: base_class = b if base_class is not None: base_table_name = base_class.Attributes.table_name if base_table_name is not None: if base_table_name == table_name: inheritance = _SINGLE else: inheritance = _JOINED raise NotImplementedError("Joined table inheritance is not yet " "implemented.") inc_prop = base_class.Attributes.sqla_mapper.include_properties if inc_prop is not None: inc.extend(inc_prop) exc_prop = base_class.Attributes.sqla_mapper.exclude_properties if exc_prop is not None: inc = [_p for _p in inc if not _p in exc_prop] # check whether the object already has a table table = None if table_name in metadata.tables: table = metadata.tables[table_name] else: # We need FakeTable because table_args can contain all sorts of stuff # that can require a fully-constructed table, and we don't have that # information here yet. table = _FakeTable() # check whether the base classes are already mapped base_mapper = None if base_class is not None: base_mapper = base_class.Attributes.sqla_mapper if base_mapper is None: for b in cls_bases: bm = _mapper_registry.get(b, None) if bm is not None: assert base_mapper is None, "There can be only one base mapper." base_mapper = bm inheritance = _SINGLE props = {} # For each Spyne field for k, v in cls._type_info.items(): if v.Attributes.exc_table: continue col_args, col_kwargs = sanitize_args(v.Attributes.sqla_column_args) _sp_attrs_to_sqla_constraints(cls, v, col_kwargs) t = get_sqlalchemy_type(v) if t is None: p = getattr(v.Attributes, 'store_as', None) if p is not None and issubclass(v, Array) and isinstance(p, c_table): child_cust, = v._type_info.values() if child_cust.__orig__ is not None: child = child_cust.__orig__ else: child = child_cust if p.multi != False: # many to many col_own, col_child = _get_cols_m2m(cls, k, v, p.left, p.right) p.left = col_own.key p.right = col_child.key if p.multi == True: rel_table_name = '_'.join([cls.Attributes.table_name, k]) else: rel_table_name = p.multi # FIXME: Handle the case where the table already exists. rel_t = Table(rel_table_name, metadata, *(col_own, col_child)) props[k] = relationship(child, secondary=rel_t, backref=p.backref) elif issubclass(child, SimpleModel): # one to many simple type # get left (fk) column info _gen_col = _get_col_o2m(cls, p.left) col_info = _gen_col.next() # gets the column name p.left, child_left_col_type = col_info[0] # FIXME: Add support for multi-column primary keys. 
child_left_col_name = p.left # get right(data) column info child_right_col_type = get_sqlalchemy_type(child_cust) child_right_col_name = p.right # this is the data column if child_right_col_name is None: child_right_col_name = k # get table name child_table_name = child_cust.Attributes.table_name if child_table_name is None: child_table_name = '_'.join([table_name, k]) if child_table_name in metadata.tables: # table exists, get releavant info child_t = metadata.tables[child_table_name] assert child_right_col_type is \ child_t.c[child_right_col_name].type.__class__ assert child_left_col_type is \ child_t.c[child_left_col_name].type.__class__ child_right_col = child_t.c[child_right_col_name] child_left_col = child_t.c[child_left_col_name] else: # table does not exist, generate table child_right_col = Column(child_right_col_name, child_right_col_type) _sp_attrs_to_sqla_constraints(cls, child_cust, col=child_right_col) child_left_col = _gen_col.next() _sp_attrs_to_sqla_constraints(cls, child_cust, col=child_left_col) child_t = Table(child_table_name , metadata, Column('id', sqlalchemy.Integer, primary_key=True), child_left_col, child_right_col) # generate temporary class for association proxy cls_name = ''.join(x.capitalize() or '_' for x in child_table_name.split('_')) # generates camelcase class name. def _i(self, *args): setattr(self, child_right_col_name, args[0]) cls_ = type("_" + cls_name, (object,), {'__init__': _i}) own_mapper(cls_)(cls_, child_t) props["_" + k] = relationship(cls_) # generate association proxy setattr(cls, k, association_proxy("_" + k, child_right_col_name)) else: # one to many complex type _gen_col = _get_col_o2m(cls, p.right) col_info = _gen_col.next() # gets the column name p.right, col_type = col_info[0] # FIXME: Add support for multi-column primary keys. assert p.left is None, \ "'left' is ignored in one-to-many relationships " \ "with complex types (because they already have a " \ "table). You probably meant to use 'right'." child_t = child.__table__ if p.right in child_t.c: # FIXME: This branch MUST be tested. assert col_type is child_t.c[p.right].type.__class__ # if the column is there, the decision about whether # it should be in child's mapper should also have been # made. # # so, not adding the child column to to child mapper # here. 
col = child_t.c[p.right] else: col = _gen_col.next() _sp_attrs_to_sqla_constraints(cls, child_cust, col=col) child_t.append_column(col) child.__mapper__.add_property(col.name, col) props[k] = relationship(child, foreign_keys=[col], backref=p.backref) elif p is not None and issubclass(v, ComplexModelBase): # v has the Attribute values we need whereas real_v is what the # user instantiates (thus what sqlalchemy needs) if v.__orig__ is None: # vanilla class real_v = v else: # customized class real_v = v.__orig__ if isinstance(p, c_table): assert not getattr(p, 'multi', False), ( 'Storing a single element-type using a ' 'relation table is pointless.') assert p.right is None, "'right' is ignored in a one-to-one " \ "relationship" col = _get_col_o2o(cls, k, v, p.left) rel = relationship(real_v, uselist=False, foreign_keys=[col], backref=p.backref) p.left = col.key props[k] = rel elif isinstance(p, c_xml): if k in table.c: col = table.c[k] else: col = Column(k, PGObjectXml(v, p.root_tag, p.no_ns), *col_args, **col_kwargs) elif isinstance(p, c_json): if k in table.c: col = table.c[k] else: col = Column(k, PGObjectJson(v, ignore_wrappers=p.ignore_wrappers, complex_as=p.complex_as ), *col_args, **col_kwargs ) elif isinstance(p, c_msgpack): raise NotImplementedError() else: raise ValueError(p) props[col.name] = col if not k in table.c: table.append_column(col) else: logger.debug("Skipping %s.%s.%s: %r, store_as: %r" % ( cls.get_namespace(), cls.get_type_name(), k, v, p)) else: unique = v.Attributes.unique index = v.Attributes.index if unique and not index: index = True try: index_name, index_method = v.Attributes.index except (TypeError, ValueError): index_name = "%s_%s%s" % (table_name, k, '_unique' if unique else '') index_method = v.Attributes.index if k in table.c: col = table.c[k] else: col = Column(k, t, *col_args, **col_kwargs) table.append_column(col) if index in (False, None): pass else: if index == True: index_args = (index_name, col), dict(unique=unique) else: index_args = (index_name, col), dict(unique=unique, postgresql_using=index_method) if isinstance(table, _FakeTable): table.indexes.append(index_args) else: Index(*index_args[0], **index_args[1]) if not v.Attributes.exc_mapper: props[k] = col if isinstance(table, _FakeTable): _table = table table_args, table_kwargs = sanitize_args(cls.Attributes.sqla_table_args) table = Table(table_name, metadata, *(tuple(table.columns) + table_args), **table_kwargs) for index_args, index_kwargs in _table.indexes: Index(*index_args, **index_kwargs) del _table # Map the table to the object mapper_args, mapper_kwargs = sanitize_args(cls.Attributes.sqla_mapper_args) _props = mapper_kwargs.get('properties', None) if _props is None: mapper_kwargs['properties'] = props else: props.update(_props) mapper_kwargs['properties'] = props _inc = mapper_kwargs.get('include_properties', None) if _inc is None: mapper_kwargs['include_properties'] = inc + props.keys() po = mapper_kwargs.get('polymorphic_on', None) if po is not None: if not isinstance(po, Column): mapper_kwargs['polymorphic_on'] = table.c[po] else: del mapper_kwargs['polymorphic_on'] if base_mapper is not None: mapper_kwargs['inherits'] = base_mapper if inheritance is not _SINGLE: mapper_args = (table,) + mapper_args cls_mapper = mapper(cls, *mapper_args, **mapper_kwargs) def my_load_listener(target, context): d = target.__dict__ for k, v in cls.get_flat_type_info(cls).items(): if not k in d: if isclass(v) and issubclass(v, ComplexModelBase): pass else: d[k] = None event.listen(cls, 'load', 
my_load_listener) cls.__tablename__ = cls.Attributes.table_name cls.Attributes.sqla_mapper = cls.__mapper__ = cls_mapper cls.Attributes.sqla_table = cls.__table__ = table return table
def gen_sqla_info(cls, cls_bases=()): """Return SQLAlchemy table object corresponding to the passed Spyne object. Also maps given class to the returned table. """ metadata = cls.Attributes.sqla_metadata table_name = cls.Attributes.table_name inc = [] # include_properties # check inheritance inheritance = None base_class = getattr(cls, '__extends__', None) if base_class is None: for b in cls_bases: if getattr(b, '_type_info', None) is not None and b.__mixin__: base_class = b if base_class is not None: base_table_name = base_class.Attributes.table_name if base_table_name is not None: if base_table_name == table_name: inheritance = _SINGLE else: inheritance = _JOINED raise NotImplementedError( "Joined table inheritance is not yet " "implemented.") inc_prop = base_class.Attributes.sqla_mapper.include_properties if inc_prop is not None: inc.extend(inc_prop) exc_prop = base_class.Attributes.sqla_mapper.exclude_properties if exc_prop is not None: inc = [_p for _p in inc if not _p in exc_prop] # check whether the object already has a table table = None if table_name in metadata.tables: table = metadata.tables[table_name] else: # We need FakeTable because table_args can contain all sorts of stuff # that can require a fully-constructed table, and we don't have that # information here yet. table = _FakeTable() # check whether the base classes are already mapped base_mapper = None if base_class is not None: base_mapper = base_class.Attributes.sqla_mapper if base_mapper is None: for b in cls_bases: bm = _mapper_registry.get(b, None) if bm is not None: assert base_mapper is None, "There can be only one base mapper." base_mapper = bm inheritance = _SINGLE props = {} # For each Spyne field for k, v in cls._type_info.items(): if v.Attributes.exc_table: continue col_args, col_kwargs = sanitize_args(v.Attributes.sqla_column_args) _sp_attrs_to_sqla_constraints(cls, v, col_kwargs) t = get_sqlalchemy_type(v) if t is None: p = getattr(v.Attributes, 'store_as', None) if p is not None and issubclass(v, Array) and isinstance( p, c_table): child_cust, = v._type_info.values() if child_cust.__orig__ is not None: child = child_cust.__orig__ else: child = child_cust if p.multi != False: # many to many col_own, col_child = _get_cols_m2m(cls, k, v, p.left, p.right) p.left = col_own.key p.right = col_child.key if p.multi == True: rel_table_name = '_'.join( [cls.Attributes.table_name, k]) else: rel_table_name = p.multi # FIXME: Handle the case where the table already exists. rel_t = Table(rel_table_name, metadata, *(col_own, col_child)) props[k] = relationship(child, secondary=rel_t, backref=p.backref) elif issubclass(child, SimpleModel): # one to many simple type # get left (fk) column info _gen_col = _get_col_o2m(cls, p.left) col_info = _gen_col.next() # gets the column name p.left, child_left_col_type = col_info[ 0] # FIXME: Add support for multi-column primary keys. 
child_left_col_name = p.left # get right(data) column info child_right_col_type = get_sqlalchemy_type(child_cust) child_right_col_name = p.right # this is the data column if child_right_col_name is None: child_right_col_name = k # get table name child_table_name = child_cust.Attributes.table_name if child_table_name is None: child_table_name = '_'.join([table_name, k]) if child_table_name in metadata.tables: # table exists, get releavant info child_t = metadata.tables[child_table_name] assert child_right_col_type is \ child_t.c[child_right_col_name].type.__class__ assert child_left_col_type is \ child_t.c[child_left_col_name].type.__class__ child_right_col = child_t.c[child_right_col_name] child_left_col = child_t.c[child_left_col_name] else: # table does not exist, generate table child_right_col = Column(child_right_col_name, child_right_col_type) _sp_attrs_to_sqla_constraints(cls, child_cust, col=child_right_col) child_left_col = _gen_col.next() _sp_attrs_to_sqla_constraints(cls, child_cust, col=child_left_col) child_t = Table( child_table_name, metadata, Column('id', sqlalchemy.Integer, primary_key=True), child_left_col, child_right_col) # generate temporary class for association proxy cls_name = ''.join(x.capitalize() or '_' for x in child_table_name.split('_')) # generates camelcase class name. def _i(self, *args): setattr(self, child_right_col_name, args[0]) cls_ = type("_" + cls_name, (object, ), {'__init__': _i}) own_mapper(cls_)(cls_, child_t) props["_" + k] = relationship(cls_) # generate association proxy setattr(cls, k, association_proxy("_" + k, child_right_col_name)) else: # one to many complex type _gen_col = _get_col_o2m(cls, p.right) col_info = _gen_col.next() # gets the column name p.right, col_type = col_info[ 0] # FIXME: Add support for multi-column primary keys. assert p.left is None, \ "'left' is ignored in one-to-many relationships " \ "with complex types (because they already have a " \ "table). You probably meant to use 'right'." child_t = child.__table__ if p.right in child_t.c: # FIXME: This branch MUST be tested. assert col_type is child_t.c[p.right].type.__class__ # if the column is there, the decision about whether # it should be in child's mapper should also have been # made. # # so, not adding the child column to to child mapper # here. 
col = child_t.c[p.right] else: col = _gen_col.next() _sp_attrs_to_sqla_constraints(cls, child_cust, col=col) child_t.append_column(col) child.__mapper__.add_property(col.name, col) props[k] = relationship(child, foreign_keys=[col], backref=p.backref) elif p is not None and issubclass(v, ComplexModelBase): # v has the Attribute values we need whereas real_v is what the # user instantiates (thus what sqlalchemy needs) if v.__orig__ is None: # vanilla class real_v = v else: # customized class real_v = v.__orig__ if isinstance(p, c_table): assert not getattr(p, 'multi', False), ( 'Storing a single element-type using a ' 'relation table is pointless.') assert p.right is None, "'right' is ignored in a one-to-one " \ "relationship" col = _get_col_o2o(cls, k, v, p.left) rel = relationship(real_v, uselist=False, foreign_keys=[col], backref=p.backref) p.left = col.key props[k] = rel _gen_index_info(table, table_name, col, k, v) elif isinstance(p, c_xml): if k in table.c: col = table.c[k] else: col = Column(k, PGObjectXml(v, p.root_tag, p.no_ns), *col_args, **col_kwargs) elif isinstance(p, c_json): if k in table.c: col = table.c[k] else: col = Column( k, PGObjectJson(v, ignore_wrappers=p.ignore_wrappers, complex_as=p.complex_as), *col_args, **col_kwargs) elif isinstance(p, c_msgpack): raise NotImplementedError() else: raise ValueError(p) props[col.name] = col if not k in table.c: table.append_column(col) else: logger.debug( "Skipping %s.%s.%s: %r, store_as: %r" % (cls.get_namespace(), cls.get_type_name(), k, v, p)) else: if k in table.c: col = table.c[k] else: col = Column(k, t, *col_args, **col_kwargs) table.append_column(col) _gen_index_info(table, table_name, col, k, v) if not v.Attributes.exc_mapper: props[k] = col if isinstance(table, _FakeTable): _table = table table_args, table_kwargs = sanitize_args( cls.Attributes.sqla_table_args) table = Table(table_name, metadata, *(tuple(table.columns) + table_args), **table_kwargs) for index_args, index_kwargs in _table.indexes: Index(*index_args, **index_kwargs) del _table # Map the table to the object mapper_args, mapper_kwargs = sanitize_args(cls.Attributes.sqla_mapper_args) _props = mapper_kwargs.get('properties', None) if _props is None: mapper_kwargs['properties'] = props else: props.update(_props) mapper_kwargs['properties'] = props _inc = mapper_kwargs.get('include_properties', None) if _inc is None: mapper_kwargs['include_properties'] = inc + props.keys() po = mapper_kwargs.get('polymorphic_on', None) if po is not None: if not isinstance(po, Column): mapper_kwargs['polymorphic_on'] = table.c[po] else: del mapper_kwargs['polymorphic_on'] if base_mapper is not None: mapper_kwargs['inherits'] = base_mapper if inheritance is not _SINGLE: mapper_args = (table, ) + mapper_args cls_mapper = mapper(cls, *mapper_args, **mapper_kwargs) def my_load_listener(target, context): d = target.__dict__ for k, v in cls.get_flat_type_info(cls).items(): if not k in d: if isclass(v) and issubclass(v, ComplexModelBase): pass else: d[k] = None event.listen(cls, 'load', my_load_listener) cls.__tablename__ = cls.Attributes.table_name cls.Attributes.sqla_mapper = cls.__mapper__ = cls_mapper cls.Attributes.sqla_table = cls.__table__ = table return table
def _copy_table(table):
    ret_table = Table(table.name, MetaData())
    for c in table.columns:
        ret_table.append_column(_copy_column(c))
    return ret_table
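# --- The _copy_column() helper used above is not shown; below is a minimal sketch of
# what such a helper typically looks like (an assumption, not the original
# implementation).
import sqlalchemy as sa

def _copy_column(column):
    # Re-create the column detached from its source table so it can be appended to a
    # new Table object; only the most common attributes are carried over.
    return sa.Column(column.name, column.type,
                     primary_key=column.primary_key,
                     nullable=column.nullable)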
class SQLTable(Component): _selects = 0 _inserts = 0 _finalized = False def __init__(self): super(SQLTable, self).__init__() self._pk = False self.name = None self.connection = None self.columns = [ ] self.create = True self.sa_table = None self.sa_metadata = None self._selects = 0 self._inserts = 0 self._unicode_errors = 0 def _get_sa_type(self, column): if (column["type"] == "Integer"): return Integer elif (column["type"] == "String"): if (not "length" in column): column["length"] = 128 return Unicode(length = column["length"]) elif (column["type"] == "Float"): return Float elif (column["type"] == "Boolean"): return Boolean elif (column["type"] == "AutoIncrement"): return Integer else: raise Exception("Invalid data type: %s" % column["type"]) def finalize(self, ctx): if (not SQLTable._finalized): SQLTable._finalized = True if (SQLTable._inserts + SQLTable._selects > 0): logger.info("SQLTable Totals inserts/selects: %d/%d " % (SQLTable._inserts, SQLTable._selects)) if (self._inserts + self._selects > 0): logger.info("SQLTable %-18s inserts/selects: %6d/%-6d " % (self.name, self._inserts, self._selects)) if (self._unicode_errors > 0): logger.warn("SQLTable %s found %d warnings assigning non-unicode fields to unicode columns" % (self.name, self._unicode_errors)) ctx.comp.finalize(self.connection) super(SQLTable, self).finalize(ctx) def initialize(self, ctx): super(SQLTable, self).initialize(ctx) ctx.comp.initialize(self.connection) logger.debug("Loading table %s on %s" % (self.name, self)) self.sa_metadata = MetaData() self.sa_table = Table(self.name, self.sa_metadata) # Drop? columns_ex = [] for column in self.columns: # Check for duplicate names if (column["name"] in columns_ex): raise Exception("Duplicate column name %s in %s" % (column["name"], self)) columns_ex.append(column["name"]) # Configure column column["pk"] = False if (not "pk" in column) else parsebool(column["pk"]) if (not "type" in column): column["type"] = "String" #if (not "value" in column): column["value"] = None logger.debug("Adding column %s" % column) self.sa_table.append_column( Column(column["name"], self._get_sa_type(column), primary_key = column["pk"], autoincrement = (True if column["type"] == "AutoIncrement" else False) )) # Check schema # Create if doesn't exist if (not self.connection.engine().has_table(self.name)): logger.info("Creating table %s" % self.name) self.sa_table.create(self.connection.connection()) # Extend? # Delete columns? def pk(self, ctx): """ Returns the primary key column definitToClauion, or None if none defined. 
""" if (self._pk == False): pk_cols = [] for col in self.columns: if ("pk" in col): if parsebool(col["pk"]): pk_cols.append(col) if (len(pk_cols) > 1): raise Exception("Table %s has multiple primary keys: %s" % (self.name, pk_cols)) elif (len(pk_cols) == 1): self._pk = pk_cols[0] else: self._pk = None return self._pk def _attribsToClause(self, attribs): clauses = [] for k, v in attribs.items(): if isinstance(v, (list, tuple)): clauses.append(self.sa_table.c[k].in_(v)) else: clauses.append(self.sa_table.c[k] == v) return and_(*clauses) def _rowtodict(self, row): d = {} for column in self.columns: d[column["name"]] = getattr(row, column["name"]) return d def _find(self, ctx, attribs): self._selects = self._selects + 1 SQLTable._selects = SQLTable._selects + 1 query = self.sa_table.select(self._attribsToClause(attribs)) rows = self.connection.connection().execute(query) for r in rows: # Ensure we return dicts, not RowProxys from SqlAlchemy yield self._rowtodict(r) def lookup(self, ctx, attribs): logger.debug ("Lookup on '%s' attribs: %s" % (self, attribs)) if (len(attribs.keys()) == 0): raise Exception("Cannot lookup on table with no criteria (empty attribute set)") rows = self._find(ctx, attribs) rows = list(rows) if (len(rows) > 1): raise Exception("Found more than one row when searching for just one in table %s: %s" % (self.name, attribs)) elif (len(rows) == 1): row = rows[0] else: row = None logger.debug("Lookup result on %s: %s = %s" % (self.name, attribs, row)) return row def upsert(self, ctx, data, keys = []): # TODO: Check for AutoIncrement in keys, shall not be used # If keys qfilter = {} if (len(keys) > 0): for key in keys: try: qfilter[key] = data[key] except KeyError as e: raise Exception("Could not find attribute '%s' in data when storing row data: %s" % (key, data)) row = self.lookup(ctx, qfilter) if (row): return row row_with_id = self.insert(ctx, data) return row_with_id def _prepare_row(self, ctx, data): row = {} for column in self.columns: if (column["type"] != "AutoIncrement"): try: row[column["name"]] = data[column["name"]] except KeyError, e: raise Exception("Missing attribute for column %s in table '%s' while inserting row: %s" % (e, self.name, data)) # Checks if ((column["type"] == "String") and (not isinstance(row[column["name"]], unicode))): self._unicode_errors = self._unicode_errors + 1 if (ctx.debug): logger.warn("Unicode column %r received non-unicode string: %r " % (column["name"], row[column["name"]])) return row
class Tabular(object): def __init__(self, schema): self.schema = schema self.bind = db.engine self.meta = MetaData() self.meta.bind = self.bind self._table = None @property def table(self): """ Generate an appropriate table representation to mirror the fields known for this table. """ if self._table is None: self._table = Table(self.schema.table_name, self.meta) id_col = Column("_id", Unicode(42), primary_key=True) self._table.append_column(id_col) for column in self.schema.columns: column = Column(column.name, Unicode, nullable=True) self._table.append_column(column) return self._table @property def exists(self): return db.engine.has_table(self.table.name) def load_iter(self, iterable, chunk_size=5000): """ Bulk load all the data in an artifact to a matching database table. """ chunk = [] conn = self.bind.connect() tx = conn.begin() try: for i, record in enumerate(iterable): record["_id"] = i chunk.append(record) if len(chunk) >= chunk_size: stmt = self.table.insert() conn.execute(stmt, chunk) chunk = [] if len(chunk): stmt = self.table.insert() conn.execute(stmt, chunk) tx.commit() except: tx.rollback() raise def create(self): """ Create the fact table if it does not exist. """ if not self.exists: self.table.create(self.bind) def drop(self): """ Drop the fact table if it does exist. """ if self.exists: self.table.drop() self._table = None def to_dict(self): return self.schema.to_dict() def __len__(self): if not hasattr(self, "_count"): q = select(columns=func.count(self.table.c._id), from_obj=self.table) rp = db.engine.execute(q) self._count = rp.scalar() return self._count def __iter__(self): q = select(columns=self.table.c, from_obj=self.table) rp = db.engine.execute(q) while True: rows = rp.fetchmany(2000) if not rows: return for row in rows: yield OrderedDict(row.items()) def __repr__(self): return "<Tabular(%r)>" % self.document
class Table(object): """Represents a table in a database and exposes common operations.""" PRIMARY_DEFAULT = "id" def __init__( self, database, table_name, primary_id=None, primary_type=None, primary_increment=None, auto_create=False, ): """Initialise the table from database schema.""" self.db = database self.name = normalize_table_name(table_name) self._table = None self._columns = None self._indexes = [] self._primary_id = (primary_id if primary_id is not None else self.PRIMARY_DEFAULT) self._primary_type = primary_type if primary_type is not None else Types.integer if primary_increment is None: primary_increment = self._primary_type in (Types.integer, Types.bigint) self._primary_increment = primary_increment self._auto_create = auto_create @property def exists(self): """Check to see if the table currently exists in the database.""" if self._table is not None: return True return self.name in self.db @property def table(self): """Get a reference to the table, which may be reflected or created.""" if self._table is None: self._sync_table(()) return self._table @property def _column_keys(self): """Get a dictionary of all columns and their case mapping.""" if not self.exists: return {} with self.db.lock: if self._columns is None: # Initialise the table if it doesn't exist table = self.table self._columns = {} for column in table.columns: name = normalize_column_name(column.name) key = normalize_column_key(name) if key in self._columns: log.warning("Duplicate column: %s", name) self._columns[key] = name return self._columns @property def columns(self): """Get a listing of all columns that exist in the table.""" return list(self._column_keys.values()) def has_column(self, column): """Check if a column with the given name exists on this table.""" key = normalize_column_key(normalize_column_name(column)) return key in self._column_keys def _get_column_name(self, name): """Find the best column name with case-insensitive matching.""" name = normalize_column_name(name) key = normalize_column_key(name) return self._column_keys.get(key, name) def insert(self, row, ensure=None, types=None): """Add a ``row`` dict by inserting it into the table. If ``ensure`` is set, any of the keys of the row are not table columns, they will be created automatically. During column creation, ``types`` will be checked for a key matching the name of a column to be created, and the given SQLAlchemy column type will be used. Otherwise, the type is guessed from the row value, defaulting to a simple unicode field. :: data = dict(title='I am a banana!') table.insert(data) Returns the inserted row's primary key. """ row = self._sync_columns(row, ensure, types=types) res = self.db.executable.execute(self.table.insert(row)) if len(res.inserted_primary_key) > 0: return res.inserted_primary_key[0] return True def insert_ignore(self, row, keys, ensure=None, types=None): """Add a ``row`` dict into the table if the row does not exist. If rows with matching ``keys`` exist no change is made. Setting ``ensure`` results in automatically creating missing columns, i.e., keys of the row are not table columns. During column creation, ``types`` will be checked for a key matching the name of a column to be created, and the given SQLAlchemy column type will be used. Otherwise, the type is guessed from the row value, defaulting to a simple unicode field. 
:: data = dict(id=10, title='I am a banana!') table.insert_ignore(data, ['id']) """ row = self._sync_columns(row, ensure, types=types) if self._check_ensure(ensure): self.create_index(keys) args, _ = self._keys_to_args(row, keys) if self.count(**args) == 0: return self.insert(row, ensure=False) return False def insert_many(self, rows, chunk_size=1000, ensure=None, types=None): """Add many rows at a time. This is significantly faster than adding them one by one. Per default the rows are processed in chunks of 1000 per commit, unless you specify a different ``chunk_size``. See :py:meth:`insert() <dataset.Table.insert>` for details on the other parameters. :: rows = [dict(name='Dolly')] * 10000 table.insert_many(rows) """ # Sync table before inputting rows. sync_row = {} for row in rows: # Only get non-existing columns. sync_keys = list(sync_row.keys()) for key in [k for k in row.keys() if k not in sync_keys]: # Get a sample of the new column(s) from the row. sync_row[key] = row[key] self._sync_columns(sync_row, ensure, types=types) # Get columns name list to be used for padding later. columns = sync_row.keys() chunk = [] for index, row in enumerate(rows): chunk.append(row) # Insert when chunk_size is fulfilled or this is the last row if len(chunk) == chunk_size or index == len(rows) - 1: chunk = pad_chunk_columns(chunk, columns) self.table.insert().execute(chunk) chunk = [] def update(self, row, keys, ensure=None, types=None, return_count=False): """Update a row in the table. The update is managed via the set of column names stated in ``keys``: they will be used as filters for the data to be updated, using the values in ``row``. :: # update all entries with id matching 10, setting their title # columns data = dict(id=10, title='I am a banana!') table.update(data, ['id']) If keys in ``row`` update columns not present in the table, they will be created based on the settings of ``ensure`` and ``types``, matching the behavior of :py:meth:`insert() <dataset.Table.insert>`. """ row = self._sync_columns(row, ensure, types=types) args, row = self._keys_to_args(row, keys) clause = self._args_to_clause(args) if not len(row): return self.count(clause) stmt = self.table.update(whereclause=clause, values=row) rp = self.db.executable.execute(stmt) if rp.supports_sane_rowcount(): return rp.rowcount if return_count: return self.count(clause) def update_many(self, rows, keys, chunk_size=1000, ensure=None, types=None): """Update many rows in the table at a time. This is significantly faster than updating them one by one. Per default the rows are processed in chunks of 1000 per commit, unless you specify a different ``chunk_size``. See :py:meth:`update() <dataset.Table.update>` for details on the other parameters. """ keys = ensure_list(keys) chunk = [] columns = [] for index, row in enumerate(rows): chunk.append(row) for col in row.keys(): if col not in columns: columns.append(col) # bindparam requires names to not conflict (cannot be "id" for id) for key in keys: row["_%s" % key] = row[key] # Update when chunk_size is fulfilled or this is the last row if len(chunk) == chunk_size or index == len(rows) - 1: cl = [self.table.c[k] == bindparam("_%s" % k) for k in keys] stmt = self.table.update( whereclause=and_(*cl), values={ col: bindparam(col, required=False) for col in columns }, ) self.db.executable.execute(stmt, chunk) chunk = [] def upsert(self, row, keys, ensure=None, types=None): """An UPSERT is a smart combination of insert and update. 
If rows with matching ``keys`` exist they will be updated, otherwise a new row is inserted in the table. :: data = dict(id=10, title='I am a banana!') table.upsert(data, ['id']) """ row = self._sync_columns(row, ensure, types=types) if self._check_ensure(ensure): self.create_index(keys) row_count = self.update(row, keys, ensure=False, return_count=True) if row_count == 0: return self.insert(row, ensure=False) return True def upsert_many(self, rows, keys, chunk_size=1000, ensure=None, types=None): """ Sorts multiple input rows into upserts and inserts. Inserts are passed to insert and upserts are updated. See :py:meth:`upsert() <dataset.Table.upsert>` and :py:meth:`insert_many() <dataset.Table.insert_many>`. """ # Removing a bulk implementation in 5e09aba401. Doing this one by one # is incredibly slow, but doesn't run into issues with column creation. for row in rows: self.upsert(row, keys, ensure=ensure, types=types) def delete(self, *clauses, **filters): """Delete rows from the table. Keyword arguments can be used to add column-based filters. The filter criterion will always be equality: :: table.delete(place='Berlin') If no arguments are given, all records are deleted. """ if not self.exists: return False clause = self._args_to_clause(filters, clauses=clauses) stmt = self.table.delete(whereclause=clause) rp = self.db.executable.execute(stmt) return rp.rowcount > 0 def _reflect_table(self): """Load the tables definition from the database.""" with self.db.lock: self._columns = None try: self._table = SQLATable(self.name, self.db.metadata, schema=self.db.schema, autoload=True) except NoSuchTableError: self._table = None def _threading_warn(self): if self.db.in_transaction and threading.active_count() > 1: warnings.warn( "Changing the database schema inside a transaction " "in a multi-threaded environment is likely to lead " "to race conditions and synchronization issues.", RuntimeWarning, ) def _sync_table(self, columns): """Lazy load, create or adapt the table structure in the database.""" if self._table is None: # Load an existing table from the database. self._reflect_table() if self._table is None: # Create the table with an initial set of columns. if not self._auto_create: raise DatasetException("Table does not exist: %s" % self.name) # Keep the lock scope small because this is run very often. with self.db.lock: self._threading_warn() self._table = SQLATable(self.name, self.db.metadata, schema=self.db.schema) if self._primary_id is not False: # This can go wrong on DBMS like MySQL and SQLite where # tables cannot have no columns. column = Column( self._primary_id, self._primary_type, primary_key=True, autoincrement=self._primary_increment, ) self._table.append_column(column) for column in columns: if not column.name == self._primary_id: self._table.append_column(column) self._table.create(self.db.executable, checkfirst=True) self._columns = None elif len(columns): with self.db.lock: self._reflect_table() self._threading_warn() for column in columns: if not self.has_column(column.name): self.db.op.add_column(self.name, column, self.db.schema) self._reflect_table() def _sync_columns(self, row, ensure, types=None): """Create missing columns (or the table) prior to writes. If automatic schema generation is disabled (``ensure`` is ``False``), this will remove any keys from the ``row`` for which there is no matching column. 
""" ensure = self._check_ensure(ensure) types = types or {} types = {self._get_column_name(k): v for (k, v) in types.items()} out = {} sync_columns = {} for name, value in row.items(): name = self._get_column_name(name) if self.has_column(name): out[name] = value elif ensure: _type = types.get(name) if _type is None: _type = self.db.types.guess(value) sync_columns[name] = Column(name, _type) out[name] = value self._sync_table(sync_columns.values()) return out def _check_ensure(self, ensure): if ensure is None: return self.db.ensure_schema return ensure def _generate_clause(self, column, op, value): if op in ("like", ): return self.table.c[column].like(value) if op in ("ilike", ): return self.table.c[column].ilike(value) if op in ("notlike", ): return self.table.c[column].notlike(value) if op in ("notilike", ): return self.table.c[column].notilike(value) if op in (">", "gt"): return self.table.c[column] > value if op in ("<", "lt"): return self.table.c[column] < value if op in (">=", "gte"): return self.table.c[column] >= value if op in ("<=", "lte"): return self.table.c[column] <= value if op in ("=", "==", "is"): return self.table.c[column] == value if op in ("!=", "<>", "not"): return self.table.c[column] != value if op in ("in", ): return self.table.c[column].in_(value) if op in ("notin", ): return self.table.c[column].notin_(value) if op in ("between", ".."): start, end = value return self.table.c[column].between(start, end) if op in ("startswith", ): return self.table.c[column].like("%" + value) if op in ("endswith", ): return self.table.c[column].like(value + "%") return false() def _args_to_clause(self, args, clauses=()): clauses = list(clauses) for column, value in args.items(): column = self._get_column_name(column) if not self.has_column(column): clauses.append(false()) elif isinstance(value, (list, tuple, set)): clauses.append(self._generate_clause(column, "in", value)) elif isinstance(value, dict): for op, op_value in value.items(): clauses.append(self._generate_clause(column, op, op_value)) else: clauses.append(self._generate_clause(column, "=", value)) return and_(*clauses) def _args_to_order_by(self, order_by): orderings = [] for ordering in ensure_list(order_by): if ordering is None: continue column = ordering.lstrip("-") column = self._get_column_name(column) if not self.has_column(column): continue if ordering.startswith("-"): orderings.append(self.table.c[column].desc()) else: orderings.append(self.table.c[column].asc()) return orderings def _keys_to_args(self, row, keys): keys = [self._get_column_name(k) for k in ensure_list(keys)] row = row.copy() args = {k: row.pop(k, None) for k in keys} return args, row def create_column(self, name, type, **kwargs): """Create a new column ``name`` of a specified type. :: table.create_column('created_at', db.types.datetime) `type` corresponds to an SQLAlchemy type as described by `dataset.db.Types`. Additional keyword arguments are passed to the constructor of `Column`, so that default values, and options like `nullable` and `unique` can be set. :: table.create_column('key', unique=True, nullable=False) table.create_column('food', default='banana') """ name = self._get_column_name(name) if self.has_column(name): log.debug("Column exists: %s" % name) return self._sync_table((Column(name, type, **kwargs), )) def create_column_by_example(self, name, value): """ Explicitly create a new column ``name`` with a type that is appropriate to store the given example ``value``. 
The type is guessed in the same way as for the insert method with ``ensure=True``. :: table.create_column_by_example('length', 4.2) If a column of the same name already exists, no action is taken, even if it is not of the type we would have created. """ type_ = self.db.types.guess(value) self.create_column(name, type_) def drop_column(self, name): """ Drop the column ``name``. :: table.drop_column('created_at') """ if self.db.engine.dialect.name == "sqlite": raise RuntimeError("SQLite does not support dropping columns.") name = self._get_column_name(name) with self.db.lock: if not self.exists or not self.has_column(name): log.debug("Column does not exist: %s", name) return self._threading_warn() self.db.op.drop_column(self.table.name, name, self.table.schema) self._reflect_table() def drop(self): """Drop the table from the database. Deletes both the schema and all the contents within it. """ with self.db.lock: if self.exists: self._threading_warn() self.table.drop(self.db.executable, checkfirst=True) self._table = None self._columns = None self.db._tables.pop(self.name, None) def has_index(self, columns): """Check if an index exists to cover the given ``columns``.""" if not self.exists: return False columns = set([self._get_column_name(c) for c in columns]) if columns in self._indexes: return True for column in columns: if not self.has_column(column): return False indexes = self.db.inspect.get_indexes(self.name, schema=self.db.schema) for index in indexes: if columns == set(index.get("column_names", [])): self._indexes.append(columns) return True return False def create_index(self, columns, name=None, **kw): """Create an index to speed up queries on a table. If no ``name`` is given a random name is created. :: table.create_index(['name', 'country']) """ columns = [self._get_column_name(c) for c in ensure_list(columns)] with self.db.lock: if not self.exists: raise DatasetException("Table has not been created yet.") for column in columns: if not self.has_column(column): return if not self.has_index(columns): self._threading_warn() name = name or index_name(self.name, columns) columns = [self.table.c[c] for c in columns] # MySQL crashes out if you try to index very long text fields, # apparently. This defines (a somewhat random) prefix that # will be captured by the index, after which I assume the engine # conducts a more linear scan: mysql_length = {} for col in columns: if isinstance(col.type, MYSQL_LENGTH_TYPES): mysql_length[col.name] = 10 kw["mysql_length"] = mysql_length idx = Index(name, *columns, **kw) idx.create(self.db.executable) def find(self, *_clauses, **kwargs): """Perform a simple search on the table. Simply pass keyword arguments as ``filter``. :: results = table.find(country='France') results = table.find(country='France', year=1980) Using ``_limit``:: # just return the first 10 rows results = table.find(country='France', _limit=10) You can sort the results by single or multiple columns. Append a minus sign to the column name for descending order:: # sort results by a column 'year' results = table.find(country='France', order_by='year') # return all rows sorted by multiple columns (descending by year) results = table.find(order_by=['country', '-year']) You can also submit filters based on criteria other than equality, see :ref:`advanced_filters` for details. To run more complex queries with JOINs, or to perform GROUP BY-style aggregation, you can also use :py:meth:`db.query() <dataset.Database.query>` to run raw SQL queries instead. 
""" if not self.exists: return iter([]) _limit = kwargs.pop("_limit", None) _offset = kwargs.pop("_offset", 0) order_by = kwargs.pop("order_by", None) _streamed = kwargs.pop("_streamed", False) _step = kwargs.pop("_step", QUERY_STEP) if _step is False or _step == 0: _step = None order_by = self._args_to_order_by(order_by) args = self._args_to_clause(kwargs, clauses=_clauses) query = self.table.select(whereclause=args, limit=_limit, offset=_offset) if len(order_by): query = query.order_by(*order_by) conn = self.db.executable if _streamed: conn = self.db.engine.connect() conn = conn.execution_options(stream_results=True) return ResultIter(conn.execute(query), row_type=self.db.row_type, step=_step) def find_one(self, *args, **kwargs): """Get a single result from the table. Works just like :py:meth:`find() <dataset.Table.find>` but returns one result, or ``None``. :: row = table.find_one(country='United States') """ if not self.exists: return None kwargs["_limit"] = 1 kwargs["_step"] = None resiter = self.find(*args, **kwargs) try: for row in resiter: return row finally: resiter.close() def count(self, *_clauses, **kwargs): """Return the count of results for the given filter set.""" # NOTE: this does not have support for limit and offset since I can't # see how this is useful. Still, there might be compatibility issues # with people using these flags. Let's see how it goes. if not self.exists: return 0 args = self._args_to_clause(kwargs, clauses=_clauses) query = select([func.count()], whereclause=args) query = query.select_from(self.table) rp = self.db.executable.execute(query) return rp.fetchone()[0] def __len__(self): """Return the number of rows in the table.""" return self.count() def distinct(self, *args, **_filter): """Return all the unique (distinct) values for the given ``columns``. :: # returns only one row per year, ignoring the rest table.distinct('year') # works with multiple columns, too table.distinct('year', 'country') # you can also combine this with a filter table.distinct('year', country='China') """ if not self.exists: return iter([]) columns = [] clauses = [] for column in args: if isinstance(column, ClauseElement): clauses.append(column) else: if not self.has_column(column): raise DatasetException("No such column: %s" % column) columns.append(self.table.c[column]) clause = self._args_to_clause(_filter, clauses=clauses) if not len(columns): return iter([]) q = expression.select( columns, distinct=True, whereclause=clause, order_by=[c.asc() for c in columns], ) return self.db.query(q) # Legacy methods for running find queries. all = find def __iter__(self): """Return all rows of the table as simple dictionaries. Allows for iterating over all rows in the table without explicetly calling :py:meth:`find() <dataset.Table.find>`. :: for row in table: print(row) """ return self.find() def __repr__(self): """Get table representation.""" return "<Table(%s)>" % self.table.name
class FactTable(object): """ The ``FactTable`` serves as a controller object for a given ``Model``, handling the creation, filling and migration of the table schema associated with the dataset. """ def __init__(self, dataset): self.dataset = dataset self.bind = db.engine self.meta = MetaData() self.meta.bind = self.bind self._table = None @property def table(self): """ Generate an appropriate table representation to mirror the fields known for this table. """ if self._table is None: name = '%s__facts' % self.dataset.name self._table = Table(name, self.meta) id_col = Column('_id', Unicode(42), primary_key=True) self._table.append_column(id_col) json_col = Column('_json', Unicode()) self._table.append_column(json_col) self._fields_columns(self._table) return self._table @property def alias(self): """ An alias used for queries. """ if not hasattr(self, '_alias'): self._alias = self.table.alias('entry') return self._alias @property def mapping(self): if not hasattr(self, '_mapping'): self._mapping = {} for attribute in self.dataset.model.attributes: if attribute.column in self.alias.columns: col = self.alias.c[attribute.column] self._mapping[attribute.path] = col return self._mapping @property def exists(self): return db.engine.has_table(self.table.name) def _fields_columns(self, table): """ Transform the (auto-detected) fields into a set of column specifications. """ for field in self.dataset.fields: data_type = TYPES.get(field.get('type'), Unicode) col = Column(field.get('name'), data_type, nullable=True) table.append_column(col) def load_iter(self, iterable, chunk_size=1000): """ Bulk load all the data in an artifact to a matching database table. """ chunk = [] conn = self.bind.connect() tx = conn.begin() try: for i, record in enumerate(iterable): chunk.append(self._expand_record(i, record)) if len(chunk) >= chunk_size: stmt = self.table.insert() conn.execute(stmt, chunk) chunk = [] if len(chunk): stmt = self.table.insert() conn.execute(stmt, chunk) tx.commit() except: tx.rollback() raise def _expand_record(self, i, record): """ Transform an incoming record into a form that matches the fields schema. """ record['_id'] = i record['_json'] = json.dumps(record, default=json_default) return record def unpack_entry(self, row): """ Convert a database-returned row into a nested and mapped fact representation. """ row = dict(row.items()) result = {'id': row.get('_id')} for dimension in self.dataset.model.dimensions: value = {} for attr in dimension.attributes: value[attr.name] = row.get(attr.column) result[dimension.name] = value for measure in self.dataset.model.measures: result[measure.name] = row.get(measure.column) return result def create(self): """ Create the fact table if it does not exist. """ if not self.exists: self.table.create(self.bind) def drop(self): """ Drop the fact table if it does exist. """ if self.exists: self.table.drop() self._table = None def num_entries(self): """ Get the number of facts that are currently loaded. """ if not self.exists: return 0 rp = self.bind.execute(self.table.count()) return rp.fetchone()[0] def _dimension_columns(self, dimension): """ Filter the generated columns for those related to a particular dimension. """ prefix = dimension.name + '.' columns = [] for path, col in self.mapping.items(): if path.startswith(prefix): columns.append(col) return columns def num_members(self, dimension): """ Get the number of members for the given dimension. 
""" if not self.exists: return 0 q = select(self._dimension_columns(dimension), distinct=True) rp = self.bind.execute(q.alias('counted').count()) return rp.fetchone()[0] def dimension_members(self, dimension, conditions="1=1", offset=0, limit=None): selects = self._dimension_columns(dimension) order_by = [s.asc() for s in selects] for entry in self.entries(conditions=conditions, order_by=order_by, selects=selects, distinct=True, offset=offset, limit=limit): yield entry.get(dimension.name) def entries(self, conditions="1=1", order_by=None, limit=None, selects=[], distinct=False, offset=0, step=10000): """ Generate a fully denormalized view of the entries on this table. This view is nested so that each dimension will be a hash of its attributes. """ if not self.exists: return if not selects: selects = [self.alias.c._id] + self.mapping.values() # enforce stable sorting: if order_by is None: order_by = [self.alias.c._id.asc()] assert order_by is not None for i in count(): qoffset = offset + (step * i) qlimit = step if limit is not None: qlimit = min(limit - (step * i), step) if qlimit <= 0: break query = select(selects, conditions, [], order_by=order_by, distinct=distinct, limit=qlimit, offset=qoffset) rp = self.bind.execute(query) first_row = True while True: row = rp.fetchone() if row is None: if first_row: return break first_row = False yield self.unpack_entry(row) def __repr__(self): return "<FactTable(%r)>" % (self.dataset)
class SQLTable(Component): _selects = 0 _inserts = 0 _updates = 0 _finalized = False STORE_MODE_LOOKUP = "lookup" STORE_MODE_INSERT = "insert" STORE_MODE_UPSERT = "upsert" _pk = False columns = [] create = True _unicode_errors = 0 _lookup_changed_fields = None def __init__(self, name, connection, columns, label=None): super(SQLTable, self).__init__() self.sa_table = None self.sa_metadata = None self.name = name self.connection = connection self.label = label if label else name self.columns = columns or [] for col in columns: col.sqltable = self def _get_sa_type(self, column): if (column.type == "Integer"): return Integer elif (column.type == "String"): #if (column.length is None): column.length = 128 return Unicode(length=128) elif (column.type == "Float"): return Float elif (column.type == "Boolean"): return Boolean elif (column.type == "AutoIncrement"): return Integer elif (column.type == "Date"): return Date elif (column.type == "Time"): return Time elif (column.type == "DateTime"): return DateTime elif (column.type == "Binary"): return Binary else: raise Exception("Invalid data type (%s): %s" % (column, column.type)) def finalize(self, ctx): if (not SQLTable._finalized): SQLTable._finalized = True if (SQLTable._inserts + SQLTable._selects > 0): logger.info( "SQLTable Totals ins/upd/sel: %d/%d/%d " % (SQLTable._inserts, SQLTable._updates, SQLTable._selects)) if (self._inserts + self._selects > 0): logger.info( "SQLTable %-18s ins/upd/sel: %6d/%6d/%-6d " % (self.name, self._inserts, self._updates, self._selects)) if (self._unicode_errors > 0): logger.warning( "SQLTable %s found %d warnings assigning non-unicode fields to unicode columns" % (self.name, self._unicode_errors)) ctx.comp.finalize(self.connection) super(SQLTable, self).finalize(ctx) def initialize(self, ctx): super(SQLTable, self).initialize(ctx) if self._lookup_changed_fields == None: self._lookup_changed_fields = [] ctx.comp.initialize(self.connection) logger.debug("Loading table %s on %s" % (self.name, self)) self.sa_metadata = MetaData() self.sa_table = Table(self.name, self.sa_metadata) self._selects = 0 self._inserts = 0 self._updates = 0 self._unicode_errors = 0 # Drop? columns_ex = [] for column in self.columns: logger.debug("Adding column to %s: %s" % (self, column)) column.sqltable = self # Check for duplicate names if (column.name in columns_ex): raise ETLConfigurationException( "Duplicate column name '%s' in %s" % (column.name, self)) columns_ex.append(column.name) # Configure column if isinstance(column, SQLColumnFK): if column.fk_sqlcolumn.sqltable.sa_table is None: logger.warning( "Column %s foreign key %s table (%s) has not been defined in backend (ignoring).", column, column.fk_sqlcolumn, column.fk_sqlcolumn.sqltable) continue self.sa_table.append_column( Column(column.name, self._get_sa_type(column), ForeignKey( column.fk_sqlcolumn.sqltable.sa_table.columns[ column.fk_sqlcolumn.name]), primary_key=column.pk, nullable=column.nullable, autoincrement=(True if column.type == "AutoIncrement" else False))) else: self.sa_table.append_column( Column(column.name, self._get_sa_type(column), primary_key=column.pk, nullable=column.nullable, autoincrement=(True if column.type == "AutoIncrement" else False))) # Check schema: # Create if doesn't exist if (not self.connection.engine().has_table(self.name)): logger.info("Creating table %s" % self.name) self.sa_table.create(self.connection.connection()) # TODO:? Extend? (unsafe, allow read-only connections and make them default?) # TODO:? 
Delete columns (unsafe, allow read-only connections and make them default?) def pk(self, ctx): """ Returns the primary key column definitToClauion, or None if none defined. """ #if (self._pk == False): if True: pk_cols = [] for col in self.columns: if col.pk: pk_cols.append(col) if (len(pk_cols) > 1): raise Exception("Table %s has multiple primary keys: %s" % (self.name, pk_cols)) elif (len(pk_cols) == 1): self._pk = pk_cols[0] else: self._pk = None return self._pk def _attribsToClause(self, attribs): clauses = [] for k, v in attribs.items(): if isinstance(v, (list, tuple)): clauses.append(self.sa_table.c[k].in_(v)) else: clauses.append(self.sa_table.c[k] == v) return and_(*clauses) def _rowtodict(self, row): d = {} for column in self.columns: #print column d[column.name] = getattr(row, column.name) return d def _find(self, ctx, attribs): self._selects = self._selects + 1 SQLTable._selects = SQLTable._selects + 1 query = self.sa_table.select(self._attribsToClause(attribs)) rows = self.connection.connection().execute(query) for r in rows: # Ensure we return dicts, not RowProxys from SqlAlchemy yield self._rowtodict(r) def lookup(self, ctx, attribs, find_function=None): logger.debug("Lookup on '%s' attribs: %s" % (self, attribs)) if (len(attribs.keys()) == 0): raise Exception( "Cannot lookup on table '%s' with no criteria (empty attribute set)" % self.name) find_function = find_function or self._find rows = find_function(ctx, attribs) rows = list(rows) if (len(rows) > 1): raise Exception( "Found more than one row when searching for just one in table %s: %s" % (self.name, attribs)) elif (len(rows) == 1): row = rows[0] else: row = None logger.debug("Lookup result on %s: %s = %s" % (self.name, attribs, row)) return row def upsert(self, ctx, data, keys=[]): """ Upsert checks if the row exists and has changed. It does a lookup followed by an update or insert as appropriate. 
""" # TODO: Check for AutoIncrement in keys, shall not be used # If keys qfilter = {} if (len(keys) > 0): for key in keys: try: qfilter[key] = data[key] except KeyError as e: raise Exception( "Could not find attribute '%s' in data when storing row data: %s" % (key, data)) else: pk = self.pk(ctx) qfilter[pk.name] = data[pk.name] # Do lookup if len(qfilter) > 0: row = self.lookup(ctx, qfilter) if (row): # Check row is identical for c in self.columns: if c.type != "AutoIncrement": v1 = row[c.name] v2 = data[c.name] if c.type == "Date": v1 = row[c.name].strftime('%Y-%m-%d') v2 = data[c.name].strftime('%Y-%m-%d') if (isinstance(v1, str) or isinstance(v2, str)): if (not isinstance(v1, str)): v1 = str(v1) if (not isinstance(v2, str)): v2 = str(v2) if (v1 != v2): if (c.name not in self._lookup_changed_fields): logger.warning( "%s updating an entity that exists with different attributes, overwriting (field=%s, existing_value=%s, tried_value=%s)" % (self, c.name, v1, v2)) #self._lookup_changed_fields.append(c["name"]) # Update the row row = self.update(ctx, data, keys) return row row_with_id = self.insert(ctx, data) return row_with_id def _prepare_row(self, ctx, data): row = {} for column in self.columns: if column.type != "AutoIncrement": try: row[column.name] = data[column.name] except KeyError as e: raise Exception( "Missing attribute for column %s in table '%s' while inserting row: %s" % (e, self.name, data)) # Checks if (column.type == "String") and (not isinstance( row[column.name], str)): self._unicode_errors = self._unicode_errors + 1 if (ctx.debug): logger.warning( "Unicode column %r received non-unicode string: %r " % (column.name, row[column.name])) return row def insert(self, ctx, data): row = self._prepare_row(ctx, data) logger.debug("Inserting in table '%s' row: %s" % (self.name, row)) res = self.connection.connection().execute(self.sa_table.insert(row)) pk = self.pk(ctx) if pk: row[pk.name] = res.inserted_primary_key[0] self._inserts = self._inserts + 1 SQLTable._inserts = SQLTable._inserts + 1 if pk is not None: return row else: return row # None def update(self, ctx, data, keys=[]): row = self._prepare_row(ctx, data) # Automatically calculate lookup if necessary qfilter = {} if (len(keys) > 0): for key in keys: try: qfilter[key] = data[key] except KeyError as e: raise Exception( "Could not find attribute '%s' in data when storing row data: %s" % (key, data)) else: pk = self.pk(ctx) qfilter[pk.name] = data[pk.name] logger.debug("Updating in table '%s' row: %s" % (self.name, row)) res = self.connection.connection().execute( self.sa_table.update(self._attribsToClause(qfilter), row)) self._updates = self._updates + 1 SQLTable._updates = SQLTable._updates + 1 if pk is not None: return row else: return None
def create_translation_table(_table_name, foreign_class, relation_name, language_class, relation_lazy='select', **kwargs): """Creates a table that represents some kind of data attached to the given foreign class, but translated across several languages. Returns the new table's mapped class. It won't be declarative, but it will have a `__table__` attribute so you can retrieve the Table object. `foreign_class` must have a `__singlename__`, currently only used to create the name of the foreign key column. Also supports the notion of a default language, which is attached to the session. This is English by default, for historical and practical reasons. Usage looks like this: class Foo(Base): ... create_translation_table('foo_bars', Foo, 'bars', name = Column(...), ) # Now you can do the following: foo.name foo.name_map['en'] foo.foo_bars['en'] foo.name_map['en'] = "new name" del foo.name_map['en'] q.options(joinedload(Foo.bars_local)) q.options(joinedload(Foo.bars)) The following properties are added to the passed class: - `(relation_name)`, a relation to the new table. It uses a dict-based collection class, where the keys are language identifiers and the values are rows in the created tables. - `(relation_name)_local`, a relation to the row in the new table that matches the current default language. - `(relation_name)_table`, the class created by this function. Note that these are distinct relations. Even though the former necessarily includes the latter, SQLAlchemy doesn't treat them as linked; loading one will not load the other. Modifying both within the same transaction has undefined behavior. For each column provided, the following additional attributes are added to Foo: - `(column)_map`, an association proxy onto `foo_bars`. - `(column)`, an association proxy onto `foo_bars_local`. Pardon the naming disparity, but the grammar suffers otherwise. Modifying these directly is not likely to be a good idea. For Markdown-formatted columns, `(column)_map` and `(column)` will give Markdown objects. 
""" # n.b.: language_class only exists for the sake of tests, which sometimes # want to create tables entirely separate from the pokedex metadata foreign_key_name = foreign_class.__singlename__ + '_id' Translations = type(_table_name, (object,), { '_language_identifier': association_proxy('local_language', 'identifier'), 'relation_name': relation_name, '__tablename__': _table_name, }) # Create the table object table = Table(_table_name, foreign_class.__table__.metadata, Column(foreign_key_name, Integer, ForeignKey(foreign_class.id), primary_key=True, nullable=False, doc=u"ID of the %s these texts relate to" % foreign_class.__singlename__), Column('local_language_id', Integer, ForeignKey(language_class.id), primary_key=True, nullable=False, doc=u"Language these texts are in"), ) Translations.__table__ = table # Add ye columns # Column objects have a _creation_order attribute in ascending order; use # this to get the (unordered) kwargs sorted correctly kwitems = list(kwargs.items()) kwitems.sort(key=lambda kv: kv[1]._creation_order) for name, column in kwitems: column.name = name table.append_column(column) # Construct ye mapper mapper(Translations, table, properties={ 'foreign_id': synonym(foreign_key_name), 'local_language': relationship(language_class, primaryjoin=table.c.local_language_id == language_class.id, innerjoin=True), }) # Add full-table relations to the original class # Foo.bars_table setattr(foreign_class, relation_name + '_table', Translations) # Foo.bars setattr(foreign_class, relation_name, relationship(Translations, primaryjoin=foreign_class.id == Translations.foreign_id, collection_class=attribute_mapped_collection('local_language'), )) # Foo.bars_local # This is a bit clever; it uses bindparam() to make the join clause # modifiable on the fly. db sessions know the current language and # populate the bindparam. # The 'dummy' value is to trick SQLA; without it, SQLA thinks this # bindparam is just its own auto-generated clause and everything gets # f****d up. local_relation_name = relation_name + '_local' setattr(foreign_class, local_relation_name, relationship(Translations, primaryjoin=and_( Translations.foreign_id == foreign_class.id, Translations.local_language_id == bindparam('_default_language_id', value='dummy', type_=Integer, required=True), ), foreign_keys=[Translations.foreign_id, Translations.local_language_id], uselist=False, lazy=relation_lazy, )) # Add per-column proxies to the original class for name, column in kwitems: getset_factory = None string_getter = column.info.get('string_getter') if string_getter: getset_factory = _getset_factory_factory( column.name, string_getter) # Class.(column) -- accessor for the default language's value setattr(foreign_class, name, LocalAssociationProxy(local_relation_name, name, getset_factory=getset_factory)) # Class.(column)_map -- accessor for the language dict # Need a custom creator since Translations doesn't have an init, and # these are passed as *args anyway def creator(language, value): row = Translations() row.local_language = language setattr(row, name, value) return row setattr(foreign_class, name + '_map', association_proxy(relation_name, name, creator=creator, getset_factory=getset_factory)) # Add to the list of translation classes foreign_class.translation_classes.append(Translations) # Done return Translations
def process_progress_notes(table: Table, engine: Engine, progargs: Any) -> None: crate_col_max_subnum = Column(CRATE_COL_MAX_SUBNUM, Integer, nullable=True) crate_col_last_note = Column(CRATE_COL_LAST_NOTE, Integer, nullable=True) table.append_column(crate_col_max_subnum) table.append_column(crate_col_last_note) add_columns(engine, table, [crate_col_max_subnum, crate_col_last_note]) # We're always in "RiO land", not "RCEP land", for this one. add_indexes(engine, table, [ { # Joint index, for JOIN in UPDATE statement below 'index_name': CRATE_IDX_RIONUM_NOTENUM, 'column': '{rio_number}, NoteNum'.format( rio_number=CRATE_COL_RIO_NUMBER), }, { # Speeds up WHERE below. (Much, much faster for second run.) 'index_name': CRATE_IDX_MAX_SUBNUM, 'column': CRATE_COL_MAX_SUBNUM, }, { # Speeds up WHERE below. (Much, much faster for second run.) 'index_name': CRATE_IDX_LAST_NOTE, 'column': CRATE_COL_LAST_NOTE, }, ]) ensure_columns_present( engine, tablename=table.name, column_names=["NoteNum", "SubNum", "EnteredInError", "EnteredInError"]) if not progargs.print: ensure_columns_present(engine, tablename=table.name, column_names=[ CRATE_COL_MAX_SUBNUM, CRATE_COL_LAST_NOTE, CRATE_COL_RIO_NUMBER ]) # Find the maximum SubNum for each note, and store it. # Slow query, even with index. log.info("Progress notes table {}: updating {}".format( repr(table.name), repr(CRATE_COL_MAX_SUBNUM))) execute( engine, """ UPDATE p1 SET p1.{max_subnum_col} = subq.max_subnum FROM {tablename} p1 JOIN ( SELECT {rio_number}, NoteNum, MAX(SubNum) AS max_subnum FROM {tablename} p2 GROUP BY {rio_number}, NoteNum ) subq ON subq.{rio_number} = p1.{rio_number} AND subq.NoteNum = p1.NoteNum WHERE p1.{max_subnum_col} IS NULL """.format( max_subnum_col=CRATE_COL_MAX_SUBNUM, tablename=table.name, rio_number=CRATE_COL_RIO_NUMBER, )) # Set a single column accordingly log.info("Progress notes table {}: updating {}".format( repr(table.name), repr(CRATE_COL_LAST_NOTE))) execute( engine, """ UPDATE {tablename} SET {last_note_col} = CASE WHEN SubNum = {max_subnum_col} THEN 1 ELSE 0 END WHERE {last_note_col} IS NULL """.format( tablename=table.name, last_note_col=CRATE_COL_LAST_NOTE, max_subnum_col=CRATE_COL_MAX_SUBNUM, )) # Create a view, if we're on an RCEP database if progargs.rcep and progargs.cpft: select_sql = """ SELECT * FROM {tablename} WHERE (EnteredInError <> 1 OR EnteredInError IS NULL) AND {last_note_col} = 1 """.format( tablename=table.name, last_note_col=CRATE_COL_LAST_NOTE, ) create_view(engine, VIEW_RCEP_CPFT_PROGRESS_NOTES_CURRENT, select_sql)
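# --- Illustrative sketch (not part of the original module). ---
# What the first UPDATE above roughly looks like once the format() fields are
# filled in. The table and column names (Progress_Notes, crate_max_subnum,
# crate_rio_number) are placeholders rather than the real CRATE constants;
# the "UPDATE alias ... FROM table alias JOIN ..." form is SQL Server syntax.
rendered_update = """
    UPDATE p1
    SET p1.crate_max_subnum = subq.max_subnum
    FROM Progress_Notes p1 JOIN (
        SELECT crate_rio_number, NoteNum, MAX(SubNum) AS max_subnum
        FROM Progress_Notes p2
        GROUP BY crate_rio_number, NoteNum
    ) subq
        ON subq.crate_rio_number = p1.crate_rio_number
        AND subq.NoteNum = p1.NoteNum
    WHERE p1.crate_max_subnum IS NULL
"""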
def create_translation_table(_table_name, foreign_class, relation_name, language_class, relation_lazy='select', **kwargs): """Creates a table that represents some kind of data attached to the given foreign class, but translated across several languages. Returns the new table's mapped class. It won't be declarative, but it will have a `__table__` attribute so you can retrieve the Table object. `foreign_class` must have a `__singlename__`, currently only used to create the name of the foreign key column. Also supports the notion of a default language, which is attached to the session. This is English by default, for historical and practical reasons. Usage looks like this: class Foo(Base): ... create_translation_table('foo_bars', Foo, 'bars', name = Column(...), ) # Now you can do the following: foo.name foo.name_map['en'] foo.foo_bars['en'] foo.name_map['en'] = "new name" del foo.name_map['en'] q.options(joinedload(Foo.bars_local)) q.options(joinedload(Foo.bars)) The following properties are added to the passed class: - `(relation_name)`, a relation to the new table. It uses a dict-based collection class, where the keys are language identifiers and the values are rows in the created tables. - `(relation_name)_local`, a relation to the row in the new table that matches the current default language. - `(relation_name)_table`, the class created by this function. Note that these are distinct relations. Even though the former necessarily includes the latter, SQLAlchemy doesn't treat them as linked; loading one will not load the other. Modifying both within the same transaction has undefined behavior. For each column provided, the following additional attributes are added to Foo: - `(column)_map`, an association proxy onto `foo_bars`. - `(column)`, an association proxy onto `foo_bars_local`. Pardon the naming disparity, but the grammar suffers otherwise. Modifying these directly is not likely to be a good idea. For Markdown-formatted columns, `(column)_map` and `(column)` will give Markdown objects. 
""" # n.b.: language_class only exists for the sake of tests, which sometimes # want to create tables entirely separate from the pokedex metadata foreign_key_name = foreign_class.__singlename__ + '_id' Translations = type(_table_name, (object,), { '_language_identifier': association_proxy('local_language', 'identifier'), 'relation_name': relation_name, }) # Create the table object table = Table(_table_name, foreign_class.__table__.metadata, Column(foreign_key_name, Integer, ForeignKey(foreign_class.id), primary_key=True, nullable=False, info=dict(description="ID of the %s these texts relate to" % foreign_class.__singlename__)), Column('local_language_id', Integer, ForeignKey(language_class.id), primary_key=True, nullable=False, info=dict(description="Language these texts are in")), ) Translations.__table__ = table # Add ye columns # Column objects have a _creation_order attribute in ascending order; use # this to get the (unordered) kwargs sorted correctly kwitems = kwargs.items() kwitems.sort(key=lambda kv: kv[1]._creation_order) for name, column in kwitems: column.name = name table.append_column(column) # Construct ye mapper mapper(Translations, table, properties={ 'foreign_id': synonym(foreign_key_name), 'local_language': relationship(language_class, primaryjoin=table.c.local_language_id == language_class.id, innerjoin=True), }) # Add full-table relations to the original class # Foo.bars_table setattr(foreign_class, relation_name + '_table', Translations) # Foo.bars setattr(foreign_class, relation_name, relationship(Translations, primaryjoin=foreign_class.id == Translations.foreign_id, collection_class=attribute_mapped_collection('local_language'), )) # Foo.bars_local # This is a bit clever; it uses bindparam() to make the join clause # modifiable on the fly. db sessions know the current language and # populate the bindparam. # The 'dummy' value is to trick SQLA; without it, SQLA thinks this # bindparam is just its own auto-generated clause and everything gets # f****d up. local_relation_name = relation_name + '_local' setattr(foreign_class, local_relation_name, relationship(Translations, primaryjoin=and_( Translations.foreign_id == foreign_class.id, Translations.local_language_id == bindparam('_default_language_id', value='dummy', type_=Integer, required=True), ), foreign_keys=[Translations.foreign_id, Translations.local_language_id], uselist=False, #innerjoin=True, lazy=relation_lazy, )) # Add per-column proxies to the original class for name, column in kwitems: getset_factory = None string_getter = column.info.get('string_getter') if string_getter: getset_factory = _getset_factory_factory( column.name, string_getter) # Class.(column) -- accessor for the default language's value setattr(foreign_class, name, LocalAssociationProxy(local_relation_name, name, getset_factory=getset_factory)) # Class.(column)_map -- accessor for the language dict # Need a custom creator since Translations doesn't have an init, and # these are passed as *args anyway def creator(language, value): row = Translations() row.local_language = language setattr(row, name, value) return row setattr(foreign_class, name + '_map', association_proxy(relation_name, name, creator=creator, getset_factory=getset_factory)) # Add to the list of translation classes foreign_class.translation_classes.append(Translations) # Done return Translations
def process_patient_table(table: Table, engine: Engine, progargs: Any) -> None: log.info("Preprocessing patient table: {}".format(repr(table.name))) rio_type = table_is_rio_type(table.name, progargs) if rio_type: pk_col = get_effective_int_pk_col(table) rio_pk = pk_col if pk_col != CRATE_COL_PK else None string_pt_id = get_rio_patient_id_col(table) required_cols = [string_pt_id] else: # RCEP type rio_pk = None required_cols = [RCEP_COL_PATIENT_ID] string_pt_id = RCEP_COL_PATIENT_ID if not progargs.print: required_cols.extend([CRATE_COL_PK, CRATE_COL_RIO_NUMBER]) # ------------------------------------------------------------------------- # Add pk and rio_number columns, if not present # ------------------------------------------------------------------------- if rio_type and rio_pk is not None: crate_pk_col = Column(CRATE_COL_PK, BigInteger, nullable=True) # ... can't do NOT NULL; need to populate it required_cols.append(rio_pk) else: # RCEP type, or no PK in RiO crate_pk_col = make_bigint_autoincrement_column( CRATE_COL_PK, engine.dialect) # ... autopopulates crate_rio_number_col = Column(CRATE_COL_RIO_NUMBER, BigInteger, nullable=True) # ... even if RiO numbers are INT, they come from VARCHAR(15) here, and # that can (aod does) look numeric and overflow an INT. # SQL Server requires Table-bound columns in order to generate DDL: table.append_column(crate_pk_col) table.append_column(crate_rio_number_col) add_columns(engine, table, [crate_pk_col, crate_rio_number_col]) # ------------------------------------------------------------------------- # Update pk and rio_number values, if not NULL # ------------------------------------------------------------------------- ensure_columns_present(engine, tablename=table.name, column_names=required_cols) cast_id_to_int = sql_fragment_cast_to_int(string_pt_id, dialect=engine.dialect) if rio_type and rio_pk: log.info("Table {}: updating columns {} and {}".format( repr(table.name), repr(CRATE_COL_PK), repr(CRATE_COL_RIO_NUMBER))) execute( engine, """ UPDATE {tablename} SET {crate_pk} = {rio_pk}, {crate_rio_number} = {cast_id_to_int} WHERE {crate_pk} IS NULL OR {crate_rio_number} IS NULL """.format( tablename=table.name, crate_pk=CRATE_COL_PK, rio_pk=rio_pk, crate_rio_number=CRATE_COL_RIO_NUMBER, cast_id_to_int=cast_id_to_int, )) else: # RCEP format, or RiO with no PK # crate_pk is autogenerated as an INT IDENTITY field log.info("Table {}: updating column {}".format( repr(table.name), repr(CRATE_COL_RIO_NUMBER))) execute( engine, """ UPDATE {tablename} SET {crate_rio_number} = {cast_id_to_int} WHERE {crate_rio_number} IS NULL """.format( # noqa tablename=table.name, crate_rio_number=CRATE_COL_RIO_NUMBER, cast_id_to_int=cast_id_to_int, )) # ------------------------------------------------------------------------- # Add indexes, if absent # ------------------------------------------------------------------------- # Note that the indexes are unlikely to speed up the WHERE NOT NULL search # above, so it doesn't matter that we add these last. Their use is for # the subsequent CRATE anonymisation table scans. add_indexes(engine, table, [ { 'index_name': CRATE_IDX_PK, 'column': CRATE_COL_PK, 'unique': True, }, { 'index_name': CRATE_IDX_RIONUM, 'column': CRATE_COL_RIO_NUMBER, }, ])
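# --- Illustrative sketch (not part of the original module). ---
# The first UPDATE above, roughly as it would render for a RiO table that
# already has an integer primary key. All names (Client, ClientSequenceID,
# ClientID, crate_pk, crate_rio_number) and the CAST fragment are
# placeholders; the real values come from the CRATE constants and from
# sql_fragment_cast_to_int() for the dialect in use.
rendered_update = """
    UPDATE Client
    SET crate_pk = ClientSequenceID,
        crate_rio_number = CAST(ClientID AS BIGINT)
    WHERE crate_pk IS NULL OR crate_rio_number IS NULL
"""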
def process_clindocs_table(table: Table, engine: Engine, progargs: Any) -> None: # For RiO only, not RCEP crate_col_max_docver = Column(CRATE_COL_MAX_DOCVER, Integer, nullable=True) crate_col_last_doc = Column(CRATE_COL_LAST_DOC, Integer, nullable=True) table.append_column(crate_col_max_docver) table.append_column(crate_col_last_doc) add_columns(engine, table, [crate_col_max_docver, crate_col_last_doc]) add_indexes(engine, table, [ { 'index_name': CRATE_IDX_RIONUM_SERIALNUM, 'column': '{rio_number}, SerialNumber'.format( rio_number=CRATE_COL_RIO_NUMBER), }, { 'index_name': CRATE_IDX_MAX_DOCVER, 'column': CRATE_COL_MAX_DOCVER, }, { 'index_name': CRATE_IDX_LAST_DOC, 'column': CRATE_COL_LAST_DOC, }, ]) required_cols = ["SerialNumber", "RevisionID"] if not progargs.print: required_cols.extend( [CRATE_COL_MAX_DOCVER, CRATE_COL_LAST_DOC, CRATE_COL_RIO_NUMBER]) ensure_columns_present(engine, tablename=table.name, column_names=required_cols) # Find the maximum SerialNumber for each note, and store it. # Slow query, even with index. log.info("Clinical documents table {}: updating {}".format( repr(table.name), repr(CRATE_COL_MAX_DOCVER))) execute( engine, """ UPDATE p1 SET p1.{max_docver_col} = subq.max_docver FROM {tablename} p1 JOIN ( SELECT {rio_number}, SerialNumber, MAX(RevisionID) AS max_docver FROM {tablename} p2 GROUP BY {rio_number}, SerialNumber ) subq ON subq.{rio_number} = p1.{rio_number} AND subq.SerialNumber = p1.SerialNumber WHERE p1.{max_docver_col} IS NULL """.format( max_docver_col=CRATE_COL_MAX_DOCVER, tablename=table.name, rio_number=CRATE_COL_RIO_NUMBER, )) # Set a single column accordingly log.info("Clinical documents table {}: updating {}".format( repr(table.name), repr(CRATE_COL_LAST_DOC))) execute( engine, """ UPDATE {tablename} SET {last_doc_col} = CASE WHEN RevisionID = {max_docver_col} THEN 1 ELSE 0 END WHERE {last_doc_col} IS NULL """.format( tablename=table.name, last_doc_col=CRATE_COL_LAST_DOC, max_docver_col=CRATE_COL_MAX_DOCVER, ))
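# --- Illustrative sketch (not part of the original module). ---
# The second UPDATE above, roughly as rendered: a document version is flagged
# as the latest one exactly when its RevisionID equals the per-document
# maximum computed by the previous statement. Table and column names
# (Clinical_Documents, crate_last_doc, crate_max_docver) are placeholders.
rendered_update = """
    UPDATE Clinical_Documents
    SET crate_last_doc =
        CASE WHEN RevisionID = crate_max_docver THEN 1 ELSE 0 END
    WHERE crate_last_doc IS NULL
"""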
class Table(object): """Represents a table in a database and exposes common operations.""" PRIMARY_DEFAULT = 'id' def __init__(self, database, table_name, primary_id=None, primary_type=None, auto_create=False): """Initialise the table from database schema.""" self.db = database self.name = normalize_table_name(table_name) self._table = None self._indexes = [] self._primary_id = primary_id self._primary_type = primary_type self._auto_create = auto_create @property def exists(self): """Check to see if the table currently exists in the database.""" if self._table is not None: return True return self.name in self.db @property def table(self): """Get a reference to the table, which may be reflected or created.""" if self._table is None: self._sync_table(()) return self._table @property def columns(self): """Get a listing of all columns that exist in the table.""" if not self.exists: return [] return self.table.columns.keys() def has_column(self, column): """Check if a column with the given name exists on this table.""" return normalize_column_name(column) in self.columns def insert(self, row, ensure=None, types=None): """Add a ``row`` dict by inserting it into the table. If ``ensure`` is set, any of the keys of the row are not table columns, they will be created automatically. During column creation, ``types`` will be checked for a key matching the name of a column to be created, and the given SQLAlchemy column type will be used. Otherwise, the type is guessed from the row value, defaulting to a simple unicode field. :: data = dict(title='I am a banana!') table.insert(data) Returns the inserted row's primary key. """ row = self._sync_columns(row, ensure, types=types) res = self.db.executable.execute(self.table.insert(row)) if len(res.inserted_primary_key) > 0: return res.inserted_primary_key[0] return True def insert_ignore(self, row, keys, ensure=None, types=None): """Add a ``row`` dict into the table if the row does not exist. If rows with matching ``keys`` exist they will be added to the table. Setting ``ensure`` results in automatically creating missing columns, i.e., keys of the row are not table columns. During column creation, ``types`` will be checked for a key matching the name of a column to be created, and the given SQLAlchemy column type will be used. Otherwise, the type is guessed from the row value, defaulting to a simple unicode field. :: data = dict(id=10, title='I am a banana!') table.insert_ignore(data, ['id']) """ row = self._sync_columns(row, ensure, types=types) if self._check_ensure(ensure): self.create_index(keys) args, _ = self._keys_to_args(row, keys) if self.count(**args) == 0: return self.insert(row, ensure=False) return False def insert_many(self, rows, chunk_size=1000, ensure=None, types=None): """Add many rows at a time. This is significantly faster than adding them one by one. Per default the rows are processed in chunks of 1000 per commit, unless you specify a different ``chunk_size``. See :py:meth:`insert() <dataset.Table.insert>` for details on the other parameters. :: rows = [dict(name='Dolly')] * 10000 table.insert_many(rows) """ chunk = [] for row in rows: row = self._sync_columns(row, ensure, types=types) chunk.append(row) if len(chunk) == chunk_size: self.table.insert().execute(chunk) chunk = [] if len(chunk): self.table.insert().execute(chunk) def update(self, row, keys, ensure=None, types=None, return_count=False): """Update a row in the table. 
The update is managed via the set of column names stated in ``keys``: they will be used as filters for the data to be updated, using the values in ``row``. :: # update all entries with id matching 10, setting their title columns data = dict(id=10, title='I am a banana!') table.update(data, ['id']) If keys in ``row`` update columns not present in the table, they will be created based on the settings of ``ensure`` and ``types``, matching the behavior of :py:meth:`insert() <dataset.Table.insert>`. """ row = self._sync_columns(row, ensure, types=types) args, row = self._keys_to_args(row, keys) clause = self._args_to_clause(args) if not len(row): return self.count(clause) stmt = self.table.update(whereclause=clause, values=row) rp = self.db.executable.execute(stmt) if rp.supports_sane_rowcount(): return rp.rowcount if return_count: return self.count(clause) def upsert(self, row, keys, ensure=None, types=None): """An UPSERT is a smart combination of insert and update. If rows with matching ``keys`` exist they will be updated, otherwise a new row is inserted in the table. :: data = dict(id=10, title='I am a banana!') table.upsert(data, ['id']) """ row = self._sync_columns(row, ensure, types=types) if self._check_ensure(ensure): self.create_index(keys) row_count = self.update(row, keys, ensure=False, return_count=True) if row_count == 0: return self.insert(row, ensure=False) return True def delete(self, *clauses, **filters): """Delete rows from the table. Keyword arguments can be used to add column-based filters. The filter criterion will always be equality: :: table.delete(place='Berlin') If no arguments are given, all records are deleted. """ if not self.exists: return False clause = self._args_to_clause(filters, clauses=clauses) stmt = self.table.delete(whereclause=clause) rp = self.db.executable.execute(stmt) return rp.rowcount > 0 def _reflect_table(self): """Load the tables definition from the database.""" with self.db.lock: try: self._table = SQLATable(self.name, self.db.metadata, schema=self.db.schema, autoload=True) except NoSuchTableError: pass def _threading_warn(self): if self.db.in_transaction and threading.active_count() > 1: warnings.warn( "Changing the database schema inside a transaction " "in a multi-threaded environment is likely to lead " "to race conditions and synchronization issues.", RuntimeWarning) def _sync_table(self, columns): """Lazy load, create or adapt the table structure in the database.""" if self._table is None: # Load an existing table from the database. self._reflect_table() if self._table is None: # Create the table with an initial set of columns. if not self._auto_create: raise DatasetException("Table does not exist: %s" % self.name) # Keep the lock scope small because this is run very often. with self.db.lock: self._threading_warn() self._table = SQLATable(self.name, self.db.metadata, schema=self.db.schema) if self._primary_id is not False: # This can go wrong on DBMS like MySQL and SQLite where # tables cannot have no columns. 
primary_id = self._primary_id or self.PRIMARY_DEFAULT primary_type = self._primary_type or Types.integer increment = primary_type in [Types.integer, Types.bigint] column = Column(primary_id, primary_type, primary_key=True, autoincrement=increment) self._table.append_column(column) for column in columns: if not column.name == self._primary_id: self._table.append_column(column) self._table.create(self.db.executable, checkfirst=True) elif len(columns): with self.db.lock: self._threading_warn() for column in columns: if not self.has_column(column.name): self.db.op.add_column(self.name, column, self.db.schema) self._reflect_table() def _sync_columns(self, row, ensure, types=None): """Create missing columns (or the table) prior to writes. If automatic schema generation is disabled (``ensure`` is ``False``), this will remove any keys from the ``row`` for which there is no matching column. """ columns = self.columns ensure = self._check_ensure(ensure) types = types or {} types = {normalize_column_name(k): v for (k, v) in types.items()} out = {} sync_columns = [] for name, value in row.items(): name = normalize_column_name(name) if ensure and name not in columns: _type = types.get(name) if _type is None: _type = self.db.types.guess(value) sync_columns.append(Column(name, _type)) columns.append(name) if name in columns: out[name] = value self._sync_table(sync_columns) return out def _check_ensure(self, ensure): if ensure is None: return self.db.ensure_schema return ensure def _args_to_clause(self, args, clauses=()): clauses = list(clauses) for column, value in args.items(): if not self.has_column(column): clauses.append(false()) elif isinstance(value, (list, tuple)): clauses.append(self.table.c[column].in_(value)) else: clauses.append(self.table.c[column] == value) return and_(*clauses) def _args_to_order_by(self, order_by): orderings = [] for ordering in ensure_tuple(order_by): if ordering is None: continue column = ordering.lstrip('-') if column not in self.table.columns: continue if ordering.startswith('-'): orderings.append(self.table.c[column].desc()) else: orderings.append(self.table.c[column].asc()) return orderings def _keys_to_args(self, row, keys): keys = ensure_tuple(keys) keys = [normalize_column_name(k) for k in keys] # keys = [self.has_column(k) for k in keys] row = row.copy() args = {k: row.pop(k) for k in keys if k in row} return args, row def create_column(self, name, type): """Create a new column ``name`` of a specified type. :: table.create_column('created_at', db.types.datetime) """ name = normalize_column_name(name) if self.has_column(name): log.debug("Column exists: %s" % name) return self._sync_table((Column(name, type), )) def create_column_by_example(self, name, value): """ Explicitly create a new column ``name`` with a type that is appropriate to store the given example ``value``. The type is guessed in the same way as for the insert method with ``ensure=True``. :: table.create_column_by_example('length', 4.2) If a column of the same name already exists, no action is taken, even if it is not of the type we would have created. """ type_ = self.db.types.guess(value) self.create_column(name, type_) def drop_column(self, name): """Drop the column ``name``. 
:: table.drop_column('created_at') """ if self.db.engine.dialect.name == 'sqlite': raise RuntimeError("SQLite does not support dropping columns.") name = normalize_column_name(name) with self.db.lock: if not self.exists or not self.has_column(name): log.debug("Column does not exist: %s", name) return self._threading_warn() self.db.op.drop_column(self.table.name, name, self.table.schema) self._reflect_table() def drop(self): """Drop the table from the database. Deletes both the schema and all the contents within it. """ with self.db.lock: if self.exists: self._threading_warn() self.table.drop(self.db.executable, checkfirst=True) self._table = None def has_index(self, columns): """Check if an index exists to cover the given ``columns``.""" if not self.exists: return False columns = set([normalize_column_name(c) for c in columns]) if columns in self._indexes: return True for column in columns: if not self.has_column(column): return False indexes = self.db.inspect.get_indexes(self.name, schema=self.db.schema) for index in indexes: if columns == set(index.get('column_names', [])): self._indexes.append(columns) return True return False def create_index(self, columns, name=None, **kw): """Create an index to speed up queries on a table. If no ``name`` is given a random name is created. :: table.create_index(['name', 'country']) """ columns = [normalize_column_name(c) for c in ensure_tuple(columns)] with self.db.lock: if not self.exists: raise DatasetException("Table has not been created yet.") for column in columns: if not self.has_column(column): return if not self.has_index(columns): self._threading_warn() name = name or index_name(self.name, columns) columns = [self.table.c[c] for c in columns] idx = Index(name, *columns, **kw) idx.create(self.db.executable) def find(self, *_clauses, **kwargs): """Perform a simple search on the table. Simply pass keyword arguments as ``filter``. :: results = table.find(country='France') results = table.find(country='France', year=1980) Using ``_limit``:: # just return the first 10 rows results = table.find(country='France', _limit=10) You can sort the results by single or multiple columns. Append a minus sign to the column name for descending order:: # sort results by a column 'year' results = table.find(country='France', order_by='year') # return all rows sorted by multiple columns (descending by year) results = table.find(order_by=['country', '-year']) To perform complex queries with advanced filters or to perform aggregation, use :py:meth:`db.query() <dataset.Database.query>` instead. """ if not self.exists: return [] _limit = kwargs.pop('_limit', None) _offset = kwargs.pop('_offset', 0) order_by = kwargs.pop('order_by', None) _streamed = kwargs.pop('_streamed', False) _step = kwargs.pop('_step', QUERY_STEP) if _step is False or _step == 0: _step = None order_by = self._args_to_order_by(order_by) args = self._args_to_clause(kwargs, clauses=_clauses) query = self.table.select(whereclause=args, limit=_limit, offset=_offset) if len(order_by): query = query.order_by(*order_by) conn = self.db.executable if _streamed: conn = self.db.engine.connect() conn = conn.execution_options(stream_results=True) return ResultIter(conn.execute(query), row_type=self.db.row_type, step=_step) def find_one(self, *args, **kwargs): """Get a single result from the table. Works just like :py:meth:`find() <dataset.Table.find>` but returns one result, or ``None``. 
:: row = table.find_one(country='United States') """ if not self.exists: return None kwargs['_limit'] = 1 kwargs['_step'] = None resiter = self.find(*args, **kwargs) try: for row in resiter: return row finally: resiter.close() def count(self, *_clauses, **kwargs): """Return the count of results for the given filter set.""" # NOTE: this does not have support for limit and offset since I can't # see how this is useful. Still, there might be compatibility issues # with people using these flags. Let's see how it goes. if not self.exists: return 0 args = self._args_to_clause(kwargs, clauses=_clauses) query = select([func.count()], whereclause=args) query = query.select_from(self.table) rp = self.db.executable.execute(query) return rp.fetchone()[0] def __len__(self): """Return the number of rows in the table.""" return self.count() def distinct(self, *args, **_filter): """Return all the unique (distinct) values for the given ``columns``. :: # returns only one row per year, ignoring the rest table.distinct('year') # works with multiple columns, too table.distinct('year', 'country') # you can also combine this with a filter table.distinct('year', country='China') """ if not self.exists: return [] filters = [] for column, value in _filter.items(): if not self.has_column(column): raise DatasetException("No such column: %s" % column) filters.append(self.table.c[column] == value) columns = [] for column in args: if isinstance(column, ClauseElement): filters.append(column) else: if not self.has_column(column): raise DatasetException("No such column: %s" % column) columns.append(self.table.c[column]) if not len(columns): return [] q = expression.select(columns, distinct=True, whereclause=and_(*filters), order_by=[c.asc() for c in columns]) return self.db.query(q) # Legacy methods for running find queries. all = find def __iter__(self): """Return all rows of the table as simple dictionaries. Allows for iterating over all rows in the table without explicitly calling :py:meth:`find() <dataset.Table.find>`. :: for row in table: print(row) """ return self.find() def __repr__(self): """Get table representation.""" return '<Table(%s)>' % self.table.name
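# --- Usage sketch (illustrative; not part of the class above) ----------------
# A minimal example of how the Table helpers above are typically combined,
# assuming the class is exposed through a ``dataset``-style ``connect()``
# wrapper as its docstrings suggest. The connection URL, table name and
# column names below are hypothetical.
import dataset

db = dataset.connect('sqlite:///:memory:')
people = db['people']  # table is created lazily on first write

people.insert(dict(name='Ada', country='UK', year=1815))
people.upsert(dict(name='Ada', country='UK', year=1815), ['name'])  # update-or-insert keyed on 'name'
people.update(dict(name='Ada', year=1816), ['name'])                # 'name' filters, 'year' is updated

ada = people.find_one(name='Ada')                   # a single row dict, or None
recent = people.find(country='UK', order_by='-year', _limit=10)
years = people.distinct('year', country='UK')
people.create_index(['name', 'country'])            # no-op if a covering index already exists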
class SQLTable(Component): _selects = 0 _inserts = 0 _updates = 0 _finalized = False STORE_MODE_LOOKUP = "lookup" STORE_MODE_INSERT = "insert" STORE_MODE_UPSERT = "upsert" _pk = False name = None connection = None columns = [] create = True sa_table = None sa_metadata = None _unicode_errors = 0 _lookup_changed_fields = None def __init__(self): super(SQLTable, self).__init__() self.columns = [] def _get_sa_type(self, column): if (column["type"] == "Integer"): return Integer elif (column["type"] == "String"): if (not "length" in column): column["length"] = 128 return Unicode(length = column["length"]) elif (column["type"] == "Float"): return Float elif (column["type"] == "Boolean"): return Boolean elif (column["type"] == "AutoIncrement"): return Integer elif (column["type"] == "Date"): return Date elif (column["type"] == "Time"): return Time elif (column["type"] == "DateTime"): return DateTime else: raise Exception("Invalid data type: %s" % column["type"]) def finalize(self, ctx): if (not SQLTable._finalized): SQLTable._finalized = True if (SQLTable._inserts + SQLTable._selects > 0): logger.info("SQLTable Totals ins/upd/sel: %d/%d/%d " % (SQLTable._inserts, SQLTable._updates, SQLTable._selects)) if (self._inserts + self._selects > 0): logger.info("SQLTable %-18s ins/upd/sel: %6d/%6d/%-6d " % (self.name, self._inserts, self._updates, self._selects)) if (self._unicode_errors > 0): logger.warn("SQLTable %s found %d warnings assigning non-unicode fields to unicode columns" % (self.name, self._unicode_errors)) ctx.comp.finalize(self.connection) super(SQLTable, self).finalize(ctx) def initialize(self, ctx): super(SQLTable, self).initialize(ctx) if self._lookup_changed_fields is None: self._lookup_changed_fields = [] ctx.comp.initialize(self.connection) logger.debug("Loading table %s on %s" % (self.name, self)) self.sa_metadata = MetaData() self.sa_table = Table(self.name, self.sa_metadata) # Drop? columns_ex = [] for column in self.columns: logger.debug("Adding column to %s: %s" % (self, column)) # Check for duplicate names if (column["name"] in columns_ex): raise Exception("Duplicate column name '%s' in %s" % (column["name"], self)) columns_ex.append(column["name"]) # Configure column column["pk"] = False if (not "pk" in column) else parsebool(column["pk"]) if (not "type" in column): column["type"] = "String" #if (not "value" in column): column["value"] = None self.sa_table.append_column( Column(column["name"], self._get_sa_type(column), primary_key = column["pk"], autoincrement = (True if column["type"] == "AutoIncrement" else False) )) # Check schema # Create if doesn't exist if (not self.connection.engine().has_table(self.name)): logger.info("Creating table %s" % self.name) self.sa_table.create(self.connection.connection()) # Extend? # Delete columns? def pk(self, ctx): """ Returns the primary key column definition, or None if none defined. 
""" if (self._pk == False): pk_cols = [] for col in self.columns: if ("pk" in col): if parsebool(col["pk"]): pk_cols.append(col) if (len(pk_cols) > 1): raise Exception("Table %s has multiple primary keys: %s" % (self.name, pk_cols)) elif (len(pk_cols) == 1): self._pk = pk_cols[0] else: self._pk = None return self._pk def _attribsToClause(self, attribs): clauses = [] for k, v in attribs.items(): if isinstance(v, (list, tuple)): clauses.append(self.sa_table.c[k].in_(v)) else: clauses.append(self.sa_table.c[k] == v) return and_(*clauses) def _rowtodict(self, row): d = {} for column in self.columns: #print column d[column["name"]] = getattr(row, column["name"]) return d def _find(self, ctx, attribs): self._selects = self._selects + 1 SQLTable._selects = SQLTable._selects + 1 query = self.sa_table.select(self._attribsToClause(attribs)) rows = self.connection.connection().execute(query) for r in rows: # Ensure we return dicts, not RowProxys from SqlAlchemy yield self._rowtodict(r) def lookup(self, ctx, attribs): logger.debug ("Lookup on '%s' attribs: %s" % (self, attribs)) if (len(attribs.keys()) == 0): raise Exception("Cannot lookup on table '%s' with no criteria (empty attribute set)" % self.name) rows = self._find(ctx, attribs) rows = list(rows) if (len(rows) > 1): raise Exception("Found more than one row when searching for just one in table %s: %s" % (self.name, attribs)) elif (len(rows) == 1): row = rows[0] else: row = None logger.debug("Lookup result on %s: %s = %s" % (self.name, attribs, row)) return row def upsert(self, ctx, data, keys = []): """ Upsert checks if the row exists and has changed. It does a lookup followe by an update or insert as appropriate. """ # TODO: Check for AutoIncrement in keys, shall not be used # If keys qfilter = {} if (len(keys) > 0): for key in keys: try: qfilter[key] = data[key] except KeyError as e: raise Exception("Could not find attribute '%s' in data when storing row data: %s" % (key, data)) else: pk = self.pk(ctx) qfilter[pk["name"]] = data[pk["name"]] # Do lookup if len(qfilter) > 0: row = self.lookup(ctx, qfilter) if (row): # Check row is identical for c in self.columns: if c["type"] != "AutoIncrement": v1 = row[c['name']] v2 = data[c['name']] if c["type"] == "Date": v1 = row[c['name']].strftime('%Y-%m-%d') v2 = data[c['name']].strftime('%Y-%m-%d') if (isinstance(v1, basestring) or isinstance(v2, basestring)): if (not isinstance(v1, basestring)): v1 = str(v1) if (not isinstance(v2, basestring)): v2 = str(v2) if (v1 != v2): if (c["name"] not in self._lookup_changed_fields): logger.warn("%s updating an entity that exists with different attributes, overwriting (field=%s, existing_value=%s, tried_value=%s)" % (self, c["name"], v1, v2)) #self._lookup_changed_fields.append(c["name"]) # Update the row row = self.update(ctx, data, keys) return row row_with_id = self.insert(ctx, data) return row_with_id def _prepare_row(self, ctx, data): row = {} for column in self.columns: if (column["type"] != "AutoIncrement"): try: row[column["name"]] = data[column["name"]] except KeyError, e: raise Exception("Missing attribute for column %s in table '%s' while inserting row: %s" % (e, self.name, data)) # Checks if ((column["type"] == "String") and (not isinstance(row[column["name"]], unicode))): self._unicode_errors = self._unicode_errors + 1 if (ctx.debug): logger.warn("Unicode column %r received non-unicode string: %r " % (column["name"], row[column["name"]])) return row