def downgrade(migrate_engine):
    meta.bind = migrate_engine

    keys = Enum(name='key', metadata=meta, *ZONE_ATTRIBUTE_KEYS)
    types = Enum(name='types', metadata=meta, *ZONE_TYPES)

    domains_attributes_table = Table('domain_attributes', meta, autoload=True)
    domains_table = Table('domains', meta, autoload=True)

    domains = select(columns=[domains_table.c.id, domains_table.c.type])\
        .where(domains_table.c.type == 'SECONDARY')\
        .execute().fetchall()

    for dom in domains:
        # Compare against the column (domains_table.c.id), not the Table
        # attribute, so the WHERE clause is actually applied.
        delete = domains_table.delete()\
            .where(domains_table.c.id == dom.id)
        delete.execute()

    domains_table.c.type.drop()
    domains_table.c.transferred_at.drop()

    domains_attributes_table.drop()
    keys.drop()
    types.drop()

    dialect = migrate_engine.url.get_dialect().name
    if dialect.startswith('sqlite'):
        constraint = UniqueConstraint(
            'name', 'deleted', name='unique_domain_name', table=domains_table)

        # Add missing unique index
        constraint.create()
def downgrade(migrate_engine):
    meta.bind = migrate_engine

    # Load the pool_attributes and pool_ns_records table schema
    pool_attributes_table = Table('pool_attributes', meta, autoload=True)
    pool_ns_records_table = Table('pool_ns_records', meta, autoload=True)

    # Find the nameservers for the default_pool_id
    # (filter on pool_ns_records.pool_id; filtering on pool_attributes here
    # would produce an unintended cross join)
    pool_ns_records = select(
        columns=[
            pool_ns_records_table.c.id,
            pool_ns_records_table.c.created_at,
            pool_ns_records_table.c.updated_at,
            pool_ns_records_table.c.version,
            pool_ns_records_table.c.hostname,
        ]
    ).where(pool_ns_records_table.c.pool_id == default_pool_id)\
        .execute().fetchall()

    # Create matching entries in the new table.
    for pool_ns_record in pool_ns_records:
        pool_attributes_table.insert().execute(
            id=pool_ns_record.id,
            created_at=pool_ns_record.created_at,
            updated_at=pool_ns_record.updated_at,
            version=pool_ns_record.version,
            key='name_server',
            value=pool_ns_record.hostname,
        )

    # Delete the pool_ns_records table from the DB
    pool_ns_records_table.drop()
def test_insert_table(engine_testaccount):
    metadata = MetaData()
    users = Table('users', metadata,
                  Column('id', Integer, Sequence('user_id_seq'),
                         primary_key=True),
                  Column('name', String),
                  Column('fullname', String),
                  )
    metadata.create_all(engine_testaccount)

    data = [{
        'id': 1,
        'name': 'testname1',
        'fullname': 'fulltestname1',
    }, {
        'id': 2,
        'name': 'testname2',
        'fullname': 'fulltestname2',
    }]
    conn = engine_testaccount.connect()
    try:
        # using multivalue insert
        conn.execute(users.insert(data))
        results = conn.execute(select([users]).order_by('id'))
        row = results.fetchone()
        assert row['name'] == 'testname1'
    finally:
        conn.close()
        users.drop(engine_testaccount)
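# Note (addition, not from the original test): passing the row list
# positionally to Table.insert() is a legacy SQLAlchemy 1.x calling style that
# was removed in SQLAlchemy 2.0. A minimal sketch of the equivalent
# "executemany" form, assuming the same `users` table and `data` list defined
# above are in scope:
def test_insert_table_executemany(engine_testaccount, users, data):
    conn = engine_testaccount.connect()
    try:
        # parameters passed to execute(), not insert(), trigger executemany
        conn.execute(users.insert(), data)
        row = conn.execute(select([users]).order_by('id')).fetchone()
        assert row['name'] == 'testname1'
    finally:
        conn.close()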
def drop_table(self, schema_name, table_name):
    metadata = MetaData()

    self.logger.debug(f"Dropping table {schema_name}.{table_name}")

    table = Table(table_name, metadata, schema=schema_name)
    table.drop(self.target_db, checkfirst=True)

    self.logger.debug(f"Dropped table {schema_name}.{table_name}")
def downgrade(migrate_engine):
    meta = MetaData()
    meta.bind = migrate_engine

    share_snapshots = Table('share_snapshots', meta, autoload=True)

    try:
        share_snapshots.drop()
    except Exception:
        LOG.error(_("share_snapshots table not dropped"))
        raise
def test_insert_values(self, engine, connection):
    table = Table('insert_test', MetaData(bind=engine),
                  Column('a', sqlalchemy.types.Integer))
    table.drop(checkfirst=True)
    table.create()
    connection.execute(table.insert([{'a': 1}, {'a': 2}]))

    result = table.select().execute().fetchall()
    expected = [(1,), (2,)]
    self.assertEqual(result, expected)
def test_insert_select(self, engine, connection):
    one_row = Table('one_row', MetaData(bind=engine), autoload=True)
    table = Table('insert_test', MetaData(bind=engine),
                  Column('a', sqlalchemy.types.Integer))
    table.drop(checkfirst=True)
    table.create()
    connection.execute(table.insert().from_select(['a'], one_row.select()))

    result = table.select().execute().fetchall()
    expected = [(1,)]
    self.assertEqual(result, expected)
def test_insert_values(self, engine, connection):
    table = Table('insert_test', MetaData(bind=engine),
                  Column('a', sqlalchemy.types.Integer),
                  schema='pyhive_test_database')
    table.drop(checkfirst=True)
    table.create()
    connection.execute(table.insert([{'a': 1}, {'a': 2}]))

    result = table.select().execute().fetchall()
    expected = [(1,), (2,)]
    self.assertEqual(result, expected)
def drop_table(engine, table_name):
    # Accept Connection objects here
    if hasattr(engine, 'engine'):
        engine = engine.engine

    if table_name in TABLES[engine]:
        table = TABLES[engine][table_name]
    elif engine.has_table(table_name):
        table = Table(table_name, engine._metadata)
    else:
        return
    table.drop(engine)
    TABLES[engine].pop(table_name, None)
def test_insert_select(self, engine, connection):
    one_row = Table('one_row', MetaData(bind=engine), autoload=True)
    table = Table('insert_test', MetaData(bind=engine),
                  Column('a', sqlalchemy.types.Integer),
                  schema='pyhive_test_database')
    table.drop(checkfirst=True)
    table.create()
    connection.execute('SET mapred.job.tracker=local')
    # NOTE(jing) I'm stuck on a version of Hive without INSERT ... VALUES
    connection.execute(table.insert().from_select(['a'], one_row.select()))

    result = table.select().execute().fetchall()
    expected = [(1,)]
    self.assertEqual(result, expected)
def drop_table(engine, table_name):
    # Accept Connection objects here
    if hasattr(engine, 'engine'):
        engine = engine.engine

    with lock:
        if table_name in engine._tables:
            table = engine._tables[table_name]
        elif engine.has_table(table_name):
            table = Table(table_name, engine._metadata)
        else:
            return
        table.drop(engine)
        engine._tables.pop(table_name, None)
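# Note (addition, not from the original source): Engine.has_table() was
# deprecated in SQLAlchemy 1.4 in favour of the runtime inspection API. A
# minimal sketch of the same existence check using sqlalchemy.inspect();
# the cache-free fallback path is illustrative only:
from sqlalchemy import MetaData, Table, inspect

def drop_table_with_inspector(engine, table_name):
    if hasattr(engine, 'engine'):  # accept Connection objects too
        engine = engine.engine
    if inspect(engine).has_table(table_name):
        # A column-less Table object is enough to emit DROP TABLE
        Table(table_name, MetaData()).drop(engine)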
def drop_table(engine, table_name): # Accept Connection objects here if hasattr(engine, "engine"): engine = engine.engine with lock: if table_name in engine._tables: table = engine._tables[table_name] elif engine.has_table(table_name): table = Table(table_name, engine._metadata) else: return table.drop(engine) engine._tables.pop(table_name, None)
class TableHandler(object):
    """ Used by automatically generated objects such as datasets
    and dimensions to generate, write and clear the table under
    its management. """

    def _init_table(self, meta, namespace, name, id_type=Integer):
        """ Create the given table if it does not exist, otherwise
        reflect the current table schema from the database. """
        name = namespace + '__' + name
        self.table = Table(name, meta)
        if id_type is not None:
            col = Column('id', id_type, primary_key=True)
            self.table.append_column(col)

    def _generate_table(self):
        """ Create the given table if it does not exist. """
        # TODO: make this support some kind of migration?
        if not db.engine.has_table(self.table.name):
            self.table.create(db.engine)

    def _upsert(self, bind, data, unique_columns):
        """ Upsert a set of values into the table. This will
        query for the set of unique columns and either update an
        existing row or create a new one. In both cases, the ID
        of the changed row will be returned. """
        key = and_(*[self.table.c[c] == data.get(c)
                     for c in unique_columns])
        q = self.table.update(key, data)
        if bind.execute(q).rowcount == 0:
            q = self.table.insert(data)
            rs = bind.execute(q)
            return rs.inserted_primary_key[0]
        else:
            q = self.table.select(key)
            row = bind.execute(q).fetchone()
            return row['id']

    def _flush(self, bind):
        """ Delete all rows in the table. """
        q = self.table.delete()
        bind.execute(q)

    def _drop(self, bind):
        """ Drop the table and the local reference to it. """
        if db.engine.has_table(self.table.name):
            self.table.drop()
        del self.table
def test_lots_of_types(self, engine, connection):
    # take type list from sqlalchemy.types
    types = [
        'INT', 'CHAR', 'VARCHAR', 'NCHAR', 'TEXT', 'Text', 'FLOAT',
        'NUMERIC', 'DECIMAL', 'TIMESTAMP', 'DATETIME', 'CLOB', 'BLOB',
        'BOOLEAN', 'SMALLINT', 'DATE', 'TIME',
        'String', 'Integer', 'SmallInteger',
        'Numeric', 'Float', 'DateTime', 'Date', 'Time', 'LargeBinary',
        'Boolean', 'Unicode', 'UnicodeText',
    ]
    cols = []
    for i, t in enumerate(types):
        cols.append(Column(str(i), getattr(sqlalchemy.types, t)))
    table = Table('test_table', MetaData(bind=engine), *cols)
    table.drop(checkfirst=True)
    table.create()
    table.drop()
def test_lots_of_types(self, engine, connection):
    # Presto doesn't have raw CREATE TABLE support, so we only test hive
    # take type list from sqlalchemy.types
    types = [
        'INT', 'CHAR', 'VARCHAR', 'NCHAR', 'TEXT', 'Text', 'FLOAT',
        'NUMERIC', 'DECIMAL', 'TIMESTAMP', 'DATETIME', 'CLOB', 'BLOB',
        'BOOLEAN', 'SMALLINT', 'DATE', 'TIME',
        'String', 'Integer', 'SmallInteger',
        'Numeric', 'Float', 'DateTime', 'Date', 'Time', 'Binary',
        'Boolean', 'Unicode', 'UnicodeText',
    ]
    cols = []
    for i, t in enumerate(types):
        cols.append(Column(str(i), getattr(sqlalchemy.types, t)))
    cols.append(Column('hive_date', HiveDate))
    cols.append(Column('hive_decimal', HiveDecimal))
    cols.append(Column('hive_timestamp', HiveTimestamp))
    table = Table('test_table', MetaData(bind=engine), *cols,
                  schema='pyhive_test_database')
    table.drop(checkfirst=True)
    table.create()
    connection.execute('SET mapred.job.tracker=local')
    connection.execute('USE pyhive_test_database')
    big_number = 10 ** 10 - 1
    connection.execute("""
    INSERT OVERWRITE TABLE test_table
    SELECT
        1, "a", "a", "a", "a", "a", 0.1,
        0.1, 0.1, 0, 0, "a", "a",
        false, 1, 0, 0,
        "a", 1, 1,
        0.1, 0.1, 0, 0, 0, "a",
        false, "a", "a",
        0, %d, 123 + 2000
    FROM default.one_row
    """, big_number)
    row = connection.execute(table.select()).fetchone()
    self.assertEqual(row.hive_date, datetime.date(1970, 1, 1))
    self.assertEqual(row.hive_decimal, decimal.Decimal(big_number))
    self.assertEqual(row.hive_timestamp,
                     datetime.datetime(1970, 1, 1, 0, 0, 2, 123))
    table.drop()
def create_table(self, schema_name, table_name, columns_configuration,
                 drop_first):
    metadata = MetaData()

    table = Table(table_name, metadata, schema=schema_name)

    for column_configuration in columns_configuration:
        table.append_column(
            self.create_column(column_configuration["destination"]))

    table.append_column(
        Column(
            Providers.AuditColumnsNames.TIMESTAMP,
            DateTime(timezone=True),
            server_default=func.now(),
        ))

    table.append_column(
        Column(
            Providers.AuditColumnsNames.IS_DELETED,
            Boolean,
            server_default="f",
            default=False,
        ))

    table.append_column(
        Column(Providers.AuditColumnsNames.CHANGE_VERSION, BigInteger))

    if drop_first:
        self.logger.debug(f"Dropping table {schema_name}.{table_name}")
        table.drop(self.target_db, checkfirst=True)
        self.logger.debug(f"Dropped table {schema_name}.{table_name}")

    self.logger.debug(f"Creating table {schema_name}.{table_name}")
    table.create(self.target_db, checkfirst=False)
    self.logger.debug(f"Created table {schema_name}.{table_name}")

    return
def upgrade(migrate_engine):
    meta.bind = migrate_engine

    # Load the database tables
    servers_table = Table('servers', meta, autoload=True)
    servers_table.drop()
def downgrade(migrate_engine): meta.bind = migrate_engine # Find the table and drop it zone_tasks_table = Table("zone_tasks", meta, autoload=True) zone_tasks_table.drop()
from sqlalchemy.schema import MetaData, Table, Column
from sqlalchemy.types import Unicode

engine = create_engine('postgresql://python@localhost/')
loader = PostgresLoader(engine)
testmodule = loader.load_module('testmodule')

metadata = MetaData(bind=engine)
table = Table('testtable', metadata,
              Column('test', Unicode),
              Column('test2', Unicode))
table.drop(checkfirst=True)
table.create(checkfirst=True)

for i in range(20):
    table.insert({'test': 'test%d' % i, 'test2': 'test%d' % i}).execute()

print(engine.execute(
    testmodule.pyconcat(table.c.test, table.c.test2)).fetchall())

statement = """
CREATE TRIGGER mytrigger BEFORE INSERT ON %s
FOR EACH ROW EXECUTE PROCEDURE %s();
"""
engine.execute(statement % (table.name,
                            testmodule.nullifying_trigger.__name__))
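# Note (addition, not from the original script): table.insert(...).execute()
# relies on bound metadata and implicit execution, both removed in
# SQLAlchemy 2.0. A minimal sketch of the same inserts through an explicit
# connection, assuming the `engine` and `table` objects defined above:
with engine.begin() as conn:
    conn.execute(
        table.insert(),
        [{'test': 'test%d' % i, 'test2': 'test%d' % i} for i in range(20)],
    )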
def wipe_opt_out_patients(report_every: int = 1000,
                          chunksize: int = 10000) -> None:
    """
    Delete any data from patients that have opted out (after their data was
    processed on a previous occasion).
    (Slightly complicated by the fact that the destination database can't
    necessarily 'see' the mapping database, so we need to cache the RID keys
    in the destination database temporarily.)
    """
    start = "wipe_opt_out_patients"
    log.info(start)

    adminsession = config.admindb.session
    metadata = MetaData()  # operate in isolation!
    destengine = config.destdb.engine
    destsession = config.destdb.session
    ridfield = config.research_id_fieldname

    # Drop/create temporary table
    pkfield = 'rid'
    temptable = Table(
        config.temporary_tablename,
        metadata,
        Column(pkfield, config.SqlTypeEncryptedPid, primary_key=True),
        **TABLE_KWARGS)
    log.debug(start + ": 1. dropping temporary table")
    temptable.drop(destengine, checkfirst=True)  # use engine, not session
    log.debug(start + ": 2. making temporary table")
    temptable.create(destengine, checkfirst=True)  # use engine, not session

    log.debug(start + ": 3. populating temporary table with RIDs")

    def insert(records_):
        # records_: a list of dictionaries
        # http://docs.sqlalchemy.org/en/latest/core/tutorial.html
        log.debug(start + "... inserting {} records".format(len(records_)))
        destsession.execute(temptable.insert(), records_)

    i = 0
    records = []  # type: List[Dict[str, Any]]
    for rid in gen_optout_rids():
        i += 1
        if report_every and i % report_every == 0:
            log.debug(start + "... src row# {}".format(i))
        records.append({pkfield: rid})  # a row is a dict of values
        if i % chunksize == 0:
            insert(records)
            records = []  # type: List[Dict[str, Any]]
    if records:  # remainder
        insert(records)
    commit_destdb()

    log.debug(start + ": 4. creating index on temporary table")
    index = Index('_temptable_idx', temptable.columns[pkfield])
    index.create(destengine)  # use engine, not session

    # 5. For each patient destination table,
    #    DELETE FROM desttable WHERE rid IN (SELECT rid FROM temptable)
    log.debug(start + ": 5. deleting from destination table by opt-out RID")
    for dest_table_name in config.dd.get_dest_tables_with_patient_info():
        log.debug(start + ": ... {}".format(dest_table_name))
        dest_table = config.dd.get_dest_sqla_table(dest_table_name)
        query = dest_table.delete().where(
            column(ridfield).in_(select([temptable.columns[pkfield]])))
        destengine.execute(query)
        commit_destdb()

    log.debug(start + ": 6. dropping temporary table")
    temptable.drop(destengine, checkfirst=True)  # use engine, not session
    commit_destdb()

    log.debug(start + ": 7. deleting opt-out patients from mapping table")
    adminsession.query(PatientInfo).filter(
        or_(PatientInfo.pid.in_(adminsession.query(OptOutPid.pid)),
            PatientInfo.mpid.in_(adminsession.query(OptOutMpid.mpid)))
    ).delete(synchronize_session=False)
    commit_admindb()
def delete_dest_rows_with_no_src_row(
        srcdbname: str,
        src_table: str,
        report_every: int = DEFAULT_REPORT_EVERY,
        chunksize: int = DEFAULT_CHUNKSIZE) -> None:
    """
    For a given source database/table, delete any rows in the corresponding
    destination table where there is no corresponding source row.

    - Can't do this in a single SQL command, since the engine can't
      necessarily see both databases.
    - Can't do this in a multiprocess way, because we're trying to do a
      DELETE WHERE NOT IN.
    - However, we can get stupidly long query lists if we try to SELECT all
      the values and use a DELETE FROM x WHERE y NOT IN (v1, v2, v3, ...)
      query. This crashes the MySQL connection, etc.
    - Therefore, we need a temporary table in the destination.
    """
    if not config.dd.has_active_destination(srcdbname, src_table):
        return
    dest_table_name = config.dd.get_dest_table_for_src_db_table(
        srcdbname, src_table)
    start = "delete_dest_rows_with_no_src_row: {}.{} -> {}.{}: ".format(
        srcdbname, src_table, config.destdb.name, dest_table_name)
    log.info(start + "[WARNING: MAY BE SLOW]")

    metadata = MetaData()  # operate in isolation!
    destengine = config.destdb.engine
    destsession = config.destdb.session
    dest_table = config.dd.get_dest_sqla_table(dest_table_name)
    pkddr = config.dd.get_pk_ddr(srcdbname, src_table)

    # If there's no source PK, we just delete everything
    if not pkddr:
        log.info("... No source PK; deleting everything")
        destsession.execute(dest_table.delete())
        commit_destdb()
        return

    if pkddr.addition_only:
        log.info("... Table marked as addition-only; not deleting anything")
        return

    # Drop/create temporary table
    pkfield = 'srcpk'
    temptable = Table(
        config.temporary_tablename,
        metadata,
        Column(pkfield, pkddr.get_dest_sqla_coltype(), primary_key=True),
        **TABLE_KWARGS)
    # THIS (ABOVE) IS WHAT CONSTRAINS A USER-DEFINED PK TO BE UNIQUE WITHIN
    # ITS TABLE.
    log.debug("... dropping temporary table")
    temptable.drop(destengine, checkfirst=True)
    log.debug("... making temporary table")
    temptable.create(destengine, checkfirst=True)

    # Populate temporary table, +/- PK translation
    n = count_star(config.sources[srcdbname].session, src_table)
    log.debug("... populating temporary table: {} records to go".format(n))

    def insert(records_):
        log.debug(start + "... inserting {} records".format(len(records_)))
        destsession.execute(temptable.insert(), records_)

    i = 0
    records = []  # type: List[Dict[str, Any]]
    for pk in gen_pks(srcdbname, src_table, pkddr.src_field):
        i += 1
        if report_every and i % report_every == 0:
            log.debug(start + "... src row# {} / {}".format(i, n))
        if pkddr.primary_pid:
            pk = config.encrypt_primary_pid(pk)
        elif pkddr.master_pid:
            pk = config.encrypt_master_pid(pk)
        records.append({pkfield: pk})
        if i % chunksize == 0:
            insert(records)
            records = []  # type: List[Dict[str, Any]]
    if records:  # remainder
        insert(records)
    commit_destdb()

    # 4. Index -- no, hang on, it's a primary key already
    #
    # log.debug("... creating index on temporary table")
    # index = Index('_temptable_idx', temptable.columns[pkfield])
    # index.create(destengine)

    # 5. DELETE FROM desttable
    #    WHERE destpk NOT IN (SELECT srcpk FROM temptable)
    log.debug("... deleting from destination where appropriate")
    query = dest_table.delete().where(
        ~column(pkddr.dest_field).in_(select([temptable.columns[pkfield]])))
    destengine.execute(query)
    commit_destdb()

    # 6. Drop temporary table
    log.debug("... dropping temporary table")
    temptable.drop(destengine, checkfirst=True)

    # 7. Commit
    commit_destdb()
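# Note (addition, not from the original source): the generic shape of the
# temporary-table delete used by the two functions above, reduced to a
# self-contained sketch in the same SQLAlchemy 1.x Core style. All names
# (engine, dest, temp) are illustrative only.
from sqlalchemy import (BigInteger, Column, MetaData, Table, create_engine,
                        select)

engine = create_engine('sqlite://')
meta = MetaData()
dest = Table('dest', meta, Column('pk', BigInteger, primary_key=True))
temp = Table('_temp_pks', meta, Column('pk', BigInteger, primary_key=True))
meta.create_all(engine)

with engine.begin() as conn:
    conn.execute(dest.insert(), [{'pk': 1}, {'pk': 2}, {'pk': 3}])
    conn.execute(temp.insert(), [{'pk': 1}, {'pk': 3}])  # PKs still in source
    # DELETE FROM dest WHERE pk NOT IN (SELECT pk FROM _temp_pks)
    conn.execute(dest.delete().where(~dest.c.pk.in_(select([temp.c.pk]))))
temp.drop(engine)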
class Table(object):
    def __init__(self, db, schema, table, columns=None):
        self.db = db
        self.schema = schema
        self.name = table
        self.engine = create_engine(db.url)
        self.metadata = MetaData(schema=schema)
        self.metadata.bind = self.engine
        # http://docs.sqlalchemy.org/en/rel_1_0/core/metadata.html
        # if provided columns (SQLAlchemy columns), create the table
        if table:
            if columns:
                self.table = SQLATable(
                    table, self.metadata, schema=self.schema, *columns
                )
                self.table.create()
            # otherwise just load from db
            else:
                self.table = SQLATable(
                    table, self.metadata, schema=self.schema, autoload=True
                )
            self.indexes = dict((i.name, i) for i in self.table.indexes)
            self._is_dropped = False
        else:
            self._is_dropped = True
            self.table = None

    @property
    def _normalized_columns(self):
        return list(map(normalize_column_name, self.columns))

    @property
    def columns(self):
        """Return list of all columns in table
        """
        return list(self.table.columns.keys())

    @property
    def sqla_columns(self):
        """Return all columns in table as sqlalchemy column types
        """
        return self.table.columns

    @property
    def column_types(self):
        """Return a dict mapping column name to type for all columns in table
        """
        column_types = {}
        for c in self.sqla_columns:
            column_types[c.name] = c.type
        return column_types

    @property
    def primary_key(self):
        """Return a list of columns making up the primary key constraint
        """
        return [c.name for c in self.table.primary_key]

    @property
    def op(self):
        ctx = MigrationContext.configure(self.engine.connect())
        return Operations(ctx)

    def _valid_table_name(self, table_name):
        """Check if the table name is obviously invalid.
        """
        if table_name is None or not len(table_name.strip()):
            raise ValueError("Invalid table name: %r" % table_name)
        return table_name.strip()

    def _update_table(self, table_name):
        self.metadata = MetaData(schema=self.schema)
        self.metadata.bind = self.engine
        return SQLATable(table_name, self.metadata, schema=self.schema)

    def add_primary_key(self, column="id"):
        """Add primary key constraint to specified column
        """
        if not self.primary_key:
            sql = """ALTER TABLE {s}.{t}
                     ADD PRIMARY KEY ({c})
                  """.format(s=self.schema, t=self.name, c=column)
            self.db.execute(sql)

    def drop(self):
        """Drop the table from the database
        """
        if self._is_dropped is False:
            self.table.drop(self.engine)
        self._is_dropped = True

    def _check_dropped(self):
        if self._is_dropped:
            raise DatasetException(
                "the table has been dropped. this object should not be used "
                "again."
            )

    def _args_to_clause(self, args):
        clauses = []
        for k, v in args.items():
            if isinstance(v, (list, tuple)):
                clauses.append(self.table.c[k].in_(v))
            else:
                clauses.append(self.table.c[k] == v)
        return and_(*clauses)

    def create_column(self, name, type):
        """
        Explicitly create a new column ``name`` of a specified type.
        ``type`` must be a `SQLAlchemy column type
        <http://docs.sqlalchemy.org/en/rel_0_8/core/types.html>`_.
        ::

            table.create_column('created_at', sqlalchemy.DateTime)
        """
        self._check_dropped()
        if normalize_column_name(name) not in self._normalized_columns:
            self.op.add_column(self.table.name, Column(name, type),
                               self.table.schema)
            self.table = self._update_table(self.table.name)

    def drop_column(self, name):
        """
        Drop the column ``name``
        ::

            table.drop_column('created_at')
        """
        self._check_dropped()
        if name in list(self.table.columns.keys()):
            self.op.drop_column(self.table.name, name, schema=self.schema)
            self.table = self._update_table(self.table.name)

    def create_index(self, columns, name=None, index_type="btree"):
        """
        Create an index to speed up queries on a table.
        If no ``name`` is given a random name is created.
        ::

            table.create_index(['name', 'country'])
        """
        self._check_dropped()
        if not name:
            sig = "||".join(columns + [index_type])

            # This is a work-around for a bug in <=0.6.1 which would create
            # indexes based on hash() rather than a proper hash.
            key = abs(hash(sig))
            name = "ix_%s_%s" % (self.table.name, key)
            if name in self.indexes:
                return self.indexes[name]

            key = sha1(sig.encode("utf-8")).hexdigest()[:16]
            name = "ix_%s_%s" % (self.table.name, key)

        if name in self.indexes:
            return self.indexes[name]
        # self.db._acquire()
        columns = [self.table.c[col] for col in columns]
        idx = Index(name, *columns, postgresql_using=index_type)
        idx.create(self.engine)
        # finally:
        #     self.db._release()
        self.indexes[name] = idx
        return idx

    def create_index_geom(self, column="geom"):
        """Shortcut to create index on geometry
        """
        self.create_index([column], index_type="gist")

    def distinct(self, *columns, **_filter):
        """
        Returns all rows of a table, but removes rows with duplicate values in
        ``columns``. Internally this creates a `DISTINCT statement
        <http://www.w3schools.com/sql/sql_distinct.asp>`_.
        ::

            # returns only one row per year, ignoring the rest
            table.distinct('year')
            # works with multiple columns, too
            table.distinct('year', 'country')
            # you can also combine this with a filter
            table.distinct('year', country='China')
        """
        self._check_dropped()
        qargs = []
        try:
            columns = [self.table.c[c] for c in columns]
            for col, val in _filter.items():
                qargs.append(self.table.c[col] == val)
        except KeyError:
            return []

        q = expression.select(
            columns,
            distinct=True,
            whereclause=and_(*qargs),
            order_by=[c.asc() for c in columns],
        )
        # if just looking at one column, return a simple list
        if len(columns) == 1:
            return itertools.chain.from_iterable(self.engine.execute(q))
        # otherwise return specified row_type
        else:
            return ResultIter(self.engine.execute(q),
                              row_type=self.db.row_type)

    def insert(self, row):
        """
        Add a row (type: dict) by inserting it into the table. Columns must
        exist.
        ::

            data = dict(title='I am a banana!')
            table.insert(data)

        Returns the inserted row's primary key.
        """
        self._check_dropped()
        res = self.engine.execute(self.table.insert(row))
        if len(res.inserted_primary_key) > 0:
            return res.inserted_primary_key[0]

    def insert_many(self, rows, chunk_size=1000):
        """
        Add many rows at a time, which is significantly faster than adding
        them one by one. Per default the rows are processed in chunks of
        1000 per commit, unless you specify a different ``chunk_size``.

        See :py:meth:`insert() <dataset.Table.insert>` for details on the
        other parameters.
        ::

            rows = [dict(name='Dolly')] * 10000
            table.insert_many(rows)
        """

        def _process_chunk(chunk):
            self.table.insert().execute(chunk)

        self._check_dropped()

        chunk = []
        for i, row in enumerate(rows, start=1):
            chunk.append(row)
            if i % chunk_size == 0:
                _process_chunk(chunk)
                chunk = []
        if chunk:
            _process_chunk(chunk)

    def rename(self, name):
        """Rename the table
        """
        sql = """ALTER TABLE {s}.{t}
                 RENAME TO {name}
              """.format(s=self.schema, t=self.name, name=name)
        self.engine.execute(sql)
        self.table = SQLATable(name, self.metadata, schema=self.schema,
                               autoload=True)

    def find_one(self, **kwargs):
        """
        Works just like :py:meth:`find() <dataset.Table.find>` but returns
        one result, or None.
        ::

            row = table.find_one(country='United States')
        """
        kwargs["_limit"] = 1
        iterator = self.find(**kwargs)
        try:
            return next(iterator)
        except StopIteration:
            return None

    def _args_to_order_by(self, order_by):
        if order_by[0] == "-":
            return self.table.c[order_by[1:]].desc()
        else:
            return self.table.c[order_by].asc()

    def find(self, _limit=None, _offset=0, _step=5000, order_by="id",
             return_count=False, **_filter):
        """
        Performs a simple search on the table. Simply pass keyword arguments
        as ``filter``.
        ::

            results = table.find(country='France')
            results = table.find(country='France', year=1980)

        Using ``_limit``::

            # just return the first 10 rows
            results = table.find(country='France', _limit=10)

        You can sort the results by single or multiple columns. Append a
        minus sign to the column name for descending order::

            # sort results by a column 'year'
            results = table.find(country='France', order_by='year')
            # return all rows sorted by multiple columns
            # (by year in descending order)
            results = table.find(order_by=['country', '-year'])

        By default :py:meth:`find() <dataset.Table.find>` will break the
        query into chunks of ``_step`` rows to prevent huge tables from being
        loaded into memory at once.

        For more complex queries, please use :py:meth:`db.query()` instead.
        """
        self._check_dropped()
        if not isinstance(order_by, (list, tuple)):
            order_by = [order_by]
        order_by = [
            o for o in order_by
            if (o.startswith("-") and o[1:] or o) in self.table.columns
        ]
        order_by = [self._args_to_order_by(o) for o in order_by]

        args = self._args_to_clause(_filter)

        # query total number of rows first
        count_query = alias(
            self.table.select(whereclause=args, limit=_limit, offset=_offset),
            name="count_query_alias",
        ).count()
        rp = self.engine.execute(count_query)
        total_row_count = rp.fetchone()[0]
        if return_count:
            return total_row_count

        if _limit is None:
            _limit = total_row_count

        if _step is None or _step is False or _step == 0:
            _step = total_row_count

        if total_row_count > _step and not order_by:
            _step = total_row_count
            log.warn(
                "query cannot be broken into smaller sections because it is "
                "unordered"
            )

        queries = []
        for i in count():
            qoffset = _offset + (_step * i)
            qlimit = min(_limit - (_step * i), _step)
            if qlimit <= 0:
                break
            queries.append(
                self.table.select(whereclause=args, limit=qlimit,
                                  offset=qoffset, order_by=order_by)
            )
        return ResultIter(
            (self.engine.execute(q) for q in queries),
            row_type=self.db.row_type,
        )

    def count(self, **_filter):
        """
        Return the count of results for the given filter set (same filter
        options as with ``find()``).
        """
        return self.find(return_count=True, **_filter)

    def __getitem__(self, item):
        """
        This is an alias for distinct which allows the table to be queried
        using square bracket syntax.
        ::

            # Same as distinct:
            print(list(table['year']))
        """
        if not isinstance(item, tuple):
            item = (item,)
        return self.distinct(*item)

    def all(self):
        """
        Returns all rows of the table as simple dictionaries. This is simply
        a shortcut to *find()* called with no arguments.
        ::

            rows = table.all()
        """
        return self.find()

    def __iter__(self):
        """
        Allows for iterating over all rows in the table without explicitly
        calling :py:meth:`all() <dataset.Table.all>`.
        ::

            for row in table:
                print(row)
        """
        return self.all()

    def __repr__(self):
        return "<Table(%s)>" % self.table.name
def drop_table(eng, tbl_name):
    table = Table(tbl_name, MetaData(), autoload_with=eng)
    # The MetaData here is unbound, so pass the engine explicitly.
    if table.exists(eng):
        table.drop(eng)
        return True
    return False
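# Note (addition, not from the original source): Table.exists() was deprecated
# in SQLAlchemy 1.4 and removed in 2.0, and reflecting a missing table raises
# NoSuchTableError before exists() is even reached. A minimal sketch of the
# same behaviour using the inspection API; names mirror the function above:
from sqlalchemy import MetaData, Table, inspect

def drop_table_if_exists(eng, tbl_name):
    if not inspect(eng).has_table(tbl_name):
        return False
    Table(tbl_name, MetaData(), autoload_with=eng).drop(eng)
    return True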
def delete_table(self, table_name):
    table = Table(table_name, self.meta)
    table.drop()
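# Note (addition, not from the original source): the two-line delete_table
# above assumes self.meta is bound and that the table may be dropped blindly.
# A slightly more defensive sketch; self.engine is an assumed attribute here,
# not something the original object is known to have:
def delete_table_checked(self, table_name):
    table = Table(table_name, self.meta, autoload_with=self.engine)
    table.drop(self.engine, checkfirst=True)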
def delete_where_no_source(nlpdef: NlpDefinition,
                           ifconfig: InputFieldConfig,
                           report_every: int = DEFAULT_REPORT_EVERY,
                           chunksize: int = DEFAULT_CHUNKSIZE) -> None:
    """
    Delete destination records where source records no longer exist.

    - Can't do this in a single SQL command, since the engine can't
      necessarily see both databases.
    - Can't use a single temporary table, since the progress database isn't
      necessarily the same as any of the destination database(s).
    - Can't do this in a multiprocess way, because we're trying to do a
      DELETE WHERE NOT IN.
    - So we fetch all source PKs (which, by definition, do exist), keep them
      in memory, and do a DELETE WHERE NOT IN based on those specified values
      (or, if there are no PKs in the source, delete everything from the
      destination).

    Problems:

    - This is IMPERFECT if we have string source PKs and there are hash
      collisions (e.g. PKs for records X and Y both hash to the same thing;
      record X is deleted; then its processed version might not be).
    - With massive tables, we might run out of memory or (much more likely)
      SQL parameter slots. -- This is now happening; error looks like:
      pyodbc.ProgrammingError: ('The SQL contains 30807 parameter markers,
      but 2717783 parameters were supplied', 'HY000')

    A better way might be:

    - for each table, make a temporary table in the same database
    - populate that table with (source PK integer/hash, source PK string)
      pairs
    - delete where pairs don't match -- is that portable SQL?
      http://stackoverflow.com/questions/7356108/sql-query-for-deleting-rows-with-not-in-using-2-columns  # noqa
    - More efficient would be to make one table per destination database.

    On the "delete where multiple fields don't match":

    - Single field syntax is

        DELETE FROM a WHERE a1 NOT IN (SELECT b1 FROM b)

    - Multiple field syntax is

        DELETE FROM a WHERE NOT EXISTS (
            SELECT 1 FROM b
            WHERE a.a1 = b.b1
            AND a.a2 = b.b2
        )

    - In SQLAlchemy, exists():

        http://stackoverflow.com/questions/14600619
        http://docs.sqlalchemy.org/en/latest/core/selectable.html

    - Furthermore, in SQL NULL = NULL is false, and NULL <> NULL is also
      false, so we have to do an explicit null check. You do that with
      "field == None". See http://stackoverflow.com/questions/21668606

    We're aiming, therefore, for:

        DELETE FROM a WHERE NOT EXISTS (
            SELECT 1 FROM b
            WHERE a.a1 = b.b1
            AND (
                a.a2 = b.b2
                OR (a.a2 IS NULL AND b.b2 IS NULL)
            )
        )
    """

    # -------------------------------------------------------------------------
    # Sub-functions
    # -------------------------------------------------------------------------

    def insert(records_):
        n_rows = len(records_)
        log.debug("... inserting {} records".format(n_rows))
        for db in databases:
            session_ = db['session']
            temptable_ = db['temptable']  # type: Table
            session_.execute(temptable_.insert(), records_)
            nlpdef.notify_transaction(session_, n_rows=n_rows,
                                      n_bytes=sys.getsizeof(records_))

    def commit():
        for db in databases:
            nlpdef.commit(db['session'])

    # -------------------------------------------------------------------------
    # Main code
    # -------------------------------------------------------------------------
    # Use info log level, otherwise it looks like our code hangs with very
    # large databases.

    log.info("delete_where_no_source: examining source table {}.{}; "
             "MAY BE SLOW".format(ifconfig.get_srcdb(),
                                  ifconfig.get_srctable()))

    # Start our list with the progress database
    databases = [{
        'session': nlpdef.get_progdb_session(),
        'engine': nlpdef.get_progdb_engine(),
        'metadata': nlpdef.get_progdb_metadata(),
        'temptable': None,  # type: Table
    }]

    # Add the processors' destination databases
    for processor in nlpdef.get_processors():  # of type BaseNlpParser
        session = processor.get_session()
        if any(x['session'] == session for x in databases):
            continue  # already exists
        databases.append({
            'session': session,
            'engine': processor.get_engine(),
            'metadata': processor.get_metadata(),
        })

    # Make a temporary table in each database (note: the Table objects become
    # affiliated to their engine, I think, so make separate ones for each).
    log.info("... using {n} destination database(s)".format(n=len(databases)))
    log.info("... dropping (if exists) and creating temporary table(s)")
    for database in databases:
        engine = database['engine']
        temptable = Table(
            nlpdef.get_temporary_tablename(),
            database['metadata'],
            Column(FN_SRCPKVAL, BigInteger),  # not PK, as may be a hash
            Column(FN_SRCPKSTR, String(MAX_STRING_PK_LENGTH)),
            **TABLE_KWARGS)
        temptable.drop(engine, checkfirst=True)
        temptable.create(engine, checkfirst=True)
        database['temptable'] = temptable

    # Insert PKs into temporary tables

    n = count_star(ifconfig.get_source_session(), ifconfig.get_srctable())
    log.info("... populating temporary table(s): {} records to go; working "
             "in chunks of {}".format(n, chunksize))
    i = 0
    records = []  # type: List[Dict[str, Any]]
    for pkval, pkstr in ifconfig.gen_src_pks():
        i += 1
        if report_every and i % report_every == 0:
            log.info("... src row# {} / {}".format(i, n))
        records.append({FN_SRCPKVAL: pkval, FN_SRCPKSTR: pkstr})
        if i % chunksize == 0:
            insert(records)
            records = []  # type: List[Dict[str, Any]]
    if records:  # remainder
        insert(records)

    # Commit
    commit()

    # Index, for speed
    log.info("... creating index(es) on temporary table(s)")
    for database in databases:
        temptable = database['temptable']  # type: Table
        index = Index('_temptable_idx', temptable.columns[FN_SRCPKVAL])
        index.create(database['engine'])

    # DELETE FROM desttable WHERE destpk NOT IN (SELECT srcpk FROM temptable)
    log.info("... deleting from progress/destination DBs where appropriate")

    # Delete from progress database
    prog_db = databases[0]
    prog_temptable = prog_db['temptable']
    ifconfig.delete_progress_records_where_srcpk_not(prog_temptable)

    # Delete from others
    for processor in nlpdef.get_processors():
        database = [x for x in databases
                    if x['session'] == processor.get_session()][0]
        temptable = database['temptable']
        processor.delete_where_srcpk_not(ifconfig, temptable)

    # Drop temporary tables
    log.info("... dropping temporary table(s)")
    for database in databases:
        database['temptable'].drop(database['engine'], checkfirst=True)

    # Commit
    commit()