def clear_database(
        engine: Union[Engine, Connection], schemas: Iterable[str] = ()) -> None:
    """
    Clear any tables from an existing database.

    For SQLite engines, the target database file will be deleted and a new
    one is created in its place.

    :param engine: the engine or connection to use
    :param schemas: full list of schema names to expect (ignored for SQLite)

    """
    assert check_argument_types()
    if engine.dialect.name == 'sqlite':
        # SQLite does not support dropping constraints and it's faster to just delete the file
        db_path = engine.url.database
        if db_path not in (None, ':memory:') and os.path.isfile(db_path):
            os.remove(db_path)
    else:
        # Reflect every schema up front (tables, views, constraints) so that
        # cross-schema dependencies are known before anything is dropped.
        reflected = []
        for schema_name in (None,) + tuple(schemas):
            md = MetaData()
            md.reflect(engine, schema=schema_name, views=True)
            reflected.append(md)

        for md in reflected:
            md.drop_all(engine, checkfirst=False)
def reflect_model(self, table_name, bind_key=None):
    """
    Reflect a database table into an automapped ORM model class.

    :param table_name: name of the table to reflect
    :param bind_key: optional bind key used to pick the engine
    :return: the generated ORM class
    """
    with self._reflect_lock:
        # Serve from the cache when this table was reflected previously.
        cached = self._models.get(table_name)
        if cached is not None:
            return cached

        engine = self.get_engine(bind_key)
        meta = MetaData(bind=engine)
        meta.reflect(only=[table_name])
        table = meta.tables[table_name]
        self._tables[table_name] = table

        Base = automap_base(metadata=meta)
        Base.prepare()
        model = getattr(Base.classes, table_name)
        # Detach the metadata so the model does not keep the bound engine alive.
        model.__table__.metadata = None
        self._models[table_name] = model
        return model
def test_clear_database(engine):
    """Clearing must leave no tables in the default or alternate schema."""
    clear_database(engine, ['altschema'])

    metadata = MetaData()
    metadata.reflect(engine)
    assert not metadata.tables

    # SQLite has no schema support, so only check 'altschema' elsewhere.
    if engine.dialect.name != 'sqlite':
        alt_metadata = MetaData(schema='altschema')
        alt_metadata.reflect(engine)
        assert not alt_metadata.tables
def test_get_table_names(self, engine, conn):
    """Reflection and inspection should both report the fixture tables."""
    meta = MetaData()
    meta.reflect(bind=engine)
    print(meta.tables)
    for expected in ("one_row", "one_row_complex"):
        self.assertIn(expected, meta.tables)

    inspector = sqlalchemy.inspect(engine)
    self.assertIn("many_rows", inspector.get_table_names(schema=SCHEMA))
def test_clear_database(connection):
    """clear_database leaves both the default and the alternate schema empty."""
    schemas = [] if connection.dialect.name == "sqlite" else ["altschema"]
    clear_database(connection, schemas)

    metadata = MetaData()
    metadata.reflect(connection)
    assert len(metadata.tables) == 0

    # SQLite has no separate schemas, so the extra check only applies elsewhere.
    if connection.dialect.name != "sqlite":
        alt_metadata = MetaData(schema="altschema")
        alt_metadata.reflect(connection)
        assert len(alt_metadata.tables) == 0
def reflect_table(self, table_name, bind_key=None):
    """Reflect and cache the Table object for *table_name*."""
    with self._reflect_lock:
        # EAFP: most calls hit the cache, so try it first.
        try:
            return self._tables[table_name]
        except KeyError:
            pass

        engine = self.get_engine(bind_key)
        meta = MetaData(bind=engine)
        meta.reflect(only=[table_name])
        table = meta.tables[table_name]
        # Detach from the bound metadata so the table holds no engine reference.
        table.metadata = None
        self._tables[table_name] = table
        return table
def clear_database(
        engine: Engine | Connection, schemas: Iterable[str] = ()) -> None:
    """
    Clear any tables from an existing database using a synchronous
    connection/engine.

    :param engine: the engine or connection to use
    :param schemas: full list of schema names to expect (ignored for SQLite)

    """
    # Reflect every schema first (tables, views, constraints) so that
    # cross-schema dependencies are known before anything is dropped.
    target_schemas: tuple[str | None, ...] = (None,) + tuple(schemas)
    reflected: list[MetaData] = []
    for schema in target_schemas:
        metadata = MetaData()
        metadata.reflect(engine, schema=schema, views=True)
        reflected.append(metadata)

    for metadata in reflected:
        metadata.drop_all(engine, checkfirst=False)
def run(self):
    """Copy every table definition and all row data from the source engine
    to the destination engine.

    NOTE(review): tables are dropped with unquoted, string-formatted SQL in
    reflection order, which can fail on FK-dependent tables and is unsafe
    for untrusted table names — confirm this only runs against trusted
    databases.
    """
    # Source reflection: load every table definition from the source DB.
    source_meta = MetaData()
    source_meta.reflect(bind=self.source_engine)
    source_tables = source_meta.tables
    source_table_names = [k for k, v in source_tables.items()]

    # Destination Binding: re-point each reflected table at the destination.
    destination_meta = MetaData(bind=self.destination_engine)
    for name, table in source_tables.items():
        table.metadata = destination_meta
        if name in self.settings.exclude_data.keys():
            # Set as a plain attribute on the Table; presumably consumed by
            # a mapper later — verify against the caller.
            table.__mapper_args__ = {"exclude_properties": self.settings.exclude_data[name]}

    # Drop table for testing purposes
    # destination_meta.drop_all(self.destination_engine)
    for table in source_table_names:
        self.sessions.destination.execute("DROP TABLE {table};".format(table=table))
        self.sessions.destination.commit()
        print("DROPPED TABLE {table}".format(table=table))

    # Begin migration: recreate the schema, then read all source rows into
    # memory before inserting (whole tables are materialized at once).
    source_meta.create_all(self.destination_engine)
    source_data = {table: self.sessions.source.query(source_tables[table]).all()
                   for table in source_table_names}
    for table in source_table_names:
        print("Migrating:", table)
        # if table in self.settings.exclude_data.keys():
        #     pprint(source_tables[table].__mapper_args__)
        #     exit(1)
        for row in source_data[table]:
            try:
                self.sessions.destination.execute(source_tables[table].insert(row))
            except StatementError:
                # Best effort: log the bad row and continue with the rest.
                print("Bad data in table: ", table, "row data:\n", row[0], "Error:", sys.exc_info()[0])
        print("Data for:", table, "added to the queue..")
        self.sessions.destination.commit()
    print("Migration Complete!")
def run(self):
    """Copy every table definition and all row data from the source engine
    to the destination engine.

    NOTE(review): tables are dropped with unquoted, string-formatted SQL in
    reflection order, which can fail on FK-dependent tables and is unsafe
    for untrusted table names — confirm this only runs against trusted
    databases.
    """
    # Source reflection: load every table definition from the source DB.
    source_meta = MetaData()
    source_meta.reflect(bind=self.source_engine)
    source_tables = source_meta.tables
    source_table_names = [k for k, v in source_tables.items()]

    # Destination Binding: re-point each reflected table at the destination.
    destination_meta = MetaData(bind=self.destination_engine)
    for name, table in source_tables.items():
        table.metadata = destination_meta
        if name in self.settings.exclude_data.keys():
            # Set as a plain attribute on the Table; presumably consumed by
            # a mapper later — verify against the caller.
            table.__mapper_args__ = {'exclude_properties': self.settings.exclude_data[name]}

    # Drop table for testing purposes
    # destination_meta.drop_all(self.destination_engine)
    for table in source_table_names:
        self.sessions.destination.execute('DROP TABLE {table};'.format(table=table))
        self.sessions.destination.commit()
        print('DROPPED TABLE {table}'.format(table=table))

    # Begin migration: recreate the schema, then read all source rows into
    # memory before inserting (whole tables are materialized at once).
    source_meta.create_all(self.destination_engine)
    source_data = {table: self.sessions.source.query(source_tables[table]).all()
                   for table in source_table_names}
    for table in source_table_names:
        print("Migrating:", table)
        # if table in self.settings.exclude_data.keys():
        #     pprint(source_tables[table].__mapper_args__)
        #     exit(1)
        for row in source_data[table]:
            try:
                self.sessions.destination.execute(source_tables[table].insert(row))
            except StatementError:
                # Best effort: log the bad row and continue with the rest.
                print('Bad data in table: ', table, 'row data:\n', row[0], 'Error:', sys.exc_info()[0])
        print('Data for:', table, 'added to the queue..')
        self.sessions.destination.commit()
    print('Migration Complete!')
# Enable table clearing when the sixth CLI argument equals "TRUE".
if (len(argv) > 6 and argv[6].upper() == 'TRUE'):
    limpar_tabela = True
else:
    limpar_tabela = False

# Map the "pontual" database (original comment: "Mapeia a database do pontual").
__m = MetaData(schema='pontual')
__m2 = MetaData(schema='seguranca')
# __m = MetaData()
url = "postgresql://%s:%s@%s/pontual" % (usuario, senha, ip_nome)
__engine = create_engine(url)
# __m.reflect(__engine, only=['linha', 'area_de_fiscalizacao', 'ponto_de_parada'])
__m2.reflect(__engine, only=['user'])
__Base = __Base = automap_base(bind=__engine, metadata=__m) if False else automap_base(bind=__engine, metadata=__m)
# NOTE(review): __m is never reflected above (the reflect call is commented
# out), so prepare(..., reflect=True) reflects the whole 'pontual' schema.
__Base.prepare(__engine, reflect=True)
__Base2 = automap_base(bind=__engine, metadata=__m2)
__Base2.prepare(__engine, reflect=True)

# ORM classes automapped from the database (original comment:
# "Objeto ORMs do sqlalchmy do pontual").
User = __Base2.classes.user
Linha = __Base.classes.linha
AreaDeFiscalizacao = __Base.classes.area_de_fiscalizacao
PontoDeParada = __Base.classes.ponto_de_parada
Sessao = sessionmaker(bind=__engine)
class EnronDB:
    """Thin data-access layer (Python 2) over a MySQL database of Enron
    emails, using SQLAlchemy core reflection for table access."""

    def __init__(self, table_name):
        # engine is created lazily by init(); None until then.
        self.engine = None
        self.metadata = MetaData()
        # Name of the primary email-prediction table this instance works on.
        self.table_name = table_name

    @classmethod
    def holbox_db(cls):
        """Alternate constructor: connect to the hard-coded 'holbox' host."""
        db = EnronDB("email_prediction")
        db.init('holbox.lti.cs.cmu.edu', 'inmind', 'yahoo', 'enron_experiment')
        return db

    def init(self, host, username, password, db_name):
        """Create the engine and reflect the schema; return True on success."""
        engine_desc = 'mysql://%s:%s@%s/%s' % (username, password, host, db_name)
        try:
            self.engine = create_engine(engine_desc)
            self.metadata.reflect(self.engine)
        except:
            # NOTE(review): bare except swallows all errors, including
            # KeyboardInterrupt — deliberate best-effort, kept as-is.
            print "Unexpected error:", sys.exc_info()[0]
            return False
        return True

    # sql:
    # create table TABLE_NAME (id INT NOT NULL AUTO_INCREMENT, date DATETIME, mime_type TEXT, from_addr TEXT,
    # to_addr TEXT, subject TEXT, raw_body TEXT, cleaned_body TEXT, one_line TEXT, path TEXT, prediction INT, PRIMARY KEY(id));
    def create_table(self):
        """Create the main prediction table named by self.table_name."""
        email_table = Table(self.table_name, self.metadata,
                            Column('id', Integer, primary_key=True),
                            Column('date', Text),
                            Column('mime_type', Text),
                            Column('from_addr', Text),
                            Column('to_addr', Text),
                            Column('subject', Text),
                            Column('raw_body', Text),
                            Column('cleaned_body', Text),
                            Column('one_line', Text),
                            Column('path', Text),
                            Column('prediction', Integer),
                            Column('probability', Float)
                            )
        email_table.create(self.engine)

    def create_sample_table(self, sample_table_name):
        """Create a sample table: same columns plus 'manual_label'."""
        if sample_table_name == self.table_name:
            print('Cannon use the same table name')
            return
        email_table = Table(sample_table_name, self.metadata,
                            Column('id', Integer, primary_key=True),
                            Column('date', Text),
                            Column('mime_type', Text),
                            Column('from_addr', Text),
                            Column('to_addr', Text),
                            Column('subject', Text),
                            Column('raw_body', Text),
                            Column('cleaned_body', Text),
                            Column('one_line', Text),
                            Column('path', Text),
                            Column('prediction', Integer),
                            Column('probability', Float),
                            Column('manual_label', Integer)
                            )
        email_table.create(self.engine)

    def get_all_brushed_emails(self):
        """Return every row of 'brushed_email' as a list of Email objects."""
        email_table = Table('brushed_email', self.metadata)
        # NOTE(review): 'to_adddr' (triple d) presumably matches a typo in
        # the actual DB column name — verify against the schema.
        sel_stmt = select([email_table.c.id, email_table.c.date, email_table.c.mime_type, \
                           email_table.c.from_addr, email_table.c.to_adddr, \
                           email_table.c.subject, email_table.c.body, email_table.c.one_line, \
                           email_table.c.path, email_table.c.label, email_table.c.is_scheduling])
        rp = self.engine.execute(sel_stmt)
        emails = []
        for record in rp:
            email = Email()
            if record is not None:
                email.id = record.id
                email.date = record.date
                email.mime_type = record.mime_type
                email.from_addr = record.from_addr
                email.to_addr = record.to_adddr
                email.subject = record.subject
                email.body = record.body
                email.one_line = record.one_line
                email.path = record.path
                email.label = record.label
                # Coerce NULL is_scheduling to 0.
                email.is_scheduling = record.is_scheduling or 0
                emails.append(email)
        return emails

    def insert_email(self, email):
        """Insert one Email into the main prediction table."""
        if not isinstance(email, Email):
            print 'ERROR: input must be of type Email'
            return
        email_table = Table(self.table_name, self.metadata)
        ins_stmt = email_table.insert()
        conn = self.engine.connect()
        conn.execute(ins_stmt,
                     date=email.date,
                     mime_type=email.mime_type,
                     from_addr=email.from_addr,
                     to_addr=email.to_addr,
                     subject=email.subject,
                     raw_body=email.raw_body,
                     cleaned_body=email.cleaned_body,
                     one_line=email.one_line,
                     path=email.path,
                     label=email.label,
                     prediction=email.prediction,
                     probability=email.probability
                     )

    def get_all_email_predictions(self):
        """Return every row of the main prediction table as Email objects."""
        email_table = Table(self.table_name, self.metadata)
        sel_stmt = select([email_table.c.id, email_table.c.date, email_table.c.mime_type, \
                           email_table.c.from_addr, email_table.c.to_addr, \
                           email_table.c.subject, email_table.c.raw_body, email_table.c.cleaned_body, email_table.c.one_line, \
                           email_table.c.path, email_table.c.prediction, email_table.c.probability])
        rp = self.engine.execute(sel_stmt)
        emails = []
        for record in rp:
            email = Email()
            if record is not None:
                email.id = record.id
                email.date = record.date
                email.mime_type = record.mime_type
                email.from_addr = record.from_addr
                email.to_addr = record.to_addr
                email.subject = record.subject
                email.raw_body = record.raw_body
                email.cleaned_body = record.cleaned_body
                email.one_line = record.one_line
                email.path = record.path
                email.prediction = record.prediction
                email.probability = record.probability
                emails.append(email)
        return emails

    def get_sample_emails(self, sample_table_name):
        """Return every row of a sample table (includes manual_label)."""
        email_table = Table(sample_table_name, self.metadata)
        sel_stmt = select([email_table.c.id, email_table.c.date, email_table.c.mime_type, \
                           email_table.c.from_addr, email_table.c.to_addr, \
                           email_table.c.subject, email_table.c.raw_body, email_table.c.cleaned_body, email_table.c.one_line, \
                           email_table.c.path, email_table.c.prediction, email_table.c.probability, email_table.c.manual_label])
        rp = self.engine.execute(sel_stmt)
        emails = []
        for record in rp:
            email = Email()
            if record is not None:
                email.id = record.id
                email.date = record.date
                email.mime_type = record.mime_type
                email.from_addr = record.from_addr
                email.to_addr = record.to_addr
                email.subject = record.subject
                email.raw_body = record.raw_body
                email.cleaned_body = record.cleaned_body
                email.one_line = record.one_line
                email.path = record.path
                email.prediction = record.prediction
                email.probability = record.probability
                email.manual_label = record.manual_label
                emails.append(email)
        return emails

    def get_all_email_predictions_greater_than(self, threshold = 0.7):
        """Return Email objects whose probability is at least *threshold*.

        NOTE(review): the query is built by string concatenation — safe only
        because threshold is numeric; do not pass untrusted input.
        """
        s = text("select * from " + self.table_name + " where probability >= " + str(threshold))
        rp = self.engine.execute(s).fetchall()
        # email_table = Table(self.table_name, self.metadata)
        # sel_stmt = select([email_table.c.id, email_table.c.date, email_table.c.mime_type, \
        #                    email_table.c.from_addr, email_table.c.to_addr, \
        #                    email_table.c.subject, email_table.c.raw_body, email_table.c.cleaned_body, email_table.c.one_line, \
        #                    email_table.c.path, email_table.c.prediction, email_table.c.probability]).where(email_table.c.probability >= 0.7)
        # rp = self.engine.execute(sel_stmt)
        emails = []
        for record in rp:
            email = Email()
            if record is not None:
                email.id = record.id
                email.date = record.date
                email.mime_type = record.mime_type
                email.from_addr = record.from_addr
                email.to_addr = record.to_addr
                email.subject = record.subject
                email.raw_body = record.raw_body
                email.cleaned_body = record.cleaned_body
                email.one_line = record.one_line
                email.path = record.path
                email.prediction = record.prediction
                email.probability = record.probability
                emails.append(email)
        return emails

    def insert_sample_email(self, sample_table_name, email):
        """Insert one Email into a sample table (must differ from the main one)."""
        if not isinstance(email, Email):
            print 'ERROR: input must be of type Email'
            return
        if sample_table_name == self.table_name:
            print('Cannot use the same table name')
            return
        email_table = Table(sample_table_name, self.metadata)
        ins_stmt = email_table.insert()
        conn = self.engine.connect()
        conn.execute(ins_stmt,
                     date=email.date,
                     mime_type=email.mime_type,
                     from_addr=email.from_addr,
                     to_addr=email.to_addr,
                     subject=email.subject,
                     raw_body=email.raw_body,
                     cleaned_body=email.cleaned_body,
                     one_line=email.one_line,
                     path=email.path,
                     label=email.label,
                     prediction=email.prediction,
                     probability=email.probability,
                     )
# Database connection settings pulled from the configuration mapping.
db_username = config['Username']
db_password = config['Password']
db_address = config['EndpointAddress']
db_port = config['EndpointPort']
db_name = config['Name']

# LEGGERE http://docs.sqlalchemy.org/en/latest/core/pooling.html
# (translated: "READ <pooling docs>"). pool_recycle=3600 recycles
# connections hourly, avoiding MySQL's stale-connection timeouts.
engine = create_engine('mysql://' + db_username + ':' + db_password + '@' + db_address + ':' + db_port + '/' + db_name + '?charset=utf8&use_unicode=0', pool_recycle=3600)

# useful for tables retrieval
metadata = MetaData()
metadata.reflect(engine)

# tables retrieval — .get() returns None when a table is missing,
# so failures surface later at first use rather than here.
# retrieve session table
session_table = metadata.tables.get('session')
# retrieve task table
task_table = metadata.tables.get('task')
# retrieve iteration table
iter_table = metadata.tables.get('iteration')
# retrieve preferences table
pref_table = metadata.tables.get('preferences')

############
#  GLOBAL  #
############
def upgrade():
    """Alembic migration: create the 'corp_types' lookup table and seed it
    with the full set of BC corporation type codes."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table('corp_types',
                    sa.Column('corp_type_cd', sa.String(length=5), nullable=False),
                    sa.Column('colin_ind', sa.String(length=1), nullable=False),
                    sa.Column('corp_class', sa.String(length=10), nullable=False),
                    sa.Column('short_desc', sa.String(length=25), nullable=False),
                    sa.Column('full_desc', sa.String(length=100), nullable=False),
                    sa.Column('legislation', sa.String(length=100), nullable=True),
                    sa.PrimaryKeyConstraint('corp_type_cd')
                    )
    # Reflect the freshly created table so bulk_insert has column metadata.
    meta = MetaData(bind=op.get_bind())
    meta.reflect(only=('corp_types',))
    corp_types_table = Table('corp_types', meta)

    op.bulk_insert(
        corp_types_table,
        [
            {'corp_type_cd': 'A', 'colin_ind': 'Y', 'corp_class': 'XPRO', 'short_desc': 'EXTRA PRO',
             'full_desc': 'Extraprovincial Company', 'legislation': 'BC Business Corporations Act'},
            {'corp_type_cd': 'B', 'colin_ind': 'Y', 'corp_class': 'XPRO', 'short_desc': 'EXTRA PRO',
             'full_desc': 'Extraprovincial Company', 'legislation': 'BC Business Corporations Act'},
            {'corp_type_cd': 'BC', 'colin_ind': 'Y', 'corp_class': 'BC', 'short_desc': 'BC COMPANY',
             'full_desc': 'BC Limited Company', 'legislation': 'BC Business Corporations Act'},
            {'corp_type_cd': 'C', 'colin_ind': 'Y', 'corp_class': 'BC', 'short_desc': 'CONTINUE IN',
             'full_desc': 'BC Limited Company', 'legislation': 'BC Business Corporations Act'},
            {'corp_type_cd': 'CEM', 'colin_ind': 'N', 'corp_class': 'OT', 'short_desc': 'CEMETARY',
             'full_desc': 'Cemetary', 'legislation': ''},
            {'corp_type_cd': 'CP', 'colin_ind': 'N', 'corp_class': 'OT', 'short_desc': 'COOP',
             'full_desc': 'BC Cooperative Association', 'legislation': 'BC Cooperative Association Act'},
            {'corp_type_cd': 'EPR', 'colin_ind': 'Y', 'corp_class': 'XPRO', 'short_desc': 'EXTRA PRO REG',
             'full_desc': 'Extraprovincial Registration', 'legislation': ''},
            {'corp_type_cd': 'FOR', 'colin_ind': 'Y', 'corp_class': 'XPRO', 'short_desc': 'FOREIGN',
             'full_desc': 'Foreign Registration', 'legislation': ''},
            {'corp_type_cd': 'LIC', 'colin_ind': 'Y', 'corp_class': 'XPRO', 'short_desc': 'LICENSED',
             'full_desc': 'Licensed (Extra-Pro)', 'legislation': ''},
            {'corp_type_cd': 'LIB', 'colin_ind': 'N', 'corp_class': 'OT', 'short_desc': 'LIBRARY',
             'full_desc': 'Public Library Association', 'legislation': ''},
            {'corp_type_cd': 'LLC', 'colin_ind': 'Y', 'corp_class': 'XPRO', 'short_desc': 'LIMITED CO',
             'full_desc': 'Limited Liability Company', 'legislation': 'BC Business Corporations Act'},
            {'corp_type_cd': 'PA', 'colin_ind': 'N', 'corp_class': 'OT', 'short_desc': 'PRIVATE ACT',
             'full_desc': 'Private Act', 'legislation': 'Private Act'},
            {'corp_type_cd': 'PAR', 'colin_ind': 'N', 'corp_class': 'OT', 'short_desc': 'PARISHES',
             'full_desc': 'Parishes', 'legislation': 'BC Partnership Act'},
            {'corp_type_cd': 'PFS', 'colin_ind': 'N', 'corp_class': 'OT', 'short_desc': 'PENS FUND SOC',
             'full_desc': 'Pension Funded Society', 'legislation': ''},
            {'corp_type_cd': 'QA', 'colin_ind': 'Y', 'corp_class': 'BC', 'short_desc': 'CO 1860',
             'full_desc': 'CO 1860', 'legislation': ''},
            {'corp_type_cd': 'QB', 'colin_ind': 'Y', 'corp_class': 'BC', 'short_desc': 'CO 1862',
             'full_desc': 'CO 1862', 'legislation': ''},
            {'corp_type_cd': 'QC', 'colin_ind': 'Y', 'corp_class': 'BC', 'short_desc': 'CO 1878',
             'full_desc': 'CO 1878', 'legislation': ''},
            {'corp_type_cd': 'QD', 'colin_ind': 'Y', 'corp_class': 'BC', 'short_desc': 'CO 1890',
             'full_desc': 'CO 1890', 'legislation': ''},
            {'corp_type_cd': 'QE', 'colin_ind': 'Y', 'corp_class': 'BC', 'short_desc': 'CO 1897',
             'full_desc': 'CO 1897', 'legislation': ''},
            {'corp_type_cd': 'REG', 'colin_ind': 'Y', 'corp_class': 'XPRO', 'short_desc': 'REGISTRATION',
             'full_desc': 'Registraton (Extra-pro)', 'legislation': 'BC Business Corporations Act'},
            {'corp_type_cd': 'RLY', 'colin_ind': 'N', 'corp_class': 'OT', 'short_desc': 'RAILWAYS',
             'full_desc': 'Railways', 'legislation': ''},
            {'corp_type_cd': 'SB', 'colin_ind': 'N', 'corp_class': 'OT', 'short_desc': 'SOCIETY BRANCH',
             'full_desc': 'Society Branch', 'legislation': ''},
            {'corp_type_cd': 'T', 'colin_ind': 'N', 'corp_class': 'OT', 'short_desc': 'TRUST',
             'full_desc': 'Trust', 'legislation': 'BC Partnership Act'},
            {'corp_type_cd': 'TMY', 'colin_ind': 'N', 'corp_class': 'OT', 'short_desc': 'TRAMWAYS',
             'full_desc': 'Tramways', 'legislation': ''},
            {'corp_type_cd': 'XCP', 'colin_ind': 'N', 'corp_class': 'OT', 'short_desc': 'XPRO COOP',
             'full_desc': 'Extraprovincial Cooperative Association', 'legislation': 'BC Cooperative Association Act'},
            {'corp_type_cd': 'ULC', 'colin_ind': 'Y', 'corp_class': 'BC', 'short_desc': 'BC ULC COMPANY',
             'full_desc': 'BC Unlimited Liability Company', 'legislation': 'BC Business Corporations Act'},
            {'corp_type_cd': 'CUL', 'colin_ind': 'Y', 'corp_class': 'BC', 'short_desc': 'ULC CONTINUE IN',
             'full_desc': 'Continuation In as a BC ULC', 'legislation': 'BC Business Corporations Act'},
            {'corp_type_cd': 'UQA', 'colin_ind': 'Y', 'corp_class': 'BC', 'short_desc': 'ULC CO 1860',
             'full_desc': 'ULC CO 1860', 'legislation': ''},
            {'corp_type_cd': 'UQB', 'colin_ind': 'Y', 'corp_class': 'BC', 'short_desc': 'ULC CO 1862',
             'full_desc': 'ULC CO 1862', 'legislation': ''},
            {'corp_type_cd': 'UQC', 'colin_ind': 'Y', 'corp_class': 'BC', 'short_desc': 'ULC CO 1878',
             'full_desc': 'ULC CO 1878', 'legislation': ''},
            {'corp_type_cd': 'UQD', 'colin_ind': 'Y', 'corp_class': 'BC', 'short_desc': 'ULC CO 1890',
             'full_desc': 'ULC CO 1890', 'legislation': ''},
            {'corp_type_cd': 'UQE', 'colin_ind': 'Y', 'corp_class': 'BC', 'short_desc': 'ULC CO 1897',
             'full_desc': 'ULC CO 1897', 'legislation': ''},
            {'corp_type_cd': 'CC', 'colin_ind': 'Y', 'corp_class': 'BC', 'short_desc': 'BC CCC',
             'full_desc': 'BC Community Contribution Company', 'legislation': 'BC Business Corporations Act'},
            {'corp_type_cd': 'CCC', 'colin_ind': 'Y', 'corp_class': 'BC', 'short_desc': 'CCC CONTINUE IN',
             'full_desc': 'BC Community Contribution Company', 'legislation': 'BC Business Corporations Act'},
            {'corp_type_cd': 'S', 'colin_ind': 'Y', 'corp_class': 'SOC', 'short_desc': 'SOCIETY',
             'full_desc': 'Society', 'legislation': 'BC Societies Act'},
            {'corp_type_cd': 'XS', 'colin_ind': 'Y', 'corp_class': 'SOC', 'short_desc': 'XPRO SOCIETY',
             'full_desc': 'Extraprovincial Society', 'legislation': 'BC Societies Act'},
            {'corp_type_cd': 'SP', 'colin_ind': 'Y', 'corp_class': 'FIRM', 'short_desc': 'SOLE PROP',
             'full_desc': 'Sole Proprietorship', 'legislation': 'BC Partnership Act'},
            {'corp_type_cd': 'GP', 'colin_ind': 'Y', 'corp_class': 'FIRM', 'short_desc': 'PARTNERSHIP',
             'full_desc': 'General Partnership', 'legislation': 'BC Partnership Act'},
            {'corp_type_cd': 'LP', 'colin_ind': 'Y', 'corp_class': 'FIRM', 'short_desc': 'LIM PARTNERSHIP',
             'full_desc': 'Limited Partnership', 'legislation': 'BC Partnership Act'},
            {'corp_type_cd': 'XP', 'colin_ind': 'Y', 'corp_class': 'FIRM', 'short_desc': 'XPRO LIM PARTNR',
             'full_desc': 'Extraprovincial Limited Partnership', 'legislation': 'BC Partnership Act'},
            {'corp_type_cd': 'LL', 'colin_ind': 'Y', 'corp_class': 'FIRM', 'short_desc': 'LL PARTNERSHIP',
             'full_desc': 'Limited Liability Partnership', 'legislation': 'BC Partnership Act'},
            {'corp_type_cd': 'XL', 'colin_ind': 'Y', 'corp_class': 'FIRM', 'short_desc': 'XPRO LL PARTNR',
             'full_desc': 'Extrapro Limited Liability Partnership', 'legislation': 'BC Partnership Act'},
            {'corp_type_cd': 'MF', 'colin_ind': 'Y', 'corp_class': 'FIRM', 'short_desc': 'MISC FIRM',
             'full_desc': 'Miscellaneous Firm', 'legislation': 'BC Partnership Act'},
            {'corp_type_cd': 'FI', 'colin_ind': 'N', 'corp_class': 'OT', 'short_desc': 'FINANCIAL',
             'full_desc': 'Financial Institutions', 'legislation': 'Credit Union Incorporation Act'},
            {'corp_type_cd': 'CS', 'colin_ind': 'Y', 'corp_class': 'SOC', 'short_desc': 'CONT IN SOCIETY',
             'full_desc': 'Society', 'legislation': 'BC Societies Act'},
            {'corp_type_cd': 'BEN', 'colin_ind': 'Y', 'corp_class': 'BC', 'short_desc': 'BENEFIT COMPANY',
             'full_desc': 'BC Benefit Company', 'legislation': 'BC Business Corporations Act'}
        ]
    )
# grabs the schema from the db # run this any time the schema changes # side note: metadata.pickle is kept in git if __name__ == '__main__': from mimic_package.connect.connect import connection_string from sqlalchemy.sql.schema import MetaData from sqlalchemy.engine import create_engine from mimic_package.data_model.resources import metadata_filename import pickle engine = create_engine(connection_string, echo=False, convert_unicode=True) metadata = MetaData(bind=engine) metadata.reflect(schema='mimiciii') with open(metadata_filename, 'wb') as outfile: pickle.dump(metadata, outfile)
class EnronDB: def __init__(self): self.engine = None self.metadata = MetaData() def init(self, host, username, password, db_name): engine_desc = 'mysql://%s:%s@%s/%s' % (username, password, host, db_name) try: self.engine = create_engine(engine_desc) self.metadata.reflect(self.engine) except: print "Unexpected error:", sys.exc_info()[0] return False return True # RAW_EMAIL table def insert_email(self, email): self.insert_to_table(email, "raw_email") # RAW_EMAIL table def insert_cleaned_email(self, email): self.insert_to_table(email, "cleaned_email") def insert_to_table(self, email, table_name): if not isinstance(email, Email): print 'ERROR: input must be of type Email' return email_table = Table(table_name, self.metadata) ins_stmt = email_table.insert() conn = self.engine.connect() result = conn.execute(ins_stmt, date=email.date, mime_type=email.mime_type, from_addr=email.from_addr, to_addr=email.to_addr, subject=email.subject, body=email.body, path=email.path, label=email.label) def get_all_content(self): email_table = Table('raw_email', self.metadata) sel_stmt = select([email_table.c.subject, email_table.c.body]) rp = self.engine.execute(sel_stmt) all_content = "" for record in rp: all_content += record.subject + " " all_content += record.body + " " return all_content def add_username(self): email_table = Table('brushed_email', self.metadata) sel_stmt = select([email_table.c.id, email_table.c.path]) rp = self.engine.execute(sel_stmt) conn = self.engine.connect() for record in rp: # print(record) p = "\/[^\/]*\/([^\/]+)" # match the content between the second / and the third / match = re.match(p, record.path) if match: username = match.group(1) stmt = email_table.update().where(email_table.c.id == record.id).values(username=username) conn.execute(stmt) else: print("Error! 
" + record.path) exit(0) def update_brushed_email_is_scheduling(self, email_id, is_scheduling): email_table = Table('brushed_email', self.metadata) conn = self.engine.connect() stmt = email_table.update().where(email_table.c.id == email_id).values(is_scheduling=is_scheduling) conn.execute(stmt) def get_all_dates(self): email_table = Table('raw_email', self.metadata) sel_stmt = select([email_table.c.date]) rp = self.engine.execute(sel_stmt) dates = [] for record in rp: dates.append(record.date.strftime("%y%m%d")) return dates def get_all_subjects(self): email_table = Table('raw_email', self.metadata) sel_stmt = select([email_table.c.subject]) rp = self.engine.execute(sel_stmt) subjects = [] for record in rp: subjects.append(record.subject) return subjects def get_all_bodies_with_id(self): email_table = Table('raw_email', self.metadata) sel_stmt = select([email_table.c.id, email_table.c.body]) rp = self.engine.execute(sel_stmt) bodies = [] for record in rp: bodies.append((record.id, record.body)) return bodies def get_body(self, email_id): email_table = Table('raw_email', self.metadata) sel_stmt = select([email_table.c.body]).where(email_table.c.id == email_id) rp = self.engine.execute(sel_stmt) record = rp.first() return record.body def get_all_bodies(self): email_table = Table('raw_email', self.metadata) sel_stmt = select([email_table.c.body]) rp = self.engine.execute(sel_stmt) bodies = [] for record in rp: bodies.append(record.body) return bodies def get_all_brushed_emails(self): email_table = Table('brushed_email', self.metadata) sel_stmt = select([email_table.c.id, email_table.c.date, email_table.c.mime_type, \ email_table.c.from_addr, email_table.c.to_adddr, \ email_table.c.subject, email_table.c.body, email_table.c.one_line, \ email_table.c.path, email_table.c.label, email_table.c.is_scheduling]) rp = self.engine.execute(sel_stmt) emails = [] for record in rp: email = Email() if record is not None: email.id = record.id email.date = record.date email.mime_type 
= record.mime_type email.from_addr = record.from_addr email.to_addr = record.to_adddr email.subject = record.subject email.body = record.body email.one_line = record.one_line email.path = record.path email.label = record.label email.is_scheduling = record.is_scheduling or 0 emails.append(email) return emails def get_brushed_email(self, email_id): email_table = Table('brushed_email', self.metadata) sel_stmt = select([email_table.c.date, email_table.c.mime_type, \ email_table.c.from_addr, email_table.c.to_adddr, \ email_table.c.subject, email_table.c.body, \ email_table.c.path, email_table.c.label, email_table.c.is_scheduling]).where(email_table.c.id == email_id) rp = self.engine.execute(sel_stmt) record = rp.first() email = Email() if record is not None: email.date = record.date email.mime_type = record.mime_type email.from_addr = record.from_addr email.to_addr = record.to_adddr email.subject = record.subject email.body = record.body email.path = record.path email.label = record.label email.is_scheduling = record.is_scheduling return email def get_email(self, email_id): email_table = Table('raw_email', self.metadata) sel_stmt = select([email_table.c.date, email_table.c.mime_type, \ email_table.c.from_addr, email_table.c.to_addr, \ email_table.c.subject, email_table.c.body, \ email_table.c.path, email_table.c.label]).where(email_table.c.id == email_id) rp = self.engine.execute(sel_stmt) record = rp.first() email = Email() if record is not None: email.date = record.date email.mime_type = record.mime_type email.from_addr = record.from_addr email.to_addr = record.to_addr email.subject = record.subject email.body = record.body email.path = record.path email.label = record.label return email def get_emails_from(self, from_addr): email_table = Table('raw_email', self.metadata) sel_stmt = select([email_table.c.date, email_table.c.mime_type, \ email_table.c.from_addr, email_table.c.to_addr, \ email_table.c.subject, email_table.c.body, \ email_table.c.path, 
email_table.c.label]).where(email_table.c.from_addrr == from_addr) rp = self.engine.execute(sel_stmt) email_list = [] for record in rp: email = Email() email.date = record.date email.mime_type = record.mime_type email.from_addr = record.from_addr email.to_addr = record.to_addr email.subject = record.subject email.body = record.body email.path = record.path email.label = record.label email_list.append(email) return email_list def get_emails_before(self, query_date): email_table = Table('raw_email', self.metadata) sel_stmt = select([email_table.c.date, email_table.c.mime_type, \ email_table.c.from_addr, email_table.c.to_addr, \ email_table.c.subject, email_table.c.body, \ email_table.c.path, email_table.c.label]).where(email_table.c.date <= query_date) rp = self.engine.execute(sel_stmt) email_list = [] for record in rp: email = Email() email.date = record.date email.mime_type = record.mime_type email.from_addr = record.from_addr email.to_addr = record.to_addr email.subject = record.subject email.body = record.body email.path = record.path email.label = record.label email_list.append(email) return email_list # EMAIL_ADDRESS table def insert_address(self, email_address): if type(email) != EmailAddress: print 'ERROR: input must be of type EmailAddress' return email_address_table = Table('email_address', self.metadata) ins_stmt = email_address_table.insert() conn = self.engine.connect() result = conn.execute(ins_stmt, address=email_address.address, name=email_address.name) def get_address(self, address_id): email_address_table = Table('email_address', self.metadata) sel_stmt = select([email_address_table.c.name, email_address_table.c.address]).where(email_address_table.c.id == address_id) rp = self.engine.execute(sel_stmt) record = rp.first() email_address = EmailAddress() if record is not None: email_address.name = record.name email_address.address = record.address return email_address def get_address_name(self, address_id): email_address_table = Table('email_address', 
self.metadata) sel_stmt = select([email_address_table.c.name]).where(email_address_table.c.id == address_id) rp = self.engine.execute(sel_stmt) record = rp.first() email_address = EmailAddress() if record is not None: email_address.name = record.name email_address.address = record.address return email_address # BRUSHED_EMAIL table def count_per_label(self, label): rp = self.engine.execute('select count(label) from brushed_email where label=%d'%(label)) res = rp.first() return long(res[0]) def get_all_brushed_labels_with_id(self): email_table = Table('brushed_email', self.metadata) sel_stmt = select([email_table.c.id, email_table.c.label]) rp = self.engine.execute(sel_stmt) labels = [] for record in rp: labels.append((record.id, record.label)) return labels def get_all_brushed_bodies_with_id(self): email_table = Table('brushed_email', self.metadata) sel_stmt = select([email_table.c.id, email_table.c.body]) rp = self.engine.execute(sel_stmt) bodies = [] for record in rp: bodies.append((record.id, record.body)) return bodies def get_all_brushed_body_summary_with_id(self): email_table = Table('brushed_email', self.metadata) sel_stmt = select([email_table.c.id, email_table.c.subject, email_table.c.body, email_table.c.summary]) rp = self.engine.execute(sel_stmt) bodies = [] for record in rp: bodies.append((record.id, record.subject, record.body, record.summary)) return bodies def get_all_brushed_lines_with_id(self): email_table = Table('brushed_email', self.metadata) sel_stmt = select([email_table.c.id, email_table.c.lines]) rp = self.engine.execute(sel_stmt) lines = [] for record in rp: lines.append((record.id, record.lines)) return lines def get_all_brushed_verbs_with_id(self): email_table = Table('brushed_email', self.metadata) sel_stmt = select([email_table.c.id, email_table.c.verbs]) rp = self.engine.execute(sel_stmt) brushed_verbs = [] for record in rp: brushed_verbs.append((record.id, record.verbs)) return brushed_verbs def get_all_brushed_verbs_per_label(self, 
label): email_table = Table('brushed_email', self.metadata) sel_stmt = select([email_table.c.id, email_table.c.verbs]).where(email_table.c.label==label) rp = self.engine.execute(sel_stmt) brushed_verbs = [] for record in rp: brushed_verbs.append((record.id, record.verbs)) return brushed_verbs def get_all_one_liners_per_label(self, label): email_table = Table('brushed_email', self.metadata) sel_stmt = select([email_table.c.id, email_table.c.one_line]).where(email_table.c.label==label) rp = self.engine.execute(sel_stmt) one_line = [] for record in rp: one_line.append((record.id, record.one_line)) return one_line def update_brushed_body(self,email_id, body): brushed_table = Table('brushed_email', self.metadata) u = update(brushed_table) u = u.values(body=body) u = u.where(brushed_table.c.id==email_id) conn = self.engine.connect() result = conn.execute(u) def update_brushed_lines(self,email_id, msg_lines): brushed_table = Table('brushed_email', self.metadata) u = update(brushed_table) u = u.values(lines=msg_lines) u = u.where(brushed_table.c.id==email_id) conn = self.engine.connect() result = conn.execute(u) def update_brushed_one_line(self,email_id, one_line): brushed_table = Table('brushed_email', self.metadata) u = update(brushed_table) u = u.values(one_line=one_line) u = u.where(brushed_table.c.id==email_id) conn = self.engine.connect() result = conn.execute(u) def update_brushed_verbs(self, email_id, verbs): brushed_table = Table('brushed_email', self.metadata) u = update(brushed_table) u = u.values(verbs=verbs) u = u.where(brushed_table.c.id==email_id) conn = self.engine.connect() result = conn.execute(u) def update_brushed_summary(self, email_id, summary): brushed_table = Table('brushed_email', self.metadata) u = update(brushed_table) u = u.values(summary=summary) u = u.where(brushed_table.c.id==email_id) conn = self.engine.connect() result = conn.execute(u) # additional dataset, out of the labelled data def insert_brushed_email_more(self, email): if not 
isinstance(email, Email): print 'ERROR: input must be of type Email' return email_table = Table("brushed_email_more", self.metadata) ins_stmt = email_table.insert() conn = self.engine.connect() conn.execute(ins_stmt, date=email.date, mime_type=email.mime_type, from_addr=email.from_addr, to_addr=email.to_addr, subject=email.subject, raw_body=email.body, body=email.body, all_lines=email.all_lines, one_line=email.one_line, path=email.path, label=email.label, prediction=email.prediction) def insert_cleaned_email_full(self, email): if not isinstance(email, Email): print 'ERROR: input must be of type Email' return email_table = Table("email_full", self.metadata) ins_stmt = email_table.insert() conn = self.engine.connect() conn.execute(ins_stmt, date=email.date, mime_type=email.mime_type, from_addr=email.from_addr, to_addr=email.to_addr, subject=email.subject, raw_body=email.raw_body, body=email.body, all_lines=email.all_lines, one_line=email.one_line, path=email.path) def get_raw_bodies_with_id(self): email_table = Table('email_full', self.metadata) sel_stmt = select([email_table.c.id, email_table.c.raw_body]) rp = self.engine.execute(sel_stmt) bodies = [] for record in rp: bodies.append((record.id, record.raw_body)) return bodies def update_brushed_body_full(self,email_id, body): brushed_table = Table('email_full', self.metadata) u = update(brushed_table) u = u.values(body=body) u = u.where(brushed_table.c.id==email_id) conn = self.engine.connect() result = conn.execute(u) def update_brushed_lines_full(self,email_id, msg_lines): brushed_table = Table('email_full', self.metadata) u = update(brushed_table) u = u.values(all_lines=msg_lines) u = u.where(brushed_table.c.id==email_id) conn = self.engine.connect() result = conn.execute(u) def get_all_brushed_lines_with_id_full(self): email_table = Table('email_full', self.metadata) sel_stmt = select([email_table.c.id, email_table.c.all_lines]) rp = self.engine.execute(sel_stmt) lines = [] for record in rp: 
lines.append((record.id, record.all_lines)) return lines def update_brushed_one_line_full(self,email_id, one_line): brushed_table = Table('email_full', self.metadata) u = update(brushed_table) u = u.values(one_line=one_line) u = u.where(brushed_table.c.id==email_id) conn = self.engine.connect() result = conn.execute(u) def get_email_full(self, email_id): email_table = Table('email_full', self.metadata) sel_stmt = select([email_table.c.id, email_table.c.raw_body]).where(email_table.c.id==email_id) rp = self.engine.execute(sel_stmt) bodies = [] for record in rp: bodies.append((record.id, record.raw_body)) return bodies[0] def get_all_brushed_emails_full(self): email_table = Table('email_full', self.metadata) sel_stmt = select([email_table.c.id, email_table.c.date, email_table.c.mime_type, \ email_table.c.from_addr, email_table.c.to_addr, \ email_table.c.subject, email_table.c.body, email_table.c.one_line, \ email_table.c.path, email_table.c.label, email_table.c.is_scheduling]) rp = self.engine.execute(sel_stmt) emails = [] for record in rp: email = Email() if record is not None: email.id = record.id email.date = record.date email.mime_type = record.mime_type email.from_addr = record.from_addr email.to_addr = record.to_addr email.subject = record.subject email.body = record.body email.one_line = record.one_line email.path = record.path email.label = record.label email.is_scheduling = record.is_scheduling or 0 emails.append(email) return emails def get_all_brushed_email_more(self): email_table = Table('brushed_email_more', self.metadata) sel_stmt = select([email_table.c.id, email_table.c.date, email_table.c.mime_type, \ email_table.c.from_addr, email_table.c.to_addr, \ email_table.c.subject, email_table.c.body, email_table.c.one_line, \ email_table.c.path, email_table.c.label, email_table.c.is_scheduling]) rp = self.engine.execute(sel_stmt) emails = [] for record in rp: email = Email() if record is not None: email.id = record.id email.date = record.date email.mime_type 
= record.mime_type email.from_addr = record.from_addr email.to_addr = record.to_addr email.subject = record.subject email.body = record.body email.one_line = record.one_line email.path = record.path email.label = record.label email.is_scheduling = record.is_scheduling or 0 emails.append(email) return emails
class DatabaseHolder(object):
    """
    Object to represent a connection to a database.
    """
    def __init__(self,
                 name: str,
                 url: str,
                 srccfg: DB_SAFE_CONFIG_FWD_REF = None,
                 with_session: bool = False,
                 with_conn: bool = True,
                 reflect: bool = True,
                 encoding: str = 'utf-8',
                 echo: bool = False) -> None:
        """
        Args:
            name: internal database name
            url: SQLAlchemy URL
            srccfg: :class:`crate_anon.anonymise.config.DatabaseSafeConfig`
            with_session: create an SQLAlchemy Session?
            with_conn: create an SQLAlchemy connection (via an Engine)?
            reflect: read the database structure (when required)?
            encoding: passed to SQLAlchemy's :func:`create_engine`
            echo: passed to SQLAlchemy's :func:`create_engine`
        """
        self.name = name
        self.srccfg = srccfg
        self.engine = create_engine(url, encoding=encoding, echo=echo)
        self.conn = None  # type: Optional[Connection]
        self.session = None  # type: Optional[Session]
        self._reflect_on_request = reflect
        self._reflected = False
        self._table_names = []  # type: List[str]
        self._metadata = MetaData(bind=self.engine)
        log.debug(self.engine)  # obscures password

        if with_conn:  # for raw connections
            self.conn = self.engine.connect()
        if with_session:  # for ORM
            self.session = sessionmaker(bind=self.engine)()  # for ORM

    def _reflect(self) -> None:
        """
        Perform the database reflection.

        Reflection is expensive, so we defer unless required
        """
        if not self._reflect_on_request:
            return
        log.info(f"Reflecting database: {self.name}")
        # self.table_names = get_table_names(self.engine)
        self._metadata.reflect(views=True)  # include views
        self._table_names = [t.name for t in self._metadata.sorted_tables]
        self._reflected = True

    def update_metadata(self) -> None:
        """
        Updates the metadata, for example if a table has been dropped.
        """
        self._metadata = MetaData(bind=self.engine)
        # BUG fix: the original left _reflected True and _table_names
        # populated, so the metadata/table_names properties kept serving the
        # stale, pre-drop reflection results and never re-reflected the fresh
        # MetaData.  Reset both so the next property access reflects again.
        self._reflected = False
        self._table_names = []  # type: List[str]

    @property
    def metadata(self) -> MetaData:
        """
        Returns the SQLAlchemy :class:`MetaData`. If reflection is enabled,
        ensure the database has been reflected first.
        """
        if not self._reflected:
            self._reflect()
        return self._metadata

    @property
    def table_names(self) -> List[str]:
        """
        Returns the table names from the database, if reflection is enabled.
        (Otherwise returns an empty list.)
        """
        if not self._reflected:
            self._reflect()
        return self._table_names
class DBMS:
    """Implementation Philosophy:

    * Always use sqlalchemy API and avoid sql-dialect specific language.
    * Engine is provided externally. It is the end-user's business to make
      this engine.
    """
    def __init__(self, engine, db=None, sch=None, vws=False):
        self.eng = engine
        self.path = tb.P(self.eng.url.database)
        self.con = self.eng.connect()
        self.ses = sessionmaker()(bind=self.eng)  # ORM style
        self.db = db
        self.sch = sch
        self.vws = vws
        self.insp = None
        self.meta = MetaData()
        self.schema = None
        self.tables = None
        self.views = None
        self.sch_tab = None
        self.sch_vws = None
        self.refresh()

    def close(self):
        """Close the connection and session, then release the engine's pool."""
        # BUG fix: the original first line was the bare expression `self.eng`,
        # a no-op; the intent was evidently to dispose of the engine.
        self.con.close()
        self.ses.close()
        self.eng.dispose()

    def refresh(self, sch=None):
        """Re-reflect metadata and re-inspect schemas/tables/views.

        Fails if multiple schemas are there and None is specified.
        """
        self.meta.reflect(bind=self.eng, schema=sch or self.sch)
        self.insp = inspect(subject=self.eng)
        self.schema = tb.L(self.insp.get_schema_names())
        self.schema.append(None)
        self.tables = self.schema.apply(
            lambda x: self.insp.get_table_names(schema=x))
        self.views = self.schema.apply(
            lambda x: self.insp.get_view_names(schema=x))
        self.sch_tab = tb.Struct.from_keys_values(self.schema, self.tables)
        self.sch_vws = tb.Struct.from_keys_values(self.schema, self.views)
        return self

    @classmethod
    def from_local_db(cls, path=None, echo=False):
        """Alternate constructor: build the engine from a local db path."""
        return cls(engine=cls.make_sql_db(path, echo))

    def __repr__(self):
        return f"DataBase @ {self.eng}"

    @staticmethod
    def make_sql_db(path=None, echo=False, dialect="sqlite",
                    driver="pysqlite"):
        """Establish lazy initialization with database.

        ``path="memory"`` gives an in-memory database; ``path=None`` creates a
        temporary file.  The default driver was originally written as
        ``["pysqlite", "DBAPI"][0]``, which always evaluates to "pysqlite".
        """
        if path == "memory":
            return create_engine(url=f"{dialect}+{driver}:///:memory:",
                                 echo=echo, future=True)
        if path is None:
            path = tb.P.tmpfile(folder="dbs", suffix=".db")
        print(f"Linking to database at {tb.P(path).as_uri()}")
        # echo flag is just a short for the more formal way of logging sql
        # commands.
        return create_engine(url=f"{dialect}+{driver}:///{path}",
                             echo=echo, future=True)

    # ==================== QUERIES =====================================

    def execute_as_you_go(self, *commands, res_func=lambda x: x.all()):
        """Run each command on one connection; return res_func of the last.

        Returns None when called with no commands (the original raised
        NameError in that case).
        """
        if not commands:
            return None
        with self.eng.connect() as conn:
            for command in commands:
                result = conn.execute(text(command))
            # if driver is sqlite3, the connection is autocommitting;
            # this commit is only needed in case of DBAPI driver.
            conn.commit()
            return res_func(result)

    def execute_begin_once(self, command, res_func=lambda x: x.all()):
        """Run one command inside engine.begin(); commit is implicit."""
        with self.eng.begin() as conn:
            result = conn.execute(text(command))
            return res_func(result)

    def execute(self, command):
        """Run one command inside engine.begin() and return the raw result."""
        with self.eng.begin() as conn:
            return conn.execute(text(command))

    def _get_table_identifier(self, table, sch):
        # Qualify the table name with a schema when one is known.
        if sch is None:
            sch = self.sch
        return table if sch is None else sch + "." + table

    # ========================== TABLES ================================

    def read_table(self, table, sch=None, size=100):
        """Fetch up to ``size`` rows of SELECT * from the given table."""
        res = self.con.execute(
            text(f"""SELECT * FROM {self._get_table_identifier(table, sch)}"""))
        return res.fetchmany(size)

    def make_df(self, table_name, records=None, schema=None):
        """Build a DataFrame from supplied records or a full table query."""
        self.meta.reflect(bind=self.eng, schema=schema or self.sch)
        table = self.meta.tables[table_name]
        # NOTE(review): `records or ...` also triggers the query when an empty
        # list is passed -- presumably intended, confirm with callers.
        # pd.read_sql_table errors under sqlalchemy 2.0, hence the ORM query.
        return pd.DataFrame(records or self.ses.query(table).all(),
                            columns=table.exported_columns.keys())

    def get_columns(self, table, sch=None):
        """Return the exported column names of a reflected table."""
        return self.meta.tables[
            self._get_table_identifier(table, sch)].exported_columns.keys()

    def insert_dicts(self, table, *mydicts):
        """Insert one row per dict into ``table``.

        BUG fix: the original interpolated ``tuple(mydict)`` -- the dict
        *keys*, not its values -- double-wrapped it in parentheses, and left a
        trailing ", " on the statement, so the SQL could never execute.
        """
        if not mydicts:
            return  # nothing to insert
        # NOTE(review): values are interpolated into the SQL text; do not use
        # with untrusted input -- prefer executemany with bound parameters.
        def row_literal(d):
            return "(" + ", ".join(repr(v) for v in d.values()) + ")"
        cmd = f"INSERT INTO {table} VALUES " + ", ".join(
            row_literal(d) for d in mydicts)
        self.execute_begin_once(cmd)

    def describe_table(self, table, sch=None, dtype=True):
        """Print a summary (count, crude size, sample, columns) of a table."""
        print(table.center(100, "="))
        self.refresh()
        tbl = self.meta.tables[table]
        count = self.ses.query(tbl).count()
        # size_mb is a crude estimate: rows * columns * 10 bytes.
        res = tb.Struct(name=table, count=count,
                        size_mb=count * len(tbl.exported_columns) * 10 / 1e6)
        res.print(dtype=False, config=True)
        dat = self.read_table(table=table, sch=sch, size=2)
        cols = self.get_columns(table, sch=sch)
        df = pd.DataFrame.from_records(dat, columns=cols)
        print("SAMPLE:\n", df)
        if dtype:
            print("\n")
            print("DETAILED COLUMNS:\n",
                  tb.pd.DataFrame(self.insp.get_columns(table)))
        print("\n" * 3)