def _get_bso_columns(table_name):
    return (
        Column("id", String(64), primary_key=True, autoincrement=False),
        Column("userid", Integer, primary_key=True, nullable=False,
               autoincrement=False),
        Column("collection", Integer, primary_key=True, nullable=False,
               autoincrement=False),
        Column("sortindex", Integer),
        Column("version", BigInteger),
        Column("timestamp", BigInteger),
        Column("payload", Text, nullable=False, server_default=""),
        Column("payload_size", Integer, nullable=False,
               server_default=sqltext("0")),
        Column("ttl", Integer, server_default=sqltext(str(MAX_TTL))),
        # Declare indexes.
        # We need to include the tablename in the index name due to sharding,
        # because index names in sqlite are global, not per-table.
        # Index on "ttl" for easy pruning of expired items.
        Index("%s_ttl_idx" % (table_name,), "ttl"),
        # Index on "version" for easy filtering by older/newer.
        Index("%s_usr_col_ver_idx" % (table_name,), "userid", "collection",
              "version"),
        # There is intentionally no index on "sortindex".
        # Clients almost always filter on "version" using the above index,
        # and cannot take advantage of a separate index for sorting.
    )
def _get_bso_columns(table_name):
    return (
        Column("userid", Integer, primary_key=True, nullable=False,
               autoincrement=False),
        Column("collection", Integer, primary_key=True, nullable=False,
               autoincrement=False),
        Column("id", String(64), primary_key=True, autoincrement=False),
        Column("sortindex", Integer),
        Column("modified", BigInteger, nullable=False),
        # I'd like to default this to the empty string, but
        # MySQL doesn't let you set a default on a TEXT column.
        Column("payload", PAYLOAD_TYPE, nullable=False),
        Column("payload_size", Integer, nullable=False,
               server_default=sqltext("0")),
        Column("ttl", Integer, nullable=False,
               server_default=sqltext(str(MAX_TTL))),
        # Declare indexes.
        # We need to include the tablename in the index name due to sharding,
        # because index names in sqlite are global, not per-table.
        # Index on "ttl" for easy pruning of expired items.
        Index("%s_ttl_idx" % (table_name,), "ttl"),
        # Index on "modified" for easy filtering by timestamp.
        Index("%s_usr_col_mod_idx" % (table_name,), "userid", "collection",
              "modified"),
        # There is intentionally no index on "sortindex".
        # Clients almost always filter on "modified" using the above index,
        # and cannot take advantage of a separate index for sorting.
    )
def execute(self, query, params=None, annotations=None):
    """Execute a database query, with retry and exception-catching logic.

    This method executes the given query against the database, lazily
    establishing an actual live connection as required.  It catches
    operational database errors and normalizes them into a BackendError
    exception.
    """
    if params is None:
        params = {}
    if annotations is None:
        annotations = {}
    # If there is no active connection, create a fresh one.
    # This will affect the control flow below.
    connection = self._connection
    session_was_active = True
    if connection is None:
        connection = self._connector.engine.connect()
        transaction = connection.begin()
        session_was_active = False
    try:
        # It's possible for the backend to fail in a way that the query
        # can be retried, e.g. the server timed out the connection we
        # got from the pool.  If so then we can retry the query with a
        # new connection, but only if the failed connection was never
        # successfully used as part of this transaction.
        try:
            query_str = self._render_query(query, params, annotations)
            return connection.execute(sqltext(query_str), **params)
        except DBAPIError, exc:
            if not is_retryable_db_error(self._connector.engine, exc):
                raise
            if session_was_active:
                raise
            # Don't try to close the connection if it's already dead.
            if not exc.connection_invalidated:
                transaction.rollback()
                connection.close()
            connection = self._connector.engine.connect()
            transaction = connection.begin()
            annotations["retry"] = "1"
            query_str = self._render_query(query, params, annotations)
            return connection.execute(sqltext(query_str), **params)
    finally:
        # Now that the underlying connection has been used, remember it
        # so that all subsequent queries are part of the same transaction.
        if not session_was_active:
            self._connection = connection
            self._transaction = transaction
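# --- Added example (not part of the original snippet) ---
# A minimal sketch of what the is_retryable_db_error() helper used above
# might check.  This is an assumption based on the call site, not the
# original implementation: it treats invalidated connections and MySQL's
# "lock wait timeout" (error code 1205) as safely retryable.
def is_retryable_db_error(engine, exc):
    """Check whether the given DBAPIError can be safely retried."""
    # Errors that killed the underlying connection can be retried
    # on a fresh connection.
    if exc.connection_invalidated:
        return True
    # Try to extract a MySQL error code via the dialect.  This uses a
    # private SQLAlchemy API, so guard it defensively.
    extract = getattr(engine.dialect, "_extract_error_code", None)
    if extract is not None:
        try:
            error_code = extract(exc.orig)
        except Exception:
            pass
        else:
            if error_code == 1205:  # lock wait timeout exceeded
                return True
    return False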
def visit_conditional_insert(element, compiler, **kwargs):
    # magic copied from sqlalchemy.sql.compiler.SQLCompiler.visit_insert
    compiler.isinsert = True
    try:
        # pylint: disable=E0611
        from sqlalchemy.sql import crud
        colparams = crud._get_crud_params(compiler, element)
    except ImportError:
        # SQLAlchemy <= 1.0
        colparams = compiler._get_colparams(element)
    text = 'INSERT INTO %s' % compiler.process(element.table, asfrom=True)
    text += ' (%s)\n' % ', '.join(compiler.preparer.format_column(c[0])
                                  for c in colparams)
    text += 'SELECT %s\n' % ', '.join(c[1] for c in colparams)
    text += compiler.default_from()
    # default_from() returns '' for MySQL but that's wrong, MySQL requires
    # FROM DUAL if there is a following WHERE clause.
    if isinstance(compiler.dialect, MySQLDialect):
        text += 'FROM DUAL\n'
    # We need FOR UPDATE in the inner SELECT for MySQL, to ensure we acquire
    # an exclusive lock immediately, instead of acquiring a shared lock and
    # then subsequently upgrading it to an exclusive lock, which is subject
    # to deadlocks if another transaction is doing the same thing.
    nonexistence_clause = not_(exists(Select(
        columns=[sqltext('1')],
        from_obj=[element.table],
        whereclause=element.unique_condition,
        for_update=True)))
    text += 'WHERE ' + compiler.process(nonexistence_clause)
    return text
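# --- Added example (not part of the original snippet) ---
# For context, a sketch of the custom construct the compiler hook above is
# registered against.  The original definition is not shown here; this
# assumes a ConditionalInsert that subclasses Insert (pre-1.4 SQLAlchemy
# constructor style) and carries the unique_condition consumed by
# visit_conditional_insert().
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.sql.expression import Insert


class ConditionalInsert(Insert):
    """INSERT statement that only inserts if a uniqueness condition holds."""

    def __init__(self, table, values, unique_condition):
        Insert.__init__(self, table, values)
        self.unique_condition = unique_condition

# Attach the hook via SQLAlchemy's compiler extension; equivalent to
# decorating visit_conditional_insert with @compiles(ConditionalInsert).
visit_conditional_insert = compiles(ConditionalInsert)(
    visit_conditional_insert)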
def add_service(self, service, pattern, **kwds):
    """Add definition for a new service."""
    res = self._safe_execute(sqltext(
        """
        insert into services (service, pattern)
        values (:servicename, :pattern)
        """),
        servicename=service, pattern=pattern, **kwds)
    res.close()
    return res.lastrowid
def _exec_with_cleanup(self, connection, query_str, **params):
    """Execution wrapper that kills queries if it is interrupted.

    This is a wrapper around connection.execute() that will clean up
    any running query if the execution is interrupted by a control-flow
    exception such as KeyboardInterrupt or gevent.Timeout.

    The cleanup currently works only for the PyMySQL driver.  Other
    drivers will still execute fine, they just won't get the cleanup.
    """
    try:
        return connection.execute(sqltext(query_str), **params)
    except Exception:
        # Normal exceptions are passed straight through.
        raise
    except BaseException:
        # Control-flow exceptions trigger the cleanup logic.
        exc, val, tb = sys.exc_info()
        logger.warn("query was interrupted by %s", val)
        # Only cleanup SELECT, INSERT or UPDATE statements.
        # There are concerns that rolling back DELETEs is too costly.
        if not SAFE_TO_KILL_QUERY.match(query_str):
            msg = "  refusing to kill unsafe query: %s"
            logger.warn(msg, query_str[:100])
            raise
        try:
            # The KILL command is specific to MySQL, and this method of
            # getting the threadid is specific to the PyMySQL driver.
            # Other drivers will cause an AttributeError, failing through
            # to the "finally" clause at the end of this block.
            thread_id = connection.connection.server_thread_id[0]
            logger.warn("  killing connection %d", thread_id)
            cleanup_query = "KILL %d" % (thread_id,)
            # Use a freshly-created connection so that we don't block
            # waiting for something from the pool.  Unfortunately this
            # requires use of a private API and raw cursor access.
            cleanup_conn = self._connector.engine.pool._create_connection()
            try:
                cleanup_cursor = cleanup_conn.connection.cursor()
                try:
                    cleanup_cursor.execute(cleanup_query)
                except Exception:
                    msg = "  failed to kill %d"
                    logger.exception(msg, thread_id)
                    raise
                finally:
                    cleanup_cursor.close()
                msg = "  successfully killed %d"
                logger.warn(msg, thread_id)
            finally:
                cleanup_conn.close()
        finally:
            try:
                # Don't return this connection to the pool.
                connection.invalidate()
            finally:
                # Always re-raise the original error.
                raise exc, val, tb
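# --- Added example (not part of the original snippet) ---
# A plausible definition for the SAFE_TO_KILL_QUERY regex referenced above.
# The original is not shown, so this is an assumption matching the comment:
# only SELECT, INSERT or UPDATE statements are considered safe to kill.
import re

SAFE_TO_KILL_QUERY = re.compile(r"\s*(SELECT|INSERT|UPDATE)\b", re.I)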
def test_default_node_available_capacity(self):
    node = "https://phx13"
    self.backend.add_node("sync-1.0", node, capacity=100)
    available = int(math.ceil(self.backend.capacity_release_rate * 100))
    query = sqltext("SELECT * FROM nodes WHERE node=:node")
    res = self.backend._safe_execute(query, node=node)
    row = res.fetchone()
    res.close()
    self.assertEqual(row["available"], available)
def remove_node(self, service, node, timestamp=None):
    """Remove definition for a node."""
    nodeid = self.get_node_id(service, node)
    res = self._safe_execute(sqltext(
        """
        delete from nodes where id=:nodeid
        """),
        nodeid=nodeid
    )
    res.close()
    self.unassign_node(service, node, timestamp, nodeid=nodeid)
def remove_node(self, service, node, timestamp=None):
    """Remove definition for a node."""
    res = self._safe_execute(sqltext(
        """
        delete from nodes
        where service=:service and node=:node
        """),
        service=service, node=node
    )
    res.close()
    self.unassign_node(service, node, timestamp)
def unassign_node(self, service, node, timestamp=None):
    """Clear any assignments to a node."""
    if timestamp is None:
        timestamp = get_timestamp()
    res = self._safe_execute(sqltext(
        """
        update users
        set replaced_at=:timestamp
        where service=:service and node=:node
        """),
        service=service, node=node, timestamp=timestamp
    )
    res.close()
def get_node_id(self, service, node):
    """Get numeric id for a node."""
    res = self._safe_execute(sqltext(
        """
        select id from nodes
        where service=:service and node=:node
        """),
        service=service, node=node
    )
    row = res.fetchone()
    res.close()
    if row is None:
        raise ValueError("unknown node: " + node)
    return row[0]
def _get_bso_columns(table_name):
    return (
        Column("userid", Integer, primary_key=True, nullable=False,
               autoincrement=False),
        Column("collection", Integer, primary_key=True, nullable=False,
               autoincrement=False),
        Column("id", String(64), primary_key=True, autoincrement=False),
        Column("sortindex", Integer),
        Column("modified", BigInteger, nullable=False),
        Column("payload", Text(length=256 * 1024), nullable=False,
               server_default=""),
        Column("payload_size", Integer, nullable=False,
               server_default=sqltext("0")),
        Column("ttl", Integer, server_default=sqltext(str(MAX_TTL))),
        # Declare indexes.
        # We need to include the tablename in the index name due to sharding,
        # because index names in sqlite are global, not per-table.
        # Index on "ttl" for easy pruning of expired items.
        Index("%s_ttl_idx" % (table_name,), "ttl"),
        # Index on "modified" for easy filtering by timestamp.
        Index("%s_usr_col_mod_idx" % (table_name,), "userid", "collection",
              "modified"),
        # There is intentionally no index on "sortindex".
        # Clients almost always filter on "modified" using the above index,
        # and cannot take advantage of a separate index for sorting.
    )
def unassign_node(self, service, node, timestamp=None, nodeid=None):
    """Clear any assignments to a node."""
    if timestamp is None:
        timestamp = get_timestamp()
    if nodeid is None:
        nodeid = self.get_node_id(service, node)
    res = self._safe_execute(sqltext(
        """
        update users
        set replaced_at=:timestamp
        where nodeid=:nodeid
        """),
        nodeid=nodeid, timestamp=timestamp
    )
    res.close()
def add_node(self, service, node, capacity, **kwds):
    """Add definition for a new node."""
    res = self._safe_execute(sqltext(
        """
        insert into nodes
        (service, node, available, capacity, current_load, downed, backoff)
        values
        (:service, :node, :available, :capacity, :current_load, :downed,
         :backoff)
        """),
        service=service, node=node, capacity=capacity,
        available=kwds.get('available', capacity),
        current_load=kwds.get('current_load', 0),
        downed=kwds.get('downed', 0),
        backoff=kwds.get('backoff', 0),
    )
    res.close()
def get_single_mil_bag(self, pipeline='whole'):
    """Return a bag of commonly labeled data.

    Bags are defined in SQL Server in the Points table on the
    group_id foreign key."""
    # Retrieve a single unique bag label
    sql = """SELECT top(1) group_id
    FROM {}
    WHERE IsNumeric(group_id) = 1
    ORDER BY group_id ASC""".format(Points.__tablename__)
    sel = sqltext(sql)
    # List, remove a single item from the list
    group_ids = self.Insert.core_select_execute(sel)
    group_id = group_ids.pop().group_id
    # Create the pipeline
    if pipeline == 'whole':
        full_pipeline = Transform.numeric_transform_pipeline_MIL()
    elif pipeline == 'categorical':
        full_pipeline = Transform.categorical_transform_pipeline_MIL()
    else:
        raise ValueError('pipeline must be one of ["whole","categorical"]')
    # Retrieve bag label for the group_id
    sel = sqlalchemy.select([Labeling]).where(Labeling.id.__eq__(group_id))
    with self.Insert.engine.connect() as connection:
        res = connection.execute(sel)
        label = res.fetchone().bag_label
    # Load the dataset
    sel = sqlalchemy.select([Points]).where(Points.group_id.__eq__(group_id))
    dfraw = self.Insert.pandas_select_execute(sel)
    # Transform the dataset
    try:
        bag = full_pipeline.fit_transform(dfraw)
    except ValueError as e:
        print('Transform error, Skipped Group ID : ', group_id)
        traceback.print_exc()
        print(dfraw)
        raise e
    # Validate cleaned dataset
    if not self.validate_bag(bag):
        print("Invalid cleaned bag:\n")
        print(bag)
    return dfraw, bag, label
def add_node(self, service, node, capacity, **kwds):
    """Add definition for a new node."""
    available = kwds.get('available')
    # We release only a fraction of the node's capacity to start.
    if available is None:
        available = math.ceil(capacity * self.capacity_release_rate)
    res = self._safe_execute(sqltext(
        """
        insert into nodes
        (service, node, available, capacity, current_load, downed, backoff)
        values
        (:service, :node, :available, :capacity, :current_load, :downed,
         :backoff)
        """),
        service=service, node=node, capacity=capacity,
        available=available,
        current_load=kwds.get('current_load', 0),
        downed=kwds.get('downed', 0),
        backoff=kwds.get('backoff', 0),
    )
    res.close()
def update_node(self, node, **kwds):
    """Updates node fields in the db."""
    values = {}
    # Set-like intersection of the known node fields with the supplied
    # kwargs; dict_keys supports "&" against any iterable of hashables.
    cols = NODE_FIELDS & kwds.keys()
    for col in NODE_FIELDS:
        try:
            values[col] = kwds.pop(col)
        except KeyError:
            pass
    args = [v + " = :" + v for v in cols]
    query = """
    update nodes
    set """
    query += ", ".join(args)
    query += """
    where service = :service and node = :node
    """
    values['service'] = self._get_service_id(SERVICE_NAME)
    values['node'] = node
    if kwds:
        raise ValueError("unknown fields: " + str(kwds.keys()))
    con = self._execute_sql(sqltext(query), **values)
    con.close()
Column("email", String(255), nullable=False), Column("generation", BigInteger(), nullable=False), Column("client_state", String(32), nullable=False), Column("created_at", BigInteger(), nullable=False), Column("replaced_at", BigInteger(), nullable=True), Index('lookup_idx', 'email', 'service', 'created_at'), ) _GET_USER_RECORDS = sqltext("""\ select uid, generation, client_state, created_at, replaced_at from users where email = :email and service = :service order by created_at desc, uid desc limit 20 """) _CREATE_USER_RECORD = sqltext("""\ insert into users (service, email, generation, client_state, created_at, replaced_at) values (:service, :email, :generation, :client_state, :timestamp, NULL) """)
def set_items(self, user_id, collection_name, items, storage_time=None):
    """Adds or updates a batch of items.

    Returns a list of successes or failures.
    """
    if storage_time is None:
        storage_time = round_time()

    if self.engine_name in ('sqlite', 'postgresql'):
        count = 0
        for item in items:
            if 'id' not in item:
                continue
            item_id = item['id']
            item['modified'] = storage_time
            self.set_item(user_id, collection_name, item_id, **item)
            count += 1
        return count

    # XXX See if SQLAlchemy knows how to do batch inserts
    # that's quite specific to mysql
    fields = ('id', 'parentid', 'predecessorid', 'sortindex', 'modified',
              'payload', 'payload_size', 'ttl')

    table = self._get_wbo_table_name(user_id)
    query = 'insert into %s (username, collection, %s) values ' \
            % (table, ','.join(fields))

    values = {}
    values['collection'] = self._get_collection_id(user_id, collection_name)
    values['user_id'] = user_id

    # building the values batch
    binds = [':%s%%(num)d' % field for field in fields]
    pattern = '(:user_id,:collection,%s) ' % ','.join(binds)

    lines = []
    for num, item in enumerate(items):
        lines.append(pattern % {'num': num})
        for field in fields:
            value = item.get(field)
            if value is None:
                continue
            if field == 'modified' and value is not None:
                value = _roundedbigint(storage_time)
            values['%s%d' % (field, num)] = value
        if ('payload%d' % num in values and
                'modified%d' % num not in values):
            values['modified%d' % num] = _roundedbigint(storage_time)
        if values.get('ttl%d' % num) is None:
            values['ttl%d' % num] = 2100000000
        else:
            values['ttl%d' % num] += int(storage_time)
        if 'payload%d' % num in values:
            size = len(values['payload%d' % num])
            values['payload_size%d' % num] = size

    query += ','.join(lines)

    # allowing updates as well
    query += (' on duplicate key update parentid = values(parentid),'
              'predecessorid = values(predecessorid),'
              'sortindex = values(sortindex),'
              'modified = values(modified), payload = values(payload),'
              'payload_size = values(payload_size),'
              'ttl = values(ttl)')

    return self._do_query(sqltext(query), **values)
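# --- Added example (not part of the original snippet) ---
# For illustration only: the MySQL statement that set_items() above builds
# for a two-item batch looks roughly like this.  The table name comes from
# _get_wbo_table_name(); "wbo1" here is a hypothetical example, and bind
# names are generated per row:
#
#   insert into wbo1 (username, collection, id, parentid, predecessorid,
#                     sortindex, modified, payload, payload_size, ttl)
#   values (:user_id,:collection,:id0,:parentid0,:predecessorid0,
#           :sortindex0,:modified0,:payload0,:payload_size0,:ttl0),
#          (:user_id,:collection,:id1,:parentid1,:predecessorid1,
#           :sortindex1,:modified1,:payload1,:payload_size1,:ttl1)
#   on duplicate key update parentid = values(parentid), ...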
    downed = Column(Integer(6), default=0, nullable=False)
    backoff = Column(Integer(11), default=0, nullable=False)


class Nodes(_NodesBase, _Base):
    __tablename__ = 'nodes'

nodes = Nodes.__table__
tables.append(nodes)

_GET = sqltext("""\
select
    uid, node
from
    user_nodes
where
    email = :email
and
    service = :service
""")

_INSERT = sqltext("""\
insert into user_nodes
    (service, email, node)
values
    (:service, :email, :node)
""")

WRITEABLE_FIELDS = ['available', 'current_load', 'capacity', 'downed',
                    'backoff']
import traceback

from sqlalchemy import String, Index, Boolean
from sqlalchemy import Table, MetaData, create_engine, Column
from sqlalchemy.exc import OperationalError, TimeoutError
from sqlalchemy.pool import NullPool
from sqlalchemy.sql import text as sqltext, select, or_, and_

from addonreg import logger

_GET = sqltext("""\
SELECT addonid, sha256, registered
FROM hashes
WHERE addonid = :addonid
AND sha256 = :sha256
""")

_INSERT = sqltext("""\
INSERT INTO hashes (addonid, sha256, registered)
VALUES (:addonid, :sha256, 1)
""")

_MULTIPLE_GET = """\
SELECT addonid, sha256, registered
FROM hashes
WHERE """

metadata = MetaData()
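# --- Added example (not part of the original snippet) ---
# _MULTIPLE_GET above is deliberately left without a WHERE body.  A sketch
# of one way it might be completed for a batch lookup, OR-ing together one
# (addonid, sha256) clause per requested pair with numbered bind params.
# The helper name and shape are assumptions, not the module's actual API.
def _build_multiple_get(pairs):
    clauses = []
    params = {}
    for num, (addonid, sha256) in enumerate(pairs):
        clauses.append(
            "(addonid = :addonid%d AND sha256 = :sha256%d)" % (num, num))
        params["addonid%d" % num] = addonid
        params["sha256%d" % num] = sha256
    return sqltext(_MULTIPLE_GET + " OR ".join(clauses)), params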
#! /usr/bin/env python
# script to populate the database with records

import time
import random

from sqlalchemy import create_engine
from sqlalchemy.sql import text as sqltext

_CREATE_USER_RECORD = sqltext("""\
insert into users
    (service, email, nodeid, generation, client_state,
     created_at, replaced_at)
values
    (:service, :email, :nodeid, 0, "", :timestamp, NULL)
""")

_GET_SERVICE_ID = sqltext("""\
select
    id
from
    services
where
    service = :service
""")

_GET_NODE_ID = sqltext("""\
select
    id
from
    nodes
where
# Used as a tombstone to mark users that have been "retired" from the db.
MAX_GENERATION = 9223372036854775807

NODE_FIELDS = ("capacity", "available", "current_load", "downed", "backoff")

_Base = declarative_base()

_GET_USER_RECORDS = sqltext("""\
select
    uid, nodes.node, generation, client_state, created_at, replaced_at
from
    users left outer join nodes on users.nodeid = nodes.id
where
    email = :email
and
    users.service = :service
order by
    created_at desc, uid desc
limit
    20
""")

_CREATE_USER_RECORD = sqltext("""\
insert into users
    (service, email, nodeid, generation, client_state,
     created_at, replaced_at)
values
    (:service, :email, :nodeid, :generation, :client_state,
     :timestamp, NULL)
""")
def get_timestamp():
    """Get current timestamp in milliseconds."""
    return int(time.time() * 1000)


_Base = declarative_base()

_GET_USER_RECORDS = sqltext("""\
select
    uid, node, generation, client_state, created_at, replaced_at
from
    users
where
    email = :email
and
    service = :service
order by
    created_at desc, uid desc
limit
    20
""")

_CREATE_USER_RECORD = sqltext("""\
insert into users
    (service, email, node, generation, client_state,
     created_at, replaced_at)
values
    (:service, :email, :node, :generation, :client_state,
     :timestamp, NULL)
""")
MAX_GENERATION = 9223372036854775807

NODE_FIELDS = ("capacity", "available", "current_load", "downed", "backoff")

_Base = declarative_base()

_GET_USER_RECORDS = sqltext("""\
select
    uid, nodes.node, generation, keys_changed_at, client_state,
    created_at, replaced_at
from
    users left outer join nodes on users.nodeid = nodes.id
where
    email = :email
and
    users.service = :service
order by
    created_at desc, uid desc
limit
    20
""")

_CREATE_USER_RECORD = sqltext("""\
insert into users
    (service, email, nodeid, generation, keys_changed_at, client_state,
     created_at, replaced_at)
values
    (:service, :email, :nodeid, :generation, :keys_changed_at,
     :client_state, :timestamp, NULL)
""")
class Nodes(_NodesBase, _Base):
    __tablename__ = 'nodes'

nodes = Nodes.__table__
tables.append(nodes)

_GET = sqltext("""\
select
    uid, node
from
    user_nodes
where
    email = :email
and
    service = :service
and
    version = :version
""")

_INSERT = sqltext("""\
insert into user_nodes
    (service, email, node, version)
values
    (:service, :email, :node, :version)
""")
from sqlalchemy.pool import NullPool
from sqlalchemy.sql import text as sqltext
from sqlalchemy.sql import select, update, and_
from sqlalchemy.exc import OperationalError, TimeoutError

from mozsvc.exceptions import BackendError

from wimms import logger
from wimms.sql import (SQLMetadata, _NodesBase, get_user_nodes_table,
                       WRITEABLE_FIELDS)

_GET = sqltext("""\
select
    uid, node
from
    user_nodes
where
    email = :email
""")

_INSERT = sqltext("""\
insert into user_nodes
    (email, node)
values
    (:email, :node)
""")


class ShardedSQLMetadata(SQLMetadata):
logger = logging.getLogger('tokenserver.assignment.sqlnode')

# The maximum possible generation number.
# Used as a tombstone to mark users that have been "retired" from the db.
MAX_GENERATION = 9223372036854775807

NODE_FIELDS = ("capacity", "available", "current_load", "downed", "backoff")

_Base = declarative_base()

_GET_USER_RECORDS = sqltext("""\
select
    uid, nodes.node, generation, client_state, created_at, replaced_at
from
    users left outer join nodes on users.nodeid = nodes.id
where
    email = :email
and
    users.service = :service
order by
    created_at desc, uid desc
limit
    20
""")

_CREATE_USER_RECORD = sqltext("""\
insert into users
    (service, email, nodeid, generation, client_state,
     created_at, replaced_at)
values
    (:service, :email, :nodeid, :generation, :client_state,
     :timestamp, NULL)
""")

_UPDATE_GENERATION_NUMBER = sqltext("""\
def set_items(self, user_id, collection_name, items, storage_time=None):
    """Adds or updates a batch of items.

    Returns a list of successes or failures.
    """
    if not self.standard_collections:
        self.set_collection(user_id, collection_name)

    if storage_time is None:
        storage_time = round_time()

    if self.engine_name in ('sqlite', 'postgresql'):
        count = 0
        for item in items:
            if 'id' not in item:
                continue
            item_id = item['id']
            item['modified'] = storage_time
            self.set_item(user_id, collection_name, item_id, **item)
            count += 1
        return count

    # XXX See if SQLAlchemy knows how to do batch inserts
    # that's quite specific to mysql
    fields = ('id', 'parentid', 'predecessorid', 'sortindex', 'modified',
              'payload', 'payload_size', 'ttl')

    table = self._get_wbo_table_name(user_id)
    query = 'insert into %s (username, collection, %s) values ' \
            % (table, ','.join(fields))

    values = {}
    values['collection'] = self._get_collection_id(user_id, collection_name)
    values['user_id'] = user_id

    # building the values batch
    binds = [':%s%%(num)d' % field for field in fields]
    pattern = '(:user_id,:collection,%s) ' % ','.join(binds)

    lines = []
    for num, item in enumerate(items):
        lines.append(pattern % {'num': num})
        for field in fields:
            value = item.get(field)
            if value is None:
                continue
            if field == 'modified' and value is not None:
                value = _roundedbigint(storage_time)
            values['%s%d' % (field, num)] = value
        if ('payload%d' % num in values and
                'modified%d' % num not in values):
            values['modified%d' % num] = _roundedbigint(storage_time)
        if values.get('ttl%d' % num) is None:
            values['ttl%d' % num] = 2100000000
        else:
            values['ttl%d' % num] += int(storage_time)
        if 'payload%d' % num in values:
            size = len(values['payload%d' % num])
            values['payload_size%d' % num] = size

    query += ','.join(lines)

    # allowing updates as well
    query += (' on duplicate key update parentid = values(parentid),'
              'predecessorid = values(predecessorid),'
              'sortindex = values(sortindex),'
              'modified = values(modified), payload = values(payload),'
              'payload_size = values(payload_size),'
              'ttl = values(ttl)')

    return self._do_query(sqltext(query), **values)
def get_timestamp():
    """Get current timestamp in milliseconds."""
    return int(time.time() * 1000)


_Base = declarative_base()

_GET_USER_RECORDS = sqltext("""\
select
    uid, nodeid, node, generation, client_state, created_at, replaced_at
from
    users
where
    email = :email
and
    service = :service
order by
    created_at desc, uid desc
limit
    20
""")

_CREATE_USER_RECORD = sqltext("""\
insert into users
    (service, email, nodeid, node, generation, client_state,
     created_at, replaced_at)
values
    (:service, :email, :nodeid, :node, :generation, :client_state,
def bag_data_generator(self, pipeline, verbose=False):
    """Yield bags of commonly labeled data.

    Bags are defined in SQL Server in the Points table on the
    group_id foreign key."""
    # Retrieve all unique bag labels
    sql = """SELECT distinct group_id
    FROM {}
    WHERE IsNumeric(group_id) = 1
    ORDER BY group_id ASC""".format(Points.__tablename__)
    sel = sqltext(sql)
    group_ids = self.Insert.core_select_execute(sel)
    # Retrieve bag label for each group_id
    sql_bag = """SELECT id, bag_label
    FROM {}
    WHERE id = {}"""
    # Create the pipeline
    if pipeline == 'whole':
        full_pipeline = Transform.numeric_transform_pipeline_MIL()
    elif pipeline == 'categorical':
        full_pipeline = Transform.categorical_transform_pipeline_MIL()
    else:
        raise ValueError('pipeline must be one of ["whole","categorical"]')
    for row in group_ids:
        group_id = row.group_id
        sel = sqltext(sql_bag.format(Labeling.__tablename__, group_id))
        with self.Insert.engine.connect() as connection:
            res = connection.execute(sel)
            label = res.fetchone().bag_label
        # Load the dataset
        sel = sqlalchemy.select([Points]).where(
            Points.group_id.__eq__(group_id))
        dfraw = self.Insert.pandas_select_execute(sel)
        # Validate raw dataset
        if not self.validate_bag(dfraw):
            continue
        # Transform the dataset
        try:
            bag = full_pipeline.fit_transform(dfraw)
        except ValueError as e:
            print('Transform error, Skipped Group ID : ', group_id)
            if verbose:
                traceback.print_exc()
                print(dfraw)
                x = input("Do you want to continue and discard this bag? : ")
                if x in ['y', 'yes', 'Y', 'Yes', 'True', 'TRUE']:
                    continue
                else:
                    raise e
            else:
                continue
        # Validate cleaned dataset
        if not self.validate_bag(bag):
            continue
        yield bag, label
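# --- Added example (not part of the original snippet) ---
# Hedged usage sketch: collecting bags and labels from the generator above
# into parallel lists for a multiple-instance learning dataset.  The
# `extractor` instance name is hypothetical.
bags, labels = [], []
for bag, label in extractor.bag_data_generator(pipeline='whole'):
    bags.append(bag)
    labels.append(label)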
from sqlalchemy import create_engine
from sqlalchemy.pool import NullPool
from sqlalchemy.sql import text as sqltext
from sqlalchemy.sql import select, update, and_
from sqlalchemy.exc import OperationalError, TimeoutError

from mozsvc.exceptions import BackendError

from wimms import logger
from wimms.sql import (SQLMetadata, _NodesBase, get_user_nodes_table,
                       WRITEABLE_FIELDS)

_GET = sqltext("""\
select
    uid, node
from
    user_nodes
where
    email = :email
""")

_INSERT = sqltext("""\
insert into user_nodes
    (email, node)
values
    (:email, :node)
""")


class ShardedSQLMetadata(SQLMetadata):

    def __init__(self, databases, create_tables=False, **kw):
        # databases is a string containing one sqluri per service:
Column("generation", BigInteger(), nullable=False), Column("client_state", String(32), nullable=False), Column("created_at", BigInteger(), nullable=False), Column("replaced_at", BigInteger(), nullable=True), Index('lookup_idx', 'email', 'service', 'created_at'), Index('clientstate_idx', 'email', 'service', 'client_state', unique=True), ) _GET_USER_RECORDS = sqltext("""\ select uid, generation, client_state, created_at from users where email = :email and service = :service order by created_at desc, uid desc limit 20 """) _CREATE_USER_RECORD = sqltext("""\ insert into users (service, email, generation, client_state, created_at, replaced_at) values (:service, :email, :generation, :client_state, :timestamp, NULL) """)
from sqlalchemy.sql import text as sqltext

from util import get_timestamp

# The maximum possible generation number.
# Used as a tombstone to mark users that have been "retired" from the db.
MAX_GENERATION = 9223372036854775807

NODE_FIELDS = ("capacity", "available", "current_load", "downed", "backoff")

_GET_USER_RECORDS = sqltext("""\
select
    uid, nodes.node, generation, keys_changed_at, client_state,
    created_at, replaced_at
from
    users left outer join nodes on users.nodeid = nodes.id
where
    email = :email
and
    users.service = :service
order by
    created_at desc, uid desc
limit
    20
""")

_CREATE_USER_RECORD = sqltext("""\
insert into users
    (service, email, nodeid, generation, keys_changed_at, client_state,
     created_at, replaced_at)
values
    (:service, :email, :nodeid, :generation, :keys_changed_at,
     :client_state, :timestamp, NULL)
""")