Example #1
def is_master_alive(master, replicas):
    """ Determine if the master is alive

    The function will:
    1. Attempt to connect to the master via the mysql protocol. If successful,
       the master is considered alive.
    2. If #1 fails, check the io thread of the replica instance(s). If the io
       thread is not running on any replica, the master is considered dead. If
       step #1 fails but a replica io thread is still running, we are in a
       weird state (possible network partition) and will throw an exception.

    Args:
    master - A hostaddr object for the master instance
    replicas -  A set of hostaddr objects for the replica instances

    Returns:
    A mysql connection to the master if the master is alive, False otherwise.
    """
    if len(replicas) == 0:
        raise Exception('At least one replica must be present to determine '
                        'a master is dead')
    try:
        master_conn = mysql_lib.connect_mysql(master)
        return master_conn
    except MySQLdb.OperationalError as detail:
        (error_code, msg) = detail.args
        if error_code != mysql_lib.MYSQL_ERROR_CONN_HOST_ERROR:
            raise
        master_conn = False
        log.info('Unable to connect to current master {master} from '
                 '{hostname}, will check replica servers before declaring '
                 'the master dead'.format(master=master,
                                          hostname=host_utils.HOSTNAME))
    except:
        log.info('This is an unknown connection error. If you are very sure '
                 'that the master is dead, please put a "return False" at the '
                 'top of is_master_alive and then send rwultsch a stack trace')
        raise

    # We can not get a connection to the master, so poll the replica servers
    for replica in replicas:
        conn = mysql_lib.connect_mysql(replica)
        # If replication has not hit a timeout, a dead master can still have
        # a replica which thinks it is ok. "STOP SLAVE; START SLAVE" followed
        # by a sleep will get us truthiness.
        mysql_lib.restart_replication(conn)
        ss = mysql_lib.get_slave_status(conn)
        if ss['Slave_IO_Running'] == 'Yes':
            raise Exception('Replica {replica} thinks it can connect to '
                            'master {master}, but failover script can not. '
                            'Possible network partition!'
                            ''.format(replica=replica,
                                      master=master))
        else:
            log.info('Replica {replica} also can not connect to master '
                     '{master}.'.format(replica=replica,
                                        master=master))
    return False
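
A minimal usage sketch for is_master_alive: the failover driver and the
promote_replacement_master helper are illustrative assumptions, not part of
the original tooling.

master_conn = is_master_alive(master, replicas)
if master_conn:
    log.info('Master {master} is alive; no failover needed'
             ''.format(master=master))
    master_conn.close()
else:
    promote_replacement_master(master, replicas)  # hypothetical helper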
    def extend_backup_lock(self, lock_identifier, extend_lock_stop_event):
        """ Extend a backup lock. This is to be used by a thread

        Args:
        lock_identifier - Corresponds to a lock identifier row in the
                          CSV_BACKUP_LOCK_TABLE_NAME.
        extend_lock_stop_event - An event that will be used to inform this
                                 thread to stop extending the lock
        """
        # Assumption is that this is called right after creating the lock
        last_update = time.time()
        while (not extend_lock_stop_event.is_set()):
            if (time.time() - last_update) > LOCK_EXTEND_FREQUENCY:
                zk = host_utils.MysqlZookeeper()
                replica_set = zk.get_replica_set_from_instance(self.instance)
                master = zk.get_mysql_instance_from_replica_set(
                    replica_set, host_utils.REPLICA_ROLE_MASTER)
                master_conn = mysql_lib.connect_mysql(master, role='dbascript')
                cursor = master_conn.cursor()

                params = {'lock_identifier': lock_identifier}
                sql = ('UPDATE {db}.{tbl} '
                       'SET expires = NOW() + INTERVAL {locks_held_time} '
                       'WHERE lock_identifier = %(lock_identifier)s'
                       '').format(db=mysql_lib.METADATA_DB,
                                  tbl=CSV_BACKUP_LOCK_TABLE_NAME,
                                  locks_held_time=LOCKS_HELD_TIME)
                cursor.execute(sql, params)
                master_conn.commit()
                log.debug(cursor._executed)
                last_update = time.time()
            extend_lock_stop_event.wait(.5)
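
A hedged sketch of how extend_backup_lock is meant to be driven: standard
threading, but backup_worker, lock_identifier, and run_csv_backup are
assumptions standing in for the real backup job.

import threading
import uuid

lock_identifier = str(uuid.uuid4())    # assumed lock row identifier
stop_event = threading.Event()
extender = threading.Thread(target=backup_worker.extend_backup_lock,
                            args=(lock_identifier, stop_event))
extender.start()
try:
    run_csv_backup()                   # hypothetical backup body
finally:
    stop_event.set()                   # tell the extender thread to stop
    extender.join()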
def csv_backup_success_logged(instance, date):
    """ Check for log entries created by log_csv_backup_success

    Args:
    instance - A hostaddr object
    date - a string for the date

    Returns:
    True if already backed up, False otherwise
    """
    zk = host_utils.MysqlZookeeper()
    replica_set = zk.get_replica_set_from_instance(instance)[0]
    master = zk.get_mysql_instance_from_replica_set(replica_set)
    conn = mysql_lib.connect_mysql(master, 'scriptrw')
    cursor = conn.cursor()

    if not mysql_lib.does_table_exist(master, mysql_lib.METADATA_DB,
                                      environment_specific.CSV_BACKUP_LOG_TABLE):
        return False

    sql = ('SELECT COUNT(*) as "cnt" '
           'FROM {METADATA_DB}.{CSV_BACKUP_LOG_TABLE} '
           'WHERE backup_date = %(date)s '
           ''.format(METADATA_DB=mysql_lib.METADATA_DB,
                     CSV_BACKUP_LOG_TABLE=environment_specific.CSV_BACKUP_LOG_TABLE))
    cursor.execute(sql, {'date': date})
    if cursor.fetchone()["cnt"]:
        return True
    else:
        return False
def log_csv_backup_success(instance, date):
    """ The CSV backup check can be expensive, so let's log that it is done

    Args:
    instance - A hostaddr object
    date - a string for the date
    """
    zk = host_utils.MysqlZookeeper()
    replica_set = zk.get_replica_set_from_instance(instance)[0]
    master = zk.get_mysql_instance_from_replica_set(replica_set)
    conn = mysql_lib.connect_mysql(master, 'scriptrw')
    cursor = conn.cursor()

    if not mysql_lib.does_table_exist(master, mysql_lib.METADATA_DB,
                                      environment_specific.CSV_BACKUP_LOG_TABLE):
        print 'Creating missing metadata table'
        cursor.execute(CSV_BACKUP_LOG_TABLE_DEFINITION.format(
            db=mysql_lib.METADATA_DB,
            tbl=environment_specific.CSV_BACKUP_LOG_TABLE))

    sql = ('INSERT IGNORE INTO {METADATA_DB}.{CSV_BACKUP_LOG_TABLE} '
           'SET backup_date = %(date)s, '
           'completion = NOW()'
           ''.format(METADATA_DB=mysql_lib.METADATA_DB,
                     CSV_BACKUP_LOG_TABLE=environment_specific.CSV_BACKUP_LOG_TABLE))
    cursor.execute(sql, {'date': date})
    conn.commit()
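
The two functions above pair into a skip-if-done guard. A hedged sketch,
where run_csv_backup_check is a hypothetical stand-in for the expensive
backup verification.

backup_date = time.strftime('%Y-%m-%d')
if csv_backup_success_logged(instance, backup_date):
    log.info('CSV backup already logged for {date}; skipping'
             ''.format(date=backup_date))
else:
    run_csv_backup_check(instance, backup_date)  # hypothetical
    log_csv_backup_success(instance, backup_date)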
    def extend_backup_lock(self, lock_identifier, extend_lock_stop_event):
        """ Extend a backup lock. This is to be used by a thread

        Args:
        lock_identifier - Corresponds to a lock identifier row in the
                          CSV_BACKUP_LOCK_TABLE_NAME.
        extend_lock_stop_event - An event that will be used to inform this
                                 thread to stop extending the lock
        """
        # Assumption is that this is called right after creating the lock
        last_update = time.time()
        while not extend_lock_stop_event.is_set():
            if (time.time() - last_update) > LOCK_EXTEND_FREQUENCY:
                zk = host_utils.MysqlZookeeper()
                (replica_set, _) = self.instance.get_zk_replica_set()
                master = zk.get_mysql_instance_from_replica_set(replica_set, host_utils.REPLICA_ROLE_MASTER)
                master_conn = mysql_lib.connect_mysql(master, role='scriptrw')
                cursor = master_conn.cursor()

                params = {'lock_identifier': lock_identifier}
                sql = ('UPDATE {db}.{tbl} '
                       'SET expires = NOW() + INTERVAL {locks_held_time} '
                       'WHERE lock_identifier = %(lock_identifier)s'
                       '').format(db=mysql_lib.METADATA_DB,
                                  tbl=CSV_BACKUP_LOCK_TABLE_NAME,
                                  locks_held_time=LOCKS_HELD_TIME)
                cursor.execute(sql, params)
                master_conn.commit()
                log.debug(cursor._executed)
                last_update = time.time()
            extend_lock_stop_event.wait(.5)
Exemple #8
0
def get_db_size_from_log(instance, db):
    """ Get yesterdays db size for an instance

    Args:
    instance - A hostaddr object
    db - A database that exists on the instance

    Returns: size in MB
    """
    conn = mysql_lib.connect_mysql(instance, 'dbascript')
    cursor = conn.cursor()
    sql = ("SELECT SUM(size_mb) as 'mb', "
           "        COUNT(1) as 'table_count' "
           "FROM  {metadata_db}.{tbl} "
           "WHERE db = %(db)s "
           "    AND reported_at=CURDATE() - INTERVAL 1 DAY "
           "    AND hostname=%(hostname)s and port=%(port)s "
           "GROUP BY db;")
    params = {'hostname': instance.hostname, 'port': instance.port, 'db': db}
    cursor.execute(
        sql.format(metadata_db=mysql_lib.METADATA_DB, tbl=TABLE_SIZE_TBL),
        params)
    ret = cursor.fetchone()

    expected_tables = mysql_lib.get_tables(instance, db, skip_views=True)
    if ret['table_count'] != len(expected_tables):
        raise Exception('Size data appears to be missing for {db} on {inst}'
                        ''.format(db=db, inst=instance))
    return ret['mb']
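
A hedged usage sketch; the shard name mirrors the pbdata012345 example used
elsewhere in this listing and is purely illustrative.

size_mb = get_db_size_from_log(instance, 'pbdata012345')
log.info('Shard pbdata012345 measured {mb} MB as of yesterday'
         ''.format(mb=size_mb))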
    def ensure_backup_locks_sanity(self):
        """ Release any backup locks that aren't sane. This means locks
            created by the same host as the caller. The instance level flock
            should allow this assumption to be correct.
        """
        zk = host_utils.MysqlZookeeper()
        (replica_set, _) = self.instance.get_zk_replica_set()
        master = zk.get_mysql_instance_from_replica_set(replica_set, host_utils.REPLICA_ROLE_MASTER)
        master_conn = mysql_lib.connect_mysql(master, role='scriptrw')
        cursor = master_conn.cursor()

        if not mysql_lib.does_table_exist(master, mysql_lib.METADATA_DB,
                                          CSV_BACKUP_LOCK_TABLE_NAME):
            log.debug('Creating missing metadata table')
            cursor.execute(CSV_BACKUP_LOCK_TABLE.format(db=mysql_lib.METADATA_DB,
                                                        tbl=CSV_BACKUP_LOCK_TABLE_NAME))

        params = {'hostname': self.instance.hostname,
                  'port': self.instance.port}
        sql = ('UPDATE {db}.{tbl} '
               'SET lock_active = NULL, released = NOW() '
               'WHERE hostname = %(hostname)s AND '
               '     port = %(port)s'
               '').format(db=mysql_lib.METADATA_DB,
                          tbl=CSV_BACKUP_LOCK_TABLE_NAME)
        cursor.execute(sql, params)
        master_conn.commit()
Exemple #10
0
def check_one_replica(slave_instance, db, tbl):
    diff_count = -1
    elapsed_time_ms = -1

    try:
        conn = mysql_lib.connect_mysql(slave_instance, 'scriptro')
        cursor = conn.cursor()

        # first, count the diffs
        sql = ("SELECT COUNT(*) AS diffs FROM test.checksum "
               "WHERE (master_cnt <> this_cnt "
               "OR master_crc <> this_crc "
               "OR ISNULL(master_crc) <> ISNULL(this_crc)) "
               "AND (db=%(db)s AND tbl=%(tbl)s)")
        cursor.execute(sql, {'db': db, 'tbl': tbl})
        row = cursor.fetchone()
        if row is not None:
            diff_count = row['diffs']

        # second, sum up the elapsed time.
        sql = ("SELECT ROUND(SUM(chunk_time)*1000) AS time_ms "
               "FROM test.checksum WHERE db=%(db)s AND tbl=%(tbl)s")
        cursor.execute(sql, {'db': db, 'tbl': tbl})
        row = cursor.fetchone()
        if row is not None:
            elapsed_time_ms = row['time_ms']
        cursor.close()
        conn.close()
    except Exception as e:
        raise Exception("An error occurred polling the "
                        "replica: {e}".format(e=e))

    return elapsed_time_ms, diff_count
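
A hedged sketch of polling every replica in a set with check_one_replica;
replicas, db, and tbl are assumed inputs.

for replica in replicas:
    elapsed_ms, diff_count = check_one_replica(replica, db, tbl)
    if diff_count == -1:
        # -1 means no checksum rows were found for this db/table
        log.error('No checksum data on {r} for {db}.{tbl}'
                  ''.format(r=replica, db=db, tbl=tbl))
    elif diff_count > 0:
        log.error('{r}: {cnt} diffs for {db}.{tbl} in {ms} ms'
                  ''.format(r=replica, cnt=diff_count, db=db,
                            tbl=tbl, ms=elapsed_ms))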
def log_binlog_upload(instance, binlog):
    """ Log to the master that a binlog has been uploaded

    Args:
    instance - a hostAddr object
    binlog - the full path to the binlog file
    """
    zk = host_utils.MysqlZookeeper()
    binlog_creation = datetime.datetime.fromtimestamp(os.stat(binlog).st_atime)
    replica_set = zk.get_replica_set_from_instance(instance)[0]
    master = zk.get_mysql_instance_from_replica_set(replica_set)
    conn = mysql_lib.connect_mysql(master, 'scriptrw')
    cursor = conn.cursor()
    sql = ("REPLACE INTO {metadata_db}.{tbl} "
           "SET hostname = %(hostname)s, "
           "    port = %(port)s, "
           "    binlog = %(binlog)s, "
           "    binlog_creation = %(binlog_creation)s, "
           "    uploaded = NOW() ").format(metadata_db=mysql_lib.METADATA_DB,
                                           tbl=environment_specific.BINLOG_ARCHIVING_TABLE_NAME)
    metadata = {'hostname': instance.hostname,
                'port': str(instance.port),
                'binlog': os.path.basename(binlog),
                'binlog_creation': binlog_creation}
    cursor.execute(sql, metadata)
    conn.commit()
    def partition_lock_exists(self, table_tuple):
        """ Find out if there is already a lock on one partition of a
            partitioned table from a host other than us. If so, we
            cannot back up that table here.
        Args:
            table_tuple - the tuple of table information.

        Returns:
            True if there is such a lock, False if not.
        """
        zk = host_utils.MysqlZookeeper()
        replica_set = zk.get_replica_set_from_instance(self.instance)
        master = zk.get_mysql_instance_from_replica_set(
            replica_set, host_utils.REPLICA_ROLE_MASTER)
        master_conn = mysql_lib.connect_mysql(master, role='dbascript')
        cursor = master_conn.cursor()
        params = {
            'table_name': table_tuple[0],
            'hostname': self.instance.hostname,
            'port': self.instance.port,
            'active': ACTIVE
        }

        sql = ("SELECT COUNT(*) AS cnt FROM {db}.{tbl} WHERE "
               "lock_active = %(active)s AND "
               "table_name = %(table_name)s AND "
               "hostname <> %(hostname)s AND "
               "port = %(port)s").format(db=mysql_lib.METADATA_DB,
                                         tbl=CSV_BACKUP_LOCK_TABLE_NAME)
        cursor.execute(sql, params)
        row = int(cursor.fetchone()['cnt'])
        return (row > 0)
def ensure_binlog_archiving_table_sanity(instance):
    """ Create binlog archiving log table if missing, purge old data

    Args:
    instance - A hostAddr object. Note: this function will find the master of
               the instance if the instance is not a master
    """
    zk = host_utils.MysqlZookeeper()
    replica_set = zk.get_replica_set_from_instance(instance)[0]
    master = zk.get_mysql_instance_from_replica_set(replica_set)
    conn = mysql_lib.connect_mysql(master, 'scriptrw')
    cursor = conn.cursor()
    if not mysql_lib.does_table_exist(master, mysql_lib.METADATA_DB,
                                      environment_specific.BINLOG_ARCHIVING_TABLE_NAME):
        log.debug('Creating missing metadata table')
        cursor.execute(BINLOG_ARCHIVING_TABLE.format(db=mysql_lib.METADATA_DB,
                                                     tbl=environment_specific.BINLOG_ARCHIVING_TABLE_NAME))
    sql = ("DELETE FROM {metadata_db}.{tbl} "
           "WHERE binlog_creation < now() - INTERVAL {d} DAY"
           "").format(metadata_db=mysql_lib.METADATA_DB,
                      tbl=environment_specific.BINLOG_ARCHIVING_TABLE_NAME,
                      d=(environment_specific.S3_BINLOG_RETENTION+1))
    log.info(sql)
    cursor.execute(sql)
    conn.commit()
Exemple #15
0
def check_instance_table(hostaddr, table, desired_hash):
    """ Check that a table on a MySQL instance has the expected schema

    Args:
    hostaddr - object describing which mysql instance to connect to
    table - the name of the table to verify
    desired_hash - the md5sum of the desired CREATE TABLE for the table

    Returns:
    A dictionary with keys that are the hash of the CREATE TABLE statement
    and the values are sets of hostname:port followed by a space and then the
    db on which the incorrect schema was found.
    """
    ret = dict()
    conn = mysql_lib.connect_mysql(hostaddr)
    for db in mysql_lib.get_dbs(conn):
        definition = mysql_lib.show_create_table(conn, db, table)
        tbl_hash = hashlib.md5(definition).hexdigest()
        if tbl_hash != desired_hash:
            if tbl_hash not in ret:
                ret[tbl_hash] = set()
            ret[tbl_hash].add(''.join((hostaddr.__str__(),
                                       ' ',
                                       db)))
    return ret
    def ensure_backup_locks_sanity(self):
        """ Release any backup locks that aren't sane. This means locks
            created by the same host as the caller. The instance level lock
            should allow this assumption to be correct.
        """
        zk = host_utils.MysqlZookeeper()
        replica_set = zk.get_replica_set_from_instance(self.instance)
        master = zk.get_mysql_instance_from_replica_set(
            replica_set, host_utils.REPLICA_ROLE_MASTER)
        master_conn = mysql_lib.connect_mysql(master, role='dbascript')
        cursor = master_conn.cursor()

        if not mysql_lib.does_table_exist(master, mysql_lib.METADATA_DB,
                                          CSV_BACKUP_LOCK_TABLE_NAME):
            log.debug('Creating missing metadata table')
            cursor.execute(
                CSV_BACKUP_LOCK_TABLE.format(db=mysql_lib.METADATA_DB,
                                             tbl=CSV_BACKUP_LOCK_TABLE_NAME))

        params = {
            'hostname': self.instance.hostname,
            'port': self.instance.port
        }
        sql = ('UPDATE {db}.{tbl} '
               'SET lock_active = NULL, released = NOW() '
               'WHERE hostname = %(hostname)s AND '
               '      port = %(port)s'
               '').format(db=mysql_lib.METADATA_DB,
                          tbl=CSV_BACKUP_LOCK_TABLE_NAME)
        cursor.execute(sql, params)
        master_conn.commit()
def get_logged_binlog_uploads(instance):
    """ Get all binlogs that have been logged as uploaded

    Args:
    instance - a hostAddr object to run against and check

    Returns:
    A set of binlog file names
    """
    conn = mysql_lib.connect_mysql(instance, 'scriptro')
    cursor = conn.cursor()
    sql = ("SELECT binlog "
           "FROM {metadata_db}.{tbl} "
           "WHERE hostname = %(hostname)s AND "
           "      port = %(port)s "
           "".format(metadata_db=mysql_lib.METADATA_DB,
                     tbl=environment_specific.BINLOG_ARCHIVING_TABLE_NAME))
    cursor.execute(sql, {
        'hostname': instance.hostname,
        'port': str(instance.port)
    })
    ret = set()
    for binlog in cursor.fetchall():
        ret.add(binlog['binlog'])

    return ret
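
Together with log_binlog_upload above, this supports simple upload
bookkeeping. A hedged sketch; list_local_binlogs and upload_binlog are
hypothetical helpers.

already_uploaded = get_logged_binlog_uploads(instance)
for binlog_path in list_local_binlogs(instance):       # hypothetical
    if os.path.basename(binlog_path) not in already_uploaded:
        upload_binlog(instance, binlog_path)           # hypothetical
        log_binlog_upload(instance, binlog_path)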
Example #20
def get_binlog_start(binlog_file, instance, username, password):
    """ Read the first event in a binlog so that we can extract
        the timestamp.  This should help us skip over binlogs
        that can't possibly contain the GTID we're looking for.

    Args:
        binlog_file: the binlog to examine
        instance: a hostaddr object
        username: the user to connect as
        password: the password to connect as
    Returns:
        A timestamp in MySQL-friendly format, or we throw
        an exception if something doesn't work.
    """

    # first, make sure we have the proper log positions.
    # most likely, this is always 4 and 120, and that's what
    # we'll default to, but it could be different with
    # different versions of MySQL.
    #
    start_pos = 4
    stop_pos = 120
    try:
        conn = mysql_lib.connect_mysql(instance)
        cursor = conn.cursor()
        sql = 'SHOW BINLOG EVENTS in %(binlog)s LIMIT 0,1'
        cursor.execute(sql, {'binlog': binlog_file})
        row = cursor.fetchone()

        start_pos = row['Pos']
        stop_pos = row['End_log_pos']
    except Exception as e:
        log.error('Unable to retrieve binlog positions: {}'.format(e))
        raise

    binlog_cmd = [
        '/usr/bin/mysqlbinlog', '--read-from-remote-server',
        '--host={}'.format(instance.hostname), '--user={}'.format(username),
        '--password={}'.format(password),
        '--start-position="{}"'.format(start_pos),
        '--stop-position="{}"'.format(stop_pos), binlog_file, '2>/dev/null'
    ]

    pipeline = list()
    pipeline.append(dict(args=' '.join(binlog_cmd), shell=True))
    pipeline.append(dict(args='/bin/egrep created', shell=True))
    procs = pipe_runner(pipeline)
    results = pipe_wait(procs)

    try:
        (date, time) = results.split()[-2:]
        timestamp = dt.datetime.strptime('{} {}'.format(date, time),
                                         BINLOG_DT_FORMAT)
        return timestamp
    except Exception as e:
        log.error("Invalid value/format for binlog create time: {}".format(e))
        raise
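
A hedged sketch of the skip-ahead scan get_binlog_start enables;
candidate_binlogs (assumed to be in chronological order), target_time, and
search_binlog_for_gtid are assumptions.

for binlog_file in candidate_binlogs:
    first_event_ts = get_binlog_start(binlog_file, instance,
                                      username, password)
    if first_event_ts > target_time:
        # the first event is already newer than the event we want, so
        # the GTID cannot be in this binlog or any later one
        break
    search_binlog_for_gtid(binlog_file)                # hypothetical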
def rename_db_to_drop(instance, dbs, verbose=False, dry_run=False):
    """ Create a new empty db and move the contents of the original db there

    Args:
    instance - a hostaddr object
    dbs -  a set of database names
    verbose - bool, will direct sql to stdout
    dry_run - bool, will make no changes to the database
    """
    # confirm db is not in zk and not in use
    orphaned, _, _ = find_shard_mismatches.find_shard_mismatches(instance)
    if not orphaned:
        print "Detected no orphans"
        sys.exit(1)

    instance_orphans = orphaned[instance.__str__()]
    unexpected = dbs.difference(instance_orphans)
    if unexpected:
        print ''.join(("Cowardly refusing to act on the following dbs: ",
                       ','.join(unexpected)))
        sys.exit(1)

    # confirm that renames would not be blocked by an existing table
    conn = mysql_lib.connect_mysql(instance)

    cursor = conn.cursor()
    for db in dbs:
        renamed_db = ''.join((DB_PREPEND, db))

        sql = ''.join(("SELECT CONCAT(t2.TABLE_SCHEMA, \n",
                       "              '.', t2.TABLE_NAME) as tbl \n",
                       "FROM information_schema.tables t1 \n",
                       "INNER JOIN information_schema.tables t2 \n",
                       "    USING(TABLE_NAME) \n",
                       "WHERE t1.TABLE_SCHEMA = %(old_db)s AND \n"
                       "      t2.TABLE_SCHEMA = %(new_db)s;"))

        params = {'old_db': db,
                  'new_db': renamed_db}
        cursor = conn.cursor()
        cursor.execute(sql, params)
        dups = cursor.fetchall()

        if dups:
            for dup in dups:
                print "Table rename blocked by {tbl}".format(tbl=dup['tbl'])
            sys.exit(1)

        # We should be safe to create the new db and rename
        if not dry_run:
            mysql_lib.create_db(conn, renamed_db)
        mysql_lib.move_db_contents(conn=conn,
                                   old_db=db,
                                   new_db=renamed_db,
                                   verbose=verbose,
                                   dry_run=dry_run)
def rename_db_to_drop(instance, dbs, verbose=False, dry_run=False):
    """ Create a new empty db and move the contents of the original db there

    Args:
    instance - a hostaddr object
    dbs -  a set of database names
    verbose - bool, will direct sql to stdout
    dry_run - bool, will make no changes to the database
    """
    # confirm db is not in zk and not in use
    orphaned, _, _ = find_shard_mismatches.find_shard_mismatches(instance)
    if not orphaned:
        print "Detected no orphans"
        sys.exit(1)

    instance_orphans = orphaned[instance.__str__()]
    unexpected = dbs.difference(instance_orphans)
    if unexpected:
        print ''.join(("Cowardly refusing to act on the following dbs: ",
                       ','.join(unexpected)))
        sys.exit(1)

    # confirm that renames would not be blocked by an existing table
    conn = mysql_lib.connect_mysql(instance)

    cursor = conn.cursor()
    for db in dbs:
        renamed_db = ''.join((DB_PREPEND, db))

        sql = ''.join(("SELECT CONCAT(t2.TABLE_SCHEMA, \n",
                       "              '.', t2.TABLE_NAME) as tbl \n",
                       "FROM information_schema.tables t1 \n",
                       "INNER JOIN information_schema.tables t2 \n",
                       "    USING(TABLE_NAME) \n",
                       "WHERE t1.TABLE_SCHEMA = %(old_db)s AND \n"
                       "      t2.TABLE_SCHEMA = %(new_db)s;"))

        params = {'old_db': db,
                  'new_db': renamed_db}
        cursor = conn.cursor()
        cursor.execute(sql, params)
        dups = cursor.fetchall()

        if dups:
            for dup in dups:
                print "Table rename blocked by {tbl}".format(tbl=dup['tbl'])
            sys.exit(1)

        # We should be safe to create the new db and rename
        if not dry_run:
            mysql_lib.create_db(instance, renamed_db)
        mysql_lib.move_db_contents(instance,
                                   old_db=db,
                                   new_db=renamed_db,
                                   verbose=verbose,
                                   dry_run=dry_run)
Example #23
def main():
    parser = argparse.ArgumentParser(description='MySQL schema verifier')
    parser.add_argument('instance_type',
                        help='Type of MySQL instance to verify',
                        choices=('sharddb',
                                 'modsharddb'))
    parser.add_argument('table',
                        help='Table to check',)
    parser.add_argument('seed_instance',
                        help=('Which host from which to fetch a table '
                              ' definition. (format hostname[:port])'),)
    parser.add_argument('seed_db',
                        help=('Which db on --seed_instance from which to fetch'
                              ' a table definition. (ex pbdata012345)'))
    args = parser.parse_args()
    if args.instance_type == 'sharddb':
        zk_prefix = SHARDDB_PREFIX
    elif args.instance_type == 'modsharddb':
        zk_prefix = MODSHARDDB_PREFIX
    seed_instance = host_utils.HostAddr(args.seed_instance)
    seed_conn = mysql_lib.connect_mysql(seed_instance)
    desired = mysql_lib.show_create_table(seed_conn, args.seed_db, args.table)
    tbl_hash = hashlib.md5(desired).hexdigest()
    print ("Desired table definition:\n{desired}").format(desired=desired)
    incorrect = check_schema(zk_prefix, args.table, tbl_hash)
    if len(incorrect) == 0:
        print "It appears that all schema is synced"
        sys.exit(0)

    d = difflib.Differ()
    for problem in incorrect.iteritems():
        representative = list(problem[1])[0].split(' ')
        hostaddr = host_utils.HostAddr(representative[0])
        conn = mysql_lib.connect_mysql(hostaddr)
        create = mysql_lib.show_create_table(conn,
                                             representative[1],
                                             args.table)
        diff = d.compare(desired.splitlines(), create.splitlines())
        print 'The following difference has been found:'
        print '\n'.join(diff)
        print "It is present on the following db's:"
        print '\n'.join(list(problem[1]))
    sys.exit(1)
def main():

    action_desc = """Action description

rename - after checking no recent changes and shard not in zk,
         create a db with the old name appended to 'dropme_'. Then
         copy all tables to the new db
revert_rename - Copy all tables back from a 'dropme_' db to their original db
drop - This should be run a few days after a rename. Drop the empty original
       db, and drop the 'dropme_' db.
"""

    parser = argparse.ArgumentParser(description='MySQL shard cleanup utility',
                                     epilog=action_desc,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-i',
                        '--instance',
                        help='Instance to act on if other than localhost:3306',
                        default=''.join((socket.getfqdn(), ':3306')))
    parser.add_argument('-a',
                        '--action',
                        choices=('rename',
                                 'revert_rename',
                                 'drop',),
                        required=True)
    parser.add_argument('-d',
                        '--dbs',
                        help=("Comma seperated list of db's to act upon"),
                        required=True)
    parser.add_argument('-r',
                        '--dry_run',
                        help=("Do not change any state"),
                        default=False,
                        action='store_true')
    parser.add_argument('-v',
                        '--verbose',
                        default=False,
                        action='store_true')

    args = parser.parse_args()
    dbs = set(args.dbs.split(','))
    instance = host_utils.HostAddr(args.instance)

    if args.action == 'rename':
        rename_db_to_drop(instance, dbs, args.verbose, args.dry_run)
    elif args.action == 'revert_rename':
        conn = mysql_lib.connect_mysql(instance)
        for db in dbs:
            mysql_lib.move_db_contents(conn=conn,
                                       old_db=''.join((DB_PREPEND, db)),
                                       new_db=db,
                                       verbose=args.verbose,
                                       dry_run=args.dry_run)
    elif args.action == 'drop':
        drop_db_after_rename(instance, dbs, args.verbose, args.dry_run)
def create_table_size_table(instance):
    """ Create the table_size_historic table

    Args:
    instance - a hostAddr object for the master of the replica set
    """
    conn = mysql_lib.connect_mysql(instance, 'scriptrw')
    cursor = conn.cursor()
    cursor.execute(TABLE_DEF.format(db=mysql_lib.METADATA_DB,
                   tbl=TABLE_SIZE_TBL))
    cursor.close()
    conn.close()
def find_shard_mismatches(instance=False):
    """ Find shards that are missing or unexpected in modhsarddb and sharddb

    Args:
    instance - If supplied, only check this instance.

    Returns:
    orphaned - A dict of unexpected and (according to table statistics)
               unused shards. Key is master instance, value is a set.
    orphaned_but_used - A dict of unexpected but used shards.
                        Data structure is the same as orphaned.
    missing - A dict of expected but missing shards.
              Data structure is the same as orphaned.

    """
    orphaned = dict()
    orphaned_but_used = dict()
    missing_shards = dict()

    zk = host_utils.MysqlZookeeper()
    host_shard_map = zk.get_host_shard_map()

    if instance:
        new_host_shard_map = dict()
        new_host_shard_map[instance.__str__()] = host_shard_map[
            instance.__str__()]
        host_shard_map = new_host_shard_map

    for master in host_shard_map:
        expected_shards = host_shard_map[master]
        instance = host_utils.HostAddr(master)
        conn = mysql_lib.connect_mysql(instance)
        activity = mysql_lib.get_dbs_activity(conn)
        actual_shards = mysql_lib.get_dbs(conn)
        unexpected_shards = actual_shards.difference(expected_shards)
        missing = expected_shards.difference(actual_shards)
        if missing:
            missing_shards[master] = expected_shards.difference(actual_shards)

        for db in unexpected_shards:
            if activity[db]['ROWS_CHANGED'] != 0:
                if master not in orphaned_but_used:
                    orphaned_but_used[master] = set()
                orphaned_but_used[master].add(db)
            else:
                if master not in orphaned:
                    orphaned[master] = set()
                orphaned[master].add(db)

    return orphaned, orphaned_but_used, missing_shards
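
A hedged reporting sketch over the three return values:

orphaned, orphaned_but_used, missing = find_shard_mismatches(instance)
for master in missing:
    log.info('{m} is missing shards: {s}'
             ''.format(m=master, s=', '.join(missing[master])))
for master in orphaned:
    log.info('{m} has orphaned (unused) shards: {s}'
             ''.format(m=master, s=', '.join(orphaned[master])))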
Example #28
def get_master_mysql_major_version(instance):
    """ Given an instance, determine the mysql major version for the master
        of the replica set.

    Args:
    instance - a hostaddr object

    Returns - A string similar to '5.5' or '5.6'
   """
    zk = host_utils.MysqlZookeeper()
    master = zk.get_mysql_instance_from_replica_set(instance.get_zk_replica_set()[0],
                                                    repl_type=host_utils.REPLICA_ROLE_MASTER)
    master_conn = mysql_lib.connect_mysql(master)
    mysql_version = mysql_lib.get_global_variables(master_conn)['version'][:3]
    return mysql_version
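
A hedged sketch of gating behavior on the returned major version; the
5.6-specific branch is an assumption for illustration.

if get_master_mysql_major_version(instance) == '5.6':
    log.info('Master replica set runs MySQL 5.6; '
             'GTID-aware tooling can be used')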
    def release_expired_locks(self):
        """ Release any expired locks """
        zk = host_utils.MysqlZookeeper()
        (replica_set, _) = self.instance.get_zk_replica_set()
        master = zk.get_mysql_instance_from_replica_set(replica_set, host_utils.REPLICA_ROLE_MASTER)
        master_conn = mysql_lib.connect_mysql(master, role='scriptrw')
        cursor = master_conn.cursor()

        sql = ('UPDATE {db}.{tbl} '
               'SET lock_active = NULL, released = NOW() '
               'WHERE expires < NOW()'
               '').format(db=mysql_lib.METADATA_DB,
                          tbl=CSV_BACKUP_LOCK_TABLE_NAME)
        cursor.execute(sql)
        master_conn.commit()
        log.debug(cursor._executed)
    def purge_old_expired_locks(self):
        """ Delete any locks older than a week """
        zk = host_utils.MysqlZookeeper()
        (replica_set, _) = self.instance.get_zk_replica_set()
        master = zk.get_mysql_instance_from_replica_set(replica_set, host_utils.REPLICA_ROLE_MASTER)
        master_conn = mysql_lib.connect_mysql(master, role='scriptrw')
        cursor = master_conn.cursor()

        sql = ('DELETE FROM {db}.{tbl} '
               'WHERE expires < NOW() - INTERVAL 1 WEEK AND '
               '        lock_active is NOT NULL '
               '').format(db=mysql_lib.METADATA_DB,
                          tbl=CSV_BACKUP_LOCK_TABLE_NAME)
        cursor.execute(sql)
        master_conn.commit()
        log.debug(cursor._executed)
    def purge_old_expired_locks(self):
        """ Delete any locks older than 2 days """
        zk = host_utils.MysqlZookeeper()
        replica_set = zk.get_replica_set_from_instance(self.instance)
        master = zk.get_mysql_instance_from_replica_set(
            replica_set, host_utils.REPLICA_ROLE_MASTER)
        master_conn = mysql_lib.connect_mysql(master, role='dbascript')
        cursor = master_conn.cursor()

        sql = ('DELETE FROM {db}.{tbl} '
               'WHERE expires < NOW() - INTERVAL 2 DAY'
               '').format(db=mysql_lib.METADATA_DB,
                          tbl=CSV_BACKUP_LOCK_TABLE_NAME)
        cursor.execute(sql)
        master_conn.commit()
        log.debug(cursor._executed)
Example #32
def create_checksum_detail_table(instance):
    """ Args:
            instance: the master instance for this replica set

        Returns: Nothing.  If this fails, throw an exception.
    """

    try:
        conn = mysql_lib.connect_mysql(instance, 'scriptrw')
        cursor = conn.cursor()
        cursor.execute(TABLE_DEF.format(db=mysql_lib.METADATA_DB, tbl=CHECKSUM_TBL))
        cursor.close()
        conn.close()
    except Exception as e:
        raise Exception("Failed to create checksum detail "
                        "table: {e}".format(e=e))
    def release_expired_locks(self):
        """ Release any expired locks """
        zk = host_utils.MysqlZookeeper()
        replica_set = zk.get_replica_set_from_instance(self.instance)
        master = zk.get_mysql_instance_from_replica_set(
            replica_set, host_utils.REPLICA_ROLE_MASTER)
        master_conn = mysql_lib.connect_mysql(master, role='dbascript')
        cursor = master_conn.cursor()

        sql = ('UPDATE {db}.{tbl} '
               'SET lock_active = NULL, released = NOW() '
               'WHERE expires < NOW() AND lock_active IS NOT NULL'
               '').format(db=mysql_lib.METADATA_DB,
                          tbl=CSV_BACKUP_LOCK_TABLE_NAME)
        cursor.execute(sql)
        master_conn.commit()
        log.debug(cursor._executed)
def drop_db_after_rename(instance, dbs, verbose, dry_run):
    """ Drop the original empty db and a non-empty rename db

    Args:
    instance - a hostaddr object
    dbs -  a set of database names
    verbose - bool, will direct sql to stdout
    dry_run - bool, will make no changes to the database
    """

    # confirm db is not in zk and not in use
    orphaned, _, _ = find_shard_mismatches.find_shard_mismatches(instance)
    instance_orphans = orphaned[instance.__str__()]
    unexpected = dbs.difference(instance_orphans)
    if unexpected:
        print ''.join(("Cowardly refusing to act on the following dbs: ",
                       ','.join(unexpected)))
        sys.exit(1)

    # make sure the original db is empty
    for db in dbs:
        if mysql_lib.get_tables(instance, db):
            print ''.join(("Cowardly refusing to drop non-empty db:",
                           db))
            sys.exit(1)

    conn = mysql_lib.connect_mysql(instance)
    cursor = conn.cursor()
    for db in dbs:
        # we should be good to drop the old empty dbs
        raw_sql = 'DROP DATABASE IF EXISTS `{db}`;'
        sql = raw_sql.format(db=db)
        if verbose:
            print sql
        if not dry_run:
            cursor.execute(sql)

        # and we should be ok to drop the non-empty 'dropme_' prepended db
        renamed_db = ''.join((DB_PREPEND, db))
        sql = raw_sql.format(db=renamed_db)
        if verbose:
            print sql
        if not dry_run:
            cursor.execute(sql)
Example #36
def log_table_sizes(port):
    """ Determine and record the size of tables on a MySQL instance

    Args:
    port - int
    """
    instance = host_utils.HostAddr(':'.join((host_utils.HOSTNAME, str(port))))
    zk = host_utils.MysqlZookeeper()

    replica_set = zk.get_replica_set_from_instance(instance)
    master = zk.get_mysql_instance_from_replica_set(
        replica_set, host_utils.REPLICA_ROLE_MASTER)
    if not mysql_lib.does_table_exist(master, mysql_lib.METADATA_DB,
                                      TABLE_SIZE_TBL):
        create_table_size_table(master)

    sizes = get_all_table_sizes(instance)
    conn = mysql_lib.connect_mysql(master, 'dbascript')
    for db in sizes:
        for table in sizes[db]:
            for partition in sizes[db][table]:
                cursor = conn.cursor()
                sql = ('REPLACE INTO {metadata_db}.{tbl} '
                       'SET '
                       'hostname = %(hostname)s, '
                       'port = %(port)s, '
                       'db = %(db)s, '
                       'table_name = %(table)s, '
                       'partition_name = %(partition)s, '
                       'reported_at = CURDATE(), '
                       'size_mb = %(size)s ')
                cursor.execute(
                    sql.format(metadata_db=mysql_lib.METADATA_DB,
                               tbl=TABLE_SIZE_TBL), {
                                   'hostname': instance.hostname,
                                   'port': instance.port,
                                   'db': db,
                                   'table': table,
                                   'partition': partition,
                                   'size': sizes[db][table][partition]
                               })
                conn.commit()
                log.info(cursor._executed)
                cursor.close()
def drop_db_after_rename(instance, dbs, verbose, dry_run):
    """ Drop the original empty db and a non-empty rename db

    Args:
    instance - a hostaddr object
    dbs -  a set of database names
    verbose - bool, will direct sql to stdout
    dry_run - bool, will make no changes to the database
    """

    # confirm db is not in zk and not in use
    orphaned, _, _ = find_shard_mismatches.find_shard_mismatches(instance)
    instance_orphans = orphaned[instance.__str__()]
    unexpected = dbs.difference(instance_orphans)
    if unexpected:
        print ''.join(("Cowardly refusing to act on the following dbs: ",
                       ','.join(unexpected)))
        sys.exit(1)

    # make sure the original db is empty
    conn = mysql_lib.connect_mysql(instance)
    cursor = conn.cursor()
    for db in dbs:
        if mysql_lib.get_tables(conn, db):
            print ''.join(("Cowardly refusing to drop non-empty db:",
                           db))
            sys.exit(1)

    for db in dbs:
        # we should be good to drop the old empty dbs
        raw_sql = 'DROP DATABASE IF EXISTS `{db}`;'
        sql = raw_sql.format(db=db)
        if verbose:
            print sql
        if not dry_run:
            cursor.execute(sql)

        # and we should be ok to drop the non-empty 'dropme_' prepended db
        renamed_db = ''.join((DB_PREPEND, db))
        sql = raw_sql.format(db=renamed_db)
        if verbose:
            print sql
        if not dry_run:
            cursor.execute(sql)
    def mysql_backup_csv_tables(self):
        """ Worker for backing up a queue of tables """
        proc_id = multiprocessing.current_process().name
        conn = mysql_lib.connect_mysql(self.instance,
                                       backup.USER_ROLE_MYSQLDUMP)
        mysql_lib.start_consistent_snapshot(conn,
                                            read_only=True,
                                            session_id=self.session_id)
        pitr_data = mysql_lib.get_pitr_data(self.instance)
        err_count = 0
        while not (self.tables_to_backup.empty()
                   and self.tables_to_retry.empty()):
            table_tuple = self.tables_to_retry.get() if not self.tables_to_retry.empty() \
                else self.tables_to_backup.get()
            try:
                # if this is a partitioned table, and it is already
                # being backed up on some other host, we do not want to attempt
                # to back it up here.
                #
                if table_tuple[1] and self.partition_lock_exists(table_tuple):
                    log.debug('Partitioned table {} is already being '
                              'backed up elsewhere, so we cannot do it '
                              'here.'.format(table_tuple[0]))
                else:
                    self.mysql_backup_csv_table_wrapper(
                        table_tuple, conn, pitr_data)

                self.table_count = self.table_count + 1
                if (self.table_count % 50) == 0:
                    self.release_expired_locks()
            except:
                self.tables_to_retry.put(table_tuple)
                log.error('{proc_id}: Could not dump {tbl}, partition {p} - '
                          'error: {e}'.format(tbl=table_tuple[0],
                                              p=table_tuple[2],
                                              e=traceback.format_exc(),
                                              proc_id=proc_id))
                err_count = err_count + 1
                if err_count > MAX_THREAD_ERROR:
                    log.error('{}: Error count in thread > MAX_THREAD_ERROR. '
                              'Aborting :('.format(proc_id))
                    return
def log_table_sizes(port):
    """ Determine and record the size of tables on a MySQL instance

    Args:
    port - int
    """
    instance = host_utils.HostAddr(':'.join((host_utils.HOSTNAME, str(port))))
    zk = host_utils.MysqlZookeeper()
    replica_set = instance.get_zk_replica_set()[0]
    master = zk.get_mysql_instance_from_replica_set(replica_set,
                                                    host_utils.REPLICA_ROLE_MASTER)
    if not mysql_lib.does_table_exist(master,
                                      mysql_lib.METADATA_DB,
                                      TABLE_SIZE_TBL):
        create_table_size_table(master)

    sizes = get_all_table_sizes(instance)
    conn = mysql_lib.connect_mysql(master, 'scriptrw')
    for db in sizes:
        for table in sizes[db]:
            for partition in sizes[db][table]:
                cursor = conn.cursor()
                sql = ('REPLACE INTO {metadata_db}.{tbl} '
                       'SET '
                       'hostname = %(hostname)s, '
                       'port = %(port)s, '
                       'db = %(db)s, '
                       'table_name = %(table)s, '
                       'partition_name = %(partition)s, '
                       'reported_at = CURDATE(), '
                       'size_mb = %(size)s ')
                cursor.execute(sql.format(metadata_db=mysql_lib.METADATA_DB,
                               tbl=TABLE_SIZE_TBL),
                               {'hostname': instance.hostname,
                                'port': instance.port,
                                'db': db,
                                'table': table,
                                'partition': partition,
                                'size': sizes[db][table][partition]})
                conn.commit()
                log.info(cursor._executed)
                cursor.close()
Example #40
def verify_blackhole_dbs(destination, non_mig_databases):
    """ Confirm that non migrated tables have no non-blackhole tables

    Args:
    destination - A hostaddr object
    non_mig_databases - A set of dbs to check
    """
    conn = mysql_lib.connect_mysql(destination)
    cursor = conn.cursor()
    query = ("SELECT COUNT(*) AS 'tbls' "
             "FROM information_schema.tables "
             "WHERE ENGINE !='BLACKHOLE'"
             " AND TABLE_SCHEMA=%(db)s")

    for db in non_mig_databases:
        cursor.execute(query, {'db': db})
        check = cursor.fetchone()
        if check['tbls']:
            raise Exception('Blackhole db {db} has non blackhole table on '
                            'instance {i}'
                            ''.format(db=db, i=destination))
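
A hedged pre-migration sketch; all_dbs and migrated_dbs are assumed sets of
database names.

non_mig_databases = all_dbs.difference(migrated_dbs)   # assumed inputs
verify_blackhole_dbs(destination, non_mig_databases)
log.info('Non-migrated dbs on {d} hold only BLACKHOLE tables'
         ''.format(d=destination))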
Example #41
def confirm_replica_topology(master, replicas):
    """ Confirm that replica servers are actually replicating off of a master

    Args:
    master - A hostaddr object for the master instance
    replicas - A set of hostaddr objects for the replica instances
    """
    for replica in replicas:
        conn = mysql_lib.connect_mysql(replica)
        ss = mysql_lib.get_slave_status(conn)
        repl_master = host_utils.HostAddr(':'.join(
            (ss['Master_Host'], str(ss['Master_Port']))))
        if repl_master != master:
            raise Exception('Slave {replica} is not a replica of master '
                            '{master}, but is instead a replica of '
                            '{repl_master}'.format(replica=replica,
                                                   repl_master=repl_master,
                                                   master=master))
        else:
            log.info('Replica {replica} is replicating from expected master '
                     'server {master}'.format(replica=replica, master=master))
    def release_db_backup_lock(self, lock_identifier):
        """ Release a backup lock created by take_backup_lock

        Args:
        lock_identifier - a uuid to identify a lock row
        """
        zk = host_utils.MysqlZookeeper()
        (replica_set, _) = self.instance.get_zk_replica_set()
        master = zk.get_mysql_instance_from_replica_set(replica_set, host_utils.REPLICA_ROLE_MASTER)
        master_conn = mysql_lib.connect_mysql(master, role='scriptrw')
        cursor = master_conn.cursor()

        params = {'lock_identifier': lock_identifier}
        # SET assignments are comma-separated; using AND here would be
        # parsed as a boolean expression and leave "released" unset.
        sql = ('UPDATE {db}.{tbl} '
               'SET lock_active = NULL, released = NOW() '
               'WHERE lock_identifier = %(lock_identifier)s'
               '').format(db=mysql_lib.METADATA_DB,
                          tbl=CSV_BACKUP_LOCK_TABLE_NAME)
        cursor.execute(sql, params)
        master_conn.commit()
        log.debug(cursor._executed)
Example #43
def write_checksum_status(instance, data):
    """ Args:
            instance: Host info for the master that we'll connect to.
            data: A dictionary containing the row to insert.  See
                  the table definition at the top of the script for info.

        Returns: Nothing
    """
    conn = None
    try:
        conn = mysql_lib.connect_mysql(instance, 'scriptrw')
        cursor = conn.cursor()
        sql = ("INSERT INTO test.checksum_detail SET "
               "reported_at=NOW(), "
               "instance=%(instance)s, "
               "master_instance=%(master_instance)s, "
               "db=%(db)s, tbl=%(tbl)s, "
               "elapsed_time_ms=%(elapsed_time_ms)s, "
               "chunk_count=%(chunk_count)s, "
               "chunk_errors=%(chunk_errors)s, "
               "chunk_diffs=%(chunk_diffs)s, "
               "chunk_skips=%(chunk_skips)s, "
               "row_count=%(row_count)s, "
               "row_diffs=%(row_diffs)s, "
               "rows_checked=%(rows_checked)s, "
               "checksum_status=%(checksum_status)s, "
               "checksum_cmd=%(checksum_cmd)s, "
               "checksum_stdout=%(checksum_stdout)s, "
               "checksum_stderr=%(checksum_stderr)s, "
               "checksum_rc=%(checksum_rc)s, "
               "sync_cmd=%(sync_cmd)s, "
               "sync_stdout=%(sync_stdout)s, "
               "sync_stderr=%(sync_stderr)s, "
               "sync_rc=%(sync_rc)s")
        cursor.execute(sql, data)
    except Exception as e:
        log.error("Unable to write to the database: {e}".format(e=e))
    finally:
        # conn stays None if connect_mysql itself failed.
        if conn:
            conn.commit()
            conn.close()
Example #44
def confirm_replica_topology(master, replicas):
    """ Confirm that replica servers are actually replicating off of a master

    Args:
    master - A hostaddr object for the master instance
    replicas - A set of hostaddr objects for the replica instance
    """
    for replica in replicas:
        conn = mysql_lib.connect_mysql(replica)
        ss = mysql_lib.get_slave_status(conn)
        repl_master = host_utils.HostAddr(':'.join((ss['Master_Host'],
                                                    str(ss['Master_Port']))))
        if repl_master != master:
            raise Exception('Slave {replica} is not a replica of master '
                            '{master}, but is instead a replica of '
                            '{repl_master}'.format(replica=replica,
                                                   repl_master=repl_master,
                                                   master=master))
        else:
            log.info('Replica {replica} is replicating from expected master '
                     'server {master}'.format(replica=replica,
                                              master=master))
    def release_db_backup_lock(self, lock_identifier):
        """ Release a backup lock created by take_backup_lock

        Args:
        lock_identifier - a uuid to identify a lock row
        """
        zk = host_utils.MysqlZookeeper()
        (replica_set, _) = zk.get_replica_set_from_instance(self.instance)
        master = zk.get_mysql_instance_from_replica_set(
            replica_set, host_utils.REPLICA_ROLE_MASTER)
        master_conn = mysql_lib.connect_mysql(master, role='scriptrw')
        cursor = master_conn.cursor()

        params = {'lock_identifier': lock_identifier}
        # As above, the SET assignments must be comma-separated, not
        # joined with AND.
        sql = ('UPDATE {db}.{tbl} '
               'SET lock_active = NULL, released = NOW() '
               'WHERE lock_identifier = %(lock_identifier)s'
               '').format(db=mysql_lib.METADATA_DB,
                          tbl=CSV_BACKUP_LOCK_TABLE_NAME)
        cursor.execute(sql, params)
        master_conn.commit()
        log.debug(cursor._executed)
def get_master_mysql_major_version(instance):
    """ Given an instance, determine the mysql major version for the master
        of the replica set.

    Args:
    instance - a hostaddr object

    Returns - A string similar to '5.5' or '5.6'
   """
    zk = host_utils.MysqlZookeeper()
    master = zk.get_mysql_instance_from_replica_set(instance.get_zk_replica_set()[0],
                                                    repl_type=host_utils.REPLICA_ROLE_MASTER)
    try:
        master_conn = mysql_lib.connect_mysql(master)
    except _mysql_exceptions.OperationalError:
        raise Exception('Could not connect to master server {instance} in '
                        'order to determine MySQL version to launch with. '
                        'Perhaps run this script from there? This is likely '
                        'due to firewall rules.'
                        ''.format(instance=instance.hostname))
    mysql_version = mysql_lib.get_global_variables(master_conn)['version'][:3]
    return mysql_version
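# Illustration only (not from the original script): the [:3] slice above
# maps a full MySQL version string to its major.minor pair, which is all
# the caller needs to pick a server binary.
assert '5.6.27-76.0-log'[:3] == '5.6'
assert '5.5.42'[:3] == '5.5'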
    def mysql_backup_csv_dbs(self):
        """ Worker for backing up a queue of dbs """
        proc_id = multiprocessing.current_process().name
        conn = mysql_lib.connect_mysql(self.instance, backup.USER_ROLE_MYSQLDUMP)
        mysql_lib.start_consistent_snapshot(conn, read_only=True)
        pitr_data = mysql_lib.get_pitr_data(self.instance)
        err_count = 0
        while not self.dbs_to_backup.empty():
            db = self.dbs_to_backup.get()
            try:
                self.mysql_backup_csv_db(db, conn, pitr_data)
            except:
                self.dbs_to_backup.put(db)
                log.error('{proc_id}: Could not dump {db}, '
                          'error: {e}'.format(db=db,
                                              e=traceback.format_exc(),
                                              proc_id=proc_id))
                err_count = err_count + 1
                if err_count > MAX_THREAD_ERROR:
                    log.error('{proc_id}: Error count in thread > MAX_THREAD_ERROR. '
                              'Aborting :('.format(proc_id=proc_id))
                    return
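    # Hedged sketch (not part of the original class): launch several of
    # these workers against the shared dbs_to_backup queue, then wait for
    # them to drain it. num_workers is an assumption.
    def start_backup_workers(self, num_workers=4):
        workers = []
        for _ in range(num_workers):
            p = multiprocessing.Process(target=self.mysql_backup_csv_dbs)
            p.start()
            workers.append(p)
        for p in workers:
            p.join()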
def sanity_check_replica(instance):
    """ Make sure a slave is slaving and relatively caught up

    Args:
    instance - A hostaddr object

    Returns:
    A hostaddr object of master of the instance argument
    """
    # Test to see if the slave is setup for replication. If not, we are hosed
    conn = mysql_lib.connect_mysql(instance)
    try:
        mysql_lib.get_master_status(conn)
    except mysql_lib.ReplicationError:
        raise Exception('{instance} is not setup to write replication '
                        'logs!'.format(instance=instance))

    replication = mysql_lib.calc_slave_lag(instance)
    if replication['ss']['Slave_SQL_Running'] != 'Yes':
        raise Exception('SQL thread is not running on {instance}'
                        ''.format(instance=instance))

    if replication['ss']['Slave_IO_Running'] != 'Yes':
        raise Exception('IO thread is not running on {instance}'
                        ''.format(instance=instance))

    if replication['sbm'] > mysql_lib.MAX_HEARTBEAT_LAG:
        raise Exception('Heartbeat lag {sbm} > {max_lag} seconds'
                        ''.format(sbm=replication['sbm'],
                                  max_lag=mysql_lib.MAX_HEARTBEAT_LAG))

    if replication['io_bytes'] > mysql_lib.MAX_IO_LAG:
        raise Exception('IO lag {io_bytes} > {max_io} bytes'
                        ''.format(io_bytes=replication['io_bytes'],
                                  max_io=mysql_lib.MAX_IO_LAG))
    master = host_utils.HostAddr(':'.join((replication['ss']['Master_Host'],
                                           str(replication['ss']['Master_Port']))))
    return master
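# Hedged usage sketch; the replica host below is illustrative. A caller
# can gate follow-up work on the health checks and then act on the
# returned master.
replica = host_utils.HostAddr('db-replica-001:3306')
master = sanity_check_replica(replica)
log.info('{replica} is healthy and replicating from '
         '{master}'.format(replica=replica, master=master))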
def get_logged_binlog_uploads(instance):
    """ Get all binlogs that have been logged as uploaded

    Args:
    instance - a hostaddr object to run against and check

    Returns:
    A set of binlog file names
    """
    conn = mysql_lib.connect_mysql(instance, 'scriptro')
    cursor = conn.cursor()
    sql = ("SELECT binlog "
           "FROM {metadata_db}.{tbl} "
           "WHERE hostname = %(hostname)s AND "
           "      port = %(port)s "
           "".format(metadata_db=mysql_lib.METADATA_DB,
                     tbl=environment_specific.BINLOG_ARCHIVING_TABLE_NAME))
    cursor.execute(sql, {'hostname': instance.hostname,
                         'port': str(instance.port)})
    ret = set()
    for binlog in cursor.fetchall():
        ret.add(binlog['binlog'])

    return ret
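# Hedged sketch (not from the original script): diff the logged uploads
# against the binlogs present on disk to find files still needing upload.
# local_binlogs is assumed to be an iterable of binlog file names.
def find_pending_uploads(instance, local_binlogs):
    uploaded = get_logged_binlog_uploads(instance)
    return sorted(set(local_binlogs) - uploaded)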
    def take_backup_lock(self, db):
        """ Write a lock row on to the master

        Args:
        db - the db to be backed up

        Returns:
        a uuid lock identifier
        """
        zk = host_utils.MysqlZookeeper()
        (replica_set, _) = self.instance.get_zk_replica_set()
        master = zk.get_mysql_instance_from_replica_set(replica_set,
                                                        host_utils.REPLICA_ROLE_MASTER)
        master_conn = mysql_lib.connect_mysql(master, role='scriptrw')
        cursor = master_conn.cursor()

        lock_identifier = str(uuid.uuid4())
        log.debug('Taking backup lock: {replica_set} {db} '
                  ''.format(replica_set=replica_set,
                            db=db))
        params = {'lock': lock_identifier,
                  'db': db,
                  'hostname': self.instance.hostname,
                  'port': self.instance.port,
                  'active': ACTIVE}
        sql = ("INSERT INTO {db}.{tbl} "
               "SET "
               "lock_identifier = %(lock)s, "
               "lock_active = %(active)s, "
               "created_at = NOW(), "
               "expires = NOW() + INTERVAL {locks_held_time}, "
               "released = NULL, "
               "db = %(db)s,"
               "hostname = %(hostname)s,"
               "port = %(port)s"
               "").format(db=mysql_lib.METADATA_DB,
                          tbl=CSV_BACKUP_LOCK_TABLE_NAME,
                          locks_held_time=LOCKS_HELD_TIME)
        try:
            cursor.execute(sql, params)
            master_conn.commit()
        except _mysql_exceptions.IntegrityError:
            lock_identifier = None
            sql = ("SELECT hostname, port, expires "
                   "FROM {db}.{tbl} "
                   "WHERE "
                   "    lock_active = %(active)s AND "
                   "    db = %(db)s"
                   "").format(db=mysql_lib.METADATA_DB,
                              tbl=CSV_BACKUP_LOCK_TABLE_NAME)
            cursor.execute(sql,
                           {'db': db, 'active': ACTIVE})
            ret = cursor.fetchone()
            log.debug('DB {db} is already being backed up on {hostname}:{port}, '
                      'lock will expire at {expires}.'
                      ''.format(db=db,
                                hostname=ret['hostname'],
                                port=ret['port'],
                                expires=str(ret['expires'])))

        log.debug(cursor._executed)
        return lock_identifier
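    # Hedged sketch (not from the original script) of the locking protocol
    # implied by take_backup_lock and release_db_backup_lock: skip the db
    # if another worker already holds the lock, and always release on the
    # way out.
    def backup_one_db(self, db, conn, pitr_data):
        lock_identifier = self.take_backup_lock(db)
        if not lock_identifier:
            return  # another host is already backing up this db
        try:
            self.mysql_backup_csv_db(db, conn, pitr_data)
        finally:
            self.release_db_backup_lock(lock_identifier)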
def main():
    description = ("MySQL checksum wrapper\n\n"
                   "Wrapper of pt-table-checksum and pt-table-sync.\n"
                   "Defaults to checksumming 1/{k}th of databases on instance.\n"
                   "If diffs are found, use pt-table-sync to measure actual "
                   "divergence,\nbut only if the number of diffs is between "
                   "--min_diffs and --max_diffs.").format(k=DB_CHECK_FRACTION)

    parser = argparse.ArgumentParser(description=description,
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-i',
                        '--instance',
                        help='Instance to act on if other than localhost:3306',
                        default=''.join((socket.getfqdn(),
                                         ':3306')))
    parser.add_argument('-a',
                        '--all',
                        help='Checksums all dbs rather than the default',
                        action='store_true',
                        default=False)
    parser.add_argument('-d',
                        '--dbs',
                        help=("Comma separated list of db's to check rather "
                              "than the default"),
                        default=False)
    parser.add_argument('-q',
                        '--quiet',
                        help=("Do not print output to stdout"),
                        action='store_true',
                        default=False)
    parser.add_argument('-m',
                        '--min_diffs',
                        help=("Do per-row check if chunk diff count is at "
                              "least this value"),
                        dest='min_diffs',
                        default=MIN_DIFFS)
    parser.add_argument('-M',
                        '--max_diffs',
                        help=("Do not do per-row check if chunk diff count "
                              "is greater than this value"),
                        dest='max_diffs',
                        default=MAX_DIFFS)
    parser.add_argument('-C',
                        '--no_create_table',
                        help=("If test.checksum_detail is missing, do "
                              "not try to create it."),
                        dest='create_table',
                        action='store_false',
                        default=True)
    parser.add_argument('-v',
                        '--verbose',
                        help=("Store raw output from PT tools in the DB?"),
                        action='store_true',
                        default=False)
    parser.add_argument('-c',
                        '--check_fraction',
                        help=('Check this fraction of databases.'),
                        default=DB_CHECK_FRACTION)

    args = parser.parse_args()
    instance = host_utils.HostAddr(args.instance)
    zk = host_utils.MysqlZookeeper()

    if instance not in \
            zk.get_all_mysql_instances_by_type(host_utils.REPLICA_ROLE_MASTER):
        raise Exception("Instance is not a master in ZK")

    # If enabled, try to create the table that holds the checksum info.
    # If not enabled, make sure that the table exists.
    conn = mysql_lib.connect_mysql(instance, 'scriptro')
    if not mysql_lib.does_table_exist(conn, mysql_lib.METADATA_DB, CHECKSUM_TBL):
        if args.create_table:
            create_checksum_detail_table(instance)
        else:
            raise Exception("Checksum table not found.  Unable to continue."
                            "Consider not using the -C option or create it "
                            "yourself.")

    # Determine what replica set we belong to and get a list of slaves.
    replica_set = zk.get_replica_set_from_instance(instance)[0]
    slaves = set()
    for rtype in host_utils.REPLICA_ROLE_SLAVE, host_utils.REPLICA_ROLE_DR_SLAVE:
        s = zk.get_mysql_instance_from_replica_set(replica_set, rtype)
        if s:
            slaves.add(s)

    if len(slaves) == 0:
        log.info("This server has no slaves.  Nothing to do.")
        sys.exit(0)

    # before we even start this, make sure replication is OK.
    for slave in slaves:
        slave_conn = mysql_lib.connect_mysql(slave, 'scriptrw')
        ss = mysql_lib.get_slave_status(slave_conn)
        if ss['Slave_SQL_Running'] != "Yes" or ss['Slave_IO_Running'] != "Yes":
            # The format call must supply every placeholder, including {s}.
            raise Exception("Replication is NOT RUNNING on slave {s}: "
                            "SQL: {st} | IO: {it}".format(s=slave,
                                                          st=ss['Slave_SQL_Running'],
                                                          it=ss['Slave_IO_Running']))

    if args.dbs:
        db_to_check = set(args.dbs.split(','))
    else:
        dbs = mysql_lib.get_dbs(conn)

        if args.all:
            db_to_check = dbs
        else:
            # Default behavior: check a given DB every N days based on the
            # day of year. This minimizes month-boundary issues.
            db_to_check = set()
            check_modulus = int(time.strftime("%j")) % int(args.check_fraction)
            counter = 0
            for db in dbs:
                modulus = counter % int(args.check_fraction)
                if modulus == check_modulus:
                    db_to_check.add(db)
                counter = counter + 1

    # Iterate through the list of DBs and check one table at a time.
    # We do it this way to ensure more coverage in case pt-table-checksum
    # loses its DB connection and errors out before completing a full scan
    # of a given database.
    #
    for db in db_to_check:
        conn = mysql_lib.connect_mysql(instance, 'scriptro')
        tables_to_check = mysql_lib.get_tables(conn, db, skip_views=True)
        for tbl in tables_to_check:
            c_cmd, c_out, c_err, c_ret = checksum_tbl(instance, db, tbl)
            if not args.quiet:
                log.info("Checksum command executed was:\n{cmd}".format(cmd=c_cmd))
                log.info("Standard out:\n{out}".format(out=c_out))
                log.info("Standard error:\n{err}".format(err=c_err))
                log.info("Return code: {ret}".format(ret=c_ret))

            # parse each line of STDOUT (there should only be one with
            # actual data).  We only care about errors, rows, chunks, and
            # skipped, since we'll need to figure out diffs separately for
            # each slave box.
            for line in c_out.split("\n"):
                results = parse_checksum_row(line)
                if results:
                    chunk_errors = int(results[1])
                    row_count = int(results[3])
                    chunk_count = int(results[4])
                    chunk_skips = int(results[5])

                    for slave in slaves:
                        rows_checked = 'NO'
                        sync_cmd = ""
                        sync_out = ""
                        sync_err = ""
                        sync_ret = -1
                        row_diffs = 0

                        elapsed_time_ms, chunk_diffs = check_one_replica(
                            slave, db, tbl)

                        # If we skipped some chunks or there were errors, we
                        # can't have complete information about the state of
                        # the replica. In the case of a hard error, we'll
                        # just stop. In the case of a skipped chunk, we treat
                        # it as a diff chunk when deciding whether or not to
                        # do a more detailed analysis.
                        #
                        checkable_chunks = chunk_skips + chunk_diffs

                        if chunk_errors > 0:
                            checksum_status = 'ERRORS_IN_CHECKSUM_PROCESS'
                        elif checkable_chunks == 0:
                            checksum_status = 'GOOD'
                        else:
                            if checkable_chunks > int(args.max_diffs):
                                # too many chunk diffs, don't bother checking
                                # further.  not good.
                                checksum_status = 'TOO_MANY_CHUNK_DIFFS'
                            elif checkable_chunks < int(args.min_diffs):
                                # some diffs, but not enough that we care.
                                checksum_status = 'CHUNK_DIFFS_FOUND_BUT_OK'
                            else:
                                start_time = int(time.time()*1000)
                                rows_checked = 'YES'

                                # set the proper status - did we do a sync-based check
                                # because of explicit diffs or because of skipped chunks?
                                if chunk_diffs > 0:
                                    checksum_status = 'ROW_DIFFS_FOUND'
                                else:
                                    checksum_status = 'CHUNKS_WERE_SKIPPED'

                                sync_cmd, sync_out, sync_err, sync_ret, \
                                    row_diffs = checksum_tbl_via_sync(slave,
                                                                      db,
                                                                      tbl)

                                # Add in the time it took to do the sync.
                                elapsed_time_ms += int(time.time()*1000) - start_time

                                if not args.quiet:
                                    log.info("Sync command executed was:\n{cmd} ".format(cmd=sync_cmd))
                                    log.info("Standard out:\n {out}".format(out=sync_out))
                                    log.info("Standard error:\n {err}".format(err=sync_err))
                                    log.info("Return code: {ret}".format(ret=sync_ret))
                                    log.info("Row diffs found: {cnt}".format(cnt=row_diffs))

                        # Checksum process is complete, store the results.
                        #
                        data = {'instance': slave,
                                'master_instance': instance,
                                'db': db,
                                'tbl': tbl,
                                'elapsed_time_ms': elapsed_time_ms,
                                'chunk_count': chunk_count,
                                'chunk_errors': chunk_errors,
                                'chunk_diffs': chunk_diffs,
                                'chunk_skips': chunk_skips,
                                'row_count': row_count,
                                'row_diffs': row_diffs,
                                'rows_checked': rows_checked,
                                'checksum_status': checksum_status,
                                'checksum_cmd': None,
                                'checksum_stdout': None,
                                'checksum_stderr': None,
                                'checksum_rc': c_ret,
                                'sync_cmd': None,
                                'sync_stdout': None,
                                'sync_stderr': None,
                                'sync_rc': sync_ret}

                        if args.verbose:
                            data.update({'checksum_cmd': c_cmd,
                                         'checksum_stdout': c_out,
                                         'checksum_stderr': c_err,
                                         'sync_cmd': sync_cmd,
                                         'sync_stdout': sync_out,
                                         'sync_stderr': sync_err,
                                         'sync_rc': sync_ret})

                        write_checksum_status(instance, data)

        conn.close()
Example #52
def mysql_failover(master, dry_run, skip_lock,
                   ignore_dr_slave, trust_me_its_dead, kill_old_master):
    """ Promte a new MySQL master

    Args:
    master - Hostaddr object of the master instance to be demoted
    dry_run - Do not change state, just do sanity testing and exit
    skip_lock - Do not take a promotion lock
    ignore_dr_slave - Ignore the existence of a dr_slave
    trust_me_its_dead - Do not test to see if the master is dead
    kill_old_master - Send a mysqladmin kill command to the old master

    Returns:
    new_master - The new master server
    """
    log.info('Master to demote is {master}'.format(master=master))

    zk = host_utils.MysqlZookeeper()
    (replica_set, _) = zk.get_replica_set_from_instance(master, rtypes=['master'])
    log.info('Replica set is detected as '
             '{replica_set}'.format(replica_set=replica_set))

    # take a lock here to make sure nothing changes underneath us
    if not skip_lock and not dry_run:
        log.info('Taking promotion lock on replica set')
        lock_identifier = get_promotion_lock(replica_set)
    else:
        lock_identifier = None

    # Giant try: if there are any problems, we roll back from the except.
    try:
        master_conn = False
        slave = zk.get_mysql_instance_from_replica_set(replica_set=replica_set,
                                                       repl_type=host_utils.REPLICA_ROLE_SLAVE)
        log.info('Slave/new master is detected as {slave}'.format(slave=slave))

        if ignore_dr_slave:
            log.info('Intentionally ignoring a dr_slave')
            dr_slave = None
        else:
            dr_slave = zk.get_mysql_instance_from_replica_set(replica_set,
                                                              host_utils.REPLICA_ROLE_DR_SLAVE)
        log.info('DR slave is detected as {dr_slave}'.format(dr_slave=dr_slave))
        if dr_slave:
            if dr_slave == slave:
                raise Exception('Slave and dr_slave appear to be the same')

            replicas = set([slave, dr_slave])
        else:
            replicas = set([slave])

        # let's make sure that what we think is the master, actually is
        confirm_replica_topology(master, replicas)

        # We use master_conn as a mysql connection to the master server, if
        # it is False, the master is dead
        if trust_me_its_dead:
            master_conn = None
        else:
            master_conn = is_master_alive(master, replicas)
        slave_conn = mysql_lib.connect_mysql(slave)

        # Test to see if the slave is setup for replication. If not, we are hosed
        log.info('Testing to see if Slave/new master is setup to write '
                 'replication logs')
        try:
            mysql_lib.get_master_status(slave_conn)
        except mysql_lib.ReplicationError:
            log.error('New master {slave} is not setup to write replication '
                      'logs!'.format(slave=slave))
            raise
        log.info('Slave/new master is setup to write replication logs')

        if kill_old_master:
            log.info('Killing old master, we hope you know what you are doing')
            mysql_lib.shutdown_mysql(master)
            master_conn = None

        if master_conn:
            log.info('Master is considered alive')
            dead_master = False
            confirm_max_replica_lag(replicas, MAX_ALIVE_MASTER_SLAVE_LAG_SECONDS,
                                    dead_master=dead_master)
        else:
            log.info('Master is considered dead')
            dead_master = True
            confirm_max_replica_lag(replicas, MAX_DEAD_MASTER_SLAVE_LAG_SECONDS,
                                    dead_master=dead_master)

        if dry_run:
            log.info('In dry_run mode, so exiting now')
            # Using os._exit in order to not get caught by the giant try
            os._exit(0)

        log.info('Preliminary sanity checks complete, starting promotion')

        if master_conn:
            log.info('Setting read_only on master')
            mysql_lib.set_global_variable(master_conn, 'read_only', True)
            log.info('Confirming no writes to old master')
            # If there are writes with the master in read_only mode then the
            # promotion can not proceed.
            # A likely reason is a client has the SUPER privilege.
            confirm_no_writes(master_conn)
            log.info('Waiting for replicas to be caught up')
            confirm_max_replica_lag(replicas, 0,
                                    timeout=MAX_ALIVE_MASTER_SLAVE_LAG_SECONDS,
                                    dead_master=dead_master)
            log.info('Setting up replication from old master ({master}) '
                     'to new master ({slave})'.format(master=master,
                                                      slave=slave))
            mysql_lib.setup_replication(new_master=slave, new_replica=master)
        else:
            log.info('Starting up a zk connection to make sure we can connect')
            kazoo_client = environment_specific.get_kazoo_client()
            if not kazoo_client:
                raise Exception('Could not connect to zk')

            log.info('Confirming replica has processed all replication '
                     'logs')
            confirm_no_writes(slave_conn)
            log.info('Looks like no writes being processed by replica via '
                     'replication or other means')
            if len(replicas) > 1:
                log.info('Confirming replica servers are in sync')
                confirm_max_replica_lag(replicas, MAX_DEAD_MASTER_SLAVE_LAG_SECONDS,
                                        replicas_synced=True,
                                        dead_master=dead_master)
    except:
        log.info('Starting rollback')
        if master_conn:
            log.info('Releasing read_only on old master')
            mysql_lib.set_global_variable(master_conn, 'read_only', False)

            log.info('Clearing replication settings on old master')
            mysql_lib.reset_slave(master_conn)
        if lock_identifier:
            log.info('Releasing promotion lock')
            release_promotion_lock(lock_identifier)
        log.info('Rollback complete, reraising exception')
        raise

    if dr_slave:
        try:
            mysql_lib.setup_replication(new_master=slave, new_replica=dr_slave)
        except Exception as e:
            log.error(e)
            log.error('Setting up replication on the dr_slave failed. '
                      'Failing forward!')

    log.info('Updating zk')
    zk_write_attempt = 0
    while True:
        try:
            modify_mysql_zk.swap_master_and_slave(slave, dry_run=False)
            break
        except:
            if zk_write_attempt > MAX_ZK_WRITE_ATTEMPTS:
                log.info('Final failure writing to zk, bailing')
                raise
            else:
                log.info('Write to zk failed, trying again')
                zk_write_attempt = zk_write_attempt + 1

    log.info('Removing read_only from new master')
    mysql_lib.set_global_variable(slave_conn, 'read_only', False)
    log.info('Removing replication configuration from new master')
    mysql_lib.reset_slave(slave_conn)
    if lock_identifier:
        log.info('Releasing promotion lock')
        release_promotion_lock(lock_identifier)

    log.info('Failover complete')

    if not master_conn:
        log.info('As the master is dead, will try to launch a replacement. '
                 'Will sleep 20 seconds first to let things settle')
        time.sleep(20)
        launch_replacement_db_host.launch_replacement_db_host(master)
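
# Hedged usage sketch; the hostname is illustrative. Rehearse the promotion
# with dry_run=True (which exits the process after the sanity checks) before
# running the real failover.
old_master = host_utils.HostAddr('db-old-master:3306')
mysql_failover(old_master, dry_run=True, skip_lock=False,
               ignore_dr_slave=False, trust_me_its_dead=False,
               kill_old_master=False)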