Example #1
def main():
    """
    Parse argument as command and execute that command with
    parameters containing the state of MySQL, ContainerPilot, etc.
    Default behavior is to run `pre_start` DB initialization.
    """
    if len(sys.argv) == 1:
        consul = Consul(envs={'CONSUL': os.environ.get('CONSUL', 'consul')})
        cmd = pre_start
    else:
        consul = Consul()
        try:
            cmd = globals()[sys.argv[1]]
        except KeyError:
            log.error('Invalid command: %s', sys.argv[1])
            sys.exit(1)

    my = MySQL()
    cp = ContainerPilot()
    cp.load()

    # what storage backend did we use?
    driver = os.environ.get('BACKUP_DRIVER', 'manta').lower()
    if driver == 'manta':
        storage = Manta()
    elif driver == 's3':
        storage = S3()
    elif driver == 'scp':
        storage = SCP()
    else:
        storage = None

    node = Node(mysql=my, consul=consul, storage=storage, cp=cp)

    cmd(node)
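
The `globals()[sys.argv[1]]` lookup dispatches the first command-line argument to the module-level function of the same name. A minimal, self-contained sketch of that pattern (the `pre_start` and `health` bodies here are stand-ins, not the project's real handlers):

import sys

def pre_start():
    print('running DB initialization...')

def health():
    print('running health check...')

def main():
    # default to pre_start, as the examples above do
    name = sys.argv[1] if len(sys.argv) > 1 else 'pre_start'
    try:
        cmd = globals()[name]
    except KeyError:
        sys.exit('Invalid command: %s' % name)
    cmd()

if __name__ == '__main__':
    main()
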
Example #2
def on_change(node):
    """ The top-level ContainerPilot onChange handler """

    # first check if this node has already been set primary by a completed
    # call to failover and update the ContainerPilot config as needed.
    if node.is_primary():
        log.debug('[on_change] this node is primary, no failover required.')
        if node.cp.update():
            # we're ignoring the lock here intentionally
            node.consul.put(PRIMARY_KEY, node.name)
            node.cp.reload()
        return

    # check if another node has been set primary already and is reporting
    # as healthy, in which case there's no failover required. Note that
    # we can't simply check if we're a replica via .is_replica() b/c that
    # trusts mysqld's view of the world.
    try:
        node.consul.get_primary(timeout=1)
        log.debug('[on_change] primary is already healthy, no failover required')
        return
    except (UnknownPrimary, WaitTimeoutError) as ex:
        log.debug('[on_change] no primary from consul: %s', ex)

    if node.consul.lock_failover(node.name):
        try:
            nodes = node.consul.client.health.service(REPLICA, passing=True)[1]
            ips = [instance['Service']['Address'] for instance in nodes]
            log.info('[on_change] Executing failover with candidates: %s', ips)
            node.mysql.failover(ips)
        except Exception:
            # On failure we bubble-up the exception and fail the onChange.
            # Either another instance that didn't overlap in time will
            # complete failover or we'll be left w/o a primary and require
            # manual intervention via `mysqlrpladmin failover`
            node.consul.unlock_failover()
            raise
    else:
        log.info('[on_change] Failover in progress on another node, '
                 'waiting to complete.')
        node.consul.wait_for_failover_lock()

    # need to determine replication status at this point, so make
    # sure we refresh .state from mysqld/Consul
    node.cp.state = UNASSIGNED
    if node.is_primary():
        log.info('[on_change] node %s is primary after failover', node.name)
        if node.cp.update():
            # we're intentionally ignoring the advisory lock here
            ok = node.consul.put(PRIMARY_KEY, node.name)
            log.debug('[on_change] %s obtained lock: %s', node.name, ok)
            node.cp.reload()
        return
    elif node.is_replica():
        log.info('[on_change] node %s is replica after failover', node.name)

    if node.cp.state == UNASSIGNED:
        log.error('[on_change] this node is neither primary nor replica '
                  'after failover; check replication status on cluster.')
        sys.exit(1)
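
`lock_failover`, `unlock_failover`, and `wait_for_failover_lock` are wrappers on this project's Consul class. A plausible sketch of the underlying primitive with the python-consul client, using a TTL session so the advisory lock self-releases if the locking node dies mid-failover (the key name and TTL are assumptions):

import consul

FAILOVER_KEY = 'service/mysql/failover'  # hypothetical key name

def lock_failover(node_name):
    """Try to take the failover lock; return the session id or None."""
    c = consul.Consul()
    # behavior='delete' removes the key when the session expires
    session_id = c.session.create(behavior='delete', ttl=120)
    if c.kv.put(FAILOVER_KEY, node_name, acquire=session_id):
        return session_id
    c.session.destroy(session_id)
    return None

def unlock_failover(session_id):
    """Destroying the session releases the lock and deletes the key."""
    consul.Consul().session.destroy(session_id)
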
Example #3
def main():
    """
    Parse argument as command and execute that command with
    parameters containing the state of MySQL, ContainerPilot, etc.
    Default behavior is to run `pre_start` DB initialization.
    """
    if len(sys.argv) == 1:
        consul = Consul(envs={'CONSUL': os.environ.get('CONSUL', 'consul')})
        cmd = pre_start
    else:
        consul = Consul()
        try:
            cmd = globals()[sys.argv[1]]
        except KeyError:
            log.error('Invalid command: %s', sys.argv[1])
            sys.exit(1)

    storage_class = env('BACKUP_STORAGE_CLASS', 'manager.libmanta.Manta')

    my = MySQL()
    backup_store = get_class(storage_class)()
    cp = ContainerPilot()
    cp.load()
    node = Node(mysql=my, consul=consul, backup_store=backup_store, cp=cp)

    cmd(node)
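
`get_class` isn't shown in these examples; a reasonable implementation resolves the dotted path from `BACKUP_STORAGE_CLASS` (for example `manager.libmanta.Manta`) with importlib. This sketch is an assumption about its behavior, not the project's actual code:

import importlib

def get_class(dotted_path):
    """Resolve 'package.module.ClassName' to the class object."""
    module_path, _, class_name = dotted_path.rpartition('.')
    module = importlib.import_module(module_path)
    return getattr(module, class_name)

# usage, mirroring the example above:
# backup_store = get_class('manager.libmanta.Manta')()
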
Example #4
def on_change(node):
    """ The top-level ContainerPilot onChange handler """

    # first check if this node has already been set primary by a completed
    # call to failover and update the ContainerPilot config as needed.
    if node.is_primary():
        log.debug('[on_change] this node is primary, no failover required.')
        if node.cp.update():
            # we're ignoring the lock here intentionally
            node.consul.put(PRIMARY_KEY, node.name)
            node.cp.reload()
        return

    # check if another node has been set primary already and is reporting
    # as healthy, in which case there's no failover required. Note that
    # we can't simply check if we're a replica via .is_replica() b/c that
    # trusts mysqld's view of the world.
    try:
        node.consul.get_primary(timeout=1)
        log.debug('[on_change] primary is already healthy, no failover required')
        return
    except (UnknownPrimary, WaitTimeoutError) as ex:
        log.debug('[on_change] no primary from consul: %s', ex)

    if node.consul.lock_failover(node.name):
        try:
            nodes = node.consul.client.health.service(REPLICA, passing=True)[1]
            ips = [instance['Service']['Address'] for instance in nodes]
            log.info('[on_change] Executing failover with candidates: %s', ips)
            node.mysql.failover(ips)
        except Exception:
            # On failure we bubble-up the exception and fail the onChange.
            # Either another instance that didn't overlap in time will
            # complete failover or we'll be left w/o a primary and require
            # manual intervention via `mysqlrpladmin failover`
            node.consul.unlock_failover()
            raise
    else:
        log.info('[on_change] Failover in progress on another node, '
                 'waiting to complete.')
        node.consul.wait_for_failover_lock()

    # need to determine replication status at this point, so make
    # sure we refresh .state from mysqld/Consul
    node.cp.state = UNASSIGNED
    if node.is_primary():
        log.info('[on_change] node %s is primary after failover', node.name)
        if node.cp.update():
            # we're intentionally ignoring the advisory lock here
            ok = node.consul.put(PRIMARY_KEY, node.name)
            log.debug('[on_change] %s obtained lock: %s', node.name, ok)
            node.cp.reload()
        return
    elif node.is_replica():
        log.info('[on_change] node %s is replica after failover', node.name)

    if node.cp.state == UNASSIGNED:
        log.error('[on_change] this node is neither primary nor replica '
                  'after failover; check replication status on cluster.')
        sys.exit(1)
Example #5
def main():
    """
    Parse argument as command and execute that command with
    parameters containing the state of MySQL, ContainerPilot, etc.
    Default behavior is to run `pre_start` DB initialization.
    """
    if len(sys.argv) == 1:
        consul = Consul(envs={'CONSUL': os.environ.get('CONSUL', 'consul')})
        cmd = pre_start
    else:
        consul = Consul()
        try:
            cmd = globals()[sys.argv[1]]
        except KeyError:
            log.error('Invalid command: %s', sys.argv[1])
            sys.exit(1)

    my = MySQL()

    snapshot_backend = os.environ.get('SNAPSHOT_BACKEND', 'manta')
    if snapshot_backend == 'local':
        snaps = Local()
    elif snapshot_backend == 'minio':
        snaps = Minio()
    else:
        snaps = Manta()

    cp = ContainerPilot()
    cp.load()
    node = Node(mysql=my, consul=consul, snaps=snaps, cp=cp)

    cmd(node)

    my.close()
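
The if/elif chains in Examples #1 and #5 can also be written as a lookup table, which keeps the default in one obvious place. A sketch assuming the same `Local`, `Minio`, and `Manta` classes the example imports:

import os

SNAPSHOT_BACKENDS = {'local': Local, 'minio': Minio, 'manta': Manta}

def get_snapshot_backend(envs=os.environ):
    name = envs.get('SNAPSHOT_BACKEND', 'manta').lower()
    # unknown values fall back to Manta, as the elif chain above does
    return SNAPSHOT_BACKENDS.get(name, Manta)()
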
Example #6
    def set_timezone_info(self):
        """
        Write TZ data to mysqld by piping mysql_tzinfo_to_sql to the mysql
        client. This is kinda gross but piping it avoids having to parse the
        output for a bulk insert with the Connector/MySQL client.
        """
        try:
            subprocess.check_output(
                '/usr/bin/mysql_tzinfo_to_sql /usr/share/zoneinfo | '
                '/usr/bin/mysql -uroot --protocol=socket '
                '--socket=/var/run/mysqld/mysqld.sock', shell=True)
        except (subprocess.CalledProcessError, OSError) as ex:
            log.error('mysql_tzinfo_to_sql returned error: %s', ex)
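
As written, `check_output` receives a single pipeline string, which only works through a shell (hence the `shell=True` above). A sketch that avoids the shell entirely by wiring the pipe with `Popen`:

import subprocess

def set_timezone_info():
    """Pipe mysql_tzinfo_to_sql into the mysql client without a shell."""
    tzinfo = subprocess.Popen(
        ['/usr/bin/mysql_tzinfo_to_sql', '/usr/share/zoneinfo'],
        stdout=subprocess.PIPE)
    mysql = subprocess.Popen(
        ['/usr/bin/mysql', '-uroot', '--protocol=socket',
         '--socket=/var/run/mysqld/mysqld.sock'],
        stdin=tzinfo.stdout)
    tzinfo.stdout.close()  # let tzinfo see SIGPIPE if mysql exits early
    if mysql.wait() != 0:
        raise subprocess.CalledProcessError(mysql.returncode, 'mysql')
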
Example #7
    def set_timezone_info(self):
        """
        Write TZ data to mysqld by piping mysql_tzinfo_to_sql to the mysql
        client. This is kinda gross but piping it avoids having to parse the
        output for a bulk insert with the Connector/MySQL client.
        """
        try:
            subprocess.check_output(
                '/usr/bin/mysql_tzinfo_to_sql /usr/share/zoneinfo | '
                '/usr/bin/mysql -uroot --protocol=socket '
                '--socket=/var/run/mysqld/mysqld.sock', shell=True)
        except (subprocess.CalledProcessError, OSError) as ex:
            log.error('mysql_tzinfo_to_sql returned error: %s', ex)
Example #8
    def create_repl_user(self, conn):
        """ this user will be used for both replication and backups """
        if not self.repl_user or not self.repl_password:
            log.error('No replication user/password configured.')
            return

        self.add('CREATE USER `{}`@`%` IDENTIFIED BY %s; '
                 .format(self.repl_user), (self.repl_password,))
        self.add('GRANT SUPER, SELECT, INSERT, REPLICATION SLAVE, RELOAD'
                 ', LOCK TABLES, GRANT OPTION, REPLICATION CLIENT'
                 ', DROP, CREATE '
                 'ON *.* TO `{}`@`%`; '
                 .format(self.repl_user))
        self.add('FLUSH PRIVILEGES;')
        self.execute_many(conn=conn)
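
Note the asymmetry in the `CREATE USER` statement: the password goes through a `%s` placeholder so the driver escapes it, while the user name is spliced in with `str.format`, because MySQL drivers can only bind values, not identifiers. A minimal illustration with Connector/Python (credentials and socket path are placeholders):

import mysql.connector

conn = mysql.connector.connect(
    user='root', unix_socket='/var/run/mysqld/mysqld.sock')
cur = conn.cursor()

user, password = 'repl', 's3cret'  # placeholder credentials
# the identifier is formatted in; the value is bound and escaped by the driver
cur.execute("CREATE USER '{}'@'localhost' IDENTIFIED BY %s".format(user),
            (password,))
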
Example #9
    def create_repl_user(self, conn):
        """ this user will be used for both replication and backups """
        if not self.repl_user or not self.repl_password:
            log.error('No replication user/password configured.')
            return

        self.add(
            'CREATE USER `{}`@`%` IDENTIFIED BY %s; '.format(self.repl_user),
            (self.repl_password, ))
        self.add('GRANT SUPER, SELECT, INSERT, REPLICATION SLAVE, RELOAD'
                 ', LOCK TABLES, GRANT OPTION, REPLICATION CLIENT'
                 ', DROP, CREATE '
                 'ON *.* TO `{}`@`%`; '.format(self.repl_user))
        self.add('FLUSH PRIVILEGES;')
        self.execute_many(conn=conn)
Example #10
    def load(self, envs=os.environ):
        """
        Fetches the ContainerPilot config file and asks ContainerPilot
        to render it out so that all environment variables have been
        interpolated.
        """
        self.path = env('CONTAINERPILOT', None, envs)
        try:
            cfg = subprocess.check_output(
                ['containerpilot', '-config', self.path, '-template'],
                env=envs.copy())
        except (subprocess.CalledProcessError, OSError) as ex:
            log.error('containerpilot -template returned error: %s', ex)
            raise

        config = json5.loads(cfg)
        self.config = config
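
ContainerPilot configs are JSON5, which permits comments, unquoted keys, and trailing commas that the standard `json` module rejects; that's why the rendered output is parsed with the `json5` package. A small demonstration:

import json5

cfg = json5.loads('''
{
    // comments are legal in JSON5
    consul: "consul:8500",    // so are unquoted keys
    jobs: [],                 // and trailing commas
}
''')
print(cfg['consul'])  # -> consul:8500
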
Example #11
def health(node):
    """
    The top-level ContainerPilot `health` handler. Runs a simple health check.
    Also acts as a check for whether the ContainerPilot configuration needs
    to be reloaded (if it's been changed externally).
    """

    # Because we need MySQL up to finish initialization, we need to check
    # for each pass thru the health check that we've done so. The happy
    # path is to check a lock file against the node state (which has been
    # set above) and immediately return when we discover the lock exists.
    # Otherwise, we bootstrap the instance for its *current* state.
    assert_initialized_for_state(node)

    if node.is_primary():
        # If this lock is allowed to expire and the health check for the
        # primary fails the `onChange` handlers for the replicas will try
        # to failover and then the primary will obtain a new lock.
        # If this node can update the lock but the DB fails its health check,
        # then the operator will need to manually intervene if they want to
        # force a failover. This architecture is a result of Consul not
        # permitting us to acquire a new lock on a health-checked session if the
        # health check is *currently* failing, but has the happy side-effect of
        # reducing the risk of flapping on a transient health check failure.
        node.consul.renew_session()

        # Simple health check; exceptions result in a non-zero exit code
        node.mysql.query('select 1')

        # When failing over the new node needs a chance to lock the kv
        node.consul.mark_as_primary(node.name)

    elif node.is_replica():
        # TODO: we should make this check actual replication health
        # and not simply that replication has been established
        if not node.mysql.query('show slave status'):
            log.error('Replica is not replicating.')
            sys.exit(1)
    else:
        # If we're still somehow marked UNASSIGNED we exit now. This is a
        # byzantine failure mode where the end-user needs to intervene.
        log.error('Cannot determine MySQL state; failing health check.')
        sys.exit(1)

    node.consul.unlock_failover()
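
`mark_as_primary` is another project wrapper. One plausible sketch with python-consul uses a check-and-set write, so a node only claims the primary key if it hasn't changed since it was read (the key name and semantics are assumptions, not the project's verified implementation):

import consul

PRIMARY_KEY = 'service/mysql/primary'  # assumed key name

def mark_as_primary(node_name):
    c = consul.Consul()
    index, data = c.kv.get(PRIMARY_KEY)
    # cas=0 writes only if the key does not exist; otherwise require that
    # the key is unmodified since our read
    modify_index = data['ModifyIndex'] if data else 0
    return c.kv.put(PRIMARY_KEY, node_name, cas=modify_index)
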
Example #12
def health(node):
    """
    The top-level ContainerPilot `health` handler. Runs a simple health check.
    Also acts as a check for whether the ContainerPilot configuration needs
    to be reloaded (if it's been changed externally).
    """

    # Because we need MySQL up to finish initialization, we need to check
    # for each pass thru the health check that we've done so. The happy
    # path is to check a lock file against the node state (which has been
    # set above) and immediately return when we discover the lock exists.
    # Otherwise, we bootstrap the instance for its *current* state.
    assert_initialized_for_state(node)

    if node.is_primary():
        # If this lock is allowed to expire and the health check for the
        # primary fails the `onChange` handlers for the replicas will try
        # to failover and then the primary will obtain a new lock.
        # If this node can update the lock but the DB fails its health check,
        # then the operator will need to manually intervene if they want to
        # force a failover. This architecture is a result of Consul not
        # permitting us to acquire a new lock on a health-checked session if the
        # health check is *currently* failing, but has the happy side-effect of
        # reducing the risk of flapping on a transient health check failure.
        node.consul.renew_session()

        # Simple health check; exceptions result in a non-zero exit code
        node.mysql.query('select 1')

    elif node.is_replica():
        # TODO: we should make this check actual replication health
        # and not simply that replication has been established
        if not node.mysql.query('show slave status'):
            log.error('Replica is not replicating.')
            sys.exit(1)
    else:
        # If we're still somehow marked UNASSIGNED we exit now. This is a
        # byzantine failure mode where the end-user needs to intervene.
        log.error('Cannot determine MySQL state; failing health check.')
        sys.exit(1)

    node.consul.unlock_failover()
Example #13
def assert_initialized_for_state(node):
    """
    If the node has not yet been set up, find the correct state and
    initialize for that state. After the first health check we'll have
    written a lock file and will never hit this path again.
    """
    LOCK_PATH = '/var/run/init.lock'
    try:
        os.mkdir(LOCK_PATH, 0o700)
    except OSError:
        # the lock file exists so we've already initialized
        return True

    # the check for primary will set the state if it's known. If another
    # instance is the primary then we'll be marked as REPLICA, so if
    # we can't determine after the check which we are then we're likely
    # the first instance (this will get safely verified later).
    try:
        if not run_as_primary(node):
            log.error(
                'Tried to mark node %s primary but primary exists, '
                'exiting for retry on next check.', node.name)
            os.rmdir(LOCK_PATH)
            sys.exit(1)
    except MySQLError as ex:
        # We've made it only partly thru setup. Setup isn't idempotent
        # but should be safe to retry if we can make more progress. At
        # worst we end up with a bunch of failure logs.
        log.error(
            'Failed to set up %s as primary (%s). Exiting but will '
            'retry setup. Check logs following this line to see if '
            'setup needs reconfiguration or manual intervention to '
            'continue.', node.name, ex)
        os.rmdir(LOCK_PATH)
        sys.exit(1)
    return False
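
These examples use `os.mkdir` as the lock primitive because directory creation is atomic on POSIX: exactly one process can create the path, and every later attempt raises `OSError`. A stripped-down sketch of the same pattern:

import os

LOCK_PATH = '/tmp/init.lock'  # the examples above use /var/run/init.lock

def initialize_once(setup):
    """Run setup() at most once across competing health-check passes."""
    try:
        os.mkdir(LOCK_PATH, 0o700)  # atomic: exactly one caller succeeds
    except OSError:
        return False                # already initialized, or in progress
    try:
        setup()
    except Exception:
        os.rmdir(LOCK_PATH)         # release the lock so a retry can run
        raise
    return True
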
Example #14
def main():
    """
    Parse argument as command and execute that command with
    parameters containing the state of MySQL, ContainerPilot, etc.
    Default behavior is to run `pre_start` DB initialization.
    """
    if len(sys.argv) == 1:
        consul = Consul(envs={'CONSUL': os.environ.get('CONSUL', 'consul')})
        cmd = pre_start
    else:
        consul = Consul()
        try:
            cmd = globals()[sys.argv[1]]
        except KeyError:
            log.error('Invalid command: %s', sys.argv[1])
            sys.exit(1)

    my = MySQL()
    manta = Manta()
    cp = ContainerPilot()
    cp.load()
    node = Node(mysql=my, consul=consul, manta=manta, cp=cp)

    cmd(node)
Example #15
def main():
    """
    Parse argument as command and execute that command with
    parameters containing the state of Cassandra, ContainerPilot, etc.
    Default behavior is to run `pre_start` DB initialization.
    """
    if len(sys.argv) == 1:
        consul = Consul(envs={'CONSUL': os.environ.get('CONSUL', 'consul')})
        cmd = pre_start
    else:
        consul = Consul()
        try:
            cmd = globals()[sys.argv[1]]
        except KeyError:
            log.error('Invalid command: %s', sys.argv[1])
            sys.exit(1)

    cp = ContainerPilot()
    cass = CassandraService()
    cp.load()
    print(consul)
    node = Node(kvstore=consul, cp=cp, service=cass)

    cmd(node)
Example #16
def main():
    """
    Parse argument as command and execute that command with
    parameters containing the state of MySQL, ContainerPilot, etc.
    Default behavior is to run `pre_start` DB initialization.
    """
    if len(sys.argv) == 1:
        consul = Consul(envs={'CONSUL': os.environ.get('CONSUL', 'consul')})
        cmd = pre_start
    else:
        consul = Consul()
        try:
            cmd = globals()[sys.argv[1]]
        except KeyError:
            log.error('Invalid command: %s', sys.argv[1])
            sys.exit(1)

    my = MySQL()
    manta = Manta()
    cp = ContainerPilot()
    cp.load()
    node = Node(mysql=my, consul=consul, manta=manta, cp=cp)

    cmd(node)
Example #17
def assert_initialized_for_state(node):
    """
    If the node has not yet been set up, find the correct state and
    initialize for that state. After the first health check we'll have
    written a lock file and will never hit this path again.
    """
    LOCK_PATH = '/var/run/init.lock'
    try:
        os.mkdir(LOCK_PATH, 0o700)
    except OSError:
        # the lock file exists so we've already initialized
        return True

    # the check for primary will set the state if it's known. If another
    # instance is the primary then we'll be marked as REPLICA, so if
    # we can't determine after the check which we are then we're likely
    # the first instance (this will get safely verified later).
    if node.is_primary() or node.cp.state == UNASSIGNED:
        try:
            if not run_as_primary(node):
                log.error(
                    'Tried to mark node %s primary but primary exists, '
                    'exiting for retry on next check.', node.name)
                os.rmdir(LOCK_PATH)
                sys.exit(1)
        except MySQLError as ex:
            # We've made it only partly thru setup. Setup isn't idempotent
            # but should be safe to retry if we can make more progress. At
            # worst we end up with a bunch of failure logs.
            log.error(
                'Failed to set up %s as primary (%s). Exiting but will '
                'retry setup. Check logs following this line to see if '
                'setup needs reconfiguration or manual intervention to '
                'continue.', node.name, ex)
            os.rmdir(LOCK_PATH)
            sys.exit(1)
        if node.cp.update():
            os.rmdir(LOCK_PATH)
            node.cp.reload()
            # this is racy with the SIGHUP that ContainerPilot just got
            # sent, but if the Consul agent shuts down quickly enough we
            # end up sending extra API calls to it and get a bunch of log
            # spam. This forces us to exit early.
            sys.exit(0)
    else:
        try:
            run_as_replica(node)
        except (UnknownPrimary, MySQLError) as ex:
            log.error(
                'Failed to set up %s for replication (%s). Exiting for retry '
                'on next check.', node.name, ex)
            os.rmdir(LOCK_PATH)
            sys.exit(1)
    return False
Example #18
def assert_initialized_for_state(node):
    """
    If the node has not yet been set up, find the correct state and
    initialize for that state. After the first health check we'll have
    written a lock file and will never hit this path again.
    """
    LOCK_PATH = '/var/run/init.lock'
    try:
        os.mkdir(LOCK_PATH, 0o700)
    except OSError:
        # the lock file exists so we've already initialized
        return True

    # the check for primary will set the state if it's known. If another
    # instance is the primary then we'll be marked as REPLICA, so if
    # we can't determine after the check which we are then we're likely
    # the first instance (this will get safely verified later).
    if node.is_primary() or node.cp.state == UNASSIGNED:
        try:
            if not run_as_primary(node):
                log.error('Tried to mark node %s primary but primary exists, '
                          'exiting for retry on next check.', node.name)
                os.rmdir(LOCK_PATH)
                sys.exit(1)
        except MySQLError as ex:
            # We've made it only partly thru setup. Setup isn't idempotent
            # but should be safe to retry if we can make more progress. At
            # worst we end up with a bunch of failure logs.
            log.error('Failed to set up %s as primary (%s). Exiting but will '
                      'retry setup. Check logs following this line to see if '
                      'setup needs reconfiguration or manual intervention to '
                      'continue.', node.name, ex)
            os.rmdir(LOCK_PATH)
            sys.exit(1)
        if node.cp.update():
            os.rmdir(LOCK_PATH)
            node.cp.reload()
            # this is racy with the SIGHUP that ContainerPilot just got
            # sent, but if the Consul agent shuts down quickly enough we
            # end up sending extra API calls to it and get a bunch of log
            # spam. This forces us to exit early.
            sys.exit(0)
    else:
        try:
            run_as_replica(node)
        except (UnknownPrimary, MySQLError) as ex:
            log.error('Failed to set up %s for replication (%s). Exiting for retry '
                      'on next check.', node.name, ex)
            os.rmdir(LOCK_PATH)
            sys.exit(1)
    return False