Example #1
def bounce_lock_zookeeper(
        name: str,
        system_paasta_config: Optional[SystemPaastaConfig] = None) -> Iterator:
    """Acquire a bounce lock in zookeeper for the name given. The name should
    generally be the service namespace being bounced.
    This is a contextmanager. Please use it via 'with bounce_lock(name):'.
    :param name: The lock name to acquire"""
    if system_paasta_config is None:
        system_paasta_config = load_system_paasta_config()
    zk = KazooClient(
        hosts=system_paasta_config.get_zk_hosts(),
        timeout=ZK_LOCK_CONNECT_TIMEOUT_S,
    )
    zk.start()
    lock = zk.Lock(f"{ZK_LOCK_PATH}/{name}")
    try:
        lock.acquire(
            timeout=1)  # timeout=0 throws some other strange exception
        yield
    except LockTimeout:
        raise LockHeldException("Service %s is already being bounced!" % name)
    else:
        lock.release()
    finally:
        zk.stop()
        zk.close()
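A minimal usage sketch for the contextmanager above, assuming it is wrapped with contextlib.contextmanager (the Iterator return type and the yield suggest it is) and that LockHeldException is importable from the same module; the service name and the do_bounce() helper are made up for illustration:

try:
    with bounce_lock_zookeeper("example-service.main"):
        do_bounce()  # hypothetical work performed while the bounce lock is held
except LockHeldException:
    print("example-service.main is already being bounced; skipping.")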
Example #2
class ZK:

    client = None

    def __init__(self, zk_host):
        self.client = KazooClient(zk_host)
        self.client.start()

    def __del__(self):
        self.client.stop()

    def get_node(self, path):
        if not self.client.exists(path):
            return None
        node = ZKNode(path, self)
        return node

    def create_node(self, path):
        self.client.ensure_path(path)
        return self.get_node(path)

    def get_transaction(self):
        return self.client.transaction()

    def get_lock(self, path, id=None):
        return self.client.Lock(path + "/lock", id)

    def has_lock(self, path):
        lock_path = path + "/lock"
        if not self.client.exists(lock_path):
            return False
        return len(self.client.get_children(lock_path)) > 0
Example #3
class ZkHelper(object):
    def __init__(self, address='', port=''):
        assert address and port
        self.zk_address = address
        self.zk_port = port
        self.retry = KazooRetry(max_delay=10000, max_tries=None)
        self.zk = KazooClient(hosts='%s:%s' % (self.zk_address, self.zk_port),
                              connection_retry=self.retry,
                              timeout=20)
        self.zk.add_listener(self._listener)
        self.zk.start()
        logging.info("instance zk client start (%s:%s)" %
                     (self.zk_address, self.zk_port))

    @staticmethod
    def _listener(state):
        if state == KazooState.LOST:
            logging.info(
                "zk connection lost, stop this connection and then start a new one!"
            )
        elif state == KazooState.SUSPENDED:
            logging.info(
                "zk connection suspended, stop this connection and then start a new one!"
            )

    def write(self, path, data):
        self.zk.ensure_path(path)
        self.retry(self.zk.set, path, data)
        logging.info("write data:%s to path:%s" % (data, path))

    def ensure_path(self, path):
        self.zk.ensure_path(path)

    def read(self, path):
        if self.zk.exists(path):
            data = self.retry(self.zk.get, path)
            logging.info("read data:%s from path:%s" % (data, path))
            return data[0]
        logging.info("path:%s not exist" % path)

    def get_children_list(self, path):
        if self.zk.exists(path):
            data = self.retry(self.zk.get_children, path)
            logging.info("get children:%s from path:%s" % (data, path))
            return data
        logging.info("path:%s not exist" % path)

    def exists(self, path):
        return self.zk.exists(path)

    def get_lock(self, path):
        lock = self.retry(self.zk.Lock, path, threading.current_thread())
        return lock

    def close(self):
        self.zk.close()
Example #4
def main():
    """ Starts the AdminServer. """
    logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument('-p',
                        '--port',
                        type=int,
                        default=constants.DEFAULT_PORT,
                        help='The port to listen on')
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        help='Output debug-level logging')
    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    options.define('secret', appscale_info.get_secret())
    options.define('login_ip', appscale_info.get_login_ip())
    options.define('private_ip', appscale_info.get_private_ip())

    acc = appscale_info.get_appcontroller_client()
    ua_client = UAClient(appscale_info.get_db_master_ip(), options.secret)
    zk_client = KazooClient(hosts=','.join(appscale_info.get_zk_node_ips()),
                            connection_retry=ZK_PERSISTENT_RECONNECTS)
    zk_client.start()
    version_update_lock = zk_client.Lock(constants.VERSION_UPDATE_LOCK_NODE)
    thread_pool = ThreadPoolExecutor(4)
    monit_operator = MonitOperator()
    all_resources = {
        'acc': acc,
        'ua_client': ua_client,
        'zk_client': zk_client,
        'version_update_lock': version_update_lock,
        'thread_pool': thread_pool
    }

    if options.private_ip in appscale_info.get_taskqueue_nodes():
        logging.info('Starting push worker manager')
        GlobalPushWorkerManager(zk_client, monit_operator)

    app = web.Application([
        ('/v1/apps/([a-z0-9-]+)/services/([a-z0-9-]+)/versions',
         VersionsHandler, all_resources),
        ('/v1/apps/([a-z0-9-]+)/services/([a-z0-9-]+)/versions/([a-z0-9-]+)',
         VersionHandler, all_resources),
        ('/v1/apps/([a-z0-9-]+)/operations/([a-z0-9-]+)', OperationsHandler),
        ('/api/queue/update', UpdateQueuesHandler, {
            'zk_client': zk_client
        })
    ])
    logging.info('Starting AdminServer')
    app.listen(args.port)
    io_loop = IOLoop.current()
    io_loop.start()
Example #5
class TimeStampLeader(PublicationLeader):

    def __init__(self, zk_hosts, bdii_path):
        self.zk_hosts = zk_hosts
        self.zk = KazooClient(self.zk_hosts)
        self.bdii_path = bdii_path
        self.is_leader = False

    def pack_ts(self, input_dt):
        return struct.pack('f', self.gen_ts(input_dt))

    def gen_ts(self, input_dt):
        return time.mktime(input_dt.timetuple())

    def does_exist(self):
        return self.zk.exists(self.bdii_path) is not None

    def is_stale(self, current_time):
        data, stat = self.zk.get(self.bdii_path)
        if not data:
            return True
        last_updated_timestamp = struct.unpack('f', data)[0]
        return last_updated_timestamp <= (self.gen_ts(current_time) - 120)

    def should_publish(self):
        self.zk.start()
        current_time = datetime.datetime.utcnow()
        if not self.does_exist():
            self.zk.create(self.bdii_path, self.pack_ts(current_time))
            self.is_leader = True
            return self.is_leader
        bdii_lock = self.zk.Lock(self.bdii_path, socket.getfqdn())
        try:
            lock_acquired = bdii_lock.acquire(5.0)
            if lock_acquired:
                self.is_leader = self.is_stale(current_time)
                bdii_lock.release()
                self.zk.stop()
                return self.is_leader
        except LockTimeout:
            # Another Compute Element has the lock
            pass
        return False

    def update_ts(self):
        if self.is_leader:
            self.zk.start()
            current_ts = self.gen_ts(datetime.datetime.utcnow())
            self.zk.set(self.bdii_path, struct.pack('f', current_ts))
            self.zk.stop()
Example #6
def run(idx):
    zk_hosts = os.environ['RATER_ZK_HOSTS']
    run_time = int(os.environ['RATER_RUN_TIME'])
    currency_name = init_rate_table[idx]['currency']
    exchange_rate = init_rate_table[idx]['init_rate']

    zk = KazooClient(hosts=zk_hosts)
    zk.start()

    minute_cnt = 0
    while minute_cnt <= run_time:
        # Create a new rate table if not exists
        if not zk.exists('/rate_table/' + str(minute_cnt)):
            list_lock = zk.Lock('/rate_table/list_lock', 'list_lock')
            with list_lock:
                if not zk.exists('/rate_table/' + str(minute_cnt)):
                    raw_rate_table = json.dumps({
                        'RMB': -1,
                        'USD': -1,
                        'JPY': -1,
                        'EUR': -1
                    })
                    zk.create('/rate_table/' + str(minute_cnt),
                              raw_rate_table.encode(),
                              makepath=True)

        # Update rate table
        table_lock = zk.Lock('/rate_table/lock/' + str(minute_cnt),
                             'table_lock' + str(minute_cnt))
        with table_lock:
            byte_rate_table, stat = zk.get('/rate_table/' + str(minute_cnt))
            rate_table = json.loads(byte_rate_table.decode())
            rate_table[currency_name] = exchange_rate

            raw_rate_table = json.dumps(rate_table)
            zk.set('/rate_table/' + str(minute_cnt), raw_rate_table.encode())

        exchange_rate += 0.1
        minute_cnt += 1

        #time.sleep(60)

    zk.stop()
Example #7
def run_with_lock(name):
    # Connect to ZooKeeper
    zk = KazooClient(hosts='39.108.147.32:2182')
    # Start the connection
    zk.start()
    # Create the lock
    lock = zk.Lock("/lockpath", "my-identifier")
    while True:
        # Get the current second; when it is a multiple of 5, all contenders call the flash-sale function at the same time
        if arrow.now().second % 5 == 0:
            with lock:
                seckilling()
                return
Example #8
def zk_lock(zk: KazooClient, lock_path: str, contender_id: str,
            timeout: int) -> Generator:
    """
    This contextmanager takes a ZooKeeper lock, yields, then releases the lock.
    This lock behaves like an interprocess mutex lock.

    ZooKeeper allows one to read values without holding a lock, but there is no
    guarantee that you will read the latest value. To read the latest value,
    you must call `sync()` on a ZNode before calling `get()`.

    Args:
        zk:
            The client to use to communicate with ZooKeeper.
        lock_path:
            The ZNode path to use as prefix for the locking recipe.
        contender_id:
            The contender id to identify the current client
            in the locking recipe.
        timeout:
            Time in seconds to wait for the lock to be acquired.
            If this time elapses before the lock is acquired, a
            `kazoo.exceptions.LockTimeout` exception is raised.

    Raises:
        kazoo.exceptions.LockTimeout:
            If the `timeout` is exceeded without the lock being acquired.

    """
    lock = zk.Lock(lock_path, contender_id)
    try:
        log.info("Acquiring ZooKeeper lock.")
        lock.acquire(blocking=True, timeout=timeout, ephemeral=True)
    except (ConnectionLoss, SessionExpiredError) as e:
        msg_fmt = "Failed to acquire lock: {}"
        msg = msg_fmt.format(e.__class__.__name__)
        log.exception(msg)
        raise e
    except LockTimeout as e:
        msg_fmt = "Failed to acquire lock in `{}` seconds"
        msg = msg_fmt.format(timeout)
        log.exception(msg)
        raise e
    else:
        log.info("ZooKeeper lock acquired.")
    try:
        yield
    finally:
        log.info("Releasing ZooKeeper lock")
        lock.release()
        log.info("ZooKeeper lock released.")
Example #9
def main(loops=-1, loop_interval=60, restart_interval=30):
    """
    :param loops: Number of loops. (set <0 for infinite)
    :param loop_interval: Time to sleep per loop (seconds).
    :param restart_interval: Time to sleep after a restart (seconds).
    :return: Return code for process.
    """
    exhibitor = os.environ.get('EXHIBITOR_BASE')
    if not exhibitor:
        logger.error('Variable EXHIBITOR_BASE not found')
        return -1

    base_properties = server_template()

    # Discover ZK and start server:
    zk_conn = zk_conn_string(exhibitor)
    kafka_pid = start_kafka(base_properties, zk_conn)

    # Loop:
    while loops != 0:
        # If Kafka has died, stop:
        kafka_pid.poll()
        if kafka_pid.returncode:
            logger.info('Kafka died: %s', kafka_pid.returncode)
            return kafka_pid.returncode

        # Poll Exhibitor for current ensemble:
        cur_zk = zk_conn_string(exhibitor)
        if cur_zk != zk_conn and len(cur_zk) >= len(zk_conn):
            logger.info('ZooKeeper ensemble change: %s', cur_zk)
            # If ensemble has changed, acquire lock:
            zk = KazooClient(hosts=','.join(cur_zk))
            try:
                zk.start()
                with zk.Lock('/kafka-exhibitor/%s' % exhibitor):
                    logger.info('Restart lock acquired, restarting...')
                    # Lock acquired, restart:
                    kafka_pid.terminate()
                    kafka_pid.wait()
                    kafka_pid = start_kafka(base_properties, cur_zk)
                    zk_conn = cur_zk

                    time.sleep(restart_interval)
            finally:
                zk.stop()

        # Loop:
        time.sleep(loop_interval)
        loops -= 1
    return 0
Example #10
def create_app_lock():
    """Acquire a lock in zookeeper for creating a marathon app. This is
    due to marathon's extreme lack of resilience with creating multiple
    apps at once, so we use this to not do that and only deploy
    one app at a time."""
    zk = KazooClient(hosts=load_system_paasta_config().get_zk_hosts(), timeout=ZK_LOCK_CONNECT_TIMEOUT_S)
    zk.start()
    lock = zk.Lock('%s/%s' % (ZK_LOCK_PATH, 'create_marathon_app_lock'))
    try:
        lock.acquire(timeout=30)  # timeout=0 throws some other strange exception
        yield
    except LockTimeout:
        raise LockHeldException("Failed to acquire lock for creating marathon app!")
    finally:
        lock.release()
        zk.stop()
Example #11
def thread_process2(index):
    logDS.info("begin %d" % index)
    zk = KazooClient(hosts='172.10.3.111:2181', logger=logDS.logger)
    zk.start()
    node = "/my/lockpath"
    lock = zk.Lock(node, "my-identifier")
    with lock:  # blocks waiting for lock acquisition
        # do something with the lock
        logDS.info("get the lock %d" % index)
        global gindex
        if gindex != index:
            logDS.error("error xxxx %d %d" % (gindex, index))
        gindex = gindex + 1
        if index == 0:
            time.sleep(10)
    zk.stop()
    logDS.info("exit %d" % index)
Example #12
    def create_lock(self):
        zk = KazooClient(hosts=self.zookeeper_quorum)
        zk.start()
        with zk.Lock("/", get_current_host_name()):
            proc_object = ProcCheck(self.process_name, self.monitor_interval,
                                    self.start_command)
            proc_id = proc_object.get_process_id()
            if proc_id is None:
                proc_status = proc_object.start_process()
                if proc_status:
                    proc_id = proc_object.get_process_id()

            # should proc_status be False, proc_id will be None again,
            # which will be handled by ProcCheck.monitor()

            # as soon as monitor finishes, it will release the lock,
            # allowing the service running on another machine to acquire it.
            proc_object.monitor(proc_id)
Example #13
def create_autoscaling_lock():
    """Acquire a lock in zookeeper for autoscaling. This is
    to avoid autoscaling a service multiple times, and to avoid
    having multiple paasta services all attempting to autoscale and
    fetching mesos data."""
    zk = KazooClient(hosts=load_system_paasta_config().get_zk_hosts(), timeout=ZK_LOCK_CONNECT_TIMEOUT_S)
    zk.start()
    lock = zk.Lock('/autoscaling/autoscaling.lock')
    try:
        lock.acquire(timeout=1)  # timeout=0 throws some other strange exception
        yield
    except LockTimeout:
        raise LockHeldException("Failed to acquire lock for autoscaling!")
    else:
        lock.release()
    finally:
        zk.stop()
Example #14
def bounce_lock_zookeeper(name):
    """Acquire a bounce lock in zookeeper for the name given. The name should
    generally be the service namespace being bounced.
    This is a contextmanager. Please use it via 'with bounce_lock(name):'.
    :param name: The lock name to acquire"""
    zk = KazooClient(hosts=load_system_paasta_config().get_zk_hosts(), timeout=ZK_LOCK_CONNECT_TIMEOUT_S)
    zk.start()
    lock = zk.Lock('%s/%s' % (ZK_LOCK_PATH, name))
    try:
        lock.acquire(timeout=1)  # timeout=0 throws some other strange exception
        yield
    except LockTimeout:
        raise LockHeldException("Service %s is already being bounced!" % name)
    else:
        lock.release()
    finally:
        zk.stop()
Example #15
class Orc(object):
    def __init__(self, host, port, supervisor, orc_host):
        self.zk = KazooClient('{}:{}'.format(host, port))
        self.path = PathMaker()
        self.name_gen = NameGenerator()
        self.name = None
        self.supervisor = supervisor
        self.orc_host = orc_host
        self.setup()

    def setup_nodes(self):
        # Setup ephemeral nodes
        lock = self.zk.Lock(self.path.namelock())
        with lock:
            used_names = self.zk.get_children(self.path.toolchain())
            new_name = self.name_gen.generate()
            while new_name in used_names:
                new_name = self.name_gen.generate()
            self.name = new_name
            # Register watch
            DataWatch(self.zk, self.path.toolchain(self.name), self.on_sync)
            # Setup path for conf synchronization
            self.zk.create(self.path.toolchain(new_name), ephemeral=True)
        # Put information about node
        self.zk.create(self.path.node(self.name),
                       value=self.orc_host,
                       ephemeral=True)

    def setup(self):
        logger.info('Setting up Orc')
        self.zk.start()
        # Setup nodes
        self.setup_nodes()

    def on_sync(self, data, stat, event):
        if event and event.type == EventType.CHANGED:
            logger.info('Synchronizing toolchain')
            self.supervisor.update(data)

    def teardown(self):
        logger.info('Tearing down Orc')
        self.zk.stop()
        self.zk.close()
        self.supervisor.teardown()
Example #16
def zk_cluster_lock(zk: KazooClient,
                    name: str,
                    timeout: int = 30) -> Generator:
    lock = zk.Lock("{}/{}".format(ZK_PREFIX, name), socket.gethostname())
    try:
        print("Acquiring cluster lock '{}'".format(name))
        lock.acquire(blocking=True, timeout=timeout)
    except (ConnectionLoss, SessionExpiredError) as e:
        print("Failed to acquire cluster lock: {}".format(
            e.__class__.__name__))
        raise e
    except LockTimeout as e:
        print("Failed to acquire cluster lock in {} seconds".format(timeout))
        raise e
    else:
        print("ZooKeeper lock acquired.")
    yield
    print("Releasing ZooKeeper lock")
    lock.release()
    print("ZooKeeper lock released.")
Example #17
def test_lock_timeout():
    zk = KazooClient(hosts="127.0.0.1:2181")
    zk.start()
    lock = zk.Lock("/zha-lock", "test")
    lock.acquire()

    obj = type('', (), {})
    obj.flg = False

    def _oa():
        obj.flg = True
        return 0

    config = skelton.Config()
    config.check_health = lambda: 3
    config.become_active = _oa
    z = zha.ZHA(config)
    trigger_zha(z)
    assert obj.flg is False
    lock.release()
    zk.stop()
    time.sleep(10)
Example #18
class Zookeeper(object):
    HOSTS = [
        '127.0.0.1:2181'
    ]

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self._kill_subprocess()
        self.zk.stop()

    def __init__(self, app):
        self.app = app.strip().split()
        self.subprocess = None
        self.descendants = set()
        self.zk = KazooClient(hosts=self.HOSTS)
        self.zk.start()
        self.zk.ensure_path("/")
        self.lock = self.zk.Lock("/lock")
        self._set_z_watcher()

    def _start_subprocess(self):
        if not self.subprocess:
            self.subprocess = subprocess.Popen(args=self.app)

    def _kill_subprocess(self):
        if self.subprocess:
            self.subprocess.kill()
            self.subprocess = None

    def _set_children_watchers(self, path):
        if path not in self.descendants:
            self.descendants.add(path)
            if self.zk.exists(path):
                children = self.zk.get_children(path)
                for child in children:
                    new_path = os.path.join(path, child)
                    self._set_children_watchers(new_path)
            self._create_child_watcher(path)

    def _create_child_watcher(self, path):
        @self.zk.ChildrenWatch(path, send_event=True)
        def child_watcher(children, event):
            if event:
                path = event.path
                with self.lock:
                    diff_list = list({os.path.join(path, child) for child in children} - self.descendants)
                if len(diff_list) > 0:
                    for child in diff_list:
                        self._set_children_watchers(child)
                        with self.lock:
                            print("Current descendants:", len(self.descendants) - 1)
                else:
                    self._clean(path, children)

    def _clean(self, path, children):
        with self.lock:
            old_paths = {x for x in self.descendants if x.startswith(path) if x != path}
            new_paths = {os.path.join(path, child) for child in children}
            new_paths_extended = {descendant for descendant in self.descendants if
                                  any(descendant.startswith(new_path) for new_path in new_paths)}
            paths = old_paths - new_paths_extended

            for old_path in paths:
                self.descendants.remove(old_path)

    def _set_z_watcher(self):
        @self.zk.DataWatch('/z')
        def data_watcher(data, stat, event):
            if stat:
                self._start_subprocess()
                self._set_children_watchers("/z")
            else:
                self._kill_subprocess()
                self.descendants = set()

    def _print_tree(self):
        def _print_recursive(indent, node):
            print("│  " * indent, "├─", node, sep="")oc
            for child in self.zk.get_children(node):
                path = os.path.join(node, child)
                if self.zk.exists(path):
                    _print_recursive(indent + 1, path)

        if not self.zk.exists("/z"):
            print("No node /z for printing tree")
        else:
            print("/z")
            for child in self.zk.get_children("/z"):
                _print_recursive(0, os.path.join("/z", child))

    def _print_usage(self):
        print("q - quit")
        print("t - print tree")

    def handle(self):
        while True:

            self._print_usage()
            x = input('Command: ')
            if x == "q":
                break
            elif x == "t":
                self._print_tree()
            else:
                print("Unknown command")
Example #19
class DistributedSequenceCoordinator(object):
    def __init__(self, zookeeper_connect, autoscaling_grp_name, strategy_name,
                 instance_id, max_sequence_id, asg_instances_ids):
        self.zk = KazooClient(hosts=zookeeper_connect)
        self.running = False
        self.interrupted = False
        self.autoscaling_grp_name = autoscaling_grp_name
        self.strategy_name = strategy_name
        self.instance_id = instance_id
        self.max_sequence_id = max_sequence_id
        self.asg_instances_ids = asg_instances_ids

    def state_change_listener(self, state):
        logging.debug('zookeeper state changed to {0}'.format(state))
        if state == KazooState.LOST or state == KazooState.SUSPENDED:
            if self.running:
                self.interrupted = True
                self.log_msg('distributed coordination interrupted')
                raise Exception('zookeeper session interrupted')

    """
     Responsible for executing operation in isolation even-in cases of failures, connection-resets etc. Uses optimistic
     concurrency control by assuming that operation would be executed without any interruption, and if any interruption
     occurs, then acquires a new lock and re-execute the idempotent operation to guarantee isolation.
    """

    def execute(self):
        result = None

        # exception-handling for cases where unable to establish connection to zookeeper
        try:
            # TODO: use python retrying lib to control with timeouts, max & exponential back-off wait time b/w retries
            while result is None or self.interrupted:
                self.running = True
                self.interrupted = False
                self.log_msg('distributed operation starting')
                self.zk.start()
                self.zk.add_listener(self.state_change_listener)
                try:
                    lock = self.zk.Lock(zk_sequencer_root,
                                        self.autoscaling_grp_name)
                    logging.debug('zookeeper lock created {}'.format(
                        lock.data))
                    self.log_msg('entering zookeeper lock')
                    with lock:
                        result = self.operation()
                except Exception as e:
                    logging.exception(e)
                    self.log_msg('encountered zk exception')
                finally:
                    self.log_msg('stopping zk')
                    self.zk.stop()
        except Exception as e:
            raise e

        if result is None:
            raise Exception('Unable to generate sequence id')

        return result

    def operation(self):
        instances_root_path = "/".join(
            [zk_sequencer_root, self.autoscaling_grp_name])
        self.zk.ensure_path(instances_root_path)
        instance_nodes = self.zk.get_children(instances_root_path)
        zk_instance_sequencers = {}
        for instance_node in instance_nodes:
            instance_node_path = "/".join([instances_root_path, instance_node])
            instance_id = self.zk.get(instance_node_path)[0]
            zk_instance_sequencers[str(instance_id)] = int(instance_node)

        logging.debug('zk instances: {0}'.format(zk_instance_sequencers))

        instance_sequencers = {
            k: v
            for k, v in zk_instance_sequencers.items()
            if k in self.asg_instances_ids
        }

        logging.debug('active instances with assigned sequences: {0}'.format(
            instance_sequencers))

        generator = SequenceStrategy(self.strategy_name, self.instance_id,
                                     instance_sequencers, self.max_sequence_id)

        sequence_id = generator.get_sequence_id()
        current_instance_node_path = "/".join(
            [instances_root_path, str(sequence_id)])
        self.zk.ensure_path(current_instance_node_path)
        self.zk.set(current_instance_node_path,
                    str.encode(str(self.instance_id)))
        self.running = False
        return sequence_id

    def log_msg(self, msg):
        logging.debug('{0}, running = {1}, interrupted = {2}'.format(
            msg, self.running, self.interrupted))
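A rough, self-contained sketch of the retry-on-interruption idea described in the comment above the execute() method: a connection listener flags session loss, and the idempotent operation is re-run under a fresh lock until a pass completes without interruption. The function name, paths, and identifiers below are invented for illustration and are not part of this project:

from kazoo.client import KazooClient
from kazoo.protocol.states import KazooState

def run_isolated(hosts, lock_path, identifier, operation):
    zk = KazooClient(hosts=hosts)
    state = {"interrupted": False}

    def listener(new_state):
        # Runs in Kazoo's connection thread; just record the interruption.
        if new_state in (KazooState.LOST, KazooState.SUSPENDED):
            state["interrupted"] = True

    result = None
    while result is None or state["interrupted"]:
        state["interrupted"] = False
        zk.start()
        zk.add_listener(listener)
        try:
            with zk.Lock(lock_path, identifier):
                result = operation(zk)
        except Exception:
            # Connection hiccups surface here; loop and retry under a new lock.
            state["interrupted"] = True
        finally:
            zk.remove_listener(listener)
            zk.stop()
    return result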
Example #20
class Listener:
    def __init__(self, hosts, root, workspace='/tmp'):
        self.zk = KazooClient(hosts=hosts)
        self.root = root
        self.workspace = os.path.abspath(workspace)
        self.tasks = []
        self.event = threading.Event()
        self.hostname = os.uname().nodename

    def get_task(self, task_id):
        node = os.path.join(self.root, 'tasks', task_id, 'targets',
                            self.hostname)
        lock_node = os.path.join(node, 'lock')
        lock = self.zk.Lock(lock_node, self.hostname.encode())
        with lock:
            data, _ = self.zk.get(node)
        return json.loads(data.decode())

    def set_status(self, task_id, status):
        node = os.path.join(self.root, 'tasks', task_id, 'targets',
                            self.hostname)
        lock_node = os.path.join(node, 'lock')
        lock = self.zk.Lock(lock_node, self.hostname.encode())
        with lock:
            self.zk.set(node, status.encode())
        signal_node = os.path.join(self.root, 'signal', task_id)
        self.zk.set(signal_node, uuid.uuid4().bytes)

    def get_job_server_list(self):
        node = os.path.join(self.root, 'job_server')
        return [
            self.zk.get(os.path.join(node, x))[0]
            for x in self.zk.get_children(node)
        ]

    def get_log_server_list(self):
        node = os.path.join(self.root, 'log_server')
        result = []
        for server in self.zk.get_children(node):
            address, port = server.split(':')
            result.append((address, int(port)))
        return result

    def render(self, params):
        for root, _, files in os.walk('.'):
            for tmpl in [f for f in files if f.endswith('.tmpl')]:
                path = os.path.join(root, tmpl)
                with open(path, 'r') as f:
                    content = f.read()
                    rendered = pystache.render(content, params)
                    with open(path.replace('.tmpl', ''), 'w') as w:
                        w.write(rendered)

    def _send_log(self, task_id, cmd, seq=1):
        log_server = random.choice(self.get_log_server_list())
        s = socket.socket()
        s.connect(log_server)
        s.send(task_id.encode())
        s.send(b'\n')
        s.send(self.hostname.encode())
        s.send(b'\n')
        s.send('{0}'.format(seq).encode())
        s.send(b'\n\n')
        for buf in cmd.out_stream():
            s.send(buf)
        s.close()

    def send_log(self, task_id, cmd):
        seq = 1
        while not cmd.finish:
            t = threading.Thread(target=self._send_log,
                                 args=(task_id, cmd, seq))
            t.start()
            t.join()
            seq += 1

    def schedule(self, task_id):
        task = self.get_task(task_id)
        job_server = random.choice(self.get_job_server_list())
        # http://xxx.xxx.xx.xxx/packages/
        # magedu/test-job
        # http://xxx.xxx.xx.xxx/packages/magedu/test-job.zip
        url = '{0}/{1}.zip'.format(job_server, task['job_id'])
        response = requests.get(url)
        z = zipfile.ZipFile(io.BytesIO(response.content))
        workspace = os.path.join(self.workspace, task_id)
        os.makedirs(workspace)
        os.chdir(workspace)
        z.extractall()
        try:
            self.render(task.get('params', {}))
        except Exception as e:
            logging.error(e)
            self.set_status(task_id, 'F')
            return
        os.chmod('./run.sh', 0o755)
        cmd = Command('run.sh', workspace, timeout=task.get('timeout', 0))
        self.set_status(task_id, 'R')
        cmd.exec()
        self.send_log(task_id, cmd)
        cmd.wait()
        if cmd.success:
            self.set_status(task_id, 'S')
        else:
            self.set_status(task_id, 'F')

    def run(self):
        while not self.event.is_set():
            if len(self.tasks) > 0:
                task_id = self.tasks.pop(0)
                try:
                    self.schedule(task_id)
                finally:
                    shutil.rmtree(os.path.join(self.workspace, task_id))
            else:
                self.event.wait(1)

    def watch(self, tasks):
        new_tasks = set(tasks).difference(self.tasks)
        self.tasks.extend(new_tasks)
        return not self.event.is_set()

    def start(self):
        self.zk.start()
        node = os.path.join(self.root, 'agents', self.hostname)
        self.zk.ensure_path(node)
        tasks_node = os.path.join(node, 'tasks')
        self.zk.ensure_path(tasks_node)
        self.zk.create(os.path.join(node, 'alive'),
                       str(datetime.datetime.now().timestamp()).encode(),
                       ephemeral=True)
        ChildrenWatch(self.zk, tasks_node, self.watch)
        threading.Thread(target=self.run, name='task-runner').start()

    def shutdown(self):
        self.event.set()

    def join(self):
        self.event.wait()
Example #21
#! /usr/bin/env python2

import socket
import os
from kazoo.client import KazooClient
from kazoo.client import KazooState


def zk_status(state):
    if state == KazooState.LOST:
        print 'lost session'
    elif state == KazooState.SUSPENDED:
        print 'disconnected from ZK'
    elif state == KazooState.CONNECTED:
        print 'connected'


# API 0.3 spec
# http://kazoo.readthedocs.org/en/0.3/api/client.html

zk = KazooClient(hosts='server1:2181,vmk1:2181,vmk2:2181')
zk.add_listener(zk_status)
zk.start()
lock = zk.Lock('/master', '%s-%d' % (socket.gethostname(), os.getpid()))
zk.ensure_path("/path")
zk.set("/path", "data_string".encode('utf8'))
start_key, stat = zk.get("/path")
Example #22
def get_cluster_status():
    try:
        return requests.get("{}admin/collections?action=CLUSTERSTATUS".format(
            SDAP_SOLR_URL)).json()
    except (requests.exceptions.ConnectionError, json.decoder.JSONDecodeError):
        return False


logging.info("Attempting to aquire lock from {}".format(SDAP_ZK_SOLR))
zk_host, zk_chroot = SDAP_ZK_SOLR.split('/')
zk = KazooClient(hosts=zk_host)
zk.start()
zk.ensure_path(zk_chroot)
zk.chroot = zk_chroot
lock = zk.Lock("/collection-creator", ZK_LOCK_GUID)
try:
    with lock:  # blocks waiting for lock acquisition
        logging.info(
            "Lock aquired. Checking for SolrCloud at {}".format(SDAP_SOLR_URL))
        # Wait for MAX_RETRIES for the entire Solr cluster to be available.
        attempts = 0
        status = None
        collection_exists = False
        while attempts <= MAX_RETRIES:
            status = get_cluster_status()
            if not status:
                # If we can't get the cluster status, my Solr node is not running
                attempts += 1
                logging.info("Waiting for Solr at {}".format(SDAP_SOLR_URL))
                time.sleep(1)
Example #23
class Manager:
    def __init__(self):
        super(Manager, self).__init__()

        self._terminated = False

        self.config = config['zookeeper']
        self.zk = KazooClient(**self.config)
        self.zk.start()

        self.zk.ensure_path('/jobs')
        self.zk.ensure_path('/settings/running')
        self.zk.set('/settings/running', 'false'.encode())

        self.hadoop = HadoopModules()

        self.lock = self.zk.Lock('/settings/lock', 'lock')

        self._start_polling()

    def _start_polling(self):
        t = threading.Thread(target=self._try_execute_job, daemon=True)
        t.start()

    def _try_execute_job(self):
        if not self._terminated:
            threading.Timer(5.0, self._try_execute_job).start()
            self.execute_next_job()

    def enqueue_job(self, job):
        if job.id is None:
            return
        success = False
        with self.lock:
            node = "/jobs/{0}".format(job.id)
            if not self.zk.exists(node):
                self.zk.create(node)
                self.zk.create(node + "/jar_path",
                               job.file_full_path().encode())
                self.zk.create(node + "/retries", '0'.encode())
                success = True
                log.info("Enqueued job {0}".format(job.id))
        return success

    def execute_next_job(self):
        with self.lock:
            if self._is_running():
                return
            children = self.zk.get_children('/jobs')
            if not children:
                return
            children = map(lambda s: int(s), children)
            next_job_id = min(children)
            if self._check_retries(next_job_id):
                return self.execute_job(next_job_id, False)
            else:
                Jobs.update_entity(next_job_id, status=Jobs.FAILED)
                self._delete_job(next_job_id)
                log.info(
                    "Removing job {0} after 3 failures".format(next_job_id))
        self.execute_next_job()

    def _delete_job(self, id):
        self.zk.delete("/jobs/{0}".format(id), recursive=True)

    def _check_retries(self, id):
        retries, _ = self.zk.get("/jobs/{0}/retries".format(id))
        return int(retries.decode()) < 3

    def execute_job(self, id, take_lock=True):
        if take_lock:
            with self.lock:
                self.execute_job_no_lock(id)
        else:
            self.execute_job_no_lock(id)

    def hadoop_callback(self, job_id, return_code, stdout, stderr):
        job = Jobs.find(job_id)
        if job:
            job.update(stdout=stdout, stderr=stderr)
        with self.lock:
            self._set_running(False)
            if return_code == 0:
                job.update(status=Jobs.FINISHED)
                self._delete_job(job_id)

    def execute_job_no_lock(self, id):
        log.info("Executing job {0}".format(id))
        self._set_running(True)
        self._increase_retries(id)
        job = Jobs.find(id)
        job.update_entity(id, status=Jobs.RUNNING)
        path, _ = self.zk.get("/jobs/{0}/jar_path".format(id))
        callback = lambda ret, out, err: self.hadoop_callback(
            id, ret, out, err)
        self.hadoop.start_hadoop(path.decode(), job.arguments_list(), callback)

    def _set_running(self, is_running):
        value = ('true' if is_running else 'false').encode()
        self.zk.set('/settings/running', value)

    def _is_running(self):
        v, _ = self.zk.get('/settings/running')
        return v.decode() == 'true'

    def _increase_retries(self, id):
        key = "/jobs/{0}/retries".format(id)
        retries, _ = self.zk.get(key)
        new_retries = int(retries.decode()) + 1
        Jobs.update_entity(id, retries=new_retries)
        self.zk.set(key, str(new_retries).encode())
Example #24
File: zk.py Project: xabarass/scion
class Zookeeper(object):
    """
    A wrapper class for Zookeeper interfacing, using the `Kazoo python library
    <https://kazoo.readthedocs.org/en/latest/index.html>`_.

    As Kazoo's functionality is mostly unaware of connection-state changes,
    it requires quite a bit of delicate code to make work reliably.

    E.g. Kazoo's Lock will claim to be held, even if the Zookeeper connection
    has been lost in the meantime. This causes an immediate split-brain problem
    for anything relying on that lock for synchronization. There is also,
    unfortunately, no documented way to inform the local Lock object that the
    connection is down and therefore the Lock should be released.

    All of Kazoo's events are done via callbacks. These callbacks must not
    block. If they do, no more Kazoo events can happen.

    E.g. if a watch callback blocks, disconnection callbacks will not run.
    """
    def __init__(self,
                 isd_as,
                 srv_type,
                 srv_id,
                 zk_hosts,
                 timeout=1.0,
                 on_connect=None,
                 on_disconnect=None):
        """
        Setup the Zookeeper connection.

        :param ISD_AS isd_as: The local ISD-AS.
        :param str srv_type:
            a service type from :const:`lib.types.ServiceType`
        :param str srv_id: Service instance identifier.
        :param list zk_hosts:
            List of Zookeeper instances to connect to, in the form of
            ``["host:port"..]``.
        :param float timeout: Zookeeper session timeout length (in seconds).
        :param on_connect:
            A function called every time a connection is made to Zookeeper.
        :param on_disconnect:
            A function called every time a connection is lost to Zookeeper.
        """
        self._isd_as = isd_as
        self._srv_id = b64encode(srv_id).decode("ascii")
        self._timeout = timeout
        self._on_connect = on_connect
        self._on_disconnect = on_disconnect
        self.prefix = "/%s/%s" % (self._isd_as, srv_type)
        # Keep track of our connection state
        self._connected = threading.Event()
        # Keep track of the kazoo lock
        self._lock = threading.Event()
        # Used to signal connection state changes
        self._state_events = queue.Queue()
        self.conn_epoch = 0
        # Kazoo parties
        self._parties = {}
        # Kazoo lock (initialised later)
        self._zk_lock = None
        self._lock_epoch = 0

        self._kazoo_setup(zk_hosts)
        self._setup_state_listener()
        self._kazoo_start()

    def _kazoo_setup(self, zk_hosts):
        """
        Create and configure Kazoo client

        :param list zk_hosts:
            List of Zookeeper instances to connect to, in the form of
            ``["host:port"..]``.
        """
        # Disable exponential back-off
        kretry = KazooRetry(max_tries=-1, max_delay=1)
        # Stop kazoo from drowning the log with debug spam:
        logger = logging.getLogger("KazooClient")
        logger.setLevel(logging.ERROR)
        # (For low-level kazoo debugging):
        # import kazoo.loggingsupport
        # logger.setLevel(kazoo.loggingsupport.BLATHER)
        self.kazoo = KazooClient(hosts=",".join(zk_hosts),
                                 timeout=self._timeout,
                                 connection_retry=kretry,
                                 logger=logger)

    def _kazoo_start(self):
        """Connect the Kazoo client to Zookeeper."""
        logging.info("Connecting to Zookeeper")
        try:
            self.kazoo.start()
        except KazooTimeoutError:
            logging.critical(
                "Timed out connecting to Zookeeper on startup, exiting")
            kill_self()

    def _setup_state_listener(self):
        """
        Spawn state listener thread, to respond to state change notifications
        from Kazoo. We use a thread, as the listener callback must not block.
        """
        threading.Thread(target=thread_safety_net,
                         args=(self._state_handler, ),
                         name="libZK._state_handler",
                         daemon=True).start()
        # Listener called every time connection state changes
        self.kazoo.add_listener(self._state_listener)

    def _state_listener(self, new_state):
        """Called everytime the Kazoo connection state changes."""
        self.conn_epoch += 1
        # Signal a connection state change
        logging.debug("Kazoo state changed to %s (epoch %d)", new_state,
                      self.conn_epoch)
        self._state_events.put(new_state)
        # Tell kazoo not to remove this listener:
        return False

    def _state_handler(self, initial_state="startup"):
        """
        A thread worker function to wait for Kazoo connection state changes,
        and call the relevant method.
        """
        old_state = initial_state
        while True:
            # Wait for connection state change
            new_state = self._state_events.get()

            if (new_state == KazooState.CONNECTED
                    and not self._state_events.empty()):
                # Helps prevent some state flapping.
                logging.debug("Kazoo CONNECTED ignored as the events "
                              "queue is not empty.")
                continue
            # Short-circuit handler if the state hasn't actually changed. This
            # probably shouldn't happen now, so making it an error.
            if new_state == old_state:
                logging.error("Kazoo state didn't change from %s, ignoring",
                              old_state)
                continue

            logging.debug("Kazoo old state: %s, new state: %s", old_state,
                          new_state)
            old_state = new_state
            if new_state == KazooState.CONNECTED:
                self._state_connected()
            elif new_state == KazooState.SUSPENDED:
                self._state_suspended()
            else:
                self._state_lost()

    def _state_connected(self):
        """Handles the Kazoo 'connected' event."""
        # Might be first connection, or reconnecting after a problem.
        clid = self.kazoo.client_id
        if clid is None:
            # Protect against a race-condition.
            return
        try:
            zk_peer = self.kazoo._connection._socket.getpeername()
        except AttributeError:
            zk_peer = "?", "?"
        logging.debug(
            "Connection to Zookeeper succeeded (Session: %s, ZK: [%s]:%s)",
            hex(clid[0]), zk_peer[0], zk_peer[1])
        try:
            self.ensure_path(self.prefix, abs=True)
            # Use a copy of the dictionary values, as the dictionary is changed
            # by another thread.
            for party in list(self._parties.values()):
                party.autojoin()
        except ZkNoConnection:
            return
        self._connected.set()
        if self._on_connect:
            self._on_connect()

    def _state_suspended(self):
        """
        Handles the Kazoo 'connection suspended' event.

        This means that the connection to Zookeeper is down.
        """
        self._connected.clear()
        logging.info("Connection to Zookeeper suspended")
        if self._on_disconnect:
            self._on_disconnect()

    def _state_lost(self):
        """
        Handles the Kazoo 'connection lost' event.

        This means that the Zookeeper session is lost, so all setup needs to be
        re-done on connect.
        """
        self._connected.clear()
        logging.info("Connection to Zookeeper lost")
        if self._on_disconnect:
            self._on_disconnect()

    def is_connected(self):
        """Check if there is currently a connection to Zookeeper."""
        return self._connected.is_set()

    def wait_connected(self, timeout=None):
        """
        Wait until there is a connection to Zookeeper. Log every 10s until a
        connection is available.

        :param float timeout:
            Number of seconds to wait for a ZK connection. If ``None``, wait
            forever.
        :raises:
            ZkNoConnection:
                if there's no connection to ZK after timeout has expired.
        """
        if self.is_connected():
            return
        logging.debug("Waiting for ZK connection")
        start = time.time()
        total_time = 0.0
        if timeout is None:
            next_timeout = 10.0
        while True:
            if timeout is not None:
                next_timeout = min(timeout - total_time, 10.0)
            ret = self._connected.wait(timeout=next_timeout)
            total_time = time.time() - start
            if ret:
                logging.debug("ZK connection available after %.2fs",
                              total_time)
                return
            elif timeout is not None and total_time >= timeout:
                logging.debug("ZK connection still unavailable after %.2fs",
                              total_time)
                raise ZkNoConnection
            else:
                logging.debug("Still waiting for ZK connection (%.2fs so far)",
                              total_time)

    def ensure_path(self, path, abs=False):
        """
        Ensure that a path exists in Zookeeper.

        :param str path: Path to ensure
        :param bool abs: Is the path absolute or relative?
        :raises:
            ZkNoConnection: if there's no connection to ZK.
        """
        full_path = path
        if not abs:
            full_path = os.path.join(self.prefix, path)
        try:
            self.kazoo.ensure_path(full_path)
        except (ConnectionLoss, SessionExpiredError):
            raise ZkNoConnection from None

    def party_setup(self, prefix=None, autojoin=True):
        """
        Setup a `Kazoo Party
        <https://kazoo.readthedocs.org/en/latest/api/recipe/party.html>`_.

        Used to signal that a group of processes are in a similar state.

        :param str prefix:
            Path to create the party under. If not specified, uses the default
            prefix for this server instance.
        :param bool autojoin: Join the party if True, also on reconnect
        :return: a ZkParty object
        :rtype: ZkParty
        :raises:
            ZkNoConnection: if there's no connection to ZK.
        """
        if not self.is_connected():
            raise ZkNoConnection
        if prefix is None:
            prefix = self.prefix
        party_path = os.path.join(prefix, "party")
        self.ensure_path(party_path, abs=True)
        party = ZkParty(self.kazoo, party_path, self._srv_id, autojoin)
        self._parties[party_path] = party
        return party

    def get_lock(self, lock_timeout=None, conn_timeout=None):
        """
        Try to get the lock. Returns immediately if we already have the lock.

        :param float lock_timeout:
            Time (in seconds) to wait for lock acquisition, or ``None`` to wait
            forever (Default).
        :param float conn_timeout:
            Time (in seconds) to wait for a connection to ZK, or ``None`` to
            wait forever (Default).
        :return:
            ``ZK_LOCK_FAIL`` if getting the lock failed, ``ZK_LOCK_SUCCESS`` if the lock was
            acquired, or ``ZK_LOCK_ALREADY`` if the lock is already held by this process.
        :rtype: :class:`int`
        """
        if self._zk_lock is None:
            # First-time setup.
            lock_path = os.path.join(self.prefix, "lock")
            self._zk_lock = self.kazoo.Lock(lock_path, self._srv_id)
        elif self.have_lock():
            return ZK_LOCK_ALREADY
        self.wait_connected(timeout=conn_timeout)
        self._lock_epoch = self.conn_epoch
        if lock_timeout is None:
            # Only need to log this when we could block for a long time
            logging.debug("Trying to acquire ZK lock (epoch %d)",
                          self._lock_epoch)
        try:
            if self._zk_lock.acquire(timeout=lock_timeout):
                logging.info("Successfully acquired ZK lock (epoch %d)",
                             self._lock_epoch)
                self._lock.set()
        except (ConnectionLoss, SessionExpiredError):
            raise ZkNoConnection from None
        except LockTimeout:
            pass
        except (AttributeError, TypeError):
            # Work-around for https://github.com/python-zk/kazoo/issues/288
            pass
        if self.have_lock():
            return ZK_LOCK_SUCCESS
        return ZK_LOCK_FAIL

    def release_lock(self):
        """Release the lock."""
        self._lock.clear()
        if self._zk_lock is None:
            return
        if self.is_connected():
            try:
                self._zk_lock.release()
            except (NoNodeError, ConnectionLoss, SessionExpiredError):
                pass
        # Hack suggested by https://github.com/python-zk/kazoo/issues/2
        self._zk_lock.is_acquired = False

    def have_lock(self):
        """Check if we currently hold the lock."""
        if (self.is_connected() and self._lock_epoch == self.conn_epoch
                and self._lock.is_set()):
            return True
        else:
            self.release_lock()
            return False

    def wait_lock(self):
        """Wait until we hold the lock."""
        self._lock.wait()

    def get_lock_holder(self):
        """
        Return address and port of the lock holder, or None if master is not
        elected.

        :raises:
            ZkNoConnection: if there's no connection to ZK.
        """
        if self._zk_lock is None:
            return None
        try:
            contenders = self._zk_lock.contenders()
            if not contenders:
                logging.warning('No lock contenders found')
                return None
            return ZkID.from_raw(b64decode(contenders[0]))
        except (ConnectionLoss, SessionExpiredError):
            logging.warning("Disconnected from ZK.")
            raise ZkNoConnection from None

    def retry(self, desc, f, *args, _retries=4, _timeout=10.0, **kwargs):
        """
        Execute a given operation, retrying it if it fails due to connection
        problems.

        :param str desc: Description of the operation
        :param function f: Function to call, passing in \*args and \*\*kwargs
        :param int _retries:
            Number of times to retry the operation, or `None` to retry
            indefinitely.
        :param float _timeout:
            Number of seconds to wait for a connection, or `None` to wait
            indefinitely.
        """
        count = -1
        while True:
            count += 1
            if _retries is not None and count > _retries:
                break
            try:
                self.wait_connected(timeout=_timeout)
            except ZkNoConnection:
                logging.warning("%s: No connection to ZK", desc)
                continue
            try:
                return f(*args, **kwargs)
            except ZkNoConnection:
                logging.warning("%s: Connection to ZK dropped", desc)
        raise ZkRetryLimit("%s: Failed %s times, giving up" %
                           (desc, 1 + _retries))
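The class docstring above explains why a Kazoo Lock cannot simply be trusted across connection-state changes, and the conn_epoch / _lock_epoch bookkeeping is this project's answer to that. As a rough illustration of just that idea (the class and attribute names below are invented, not part of the project), a stripped-down version could look like:

import threading
from kazoo.client import KazooClient

class EpochCheckedLock:
    def __init__(self, hosts, path, identifier):
        self.zk = KazooClient(hosts=hosts)
        self.conn_epoch = 0
        self.lock_epoch = -1
        self._held = threading.Event()
        self.zk.add_listener(self._on_state)
        self.zk.start()
        self._lock = self.zk.Lock(path, identifier)

    def _on_state(self, state):
        # Runs on every connection-state change; must not block.
        self.conn_epoch += 1

    def acquire(self, timeout=None):
        # Remember which connection epoch the lock was taken under.
        self.lock_epoch = self.conn_epoch
        if self._lock.acquire(timeout=timeout):
            self._held.set()
        return self.have_lock()

    def have_lock(self):
        # The lock only counts if no state change happened since acquisition.
        return self._held.is_set() and self.lock_epoch == self.conn_epoch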
Example #25
class PartitionClient(object):
    """ Client Class for the Partition Library
    Example usage:
    ---------------------
    import libpartition
    from libpartition.libpartition import PartitionClient

    def own_change_cb(l):
            print "ownership change:" + str(l)

    c = PartitionClient("test", "s1", ["s1", "s2", "s3"], 32, 
            own_change_cb, "zookeeper_s1")

    ## do some real work now
    if (c.own_partition(1)):
        ...... do something with partition #1 .....
        .........
    ...
    c.update_cluster_list(["s1", "s2"])
    ...
    ----------------------
    You should not call any partition library routine from within the 
    callback function

    Args:
        app_name(str): Name of the app for which partition cluster is used
        self_name(str): Name of the local cluster node (can be ip address)
        cluster_list(list): List of all the nodes in the cluster including 
            local node
        max_partition(int): Partition space always goes from 0..max_partition-1
        partition_update_cb: Callback function invoked when the partition
            ownership list is updated.
        zk_server(str): <zookeeper server>:<zookeeper server port>
    """
    def __init__(self,
                 app_name,
                 self_name,
                 cluster_list,
                 max_partition,
                 partition_update_cb,
                 zk_server,
                 logger=None):

        # Initialize local variables
        self._zk_server = zk_server
        self._cluster_list = set(cluster_list)
        self._max_partition = max_partition
        self._update_cb = partition_update_cb
        self._curr_part_ownership_list = []
        self._target_part_ownership_list = []
        self._con_hash = ConsistentHash(cluster_list)
        self._name = self_name

        # some sanity check
        if not (self._name in cluster_list):
            raise ValueError('cluster list is missing local server name')

        # initialize logging and other stuff
        if logger is None:
            logging.basicConfig()
            self._logger = logging
        else:
            self._logger = logger
        self._conn_state = None
        self._sandesh_connection_info_update(status='INIT', message='')

        # connect to zookeeper
        while True:
            self._logger.error("Libpartition zk start")
            self._zk = KazooClient(zk_server, timeout=60.0)
            self._zk.add_listener(self._zk_listen)
            try:
                self._zk.start()
                while self._conn_state != ConnectionStatus.UP:
                    gevent.sleep(1)
                break
            except Exception as e:
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                     message=str(e))
                self._zk.remove_listener(self._zk_listen)
                try:
                    self._zk.stop()
                    self._zk.close()
                except Exception as ex:
                    template = "Exception {0} in Libpartition zk stop/close. Args:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s" % \
                        (messag, traceback.format_exc(), self._name))
                finally:
                    self._zk = None
                gevent.sleep(1)

        # create a lock array to contain locks for each partition
        self._part_locks = []
        for part in range(0, self._max_partition):
            lockpath = "/lockpath/" + app_name + "/" + str(part)
            l = self._zk.Lock(lockpath, self._name)
            self._part_locks.append(l)

        # initialize partition # to lock acquire greenlet dictionary
        self._part_lock_task_dict = {}

        self._logger.error("initial servers:" + str(self._cluster_list))

        # update target partition ownership list
        for part in range(0, self._max_partition):
            if (self._con_hash.get_node(str(part)) == self._name):
                self._target_part_ownership_list.append(part)

        # update current ownership list
        self._acquire_partition_ownership()

    #end __init__

    def _sandesh_connection_info_update(self, status, message):
        new_conn_state = getattr(ConnectionStatus, status)
        ConnectionState.update(conn_type=ConnectionType.ZOOKEEPER,
                               name='Zookeeper',
                               status=new_conn_state,
                               message=message,
                               server_addrs=self._zk_server.split(','))

        if (self._conn_state and self._conn_state != ConnectionStatus.DOWN
                and new_conn_state == ConnectionStatus.DOWN):
            msg = 'Connection to Zookeeper down: %s' % (message)
            self._logger.error(msg)
        if (self._conn_state and self._conn_state != new_conn_state
                and new_conn_state == ConnectionStatus.UP):
            msg = 'Connection to Zookeeper ESTABLISHED'
            self._logger.error(msg)

        self._conn_state = new_conn_state

    # end _sandesh_connection_info_update

    def _zk_listen(self, state):
        self._logger.error("Libpartition listen %s" % str(state))
        if state == KazooState.CONNECTED:
            # Update connection info
            self._sandesh_connection_info_update(status='UP', message='')
        elif state == KazooState.LOST:
            self._logger.error("Libpartition connection LOST")
            # Lost the session with ZooKeeper Server
            # Best of option we have is to exit the process and restart all
            # over again
            self._sandesh_connection_info_update(
                status='DOWN', message='Connection to Zookeeper lost')
            os._exit(2)
        elif state == KazooState.SUSPENDED:
            self._logger.error("Libpartition connection SUSPENDED")
            # Update connection info
            self._sandesh_connection_info_update(
                status='INIT',
                message='Connection to zookeeper lost. Retrying')

    # following routine is the greenlet task function to acquire the lock
    # for a partition
    def _acquire_lock(self, part):
        # lock for the partition
        l = self._part_locks[part]

        # go in an infinite loop waiting to acquire the lock
        try:
            while True:
                ret = l.acquire(blocking=False)
                if ret:
                    self._logger.error("Acquired lock for:" + str(part))
                    self._curr_part_ownership_list.append(part)
                    self._update_cb(self._curr_part_ownership_list)
                    return True
                else:
                    gevent.sleep(1)
        except CancelledError:
            self._logger.error("Lock acquire cancelled for:" + str(part))
            return False
        except Exception as ex:
            # TODO: If we have a non-KazooException, the lock object
            #       may get stuck in the "cancelled" state
            self._logger.error("Lock acquire unexpected error!: " + str(ex))
            # This exception should get propagated to the main thread
            raise SystemExit(1)

    #end _acquire_lock

    # get rid of finished spawned tasks from datastructures
    def _cleanup_greenlets(self):
        for part in list(self._part_lock_task_dict.keys()):
            if (self._part_lock_task_dict[part].ready()):
                del self._part_lock_task_dict[part]

    #end _cleanup_greenlets

    # following routine launches tasks to acquire partition locks
    def _acquire_partition_ownership(self):
        # cleanup any finished greenlets
        self._cleanup_greenlets()

        # this variable will help us decide if we need to call callback
        updated_curr_ownership = False

        # list of partitions for which locks have to be released
        release_lock_list = []

        self._logger.info("known servers: %s" % self._con_hash.get_all_nodes())

        for part in range(0, self._max_partition):
            if (part in self._target_part_ownership_list):
                if (part in self._curr_part_ownership_list):
                    # do nothing, I already have ownership of this partition
                    self._logger.info("No need to acquire ownership of:" +
                                      str(part))
                else:
                    # I need to acquire lock for this partition before I own
                    if (part in list(self._part_lock_task_dict.keys())):
                        try:
                            self._part_lock_task_dict[part].get(block=False)
                        except:
                            # do nothing there is already a greenlet running to
                            # acquire the lock
                            self._logger.error("Already a greenlet running to"
                                               " acquire:" + str(part))
                            continue

                        # Greenlet died without getting ownership. Cleanup
                        self._logger.error("Cleanup stale greenlet running to"
                                           " acquire:" + str(part))
                        del self._part_lock_task_dict[part]

                    self._logger.error("Starting greenlet running to"
                                       " acquire:" + str(part))
                    # launch the greenlet to acquire the lock
                    g = Greenlet.spawn(self._acquire_lock, part)
                    self._part_lock_task_dict[part] = g

            else:
                # give up ownership of the partition

                # cancel any lock acquisition which is ongoing
                if (part in list(self._part_lock_task_dict.keys())):
                    try:
                        self._part_lock_task_dict[part].get(block=False)
                    except:
                        self._logger.error(
                            "canceling lock acquisition going on for:" +
                            str(part))
                        # Cancelling the lock should result in killing the gevent
                        self._part_locks[part].cancel()
                        self._part_lock_task_dict[part].get(block=True)

                    del self._part_lock_task_dict[part]

                if (part in self._curr_part_ownership_list):
                    release_lock_list.append(part)
                    self._curr_part_ownership_list.remove(part)
                    updated_curr_ownership = True
                    self._logger.error("giving up ownership of:" + str(part))

        if (updated_curr_ownership is True):
            # current partition membership was updated call the callback
            self._update_cb(self._curr_part_ownership_list)

        if (len(release_lock_list) != 0):
            # release locks which were acquired
            for part in release_lock_list:
                self._logger.error("release the lock which was acquired:" + \
                        str(part))
                try:
                    self._part_locks[part].release()
                    self._logger.error("fully gave up ownership of:" +
                                       str(part))
                except:
                    pass

    #end _acquire_partition_ownership

    def update_cluster_list(self, cluster_list):
        """ Updates the cluster node list
        Args:
            cluster_list(list): New list of names of the nodes in 
                the cluster
        Returns:
            None
        """
        # some sanity check
        if not (self._name in cluster_list):
            raise ValueError('cluster list is missing local server name')

        new_cluster_list = set(cluster_list)
        new_servers = list(new_cluster_list.difference(self._cluster_list))
        deleted_servers = list(
            set(self._cluster_list).difference(new_cluster_list))
        self._cluster_list = set(cluster_list)

        # update the hash structure
        if new_servers:
            self._logger.error("new servers:" + str(new_servers))
            self._con_hash.add_nodes(new_servers)
        if deleted_servers:
            self._logger.error("deleted servers:" + str(deleted_servers))
            self._con_hash.del_nodes(deleted_servers)

        # update target partition ownership list
        self._target_part_ownership_list = []
        for part in range(0, self._max_partition):
            if (self._con_hash.get_node(str(part)) == self._name):
                if not (part in self._target_part_ownership_list):
                    self._target_part_ownership_list.append(part)

        # update current ownership list
        self._acquire_partition_ownership()

    #end update_cluster_list

    def own_partition(self, part_no):
        """ Returns ownership information of a partition
        Args:
            part_no(int) : Partition no 
        Returns:
            True if partition is owned by the local node
            False if partition is not owned by the local node
        """
        return part_no in self._curr_part_ownership_list

    #end own_partition

    def close(self):
        """ Closes any connections and frees up any data structures
        Args:
        Returns:
            None
        """
        # clean up greenlets
        for part in list(self._part_lock_task_dict.keys()):
            try:
                self._logger.error("libpartition greenlet cleanup %s" %
                                   str(part))
                self._part_lock_task_dict[part].kill()
            except:
                pass

        self._zk.remove_listener(self._zk_listen)
        gevent.sleep(1)
        self._logger.error("Stopping libpartition")
        # close zookeeper
        try:
            self._zk.stop()
        except:
            self._logger.error("Stopping libpartition failed")
        else:
            self._logger.error("Stopping libpartition successful")

        self._logger.error("Closing libpartition")
        try:
            self._zk.close()
        except:
            self._logger.error("Closing libpartition failed")
        else:
            self._logger.error("Closing libpartition successful")
Example #26
0
  elif state == KazooState.SUSPENDED:
    print >>stderr, 'Connection to Zookeeper lost... Retrying...'
  else:
    print >>stderr, 'Connected.'

zk.start()

base_zk_path = '%s/%s' % (service_ns, service_id)

def resolve_path(path):
  rel_path = relpath(path, config_dir)
  return base_zk_path if rel_path == '.' else join(base_zk_path, rel_path)

if exists(config_dir) and isdir(config_dir):
  print >>stderr, 'Acquiring access lock...'
  with zk.Lock(base_zk_path + '.lock', node_id):
    for dirname, dirs, files in os.walk(config_dir):
      zk.ensure_path(resolve_path(dirname))
      print >>stderr, '  Directory zk://' + resolve_path(dirname)
      for filename in files:
        filename = join(dirname, filename)
        config_path = resolve_path(filename)
        value = open(filename, 'rb').read()
        if zk.exists(config_path):
          print >>stderr, '   Updating zk://%s from %s [%d bytes]' % (config_path, filename, len(value))
          zk.retry(zk.set, config_path, value)
        else:
          print >>stderr, '   Creating zk://%s from %s [%d bytes]' % (config_path, filename, len(value))
          zk.retry(zk.create, config_path, value)
else:
  print >>stderr, 'Invalid configuration directory'
Example #27
0
File: lock.py Project: pspk/ARK
class Lock(object):
    """
    Distributed lock module
    """
    def __init__(self, name):
        """
        Initialization method

        :param str name: name of the distributed lock
        :return: None
        :rtype: None
        :raises kazoo.interfaces.IHandler.timeout_exception: connection timeout
        """
        self._lock_name = name
        self._lock_node_path = config.GuardianConfig.get_persistent_path(
            "lock")
        self._lock_node = self._lock_node_path + '/' + self._lock_name
        self._lock_handle = None

        hosts = config.GuardianConfig.get(config.STATE_SERVICE_HOSTS_NAME)
        self._zkc = KazooClient(hosts=hosts)
        self._zkc.start()

    def create(self):
        """
        Create the distributed lock

        :return: distributed lock handle
        :rtype: Kazoo lock
        """
        if not self._lock_handle:
            self._lock_handle = self._zkc.Lock(self._lock_node)

        return self._lock_handle

    def delete(self):
        """
        Delete the distributed lock

        :return: None
        :rtype: None
        :raises kazoo.exceptions.NoNodeError: lock does not exist
        :raises kazoo.exceptions.NotEmptyError: lock is in use
        :raises kazoo.exceptions.ZookeeperError: ZooKeeper connection error
        """
        if not self._lock_handle:
            self._zkc.delete(self._lock_node)
            self._lock_handle = None

    def obtain(self):
        """
        Acquire the lock; this call blocks until the lock is acquired

        :return: None
        :rtype: None
        """
        self._lock_handle.acquire()

    def obtain_wait(self, timeout):
        """
        Acquire the lock; returns once the lock is acquired within timeout
        seconds, otherwise raises an exception

        :param int timeout: timeout in seconds for acquiring the lock
        :return: None
        :rtype: None
        :raises kazoo.exceptions.LockTimeout: timed out acquiring the lock
        """
        self._lock_handle.acquire(timeout=timeout)

    def release(self):
        """
        Release the lock

        :return: None
        :rtype: None
        """
        self._lock_handle.release()

    def retain(self):
        """
        Lock re-entry; not implemented yet
        """
        pass
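A hypothetical usage sketch for this wrapper; it assumes GuardianConfig has already been configured with the ZooKeeper hosts (STATE_SERVICE_HOSTS_NAME), and the lock name below is made up.

# Hypothetical usage of the Lock wrapper defined above.
lock = Lock("demo_lock")      # connects to ZooKeeper via GuardianConfig
lock.create()                 # lazily builds the underlying Kazoo lock handle
try:
    lock.obtain_wait(timeout=10)   # raises kazoo.exceptions.LockTimeout on timeout
    # ... critical section ...
finally:
    lock.release()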
Example #28
0
# coding=utf-8
"""
Distributed lock
"""

from kazoo.client import KazooClient
import time
from uuid import uuid4

my_id = uuid4()
zk = KazooClient(hosts="127.0.0.1:2181")

zk.start()

lock = zk.Lock("/test/lock", my_id)

def work():
    print "i am %s" % uuid4

while True:
    with lock:
        work()
    time.sleep(3)

zk.stop()
Example #29
0
def main():
  """ Starts the AdminServer. """
  logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

  parser = argparse.ArgumentParser(
    prog='appscale-admin', description='Manages AppScale-related processes')
  subparsers = parser.add_subparsers(dest='command')
  subparsers.required = True

  serve_parser = subparsers.add_parser(
    'serve', description='Starts the server that manages AppScale processes')
  serve_parser.add_argument(
    '-p', '--port', type=int, default=constants.DEFAULT_PORT,
    help='The port to listen on')
  serve_parser.add_argument(
    '-v', '--verbose', action='store_true', help='Output debug-level logging')

  subparsers.add_parser(
    'summary', description='Lists AppScale processes running on this machine')
  restart_parser = subparsers.add_parser(
    'restart',
    description='Restart AppScale processes running on this machine')
  restart_parser.add_argument('service', nargs='+',
                              help='The process or service ID to restart')

  args = parser.parse_args()
  if args.command == 'summary':
    table = sorted(list(get_combined_services().items()))
    print(tabulate(table, headers=['Service', 'State']))
    sys.exit(0)

  if args.command == 'restart':
    socket_path = urlquote(ServiceManagerHandler.SOCKET_PATH, safe='')
    session = requests_unixsocket.Session()
    response = session.post(
      'http+unix://{}/'.format(socket_path),
      data={'command': 'restart', 'arg': [args.service]})
    response.raise_for_status()
    return

  if args.verbose:
    logger.setLevel(logging.DEBUG)

  options.define('secret', appscale_info.get_secret())
  options.define('login_ip', appscale_info.get_login_ip())
  options.define('private_ip', appscale_info.get_private_ip())
  options.define('load_balancers', appscale_info.get_load_balancer_ips())

  acc = appscale_info.get_appcontroller_client()
  ua_client = UAClient(appscale_info.get_db_master_ip(), options.secret)
  zk_client = KazooClient(
    hosts=','.join(appscale_info.get_zk_node_ips()),
    connection_retry=ZK_PERSISTENT_RECONNECTS)
  zk_client.start()
  version_update_lock = zk_client.Lock(constants.VERSION_UPDATE_LOCK_NODE)
  thread_pool = ThreadPoolExecutor(4)
  monit_operator = MonitOperator()
  all_resources = {
    'acc': acc,
    'ua_client': ua_client,
    'zk_client': zk_client,
    'version_update_lock': version_update_lock,
    'thread_pool': thread_pool
  }

  if options.private_ip in appscale_info.get_taskqueue_nodes():
    logger.info('Starting push worker manager')
    GlobalPushWorkerManager(zk_client, monit_operator)

  service_manager = ServiceManager(zk_client)
  service_manager.start()

  app = web.Application([
    ('/oauth/token', OAuthHandler, {'ua_client': ua_client}),
    ('/v1/apps/([a-z0-9-]+)/services/([a-z0-9-]+)/versions', VersionsHandler,
     {'ua_client': ua_client, 'zk_client': zk_client,
      'version_update_lock': version_update_lock, 'thread_pool': thread_pool}),
    ('/v1/projects', ProjectsHandler, all_resources),
    ('/v1/projects/([a-z0-9-]+)', ProjectHandler, all_resources),
    ('/v1/apps/([a-z0-9-]+)/services/([a-z0-9-]+)', ServiceHandler,
     all_resources),
    ('/v1/apps/([a-z0-9-]+)/services/([a-z0-9-]+)/versions/([a-z0-9-]+)',
     VersionHandler, all_resources),
    ('/v1/apps/([a-z0-9-]+)/operations/([a-z0-9-]+)', OperationsHandler,
     {'ua_client': ua_client}),
    ('/api/cron/update', UpdateCronHandler,
     {'acc': acc, 'zk_client': zk_client, 'ua_client': ua_client}),
    ('/api/datastore/index/add', UpdateIndexesHandler,
     {'zk_client': zk_client, 'ua_client': ua_client}),
    ('/api/queue/update', UpdateQueuesHandler,
     {'zk_client': zk_client, 'ua_client': ua_client})
  ])
  logger.info('Starting AdminServer')
  app.listen(args.port)

  management_app = web.Application([
    ('/', ServiceManagerHandler, {'service_manager': service_manager})])
  management_server = HTTPServer(management_app)
  management_socket = bind_unix_socket(ServiceManagerHandler.SOCKET_PATH)
  management_server.add_socket(management_socket)

  io_loop = IOLoop.current()
  io_loop.start()
Example #30
0
class Coordinator(object):
    def __init__(self, zk_hosts, hostname, port, join_cluster):
        self.me = '%s:%s' % (hostname, port)
        self.is_master = None
        self.slaves = cycle([])
        self.slave_count = 0
        self.started_shutdown = False

        if join_cluster:
            read_only = False
        else:
            read_only = True

        self.zk = KazooClient(hosts=zk_hosts,
                              handler=SequentialGeventHandler(),
                              read_only=read_only)
        event = self.zk.start_async()
        event.wait(timeout=5)

        self.lock = self.zk.Lock(path='/iris/sender_master',
                                 identifier=self.me)

        # Used to keep track of slaves / senders present in cluster
        self.party = Party(client=self.zk,
                           path='/iris/sender_nodes',
                           identifier=self.me)

        if join_cluster:
            self.zk.add_listener(self.event_listener)
            self.party.join()

    def am_i_master(self):
        return self.is_master

    # Used for API to get the current master
    def get_current_master(self):
        try:
            contenders = self.lock.contenders()
        except kazoo.exceptions.KazooException:
            logger.exception('Failed getting contenders')
            return None

        if contenders:
            return self.address_to_tuple(contenders[0])
        else:
            return None

    # Used for API to get the current slaves if master can't be reached
    def get_current_slaves(self):
        return [self.address_to_tuple(host) for host in self.party]

    def address_to_tuple(self, address):
        try:
            host, port = address.split(':')
            return host, int(port)
        except (IndexError, ValueError):
            logger.error('Failed getting address tuple from %s', address)
            return None

    def update_status(self):
        if self.started_shutdown:
            return

        if self.zk.state == KazooState.CONNECTED:
            if self.lock.is_acquired:
                self.is_master = True
            else:
                try:
                    self.is_master = self.lock.acquire(blocking=False,
                                                       timeout=2)

                # This one is expected when we're recovering from ZK being down
                except kazoo.exceptions.CancelledError:
                    self.is_master = False

                except kazoo.exceptions.LockTimeout:
                    self.is_master = False
                    logger.exception(
                        'Failed trying to acquire lock (shouldn\'t happen as we\'re using nonblocking locks)'
                    )

                except kazoo.exceptions.KazooException:
                    self.is_master = False
                    logger.exception(
                        'ZK problem while trying to acquire lock')
        else:
            logger.error('ZK connection is in %s state', self.zk.state)
            self.is_master = False

        if self.zk.state == KazooState.CONNECTED:

            if self.is_master:
                slaves = [
                    self.address_to_tuple(host) for host in self.party
                    if host != self.me
                ]
                self.slave_count = len(slaves)
                self.slaves = cycle(slaves)
            else:
                self.slaves = cycle([])
                self.slave_count = 0

            # Keep us as part of the party, so the current master sees us as a slave
            if not self.party.participating:
                try:
                    self.party.join()
                except kazoo.exceptions.KazooException:
                    logger.exception('ZK problem while trying to join party')
        else:
            self.slaves = cycle([])
            self.slave_count = 0

    def update_forever(self):
        while True:
            if self.started_shutdown:
                return

            old_status = self.is_master
            self.update_status()
            new_status = self.is_master

            if old_status != new_status:
                log = logger.info
            else:
                log = logger.debug

            if self.is_master:
                log('I am the master sender')
            else:
                log('I am a slave sender')

            metrics.set('slave_instance_count', self.slave_count)
            metrics.set('is_master_sender', int(self.is_master is True))

            sleep(UPDATE_FREQUENCY)

    def leave_cluster(self):
        self.started_shutdown = True

        # cancel any attempts to acquire master lock which could make us hang
        self.lock.cancel()

        if self.zk.state == KazooState.CONNECTED:
            if self.party and self.party.participating:
                logger.info('Leaving party')
                self.party.leave()
            if self.lock and self.lock.is_acquired:
                logger.info('Releasing lock')
                self.lock.release()

    def event_listener(self, state):
        if state == KazooState.LOST or state == KazooState.SUSPENDED:
            logger.info(
                'ZK state transitioned to %s. Resetting master status.', state)

            # cancel pending attempts to acquire lock which will break and leave
            # us in bad state
            self.lock.cancel()

            # make us try to re-acquire lock during next iteration when we're connected
            if self.lock.is_acquired:
                self.lock.is_acquired = False

            # make us try to rejoin the party during next iteration when we're connected
            if self.party.participating:
                self.party.participating = False

            # in the meantime we're not master
            self.is_master = None