def __init__(self, cluster_name):
        self.available = False
        self.cluster_name = cluster_name if cluster_name is not None else 'unknown'
        self._retry = Retry(deadline=300,
                            max_delay=30,
                            max_tries=-1,
                            retry_exceptions=(boto.exception.StandardError, ))
        try:
            # get the instance id
            r = requests.get(
                'http://169.254.169.254/latest/dynamic/instance-identity/document',
                timeout=2.1)
        except RequestException:
            logger.error('cannot query AWS meta-data')
            return

        if r.ok:
            try:
                content = r.json()
                self.instance_id = content['instanceId']
                self.region = content['region']
            except Exception:
                logger.exception(
                    'unable to fetch instance id and region from AWS meta-data'
                )
                return
            self.available = True
Example #2
0
    def __init__(self, config):
        self._labels = config['labels']
        self._labels[config.get('scope_label', 'cluster-name')] = config['scope']
        self._label_selector = ','.join('{0}={1}'.format(k, v) for k, v in self._labels.items())
        self._namespace = config.get('namespace') or 'default'
        self._role_label = config.get('role_label', 'role')
        config['namespace'] = ''
        super(Kubernetes, self).__init__(config)
        self._retry = Retry(deadline=config['retry_timeout'], max_delay=1, max_tries=-1,
                            retry_exceptions=(KubernetesRetriableException, HTTPException,
                                              HTTPError, socket.error, socket.timeout))
        self._ttl = None
        try:
            k8s_config.load_incluster_config()
        except k8s_config.ConfigException:
            k8s_config.load_kube_config(context=config.get('context', 'local'))

        self.__subsets = None
        use_endpoints = config.get('use_endpoints') and (config.get('patronictl') or 'pod_ip' in config)
        if use_endpoints:
            addresses = [k8s_client.V1EndpointAddress(ip=config['pod_ip'])]
            ports = []
            for p in config.get('ports', [{}]):
                port = {'port': int(p.get('port', '5432'))}
                port.update({n: p[n] for n in ('name', 'protocol') if p.get(n)})
                ports.append(k8s_client.V1EndpointPort(**port))
            self.__subsets = [k8s_client.V1EndpointSubset(addresses=addresses, ports=ports)]
        self._api = CoreV1ApiProxy(use_endpoints)
        self.set_retry_timeout(config['retry_timeout'])
        self.set_ttl(config.get('ttl') or 30)
        self._leader_observed_record = {}
        self._leader_observed_time = None
        self._leader_resource_version = None
        self._leader_observed_subsets = []
        self.__do_not_watch = False
Example #3
0
    def test_copy(self):
        def _sleep(t):
            pass

        retry = Retry(sleep_func=_sleep)
        rcopy = retry.copy()
        self.assertTrue(rcopy.sleep_func is _sleep)
Example #4
0
    def __init__(self, config):
        super(Consul, self).__init__(config)
        self._scope = config['scope']
        self._session = None
        self.__do_not_watch = False
        self._retry = Retry(deadline=config['retry_timeout'],
                            max_delay=1,
                            max_tries=-1,
                            retry_exceptions=(ConsulInternalError,
                                              HTTPException, HTTPError,
                                              socket.error, socket.timeout))

        kwargs = {}
        if 'url' in config:
            r = urlparse(config['url'])
            config.update({
                'scheme': r.scheme,
                'host': r.hostname,
                'port': r.port or 8500
            })
        elif 'host' in config:
            host, port = split_host_port(config.get('host', '127.0.0.1:8500'),
                                         8500)
            config['host'] = host
            if 'port' not in config:
                config['port'] = int(port)

        if config.get('cacert'):
            config['ca_cert'] = config.pop('cacert')

        if config.get('key') and config.get('cert'):
            config['cert'] = (config['cert'], config['key'])

        config_keys = ('host', 'port', 'token', 'scheme', 'cert', 'ca_cert',
                       'dc')
        kwargs = {p: config.get(p) for p in config_keys if config.get(p)}

        verify = config.get('verify')
        if not isinstance(verify, bool):
            verify = parse_bool(verify)
        if isinstance(verify, bool):
            kwargs['verify'] = verify

        self._client = ConsulClient(**kwargs)
        self.set_retry_timeout(config['retry_timeout'])
        self.set_ttl(config.get('ttl') or 30)
        self._last_session_refresh = 0
        self.__session_checks = config.get('checks')
        self._register_service = config.get('register_service', False)
        if self._register_service:
            self._service_name = service_name_from_scope_name(self._scope)
            if self._scope != self._service_name:
                logger.warning(
                    'Using %s as consul service name instead of scope name %s',
                    self._service_name, self._scope)
        self._service_check_interval = config.get('service_check_interval',
                                                  '5s')
        if not self._ctl:
            self.create_session()
Example #5
0
    def __init__(self, config):
        self.name = config['name']
        self.scope = config['scope']
        self._data_dir = config['data_dir']
        self._database = config.get('database', 'postgres')
        self._version_file = os.path.join(self._data_dir, 'PG_VERSION')
        self._pg_control = os.path.join(self._data_dir, 'global', 'pg_control')
        self._major_version = self.get_major_version()

        self._state_lock = Lock()
        self.set_state('stopped')

        self._pending_restart = False
        self._connection = Connection()
        self.config = ConfigHandler(self, config)
        self.config.check_directories()

        self._bin_dir = config.get('bin_dir') or ''
        self.bootstrap = Bootstrap(self)
        self.bootstrapping = False
        self.__thread_ident = current_thread().ident

        self.slots_handler = SlotsHandler(self)

        self._callback_executor = CallbackExecutor()
        self.__cb_called = False
        self.__cb_pending = None

        self.cancellable = CancellableSubprocess()

        self._sysid = None
        self.retry = Retry(max_tries=-1, deadline=config['retry_timeout']/2.0, max_delay=1,
                           retry_exceptions=PostgresConnectionException)

        # Retry 'pg_is_in_recovery()' only once
        self._is_leader_retry = Retry(max_tries=1, deadline=config['retry_timeout']/2.0, max_delay=1,
                                      retry_exceptions=PostgresConnectionException)

        self._role_lock = Lock()
        self.set_role(self.get_postgres_role_from_data_directory())
        self._state_entry_timestamp = None

        self._cluster_info_state = {}
        self._cached_replica_timeline = None

        # Last known running process
        self._postmaster_proc = None

        if self.is_running():
            self.set_state('running')
            self.set_role('master' if self.is_leader() else 'replica')
            self.config.write_postgresql_conf()  # we are "joining" already running postgres
            hba_saved = self.config.replace_pg_hba()
            ident_saved = self.config.replace_pg_ident()
            if hba_saved or ident_saved:
                self.reload()
        elif self.role == 'master':
            self.set_role('demoted')
Example #6
0
 def __init__(self, name, config):
     super(Etcd, self).__init__(name, config)
     self.ttl = config.get('ttl', 30)
     self._retry = Retry(deadline=10, max_delay=1, max_tries=-1,
                         retry_exceptions=(etcd.EtcdConnectionFailed,
                                           etcd.EtcdLeaderElectionInProgress,
                                           etcd.EtcdWatcherCleared,
                                           etcd.EtcdEventIndexCleared))
     self._client = self.get_etcd_client(config)
Example #7
0
 def __init__(self, config):
     super(Etcd, self).__init__(config)
     self._ttl = int(config.get('ttl') or 30)
     self._retry = Retry(deadline=config['retry_timeout'], max_delay=1, max_tries=-1,
                         retry_exceptions=(etcd.EtcdLeaderElectionInProgress,
                                           etcd.EtcdWatcherCleared,
                                           etcd.EtcdEventIndexCleared))
     self._client = self.get_etcd_client(config)
     self.__do_not_watch = False
Example #8
0
 def __init__(self, config):
     super(Etcd, self).__init__(config)
     self._ttl = int(config.get('ttl') or 30)
     self._retry = Retry(
         deadline=config['retry_timeout'],
         max_delay=1,
         max_tries=-1,
         retry_exceptions=(etcd.EtcdLeaderElectionInProgress,
                           EtcdRaftInternal))
     self._client = self.get_etcd_client(config)
     self.__do_not_watch = False
     self._has_failed = False
Example #9
0
    def __init__(self, config):
        super(Consul, self).__init__(config)
        self._scope = config['scope']
        self._session = None
        self.__do_not_watch = False
        self._retry = Retry(deadline=config['retry_timeout'],
                            max_delay=1,
                            max_tries=-1,
                            retry_exceptions=(ConsulInternalError,
                                              HTTPException, HTTPError,
                                              socket.error, socket.timeout))

        self._my_member_data = None
        kwargs = {}
        if 'url' in config:
            r = urlparse(config['url'])
            config.update({
                'scheme': r.scheme,
                'host': r.hostname,
                'port': r.port or 8500
            })
        elif 'host' in config:
            host, port = (config.get('host', '127.0.0.1:8500') +
                          ':8500').split(':')[:2]
            config['host'] = host
            if 'port' not in config:
                config['port'] = int(port)

        if config.get('cacert'):
            config['ca_cert'] = config.pop('cacert')

        if config.get('key') and config.get('cert'):
            config['cert'] = (config['cert'], config['key'])

        config_keys = ('host', 'port', 'token', 'scheme', 'cert', 'ca_cert',
                       'dc')
        kwargs = {p: config.get(p) for p in config_keys if config.get(p)}

        verify = config.get('verify')
        if not isinstance(verify, bool):
            verify = parse_bool(verify)
        if isinstance(verify, bool):
            kwargs['verify'] = verify

        self._client = ConsulClient(**kwargs)
        self.set_retry_timeout(config['retry_timeout'])
        self.set_ttl(config.get('ttl') or 30)
        self._last_session_refresh = 0
        self.__session_checks = config.get('checks')
        if not self._ctl:
            self.create_session()
Example #10
0
    def __init__(self, config):
        self._labels = config['labels']
        self._labels[config.get('scope_label', 'cluster-name')] = config['scope']
        self._label_selector = ','.join('{0}={1}'.format(k, v) for k, v in self._labels.items())
        self._namespace = config.get('namespace') or 'default'
        self._role_label = config.get('role_label', 'role')
        config['namespace'] = ''
        super(Kubernetes, self).__init__(config)
        self._retry = Retry(deadline=config['retry_timeout'], max_delay=1, max_tries=-1,
                            retry_exceptions=(KubernetesRetriableException, HTTPException,
                                              HTTPError, socket.error, socket.timeout))
        self._ttl = None
        try:
            k8s_config.load_incluster_config()
        except k8s_config.ConfigException:
            k8s_config.load_kube_config(context=config.get('context', 'local'))

        self.__subsets = None
        use_endpoints = config.get('use_endpoints') and (config.get('patronictl') or 'pod_ip' in config)
        if use_endpoints:
            addresses = [k8s_client.V1EndpointAddress(ip=config['pod_ip'])]
            ports = []
            for p in config.get('ports', [{}]):
                port = {'port': int(p.get('port', '5432'))}
                port.update({n: p[n] for n in ('name', 'protocol') if p.get(n)})
                ports.append(k8s_client.V1EndpointPort(**port))
            self.__subsets = [k8s_client.V1EndpointSubset(addresses=addresses, ports=ports)]
        self._api = CoreV1ApiProxy(use_endpoints)
        self.set_retry_timeout(config['retry_timeout'])
        self.set_ttl(config.get('ttl') or 30)
        self._leader_observed_record = {}
        self._leader_observed_time = None
        self._leader_resource_version = None
        self._leader_observed_subsets = []
        self.__do_not_watch = False
Example #11
0
    def test_api_execute(self, mock_machines):
        mock_machines.__get__ = Mock(return_value=['http://localhost:4001', 'http://localhost:2379'])
        self.client._base_uri = 'http://localhost:4001'
        self.assertRaises(etcd.EtcdException, self.client.api_execute, '/', 'POST', timeout=0)
        self.client._base_uri = 'http://localhost:4001'
        rtry = Retry(deadline=10, max_delay=1, max_tries=-1, retry_exceptions=(etcd.EtcdLeaderElectionInProgress,))
        rtry(self.client.api_execute, '/', 'POST', timeout=0, params={'retry': rtry})
        self.client._machines_cache_updated = 0
        self.client.api_execute('/', 'POST', timeout=0)
        self.client._machines_cache = [self.client._base_uri]
        self.assertRaises(etcd.EtcdWatchTimedOut, self.client.api_execute, '/timeout', 'POST', params={'wait': 'true'})
        self.assertRaises(etcd.EtcdWatchTimedOut, self.client.api_execute, '/timeout', 'POST', params={'wait': 'true'})

        with patch.object(EtcdClient, '_calculate_timeouts', Mock(side_effect=[(1, 1, 0), (1, 1, 0), (0, 1, 0)])),\
                patch.object(EtcdClient, '_load_machines_cache', Mock(side_effect=Exception)):
            self.client.http.request = Mock(side_effect=socket.error)
            self.assertRaises(etcd.EtcdException, rtry, self.client.api_execute, '/', 'GET', params={'retry': rtry})

        with patch.object(EtcdClient, '_calculate_timeouts', Mock(side_effect=[(1, 1, 0), (1, 1, 0), (0, 1, 0)])),\
                patch.object(EtcdClient, '_load_machines_cache', Mock(return_value=True)):
            self.assertRaises(etcd.EtcdException, rtry, self.client.api_execute, '/', 'GET', params={'retry': rtry})

        with patch.object(EtcdClient, '_do_http_request', Mock(side_effect=etcd.EtcdException)):
            self.client._read_timeout = 0.01
            self.assertRaises(etcd.EtcdException, self.client.api_execute, '/', 'GET')
Example #12
0
    def __init__(self, config):
        self.config = config
        self.name = config['name']
        self.scope = config['scope']
        self._database = config.get('database', 'postgres')
        self._data_dir = config['data_dir']
        self._pending_restart = False
        self._server_parameters = self.get_server_parameters(config)

        self._connect_address = config.get('connect_address')
        self._superuser = config['authentication'].get('superuser', {})
        self._replication = config['authentication']['replication']
        self.resolve_connection_addresses()

        self._need_rewind = False
        self._use_slots = config.get('use_slots', True)
        self._version_file = os.path.join(self._data_dir, 'PG_VERSION')
        self._major_version = self.get_major_version()
        self._schedule_load_slots = self.use_slots

        self._pgpass = config.get('pgpass') or os.path.join(
            os.path.expanduser('~'), 'pgpass')
        self.callback = config.get('callbacks') or {}
        config_base_name = config.get('config_base_name', 'postgresql')
        self._postgresql_conf = os.path.join(self._data_dir,
                                             config_base_name + '.conf')
        self._postgresql_base_conf_name = config_base_name + '.base.conf'
        self._postgresql_base_conf = os.path.join(
            self._data_dir, self._postgresql_base_conf_name)
        self._recovery_conf = os.path.join(self._data_dir, 'recovery.conf')
        self._configuration_to_save = (self._postgresql_conf,
                                       self._postgresql_base_conf,
                                       os.path.join(self._data_dir,
                                                    'pg_hba.conf'))
        self._postmaster_pid = os.path.join(self._data_dir, 'postmaster.pid')
        self._trigger_file = config.get('recovery_conf',
                                        {}).get('trigger_file') or 'promote'
        self._trigger_file = os.path.abspath(
            os.path.join(self._data_dir, self._trigger_file))

        self._connection = None
        self._cursor_holder = None
        self._sysid = None
        self._replication_slots = [
        ]  # list of already existing replication slots
        self.retry = Retry(max_tries=-1,
                           deadline=config['retry_timeout'] / 2.0,
                           max_delay=1,
                           retry_exceptions=PostgresConnectionException)

        self._state_lock = Lock()
        self.set_state('stopped')
        self._role_lock = Lock()
        self.set_role(self.get_postgres_role_from_data_directory())

        if self.is_running():
            self.set_state('running')
            self.set_role('master' if self.is_leader() else 'replica')
            self._write_postgresql_conf(
            )  # we are "joining" already running postgres
Example #13
0
    def __init__(self, config):
        super(Consul, self).__init__(config)
        self._scope = config['scope']
        self._session = None
        self.__do_not_watch = False
        self._retry = Retry(deadline=config['retry_timeout'], max_delay=1, max_tries=-1,
                            retry_exceptions=(ConsulInternalError, HTTPException,
                                              HTTPError, socket.error, socket.timeout))

        self._my_member_data = None
        host, port = config.get('host', '127.0.0.1:8500').split(':')
        self._client = ConsulClient(host=host, port=port)
        self.set_retry_timeout(config['retry_timeout'])
        self.set_ttl(config.get('ttl') or 30)
        self._last_session_refresh = 0
        if not self._ctl:
            self.create_session()
Example #14
0
 def __init__(self, name, config):
     super(Etcd, self).__init__(name, config)
     self.ttl = config.get('ttl', 30)
     self._retry = Retry(deadline=10, max_delay=1, max_tries=-1,
                         retry_exceptions=(etcd.EtcdConnectionFailed,
                                           etcd.EtcdLeaderElectionInProgress,
                                           etcd.EtcdWatcherCleared,
                                           etcd.EtcdEventIndexCleared))
     self.client = self.get_etcd_client(config)
Example #15
0
 def __init__(self, config):
     super(Etcd, self).__init__(config)
     self._ttl = int(config.get('ttl') or 30)
     self._retry = Retry(deadline=config['retry_timeout'], max_delay=1, max_tries=-1,
                         retry_exceptions=(etcd.EtcdLeaderElectionInProgress,
                                           etcd.EtcdWatcherCleared,
                                           etcd.EtcdEventIndexCleared))
     self._client = self.get_etcd_client(config)
     self.__do_not_watch = False
Example #16
0
    def __init__(self, config):
        self.config = config
        self.name = config['name']
        self.server_parameters = config.get('parameters', {})
        self.scope = config['scope']
        self.listen_addresses, self.port = config['listen'].split(':')
        self.data_dir = config['data_dir']
        self.replication = config['replication']
        self.superuser = config['superuser']
        self.admin = config['admin']
        self.initdb_options = config.get('initdb', [])
        self.pgpass = config.get('pgpass', None) or os.path.join(
            os.path.expanduser('~'), 'pgpass')
        self.pg_rewind = config.get('pg_rewind', {})
        self.callback = config.get('callbacks', {})
        self.use_slots = config.get('use_slots', True)
        self.schedule_load_slots = self.use_slots
        self.recovery_conf = os.path.join(self.data_dir, 'recovery.conf')
        self.configuration_to_save = (os.path.join(self.data_dir,
                                                   'pg_hba.conf'),
                                      os.path.join(self.data_dir,
                                                   'postgresql.conf'))
        self.postmaster_pid = os.path.join(self.data_dir, 'postmaster.pid')
        self.trigger_file = config.get('recovery_conf', {}).get(
            'trigger_file', None) or 'promote'
        self.trigger_file = os.path.abspath(
            os.path.join(self.data_dir, self.trigger_file))

        self._pg_ctl = ['pg_ctl', '-w', '-D', self.data_dir]

        self.local_address = self.get_local_address()
        connect_address = config.get('connect_address',
                                     None) or self.local_address
        self.connection_string = 'postgres://{username}:{password}@{connect_address}/postgres'.format(
            connect_address=connect_address, **self.replication)

        self._connection = None
        self._cursor_holder = None
        self._need_rewind = False
        self._sysid = None
        self.replication_slots = [
        ]  # list of already existing replication slots
        self.retry = Retry(max_tries=-1,
                           deadline=5,
                           max_delay=1,
                           retry_exceptions=PostgresConnectionException)

        self._state = 'stopped'
        self._state_lock = Lock()
        self._role = 'replica'
        self._role_lock = Lock()

        if self.is_running():
            self._state = 'running'
            self._role = 'master' if self.is_leader() else 'replica'
Example #17
0
class AWSConnection(object):

    def __init__(self, cluster_name):
        self.available = False
        self.cluster_name = cluster_name if cluster_name is not None else 'unknown'
        self._retry = Retry(deadline=300, max_delay=30, max_tries=-1, retry_exceptions=(boto.exception.StandardError,))
        try:
            # get the instance id
            r = requests_get('http://169.254.169.254/latest/dynamic/instance-identity/document', timeout=2.1)
        except Exception:
            logger.error('cannot query AWS meta-data')
            return

        if r.status < 400:
            try:
                content = json.loads(r.data.decode('utf-8'))
                self.instance_id = content['instanceId']
                self.region = content['region']
            except Exception:
                logger.exception('unable to fetch instance id and region from AWS meta-data')
                return
            self.available = True

    def retry(self, *args, **kwargs):
        return self._retry.copy()(*args, **kwargs)

    def aws_available(self):
        return self.available

    def _tag_ebs(self, conn, role):
        """ set tags, carrying the cluster name, instance role and instance id for the EBS storage """
        tags = {'Name': 'spilo_' + self.cluster_name, 'Role': role, 'Instance': self.instance_id}
        volumes = conn.get_all_volumes(filters={'attachment.instance-id': self.instance_id})
        conn.create_tags([v.id for v in volumes], tags)

    def _tag_ec2(self, conn, role):
        """ tag the current EC2 instance with a cluster role """
        tags = {'Role': role}
        conn.create_tags([self.instance_id], tags)

    def on_role_change(self, new_role):
        if not self.available:
            return False
        try:
            conn = self.retry(boto.ec2.connect_to_region, self.region)
            self.retry(self._tag_ec2, conn, new_role)
            self.retry(self._tag_ebs, conn, new_role)
        except RetryFailedError:
            logger.warning("Unable to communicate to AWS "
                           "when setting tags for the EC2 instance {0} "
                           "and attached EBS volumes".format(self.instance_id))
            return False
        return True
Example #18
0
class AWSConnection(object):

    def __init__(self, cluster_name):
        self.available = False
        self.cluster_name = cluster_name if cluster_name is not None else 'unknown'
        self._retry = Retry(deadline=300, max_delay=30, max_tries=-1, retry_exceptions=(boto.exception.StandardError,))
        try:
            # get the instance id
            r = requests.get('http://169.254.169.254/latest/dynamic/instance-identity/document', timeout=2.1)
        except RequestException:
            logger.error('cannot query AWS meta-data')
            return

        if r.ok:
            try:
                content = r.json()
                self.instance_id = content['instanceId']
                self.region = content['region']
            except Exception:
                logger.exception('unable to fetch instance id and region from AWS meta-data')
                return
            self.available = True

    def retry(self, *args, **kwargs):
        return self._retry.copy()(*args, **kwargs)

    def aws_available(self):
        return self.available

    def _tag_ebs(self, conn, role):
        """ set tags, carrying the cluster name, instance role and instance id for the EBS storage """
        tags = {'Name': 'spilo_' + self.cluster_name, 'Role': role, 'Instance': self.instance_id}
        volumes = conn.get_all_volumes(filters={'attachment.instance-id': self.instance_id})
        conn.create_tags([v.id for v in volumes], tags)

    def _tag_ec2(self, conn, role):
        """ tag the current EC2 instance with a cluster role """
        tags = {'Role': role}
        conn.create_tags([self.instance_id], tags)

    def on_role_change(self, new_role):
        if not self.available:
            return False
        try:
            conn = self.retry(boto.ec2.connect_to_region, self.region)
            self.retry(self._tag_ec2, conn, new_role)
            self.retry(self._tag_ebs, conn, new_role)
        except RetryFailedError:
            logger.warning("Unable to communicate to AWS "
                           "when setting tags for the EC2 instance {0} "
                           "and attached EBS volumes".format(self.instance_id))
            return False
        return True
Example #19
0
    def __init__(self, config):
        self.config = config
        self.name = config['name']
        self.database = config.get('database', 'postgres')
        self._server_parameters = self.get_server_parameters(config)
        self._listen_addresses, self._port = (config['listen'] + ':5432').split(':')[:2]

        self.scope = config['scope']
        self._data_dir = config['data_dir']
        self.replication = config['replication']
        self.superuser = config.get('superuser') or {}
        self.admin = config.get('admin') or {}

        self.initdb_options = config.get('initdb') or []
        self.pgpass = config.get('pgpass') or os.path.join(os.path.expanduser('~'), 'pgpass')
        self.pg_rewind = config.get('pg_rewind') or {}
        self.callback = config.get('callbacks') or {}
        self.use_slots = config.get('use_slots', True)
        self._schedule_load_slots = self.use_slots
        config_base_name = config.get('config_base_name', 'postgresql')
        self._postgresql_conf = os.path.join(self._data_dir, config_base_name + '.conf')
        self._postgresql_base_conf_name = config_base_name + '.base.conf'
        self._postgresql_base_conf = os.path.join(self._data_dir, self._postgresql_base_conf_name)
        self._recovery_conf = os.path.join(self._data_dir, 'recovery.conf')
        self._configuration_to_save = (self._postgresql_conf, self._postgresql_base_conf,
                                       os.path.join(self._data_dir, 'pg_hba.conf'))
        self._postmaster_pid = os.path.join(self._data_dir, 'postmaster.pid')
        self._trigger_file = config.get('recovery_conf', {}).get('trigger_file') or 'promote'
        self._trigger_file = os.path.abspath(os.path.join(self._data_dir, self._trigger_file))

        self._pg_ctl = ['pg_ctl', '-w', '-D', self._data_dir]

        self.local_address = self.get_local_address()
        connect_address = config.get('connect_address') or self.local_address
        self.connection_string = 'postgres://{username}:{password}@{connect_address}/{database}'.format(
            connect_address=connect_address, database=self.database, **self.replication)

        self._connection = None
        self._cursor_holder = None
        self._sysid = None
        self._replication_slots = []  # list of already existing replication slots
        self.retry = Retry(max_tries=-1, deadline=5, max_delay=1, retry_exceptions=PostgresConnectionException)

        self._state_lock = Lock()
        self.set_state('stopped')
        self._role_lock = Lock()
        self.set_role(self.get_postgres_role_from_data_directory())

        if self.is_running():
            self.set_state('running')
            self.set_role('master' if self.is_leader() else 'replica')
            self._write_postgresql_conf()  # we are "joining" already running postgres
Example #20
0
    def __init__(self, config):
        super(Consul, self).__init__(config)
        self._scope = config['scope']
        self._session = None
        self.__do_not_watch = False
        self._retry = Retry(deadline=config['retry_timeout'], max_delay=1, max_tries=-1,
                            retry_exceptions=(ConsulInternalError, HTTPException,
                                              HTTPError, socket.error, socket.timeout))

        self._my_member_data = {}
        kwargs = {}
        if 'url' in config:
            r = urlparse(config['url'])
            config.update({'scheme': r.scheme, 'host': r.hostname, 'port': r.port or 8500})
        elif 'host' in config:
            host, port = split_host_port(config.get('host', '127.0.0.1:8500'), 8500)
            config['host'] = host
            if 'port' not in config:
                config['port'] = int(port)

        if config.get('cacert'):
            config['ca_cert'] = config.pop('cacert')

        if config.get('key') and config.get('cert'):
            config['cert'] = (config['cert'], config['key'])

        config_keys = ('host', 'port', 'token', 'scheme', 'cert', 'ca_cert', 'dc')
        kwargs = {p: config.get(p) for p in config_keys if config.get(p)}

        verify = config.get('verify')
        if not isinstance(verify, bool):
            verify = parse_bool(verify)
        if isinstance(verify, bool):
            kwargs['verify'] = verify

        self._client = ConsulClient(**kwargs)
        self.set_retry_timeout(config['retry_timeout'])
        self.set_ttl(config.get('ttl') or 30)
        self._last_session_refresh = 0
        self.__session_checks = config.get('checks')
        if not self._ctl:
            self.create_session()
Example #21
0
    def __init__(self, cluster_name):
        self.available = False
        self.cluster_name = cluster_name if cluster_name is not None else 'unknown'
        self._retry = Retry(deadline=300, max_delay=30, max_tries=-1, retry_exceptions=(boto.exception.StandardError,))
        try:
            # get the instance id
            r = requests.get('http://169.254.169.254/latest/dynamic/instance-identity/document', timeout=2.1)
        except RequestException:
            logger.error('cannot query AWS meta-data')
            return

        if r.ok:
            try:
                content = r.json()
                self.instance_id = content['instanceId']
                self.region = content['region']
            except Exception:
                logger.exception('unable to fetch instance id and region from AWS meta-data')
                return
            self.available = True
Example #22
0
 def test_deadline(self):
     retry = Retry(deadline=0.0001)
     self.assertRaises(RetryFailedError, retry, self._fail(times=100))
Example #23
0
 def test_too_many_tries(self):
     retry = Retry(delay=0)
     self.assertRaises(RetryFailedError, retry, self._fail(times=999))
     self.assertEquals(retry._attempts, 1)
Example #24
0
 def test_maximum_delay(self):
     retry = Retry(delay=10, max_tries=100)
     retry(self._fail(times=10))
     self.assertTrue(retry._cur_delay < 4000, retry._cur_delay)
     # gevent's sleep function is picky about the type
     self.assertEquals(type(retry._cur_delay), float)
Example #25
0
class Kubernetes(AbstractDCS):
    def __init__(self, config):
        self._labels = config['labels']
        self._labels[config.get('scope_label',
                                'cluster-name')] = config['scope']
        self._label_selector = ','.join('{0}={1}'.format(k, v)
                                        for k, v in self._labels.items())
        self._namespace = config.get('namespace') or 'default'
        self._role_label = config.get('role_label', 'role')
        config['namespace'] = ''
        super(Kubernetes, self).__init__(config)
        self._retry = Retry(deadline=config['retry_timeout'],
                            max_delay=1,
                            max_tries=-1,
                            retry_exceptions=(KubernetesRetriableException,
                                              HTTPException, HTTPError,
                                              socket.error, socket.timeout))
        self._ttl = None
        try:
            k8s_config.load_incluster_config()
        except k8s_config.ConfigException:
            k8s_config.load_kube_config(context=config.get('context', 'local'))

        self.__subsets = None
        use_endpoints = config.get('use_endpoints') and (
            config.get('patronictl') or 'pod_ip' in config)
        if use_endpoints:
            addresses = [k8s_client.V1EndpointAddress(ip=config['pod_ip'])]
            ports = []
            for p in config.get('ports', [{}]):
                port = {'port': int(p.get('port', '5432'))}
                port.update(
                    {n: p[n]
                     for n in ('name', 'protocol') if p.get(n)})
                ports.append(k8s_client.V1EndpointPort(**port))
            self.__subsets = [
                k8s_client.V1EndpointSubset(addresses=addresses, ports=ports)
            ]
        self._api = CoreV1ApiProxy(use_endpoints)
        self.set_retry_timeout(config['retry_timeout'])
        self.set_ttl(config.get('ttl') or 30)
        self._leader_observed_record = {}
        self._leader_observed_time = None
        self._leader_resource_version = None
        self._leader_observed_subsets = []
        self.__do_not_watch = False

    def retry(self, *args, **kwargs):
        return self._retry.copy()(*args, **kwargs)

    def client_path(self, path):
        return super(Kubernetes, self).client_path(path)[1:].replace('/', '-')

    @property
    def leader_path(self):
        return self._base_path[1:] if self.__subsets else super(
            Kubernetes, self).leader_path

    def set_ttl(self, ttl):
        ttl = int(ttl)
        self.__do_not_watch = self._ttl != ttl
        self._ttl = ttl

    def set_retry_timeout(self, retry_timeout):
        self._retry.deadline = retry_timeout
        self._api.set_timeout(retry_timeout)

    @staticmethod
    def member(pod):
        annotations = pod.metadata.annotations or {}
        member = Member.from_node(pod.metadata.resource_version,
                                  pod.metadata.name, None,
                                  annotations.get('status', ''))
        member.data['pod_labels'] = pod.metadata.labels
        return member

    def _load_cluster(self):
        try:
            # get list of members
            response = self.retry(self._api.list_namespaced_pod,
                                  self._namespace,
                                  label_selector=self._label_selector)
            members = [self.member(pod) for pod in response.items]

            response = self.retry(self._api.list_namespaced_kind,
                                  self._namespace,
                                  label_selector=self._label_selector)
            nodes = {item.metadata.name: item for item in response.items}

            config = nodes.get(self.config_path)
            metadata = config and config.metadata
            annotations = metadata and metadata.annotations or {}

            # get initialize flag
            initialize = annotations.get(self._INITIALIZE)

            # get global dynamic configuration
            config = ClusterConfig.from_node(
                metadata and metadata.resource_version,
                annotations.get(self._CONFIG) or '{}')

            # get timeline history
            history = TimelineHistory.from_node(
                metadata and metadata.resource_version,
                annotations.get(self._HISTORY) or '[]')

            leader = nodes.get(self.leader_path)
            metadata = leader and leader.metadata
            self._leader_resource_version = metadata.resource_version if metadata else None
            self._leader_observed_subsets = leader.subsets if self.__subsets and leader else []
            annotations = metadata and metadata.annotations or {}

            # get last leader operation
            last_leader_operation = annotations.get(self._OPTIME)
            last_leader_operation = 0 if last_leader_operation is None else int(
                last_leader_operation)

            # get leader
            leader_record = {
                n: annotations.get(n)
                for n in (self._LEADER, 'acquireTime', 'ttl', 'renewTime',
                          'transitions') if n in annotations
            }
            if (leader_record or self._leader_observed_record
                ) and leader_record != self._leader_observed_record:
                self._leader_observed_record = leader_record
                self._leader_observed_time = time.time()

            leader = leader_record.get(self._LEADER)
            try:
                ttl = int(leader_record.get('ttl')) or self._ttl
            except (TypeError, ValueError):
                ttl = self._ttl

            if not metadata or not self._leader_observed_time or self._leader_observed_time + ttl < time.time(
            ):
                leader = None

            if metadata:
                member = Member(-1, leader, None, {})
                member = ([m for m in members if m.name == leader]
                          or [member])[0]
                leader = Leader(response.metadata.resource_version, None,
                                member)

            # failover key
            failover = nodes.get(self.failover_path)
            metadata = failover and failover.metadata
            failover = Failover.from_node(
                metadata and metadata.resource_version, metadata
                and metadata.annotations)

            # get synchronization state
            sync = nodes.get(self.sync_path)
            metadata = sync and sync.metadata
            sync = SyncState.from_node(metadata and metadata.resource_version,
                                       metadata and metadata.annotations)

            self._cluster = Cluster(initialize, config, leader,
                                    last_leader_operation, members, failover,
                                    sync, history)
        except Exception:
            logger.exception('get_cluster')
            raise KubernetesError('Kubernetes API is not responding properly')

    @staticmethod
    def compare_ports(p1, p2):
        return p1.name == p2.name and p1.port == p2.port and (
            p1.protocol or 'TCP') == (p2.protocol or 'TCP')

    @staticmethod
    def subsets_changed(last_observed_subsets, subsets):
        """
        >>> Kubernetes.subsets_changed([], [])
        False
        >>> Kubernetes.subsets_changed([], [k8s_client.V1EndpointSubset()])
        True
        >>> s1 = [k8s_client.V1EndpointSubset(addresses=[k8s_client.V1EndpointAddress(ip='1.2.3.4')])]
        >>> s2 = [k8s_client.V1EndpointSubset(addresses=[k8s_client.V1EndpointAddress(ip='1.2.3.5')])]
        >>> Kubernetes.subsets_changed(s1, s2)
        True
        >>> a = [k8s_client.V1EndpointAddress(ip='1.2.3.4')]
        >>> s1 = [k8s_client.V1EndpointSubset(addresses=a, ports=[k8s_client.V1EndpointPort(protocol='TCP', port=1)])]
        >>> s2 = [k8s_client.V1EndpointSubset(addresses=a, ports=[k8s_client.V1EndpointPort(port=5432)])]
        >>> Kubernetes.subsets_changed(s1, s2)
        True
        >>> p1 = k8s_client.V1EndpointPort(name='port1', port=1)
        >>> p2 = k8s_client.V1EndpointPort(name='port2', port=2)
        >>> p3 = k8s_client.V1EndpointPort(name='port3', port=3)
        >>> s1 = [k8s_client.V1EndpointSubset(addresses=a, ports=[p1, p2])]
        >>> s2 = [k8s_client.V1EndpointSubset(addresses=a, ports=[p2, p3])]
        >>> Kubernetes.subsets_changed(s1, s2)
        True
        >>> s2 = [k8s_client.V1EndpointSubset(addresses=a, ports=[p2, p1])]
        >>> Kubernetes.subsets_changed(s1, s2)
        False
        """
        if len(last_observed_subsets) != len(subsets):
            return True
        if subsets == []:
            return False
        if len(last_observed_subsets[0].addresses or []) != 1 or \
                last_observed_subsets[0].addresses[0].ip != subsets[0].addresses[0].ip or \
                len(last_observed_subsets[0].ports) != len(subsets[0].ports):
            return True
        if len(subsets[0].ports) == 1:
            return not Kubernetes.compare_ports(
                last_observed_subsets[0].ports[0], subsets[0].ports[0])
        observed_ports = {p.name: p for p in last_observed_subsets[0].ports}
        for p in subsets[0].ports:
            if p.name not in observed_ports or not Kubernetes.compare_ports(
                    p, observed_ports.pop(p.name)):
                return True
        return False

    @catch_kubernetes_errors
    def patch_or_create(self,
                        name,
                        annotations,
                        resource_version=None,
                        patch=False,
                        retry=True,
                        subsets=None):
        metadata = {
            'namespace': self._namespace,
            'name': name,
            'labels': self._labels,
            'annotations': annotations
        }
        if patch or resource_version:
            if resource_version is not None:
                metadata['resource_version'] = resource_version
            func = functools.partial(self._api.patch_namespaced_kind, name)
        else:
            func = functools.partial(self._api.create_namespaced_kind)
            # skip annotations with null values
            metadata['annotations'] = {
                k: v
                for k, v in metadata['annotations'].items() if v is not None
            }

        metadata = k8s_client.V1ObjectMeta(**metadata)
        if subsets is not None and self.__subsets:
            endpoints = {'metadata': metadata}
            if self.subsets_changed(self._leader_observed_subsets, subsets):
                endpoints['subsets'] = subsets
            body = k8s_client.V1Endpoints(**endpoints)
        else:
            body = k8s_client.V1ConfigMap(metadata=metadata)
        return self.retry(func, self._namespace, body) if retry else func(
            self._namespace, body)

    def _write_leader_optime(self, last_operation):
        """Unused"""

    def _update_leader(self):
        """Unused"""

    def update_leader(self, last_operation):
        now = datetime.datetime.now(tzutc).isoformat()
        annotations = {
            self._LEADER: self._name,
            'ttl': str(self._ttl),
            'renewTime': now,
            'acquireTime': self._leader_observed_record.get('acquireTime')
            or now,
            'transitions': self._leader_observed_record.get('transitions')
            or '0'
        }
        if last_operation:
            annotations[self._OPTIME] = last_operation

        ret = self.patch_or_create(self.leader_path,
                                   annotations,
                                   self._leader_resource_version,
                                   subsets=self.__subsets)
        if ret:
            self._leader_resource_version = ret.metadata.resource_version
        return ret

    def attempt_to_acquire_leader(self, permanent=False):
        now = datetime.datetime.now(tzutc).isoformat()
        annotations = {
            self._LEADER: self._name,
            'ttl': str(sys.maxsize if permanent else self._ttl),
            'renewTime': now,
            'acquireTime': now,
            'transitions': '0'
        }
        if self._leader_observed_record:
            try:
                transitions = int(
                    self._leader_observed_record.get('transitions'))
            except (TypeError, ValueError):
                transitions = 0

            if self._leader_observed_record.get(self._LEADER) != self._name:
                transitions += 1
            else:
                annotations['acquireTime'] = self._leader_observed_record.get(
                    'acquireTime') or now
            annotations['transitions'] = str(transitions)
        ret = self.patch_or_create(self.leader_path,
                                   annotations,
                                   self._leader_resource_version,
                                   subsets=self.__subsets)
        if ret:
            self._leader_resource_version = ret.metadata.resource_version
        else:
            logger.info('Could not take out TTL lock')
        return ret

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    def set_failover_value(self, value, index=None):
        """Unused"""

    def manual_failover(self,
                        leader,
                        candidate,
                        scheduled_at=None,
                        index=None):
        annotations = {
            'leader': leader or None,
            'member': candidate or None,
            'scheduled_at': scheduled_at
        }
        patch = bool(self.cluster
                     and isinstance(self.cluster.failover, Failover)
                     and self.cluster.failover.index)
        return self.patch_or_create(self.failover_path, annotations, index,
                                    bool(index or patch), False)

    def set_config_value(self, value, index=None):
        patch = bool(index or self.cluster and self.cluster.config
                     and self.cluster.config.index)
        return self.patch_or_create(self.config_path, {self._CONFIG: value},
                                    index, patch, False)

    @catch_kubernetes_errors
    def touch_member(self, data, ttl=None, permanent=False):
        cluster = self.cluster
        if cluster and cluster.leader and cluster.leader.name == self._name:
            role = 'master'
        elif data['state'] == 'running' and data['role'] != 'master':
            role = data['role']
        else:
            role = None

        member = cluster and cluster.get_member(self._name,
                                                fallback_to_leader=False)
        pod_labels = member and member.data.pop('pod_labels', None)
        ret = pod_labels is not None and pod_labels.get(
            self._role_label) == role and deep_compare(data, member.data)

        if not ret:
            metadata = {
                'namespace': self._namespace,
                'name': self._name,
                'labels': {
                    self._role_label: role
                },
                'annotations': {
                    'status': json.dumps(data, separators=(',', ':'))
                }
            }
            body = k8s_client.V1Pod(metadata=k8s_client.V1ObjectMeta(
                **metadata))
            ret = self._api.patch_namespaced_pod(self._name, self._namespace,
                                                 body)
        return ret

    def initialize(self, create_new=True, sysid=""):
        cluster = self.cluster
        resource_version = cluster.config.index if cluster and cluster.config and cluster.config.index else None
        return self.patch_or_create(self.config_path,
                                    {self._INITIALIZE: sysid},
                                    resource_version)

    def delete_leader(self):
        if self.cluster and isinstance(
                self.cluster.leader,
                Leader) and self.cluster.leader.name == self._name:
            self.patch_or_create(self.leader_path, {self._LEADER: None},
                                 self._leader_resource_version, True, False,
                                 [])
            self.reset_cluster()

    def cancel_initialization(self):
        self.patch_or_create(self.config_path, {self._INITIALIZE: None},
                             self.cluster.config.index, True)

    @catch_kubernetes_errors
    def delete_cluster(self):
        self.retry(self._api.delete_collection_namespaced_kind,
                   self._namespace,
                   label_selector=self._label_selector)

    def set_history_value(self, value):
        patch = bool(self.cluster and self.cluster.config
                     and self.cluster.config.index)
        return self.patch_or_create(self.config_path, {self._HISTORY: value},
                                    None, patch, False)

    def set_sync_state_value(self, value, index=None):
        """Unused"""

    def write_sync_state(self, leader, sync_standby, index=None):
        return self.patch_or_create(self.sync_path,
                                    self.sync_state(leader, sync_standby),
                                    index, False)

    def delete_sync_state(self, index=None):
        return self.write_sync_state(None, None, index)

    def watch(self, leader_index, timeout):
        if self.__do_not_watch:
            self.__do_not_watch = False
            return True

        if leader_index:
            end_time = time.time() + timeout
            w = k8s_watch.Watch()
            while timeout >= 1:
                try:
                    for event in w.stream(self._api.list_namespaced_kind,
                                          self._namespace,
                                          resource_version=leader_index,
                                          timeout_seconds=int(timeout + 0.5),
                                          field_selector='metadata.name=' +
                                          self.leader_path,
                                          _request_timeout=(1, timeout + 1)):
                        return event['raw_object'].get(
                            'metadata',
                            {}).get('resourceVersion') != leader_index
                    return False
                except KeyboardInterrupt:
                    raise
                except Exception:
                    logging.exception('watch')

                timeout = end_time - time.time()

        try:
            return super(Kubernetes, self).watch(None, timeout)
        finally:
            self.event.clear()
Example #26
0
 def test_reset(self):
     retry = Retry(delay=0, max_tries=2)
     retry(self._fail())
     self.assertEquals(retry._attempts, 1)
     retry.reset()
     self.assertEquals(retry._attempts, 0)
Example #27
0
class Kubernetes(AbstractDCS):

    def __init__(self, config):
        self._labels = config['labels']
        self._labels[config.get('scope_label', 'cluster-name')] = config['scope']
        self._label_selector = ','.join('{0}={1}'.format(k, v) for k, v in self._labels.items())
        self._namespace = config.get('namespace') or 'default'
        self._role_label = config.get('role_label', 'role')
        config['namespace'] = ''
        super(Kubernetes, self).__init__(config)
        self._retry = Retry(deadline=config['retry_timeout'], max_delay=1, max_tries=-1,
                            retry_exceptions=(KubernetesRetriableException, HTTPException,
                                              HTTPError, socket.error, socket.timeout))
        self._ttl = None
        try:
            k8s_config.load_incluster_config()
        except k8s_config.ConfigException:
            k8s_config.load_kube_config(context=config.get('context', 'local'))

        self.__subsets = None
        use_endpoints = config.get('use_endpoints') and (config.get('patronictl') or 'pod_ip' in config)
        if use_endpoints:
            addresses = [k8s_client.V1EndpointAddress(ip=config['pod_ip'])]
            ports = []
            for p in config.get('ports', [{}]):
                port = {'port': int(p.get('port', '5432'))}
                port.update({n: p[n] for n in ('name', 'protocol') if p.get(n)})
                ports.append(k8s_client.V1EndpointPort(**port))
            self.__subsets = [k8s_client.V1EndpointSubset(addresses=addresses, ports=ports)]
        self._api = CoreV1ApiProxy(use_endpoints)
        self.set_retry_timeout(config['retry_timeout'])
        self.set_ttl(config.get('ttl') or 30)
        self._leader_observed_record = {}
        self._leader_observed_time = None
        self._leader_resource_version = None
        self._leader_observed_subsets = []
        self.__do_not_watch = False

    def retry(self, *args, **kwargs):
        return self._retry.copy()(*args, **kwargs)

    def client_path(self, path):
        return super(Kubernetes, self).client_path(path)[1:].replace('/', '-')

    @property
    def leader_path(self):
        return self._base_path[1:] if self.__subsets else super(Kubernetes, self).leader_path

    def set_ttl(self, ttl):
        ttl = int(ttl)
        self.__do_not_watch = self._ttl != ttl
        self._ttl = ttl

    def set_retry_timeout(self, retry_timeout):
        self._retry.deadline = retry_timeout
        self._api.set_timeout(retry_timeout)

    @staticmethod
    def member(pod):
        annotations = pod.metadata.annotations or {}
        member = Member.from_node(pod.metadata.resource_version, pod.metadata.name, None, annotations.get('status', ''))
        member.data['pod_labels'] = pod.metadata.labels
        return member

    def _load_cluster(self):
        try:
            # get list of members
            response = self.retry(self._api.list_namespaced_pod, self._namespace, label_selector=self._label_selector)
            members = [self.member(pod) for pod in response.items]

            response = self.retry(self._api.list_namespaced_kind, self._namespace, label_selector=self._label_selector)
            nodes = {item.metadata.name: item for item in response.items}

            config = nodes.get(self.config_path)
            metadata = config and config.metadata
            annotations = metadata and metadata.annotations or {}

            # get initialize flag
            initialize = annotations.get(self._INITIALIZE)

            # get global dynamic configuration
            config = ClusterConfig.from_node(metadata and metadata.resource_version,
                                             annotations.get(self._CONFIG) or '{}')

            # get timeline history
            history = TimelineHistory.from_node(metadata and metadata.resource_version,
                                                annotations.get(self._HISTORY) or '[]')

            leader = nodes.get(self.leader_path)
            metadata = leader and leader.metadata
            self._leader_resource_version = metadata.resource_version if metadata else None
            self._leader_observed_subsets = leader.subsets if self.__subsets and leader else []
            annotations = metadata and metadata.annotations or {}

            # get last leader operation
            last_leader_operation = annotations.get(self._OPTIME)
            last_leader_operation = 0 if last_leader_operation is None else int(last_leader_operation)

            # get leader
            leader_record = {n: annotations.get(n) for n in (self._LEADER, 'acquireTime',
                             'ttl', 'renewTime', 'transitions') if n in annotations}
            if (leader_record or self._leader_observed_record) and leader_record != self._leader_observed_record:
                self._leader_observed_record = leader_record
                self._leader_observed_time = time.time()

            leader = leader_record.get(self._LEADER)
            try:
                ttl = int(leader_record.get('ttl')) or self._ttl
            except (TypeError, ValueError):
                ttl = self._ttl

            if not metadata or not self._leader_observed_time or self._leader_observed_time + ttl < time.time():
                leader = None

            if metadata:
                member = Member(-1, leader, None, {})
                member = ([m for m in members if m.name == leader] or [member])[0]
                leader = Leader(response.metadata.resource_version, None, member)

            # failover key
            failover = nodes.get(self.failover_path)
            metadata = failover and failover.metadata
            failover = Failover.from_node(metadata and metadata.resource_version, metadata and metadata.annotations)

            # get synchronization state
            sync = nodes.get(self.sync_path)
            metadata = sync and sync.metadata
            sync = SyncState.from_node(metadata and metadata.resource_version,  metadata and metadata.annotations)

            self._cluster = Cluster(initialize, config, leader, last_leader_operation, members, failover, sync, history)
        except Exception:
            logger.exception('get_cluster')
            raise KubernetesError('Kubernetes API is not responding properly')

    @staticmethod
    def compare_ports(p1, p2):
        return p1.name == p2.name and p1.port == p2.port and (p1.protocol or 'TCP') == (p2.protocol or 'TCP')

    @staticmethod
    def subsets_changed(last_observed_subsets, subsets):
        """
        >>> Kubernetes.subsets_changed([], [])
        False
        >>> Kubernetes.subsets_changed([], [k8s_client.V1EndpointSubset()])
        True
        >>> s1 = [k8s_client.V1EndpointSubset(addresses=[k8s_client.V1EndpointAddress(ip='1.2.3.4')])]
        >>> s2 = [k8s_client.V1EndpointSubset(addresses=[k8s_client.V1EndpointAddress(ip='1.2.3.5')])]
        >>> Kubernetes.subsets_changed(s1, s2)
        True
        >>> a = [k8s_client.V1EndpointAddress(ip='1.2.3.4')]
        >>> s1 = [k8s_client.V1EndpointSubset(addresses=a, ports=[k8s_client.V1EndpointPort(protocol='TCP', port=1)])]
        >>> s2 = [k8s_client.V1EndpointSubset(addresses=a, ports=[k8s_client.V1EndpointPort(port=5432)])]
        >>> Kubernetes.subsets_changed(s1, s2)
        True
        >>> p1 = k8s_client.V1EndpointPort(name='port1', port=1)
        >>> p2 = k8s_client.V1EndpointPort(name='port2', port=2)
        >>> p3 = k8s_client.V1EndpointPort(name='port3', port=3)
        >>> s1 = [k8s_client.V1EndpointSubset(addresses=a, ports=[p1, p2])]
        >>> s2 = [k8s_client.V1EndpointSubset(addresses=a, ports=[p2, p3])]
        >>> Kubernetes.subsets_changed(s1, s2)
        True
        >>> s2 = [k8s_client.V1EndpointSubset(addresses=a, ports=[p2, p1])]
        >>> Kubernetes.subsets_changed(s1, s2)
        False
        """
        if len(last_observed_subsets) != len(subsets):
            return True
        if subsets == []:
            return False
        if len(last_observed_subsets[0].addresses or []) != 1 or \
                last_observed_subsets[0].addresses[0].ip != subsets[0].addresses[0].ip or \
                len(last_observed_subsets[0].ports) != len(subsets[0].ports):
            return True
        if len(subsets[0].ports) == 1:
            return not Kubernetes.compare_ports(last_observed_subsets[0].ports[0], subsets[0].ports[0])
        observed_ports = {p.name: p for p in last_observed_subsets[0].ports}
        for p in subsets[0].ports:
            if p.name not in observed_ports or not Kubernetes.compare_ports(p, observed_ports.pop(p.name)):
                return True
        return False

    @catch_kubernetes_errors
    def patch_or_create(self, name, annotations, resource_version=None, patch=False, retry=True, subsets=None):
        metadata = {'namespace': self._namespace, 'name': name, 'labels': self._labels, 'annotations': annotations}
        if patch or resource_version:
            if resource_version is not None:
                metadata['resource_version'] = resource_version
            func = functools.partial(self._api.patch_namespaced_kind, name)
        else:
            func = functools.partial(self._api.create_namespaced_kind)
            # skip annotations with null values
            metadata['annotations'] = {k: v for k, v in metadata['annotations'].items() if v is not None}

        metadata = k8s_client.V1ObjectMeta(**metadata)
        if subsets is not None and self.__subsets:
            endpoints = {'metadata': metadata}
            if self.subsets_changed(self._leader_observed_subsets, subsets):
                endpoints['subsets'] = subsets
            body = k8s_client.V1Endpoints(**endpoints)
        else:
            body = k8s_client.V1ConfigMap(metadata=metadata)
        return self.retry(func, self._namespace, body) if retry else func(self._namespace, body)

    def _write_leader_optime(self, last_operation):
        """Unused"""

    def _update_leader(self):
        """Unused"""

    def update_leader(self, last_operation):
        now = datetime.datetime.now(tzutc).isoformat()
        annotations = {self._LEADER: self._name, 'ttl': str(self._ttl), 'renewTime': now,
                       'acquireTime': self._leader_observed_record.get('acquireTime') or now,
                       'transitions': self._leader_observed_record.get('transitions') or '0'}
        if last_operation:
            annotations[self._OPTIME] = last_operation

        ret = self.patch_or_create(self.leader_path, annotations, self._leader_resource_version, subsets=self.__subsets)
        if ret:
            self._leader_resource_version = ret.metadata.resource_version
        return ret

    def attempt_to_acquire_leader(self, permanent=False):
        now = datetime.datetime.now(tzutc).isoformat()
        annotations = {self._LEADER: self._name, 'ttl': str(sys.maxsize if permanent else self._ttl),
                       'renewTime': now, 'acquireTime': now, 'transitions': '0'}
        if self._leader_observed_record:
            try:
                transitions = int(self._leader_observed_record.get('transitions'))
            except (TypeError, ValueError):
                transitions = 0

            if self._leader_observed_record.get(self._LEADER) != self._name:
                transitions += 1
            else:
                annotations['acquireTime'] = self._leader_observed_record.get('acquireTime') or now
            annotations['transitions'] = str(transitions)
        ret = self.patch_or_create(self.leader_path, annotations, self._leader_resource_version, subsets=self.__subsets)
        if ret:
            self._leader_resource_version = ret.metadata.resource_version
        else:
            logger.info('Could not take out TTL lock')
        return ret

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    def set_failover_value(self, value, index=None):
        """Unused"""

    def manual_failover(self, leader, candidate, scheduled_at=None, index=None):
        annotations = {'leader': leader or None, 'member': candidate or None, 'scheduled_at': scheduled_at}
        patch = bool(self.cluster and isinstance(self.cluster.failover, Failover) and self.cluster.failover.index)
        return self.patch_or_create(self.failover_path, annotations, index, bool(index or patch), False)

    def set_config_value(self, value, index=None):
        patch = bool(index or self.cluster and self.cluster.config and self.cluster.config.index)
        return self.patch_or_create(self.config_path, {self._CONFIG: value}, index, patch, False)

    @catch_kubernetes_errors
    def touch_member(self, data, ttl=None, permanent=False):
        cluster = self.cluster
        if cluster and cluster.leader and cluster.leader.name == self._name:
            role = 'master'
        elif data['state'] == 'running' and data['role'] != 'master':
            role = data['role']
        else:
            role = None

        member = cluster and cluster.get_member(self._name, fallback_to_leader=False)
        pod_labels = member and member.data.pop('pod_labels', None)
        ret = pod_labels is not None and pod_labels.get(self._role_label) == role and deep_compare(data, member.data)

        if not ret:
            metadata = {'namespace': self._namespace, 'name': self._name, 'labels': {self._role_label: role},
                        'annotations': {'status': json.dumps(data, separators=(',', ':'))}}
            body = k8s_client.V1Pod(metadata=k8s_client.V1ObjectMeta(**metadata))
            ret = self._api.patch_namespaced_pod(self._name, self._namespace, body)
        return ret

    def initialize(self, create_new=True, sysid=""):
        cluster = self.cluster
        resource_version = cluster.config.index if cluster and cluster.config and cluster.config.index else None
        return self.patch_or_create(self.config_path, {self._INITIALIZE: sysid}, resource_version)

    def delete_leader(self):
        if self.cluster and isinstance(self.cluster.leader, Leader) and self.cluster.leader.name == self._name:
            self.patch_or_create(self.leader_path, {self._LEADER: None}, self._leader_resource_version, True, False, [])
            self.reset_cluster()

    def cancel_initialization(self):
        self.patch_or_create(self.config_path, {self._INITIALIZE: None}, self.cluster.config.index, True)

    @catch_kubernetes_errors
    def delete_cluster(self):
        self.retry(self._api.delete_collection_namespaced_kind, self._namespace, label_selector=self._label_selector)

    def set_history_value(self, value):
        patch = bool(self.cluster and self.cluster.config and self.cluster.config.index)
        return self.patch_or_create(self.config_path, {self._HISTORY: value}, None, patch, False)

    def set_sync_state_value(self, value, index=None):
        """Unused"""

    def write_sync_state(self, leader, sync_standby, index=None):
        return self.patch_or_create(self.sync_path, self.sync_state(leader, sync_standby), index, False)

    def delete_sync_state(self, index=None):
        return self.write_sync_state(None, None, index)

    def watch(self, leader_index, timeout):
        if self.__do_not_watch:
            self.__do_not_watch = False
            return True

        if leader_index:
            end_time = time.time() + timeout
            w = k8s_watch.Watch()
            while timeout >= 1:
                try:
                    for event in w.stream(self._api.list_namespaced_kind, self._namespace,
                                          resource_version=leader_index, timeout_seconds=int(timeout + 0.5),
                                          field_selector='metadata.name=' + self.leader_path,
                                          _request_timeout=(1, timeout + 1)):
                        return event['raw_object'].get('metadata', {}).get('resourceVersion') != leader_index
                    return False
                except KeyboardInterrupt:
                    raise
                except Exception:
                    logging.exception('watch')

                timeout = end_time - time.time()

        try:
            return super(Kubernetes, self).watch(None, timeout)
        finally:
            self.event.clear()
Example #28
0
class Consul(AbstractDCS):

    def __init__(self, config):
        super(Consul, self).__init__(config)
        self._scope = config['scope']
        self._session = None
        self._ttl = None
        self.__do_not_watch = False
        self._retry = Retry(deadline=config['retry_timeout'], max_delay=1, max_tries=-1,
                            retry_exceptions=(ConsulInternalError, HTTPException,
                                              HTTPError, socket.error, socket.timeout))

        self._my_member_data = None
        self.set_ttl(config.get('ttl') or 30)
        host, port = config.get('host', '127.0.0.1:8500').split(':')
        self._client = ConsulClient(host=host, port=port)
        self.set_retry_timeout(config['retry_timeout'])
        if not self._ctl:
            self.create_session()

    def retry(self, *args, **kwargs):
        return self._retry.copy()(*args, **kwargs)

    def create_session(self):
        while not self._session:
            try:
                self.refresh_session()
            except ConsulError:
                logger.info('waiting on consul')
                sleep(5)

    def set_ttl(self, ttl):
        ttl = ttl/2.0  # My experiments have shown that session expires after 2*ttl time
        if self._ttl != ttl:
            self._session = None
            self.__do_not_watch = True
        self._ttl = ttl

    def set_retry_timeout(self, retry_timeout):
        self._retry.deadline = retry_timeout
        self._client.http.set_read_timeout(retry_timeout)

    def _do_refresh_session(self):
        """:returns: `!True` if it had to create new session"""
        if self._session:
            try:
                return self._client.session.renew(self._session) is None
            except NotFound:
                self._session = None
        if not self._session:
            name = self._scope + '-' + self._name
            self._session = self._client.session.create(name=name, lock_delay=0, behavior='delete', ttl=self._ttl)
        return True

    def refresh_session(self):
        try:
            return self.retry(self._do_refresh_session)
        except (ConsulException, RetryFailedError):
            logger.exception('refresh_session')
        raise ConsulError('Failed to renew/create session')

    def client_path(self, path):
        return super(Consul, self).client_path(path)[1:]

    @staticmethod
    def member(node):
        return Member.from_node(node['ModifyIndex'], os.path.basename(node['Key']), node.get('Session'), node['Value'])

    def _load_cluster(self):
        try:
            path = self.client_path('/')
            _, results = self.retry(self._client.kv.get, path, recurse=True)

            if results is None:
                raise NotFound

            nodes = {}
            for node in results:
                node['Value'] = (node['Value'] or b'').decode('utf-8')
                nodes[os.path.relpath(node['Key'], path)] = node

            # get initialize flag
            initialize = nodes.get(self._INITIALIZE)
            initialize = initialize and initialize['Value']

            # get global dynamic configuration
            config = nodes.get(self._CONFIG)
            config = config and ClusterConfig.from_node(config['ModifyIndex'], config['Value'])

            # get last leader operation
            last_leader_operation = nodes.get(self._LEADER_OPTIME)
            last_leader_operation = 0 if last_leader_operation is None else int(last_leader_operation['Value'])

            # get list of members
            members = [self.member(n) for k, n in nodes.items() if k.startswith(self._MEMBERS) and k.count('/') == 1]

            # get leader
            leader = nodes.get(self._LEADER)
            if not self._ctl and leader and leader['Value'] == self._name \
                    and self._session != leader.get('Session', 'x'):
                logger.info('I am leader but not owner of the session. Removing leader node')
                self._client.kv.delete(self.leader_path, cas=leader['ModifyIndex'])
                leader = None

            if leader:
                member = Member(-1, leader['Value'], None, {})
                member = ([m for m in members if m.name == leader['Value']] or [member])[0]
                leader = Leader(leader['ModifyIndex'], leader.get('Session'), member)

            # failover key
            failover = nodes.get(self._FAILOVER)
            if failover:
                failover = Failover.from_node(failover['ModifyIndex'], failover['Value'])

            self._cluster = Cluster(initialize, config, leader, last_leader_operation, members, failover)
        except NotFound:
            self._cluster = Cluster(None, None, None, None, [], None)
        except:
            logger.exception('get_cluster')
            raise ConsulError('Consul is not responding properly')

    def touch_member(self, data, **kwargs):
        cluster = self.cluster
        member = cluster and cluster.get_member(self._name, fallback_to_leader=False)
        create_member = self.refresh_session()
        if member and (create_member or member.session != self._session):
            try:
                self._client.kv.delete(self.member_path)
                create_member = True
            except Exception:
                return False

        if not create_member and member and data == self._my_member_data:
            return True

        try:
            args = {} if kwargs.get('permanent', False) else {'acquire': self._session}
            self._client.kv.put(self.member_path, data, **args)
            self._my_member_data = data
            return True
        except Exception:
            logger.exception('touch_member')
        return False

    @catch_consul_errors
    def attempt_to_acquire_leader(self, permanent=False):
        if not self._session and not permanent:
            self.refresh_session()

        args = {} if permanent else {'acquire': self._session}
        ret = self.retry(self._client.kv.put, self.leader_path, self._name, **args)
        if not ret:
            logger.info('Could not take out TTL lock')
        return ret

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    @catch_consul_errors
    def set_failover_value(self, value, index=None):
        return self._client.kv.put(self.failover_path, value, cas=index)

    @catch_consul_errors
    def set_config_value(self, value, index=None):
        return self._client.kv.put(self.config_path, value, cas=index)

    @catch_consul_errors
    def _write_leader_optime(self, last_operation):
        return self._client.kv.put(self.leader_optime_path, last_operation)

    @staticmethod
    def update_leader():
        return True

    @catch_consul_errors
    def initialize(self, create_new=True, sysid=''):
        kwargs = {'cas': 0} if create_new else {}
        return self.retry(self._client.kv.put, self.initialize_path, sysid, **kwargs)

    @catch_consul_errors
    def cancel_initialization(self):
        return self.retry(self._client.kv.delete, self.initialize_path)

    @catch_consul_errors
    def delete_cluster(self):
        return self.retry(self._client.kv.delete, self.client_path(''), recurse=True)

    @catch_consul_errors
    def delete_leader(self):
        cluster = self.cluster
        if cluster and isinstance(cluster.leader, Leader) and cluster.leader.name == self._name:
            return self._client.kv.delete(self.leader_path, cas=cluster.leader.index)

    def watch(self, timeout):
        if self.__do_not_watch:
            self.__do_not_watch = False
            return True

        cluster = self.cluster
        if cluster and cluster.leader and cluster.leader.name != self._name and cluster.leader.index:
            end_time = time.time() + timeout
            while timeout >= 1:
                try:
                    idx, _ = self._client.kv.get(self.leader_path, index=cluster.leader.index, wait=str(timeout) + 's')
                    return str(idx) != str(cluster.leader.index)
                except (ConsulException, HTTPException, HTTPError, socket.error, socket.timeout):
                    logging.exception('watch')

                timeout = end_time - time.time()

        try:
            return super(Consul, self).watch(timeout)
        finally:
            self.event.clear()
Example #29
0
class Etcd(AbstractDCS):
    def __init__(self, config):
        super(Etcd, self).__init__(config)
        self._ttl = int(config.get('ttl') or 30)
        self._retry = Retry(
            deadline=config['retry_timeout'],
            max_delay=1,
            max_tries=-1,
            retry_exceptions=(etcd.EtcdLeaderElectionInProgress,
                              etcd.EtcdWatcherCleared,
                              etcd.EtcdEventIndexCleared))
        self._client = self.get_etcd_client(config)
        self.__do_not_watch = False
        self._has_failed = False

    def retry(self, *args, **kwargs):
        return self._retry.copy()(*args, **kwargs)

    def _handle_exception(self, e, name='', do_sleep=False, raise_ex=None):
        if not self._has_failed:
            # Handling Attribute Error Avoiding System Back Trace When etcd goes away during normal operation
            print(
                "####  Distributed Consensu (DCS) cannot be reached anymore. ###"
            )
            #logger.exception(name)
        else:
            logger.error(e)
            if do_sleep:
                time.sleep(1)
        self._has_failed = True
        if isinstance(raise_ex, Exception):
            raise raise_ex

    def catch_etcd_errors(func):
        def wrapper(self, *args, **kwargs):
            try:
                retval = func(self, *args, **kwargs) is not None
                self._has_failed = False
                return retval
            except (RetryFailedError, etcd.EtcdException) as e:
                self._handle_exception(e)
                return False
            except Exception as e:
                self._handle_exception(e,
                                       raise_ex=EtcdError('unexpected error'))

        return wrapper

    @staticmethod
    def get_etcd_client(config):
        if 'proxy' in config:
            config['use_proxies'] = True
            config['url'] = config['proxy']

        if 'url' in config:
            r = urlparse(config['url'])
            config.update({
                'protocol': r.scheme,
                'host': r.hostname,
                'port': r.port or 2379,
                'username': r.username,
                'password': r.password
            })
        elif 'hosts' in config:
            hosts = config.pop('hosts')
            default_port = config.pop('port', 2379)
            protocol = config.get('protocol', 'http')

            if isinstance(hosts, six.string_types):
                hosts = hosts.split(',')

            config['hosts'] = []
            for value in hosts:
                if isinstance(value, six.string_types):
                    config['hosts'].append(
                        uri(protocol, *split_host_port(value, default_port)))
        elif 'host' in config:
            host, port = split_host_port(config['host'], 2379)
            config['host'] = host
            if 'port' not in config:
                config['port'] = int(port)

        if config.get('cacert'):
            config['ca_cert'] = config.pop('cacert')

        if config.get('key') and config.get('cert'):
            config['cert'] = (config['cert'], config['key'])

        for p in ('discovery_srv', 'srv_domain'):
            if p in config:
                config['srv'] = config.pop(p)

        dns_resolver = DnsCachingResolver()

        def create_connection_patched(address,
                                      timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
                                      source_address=None,
                                      socket_options=None):
            host, port = address
            if host.startswith('['):
                host = host.strip('[]')
            err = None
            for af, socktype, proto, _, sa in dns_resolver.resolve(host, port):
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if socket_options:
                        for opt in socket_options:
                            sock.setsockopt(*opt)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    if source_address:
                        sock.bind(source_address)
                    sock.connect(sa)
                    return sock

                except socket.error as e:
                    err = e
                    if sock is not None:
                        sock.close()
                        sock = None

            if err is not None:
                raise err

            raise socket.error("getaddrinfo returns an empty list")

        try:
            urllib3.util.connection.create_connection = create_connection_patched
        except urllib3.exceptions.MaxRetryError:
            print("asdASDSAD")
        client = None
        while not client:
            try:
                client = Client(config, dns_resolver)
                if 'use_proxies' in config and not client.machines:
                    raise etcd.EtcdException
            except etcd.EtcdException:
                logger.info('waiting on etcd')
                time.sleep(5)
        return client

    def set_ttl(self, ttl):
        ttl = int(ttl)
        self.__do_not_watch = self._ttl != ttl
        self._ttl = ttl
        self._client.set_machines_cache_ttl(ttl * 10)

    @property
    def ttl(self):
        return self._ttl

    def set_retry_timeout(self, retry_timeout):
        self._retry.deadline = retry_timeout
        self._client.set_read_timeout(retry_timeout)

    @staticmethod
    def member(node):
        return Member.from_node(node.modifiedIndex, os.path.basename(node.key),
                                node.ttl, node.value)

    def _load_cluster(self):
        cluster = None
        try:
            result = self.retry(self._client.read,
                                self.client_path(''),
                                recursive=True)
            nodes = {
                node.key[len(result.key):].lstrip('/'): node
                for node in result.leaves
            }

            # get initialize flag
            initialize = nodes.get(self._INITIALIZE)
            initialize = initialize and initialize.value

            # get global dynamic configuration
            config = nodes.get(self._CONFIG)
            config = config and ClusterConfig.from_node(
                config.modifiedIndex, config.value)

            # get timeline history
            history = nodes.get(self._HISTORY)
            history = history and TimelineHistory.from_node(
                history.modifiedIndex, history.value)

            # get last leader operation
            last_leader_operation = nodes.get(self._LEADER_OPTIME)
            last_leader_operation = 0 if last_leader_operation is None else int(
                last_leader_operation.value)

            # get list of members
            members = [
                self.member(n) for k, n in nodes.items()
                if k.startswith(self._MEMBERS) and k.count('/') == 1
            ]

            # get leader
            leader = nodes.get(self._LEADER)
            if leader:
                member = Member(-1, leader.value, None, {})
                member = ([m for m in members if m.name == leader.value]
                          or [member])[0]
                index = result.etcd_index if result.etcd_index > leader.modifiedIndex else leader.modifiedIndex + 1
                leader = Leader(index, leader.ttl, member)

            # failover key
            failover = nodes.get(self._FAILOVER)
            if failover:
                failover = Failover.from_node(failover.modifiedIndex,
                                              failover.value)

            # get synchronization state
            sync = nodes.get(self._SYNC)
            sync = SyncState.from_node(sync and sync.modifiedIndex, sync
                                       and sync.value)

            cluster = Cluster(initialize, config, leader,
                              last_leader_operation, members, failover, sync,
                              history)
        except etcd.EtcdKeyNotFound:
            cluster = Cluster(None, None, None, None, [], None, None, None)
        except Exception as e:
            self._handle_exception(
                e,
                'get_cluster',
                raise_ex=EtcdError('Etcd is not responding properly'))
        self._has_failed = False
        return cluster

    @catch_etcd_errors
    def touch_member(self, data, permanent=False):
        data = json.dumps(data, separators=(',', ':'))
        return self._client.set(self.member_path, data,
                                None if permanent else self._ttl)

    @catch_etcd_errors
    def take_leader(self):
        return self.retry(self._client.set, self.leader_path, self._name,
                          self._ttl)

    def attempt_to_acquire_leader(self, permanent=False):
        try:
            return bool(
                self.retry(self._client.write,
                           self.leader_path,
                           self._name,
                           ttl=None if permanent else self._ttl,
                           prevExist=False))
        except etcd.EtcdAlreadyExist:
            logger.info('Could not take out TTL lock')
        except (RetryFailedError, etcd.EtcdException):
            pass
        return False

    @catch_etcd_errors
    def set_failover_value(self, value, index=None):
        return self._client.write(self.failover_path,
                                  value,
                                  prevIndex=index or 0)

    @catch_etcd_errors
    def set_config_value(self, value, index=None):
        return self._client.write(self.config_path,
                                  value,
                                  prevIndex=index or 0)

    @catch_etcd_errors
    def _write_leader_optime(self, last_operation):
        return self._client.set(self.leader_optime_path, last_operation)

    @catch_etcd_errors
    def _update_leader(self):
        return self.retry(self._client.test_and_set, self.leader_path,
                          self._name, self._name, self._ttl)

    @catch_etcd_errors
    def initialize(self, create_new=True, sysid=""):
        return self.retry(self._client.write,
                          self.initialize_path,
                          sysid,
                          prevExist=(not create_new))

    @catch_etcd_errors
    def delete_leader(self):
        return self._client.delete(self.leader_path, prevValue=self._name)

    @catch_etcd_errors
    def cancel_initialization(self):
        return self.retry(self._client.delete, self.initialize_path)

    @catch_etcd_errors
    def delete_cluster(self):
        return self.retry(self._client.delete,
                          self.client_path(''),
                          recursive=True)

    @catch_etcd_errors
    def set_history_value(self, value):
        return self._client.write(self.history_path, value)

    @catch_etcd_errors
    def set_sync_state_value(self, value, index=None):
        return self.retry(self._client.write,
                          self.sync_path,
                          value,
                          prevIndex=index or 0)

    @catch_etcd_errors
    def delete_sync_state(self, index=None):
        return self.retry(self._client.delete,
                          self.sync_path,
                          prevIndex=index or 0)

    def watch(self, leader_index, timeout):
        if self.__do_not_watch:
            self.__do_not_watch = False
            return True

        if leader_index:
            end_time = time.time() + timeout

            while timeout >= 1:  # when timeout is too small urllib3 doesn't have enough time to connect
                try:
                    self._client.watch(self.leader_path,
                                       index=leader_index,
                                       timeout=timeout + 0.5)
                    self._has_failed = False
                    # Synchronous work of all cluster members with etcd is less expensive
                    # than reestablishing http connection every time from every replica.
                    return True
                except etcd.EtcdWatchTimedOut:
                    self._client.http.clear()
                    self._has_failed = False
                    return False
                except (etcd.EtcdEventIndexCleared,
                        etcd.EtcdWatcherCleared):  # Watch failed
                    self._has_failed = False
                    return True  # leave the loop, because watch with the same parameters will fail anyway
                except etcd.EtcdException as e:
                    self._handle_exception(e, 'watch', True)

                timeout = end_time - time.time()

        try:
            return super(Etcd, self).watch(None, timeout)
        finally:
            self.event.clear()
Example #30
0
class Etcd(AbstractDCS):
    def __init__(self, config):
        super(Etcd, self).__init__(config)
        self._ttl = int(config.get('ttl') or 30)
        self._retry = Retry(
            deadline=config['retry_timeout'],
            max_delay=1,
            max_tries=-1,
            retry_exceptions=(etcd.EtcdLeaderElectionInProgress,
                              etcd.EtcdWatcherCleared,
                              etcd.EtcdEventIndexCleared))
        self._client = self.get_etcd_client(config)
        self.__do_not_watch = False

    def retry(self, *args, **kwargs):
        return self._retry.copy()(*args, **kwargs)

    @staticmethod
    def get_etcd_client(config):
        if 'proxy' in config:
            config['use_proxies'] = True
            config['url'] = config['proxy']

        if 'url' in config:
            r = urlparse(config['url'])
            config.update({
                'protocol': r.scheme,
                'host': r.hostname,
                'port': r.port or 2379,
                'username': r.username,
                'password': r.password
            })
        elif 'host' in config:
            host, port = (config['host'] + ':2379').split(':')[:2]
            config['host'] = host
            if 'port' not in config:
                config['port'] = int(port)

        if config.get('cacert'):
            config['ca_cert'] = config.pop('cacert')

        if config.get('key') and config.get('cert'):
            config['cert'] = (config['cert'], config['key'])

        for p in ('discovery_srv', 'srv_domain'):
            if p in config:
                config['srv'] = config.pop(p)

        dns_resolver = DnsCachingResolver()

        def create_connection_patched(address,
                                      timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
                                      source_address=None,
                                      socket_options=None):
            host, port = address
            if host.startswith('['):
                host = host.strip('[]')
            err = None
            for af, socktype, proto, _, sa in dns_resolver.resolve(host, port):
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if socket_options:
                        for opt in socket_options:
                            sock.setsockopt(*opt)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    if source_address:
                        sock.bind(source_address)
                    sock.connect(sa)
                    return sock

                except socket.error as e:
                    err = e
                    if sock is not None:
                        sock.close()
                        sock = None

            if err is not None:
                raise err

            raise socket.error("getaddrinfo returns an empty list")

        urllib3.util.connection.create_connection = create_connection_patched

        client = None
        while not client:
            try:
                client = Client(config, dns_resolver)
            except etcd.EtcdException:
                logger.info('waiting on etcd')
                time.sleep(5)
        return client

    def set_ttl(self, ttl):
        ttl = int(ttl)
        self.__do_not_watch = self._ttl != ttl
        self._ttl = ttl
        self._client.set_machines_cache_ttl(ttl * 10)

    def set_retry_timeout(self, retry_timeout):
        self._retry.deadline = retry_timeout
        self._client.set_read_timeout(retry_timeout)

    @staticmethod
    def member(node):
        return Member.from_node(node.modifiedIndex, os.path.basename(node.key),
                                node.ttl, node.value)

    def _load_cluster(self):
        try:
            result = self.retry(self._client.read,
                                self.client_path(''),
                                recursive=True)
            nodes = {
                os.path.relpath(node.key, result.key): node
                for node in result.leaves
            }

            # get initialize flag
            initialize = nodes.get(self._INITIALIZE)
            initialize = initialize and initialize.value

            # get global dynamic configuration
            config = nodes.get(self._CONFIG)
            config = config and ClusterConfig.from_node(
                config.modifiedIndex, config.value)

            # get last leader operation
            last_leader_operation = nodes.get(self._LEADER_OPTIME)
            last_leader_operation = 0 if last_leader_operation is None else int(
                last_leader_operation.value)

            # get list of members
            members = [
                self.member(n) for k, n in nodes.items()
                if k.startswith(self._MEMBERS) and k.count('/') == 1
            ]

            # get leader
            leader = nodes.get(self._LEADER)
            if leader:
                member = Member(-1, leader.value, None, {})
                member = ([m for m in members if m.name == leader.value]
                          or [member])[0]
                index = result.etcd_index if result.etcd_index > leader.modifiedIndex else leader.modifiedIndex + 1
                leader = Leader(index, leader.ttl, member)

            # failover key
            failover = nodes.get(self._FAILOVER)
            if failover:
                failover = Failover.from_node(failover.modifiedIndex,
                                              failover.value)

            # get synchronization state
            sync = nodes.get(self._SYNC)
            sync = SyncState.from_node(sync and sync.modifiedIndex, sync
                                       and sync.value)

            self._cluster = Cluster(initialize, config, leader,
                                    last_leader_operation, members, failover,
                                    sync)
        except etcd.EtcdKeyNotFound:
            self._cluster = Cluster(None, None, None, None, [], None, None)
        except:
            logger.exception('get_cluster')
            raise EtcdError('Etcd is not responding properly')

    @catch_etcd_errors
    def touch_member(self, data, ttl=None, permanent=False):
        return self.retry(self._client.set, self.member_path, data,
                          None if permanent else ttl or self._ttl)

    @catch_etcd_errors
    def take_leader(self):
        return self.retry(self._client.set, self.leader_path, self._name,
                          self._ttl)

    def attempt_to_acquire_leader(self, permanent=False):
        try:
            return bool(
                self.retry(self._client.write,
                           self.leader_path,
                           self._name,
                           ttl=None if permanent else self._ttl,
                           prevExist=False))
        except etcd.EtcdAlreadyExist:
            logger.info('Could not take out TTL lock')
        except (RetryFailedError, etcd.EtcdException):
            pass
        return False

    @catch_etcd_errors
    def set_failover_value(self, value, index=None):
        return self._client.write(self.failover_path,
                                  value,
                                  prevIndex=index or 0)

    @catch_etcd_errors
    def set_config_value(self, value, index=None):
        return self._client.write(self.config_path,
                                  value,
                                  prevIndex=index or 0)

    @catch_etcd_errors
    def _write_leader_optime(self, last_operation):
        return self._client.set(self.leader_optime_path, last_operation)

    @catch_etcd_errors
    def update_leader(self):
        return self.retry(self._client.test_and_set, self.leader_path,
                          self._name, self._name, self._ttl)

    @catch_etcd_errors
    def initialize(self, create_new=True, sysid=""):
        return self.retry(self._client.write,
                          self.initialize_path,
                          sysid,
                          prevExist=(not create_new))

    @catch_etcd_errors
    def delete_leader(self):
        return self._client.delete(self.leader_path, prevValue=self._name)

    @catch_etcd_errors
    def cancel_initialization(self):
        return self.retry(self._client.delete, self.initialize_path)

    @catch_etcd_errors
    def delete_cluster(self):
        return self.retry(self._client.delete,
                          self.client_path(''),
                          recursive=True)

    @catch_etcd_errors
    def set_sync_state_value(self, value, index=None):
        return self._client.write(self.sync_path, value, prevIndex=index or 0)

    @catch_etcd_errors
    def delete_sync_state(self, index=None):
        return self.retry(self._client.delete,
                          self.sync_path,
                          prevIndex=index or 0)

    def watch(self, leader_index, timeout):
        if self.__do_not_watch:
            self.__do_not_watch = False
            return True

        if leader_index:
            end_time = time.time() + timeout

            while timeout >= 1:  # when timeout is too small urllib3 doesn't have enough time to connect
                try:
                    self._client.watch(self.leader_path,
                                       index=leader_index,
                                       timeout=timeout + 0.5)
                    # Synchronous work of all cluster members with etcd is less expensive
                    # than reestablishing http connection every time from every replica.
                    return True
                except etcd.EtcdWatchTimedOut:
                    self._client.http.clear()
                    return False
                except etcd.EtcdException:
                    logger.exception('watch')

                timeout = end_time - time.time()

        try:
            return super(Etcd, self).watch(None, timeout)
        finally:
            self.event.clear()
Example #31
0
class Etcd(AbstractDCS):

    def __init__(self, config):
        super(Etcd, self).__init__(config)
        self._ttl = int(config.get('ttl') or 30)
        self._retry = Retry(deadline=config['retry_timeout'], max_delay=1, max_tries=-1,
                            retry_exceptions=(etcd.EtcdConnectionFailed,
                                              etcd.EtcdLeaderElectionInProgress,
                                              etcd.EtcdWatcherCleared,
                                              etcd.EtcdEventIndexCleared))
        self._client = self.get_etcd_client(config)
        self.__do_not_watch = False

    def retry(self, *args, **kwargs):
        return self._retry.copy()(*args, **kwargs)

    @staticmethod
    def get_etcd_client(config):
        client = None
        while not client:
            try:
                client = Client(config)
            except etcd.EtcdException:
                logger.info('waiting on etcd')
                sleep(5)
        return client

    def set_ttl(self, ttl):
        ttl = int(ttl)
        self.__do_not_watch = self._ttl != ttl
        self._ttl = ttl

    def set_retry_timeout(self, retry_timeout):
        self._retry.deadline = retry_timeout

    @staticmethod
    def member(node):
        return Member.from_node(node.modifiedIndex, os.path.basename(node.key), node.ttl, node.value)

    def _load_cluster(self):
        try:
            result = self.retry(self._client.read, self.client_path(''), recursive=True)
            nodes = {os.path.relpath(node.key, result.key): node for node in result.leaves}

            # get initialize flag
            initialize = nodes.get(self._INITIALIZE)
            initialize = initialize and initialize.value

            # get global dynamic configuration
            config = nodes.get(self._CONFIG)
            config = config and ClusterConfig.from_node(config.modifiedIndex, config.value)

            # get last leader operation
            last_leader_operation = nodes.get(self._LEADER_OPTIME)
            last_leader_operation = 0 if last_leader_operation is None else int(last_leader_operation.value)

            # get list of members
            members = [self.member(n) for k, n in nodes.items() if k.startswith(self._MEMBERS) and k.count('/') == 1]

            # get leader
            leader = nodes.get(self._LEADER)
            if leader:
                member = Member(-1, leader.value, None, {})
                member = ([m for m in members if m.name == leader.value] or [member])[0]
                leader = Leader(leader.modifiedIndex, leader.ttl, member)

            # failover key
            failover = nodes.get(self._FAILOVER)
            if failover:
                failover = Failover.from_node(failover.modifiedIndex, failover.value)

            self._cluster = Cluster(initialize, config, leader, last_leader_operation, members, failover)
        except etcd.EtcdKeyNotFound:
            self._cluster = Cluster(False, None, None, None, [], None)
        except:
            logger.exception('get_cluster')
            raise EtcdError('Etcd is not responding properly')

    @catch_etcd_errors
    def touch_member(self, data, ttl=None):
        return self.retry(self._client.set, self.member_path, data, ttl or self._ttl)

    @catch_etcd_errors
    def take_leader(self):
        return self.retry(self._client.set, self.leader_path, self._name, self._ttl)

    def attempt_to_acquire_leader(self):
        try:
            return bool(self.retry(self._client.write, self.leader_path, self._name, ttl=self._ttl, prevExist=False))
        except etcd.EtcdAlreadyExist:
            logger.info('Could not take out TTL lock')
        except (RetryFailedError, etcd.EtcdException):
            pass
        return False

    @catch_etcd_errors
    def set_failover_value(self, value, index=None):
        return self._client.write(self.failover_path, value, prevIndex=index or 0)

    @catch_etcd_errors
    def set_config_value(self, value, index=None):
        return self._client.write(self.config_path, value, prevIndex=index or 0)

    @catch_etcd_errors
    def write_leader_optime(self, last_operation):
        return self._client.set(self.leader_optime_path, last_operation)

    @catch_etcd_errors
    def update_leader(self):
        return self.retry(self._client.test_and_set, self.leader_path, self._name, self._name, self._ttl)

    @catch_etcd_errors
    def initialize(self, create_new=True, sysid=""):
        return self.retry(self._client.write, self.initialize_path, sysid, prevExist=(not create_new))

    @catch_etcd_errors
    def delete_leader(self):
        return self._client.delete(self.leader_path, prevValue=self._name)

    @catch_etcd_errors
    def cancel_initialization(self):
        return self.retry(self._client.delete, self.initialize_path)

    @catch_etcd_errors
    def delete_cluster(self):
        return self.retry(self._client.delete, self.client_path(''), recursive=True)

    def watch(self, timeout):
        if self.__do_not_watch:
            self.__do_not_watch = False
            return True

        cluster = self.cluster
        # watch on leader key changes if it is defined and current node is not lock owner
        if cluster and cluster.leader and cluster.leader.name != self._name and cluster.leader.index:
            end_time = time.time() + timeout

            while timeout >= 1:  # when timeout is too small urllib3 doesn't have enough time to connect
                try:
                    self._client.watch(self.leader_path, index=cluster.leader.index + 1, timeout=timeout + 0.5)
                    # Synchronous work of all cluster members with etcd is less expensive
                    # than reestablishing http connection every time from every replica.
                    return True
                except etcd.EtcdWatchTimedOut:
                    self._client.http.clear()
                    return False
                except etcd.EtcdException:
                    logging.exception('watch')

                timeout = end_time - time.time()

        try:
            return super(Etcd, self).watch(timeout)
        finally:
            self.event.clear()
Example #32
0
class Consul(AbstractDCS):

    def __init__(self, config):
        super(Consul, self).__init__(config)
        self._scope = config['scope']
        self._session = None
        self.__do_not_watch = False
        self._retry = Retry(deadline=config['retry_timeout'], max_delay=1, max_tries=-1,
                            retry_exceptions=(ConsulInternalError, HTTPException,
                                              HTTPError, socket.error, socket.timeout))

        self._my_member_data = None
        host, port = config.get('host', '127.0.0.1:8500').split(':')
        self._client = ConsulClient(host=host, port=port)
        self.set_retry_timeout(config['retry_timeout'])
        self.set_ttl(config.get('ttl') or 30)
        self._last_session_refresh = 0
        if not self._ctl:
            self.create_session()

    def retry(self, *args, **kwargs):
        return self._retry.copy()(*args, **kwargs)

    def create_session(self):
        while not self._session:
            try:
                self.refresh_session()
            except ConsulError:
                logger.info('waiting on consul')
                time.sleep(5)

    def set_ttl(self, ttl):
        if self._client.http.set_ttl(ttl/2.0):  # Consul multiplies the TTL by 2x
            self._session = None
            self.__do_not_watch = True

    def set_retry_timeout(self, retry_timeout):
        self._retry.deadline = retry_timeout
        self._client.http.set_read_timeout(retry_timeout)

    def _do_refresh_session(self):
        """:returns: `!True` if it had to create new session"""
        if self._session and self._last_session_refresh + self._loop_wait > time.time():
            return False

        if self._session:
            try:
                self._client.session.renew(self._session)
            except NotFound:
                self._session = None
        ret = not self._session
        if ret:
            self._session = self._client.session.create(name=self._scope + '-' + self._name,
                                                        lock_delay=0.001, behavior='delete')
        self._last_session_refresh = time.time()
        return ret

    def refresh_session(self):
        try:
            return self.retry(self._do_refresh_session)
        except (ConsulException, RetryFailedError):
            logger.exception('refresh_session')
        raise ConsulError('Failed to renew/create session')

    def client_path(self, path):
        return super(Consul, self).client_path(path)[1:]

    @staticmethod
    def member(node):
        return Member.from_node(node['ModifyIndex'], os.path.basename(node['Key']), node.get('Session'), node['Value'])

    def _load_cluster(self):
        try:
            path = self.client_path('/')
            _, results = self.retry(self._client.kv.get, path, recurse=True)

            if results is None:
                raise NotFound

            nodes = {}
            for node in results:
                node['Value'] = (node['Value'] or b'').decode('utf-8')
                nodes[os.path.relpath(node['Key'], path)] = node

            # get initialize flag
            initialize = nodes.get(self._INITIALIZE)
            initialize = initialize and initialize['Value']

            # get global dynamic configuration
            config = nodes.get(self._CONFIG)
            config = config and ClusterConfig.from_node(config['ModifyIndex'], config['Value'])

            # get last leader operation
            last_leader_operation = nodes.get(self._LEADER_OPTIME)
            last_leader_operation = 0 if last_leader_operation is None else int(last_leader_operation['Value'])

            # get list of members
            members = [self.member(n) for k, n in nodes.items() if k.startswith(self._MEMBERS) and k.count('/') == 1]

            # get leader
            leader = nodes.get(self._LEADER)
            if not self._ctl and leader and leader['Value'] == self._name \
                    and self._session != leader.get('Session', 'x'):
                logger.info('I am leader but not owner of the session. Removing leader node')
                self._client.kv.delete(self.leader_path, cas=leader['ModifyIndex'])
                leader = None

            if leader:
                member = Member(-1, leader['Value'], None, {})
                member = ([m for m in members if m.name == leader['Value']] or [member])[0]
                leader = Leader(leader['ModifyIndex'], leader.get('Session'), member)

            # failover key
            failover = nodes.get(self._FAILOVER)
            if failover:
                failover = Failover.from_node(failover['ModifyIndex'], failover['Value'])

            # get synchronization state
            sync = nodes.get(self._SYNC)
            sync = SyncState.from_node(sync and sync['ModifyIndex'], sync and sync['Value'])

            self._cluster = Cluster(initialize, config, leader, last_leader_operation, members, failover, sync)
        except NotFound:
            self._cluster = Cluster(None, None, None, None, [], None, None)
        except:
            logger.exception('get_cluster')
            raise ConsulError('Consul is not responding properly')

    def touch_member(self, data, **kwargs):
        cluster = self.cluster
        member = cluster and cluster.get_member(self._name, fallback_to_leader=False)
        create_member = self.refresh_session()

        if member and (create_member or member.session != self._session):
            try:
                self._client.kv.delete(self.member_path)
                create_member = True
            except Exception:
                return False

        if not create_member and member and data == self._my_member_data:
            return True

        try:
            args = {} if kwargs.get('permanent', False) else {'acquire': self._session}
            self._client.kv.put(self.member_path, data, **args)
            self._my_member_data = data
            return True
        except Exception:
            logger.exception('touch_member')
        return False

    @catch_consul_errors
    def attempt_to_acquire_leader(self, permanent=False):
        if not self._session and not permanent:
            self.refresh_session()

        args = {} if permanent else {'acquire': self._session}
        ret = self.retry(self._client.kv.put, self.leader_path, self._name, **args)
        if not ret:
            logger.info('Could not take out TTL lock')
        return ret

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    @catch_consul_errors
    def set_failover_value(self, value, index=None):
        return self._client.kv.put(self.failover_path, value, cas=index)

    @catch_consul_errors
    def set_config_value(self, value, index=None):
        return self._client.kv.put(self.config_path, value, cas=index)

    @catch_consul_errors
    def _write_leader_optime(self, last_operation):
        return self._client.kv.put(self.leader_optime_path, last_operation)

    @catch_consul_errors
    def update_leader(self):
        if self._session:
            self.retry(self._client.session.renew, self._session)
            self._last_session_refresh = time.time()
        return bool(self._session)

    @catch_consul_errors
    def initialize(self, create_new=True, sysid=''):
        kwargs = {'cas': 0} if create_new else {}
        return self.retry(self._client.kv.put, self.initialize_path, sysid, **kwargs)

    @catch_consul_errors
    def cancel_initialization(self):
        return self.retry(self._client.kv.delete, self.initialize_path)

    @catch_consul_errors
    def delete_cluster(self):
        return self.retry(self._client.kv.delete, self.client_path(''), recurse=True)

    @catch_consul_errors
    def delete_leader(self):
        cluster = self.cluster
        if cluster and isinstance(cluster.leader, Leader) and cluster.leader.name == self._name:
            return self._client.kv.delete(self.leader_path, cas=cluster.leader.index)

    @catch_consul_errors
    def set_sync_state_value(self, value, index=None):
        return self._client.kv.put(self.sync_path, value, cas=index)

    @catch_consul_errors
    def delete_sync_state(self, index=None):
        return self._client.kv.delete(self.sync_path, cas=index)

    def watch(self, leader_index, timeout):
        if self.__do_not_watch:
            self.__do_not_watch = False
            return True

        if leader_index:
            end_time = time.time() + timeout
            while timeout >= 1:
                try:
                    idx, _ = self._client.kv.get(self.leader_path, index=leader_index, wait=str(timeout) + 's')
                    return str(idx) != str(leader_index)
                except (ConsulException, HTTPException, HTTPError, socket.error, socket.timeout):
                    logging.exception('watch')

                timeout = end_time - time.time()

        try:
            return super(Consul, self).watch(None, timeout)
        finally:
            self.event.clear()
Example #33
0
class Etcd(AbstractDCS):

    def __init__(self, name, config):
        super(Etcd, self).__init__(name, config)
        self.ttl = config['ttl']
        self.member_ttl = config.get('member_ttl', 3600)
        self._retry = Retry(deadline=10, max_delay=1, max_tries=-1,
                            retry_exceptions=(etcd.EtcdConnectionFailed,
                                              etcd.EtcdLeaderElectionInProgress,
                                              etcd.EtcdWatcherCleared,
                                              etcd.EtcdEventIndexCleared))
        self.client = self.get_etcd_client(config)
        self.cluster = None

    def retry(self, *args, **kwargs):
        return self._retry.copy()(*args, **kwargs)

    def get_etcd_client(self, config):
        client = None
        while not client:
            try:
                client = Client(config)
            except etcd.EtcdException:
                logger.info('waiting on etcd')
                sleep(5)
        return client

    @staticmethod
    def member(node):
        conn_url, api_url = parse_connection_string(node.value)
        return Member(node.modifiedIndex, os.path.basename(node.key), conn_url, api_url, node.expiration, node.ttl)

    def get_cluster(self):
        try:
            result = self.retry(self.client.read, self.client_path(''), recursive=True)
            nodes = {os.path.relpath(node.key, result.key): node for node in result.leaves}

            # get initialize flag
            initialize = bool(nodes.get(self._INITIALIZE, False))

            # get last leader operation
            last_leader_operation = nodes.get(self._LEADER_OPTIME, None)
            last_leader_operation = 0 if last_leader_operation is None else int(last_leader_operation.value)

            # get list of members
            members = [self.member(n) for k, n in nodes.items() if k.startswith(self._MEMBERS) and k.count('/') == 1]

            # get leader
            leader = nodes.get(self._LEADER, None)
            if leader:
                member = Member(-1, leader.value, None, None, None, None)
                member = ([m for m in members if m.name == leader.value] or [member])[0]
                leader = Leader(leader.modifiedIndex, leader.expiration, leader.ttl, member)

            self.cluster = Cluster(initialize, leader, last_leader_operation, members)
        except etcd.EtcdKeyNotFound:
            self.cluster = Cluster(False, None, None, [])
        except:
            self.cluster = None
            logger.exception('get_cluster')
            raise EtcdError('Etcd is not responding properly')
        return self.cluster

    @catch_etcd_errors
    def touch_member(self, connection_string, ttl=None):
        return self.retry(self.client.set, self.member_path, connection_string, ttl or self.member_ttl)

    @catch_etcd_errors
    def take_leader(self):
        return self.retry(self.client.set, self.leader_path, self._name, self.ttl)

    def attempt_to_acquire_leader(self):
        try:
            return bool(self.retry(self.client.write, self.leader_path, self._name, ttl=self.ttl, prevExist=False))
        except etcd.EtcdAlreadyExist:
            logger.info('Could not take out TTL lock')
        except (RetryFailedError, etcd.EtcdException):
            pass
        return False

    @catch_etcd_errors
    def write_leader_optime(self, state_handler):
        return self.client.set(self.leader_optime_path, state_handler.last_operation())

    @catch_etcd_errors
    def update_leader(self, state_handler):
        ret = self.retry(self.client.test_and_set, self.leader_path, self._name, self._name, self.ttl)
        ret and self.write_leader_optime(state_handler)
        return ret

    @catch_etcd_errors
    def initialize(self):
        return self.client.write(self.initialize_path, self._name, prevExist=False)

    @catch_etcd_errors
    def delete_leader(self):
        return self.client.delete(self.leader_path, prevValue=self._name)

    @catch_etcd_errors
    def cancel_initialization(self):
        return self.client.delete(self.initialize_path, prevValue=self._name)

    def watch(self, timeout):
        # watch on leader key changes if it is defined and current node is not lock owner
        if self.cluster and self.cluster.leader and self.cluster.leader.name != self._name:
            end_time = time.time() + timeout
            index = self.cluster.leader.index

            while index and timeout >= 1:  # when timeout is too small urllib3 doesn't have enough time to connect
                try:
                    self.client.watch(self.leader_path, index=index + 1, timeout=timeout + 0.5)
                    # Synchronous work of all cluster members with etcd is less expensive
                    # than reestablishing http connection every time from every replica.
                    return True
                except urllib3.exceptions.TimeoutError:
                    self.client.http.clear()
                    return False
                except etcd.EtcdException:
                    logging.exception('watch')

                timeout = end_time - time.time()

        return timeout > 0 and super(Etcd, self).watch(timeout)
Example #34
0
class Consul(AbstractDCS):
    def __init__(self, config):
        super(Consul, self).__init__(config)
        self._scope = config['scope']
        self._session = None
        self.__do_not_watch = False
        self._retry = Retry(deadline=config['retry_timeout'],
                            max_delay=1,
                            max_tries=-1,
                            retry_exceptions=(ConsulInternalError,
                                              HTTPException, HTTPError,
                                              socket.error, socket.timeout))

        self._my_member_data = None
        kwargs = {}
        if 'url' in config:
            r = urlparse(config['url'])
            config.update({
                'scheme': r.scheme,
                'host': r.hostname,
                'port': r.port or 8500
            })
        elif 'host' in config:
            host, port = (config.get('host', '127.0.0.1:8500') +
                          ':8500').split(':')[:2]
            config['host'] = host
            if 'port' not in config:
                config['port'] = int(port)

        if config.get('cacert'):
            config['ca_cert'] = config.pop('cacert')

        if config.get('key') and config.get('cert'):
            config['cert'] = (config['cert'], config['key'])

        config_keys = ('host', 'port', 'token', 'scheme', 'cert', 'ca_cert',
                       'dc')
        kwargs = {p: config.get(p) for p in config_keys if config.get(p)}

        verify = config.get('verify')
        if not isinstance(verify, bool):
            verify = parse_bool(verify)
        if isinstance(verify, bool):
            kwargs['verify'] = verify

        self._client = ConsulClient(**kwargs)
        self.set_retry_timeout(config['retry_timeout'])
        self.set_ttl(config.get('ttl') or 30)
        self._last_session_refresh = 0
        self.__session_checks = config.get('checks')
        if not self._ctl:
            self.create_session()

    def retry(self, *args, **kwargs):
        return self._retry.copy()(*args, **kwargs)

    def create_session(self):
        while not self._session:
            try:
                self.refresh_session()
            except ConsulError:
                logger.info('waiting on consul')
                time.sleep(5)

    def set_ttl(self, ttl):
        if self._client.http.set_ttl(ttl /
                                     2.0):  # Consul multiplies the TTL by 2x
            self._session = None
            self.__do_not_watch = True

    def set_retry_timeout(self, retry_timeout):
        self._retry.deadline = retry_timeout
        self._client.http.set_read_timeout(retry_timeout)

    def adjust_ttl(self):
        try:
            settings = self._client.agent.self()
            min_ttl = (settings['Config']['SessionTTLMin']
                       or 10000000000) / 1000000000.0
            logger.warning('Changing Session TTL from %s to %s',
                           self._client.http.ttl, min_ttl)
            self._client.http.set_ttl(min_ttl)
        except Exception:
            logger.exception('adjust_ttl')

    def _do_refresh_session(self):
        """:returns: `!True` if it had to create new session"""
        if self._session and self._last_session_refresh + self._loop_wait > time.time(
        ):
            return False

        if self._session:
            try:
                self._client.session.renew(self._session)
            except NotFound:
                self._session = None
        ret = not self._session
        if ret:
            try:
                self._session = self._client.session.create(
                    name=self._scope + '-' + self._name,
                    checks=self.__session_checks,
                    lock_delay=0.001,
                    behavior='delete')
            except InvalidSessionTTL:
                logger.exception('session.create')
                self.adjust_ttl()
                raise

        self._last_session_refresh = time.time()
        return ret

    def refresh_session(self):
        try:
            return self.retry(self._do_refresh_session)
        except (ConsulException, RetryFailedError):
            logger.exception('refresh_session')
        raise ConsulError('Failed to renew/create session')

    def client_path(self, path):
        return super(Consul, self).client_path(path)[1:]

    @staticmethod
    def member(node):
        return Member.from_node(node['ModifyIndex'],
                                os.path.basename(node['Key']),
                                node.get('Session'), node['Value'])

    def _load_cluster(self):
        try:
            path = self.client_path('/')
            _, results = self.retry(self._client.kv.get, path, recurse=True)

            if results is None:
                raise NotFound

            nodes = {}
            for node in results:
                node['Value'] = (node['Value'] or b'').decode('utf-8')
                nodes[os.path.relpath(node['Key'], path)] = node

            # get initialize flag
            initialize = nodes.get(self._INITIALIZE)
            initialize = initialize and initialize['Value']

            # get global dynamic configuration
            config = nodes.get(self._CONFIG)
            config = config and ClusterConfig.from_node(
                config['ModifyIndex'], config['Value'])

            # get last leader operation
            last_leader_operation = nodes.get(self._LEADER_OPTIME)
            last_leader_operation = 0 if last_leader_operation is None else int(
                last_leader_operation['Value'])

            # get list of members
            members = [
                self.member(n) for k, n in nodes.items()
                if k.startswith(self._MEMBERS) and k.count('/') == 1
            ]

            # get leader
            leader = nodes.get(self._LEADER)
            if not self._ctl and leader and leader['Value'] == self._name \
                    and self._session != leader.get('Session', 'x'):
                logger.info(
                    'I am leader but not owner of the session. Removing leader node'
                )
                self._client.kv.delete(self.leader_path,
                                       cas=leader['ModifyIndex'])
                leader = None

            if leader:
                member = Member(-1, leader['Value'], None, {})
                member = ([m for m in members if m.name == leader['Value']]
                          or [member])[0]
                leader = Leader(leader['ModifyIndex'], leader.get('Session'),
                                member)

            # failover key
            failover = nodes.get(self._FAILOVER)
            if failover:
                failover = Failover.from_node(failover['ModifyIndex'],
                                              failover['Value'])

            # get synchronization state
            sync = nodes.get(self._SYNC)
            sync = SyncState.from_node(sync and sync['ModifyIndex'], sync
                                       and sync['Value'])

            self._cluster = Cluster(initialize, config, leader,
                                    last_leader_operation, members, failover,
                                    sync)
        except NotFound:
            self._cluster = Cluster(None, None, None, None, [], None, None)
        except Exception:
            logger.exception('get_cluster')
            raise ConsulError('Consul is not responding properly')

    def touch_member(self, data, ttl=None, permanent=False):
        cluster = self.cluster
        member = cluster and cluster.get_member(self._name,
                                                fallback_to_leader=False)
        create_member = not permanent and self.refresh_session()

        if member and (create_member or member.session != self._session):
            try:
                self._client.kv.delete(self.member_path)
                create_member = True
            except Exception:
                return False

        if not create_member and member and data == self._my_member_data:
            return True

        try:
            args = {} if permanent else {'acquire': self._session}
            self._client.kv.put(self.member_path, data, **args)
            self._my_member_data = data
            return True
        except Exception:
            logger.exception('touch_member')
        return False

    @catch_consul_errors
    def _do_attempt_to_acquire_leader(self, kwargs):
        return self.retry(self._client.kv.put, self.leader_path, self._name,
                          **kwargs)

    def attempt_to_acquire_leader(self, permanent=False):
        if not self._session and not permanent:
            self.refresh_session()

        ret = self._do_attempt_to_acquire_leader(
            {} if permanent else {'acquire': self._session})
        if not ret:
            logger.info('Could not take out TTL lock')
        return ret

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    @catch_consul_errors
    def set_failover_value(self, value, index=None):
        return self._client.kv.put(self.failover_path, value, cas=index)

    @catch_consul_errors
    def set_config_value(self, value, index=None):
        return self._client.kv.put(self.config_path, value, cas=index)

    @catch_consul_errors
    def _write_leader_optime(self, last_operation):
        return self._client.kv.put(self.leader_optime_path, last_operation)

    @catch_consul_errors
    def update_leader(self):
        if self._session:
            self.retry(self._client.session.renew, self._session)
            self._last_session_refresh = time.time()
        return bool(self._session)

    @catch_consul_errors
    def initialize(self, create_new=True, sysid=''):
        kwargs = {'cas': 0} if create_new else {}
        return self.retry(self._client.kv.put, self.initialize_path, sysid,
                          **kwargs)

    @catch_consul_errors
    def cancel_initialization(self):
        return self.retry(self._client.kv.delete, self.initialize_path)

    @catch_consul_errors
    def delete_cluster(self):
        return self.retry(self._client.kv.delete,
                          self.client_path(''),
                          recurse=True)

    @catch_consul_errors
    def delete_leader(self):
        cluster = self.cluster
        if cluster and isinstance(
                cluster.leader, Leader) and cluster.leader.name == self._name:
            return self._client.kv.delete(self.leader_path,
                                          cas=cluster.leader.index)

    @catch_consul_errors
    def set_sync_state_value(self, value, index=None):
        return self._client.kv.put(self.sync_path, value, cas=index)

    @catch_consul_errors
    def delete_sync_state(self, index=None):
        return self._client.kv.delete(self.sync_path, cas=index)

    def watch(self, leader_index, timeout):
        if self.__do_not_watch:
            self.__do_not_watch = False
            return True

        if leader_index:
            end_time = time.time() + timeout
            while timeout >= 1:
                try:
                    idx, _ = self._client.kv.get(self.leader_path,
                                                 index=leader_index,
                                                 wait=str(timeout) + 's')
                    return str(idx) != str(leader_index)
                except (ConsulException, HTTPException, HTTPError,
                        socket.error, socket.timeout):
                    logging.exception('watch')

                timeout = end_time - time.time()

        try:
            return super(Consul, self).watch(None, timeout)
        finally:
            self.event.clear()
Example #35
0
class Etcd(AbstractDCS):

    def __init__(self, config):
        super(Etcd, self).__init__(config)
        self._ttl = int(config.get('ttl') or 30)
        self._retry = Retry(deadline=config['retry_timeout'], max_delay=1, max_tries=-1,
                            retry_exceptions=(etcd.EtcdLeaderElectionInProgress,
                                              etcd.EtcdWatcherCleared,
                                              etcd.EtcdEventIndexCleared))
        self._client = self.get_etcd_client(config)
        self.__do_not_watch = False

    def retry(self, *args, **kwargs):
        return self._retry.copy()(*args, **kwargs)

    @staticmethod
    def get_etcd_client(config):
        if 'proxy' in config:
            config['use_proxies'] = True
            config['url'] = config['proxy']

        if 'url' in config:
            r = urlparse(config['url'])
            config.update({'protocol': r.scheme, 'host': r.hostname, 'port': r.port or 2379,
                           'username': r.username, 'password': r.password})
        elif 'host' in config:
            host, port = (config['host'] + ':2379').split(':')[:2]
            config['host'] = host
            if 'port' not in config:
                config['port'] = int(port)

        if config.get('cacert'):
            config['ca_cert'] = config.pop('cacert')

        if config.get('key') and config.get('cert'):
            config['cert'] = (config['cert'], config['key'])

        for p in ('discovery_srv', 'srv_domain'):
            if p in config:
                config['srv'] = config.pop(p)
        client = None
        while not client:
            try:
                client = Client(config)
            except etcd.EtcdException:
                logger.info('waiting on etcd')
                time.sleep(5)
        return client

    def set_ttl(self, ttl):
        ttl = int(ttl)
        self.__do_not_watch = self._ttl != ttl
        self._ttl = ttl
        self._client.set_machines_cache_ttl(ttl*10)

    def set_retry_timeout(self, retry_timeout):
        self._retry.deadline = retry_timeout
        self._client.set_read_timeout(retry_timeout)

    @staticmethod
    def member(node):
        return Member.from_node(node.modifiedIndex, os.path.basename(node.key), node.ttl, node.value)

    def _load_cluster(self):
        try:
            result = self.retry(self._client.read, self.client_path(''), recursive=True)
            nodes = {os.path.relpath(node.key, result.key): node for node in result.leaves}

            # get initialize flag
            initialize = nodes.get(self._INITIALIZE)
            initialize = initialize and initialize.value

            # get global dynamic configuration
            config = nodes.get(self._CONFIG)
            config = config and ClusterConfig.from_node(config.modifiedIndex, config.value)

            # get last leader operation
            last_leader_operation = nodes.get(self._LEADER_OPTIME)
            last_leader_operation = 0 if last_leader_operation is None else int(last_leader_operation.value)

            # get list of members
            members = [self.member(n) for k, n in nodes.items() if k.startswith(self._MEMBERS) and k.count('/') == 1]

            # get leader
            leader = nodes.get(self._LEADER)
            if leader:
                member = Member(-1, leader.value, None, {})
                member = ([m for m in members if m.name == leader.value] or [member])[0]
                index = result.etcd_index if result.etcd_index > leader.modifiedIndex else leader.modifiedIndex + 1
                leader = Leader(index, leader.ttl, member)

            # failover key
            failover = nodes.get(self._FAILOVER)
            if failover:
                failover = Failover.from_node(failover.modifiedIndex, failover.value)

            # get synchronization state
            sync = nodes.get(self._SYNC)
            sync = SyncState.from_node(sync and sync.modifiedIndex, sync and sync.value)

            self._cluster = Cluster(initialize, config, leader, last_leader_operation, members, failover, sync)
        except etcd.EtcdKeyNotFound:
            self._cluster = Cluster(None, None, None, None, [], None, None)
        except:
            logger.exception('get_cluster')
            raise EtcdError('Etcd is not responding properly')

    @catch_etcd_errors
    def touch_member(self, data, ttl=None, permanent=False):
        return self.retry(self._client.set, self.member_path, data, None if permanent else ttl or self._ttl)

    @catch_etcd_errors
    def take_leader(self):
        return self.retry(self._client.set, self.leader_path, self._name, self._ttl)

    def attempt_to_acquire_leader(self, permanent=False):
        try:
            return bool(self.retry(self._client.write,
                                   self.leader_path,
                                   self._name,
                                   ttl=None if permanent else self._ttl,
                                   prevExist=False))
        except etcd.EtcdAlreadyExist:
            logger.info('Could not take out TTL lock')
        except (RetryFailedError, etcd.EtcdException):
            pass
        return False

    @catch_etcd_errors
    def set_failover_value(self, value, index=None):
        return self._client.write(self.failover_path, value, prevIndex=index or 0)

    @catch_etcd_errors
    def set_config_value(self, value, index=None):
        return self._client.write(self.config_path, value, prevIndex=index or 0)

    @catch_etcd_errors
    def _write_leader_optime(self, last_operation):
        return self._client.set(self.leader_optime_path, last_operation)

    @catch_etcd_errors
    def update_leader(self):
        return self.retry(self._client.test_and_set, self.leader_path, self._name, self._name, self._ttl)

    @catch_etcd_errors
    def initialize(self, create_new=True, sysid=""):
        return self.retry(self._client.write, self.initialize_path, sysid, prevExist=(not create_new))

    @catch_etcd_errors
    def delete_leader(self):
        return self._client.delete(self.leader_path, prevValue=self._name)

    @catch_etcd_errors
    def cancel_initialization(self):
        return self.retry(self._client.delete, self.initialize_path)

    @catch_etcd_errors
    def delete_cluster(self):
        return self.retry(self._client.delete, self.client_path(''), recursive=True)

    @catch_etcd_errors
    def set_sync_state_value(self, value, index=None):
        return self._client.write(self.sync_path, value, prevIndex=index or 0)

    @catch_etcd_errors
    def delete_sync_state(self, index=None):
        return self.retry(self._client.delete, self.sync_path, prevIndex=index or 0)

    def watch(self, leader_index, timeout):
        if self.__do_not_watch:
            self.__do_not_watch = False
            return True

        if leader_index:
            end_time = time.time() + timeout

            while timeout >= 1:  # when timeout is too small urllib3 doesn't have enough time to connect
                try:
                    self._client.watch(self.leader_path, index=leader_index, timeout=timeout + 0.5)
                    # Synchronous work of all cluster members with etcd is less expensive
                    # than reestablishing http connection every time from every replica.
                    return True
                except etcd.EtcdWatchTimedOut:
                    self._client.http.clear()
                    return False
                except etcd.EtcdException:
                    logger.exception('watch')

                timeout = end_time - time.time()

        try:
            return super(Etcd, self).watch(None, timeout)
        finally:
            self.event.clear()
Example #36
0
class Consul(AbstractDCS):

    def __init__(self, config):
        super(Consul, self).__init__(config)
        self._scope = config['scope']
        self._session = None
        self.__do_not_watch = False
        self._retry = Retry(deadline=config['retry_timeout'], max_delay=1, max_tries=-1,
                            retry_exceptions=(ConsulInternalError, HTTPException,
                                              HTTPError, socket.error, socket.timeout))

        kwargs = {}
        if 'url' in config:
            r = urlparse(config['url'])
            config.update({'scheme': r.scheme, 'host': r.hostname, 'port': r.port or 8500})
        elif 'host' in config:
            host, port = split_host_port(config.get('host', '127.0.0.1:8500'), 8500)
            config['host'] = host
            if 'port' not in config:
                config['port'] = int(port)

        if config.get('cacert'):
            config['ca_cert'] = config.pop('cacert')

        if config.get('key') and config.get('cert'):
            config['cert'] = (config['cert'], config['key'])

        config_keys = ('host', 'port', 'token', 'scheme', 'cert', 'ca_cert', 'dc')
        kwargs = {p: config.get(p) for p in config_keys if config.get(p)}

        verify = config.get('verify')
        if not isinstance(verify, bool):
            verify = parse_bool(verify)
        if isinstance(verify, bool):
            kwargs['verify'] = verify

        self._client = ConsulClient(**kwargs)
        self.set_retry_timeout(config['retry_timeout'])
        self.set_ttl(config.get('ttl') or 30)
        self._last_session_refresh = 0
        self.__session_checks = config.get('checks')
        self._register_service = config.get('register_service', False)
        if self._register_service:
            self._service_name = service_name_from_scope_name(self._scope)
            if self._scope != self._service_name:
                logger.warning('Using %s as consul service name instead of scope name %s', self._service_name,
                               self._scope)
        self._service_check_interval = config.get('service_check_interval', '5s')
        if not self._ctl:
            self.create_session()

    def retry(self, *args, **kwargs):
        return self._retry.copy()(*args, **kwargs)

    def create_session(self):
        while not self._session:
            try:
                self.refresh_session()
            except ConsulError:
                logger.info('waiting on consul')
                time.sleep(5)

    def set_ttl(self, ttl):
        if self._client.http.set_ttl(ttl/2.0):  # Consul multiplies the TTL by 2x
            self._session = None
            self.__do_not_watch = True

    def set_retry_timeout(self, retry_timeout):
        self._retry.deadline = retry_timeout
        self._client.http.set_read_timeout(retry_timeout)

    def adjust_ttl(self):
        try:
            settings = self._client.agent.self()
            min_ttl = (settings['Config']['SessionTTLMin'] or 10000000000)/1000000000.0
            logger.warning('Changing Session TTL from %s to %s', self._client.http.ttl, min_ttl)
            self._client.http.set_ttl(min_ttl)
        except Exception:
            logger.exception('adjust_ttl')

    def _do_refresh_session(self):
        """:returns: `!True` if it had to create new session"""
        if self._session and self._last_session_refresh + self._loop_wait > time.time():
            return False

        if self._session:
            try:
                self._client.session.renew(self._session)
            except NotFound:
                self._session = None
        ret = not self._session
        if ret:
            try:
                self._session = self._client.session.create(name=self._scope + '-' + self._name,
                                                            checks=self.__session_checks,
                                                            lock_delay=0.001, behavior='delete')
            except InvalidSessionTTL:
                logger.exception('session.create')
                self.adjust_ttl()
                raise

        self._last_session_refresh = time.time()
        return ret

    def refresh_session(self):
        try:
            return self.retry(self._do_refresh_session)
        except (ConsulException, RetryFailedError):
            logger.exception('refresh_session')
        raise ConsulError('Failed to renew/create session')

    def client_path(self, path):
        return super(Consul, self).client_path(path)[1:]

    @staticmethod
    def member(node):
        return Member.from_node(node['ModifyIndex'], os.path.basename(node['Key']), node.get('Session'), node['Value'])

    def _load_cluster(self):
        try:
            path = self.client_path('/')
            _, results = self.retry(self._client.kv.get, path, recurse=True)

            if results is None:
                raise NotFound

            nodes = {}
            for node in results:
                node['Value'] = (node['Value'] or b'').decode('utf-8')
                nodes[node['Key'][len(path):].lstrip('/')] = node

            # get initialize flag
            initialize = nodes.get(self._INITIALIZE)
            initialize = initialize and initialize['Value']

            # get global dynamic configuration
            config = nodes.get(self._CONFIG)
            config = config and ClusterConfig.from_node(config['ModifyIndex'], config['Value'])

            # get timeline history
            history = nodes.get(self._HISTORY)
            history = history and TimelineHistory.from_node(history['ModifyIndex'], history['Value'])

            # get last leader operation
            last_leader_operation = nodes.get(self._LEADER_OPTIME)
            last_leader_operation = 0 if last_leader_operation is None else int(last_leader_operation['Value'])

            # get list of members
            members = [self.member(n) for k, n in nodes.items() if k.startswith(self._MEMBERS) and k.count('/') == 1]

            # get leader
            leader = nodes.get(self._LEADER)
            if not self._ctl and leader and leader['Value'] == self._name \
                    and self._session != leader.get('Session', 'x'):
                logger.info('I am leader but not owner of the session. Removing leader node')
                self._client.kv.delete(self.leader_path, cas=leader['ModifyIndex'])
                leader = None

            if leader:
                member = Member(-1, leader['Value'], None, {})
                member = ([m for m in members if m.name == leader['Value']] or [member])[0]
                leader = Leader(leader['ModifyIndex'], leader.get('Session'), member)

            # failover key
            failover = nodes.get(self._FAILOVER)
            if failover:
                failover = Failover.from_node(failover['ModifyIndex'], failover['Value'])

            # get synchronization state
            sync = nodes.get(self._SYNC)
            sync = SyncState.from_node(sync and sync['ModifyIndex'], sync and sync['Value'])

            self._cluster = Cluster(initialize, config, leader, last_leader_operation, members, failover, sync, history)
        except NotFound:
            self._cluster = Cluster(None, None, None, None, [], None, None, None)
        except Exception:
            logger.exception('get_cluster')
            raise ConsulError('Consul is not responding properly')

    @catch_consul_errors
    def touch_member(self, data, ttl=None, permanent=False):
        cluster = self.cluster
        member = cluster and cluster.get_member(self._name, fallback_to_leader=False)
        create_member = not permanent and self.refresh_session()

        if member and (create_member or member.session != self._session):
            self._client.kv.delete(self.member_path)
            create_member = True

        if not create_member and member and deep_compare(data, member.data):
            return True

        try:
            args = {} if permanent else {'acquire': self._session}
            self._client.kv.put(self.member_path, json.dumps(data, separators=(',', ':')), **args)
            if self._register_service:
                self.update_service(not create_member and member and member.data or {}, data)
            return True
        except InvalidSession:
            self._session = None
            logger.error('Our session disappeared from Consul, can not "touch_member"')
        except Exception:
            logger.exception('touch_member')
        return False

    @catch_consul_errors
    def register_service(self, service_name, **kwargs):
        logger.info('Register service %s, params %s', service_name, kwargs)
        return self._client.agent.service.register(service_name, **kwargs)

    @catch_consul_errors
    def deregister_service(self, service_id):
        logger.info('Deregister service %s', service_id)
        # service_id can contain special characters, but is used as part of uri in deregister request
        service_id = quote(service_id)
        return self._client.agent.service.deregister(service_id)

    def _update_service(self, data):
        service_name = self._service_name
        role = data['role'].replace('_', '-')
        state = data['state']
        api_parts = urlparse(data['api_url'])
        api_parts = api_parts._replace(path='/{0}'.format(role))
        conn_parts = urlparse(data['conn_url'])
        check = base.Check.http(api_parts.geturl(), self._service_check_interval,
                                deregister='{0}s'.format(self._client.http.ttl * 10))
        params = {
            'service_id': '{0}/{1}'.format(self._scope, self._name),
            'address': conn_parts.hostname,
            'port': conn_parts.port,
            'check': check,
            'tags': [role]
        }

        if state == 'stopped':
            return self.deregister_service(params['service_id'])

        if role in ['master', 'replica', 'standby-leader']:
            if state != 'running':
                return
            return self.register_service(service_name, **params)

        logger.warning('Could not register service: unknown role type %s', role)

    @force_if_last_failed
    def update_service(self, old_data, new_data, force=False):
        update = False

        for key in ['role', 'api_url', 'conn_url', 'state']:
            if key not in new_data:
                logger.warning('Could not register service: not enough params in member data')
                return
            if old_data.get(key) != new_data[key]:
                update = True

        if force or update:
            return self._update_service(new_data)

    @catch_consul_errors
    def _do_attempt_to_acquire_leader(self, permanent):
        try:
            kwargs = {} if permanent else {'acquire': self._session}
            return self.retry(self._client.kv.put, self.leader_path, self._name, **kwargs)
        except InvalidSession:
            self._session = None
            logger.error('Our session disappeared from Consul. Will try to get a new one and retry attempt')
            self.refresh_session()
            return self.retry(self._client.kv.put, self.leader_path, self._name, acquire=self._session)

    def attempt_to_acquire_leader(self, permanent=False):
        if not self._session and not permanent:
            self.refresh_session()

        ret = self._do_attempt_to_acquire_leader(permanent)
        if not ret:
            logger.info('Could not take out TTL lock')

        return ret

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    @catch_consul_errors
    def set_failover_value(self, value, index=None):
        return self._client.kv.put(self.failover_path, value, cas=index)

    @catch_consul_errors
    def set_config_value(self, value, index=None):
        return self._client.kv.put(self.config_path, value, cas=index)

    @catch_consul_errors
    def _write_leader_optime(self, last_operation):
        return self._client.kv.put(self.leader_optime_path, last_operation)

    @catch_consul_errors
    def _update_leader(self):
        if self._session:
            self.retry(self._client.session.renew, self._session)
            self._last_session_refresh = time.time()
        return bool(self._session)

    @catch_consul_errors
    def initialize(self, create_new=True, sysid=''):
        kwargs = {'cas': 0} if create_new else {}
        return self.retry(self._client.kv.put, self.initialize_path, sysid, **kwargs)

    @catch_consul_errors
    def cancel_initialization(self):
        return self.retry(self._client.kv.delete, self.initialize_path)

    @catch_consul_errors
    def delete_cluster(self):
        return self.retry(self._client.kv.delete, self.client_path(''), recurse=True)

    @catch_consul_errors
    def set_history_value(self, value):
        return self._client.kv.put(self.history_path, value)

    @catch_consul_errors
    def delete_leader(self):
        cluster = self.cluster
        if cluster and isinstance(cluster.leader, Leader) and cluster.leader.name == self._name:
            return self._client.kv.delete(self.leader_path, cas=cluster.leader.index)

    @catch_consul_errors
    def set_sync_state_value(self, value, index=None):
        return self.retry(self._client.kv.put, self.sync_path, value, cas=index)

    @catch_consul_errors
    def delete_sync_state(self, index=None):
        return self.retry(self._client.kv.delete, self.sync_path, cas=index)

    def watch(self, leader_index, timeout):
        if self.__do_not_watch:
            self.__do_not_watch = False
            return True

        if leader_index:
            end_time = time.time() + timeout
            while timeout >= 1:
                try:
                    idx, _ = self._client.kv.get(self.leader_path, index=leader_index, wait=str(timeout) + 's')
                    return str(idx) != str(leader_index)
                except (ConsulException, HTTPException, HTTPError, socket.error, socket.timeout):
                    logger.exception('watch')

                timeout = end_time - time.time()

        try:
            return super(Consul, self).watch(None, timeout)
        finally:
            self._last_session_refresh = 0
            self.event.clear()
Example #37
0
class Etcd(AbstractDCS):

    def __init__(self, config):
        super(Etcd, self).__init__(config)
        self._ttl = int(config.get('ttl') or 30)
        self._retry = Retry(deadline=config['retry_timeout'], max_delay=1, max_tries=-1,
                            retry_exceptions=(etcd.EtcdLeaderElectionInProgress,
                                              etcd.EtcdWatcherCleared,
                                              etcd.EtcdEventIndexCleared))
        self._client = self.get_etcd_client(config)
        self.__do_not_watch = False

    def retry(self, *args, **kwargs):
        return self._retry.copy()(*args, **kwargs)

    @staticmethod
    def get_etcd_client(config):
        client = None
        while not client:
            try:
                client = Client(config)
            except etcd.EtcdException:
                logger.info('waiting on etcd')
                sleep(5)
        return client

    def set_ttl(self, ttl):
        ttl = int(ttl)
        self.__do_not_watch = self._ttl != ttl
        self._ttl = ttl

    def set_retry_timeout(self, retry_timeout):
        self._retry.deadline = retry_timeout
        self._client.set_read_timeout(retry_timeout)

    @staticmethod
    def member(node):
        return Member.from_node(node.modifiedIndex, os.path.basename(node.key), node.ttl, node.value)

    def _load_cluster(self):
        try:
            result = self.retry(self._client.read, self.client_path(''), recursive=True)
            nodes = {os.path.relpath(node.key, result.key): node for node in result.leaves}

            # get initialize flag
            initialize = nodes.get(self._INITIALIZE)
            initialize = initialize and initialize.value

            # get global dynamic configuration
            config = nodes.get(self._CONFIG)
            config = config and ClusterConfig.from_node(config.modifiedIndex, config.value)

            # get last leader operation
            last_leader_operation = nodes.get(self._LEADER_OPTIME)
            last_leader_operation = 0 if last_leader_operation is None else int(last_leader_operation.value)

            # get list of members
            members = [self.member(n) for k, n in nodes.items() if k.startswith(self._MEMBERS) and k.count('/') == 1]

            # get leader
            leader = nodes.get(self._LEADER)
            if leader:
                member = Member(-1, leader.value, None, {})
                member = ([m for m in members if m.name == leader.value] or [member])[0]
                leader = Leader(leader.modifiedIndex, leader.ttl, member)

            # failover key
            failover = nodes.get(self._FAILOVER)
            if failover:
                failover = Failover.from_node(failover.modifiedIndex, failover.value)

            self._cluster = Cluster(initialize, config, leader, last_leader_operation, members, failover)
        except etcd.EtcdKeyNotFound:
            self._cluster = Cluster(None, None, None, None, [], None)
        except:
            logger.exception('get_cluster')
            raise EtcdError('Etcd is not responding properly')

    @catch_etcd_errors
    def touch_member(self, data, ttl=None, permanent=False):
        return self.retry(self._client.set, self.member_path, data, None if permanent else ttl or self._ttl)

    @catch_etcd_errors
    def take_leader(self):
        return self.retry(self._client.set, self.leader_path, self._name, self._ttl)

    def attempt_to_acquire_leader(self, permanent=False):
        try:
            return bool(self.retry(self._client.write,
                                   self.leader_path,
                                   self._name,
                                   ttl=None if permanent else self._ttl,
                                   prevExist=False))
        except etcd.EtcdAlreadyExist:
            logger.info('Could not take out TTL lock')
        except (RetryFailedError, etcd.EtcdException):
            pass
        return False

    @catch_etcd_errors
    def set_failover_value(self, value, index=None):
        return self._client.write(self.failover_path, value, prevIndex=index or 0)

    @catch_etcd_errors
    def set_config_value(self, value, index=None):
        return self._client.write(self.config_path, value, prevIndex=index or 0)

    @catch_etcd_errors
    def write_leader_optime(self, last_operation):
        return self._client.set(self.leader_optime_path, last_operation)

    @catch_etcd_errors
    def update_leader(self):
        return self.retry(self._client.test_and_set, self.leader_path, self._name, self._name, self._ttl)

    @catch_etcd_errors
    def initialize(self, create_new=True, sysid=""):
        return self.retry(self._client.write, self.initialize_path, sysid, prevExist=(not create_new))

    @catch_etcd_errors
    def delete_leader(self):
        return self._client.delete(self.leader_path, prevValue=self._name)

    @catch_etcd_errors
    def cancel_initialization(self):
        return self.retry(self._client.delete, self.initialize_path)

    @catch_etcd_errors
    def delete_cluster(self):
        return self.retry(self._client.delete, self.client_path(''), recursive=True)

    def watch(self, timeout):
        if self.__do_not_watch:
            self.__do_not_watch = False
            return True

        cluster = self.cluster
        # watch on leader key changes if it is defined and current node is not lock owner
        if cluster and cluster.leader and cluster.leader.name != self._name and cluster.leader.index:
            end_time = time.time() + timeout

            while timeout >= 1:  # when timeout is too small urllib3 doesn't have enough time to connect
                try:
                    self._client.watch(self.leader_path, index=cluster.leader.index + 1, timeout=timeout + 0.5)
                    # Synchronous work of all cluster members with etcd is less expensive
                    # than reestablishing http connection every time from every replica.
                    return True
                except etcd.EtcdWatchTimedOut:
                    self._client.http.clear()
                    return False
                except etcd.EtcdException:
                    logging.exception('watch')

                timeout = end_time - time.time()

        try:
            return super(Etcd, self).watch(timeout)
        finally:
            self.event.clear()
Example #38
0
 def query(self, sql, *params, **kwargs):
     if not kwargs.get('retry', False):
         return self.server.query(sql, *params)
     retry = Retry(delay=1, retry_exceptions=PostgresConnectionException)
     return retry(self.server.query, sql, *params)
Example #39
0
 def _makeOne(self, *args, **kwargs):
     return Retry(*args, **kwargs)