def test_two_nodes_fail(self):
    """
    Verify a container DELETE survives the loss of two primaries.

    Scenario:
      * Create container1.
      * Kill the container1 servers except one of the primaries.
      * DELETE container1 directly on the surviving primary.
      * Restart the other container1 servers and replicate to a final
        state.
      * Every container1 server must now 404 for container1 (the one
        node that saw the delete replicated it to the others), and the
        account listing must show zero containers/objects/bytes.
    """
    # Create container1
    container1 = 'container-%s' % uuid4()
    cpart, cnodes = self.container_ring.get_nodes(self.account, container1)
    client.put_container(self.url, self.token, container1)
    # Kill container1 servers excepting one of the primaries
    cnp_port = kill_nonprimary_server(cnodes, self.port2server, self.pids)
    kill_server(cnodes[0]['port'], self.port2server, self.pids)
    kill_server(cnodes[1]['port'], self.port2server, self.pids)
    # Delete container1 directly to the one primary still up
    direct_client.direct_delete_container(cnodes[2], cpart, self.account,
                                          container1)
    # Restart other container1 servers
    start_server(cnodes[0]['port'], self.port2server, self.pids)
    start_server(cnodes[1]['port'], self.port2server, self.pids)
    start_server(cnp_port, self.port2server, self.pids)
    # Get to a final state
    get_to_final_state()
    # Assert all container1 servers indicate container1 is gone (happens
    # because the one node that knew about the delete replicated to the
    # others.)
    for cnode in cnodes:
        try:
            direct_client.direct_get_container(cnode, cpart, self.account,
                                               container1)
        except client.ClientException as err:
            # assertEqual, not the deprecated assertEquals alias.
            self.assertEqual(err.http_status, 404)
        else:
            # The old code left exc = None here and then crashed on
            # exc.http_status; fail with a clear message instead.
            self.fail('Expected ClientException but got none')
    # Assert account level also indicates container1 is gone
    headers, containers = client.get_account(self.url, self.token)
    self.assertEqual(headers['x-account-container-count'], '0')
    self.assertEqual(headers['x-account-object-count'], '0')
    self.assertEqual(headers['x-account-bytes-used'], '0')
def test_direct_delete_container(self):
    """A container DELETE must target the node's ip/port and path."""
    with mocked_http_conn(200) as fake_conn:
        direct_client.direct_delete_container(
            self.node, self.part, self.account, self.container)
        # Inspect the request the client actually issued.
        self.assertEqual(fake_conn.method, 'DELETE')
        self.assertEqual(fake_conn.path, self.container_path)
        self.assertEqual(fake_conn.host, self.node['ip'])
        self.assertEqual(fake_conn.port, self.node['port'])
def test_direct_delete_container_with_timestamp(self):
    """A caller-supplied X-Timestamp header must be forwarded as-is."""
    # Pick a timestamp well in the past so it cannot collide with any
    # that might be auto-generated.
    ts = Timestamp(time.time() - 100)
    with mocked_http_conn(200) as fake_conn:
        direct_client.direct_delete_container(
            self.node, self.part, self.account, self.container,
            headers={'X-Timestamp': ts.internal})
        self.assertEqual(fake_conn.method, 'DELETE')
        self.assertEqual(fake_conn.path, self.container_path)
        self.assertIn('X-Timestamp', fake_conn.req_headers)
        self.assertEqual(ts, fake_conn.req_headers['X-Timestamp'])
def test_direct_delete_container(self):
    """
    A DELETE through direct_client with a stubbed http_connect
    returning 200 should complete without raising.
    """
    node = {'ip': '1.2.3.4', 'port': '6000', 'device': 'sda'}
    part = '0'
    account = 'a'
    container = 'c'
    was_http_connector = direct_client.http_connect
    try:
        direct_client.http_connect = mock_http_connect(200)
        direct_client.direct_delete_container(node, part, account,
                                              container)
    finally:
        # Always restore the real connector, even if the call raises,
        # so the monkey-patch cannot leak into other tests.
        direct_client.http_connect = was_http_connector
def test_direct_delete_container_error(self):
    """A 500 from the backend must surface as a ClientException."""
    with mocked_http_conn(500) as fake_conn:
        with self.assertRaises(ClientException) as ctx:
            direct_client.direct_delete_container(
                self.node, self.part, self.account, self.container)
        # The failed request was still a DELETE to the right place.
        self.assertEqual(fake_conn.method, 'DELETE')
        self.assertEqual(fake_conn.path, self.container_path)
        self.assertEqual(fake_conn.host, self.node['ip'])
        self.assertEqual(fake_conn.port, self.node['port'])
        caught = ctx.exception
        self.assertEqual(caught.http_status, 500)
        self.assertIn('DELETE', str(caught))
def test_direct_delete_container_error(self):
    """
    direct_delete_container must raise ClientException on a 500, and
    the exception must carry the HTTP status and the request verb.
    """
    with mocked_http_conn(500) as conn:
        try:
            direct_client.direct_delete_container(
                self.node, self.part, self.account, self.container)
        except ClientException as err:
            # Re-bind the exception: on Python 3 the 'as err' name is
            # deleted when the except block exits, so the original
            # asserts on err below would raise NameError there.
            caught = err
        else:
            self.fail('ClientException not raised')
    self.assertEqual(conn.method, 'DELETE')
    self.assertEqual(conn.path, self.container_path)
    self.assertEqual(caught.http_status, 500)
    self.assertTrue('DELETE' in str(caught))
def test_direct_delete_container(self):
    """
    A DELETE through direct_client with a stubbed http_connect
    returning 200 should complete without raising.
    """
    node = {'ip': '1.2.3.4', 'port': '6000', 'device': 'sda'}
    part = '0'
    account = 'a'
    container = 'c'
    # Unused locals (name, contents, headers) copied from an object
    # test were removed; this test only deletes a container.
    was_http_connector = direct_client.http_connect
    try:
        direct_client.http_connect = mock_http_connect(200)
        direct_client.direct_delete_container(node, part, account,
                                              container)
    finally:
        # Restore the real connector even on failure so the
        # monkey-patch cannot leak into other tests.
        direct_client.http_connect = was_http_connector
def test_two_nodes_fail(self):
    """
    A container DELETE must survive the loss of two primary container
    servers: the single surviving primary accepts the delete and later
    replicates it to the restarted nodes, and the account listing ends
    up empty.
    """
    # Create container1
    container1 = 'container-%s' % uuid4()
    cpart, cnodes = self.container_ring.get_nodes(self.account, container1)
    client.put_container(self.url, self.token, container1)

    # Kill container1 servers excepting one of the primaries
    cnp_ipport = kill_nonprimary_server(cnodes, self.ipport2server,
                                        self.pids)
    kill_server((cnodes[0]['ip'], cnodes[0]['port']),
                self.ipport2server, self.pids)
    kill_server((cnodes[1]['ip'], cnodes[1]['port']),
                self.ipport2server, self.pids)

    # Delete container1 directly to the one primary still up
    direct_client.direct_delete_container(cnodes[2], cpart, self.account,
                                          container1)

    # Restart other container1 servers
    start_server((cnodes[0]['ip'], cnodes[0]['port']),
                 self.ipport2server, self.pids)
    start_server((cnodes[1]['ip'], cnodes[1]['port']),
                 self.ipport2server, self.pids)
    start_server(cnp_ipport, self.ipport2server, self.pids)

    # Get to a final state (run replicators/updaters so the delete
    # propagates from the one node that saw it)
    self.get_to_final_state()

    # Assert all container1 servers indicate container1 is gone (happens
    # because the one node that knew about the delete replicated to the
    # others.)
    for cnode in cnodes:
        try:
            direct_client.direct_get_container(cnode, cpart, self.account,
                                               container1)
        except ClientException as err:
            self.assertEqual(err.http_status, 404)
        else:
            self.fail("Expected ClientException but didn't get it")

    # Assert account level also indicates container1 is gone
    headers, containers = client.get_account(self.url, self.token)
    self.assertEqual(headers['x-account-container-count'], '0')
    self.assertEqual(headers['x-account-object-count'], '0')
    self.assertEqual(headers['x-account-bytes-used'], '0')
def reap_container(self, account, account_partition, account_nodes,
                   container):
    """
    Deletes the data and the container itself for the given container.
    This will call :func:`reap_object` up to sqrt(self.concurrency)
    times concurrently for the objects in the container.

    If there is any exception while deleting a single object, the
    process will continue for any other objects in the container and
    the failed objects will be tried again the next time this function
    is called with the same parameters.

    If there is any exception while listing the objects for deletion,
    the process will stop (but will obviously be tried again the next
    time this function is called with the same parameters). This is a
    possibility since the listing comes from querying just the primary
    remote container server.

    Once all objects have been attempted to be deleted, the container
    itself will be attempted to be deleted by sending a delete request
    to all container nodes. The format of the delete request is such
    that each container server will update a corresponding account
    server, removing the container from the account's listing.

    This function returns nothing and should raise no exception but
    only update various self.stats_* values for what occurs.

    :param account: The name of the account for the container.
    :param account_partition: The partition for the account on the
                              account ring.
    :param account_nodes: The primary node dicts for the account.
    :param container: The name of the container to delete.

    * See also: :func:`swift.common.ring.Ring.get_nodes` for a
      description of the account node dicts.
    """
    # Copy so we can pop() an account node per container node below
    # without mutating the caller's list.
    account_nodes = list(account_nodes)
    part, nodes = self.get_container_ring().get_nodes(account, container)
    # Listings are fetched from a single primary (the last node).
    node = nodes[-1]
    pool = GreenPool(size=self.object_concurrency)
    marker = ''
    while True:
        objects = None
        try:
            headers, objects = direct_get_container(
                node, part, account, container,
                marker=marker,
                conn_timeout=self.conn_timeout,
                response_timeout=self.node_timeout)
            # Bucket stats by status-class: key 2 means "2xx".
            self.stats_return_codes[2] = \
                self.stats_return_codes.get(2, 0) + 1
            self.logger.increment('return_codes.2')
        except ClientException as err:
            if self.logger.getEffectiveLevel() <= DEBUG:
                self.logger.exception(
                    _('Exception with %(ip)s:%(port)s/%(device)s'), node)
            # Python 2 integer division: e.g. 404 / 100 -> 4 ("4xx").
            self.stats_return_codes[err.http_status / 100] = \
                self.stats_return_codes.get(err.http_status / 100, 0) + 1
            self.logger.increment(
                'return_codes.%d' % (err.http_status / 100,))
        if not objects:
            break
        try:
            # Objects are reaped against the container's storage policy.
            policy_index = headers.get('X-Backend-Storage-Policy-Index',
                                       0)
            for obj in objects:
                if isinstance(obj['name'], unicode):
                    obj['name'] = obj['name'].encode('utf8')
                pool.spawn(self.reap_object, account, container, part,
                           nodes, obj['name'], policy_index)
            pool.waitall()
        except (Exception, Timeout):
            self.logger.exception(_('Exception with objects for container '
                                    '%(container)s for account %(account)s'
                                    ),
                                  {'container': container,
                                   'account': account})
        # Continue the listing after the last name seen.
        marker = objects[-1]['name']
        if marker == '':
            break
    # Ask every container replica to delete itself; each request also
    # names an account node so the account listing gets updated.
    successes = 0
    failures = 0
    for node in nodes:
        anode = account_nodes.pop()
        try:
            direct_delete_container(
                node, part, account, container,
                conn_timeout=self.conn_timeout,
                response_timeout=self.node_timeout,
                headers={'X-Account-Host': '%(ip)s:%(port)s' % anode,
                         'X-Account-Partition': str(account_partition),
                         'X-Account-Device': anode['device'],
                         'X-Account-Override-Deleted': 'yes'})
            successes += 1
            self.stats_return_codes[2] = \
                self.stats_return_codes.get(2, 0) + 1
            self.logger.increment('return_codes.2')
        except ClientException as err:
            if self.logger.getEffectiveLevel() <= DEBUG:
                self.logger.exception(
                    _('Exception with %(ip)s:%(port)s/%(device)s'), node)
            failures += 1
            self.logger.increment('containers_failures')
            self.stats_return_codes[err.http_status / 100] = \
                self.stats_return_codes.get(err.http_status / 100, 0) + 1
            self.logger.increment(
                'return_codes.%d' % (err.http_status / 100,))
    # Majority success counts as deleted; zero successes as remaining;
    # anything in between as possibly remaining.
    if successes > failures:
        self.stats_containers_deleted += 1
        self.logger.increment('containers_deleted')
    elif not successes:
        self.stats_containers_remaining += 1
        self.logger.increment('containers_remaining')
    else:
        self.stats_containers_possibly_remaining += 1
        self.logger.increment('containers_possibly_remaining')
def test_direct_delete_container(self):
    """Issuing a container DELETE sends the right verb and path."""
    with mocked_http_conn(200) as fake_conn:
        direct_client.direct_delete_container(
            self.node, self.part, self.account, self.container)
        self.assertEqual(fake_conn.path, self.container_path)
        self.assertEqual(fake_conn.method, 'DELETE')
def reap_container(self, account, account_partition, account_nodes,
                   container):
    """
    Deletes the data and the container itself for the given container.
    This will call :func:`reap_object` up to sqrt(self.concurrency)
    times concurrently for the objects in the container.

    If there is any exception while deleting a single object, the
    process will continue for any other objects in the container and
    the failed objects will be tried again the next time this function
    is called with the same parameters.

    If there is any exception while listing the objects for deletion,
    the process will stop (but will obviously be tried again the next
    time this function is called with the same parameters). This is a
    possibility since the listing comes from querying just the primary
    remote container server.

    Once all objects have been attempted to be deleted, the container
    itself will be attempted to be deleted by sending a delete request
    to all container nodes. The format of the delete request is such
    that each container server will update a corresponding account
    server, removing the container from the account's listing.

    This function returns nothing and should raise no exception but
    only update various self.stats_* values for what occurs.

    :param account: The name of the account for the container.
    :param account_partition: The partition for the account on the
                              account ring.
    :param account_nodes: The primary node dicts for the account.
    :param container: The name of the container to delete.

    * See also: :func:`swift.common.ring.Ring.get_nodes` for a
      description of the account node dicts.
    """
    # Copy so we can pop() one account node per container node below
    # without mutating the caller's list.
    account_nodes = list(account_nodes)
    part, nodes = self.get_container_ring().get_nodes(account, container)
    # Listings are fetched from a single primary (the last node).
    node = nodes[-1]
    pool = GreenPool(size=self.object_concurrency)
    marker = ''
    while True:
        objects = None
        try:
            headers, objects = direct_get_container(
                node, part, account, container,
                marker=marker,
                conn_timeout=self.conn_timeout,
                response_timeout=self.node_timeout)
            # Bucket stats by status-class: key 2 means "2xx".
            self.stats_return_codes[2] = \
                self.stats_return_codes.get(2, 0) + 1
            self.logger.increment('return_codes.2')
        except ClientException as err:
            if self.logger.getEffectiveLevel() <= DEBUG:
                self.logger.exception(
                    _('Exception with %(ip)s:%(port)s/%(device)s'), node)
            # Python 2 integer division: e.g. 404 / 100 -> 4 ("4xx").
            self.stats_return_codes[err.http_status / 100] = \
                self.stats_return_codes.get(err.http_status / 100, 0) + 1
            self.logger.increment('return_codes.%d' %
                                  (err.http_status / 100, ))
        if not objects:
            break
        try:
            # Objects are reaped against the container's storage
            # policy; an unknown index is logged but reaping proceeds
            # with the raw index value.
            policy_index = headers.get('X-Backend-Storage-Policy-Index',
                                       0)
            policy = POLICIES.get_by_index(policy_index)
            if not policy:
                self.logger.error(
                    'ERROR: invalid storage policy index: %r'
                    % policy_index)
            for obj in objects:
                if isinstance(obj['name'], unicode):
                    obj['name'] = obj['name'].encode('utf8')
                pool.spawn(self.reap_object, account, container, part,
                           nodes, obj['name'], policy_index)
            pool.waitall()
        except (Exception, Timeout):
            self.logger.exception(
                _('Exception with objects for container '
                  '%(container)s for account %(account)s'), {
                      'container': container,
                      'account': account
                  })
        # Continue the listing after the last name seen.
        marker = objects[-1]['name']
        if marker == '':
            break
    # Ask every container replica to delete itself; each request also
    # names an account node so the account listing gets updated.
    successes = 0
    failures = 0
    for node in nodes:
        anode = account_nodes.pop()
        try:
            direct_delete_container(node, part, account, container,
                                    conn_timeout=self.conn_timeout,
                                    response_timeout=self.node_timeout,
                                    headers={
                                        'X-Account-Host':
                                        '%(ip)s:%(port)s' % anode,
                                        'X-Account-Partition':
                                        str(account_partition),
                                        'X-Account-Device':
                                        anode['device'],
                                        'X-Account-Override-Deleted':
                                        'yes'
                                    })
            successes += 1
            self.stats_return_codes[2] = \
                self.stats_return_codes.get(2, 0) + 1
            self.logger.increment('return_codes.2')
        except ClientException as err:
            if self.logger.getEffectiveLevel() <= DEBUG:
                self.logger.exception(
                    _('Exception with %(ip)s:%(port)s/%(device)s'), node)
            failures += 1
            self.logger.increment('containers_failures')
            self.stats_return_codes[err.http_status / 100] = \
                self.stats_return_codes.get(err.http_status / 100, 0) + 1
            self.logger.increment('return_codes.%d' %
                                  (err.http_status / 100, ))
    # Majority success counts as deleted; zero successes as remaining;
    # anything in between as possibly remaining.
    if successes > failures:
        self.stats_containers_deleted += 1
        self.logger.increment('containers_deleted')
    elif not successes:
        self.stats_containers_remaining += 1
        self.logger.increment('containers_remaining')
    else:
        self.stats_containers_possibly_remaining += 1
        self.logger.increment('containers_possibly_remaining')
class AccountReaper(Daemon): """ Removes data from status=DELETED accounts. These are accounts that have been asked to be removed by the reseller via services remove_storage_account XMLRPC call. The account is not deleted immediately by the services call, but instead the account is simply marked for deletion by setting the status column in the account_stat table of the account database. This account reaper scans for such accounts and removes the data in the background. The background deletion process will occur on the primary account server for the account. :param server_conf: The [account-server] dictionary of the account server configuration file :param reaper_conf: The [account-reaper] dictionary of the account server configuration file See the etc/account-server.conf-sample for information on the possible configuration parameters. """ def __init__(self, conf): self.conf = conf self.logger = get_logger(conf, log_route='account-reaper') self.devices = conf.get('devices', '/srv/node') self.mount_check = conf.get('mount_check', 'true').lower() in \ ('true', 't', '1', 'on', 'yes', 'y') self.interval = int(conf.get('interval', 3600)) swift_dir = conf.get('swift_dir', '/etc/swift') self.account_ring_path = os.path.join(swift_dir, 'account.ring.gz') self.container_ring_path = os.path.join(swift_dir, 'container.ring.gz') self.object_ring_path = os.path.join(swift_dir, 'object.ring.gz') self.account_ring = None self.container_ring = None self.object_ring = None self.node_timeout = int(conf.get('node_timeout', 10)) self.conn_timeout = float(conf.get('conn_timeout', 0.5)) self.myips = whataremyips() self.concurrency = int(conf.get('concurrency', 25)) self.container_concurrency = self.object_concurrency = \ sqrt(self.concurrency) self.container_pool = GreenPool(size=self.container_concurrency) def get_account_ring(self): """ The account :class:`swift.common.ring.Ring` for the cluster. 
""" if not self.account_ring: self.logger.debug( _('Loading account ring from %s'), self.account_ring_path) self.account_ring = Ring(self.account_ring_path) return self.account_ring def get_container_ring(self): """ The container :class:`swift.common.ring.Ring` for the cluster. """ if not self.container_ring: self.logger.debug( _('Loading container ring from %s'), self.container_ring_path) self.container_ring = Ring(self.container_ring_path) return self.container_ring def get_object_ring(self): """ The object :class:`swift.common.ring.Ring` for the cluster. """ if not self.object_ring: self.logger.debug( _('Loading object ring from %s'), self.object_ring_path) self.object_ring = Ring(self.object_ring_path) return self.object_ring def run_forever(self, *args, **kwargs): """ Main entry point when running the reaper in its normal daemon mode. This repeatedly calls :func:`reap_once` no quicker than the configuration interval. """ self.logger.debug(_('Daemon started.')) sleep(random.random() * self.interval) while True: begin = time() self.run_once() elapsed = time() - begin if elapsed < self.interval: sleep(self.interval - elapsed) def run_once(self, *args, **kwargs): """ Main entry point when running the reaper in 'once' mode, where it will do a single pass over all accounts on the server. This is called repeatedly by :func:`run_forever`. This will call :func:`reap_device` once for each device on the server. """ self.logger.debug(_('Begin devices pass: %s'), self.devices) begin = time() for device in os.listdir(self.devices): if self.mount_check and \ not os.path.ismount(os.path.join(self.devices, device)): self.logger.debug( _('Skipping %s as it is not mounted'), device) continue self.reap_device(device) elapsed = time() - begin self.logger.info(_('Devices pass completed: %.02fs'), elapsed) def reap_device(self, device): """ Called once per pass for each device on the server. 
This will scan the accounts directory for the device, looking for partitions this device is the primary for, then looking for account databases that are marked status=DELETED and still have containers and calling :func:`reap_account`. Account databases marked status=DELETED that no longer have containers will eventually be permanently removed by the reclaim process within the account replicator (see :mod:`swift.db_replicator`). :param device: The device to look for accounts to be deleted. """ datadir = os.path.join(self.devices, device, DATADIR) if not os.path.exists(datadir): return for partition in os.listdir(datadir): partition_path = os.path.join(datadir, partition) if not partition.isdigit(): continue nodes = self.get_account_ring().get_part_nodes(int(partition)) if nodes[0]['ip'] not in self.myips or \ not os.path.isdir(partition_path): continue for suffix in os.listdir(partition_path): suffix_path = os.path.join(partition_path, suffix) if not os.path.isdir(suffix_path): continue for hsh in os.listdir(suffix_path): hsh_path = os.path.join(suffix_path, hsh) if not os.path.isdir(hsh_path): continue for fname in sorted(os.listdir(hsh_path), reverse=True): if fname.endswith('.ts'): break elif fname.endswith('.db'): broker = \ AccountBroker(os.path.join(hsh_path, fname)) if broker.is_status_deleted() and \ not broker.empty(): self.reap_account(broker, partition, nodes) def reap_account(self, broker, partition, nodes): """ Called once per pass for each account this server is the primary for and attempts to delete the data for the given account. The reaper will only delete one account at any given time. It will call :func:`reap_container` up to sqrt(self.concurrency) times concurrently while reaping the account. If there is any exception while deleting a single container, the process will continue for any other containers and the failed containers will be tried again the next time this function is called with the same parameters. 
If there is any exception while listing the containers for deletion, the process will stop (but will obviously be tried again the next time this function is called with the same parameters). This isn't likely since the listing comes from the local database. After the process completes (successfully or not) statistics about what was accomplished will be logged. This function returns nothing and should raise no exception but only update various self.stats_* values for what occurs. :param broker: The AccountBroker for the account to delete. :param partition: The partition in the account ring the account is on. :param nodes: The primary node dicts for the account to delete. .. seealso:: :class:`swift.common.db.AccountBroker` for the broker class. .. seealso:: :func:`swift.common.ring.Ring.get_nodes` for a description of the node dicts. """ begin = time() account = broker.get_info()['account'] self.logger.info(_('Beginning pass on account %s'), account) self.stats_return_codes = {} self.stats_containers_deleted = 0 self.stats_objects_deleted = 0 self.stats_containers_remaining = 0 self.stats_objects_remaining = 0 self.stats_containers_possibly_remaining = 0 self.stats_objects_possibly_remaining = 0 try: marker = '' while True: containers = \ list(broker.list_containers_iter(1000, marker, None, None, None)) if not containers: break try: for (container, _junk, _junk, _junk) in containers: self.container_pool.spawn(self.reap_container, account, partition, nodes, container) self.container_pool.waitall() except Exception: self.logger.exception( _('Exception with containers for account %s'), account) marker = containers[-1][0] log = 'Completed pass on account %s' % account except Exception: self.logger.exception( _('Exception with account %s'), account) log = _('Incomplete pass on account %s') % account if self.stats_containers_deleted: log += _(', %s containers deleted') % self.stats_containers_deleted if self.stats_objects_deleted: log += _(', %s objects deleted') % 
self.stats_objects_deleted if self.stats_containers_remaining: log += _(', %s containers remaining') % \ self.stats_containers_remaining if self.stats_objects_remaining: log += _(', %s objects remaining') % self.stats_objects_remaining if self.stats_containers_possibly_remaining: log += _(', %s containers possibly remaining') % \ self.stats_containers_possibly_remaining if self.stats_objects_possibly_remaining: log += _(', %s objects possibly remaining') % \ self.stats_objects_possibly_remaining if self.stats_return_codes: log += _(', return codes: ') for code in sorted(self.stats_return_codes.keys()): log += '%s %sxxs, ' % (self.stats_return_codes[code], code) log = log[:-2] log += _(', elapsed: %.02fs') % (time() - begin) self.logger.info(log) def reap_container(self, account, account_partition, account_nodes, container): """ Deletes the data and the container itself for the given container. This will call :func:`reap_object` up to sqrt(self.concurrency) times concurrently for the objects in the container. If there is any exception while deleting a single object, the process will continue for any other objects in the container and the failed objects will be tried again the next time this function is called with the same parameters. If there is any exception while listing the objects for deletion, the process will stop (but will obviously be tried again the next time this function is called with the same parameters). This is a possibility since the listing comes from querying just the primary remote container server. Once all objects have been attempted to be deleted, the container itself will be attempted to be deleted by sending a delete request to all container nodes. The format of the delete request is such that each container server will update a corresponding account server, removing the container from the account's listing. This function returns nothing and should raise no exception but only update various self.stats_* values for what occurs. 
:param account: The name of the account for the container. :param account_partition: The partition for the account on the account ring. :param account_nodes: The primary node dicts for the account. :param container: The name of the container to delete. * See also: :func:`swift.common.ring.Ring.get_nodes` for a description of the account node dicts. """ account_nodes = list(account_nodes) part, nodes = self.get_container_ring().get_nodes(account, container) node = nodes[-1] pool = GreenPool(size=self.object_concurrency) marker = '' while True: objects = None try: objects = direct_get_container(node, part, account, container, marker=marker, conn_timeout=self.conn_timeout, response_timeout=self.node_timeout)[1] self.stats_return_codes[2] = \ self.stats_return_codes.get(2, 0) + 1 except ClientException, err: if self.logger.getEffectiveLevel() <= DEBUG: self.logger.exception( _('Exception with %(ip)s:%(port)s/%(device)s'), node) self.stats_return_codes[err.http_status / 100] = \ self.stats_return_codes.get(err.http_status / 100, 0) + 1 if not objects: break try: for obj in objects: if isinstance(obj['name'], unicode): obj['name'] = obj['name'].encode('utf8') pool.spawn(self.reap_object, account, container, part, nodes, obj['name']) pool.waitall() except Exception: self.logger.exception(_('Exception with objects for container ' '%(container)s for account %(account)s'), {'container': container, 'account': account}) marker = objects[-1]['name'] successes = 0 failures = 0 for node in nodes: anode = account_nodes.pop() try: direct_delete_container(node, part, account, container, conn_timeout=self.conn_timeout, response_timeout=self.node_timeout, headers={'X-Account-Host': '%(ip)s:%(port)s' % anode, 'X-Account-Partition': str(account_partition), 'X-Account-Device': anode['device'], 'X-Account-Override-Deleted': 'yes'}) successes += 1 self.stats_return_codes[2] = \ self.stats_return_codes.get(2, 0) + 1 except ClientException, err: if self.logger.getEffectiveLevel() <= DEBUG: 
self.logger.exception( _('Exception with %(ip)s:%(port)s/%(device)s'), node) failures += 1 self.stats_return_codes[err.http_status / 100] = \ self.stats_return_codes.get(err.http_status / 100, 0) + 1