def can_be_deleted(self, storagedriver):
    """
    Checks whether a Storage Driver can be deleted
    """
    result = True
    storagerouter = storagedriver.storagerouter
    storagedrivers_left = len([sd for sd in storagerouter.storagedrivers if sd.guid != storagedriver.guid])
    pmachine = storagerouter.pmachine
    vmachines = VMachineList.get_customer_vmachines()
    vpools_guids = [vmachine.vpool_guid for vmachine in vmachines if vmachine.vpool_guid is not None]
    pmachine_guids = [vmachine.pmachine_guid for vmachine in vmachines]
    vpool = storagedriver.vpool
    # Note: the original guard read 'storagedrivers_left is False', which can never be True
    # because len() returns an int; the intended check appears to be that no other Storage
    # Drivers remain for this Storage Router.
    if storagedrivers_left == 0 and pmachine.guid in pmachine_guids and vpool.guid in vpools_guids:
        result = False
    if any(vdisk for vdisk in vpool.vdisks if vdisk.storagedriver_id == storagedriver.storagedriver_id):
        result = False
    return result
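The `is False` pitfall fixed above is worth spelling out: `len(...)` returns an `int`, and an `int` is never the same object as `False`, so the original guard could silently never fire. A minimal standalone reproduction, with no OVS objects involved:

# Standalone illustration of the "len(...) is False" pitfall fixed above.
remaining = len([])          # 0 -- an int, not a boolean
print(remaining is False)    # False: 'is' compares identity, and 0 is not the object False
print(remaining == 0)        # True: this is the check the guard actually needs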
def snapshot_all_vms():
    """
    Snapshots all VMachines
    """
    logger.info("[SSA] started")
    success = []
    fail = []
    machines = VMachineList.get_customer_vmachines()
    for machine in machines:
        try:
            VMachineController.snapshot(machineguid=machine.guid,
                                        label="",
                                        is_consistent=False,
                                        is_automatic=True)
            success.append(machine.guid)
        except Exception:  # One failing snapshot should not abort the whole run
            fail.append(machine.guid)
    logger.info("[SSA] Snapshot has been taken for {0} vMachines, {1} failed.".format(len(success), len(fail)))
def can_be_deleted(self, storagedriver):
    """
    Checks whether a Storage Driver can be deleted
    """
    result = True
    storagerouter = storagedriver.storagerouter
    pmachine = storagerouter.pmachine
    vmachines = VMachineList.get_customer_vmachines()
    vpools_guids = [vmachine.vpool_guid for vmachine in vmachines if vmachine.vpool_guid is not None]
    pmachine_guids = [vmachine.pmachine_guid for vmachine in vmachines]
    vpool = storagedriver.vpool
    if pmachine.guid in pmachine_guids and vpool.guid in vpools_guids:
        result = False
    if any(vdisk for vdisk in vpool.vdisks if vdisk.storagedriver_id == storagedriver.storagedriver_id):
        result = False
    return Response(result, status=status.HTTP_200_OK)
def snapshot_all_vms():
    """
    Snapshots all VMachines
    """
    logger.info('[SSA] started')
    success = []
    fail = []
    machines = VMachineList.get_customer_vmachines()
    for machine in machines:
        try:
            VMachineController.snapshot(machineguid=machine.guid,
                                        label='',
                                        is_consistent=False,
                                        is_automatic=True)
            success.append(machine.guid)
        except Exception:  # One failing snapshot should not abort the whole run
            fail.append(machine.guid)
    logger.info('[SSA] {0} vMachines were snapshotted, {1} failed.'.format(len(success), len(fail)))
def snapshot_all_vms():
    """
    Snapshots all VMachines
    """
    ScheduledTaskController._logger.info('[SSA] started')
    success = []
    fail = []
    machines = VMachineList.get_customer_vmachines()
    for machine in machines:
        try:
            VMachineController.snapshot(machineguid=machine.guid,
                                        label='',
                                        is_consistent=False,
                                        is_automatic=True,
                                        is_sticky=False)
            success.append(machine.guid)
        except Exception:  # One failing snapshot should not abort the whole run
            fail.append(machine.guid)
    ScheduledTaskController._logger.info('[SSA] Snapshot has been taken for {0} vMachines, {1} failed.'.format(len(success), len(fail)))
def gather_scrub_work():
    """
    Retrieve and execute scrub work
    :return: None
    """
    logger.info('Gather Scrub - Started')

    scrub_locations = {}
    for storage_driver in StorageDriverList.get_storagedrivers():
        for partition in storage_driver.partitions:
            if DiskPartition.ROLES.SCRUB == partition.role:
                logger.info('Gather Scrub - Storage Router {0:<15} has SCRUB partition at {1}'.format(storage_driver.storagerouter.ip, partition.path))
                if storage_driver.storagerouter not in scrub_locations:
                    try:
                        _ = SSHClient(storage_driver.storagerouter)
                        scrub_locations[storage_driver.storagerouter] = str(partition.path)
                    except UnableToConnectException:
                        logger.warning('Gather Scrub - Storage Router {0:<15} is not reachable'.format(storage_driver.storagerouter.ip))

    if len(scrub_locations) == 0:
        raise RuntimeError('No scrub locations found')

    vdisk_guids = set()
    for vmachine in VMachineList.get_customer_vmachines():
        for vdisk in vmachine.vdisks:
            if vdisk.info['object_type'] == 'BASE':
                vdisk_guids.add(vdisk.guid)
    for vdisk in VDiskList.get_without_vmachine():
        if vdisk.info['object_type'] == 'BASE':
            vdisk_guids.add(vdisk.guid)

    logger.info('Gather Scrub - Checking {0} volumes for scrub work'.format(len(vdisk_guids)))
    local_machineid = System.get_my_machine_id()
    local_storage_router = None
    local_scrub_location = None
    local_vdisks_to_scrub = []
    result_set = ResultSet([])
    storage_router_list = []

    for index, scrub_info in enumerate(scrub_locations.items()):
        start_index = index * len(vdisk_guids) / len(scrub_locations)
        end_index = (index + 1) * len(vdisk_guids) / len(scrub_locations)
        storage_router = scrub_info[0]
        vdisk_guids_to_scrub = list(vdisk_guids)[start_index:end_index]
        local = storage_router.machine_id == local_machineid
        logger.info('Gather Scrub - Storage Router {0:<15} ({1}) - Scrubbing {2} virtual disks'.format(storage_router.ip, 'local' if local is True else 'remote', len(vdisk_guids_to_scrub)))

        if local is True:
            local_storage_router = storage_router
            local_scrub_location = scrub_info[1]
            local_vdisks_to_scrub = vdisk_guids_to_scrub
        else:
            result_set.add(ScheduledTaskController._execute_scrub_work.s(scrub_location=scrub_info[1],
                                                                         vdisk_guids=vdisk_guids_to_scrub).apply_async(routing_key='sr.{0}'.format(storage_router.machine_id)))
            storage_router_list.append(storage_router)

    # Remote tasks have been launched, now start the local task and then wait for remote tasks to finish
    processed_guids = []
    if local_scrub_location is not None and len(local_vdisks_to_scrub) > 0:
        try:
            processed_guids = ScheduledTaskController._execute_scrub_work(scrub_location=local_scrub_location,
                                                                          vdisk_guids=local_vdisks_to_scrub)
        except Exception as ex:
            logger.error('Gather Scrub - Storage Router {0:<15} - Scrubbing failed with error:\n - {1}'.format(local_storage_router.ip, ex))

    all_results = result_set.join(propagate=False)  # Propagate False makes sure all jobs are waited for even when 1 or more jobs fail
    for index, result in enumerate(all_results):
        if isinstance(result, list):
            processed_guids.extend(result)
        else:
            logger.error('Gather Scrub - Storage Router {0:<15} - Scrubbing failed with error:\n - {1}'.format(storage_router_list[index].ip, result))

    if len(processed_guids) != len(vdisk_guids) or set(processed_guids).difference(vdisk_guids):
        raise RuntimeError('Scrubbing failed for 1 or more storagerouters')
    logger.info('Gather Scrub - Finished')
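The `start_index`/`end_index` arithmetic above splits the vDisk guids into one contiguous chunk per scrub location, with no guid assigned twice and none dropped. A standalone sketch of the same arithmetic (the guid and router names here are illustrative, not the OVS API; Python 2 integer division is assumed, as in the original):

# Illustrative chunking, mirroring the start_index/end_index arithmetic above.
vdisk_guids = ['guid-%d' % i for i in xrange(10)]        # hypothetical guids
scrub_locations = ['router-a', 'router-b', 'router-c']   # hypothetical routers

for index, location in enumerate(scrub_locations):
    start_index = index * len(vdisk_guids) / len(scrub_locations)
    end_index = (index + 1) * len(vdisk_guids) / len(scrub_locations)
    print('%s -> %s' % (location, vdisk_guids[start_index:end_index]))
# router-a gets 3 guids, router-b gets 3, router-c gets 4: consecutive slices
# share their boundaries, so every guid lands in exactly one chunk.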
def remove_storagedriver(storagedriver_guid):
    """
    Removes a Storage Driver (and, if it was the last Storage Driver for a vPool, the vPool is removed as well)
    """
    # Get objects & Make some checks
    storagedriver = StorageDriver(storagedriver_guid)
    storagerouter = storagedriver.storagerouter
    ip = storagerouter.ip
    pmachine = storagerouter.pmachine
    vmachines = VMachineList.get_customer_vmachines()
    pmachine_guids = [vm.pmachine_guid for vm in vmachines]
    vpools_guids = [vm.vpool_guid for vm in vmachines if vm.vpool_guid is not None]
    vpool = storagedriver.vpool
    if pmachine.guid in pmachine_guids and vpool.guid in vpools_guids:
        raise RuntimeError('There are still vMachines served from the given Storage Driver')
    if any(vdisk for vdisk in vpool.vdisks if vdisk.storagedriver_id == storagedriver.storagedriver_id):
        raise RuntimeError('There are still vDisks served from the given Storage Driver')

    services = ['volumedriver_{0}'.format(vpool.name), 'failovercache_{0}'.format(vpool.name)]
    storagedrivers_left = False

    # Stop services
    for current_storagedriver in vpool.storagedrivers:
        if current_storagedriver.guid != storagedriver_guid:
            storagedrivers_left = True
        client = SSHClient.load(current_storagedriver.storagerouter.ip)
        for service in services:
            System.exec_remote_python(client, """
from ovs.plugin.provider.service import Service
if Service.has_service('{0}'):
    Service.disable_service('{0}')
""".format(service))
            System.exec_remote_python(client, """
from ovs.plugin.provider.service import Service
if Service.has_service('{0}'):
    Service.stop_service('{0}')
""".format(service))

    # Unconfigure Cinder
    ovsdb = PersistentFactory.get_client()
    key = str('ovs_openstack_cinder_%s' % storagedriver.vpool_guid)
    if ovsdb.exists(key):
        cinder_password, cinder_user, tenant_name, controller_ip, _ = ovsdb.get(key)
        client = SSHClient.load(ip)
        System.exec_remote_python(client, """
from ovs.extensions.openstack.cinder import OpenStackCinder
osc = OpenStackCinder(cinder_password = '******', cinder_user = '******', tenant_name = '{2}', controller_ip = '{3}')
osc.unconfigure_vpool('{4}', '{5}', {6})
""".format(cinder_password, cinder_user, tenant_name, controller_ip, vpool.name, storagedriver.mountpoint, not storagedrivers_left))
        if not storagedrivers_left:
            ovsdb.delete(key)

    # KVM pool
    client = SSHClient.load(ip)
    if pmachine.hvtype == 'KVM':
        if vpool.name in client.run('virsh pool-list'):
            client.run('virsh pool-destroy {0}'.format(vpool.name))
        try:
            client.run('virsh pool-undefine {0}'.format(vpool.name))
        except Exception:
            pass  # Ignore undefine errors, since that can happen on re-entrance

    # Remove services
    client = SSHClient.load(ip)
    for service in services:
        System.exec_remote_python(client, """
from ovs.plugin.provider.service import Service
if Service.has_service('{0}'):
    Service.remove_service(domain='openvstorage', name='{0}')
""".format(service))

    configuration_dir = System.read_remote_config(client, 'ovs.core.cfgdir')
    voldrv_arakoon_cluster_id = str(System.read_remote_config(client, 'volumedriver.arakoon.clusterid'))
    voldrv_arakoon_cluster = ArakoonManagementEx().getCluster(voldrv_arakoon_cluster_id)
    voldrv_arakoon_client_config = voldrv_arakoon_cluster.getClientConfig()
    arakoon_node_configs = []
    for arakoon_node in voldrv_arakoon_client_config.keys():
        arakoon_node_configs.append(ArakoonNodeConfig(arakoon_node,
                                                      voldrv_arakoon_client_config[arakoon_node][0][0],
                                                      voldrv_arakoon_client_config[arakoon_node][1]))
    vrouter_clusterregistry = ClusterRegistry(str(vpool.guid), voldrv_arakoon_cluster_id, arakoon_node_configs)

    # Reconfigure volumedriver
    if storagedrivers_left:
        node_configs = []
        for current_storagedriver in vpool.storagedrivers:
            if current_storagedriver.guid != storagedriver_guid:
                node_configs.append(ClusterNodeConfig(str(current_storagedriver.storagedriver_id),
                                                      str(current_storagedriver.cluster_ip),
                                                      current_storagedriver.ports[0],
                                                      current_storagedriver.ports[1],
                                                      current_storagedriver.ports[2]))
        vrouter_clusterregistry.set_node_configs(node_configs)
    else:
        try:
            storagedriver_client = LocalStorageRouterClient('{0}/voldrv_vpools/{1}.json'.format(configuration_dir, vpool.name))
            storagedriver_client.destroy_filesystem()
            vrouter_clusterregistry.erase_node_configs()
        except RuntimeError as ex:
            print('Could not destroy filesystem or erase node configs due to error: {}'.format(ex))

    # Cleanup directories
    client = SSHClient.load(ip)
    client.run('rm -rf {}/read1_{}'.format(storagedriver.mountpoint_readcache1, vpool.name))
    if storagedriver.mountpoint_readcache2:
        client.run('rm -rf {}/read2_{}'.format(storagedriver.mountpoint_readcache2, vpool.name))
    client.run('rm -rf {}/sco_{}'.format(storagedriver.mountpoint_writecache, vpool.name))
    client.run('rm -rf {}/foc_{}'.format(storagedriver.mountpoint_foc, vpool.name))
    client.run('rm -rf {}/fd_{}'.format(storagedriver.mountpoint_writecache, vpool.name))
    client.run('rm -rf {}/metadata_{}'.format(storagedriver.mountpoint_md, vpool.name))
    client.run('rm -rf {}/tlogs_{}'.format(storagedriver.mountpoint_md, vpool.name))
    client.run('rm -rf /var/rsp/{}'.format(vpool.name))

    # Remove files
    client.run('rm -f {0}/voldrv_vpools/{1}.json'.format(configuration_dir, vpool.name))

    # Remove top directories
    client.run('if [ -d {0} ] && [ ! "$(ls -A {0})" ]; then rmdir {0}; fi'.format(storagedriver.mountpoint_readcache1))
    if storagedriver.mountpoint_readcache2:
        client.run('if [ -d {0} ] && [ ! "$(ls -A {0})" ]; then rmdir {0}; fi'.format(storagedriver.mountpoint_readcache2))
    client.run('if [ -d {0} ] && [ ! "$(ls -A {0})" ]; then rmdir {0}; fi'.format(storagedriver.mountpoint_writecache))
    client.run('if [ -d {0} ] && [ ! "$(ls -A {0})" ]; then rmdir {0}; fi'.format(storagedriver.mountpoint_foc))
    client.run('if [ -d {0} ] && [ ! "$(ls -A {0})" ]; then rmdir {0}; fi'.format(storagedriver.mountpoint_md))
    client.run('if [ -d {0} ] && [ ! "$(ls -A {0})" ]; then rmdir {0}; fi'.format(storagedriver.mountpoint))

    # First model cleanup
    storagedriver.delete(abandon=True)  # Detach from the log entries

    if storagedrivers_left:
        # Restart leftover services
        for current_storagedriver in vpool.storagedrivers:
            if current_storagedriver.guid != storagedriver_guid:
                client = SSHClient.load(current_storagedriver.storagerouter.ip)
                for service in services:
                    System.exec_remote_python(client, """
from ovs.plugin.provider.service import Service
if Service.has_service('{0}'):
    Service.enable_service('{0}')
""".format(service))
                    System.exec_remote_python(client, """
from ovs.plugin.provider.service import Service
if Service.has_service('{0}'):
    Service.start_service('{0}')
""".format(service))
    else:
        # Final model cleanup
        vpool.delete()
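The shell guard used above, `if [ -d {0} ] && [ ! "$(ls -A {0})" ]; then rmdir {0}; fi`, removes a mountpoint directory only when it exists and is empty. For comparison, the same idiom expressed in plain Python (a sketch, not part of the OVS code):

import os
import errno

def rmdir_if_empty(path):
    # os.rmdir only succeeds on empty directories, so the emptiness check is
    # implicit; missing and non-empty directories are ignored, matching the
    # '[ -d ... ] && [ ! "$(ls -A ...)" ]' shell guard.
    try:
        os.rmdir(path)
    except OSError as ex:
        if ex.errno not in (errno.ENOENT, errno.ENOTEMPTY):
            raise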
def gather_scrub_work():
    """
    Retrieve and execute scrub work
    :return: None
    """
    ScheduledTaskController._logger.info('Gather Scrub - Started')

    scrub_locations = {}
    for storage_driver in StorageDriverList.get_storagedrivers():
        for partition in storage_driver.partitions:
            if DiskPartition.ROLES.SCRUB == partition.role:
                ScheduledTaskController._logger.info('Gather Scrub - Storage Router {0:<15} has SCRUB partition at {1}'.format(storage_driver.storagerouter.ip, partition.path))
                if storage_driver.storagerouter not in scrub_locations:
                    try:
                        sshclient = SSHClient(storage_driver.storagerouter)
                        # Use ServiceManager(sshclient) to make sure ovs-workers are actually running
                        if ServiceManager.get_service_status('workers', sshclient) is False:
                            ScheduledTaskController._logger.warning('Gather Scrub - Storage Router {0:<15} - workers are not running'.format(storage_driver.storagerouter.ip))
                        else:
                            scrub_locations[storage_driver.storagerouter] = str(partition.path)
                    except UnableToConnectException:
                        ScheduledTaskController._logger.warning('Gather Scrub - Storage Router {0:<15} is not reachable'.format(storage_driver.storagerouter.ip))

    if len(scrub_locations) == 0:
        raise RuntimeError('No scrub locations found')

    vdisk_guids = set()
    for vmachine in VMachineList.get_customer_vmachines():
        for vdisk in vmachine.vdisks:
            if vdisk.info['object_type'] == 'BASE':
                vdisk_guids.add(vdisk.guid)
    for vdisk in VDiskList.get_without_vmachine():
        if vdisk.info['object_type'] == 'BASE':
            vdisk_guids.add(vdisk.guid)

    if len(vdisk_guids) == 0:
        ScheduledTaskController._logger.info('Gather Scrub - No scrub work needed')
        return

    ScheduledTaskController._logger.info('Gather Scrub - Checking {0} volumes for scrub work'.format(len(vdisk_guids)))
    local_machineid = System.get_my_machine_id()
    local_storage_router = None
    local_scrub_location = None
    local_vdisks_to_scrub = []
    result_set = {}
    storage_router_list = []
    scrub_map = {}

    for index, scrub_info in enumerate(scrub_locations.items()):
        start_index = index * len(vdisk_guids) / len(scrub_locations)
        end_index = (index + 1) * len(vdisk_guids) / len(scrub_locations)
        storage_router = scrub_info[0]
        vdisk_guids_to_scrub = list(vdisk_guids)[start_index:end_index]
        local = storage_router.machine_id == local_machineid
        ScheduledTaskController._logger.info('Gather Scrub - Storage Router {0:<15} ({1}) - Scrubbing {2} virtual disks'.format(storage_router.ip, 'local' if local is True else 'remote', len(vdisk_guids_to_scrub)))

        if local is True:
            local_storage_router = storage_router
            local_scrub_location = scrub_info[1]
            local_vdisks_to_scrub = vdisk_guids_to_scrub
        else:
            result_set[storage_router.ip] = ScheduledTaskController._execute_scrub_work.s(scrub_location=scrub_info[1],
                                                                                          vdisk_guids=vdisk_guids_to_scrub).apply_async(routing_key='sr.{0}'.format(storage_router.machine_id))
            storage_router_list.append(storage_router)
            scrub_map[storage_router.ip] = vdisk_guids_to_scrub

    # Remote tasks have been launched, now start the local task and then wait for remote tasks to finish
    processed_guids = []
    if local_scrub_location is not None and len(local_vdisks_to_scrub) > 0:
        try:
            processed_guids = ScheduledTaskController._execute_scrub_work(scrub_location=local_scrub_location,
                                                                          vdisk_guids=local_vdisks_to_scrub)
        except Exception as ex:
            ScheduledTaskController._logger.error('Gather Scrub - Storage Router {0:<15} - Scrubbing failed with error:\n - {1}'.format(local_storage_router.ip, ex))

    all_results, failed_nodes = CeleryToolbox.manage_running_tasks(result_set, timesleep=60)  # Check every 60 seconds if tasks are still running
    for ip, result in all_results.iteritems():
        if isinstance(result, list):
            processed_guids.extend(result)
        else:
            ScheduledTaskController._logger.error('Gather Scrub - Storage Router {0:<15} - Scrubbing failed with error:\n - {1}'.format(ip, result))

    result_set = {}
    for failed_node in failed_nodes:
        ScheduledTaskController._logger.warning('Scrubbing failed on node {0}. Will reschedule on another node.'.format(failed_node))
        vdisk_guids_to_scrub = scrub_map[failed_node]
        rescheduled_work = False
        for storage_router, scrub_location in scrub_locations.items():
            if storage_router.ip not in failed_nodes:
                if storage_router.machine_id != local_machineid:
                    ScheduledTaskController._logger.info('Rescheduled scrub work from node {0} to node {1}.'.format(failed_node, storage_router.ip))
                    result_set[storage_router.ip] = ScheduledTaskController._execute_scrub_work.s(scrub_location=scrub_location,
                                                                                                  vdisk_guids=vdisk_guids_to_scrub).apply_async(routing_key='sr.{0}'.format(storage_router.machine_id))
                    storage_router_list.append(storage_router)
                    rescheduled_work = True
                    break
        if rescheduled_work is False:
            if local_scrub_location is not None:
                try:
                    processed_guids.extend(ScheduledTaskController._execute_scrub_work(scrub_location=local_scrub_location,
                                                                                       vdisk_guids=vdisk_guids_to_scrub))
                except Exception as ex:
                    ScheduledTaskController._logger.error('Gather Scrub - Storage Router Local - Scrubbing failed with error:\n - {0}'.format(ex))
            else:
                ScheduledTaskController._logger.warning('No nodes left to reschedule work from node {0}'.format(failed_node))

    if len(result_set) > 0:
        all_results2, failed_nodes = CeleryToolbox.manage_running_tasks(result_set, timesleep=60)  # Check every 60 seconds if tasks are still running
        for ip, result in all_results2.iteritems():
            if isinstance(result, list):
                processed_guids.extend(result)
            else:
                ScheduledTaskController._logger.error('Gather Scrub - Storage Router {0:<15} - Scrubbing failed with error:\n - {1}'.format(ip, result))

    if len(set(processed_guids)) != len(vdisk_guids) or set(processed_guids).difference(vdisk_guids):
        raise RuntimeError('Scrubbing failed for 1 or more storagerouters')
    ScheduledTaskController._logger.info('Gather Scrub - Finished')
def deletescrubsnapshots(timestamp=None):
    """
    Delete snapshots & scrubbing policy

    Implemented delete snapshot policy:
    < 1d | 1d bucket | 1 | best of bucket   | 1d
    < 1w | 1d bucket | 6 | oldest of bucket | 7d = 1w
    < 1m | 1w bucket | 3 | oldest of bucket | 4w = 1m
    > 1m | delete
    """
    logger.info('Delete snapshots started')

    day = 60 * 60 * 24
    week = day * 7

    # Calculate bucket structure
    if timestamp is None:
        timestamp = time.time()
    offset = int(mktime(datetime.fromtimestamp(timestamp).date().timetuple())) - day
    buckets = []
    # Buckets first 7 days: [0-1[, [1-2[, [2-3[, [3-4[, [4-5[, [5-6[, [6-7[
    for i in xrange(0, 7):
        buckets.append({'start': offset - (day * i),
                        'end': offset - (day * (i + 1)),
                        'type': '1d',
                        'snapshots': []})
    # Week buckets next 3 weeks: [7-14[, [14-21[, [21-28[
    for i in xrange(1, 4):
        buckets.append({'start': offset - (week * i),
                        'end': offset - (week * (i + 1)),
                        'type': '1w',
                        'snapshots': []})
    buckets.append({'start': offset - (week * 4), 'end': 0, 'type': 'rest', 'snapshots': []})

    # Place all snapshots in bucket_chains
    bucket_chains = []
    for vmachine in VMachineList.get_customer_vmachines():
        if any(vd.info['object_type'] in ['BASE'] for vd in vmachine.vdisks):
            bucket_chain = copy.deepcopy(buckets)
            for snapshot in vmachine.snapshots:
                timestamp = int(snapshot['timestamp'])
                for bucket in bucket_chain:
                    if bucket['start'] >= timestamp > bucket['end']:
                        for diskguid, snapshotguid in snapshot['snapshots'].iteritems():
                            bucket['snapshots'].append({'timestamp': timestamp,
                                                        'snapshotid': snapshotguid,
                                                        'diskguid': diskguid,
                                                        'is_consistent': snapshot['is_consistent']})
            bucket_chains.append(bucket_chain)
    for vdisk in VDiskList.get_without_vmachine():
        if vdisk.info['object_type'] in ['BASE']:
            bucket_chain = copy.deepcopy(buckets)
            for snapshot in vdisk.snapshots:
                timestamp = int(snapshot['timestamp'])
                for bucket in bucket_chain:
                    if bucket['start'] >= timestamp > bucket['end']:
                        bucket['snapshots'].append({'timestamp': timestamp,
                                                    'snapshotid': snapshot['guid'],
                                                    'diskguid': vdisk.guid,
                                                    'is_consistent': snapshot['is_consistent']})
            bucket_chains.append(bucket_chain)

    # Clean out the snapshot bucket_chains, we delete the snapshots we want to keep
    # And we'll remove all snapshots that remain in the buckets
    for bucket_chain in bucket_chains:
        first = True
        for bucket in bucket_chain:
            if first is True:
                best = None
                for snapshot in bucket['snapshots']:
                    if best is None:
                        best = snapshot
                    # Consistent is better than inconsistent
                    elif snapshot['is_consistent'] and not best['is_consistent']:
                        best = snapshot
                    # Newer (larger timestamp) is better than older snapshots
                    elif snapshot['is_consistent'] == best['is_consistent'] and \
                            snapshot['timestamp'] > best['timestamp']:
                        best = snapshot
                bucket['snapshots'] = [s for s in bucket['snapshots'] if s['timestamp'] != best['timestamp']]
                first = False
            elif bucket['end'] > 0:
                oldest = None
                for snapshot in bucket['snapshots']:
                    if oldest is None:
                        oldest = snapshot
                    # Older (smaller timestamp) is the one we want to keep
                    elif snapshot['timestamp'] < oldest['timestamp']:
                        oldest = snapshot
                bucket['snapshots'] = [s for s in bucket['snapshots'] if s['timestamp'] != oldest['timestamp']]

    # Delete obsolete snapshots
    for bucket_chain in bucket_chains:
        for bucket in bucket_chain:
            for snapshot in bucket['snapshots']:
                VDiskController.delete_snapshot(diskguid=snapshot['diskguid'],
                                                snapshotid=snapshot['snapshotid'])
    logger.info('Delete snapshots finished')

    logger.info('Scrubbing started')
    vdisks = []
    for vmachine in VMachineList.get_customer_vmachines():
        for vdisk in vmachine.vdisks:
            if vdisk.info['object_type'] in ['BASE']:
                vdisks.append(vdisk)
    for vdisk in VDiskList.get_without_vmachine():
        if vdisk.info['object_type'] in ['BASE']:
            vdisks.append(vdisk)

    total = 0
    failed = 0
    for vdisk in vdisks:
        work_units = vdisk.storagedriver_client.get_scrubbing_workunits(str(vdisk.volume_id))
        for work_unit in work_units:
            try:
                total += 1
                scrubbing_result = _storagedriver_scrubber.scrub(work_unit, vdisk.vpool.mountpoint_temp)
                vdisk.storagedriver_client.apply_scrubbing_result(scrubbing_result)
            except Exception:  # Count the failure and keep scrubbing the remaining work units
                failed += 1
                logger.info('Failed scrubbing work unit for volume {}'.format(vdisk.volume_id))
    logger.info('Scrubbing finished. {} out of {} items failed.'.format(failed, total))
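To make the bucket layout from the policy docstring concrete, the snippet below reproduces the `offset`/`day`/`week` arithmetic for a fixed example timestamp and prints each bucket's time window (standalone, no OVS imports; the chosen date is arbitrary):

from datetime import datetime
from time import mktime

day = 60 * 60 * 24
week = day * 7
timestamp = mktime(datetime(2015, 6, 15, 14, 30).timetuple())  # arbitrary example moment
offset = int(mktime(datetime.fromtimestamp(timestamp).date().timetuple())) - day

buckets = []
for i in xrange(0, 7):                        # seven 1-day buckets
    buckets.append(('1d', offset - day * i, offset - day * (i + 1)))
for i in xrange(1, 4):                        # three 1-week buckets
    buckets.append(('1w', offset - week * i, offset - week * (i + 1)))
buckets.append(('rest', offset - week * 4, 0))

for btype, start, end in buckets:
    print('%4s : %s >= t > %s' % (btype, datetime.fromtimestamp(start), datetime.fromtimestamp(end)))
# The final 'rest' bucket ends at 0, so every snapshot older than 4 weeks
# falls into it and is deleted by the policy.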
def deletescrubsnapshots(timestamp=None):
    """
    Delete snapshots & scrubbing policy

    Implemented delete snapshot policy:
    < 1d | 1d bucket | 1 | best of bucket   | 1d
    < 1w | 1d bucket | 6 | oldest of bucket | 7d = 1w
    < 1m | 1w bucket | 3 | oldest of bucket | 4w = 1m
    > 1m | delete
    """
    logger.info('Delete snapshots started')

    day = 60 * 60 * 24
    week = day * 7

    # Calculate bucket structure
    if timestamp is None:
        timestamp = time.time()
    offset = int(mktime(datetime.fromtimestamp(timestamp).date().timetuple())) - day
    buckets = []
    # Buckets first 7 days: [0-1[, [1-2[, [2-3[, [3-4[, [4-5[, [5-6[, [6-7[
    for i in xrange(0, 7):
        buckets.append({'start': offset - (day * i),
                        'end': offset - (day * (i + 1)),
                        'type': '1d',
                        'snapshots': []})
    # Week buckets next 3 weeks: [7-14[, [14-21[, [21-28[
    for i in xrange(1, 4):
        buckets.append({'start': offset - (week * i),
                        'end': offset - (week * (i + 1)),
                        'type': '1w',
                        'snapshots': []})
    buckets.append({'start': offset - (week * 4), 'end': 0, 'type': 'rest', 'snapshots': []})

    # Place all snapshots in bucket_chains
    bucket_chains = []
    for vmachine in VMachineList.get_customer_vmachines():
        if any(vd.info['object_type'] in ['BASE'] for vd in vmachine.vdisks):
            bucket_chain = copy.deepcopy(buckets)
            for snapshot in vmachine.snapshots:
                timestamp = int(snapshot['timestamp'])
                for bucket in bucket_chain:
                    if bucket['start'] >= timestamp > bucket['end']:
                        for diskguid, snapshotguid in snapshot['snapshots'].iteritems():
                            bucket['snapshots'].append({'timestamp': timestamp,
                                                        'snapshotid': snapshotguid,
                                                        'diskguid': diskguid,
                                                        'is_consistent': snapshot['is_consistent']})
            bucket_chains.append(bucket_chain)
    for vdisk in VDiskList.get_without_vmachine():
        if vdisk.info['object_type'] in ['BASE']:
            bucket_chain = copy.deepcopy(buckets)
            for snapshot in vdisk.snapshots:
                timestamp = int(snapshot['timestamp'])
                for bucket in bucket_chain:
                    if bucket['start'] >= timestamp > bucket['end']:
                        bucket['snapshots'].append({'timestamp': timestamp,
                                                    'snapshotid': snapshot['guid'],
                                                    'diskguid': vdisk.guid,
                                                    'is_consistent': snapshot['is_consistent']})
            bucket_chains.append(bucket_chain)

    # Clean out the snapshot bucket_chains, we delete the snapshots we want to keep
    # And we'll remove all snapshots that remain in the buckets
    for bucket_chain in bucket_chains:
        first = True
        for bucket in bucket_chain:
            if first is True:
                best = None
                for snapshot in bucket['snapshots']:
                    if best is None:
                        best = snapshot
                    # Consistent is better than inconsistent
                    elif snapshot['is_consistent'] and not best['is_consistent']:
                        best = snapshot
                    # Newer (larger timestamp) is better than older snapshots
                    elif snapshot['is_consistent'] == best['is_consistent'] and \
                            snapshot['timestamp'] > best['timestamp']:
                        best = snapshot
                bucket['snapshots'] = [s for s in bucket['snapshots'] if s['timestamp'] != best['timestamp']]
                first = False
            elif bucket['end'] > 0:
                oldest = None
                for snapshot in bucket['snapshots']:
                    if oldest is None:
                        oldest = snapshot
                    # Older (smaller timestamp) is the one we want to keep
                    elif snapshot['timestamp'] < oldest['timestamp']:
                        oldest = snapshot
                bucket['snapshots'] = [s for s in bucket['snapshots'] if s['timestamp'] != oldest['timestamp']]

    # Delete obsolete snapshots
    for bucket_chain in bucket_chains:
        for bucket in bucket_chain:
            for snapshot in bucket['snapshots']:
                VDiskController.delete_snapshot(diskguid=snapshot['diskguid'],
                                                snapshotid=snapshot['snapshotid'])
    logger.info('Delete snapshots finished')

    logger.info('Scrubbing started')
    vdisks = []
    for vmachine in VMachineList.get_customer_vmachines():
        for vdisk in vmachine.vdisks:
            if vdisk.info['object_type'] in ['BASE'] and len(vdisk.child_vdisks) == 0:
                vdisks.append(vdisk)
    for vdisk in VDiskList.get_without_vmachine():
        if vdisk.info['object_type'] in ['BASE'] and len(vdisk.child_vdisks) == 0:
            vdisks.append(vdisk)

    total = 0
    failed = 0
    skipped = 0
    storagedrivers = {}
    for vdisk in vdisks:
        try:
            total += 1
            # Load the vDisk's StorageDriver
            vdisk.invalidate_dynamics(['info', 'storagedriver_id'])
            if vdisk.storagedriver_id not in storagedrivers:
                storagedrivers[vdisk.storagedriver_id] = StorageDriverList.get_by_storagedriver_id(vdisk.storagedriver_id)
            storagedriver = storagedrivers[vdisk.storagedriver_id]
            # Load the vDisk's MDS configuration
            vdisk.invalidate_dynamics(['info'])
            configs = vdisk.info['metadata_backend_config']
            if len(configs) == 0:
                raise RuntimeError('Could not load MDS configuration')
            if configs[0]['ip'] != storagedriver.storagerouter.ip:
                # The MDS master is not local. Trigger an MDS handover and try again
                logger.debug('MDS for volume {0} is not local. Trigger handover'.format(vdisk.volume_id))
                MDSServiceController.ensure_safety(vdisk)
                vdisk.invalidate_dynamics(['info'])
                configs = vdisk.info['metadata_backend_config']
                if len(configs) == 0:
                    raise RuntimeError('Could not load MDS configuration')
                if configs[0]['ip'] != storagedriver.storagerouter.ip:
                    skipped += 1
                    logger.info('Skipping scrubbing work unit for volume {0}: MDS master is not local'.format(vdisk.volume_id))
                    continue
            work_units = vdisk.storagedriver_client.get_scrubbing_workunits(str(vdisk.volume_id))
            for work_unit in work_units:
                scrubbing_result = _storagedriver_scrubber.scrub(work_unit, str(storagedriver.mountpoint_temp))
                vdisk.storagedriver_client.apply_scrubbing_result(scrubbing_result)
        except Exception as ex:
            failed += 1
            logger.info('Failed scrubbing work unit for volume {0}: {1}'.format(vdisk.volume_id, ex))
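The MDS check above follows a "verify, repair once, re-verify" shape: read the configuration, trigger a handover if the master is remote, then re-read and skip the volume if it is still remote. A generic standalone sketch of that shape; all helper names are hypothetical, not part of the OVS API:

def ensure_local_or_skip(read_config, repair, is_local):
    # Hypothetical helpers: read_config() returns the current state,
    # repair() attempts to move the master locally, and is_local(state)
    # tells whether the master ended up on this node.
    state = read_config()
    if not is_local(state):
        repair()             # one repair attempt only
        state = read_config()
        if not is_local(state):
            return None      # still remote: caller should skip this volume
    return state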
def delete_snapshots(timestamp=None):
    """
    Delete snapshots & scrubbing policy

    Implemented delete snapshot policy:
    < 1d | 1d bucket | 1 | best of bucket   | 1d
    < 1w | 1d bucket | 6 | oldest of bucket | 7d = 1w
    < 1m | 1w bucket | 3 | oldest of bucket | 4w = 1m
    > 1m | delete

    :param timestamp: Timestamp to determine whether snapshots should be kept or not, if none provided, current time will be used
    """
    logger.info("Delete snapshots started")

    day = timedelta(1)
    week = day * 7

    def make_timestamp(offset):
        return int(mktime((base - offset).timetuple()))

    # Calculate bucket structure
    if timestamp is None:
        timestamp = time.time()
    base = datetime.fromtimestamp(timestamp).date() - day
    buckets = []
    # Buckets first 7 days: [0-1[, [1-2[, [2-3[, [3-4[, [4-5[, [5-6[, [6-7[
    for i in xrange(0, 7):
        buckets.append({"start": make_timestamp(day * i),
                        "end": make_timestamp(day * (i + 1)),
                        "type": "1d",
                        "snapshots": []})
    # Week buckets next 3 weeks: [7-14[, [14-21[, [21-28[
    for i in xrange(1, 4):
        buckets.append({"start": make_timestamp(week * i),
                        "end": make_timestamp(week * (i + 1)),
                        "type": "1w",
                        "snapshots": []})
    buckets.append({"start": make_timestamp(week * 4), "end": 0, "type": "rest", "snapshots": []})

    # Place all snapshots in bucket_chains
    bucket_chains = []
    for vmachine in VMachineList.get_customer_vmachines():
        if any(vd.info["object_type"] in ["BASE"] for vd in vmachine.vdisks):
            bucket_chain = copy.deepcopy(buckets)
            for snapshot in vmachine.snapshots:
                timestamp = int(snapshot["timestamp"])
                for bucket in bucket_chain:
                    if bucket["start"] >= timestamp > bucket["end"]:
                        for diskguid, snapshotguid in snapshot["snapshots"].iteritems():
                            bucket["snapshots"].append({"timestamp": timestamp,
                                                        "snapshotid": snapshotguid,
                                                        "diskguid": diskguid,
                                                        "is_consistent": snapshot["is_consistent"]})
            bucket_chains.append(bucket_chain)
    for vdisk in VDiskList.get_without_vmachine():
        if vdisk.info["object_type"] in ["BASE"]:
            bucket_chain = copy.deepcopy(buckets)
            for snapshot in vdisk.snapshots:
                timestamp = int(snapshot["timestamp"])
                for bucket in bucket_chain:
                    if bucket["start"] >= timestamp > bucket["end"]:
                        bucket["snapshots"].append({"timestamp": timestamp,
                                                    "snapshotid": snapshot["guid"],
                                                    "diskguid": vdisk.guid,
                                                    "is_consistent": snapshot["is_consistent"]})
            bucket_chains.append(bucket_chain)

    # Clean out the snapshot bucket_chains, we delete the snapshots we want to keep
    # And we'll remove all snapshots that remain in the buckets
    for bucket_chain in bucket_chains:
        first = True
        for bucket in bucket_chain:
            if first is True:
                best = None
                for snapshot in bucket["snapshots"]:
                    if best is None:
                        best = snapshot
                    # Consistent is better than inconsistent
                    elif snapshot["is_consistent"] and not best["is_consistent"]:
                        best = snapshot
                    # Newer (larger timestamp) is better than older snapshots
                    elif snapshot["is_consistent"] == best["is_consistent"] and snapshot["timestamp"] > best["timestamp"]:
                        best = snapshot
                bucket["snapshots"] = [s for s in bucket["snapshots"] if s["timestamp"] != best["timestamp"]]
                first = False
            elif bucket["end"] > 0:
                oldest = None
                for snapshot in bucket["snapshots"]:
                    if oldest is None:
                        oldest = snapshot
                    # Older (smaller timestamp) is the one we want to keep
                    elif snapshot["timestamp"] < oldest["timestamp"]:
                        oldest = snapshot
                bucket["snapshots"] = [s for s in bucket["snapshots"] if s["timestamp"] != oldest["timestamp"]]

    # Delete obsolete snapshots
    for bucket_chain in bucket_chains:
        for bucket in bucket_chain:
            for snapshot in bucket["snapshots"]:
                VDiskController.delete_snapshot(diskguid=snapshot["diskguid"],
                                                snapshotid=snapshot["snapshotid"])
    logger.info("Delete snapshots finished")
def gather_scrub_work():
    logger.info("Divide scrubbing work among allowed Storage Routers")

    scrub_locations = {}
    for storage_driver in StorageDriverList.get_storagedrivers():
        for partition in storage_driver.partitions:
            if DiskPartition.ROLES.SCRUB == partition.role:
                logger.info("Scrub partition found on Storage Router {0}: {1}".format(storage_driver.name, partition.folder))
                if storage_driver.storagerouter not in scrub_locations:
                    try:
                        _ = SSHClient(storage_driver.storagerouter.ip)
                        scrub_locations[storage_driver.storagerouter] = str(partition.path)
                    except UnableToConnectException:
                        logger.warning("StorageRouter {0} is not reachable".format(storage_driver.storagerouter.ip))

    if len(scrub_locations) == 0:
        raise RuntimeError("No scrub locations found")

    vdisk_guids = set()
    for vmachine in VMachineList.get_customer_vmachines():
        for vdisk in vmachine.vdisks:
            if vdisk.info["object_type"] in ["BASE"] and len(vdisk.child_vdisks) == 0:
                vdisk_guids.add(vdisk.guid)
    for vdisk in VDiskList.get_without_vmachine():
        if vdisk.info["object_type"] in ["BASE"] and len(vdisk.child_vdisks) == 0:
            vdisk_guids.add(vdisk.guid)

    logger.info("Found {0} virtual disks which need to be checked for scrub work".format(len(vdisk_guids)))
    local_machineid = System.get_my_machine_id()
    local_scrub_location = None
    local_vdisks_to_scrub = []
    result_set = ResultSet([])
    storage_router_list = []

    for index, scrub_info in enumerate(scrub_locations.items()):
        start_index = index * len(vdisk_guids) / len(scrub_locations)
        end_index = (index + 1) * len(vdisk_guids) / len(scrub_locations)
        storage_router = scrub_info[0]
        vdisk_guids_to_scrub = list(vdisk_guids)[start_index:end_index]
        local = storage_router.machine_id == local_machineid
        logger.info("Executing scrub work on {0} Storage Router {1} for {2} virtual disks".format("local" if local is True else "remote", storage_router.name, len(vdisk_guids_to_scrub)))

        if local is True:
            local_scrub_location = scrub_info[1]
            local_vdisks_to_scrub = vdisk_guids_to_scrub
        else:
            result_set.add(ScheduledTaskController._execute_scrub_work.s(scrub_location=scrub_info[1],
                                                                         vdisk_guids=vdisk_guids_to_scrub).apply_async(routing_key="sr.{0}".format(storage_router.machine_id)))
            storage_router_list.append(storage_router)
            logger.info("Launched scrub task on Storage Router {0}".format(storage_router.name))

    # Remote tasks have been launched, now start the local task and then wait for remote tasks to finish
    if local_scrub_location is not None and len(local_vdisks_to_scrub) > 0:
        ScheduledTaskController._execute_scrub_work(scrub_location=local_scrub_location,
                                                    vdisk_guids=local_vdisks_to_scrub)
    all_results = result_set.join(propagate=False)  # Propagate False makes sure all jobs are waited for even when 1 or more jobs fail
    for index, result in enumerate(all_results):
        if result is not None:
            logger.error("Scrubbing failed on Storage Router {0} with error {1}".format(storage_router_list[index].name, result))
def delete_snapshots(timestamp=None):
    """
    Delete snapshots & scrubbing policy

    Implemented delete snapshot policy:
    < 1d | 1d bucket | 1 | best of bucket   | 1d
    < 1w | 1d bucket | 6 | oldest of bucket | 7d = 1w
    < 1m | 1w bucket | 3 | oldest of bucket | 4w = 1m
    > 1m | delete

    :param timestamp: Timestamp to determine whether snapshots should be kept or not, if none provided, current time will be used
    """
    logger.info('Delete snapshots started')

    day = timedelta(1)
    week = day * 7

    def make_timestamp(offset):
        """
        Create an integer based timestamp
        :param offset: Offset in days
        :return: Timestamp
        """
        return int(mktime((base - offset).timetuple()))

    # Calculate bucket structure
    if timestamp is None:
        timestamp = time.time()
    base = datetime.fromtimestamp(timestamp).date() - day
    buckets = []
    # Buckets first 7 days: [0-1[, [1-2[, [2-3[, [3-4[, [4-5[, [5-6[, [6-7[
    for i in xrange(0, 7):
        buckets.append({'start': make_timestamp(day * i),
                        'end': make_timestamp(day * (i + 1)),
                        'type': '1d',
                        'snapshots': []})
    # Week buckets next 3 weeks: [7-14[, [14-21[, [21-28[
    for i in xrange(1, 4):
        buckets.append({'start': make_timestamp(week * i),
                        'end': make_timestamp(week * (i + 1)),
                        'type': '1w',
                        'snapshots': []})
    buckets.append({'start': make_timestamp(week * 4), 'end': 0, 'type': 'rest', 'snapshots': []})

    # Place all snapshots in bucket_chains
    bucket_chains = []
    for vmachine in VMachineList.get_customer_vmachines():
        if any(vd.info['object_type'] in ['BASE'] for vd in vmachine.vdisks):
            bucket_chain = copy.deepcopy(buckets)
            for snapshot in vmachine.snapshots:
                if snapshot.get('is_sticky') is True:
                    continue
                timestamp = int(snapshot['timestamp'])
                for bucket in bucket_chain:
                    if bucket['start'] >= timestamp > bucket['end']:
                        for diskguid, snapshotguid in snapshot['snapshots'].iteritems():
                            bucket['snapshots'].append({'timestamp': timestamp,
                                                        'snapshotid': snapshotguid,
                                                        'diskguid': diskguid,
                                                        'is_consistent': snapshot['is_consistent']})
            bucket_chains.append(bucket_chain)
    for vdisk in VDiskList.get_without_vmachine():
        if vdisk.info['object_type'] in ['BASE']:
            bucket_chain = copy.deepcopy(buckets)
            for snapshot in vdisk.snapshots:
                if snapshot.get('is_sticky') is True:
                    continue
                timestamp = int(snapshot['timestamp'])
                for bucket in bucket_chain:
                    if bucket['start'] >= timestamp > bucket['end']:
                        bucket['snapshots'].append({'timestamp': timestamp,
                                                    'snapshotid': snapshot['guid'],
                                                    'diskguid': vdisk.guid,
                                                    'is_consistent': snapshot['is_consistent']})
            bucket_chains.append(bucket_chain)

    # Clean out the snapshot bucket_chains, we delete the snapshots we want to keep
    # And we'll remove all snapshots that remain in the buckets
    for bucket_chain in bucket_chains:
        first = True
        for bucket in bucket_chain:
            if first is True:
                best = None
                for snapshot in bucket['snapshots']:
                    if best is None:
                        best = snapshot
                    # Consistent is better than inconsistent
                    elif snapshot['is_consistent'] and not best['is_consistent']:
                        best = snapshot
                    # Newer (larger timestamp) is better than older snapshots
                    elif snapshot['is_consistent'] == best['is_consistent'] and \
                            snapshot['timestamp'] > best['timestamp']:
                        best = snapshot
                bucket['snapshots'] = [s for s in bucket['snapshots'] if s['timestamp'] != best['timestamp']]
                first = False
            elif bucket['end'] > 0:
                oldest = None
                for snapshot in bucket['snapshots']:
                    if oldest is None:
                        oldest = snapshot
                    # Older (smaller timestamp) is the one we want to keep
                    elif snapshot['timestamp'] < oldest['timestamp']:
                        oldest = snapshot
                bucket['snapshots'] = [s for s in bucket['snapshots'] if s['timestamp'] != oldest['timestamp']]

    # Delete obsolete snapshots
    for bucket_chain in bucket_chains:
        for bucket in bucket_chain:
            for snapshot in bucket['snapshots']:
                VDiskController.delete_snapshot(diskguid=snapshot['diskguid'],
                                                snapshotid=snapshot['snapshotid'])
    logger.info('Delete snapshots finished')
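The `snapshot.get('is_sticky')` guard is the main behavioural difference from the earlier bucketing: sticky snapshots never enter a bucket, so the policy can never delete them. A minimal illustration with hypothetical snapshot dicts shaped like the ones iterated above:

# Hypothetical snapshot metadata, shaped like the dicts iterated above.
snapshots = [
    {'guid': 'snap-1', 'timestamp': '1000', 'is_consistent': True, 'is_sticky': True},
    {'guid': 'snap-2', 'timestamp': '2000', 'is_consistent': True},   # flag absent entirely
    {'guid': 'snap-3', 'timestamp': '3000', 'is_consistent': False, 'is_sticky': False},
]
bucket_candidates = [s for s in snapshots if s.get('is_sticky') is not True]
print([s['guid'] for s in bucket_candidates])  # ['snap-2', 'snap-3']: only snap-1 is protected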