def run_once(self, *args, **kwargs):
    """
    Run one replication pass over the local devices listed in the ring.

    Stats are reset first.  Every ring device that ``is_local_device``
    accepts for this host's IPs and ``self.port`` is examined: unmounted
    devices are skipped (when ``mount_check`` is set), the device's
    ``tmp`` directory is passed to ``unlink_older_than`` with a cutoff of
    ``now - reclaim_age``, and its data directory is queued if it exists.
    Every entry yielded by ``roundrobin_datadirs`` is then handed to
    ``_replicate_object`` through the coroutine pool, and stats are
    reported once the pool drains.

    Returns early, after logging an error, if no local IPs are found.

    :param args: ignored
    :param kwargs: ignored
    """
    self._zero_stats()
    my_ips = whataremyips()
    if not my_ips:
        self.logger.error(_('ERROR Failed to get my own IPs?'))
        return
    self._local_device_ids = set()
    datadirs = []
    for dev in self.ring.devs:
        # Skip falsy ring entries and devices that are not ours.
        if not dev:
            continue
        if not is_local_device(my_ips, self.port,
                               dev['replication_ip'],
                               dev['replication_port']):
            continue
        dev_root = os.path.join(self.root, dev['device'])
        if self.mount_check and not ismount(dev_root):
            self.logger.warn(
                _('Skipping %(device)s as it is not mounted') % dev)
            continue
        unlink_older_than(os.path.join(dev_root, 'tmp'),
                          time.time() - self.reclaim_age)
        datadir = os.path.join(dev_root, self.datadir)
        if not os.path.isdir(datadir):
            continue
        self._local_device_ids.add(dev['id'])
        datadirs.append((datadir, dev['id']))

    self.logger.info(_('Beginning replication run'))
    for part, object_file, node_id in roundrobin_datadirs(datadirs):
        self.cpool.spawn_n(
            self._replicate_object, part, object_file, node_id)
    self.cpool.waitall()
    self.logger.info(_('Replication run OVER'))
    self._report_stats()
def run_once(self, *args, **kwargs):
    """
    Run one replication pass over the local devices listed in the ring.

    A ring device is treated as local when its ``replication_ip`` is one
    of this host's IPs and its ``replication_port`` equals ``self.port``.
    Unmounted devices are skipped when ``mount_check`` is set; for the
    rest, the ``tmp`` directory is passed to ``unlink_older_than`` and an
    existing data directory is queued.  Every entry yielded by
    ``roundrobin_datadirs`` is replicated through the coroutine pool and
    stats are reported at the end.

    Returns early, after logging an error, if no local IPs are found.

    :param args: ignored
    :param kwargs: ignored
    """
    self._zero_stats()
    my_ips = whataremyips()
    if not my_ips:
        self.logger.error(_('ERROR Failed to get my own IPs?'))
        return
    datadirs = []
    for dev in self.ring.devs:
        is_mine = (dev and dev['replication_ip'] in my_ips and
                   dev['replication_port'] == self.port)
        if not is_mine:
            continue
        dev_root = os.path.join(self.root, dev['device'])
        if self.mount_check and not ismount(dev_root):
            self.logger.warn(
                _('Skipping %(device)s as it is not mounted') % dev)
            continue
        unlink_older_than(os.path.join(dev_root, 'tmp'),
                          time.time() - self.reclaim_age)
        datadir = os.path.join(dev_root, self.datadir)
        if os.path.isdir(datadir):
            datadirs.append((datadir, dev['id']))

    self.logger.info(_('Beginning replication run'))
    for part, object_file, node_id in roundrobin_datadirs(datadirs):
        self.cpool.spawn_n(
            self._replicate_object, part, object_file, node_id)
    self.cpool.waitall()
    self.logger.info(_('Replication run OVER'))
    self._report_stats()
def collect_parts(self, override_devices=None, override_partitions=None):
    """
    Yield a description of each partition directory on local EC devices.

    For every policy in ``POLICIES`` whose type is ``EC_POLICY`` this
    sets ``self._diskfile_mgr``, loads the policy's object ring and walks
    the ring devices that ``is_local_device`` accepts.  Per device it
    skips unmounted drives (when ``mount_check`` is set), passes the tmp
    directory to ``unlink_older_than``, creates the data directory if it
    is missing, and lists it.

    Entries in the data directory that are not all-digit directories are
    logged and handed to ``remove_file``; this happens before the
    ``override_partitions`` filter is applied.

    :param override_devices: optional collection of device names; when
                             non-empty, only these devices are scanned
    :param override_partitions: optional collection of integer partition
                                numbers; when non-empty, only these
                                partitions are yielded
    :returns: generator of dicts with the keys ``local_dev``, ``policy``,
              ``partition`` (an int) and ``part_path``
    """
    override_devices = override_devices or []
    override_partitions = override_partitions or []
    ips = whataremyips()
    for policy in POLICIES:
        if policy.policy_type != EC_POLICY:
            continue
        self._diskfile_mgr = self._df_router[policy]
        self.load_object_ring(policy)
        data_dir = get_data_dir(policy)
        # A generator expression keeps the lazy filtering of the original
        # itertools.ifilter() call but also works on Python 3, where
        # ifilter no longer exists.
        local_devices = (
            dev for dev in policy.object_ring.devs
            if dev and is_local_device(ips, self.port,
                                       dev['replication_ip'],
                                       dev['replication_port']))
        for local_dev in local_devices:
            if override_devices and (local_dev['device'] not in
                                     override_devices):
                continue
            dev_path = join(self.devices_dir, local_dev['device'])
            obj_path = join(dev_path, data_dir)
            tmp_path = join(dev_path, get_tmp_dir(int(policy)))
            if self.mount_check and not ismount(dev_path):
                # Logger.warn is a deprecated alias of Logger.warning.
                self.logger.warning(_('%s is not mounted'),
                                    local_dev['device'])
                continue
            unlink_older_than(tmp_path, time.time() - self.reclaim_age)
            if not os.path.exists(obj_path):
                try:
                    mkdirs(obj_path)
                except Exception:
                    self.logger.exception(
                        'Unable to create %s' % obj_path)
                # Whether or not creation worked there is nothing to
                # list on this device yet.
                continue
            try:
                partitions = os.listdir(obj_path)
            except OSError:
                self.logger.exception(
                    'Unable to list partitions in %r' % obj_path)
                continue
            for partition in partitions:
                part_path = join(obj_path, partition)
                if not (partition.isdigit() and
                        os.path.isdir(part_path)):
                    self.logger.warning(
                        'Unexpected entity in data dir: %r' % part_path)
                    remove_file(part_path)
                    continue
                partition = int(partition)
                if override_partitions and (partition not in
                                            override_partitions):
                    continue
                part_info = {
                    'local_dev': local_dev,
                    'policy': policy,
                    'partition': partition,
                    'part_path': part_path,
                }
                yield part_info
def collect_jobs(self):
    """
    Build the list of replication jobs for all local, mounted devices.

    A ring device is local when its ``replication_ip`` is one of this
    host's IPs and its ``replication_port`` equals ``self.port``.  Each
    entry of a device's ``objects`` directory becomes one job dict with
    the keys ``path``, ``device``, ``nodes``, ``delete`` and
    ``partition``.  Plain files found where a partition directory should
    be are removed; entries whose name is not an integer are skipped.

    ``delete`` is True when the local device is not among the
    partition's assigned nodes.  The result is shuffled and, when
    ``handoffs_first`` is set, jobs with ``delete`` set are moved to the
    front.  ``self.job_count`` is updated to the number of jobs.

    :returns: list of job dicts
    """
    my_ips = whataremyips()
    local_devs = [
        dev for dev in self.object_ring.devs
        if dev and dev['replication_ip'] in my_ips and
        dev['replication_port'] == self.port]
    found = []
    for local_dev in local_devs:
        device_name = local_dev['device']
        dev_path = join(self.devices_dir, device_name)
        obj_path = join(dev_path, 'objects')
        tmp_path = join(dev_path, 'tmp')
        if self.mount_check and not ismount(dev_path):
            self.logger.warn(_('%s is not mounted'), device_name)
            continue
        unlink_older_than(tmp_path, time.time() - self.reclaim_age)
        if not os.path.exists(obj_path):
            try:
                mkdirs(obj_path)
            except Exception:
                self.logger.exception('ERROR creating %s' % obj_path)
            # A directory that was just created holds no partitions.
            continue
        for partition in os.listdir(obj_path):
            try:
                job_path = join(obj_path, partition)
                if isfile(job_path):
                    # Clean up any (probably zero-byte) files where a
                    # partition should be.
                    self.logger.warning(
                        'Removing partition directory '
                        'which was a file: %s', job_path)
                    os.remove(job_path)
                    continue
                part_nodes = self.object_ring.get_part_nodes(
                    int(partition))
                remote_nodes = [peer for peer in part_nodes
                                if peer['id'] != local_dev['id']]
                found.append({
                    'path': job_path,
                    'device': device_name,
                    'nodes': remote_nodes,
                    # Nothing filtered out means we are a handoff.
                    'delete': len(remote_nodes) >= len(part_nodes),
                    'partition': partition,
                })
            except (ValueError, OSError):
                continue
    random.shuffle(found)
    if self.handoffs_first:
        # Move the handoff parts to the front of the list
        found.sort(key=lambda job: 0 if job['delete'] else 1)
    self.job_count = len(found)
    return found
def replicate(self, override_devices=None, override_partitions=None,
              override_policies=None):
    """
    Run a replication pass.

    Resets the per-pass counters, spawns the heartbeat and lockup
    detector greenthreads, then dispatches every job returned by
    ``collect_jobs`` to ``update_deleted`` (when the job's ``delete``
    flag is set) or ``update`` through a ``GreenPool``.  Jobs on
    unmounted devices are skipped, plain files found at a job path are
    removed, and the pass is aborted when ``check_ring`` reports a ring
    change.  The helper greenthreads are always killed and
    ``stats_line`` is always called on the way out.

    :param override_devices: forwarded unchanged to ``collect_jobs``
    :param override_partitions: forwarded unchanged to ``collect_jobs``
    :param override_policies: forwarded unchanged to ``collect_jobs``
    """
    self.start = time.time()
    self.suffix_count = self.suffix_sync = self.suffix_hash = 0
    self.replication_count = 0
    self.last_replication_count = -1
    self.partition_times = []

    stats = eventlet.spawn(self.heartbeat)
    lockup_detector = eventlet.spawn(self.detect_lockups)
    eventlet.sleep()  # Give spawns a cycle

    try:
        self.run_pool = GreenPool(size=self.concurrency)
        pending = self.collect_jobs(
            override_devices=override_devices,
            override_partitions=override_partitions,
            override_policies=override_policies)
        for job in pending:
            device_name = job['device']
            if self.mount_check and not ismount(
                    join(self.devices_dir, device_name)):
                self.logger.warn(_('%s is not mounted'), device_name)
                continue
            if not self.check_ring(job['policy'].object_ring):
                self.logger.info(
                    _("Ring change detected. Aborting "
                      "current replication pass."))
                return
            try:
                if isfile(job['path']):
                    # Clean up any (probably zero-byte) files where a
                    # partition should be.
                    self.logger.warning(
                        'Removing partition directory '
                        'which was a file: %s', job['path'])
                    os.remove(job['path'])
                    continue
            except OSError:
                continue
            worker = self.update_deleted if job['delete'] else self.update
            self.run_pool.spawn(worker, job)
        with Timeout(self.lockup_timeout):
            self.run_pool.waitall()
    except (Exception, Timeout):
        self.logger.exception(_("Exception in top-level replication loop"))
        self.kill_coros()
    finally:
        stats.kill()
        lockup_detector.kill()
        self.stats_line()
def replicate(self, override_devices=None, override_partitions=None):
    """
    Run a replication pass.

    Resets the per-pass counters, spawns the heartbeat and lockup
    detector greenthreads, then dispatches every job returned by
    ``collect_jobs`` to ``update_deleted`` (when the job's ``delete``
    flag is set) or ``update`` through a ``GreenPool``.  Jobs are
    skipped when they fall outside a non-empty override list or sit on
    an unmounted device; plain files found at a job path are removed;
    the pass is aborted when ``check_ring`` reports a ring change.  The
    helper greenthreads are always killed and ``stats_line`` is always
    called on the way out.

    :param override_devices: optional collection of device names; when
                             non-empty only jobs on these devices run
    :param override_partitions: optional collection of partitions; when
                                non-empty only these partitions run
    """
    self.start = time.time()
    self.suffix_count = self.suffix_sync = self.suffix_hash = 0
    self.replication_count = 0
    self.last_replication_count = -1
    self.partition_times = []

    wanted_devices = [] if override_devices is None else override_devices
    wanted_parts = (
        [] if override_partitions is None else override_partitions)

    stats = eventlet.spawn(self.heartbeat)
    lockup_detector = eventlet.spawn(self.detect_lockups)
    eventlet.sleep()  # Give spawns a cycle

    try:
        self.run_pool = GreenPool(size=self.concurrency)
        for job in self.collect_jobs():
            if wanted_devices and job["device"] not in wanted_devices:
                continue
            if wanted_parts and job["partition"] not in wanted_parts:
                continue
            if self.mount_check and not ismount(
                    join(self.devices_dir, job["device"])):
                self.logger.warn(_("%s is not mounted"), job["device"])
                continue
            if not self.check_ring(job["object_ring"]):
                self.logger.info(_("Ring change detected. Aborting "
                                   "current replication pass."))
                return
            try:
                if isfile(job["path"]):
                    # Clean up any (probably zero-byte) files where a
                    # partition should be.
                    self.logger.warning("Removing partition directory "
                                        "which was a file: %s",
                                        job["path"])
                    os.remove(job["path"])
                    continue
            except OSError:
                continue
            worker = self.update_deleted if job["delete"] else self.update
            self.run_pool.spawn(worker, job)
        with Timeout(self.lockup_timeout):
            self.run_pool.waitall()
    except (Exception, Timeout):
        self.logger.exception(_("Exception in top-level replication loop"))
        self.kill_coros()
    finally:
        stats.kill()
        lockup_detector.kill()
        self.stats_line()
def build_replication_jobs(self, policy, ips, override_devices=None,
                           override_partitions=None):
    """
    Build replication jobs for one replication-type storage policy.

    Helper for ``collect_jobs``.  Walks the policy's ring devices that
    ``is_local_device`` accepts (optionally restricted to
    ``override_devices``) and turns every entry of the device's data
    directory into a job dict.  Entries whose name is not an integer are
    skipped.

    :param policy: storage policy; its ``object_ring`` is consulted
    :param ips: IPs of this host, passed to ``is_local_device``
    :param override_devices: when not None, only devices whose name is
                             in this collection are considered
    :param override_partitions: when not None, only directory entries
                                contained in this collection are used
                                (compared against the raw entry name)
    :returns: list of job dicts with the keys ``path``, ``device``,
              ``obj_path``, ``nodes``, ``delete``, ``policy``,
              ``partition`` and ``region``
    """
    ring = policy.object_ring
    data_dir = get_data_dir(policy)
    local_devs = [
        dev for dev in ring.devs
        if (dev and
            is_local_device(ips, self.port, dev['replication_ip'],
                            dev['replication_port']) and
            (override_devices is None or
             dev['device'] in override_devices))]
    built = []
    for local_dev in local_devs:
        device_name = local_dev['device']
        dev_path = join(self.devices_dir, device_name)
        obj_path = join(dev_path, data_dir)
        tmp_path = join(dev_path, get_tmp_dir(policy))
        if self.mount_check and not ismount(dev_path):
            self.logger.warn(_('%s is not mounted'), device_name)
            continue
        unlink_older_than(tmp_path, time.time() - self.reclaim_age)
        if not os.path.exists(obj_path):
            try:
                mkdirs(obj_path)
            except Exception:
                self.logger.exception('ERROR creating %s' % obj_path)
            # A directory that was just created holds no partitions.
            continue
        for partition in os.listdir(obj_path):
            if (override_partitions is not None and
                    partition not in override_partitions):
                continue
            try:
                job_path = join(obj_path, partition)
                part_nodes = ring.get_part_nodes(int(partition))
                remote_nodes = [peer for peer in part_nodes
                                if peer['id'] != local_dev['id']]
                built.append({
                    'path': job_path,
                    'device': device_name,
                    'obj_path': obj_path,
                    'nodes': remote_nodes,
                    # Nothing filtered out means we are a handoff.
                    'delete': len(remote_nodes) >= len(part_nodes),
                    'policy': policy,
                    'partition': partition,
                    'region': local_dev['region'],
                })
            except ValueError:
                continue
    return built
def collect_jobs(self):
    """
    Build the list of replication jobs for all local, mounted devices.

    A ring device is local when its ``replication_ip`` is one of this
    host's IPs and its ``replication_port`` equals ``self.port``.  Each
    entry of a device's ``objects`` directory becomes one job dict with
    the keys ``path``, ``device``, ``nodes``, ``delete`` and
    ``partition``.  Plain files found where a partition directory should
    be are removed; entries whose name is not an integer are skipped.

    LightSync ordering: when the local device is one of the partition's
    assigned nodes, ``nodes`` holds the other assigned nodes rotated so
    that the ones following the local device come first.  When the local
    device is not assigned to the partition, ``nodes`` holds all
    assigned nodes in ring order and ``delete`` is True.

    The result is shuffled and, when ``handoffs_first`` is set, jobs
    with ``delete`` set are moved to the front.  ``self.job_count`` is
    updated to the number of jobs.

    :returns: list of job dicts
    """
    jobs = []
    ips = whataremyips()
    local_devs = [dev for dev in self.object_ring.devs
                  if dev and dev['replication_ip'] in ips and
                  dev['replication_port'] == self.port]
    for local_dev in local_devs:
        dev_path = join(self.devices_dir, local_dev['device'])
        obj_path = join(dev_path, 'objects')
        tmp_path = join(dev_path, 'tmp')
        if self.mount_check and not ismount(dev_path):
            # Logger.warn is a deprecated alias of Logger.warning.
            self.logger.warning(_('%s is not mounted'),
                                local_dev['device'])
            continue
        unlink_older_than(tmp_path, time.time() - self.reclaim_age)
        if not os.path.exists(obj_path):
            try:
                mkdirs(obj_path)
            except Exception:
                self.logger.exception('ERROR creating %s' % obj_path)
            # A directory that was just created holds no partitions.
            continue
        for partition in os.listdir(obj_path):
            try:
                job_path = join(obj_path, partition)
                if isfile(job_path):
                    # Clean up any (probably zero-byte) files where a
                    # partition should be.
                    self.logger.warning('Removing partition directory '
                                        'which was a file: %s', job_path)
                    os.remove(job_path)
                    continue
                part_nodes = \
                    self.object_ring.get_part_nodes(int(partition))
                # MODIFIED LightSync: sync to the nodes that follow us in
                # ring order first, wrapping around to those before us.
                my_pos = next(
                    (pos for pos, node in enumerate(part_nodes)
                     if node['id'] == local_dev['id']),
                    None)
                if my_pos is None:
                    # We are a handoff for this partition.  The previous
                    # index loop fell through with the last index here,
                    # which dropped the last primary and left ``delete``
                    # permanently False.
                    nodes = list(part_nodes)
                else:
                    nodes = part_nodes[my_pos + 1:] + part_nodes[:my_pos]
                jobs.append(
                    dict(path=job_path,
                         device=local_dev['device'],
                         nodes=nodes,
                         delete=len(nodes) > len(part_nodes) - 1,
                         partition=partition))
            except (ValueError, OSError):
                continue
    random.shuffle(jobs)
    if self.handoffs_first:
        # Move the handoff parts to the front of the list
        jobs.sort(key=lambda job: not job['delete'])
    self.job_count = len(jobs)
    return jobs
def process_repl(self, policy, ips, override_devices=None,
                 override_partitions=None):
    """
    Build replication jobs for one replication-type storage policy.

    Helper for ``collect_jobs``.  Walks the devices of the policy's
    object ring that ``is_local_device`` accepts (optionally restricted
    to ``override_devices``) and turns every entry of the device's data
    directory into a job dict.  Entries whose name is not an integer are
    skipped.

    :param policy: storage policy; ``policy.idx`` selects ring and dirs
    :param ips: IPs of this host, passed to ``is_local_device``
    :param override_devices: when not None, only devices whose name is
                             in this collection are considered
    :param override_partitions: when not None, only directory entries
                                contained in this collection are used
                                (compared against the raw entry name)
    :returns: list of job dicts with the keys ``path``, ``device``,
              ``obj_path``, ``nodes``, ``delete``, ``policy_idx``,
              ``partition``, ``object_ring`` and ``region``
    """
    obj_ring = self.get_object_ring(policy.idx)
    data_dir = get_data_dir(policy.idx)
    local_devs = [
        dev for dev in obj_ring.devs
        if (dev and
            is_local_device(ips, self.port, dev['replication_ip'],
                            dev['replication_port']) and
            (override_devices is None or
             dev['device'] in override_devices))]
    built = []
    for local_dev in local_devs:
        device_name = local_dev['device']
        dev_path = join(self.devices_dir, device_name)
        obj_path = join(dev_path, data_dir)
        tmp_path = join(dev_path, get_tmp_dir(int(policy)))
        if self.mount_check and not ismount(dev_path):
            self.logger.warn(_('%s is not mounted'), device_name)
            continue
        unlink_older_than(tmp_path, time.time() - self.reclaim_age)
        if not os.path.exists(obj_path):
            try:
                mkdirs(obj_path)
            except Exception:
                self.logger.exception('ERROR creating %s' % obj_path)
            # A directory that was just created holds no partitions.
            continue
        for partition in os.listdir(obj_path):
            if (override_partitions is not None and
                    partition not in override_partitions):
                continue
            try:
                job_path = join(obj_path, partition)
                part_nodes = obj_ring.get_part_nodes(int(partition))
                remote_nodes = [peer for peer in part_nodes
                                if peer['id'] != local_dev['id']]
                built.append({
                    'path': job_path,
                    'device': device_name,
                    'obj_path': obj_path,
                    'nodes': remote_nodes,
                    # Nothing filtered out means we are a handoff.
                    'delete': len(remote_nodes) >= len(part_nodes),
                    'policy_idx': policy.idx,
                    'partition': partition,
                    'object_ring': obj_ring,
                    'region': local_dev['region'],
                })
            except ValueError:
                continue
    return built
def process_repl(self, policy, jobs, ips):
    """
    Append replication jobs for one replication-type storage policy.

    Helper for ``collect_jobs``.  A device of the policy's object ring
    is local when its ``replication_ip`` is in ``ips`` and its
    ``replication_port`` equals ``self.port``.  Every entry of a local
    device's data directory becomes a job dict appended to ``jobs``;
    plain files found where a partition directory should be are removed
    and entries whose name is not an integer are skipped.

    :param policy: storage policy; ``policy.idx`` selects ring and dirs
    :param jobs: list that is extended in place with job dicts (keys
                 ``path``, ``device``, ``nodes``, ``delete``,
                 ``policy_idx``, ``partition`` and ``object_ring``)
    :param ips: IPs of this host
    """
    obj_ring = self.get_object_ring(policy.idx)
    data_dir = get_data_dir(policy.idx)
    local_devs = [
        dev for dev in obj_ring.devs
        if dev and dev['replication_ip'] in ips and
        dev['replication_port'] == self.port]
    for local_dev in local_devs:
        device_name = local_dev['device']
        dev_path = join(self.devices_dir, device_name)
        obj_path = join(dev_path, data_dir)
        tmp_path = join(dev_path, 'tmp')
        if self.mount_check and not ismount(dev_path):
            self.logger.warn(_('%s is not mounted'), device_name)
            continue
        unlink_older_than(tmp_path, time.time() - self.reclaim_age)
        if not os.path.exists(obj_path):
            try:
                mkdirs(obj_path)
            except Exception:
                self.logger.exception('ERROR creating %s' % obj_path)
            # A directory that was just created holds no partitions.
            continue
        for partition in os.listdir(obj_path):
            try:
                job_path = join(obj_path, partition)
                if isfile(job_path):
                    # Clean up any (probably zero-byte) files where a
                    # partition should be.
                    self.logger.warning(
                        'Removing partition directory '
                        'which was a file: %s', job_path)
                    os.remove(job_path)
                    continue
                part_nodes = obj_ring.get_part_nodes(int(partition))
                remote_nodes = [peer for peer in part_nodes
                                if peer['id'] != local_dev['id']]
                jobs.append({
                    'path': job_path,
                    'device': device_name,
                    'nodes': remote_nodes,
                    # Nothing filtered out means we are a handoff.
                    'delete': len(remote_nodes) >= len(part_nodes),
                    'policy_idx': policy.idx,
                    'partition': partition,
                    'object_ring': obj_ring,
                })
            except (ValueError, OSError):
                continue
def run_forever(self, *args, **kwargs):
    """
    Run the updater continuously.

    After an initial random sleep of up to ``self.interval`` seconds,
    loops forever: each sweep forks one child process per mounted
    device (never more than ``self.concurrency`` at a time), and each
    child runs ``object_sweep`` on its device, logs its result and
    exits.  The parent waits for all children, records the sweep time
    via ``dump_recon_cache`` and sleeps out the rest of the interval.

    :param args: ignored
    :param kwargs: ignored
    """
    time.sleep(random() * self.interval)
    while True:
        self.logger.info(_('Begin object update sweep'))
        sweep_start = time.time()
        children = []
        # read from container ring to ensure it's fresh
        self.get_container_ring().get_nodes('')
        for device in self._listdir(self.devices):
            device_path = os.path.join(self.devices, device)
            if self.mount_check and not ismount(device_path):
                self.logger.increment('errors')
                self.logger.warning(
                    _('Skipping %s as it is not mounted'), device)
                continue
            # Throttle: reap finished children until a slot is free.
            while len(children) >= self.concurrency:
                children.remove(os.wait()[0])
            child_pid = os.fork()
            if child_pid:
                # Parent: remember the child and move on.
                children.append(child_pid)
                continue
            # Child: sweep this one device, report, then exit.
            signal.signal(signal.SIGTERM, signal.SIG_DFL)
            patcher.monkey_patch(all=False, socket=True, select=True,
                                 thread=True)
            self.successes = 0
            self.failures = 0
            fork_start = time.time()
            self.object_sweep(device_path)
            fork_elapsed = time.time() - fork_start
            self.logger.info(
                _('Object update sweep of %(device)s'
                  ' completed: %(elapsed).02fs, %(success)s successes'
                  ', %(fail)s failures'),
                {'device': device,
                 'elapsed': fork_elapsed,
                 'success': self.successes,
                 'fail': self.failures})
            sys.exit()
        while children:
            children.remove(os.wait()[0])
        elapsed = time.time() - sweep_start
        self.logger.info(_('Object update sweep completed: %.02fs'),
                         elapsed)
        dump_recon_cache({'object_updater_sweep': elapsed},
                         self.rcache, self.logger)
        if elapsed < self.interval:
            time.sleep(self.interval - elapsed)
def replicate(self, override_devices=None, override_partitions=None):
    """
    Run a replication pass.

    Resets the per-pass counters, spawns the heartbeat and lockup
    detector greenthreads, then dispatches every job returned by
    ``collect_jobs`` to ``update_deleted`` (when the job's ``delete``
    flag is set) or ``update`` through a ``GreenPool``.  Jobs are
    skipped when they fall outside a non-empty override list or sit on
    an unmounted device; the pass is aborted when ``check_ring`` reports
    a change for the job's ring.  The helper greenthreads are always
    killed and ``stats_line`` is always called on the way out.

    :param override_devices: optional collection of device names; when
                             non-empty only jobs on these devices run
    :param override_partitions: optional collection of partitions; when
                                non-empty only these partitions run
    """
    self.start = time.time()
    self.suffix_count = self.suffix_sync = self.suffix_hash = 0
    self.replication_count = 0
    self.last_replication_count = -1
    self.partition_times = []

    wanted_devices = [] if override_devices is None else override_devices
    wanted_parts = (
        [] if override_partitions is None else override_partitions)

    stats = eventlet.spawn(self.heartbeat)
    lockup_detector = eventlet.spawn(self.detect_lockups)
    eventlet.sleep()  # Give spawns a cycle

    try:
        self.run_pool = GreenPool(size=self.concurrency)
        for job in self.collect_jobs():
            if wanted_devices and job['device'] not in wanted_devices:
                continue
            if wanted_parts and job['partition'] not in wanted_parts:
                continue
            if self.mount_check and not ismount(
                    join(self.devices_dir, job['device'])):
                self.logger.warn(_('%s is not mounted'), job['device'])
                continue
            if not self.check_ring(job['object_ring']):
                self.logger.info(
                    _("Ring change detected. Aborting "
                      "current replication pass."))
                return
            worker = self.update_deleted if job['delete'] else self.update
            self.run_pool.spawn(worker, job)
        with Timeout(self.lockup_timeout):
            self.run_pool.waitall()
    except (Exception, Timeout):
        self.logger.exception(_("Exception in top-level replication loop"))
        self.kill_coros()
    finally:
        stats.kill()
        lockup_detector.kill()
        self.stats_line()
def replicate(self, override_devices=None, override_partitions=None):
    """
    Run a replication pass.

    Resets the per-pass counters, spawns the heartbeat and lockup
    detector greenthreads, then dispatches every job returned by
    ``collect_jobs`` to ``update_deleted`` (when the job's ``delete``
    flag is set) or ``update`` through a ``GreenPool``.  Jobs are
    skipped when they fall outside a non-empty override list or sit on
    an unmounted device; the pass is aborted when ``check_ring`` returns
    a falsy value.  The helper greenthreads are always killed and
    ``stats_line`` is always called on the way out.

    :param override_devices: optional collection of device names; when
                             non-empty only jobs on these devices run
    :param override_partitions: optional collection of partitions; when
                                non-empty only these partitions run
    """
    self.start = time.time()
    self.suffix_count = self.suffix_sync = self.suffix_hash = 0
    self.replication_count = 0
    self.last_replication_count = -1
    self.partition_times = []

    wanted_devices = [] if override_devices is None else override_devices
    wanted_parts = (
        [] if override_partitions is None else override_partitions)

    stats = eventlet.spawn(self.heartbeat)
    lockup_detector = eventlet.spawn(self.detect_lockups)
    eventlet.sleep()  # Give spawns a cycle

    try:
        self.run_pool = GreenPool(size=self.concurrency)
        for job in self.collect_jobs():
            if wanted_devices and job['device'] not in wanted_devices:
                continue
            if wanted_parts and job['partition'] not in wanted_parts:
                continue
            if self.mount_check and not ismount(
                    join(self.devices_dir, job['device'])):
                self.logger.warn(_('%s is not mounted'), job['device'])
                continue
            if not self.check_ring():
                self.logger.info(_("Ring change detected. Aborting "
                                   "current replication pass."))
                return
            worker = self.update_deleted if job['delete'] else self.update
            self.run_pool.spawn(worker, job)
        with Timeout(self.lockup_timeout):
            self.run_pool.waitall()
    except (Exception, Timeout):
        self.logger.exception(_("Exception in top-level replication loop"))
        self.kill_coros()
    finally:
        stats.kill()
        lockup_detector.kill()
        self.stats_line()
def check_mount(root, drive):
    """
    Check that ``root/drive`` is a mounted mount point.

    Failing fast on drives that have been unmounted avoids accidentally
    filling up the root partition.  Drive names that are changed by
    ``urllib.quote_plus`` are rejected without touching the filesystem.

    :param root: base path where the devices are mounted
    :param drive: drive name to be checked
    :returns: True if it is a valid mounted device, False otherwise
    """
    if urllib.quote_plus(drive) != drive:
        return False
    return utils.ismount(os.path.join(root, drive))
def process_repl(self, policy, jobs, ips):
    """
    Append replication jobs for one replication-type storage policy.

    Helper for ``collect_jobs``.  A device of the policy's object ring
    is local when its ``replication_ip`` is in ``ips`` and its
    ``replication_port`` equals ``self.port``.  Every entry of a local
    device's data directory becomes a job dict appended to ``jobs``;
    plain files found where a partition directory should be are removed
    and entries whose name is not an integer are skipped.

    :param policy: storage policy; ``policy.idx`` selects ring and data
                   dir, ``int(policy)`` selects the tmp dir
    :param jobs: list that is extended in place with job dicts (keys
                 ``path``, ``device``, ``nodes``, ``delete``,
                 ``policy_idx``, ``partition`` and ``object_ring``)
    :param ips: IPs of this host
    """
    obj_ring = self.get_object_ring(policy.idx)
    data_dir = get_data_dir(policy.idx)
    local_devs = [
        dev for dev in obj_ring.devs
        if dev and dev['replication_ip'] in ips and
        dev['replication_port'] == self.port]
    for local_dev in local_devs:
        device_name = local_dev['device']
        dev_path = join(self.devices_dir, device_name)
        obj_path = join(dev_path, data_dir)
        tmp_path = join(dev_path, get_tmp_dir(int(policy)))
        if self.mount_check and not ismount(dev_path):
            self.logger.warn(_('%s is not mounted'), device_name)
            continue
        unlink_older_than(tmp_path, time.time() - self.reclaim_age)
        if not os.path.exists(obj_path):
            try:
                mkdirs(obj_path)
            except Exception:
                self.logger.exception('ERROR creating %s' % obj_path)
            # A directory that was just created holds no partitions.
            continue
        for partition in os.listdir(obj_path):
            try:
                job_path = join(obj_path, partition)
                if isfile(job_path):
                    # Clean up any (probably zero-byte) files where a
                    # partition should be.
                    self.logger.warning(
                        'Removing partition directory '
                        'which was a file: %s', job_path)
                    os.remove(job_path)
                    continue
                part_nodes = obj_ring.get_part_nodes(int(partition))
                remote_nodes = [peer for peer in part_nodes
                                if peer['id'] != local_dev['id']]
                jobs.append({
                    'path': job_path,
                    'device': device_name,
                    'nodes': remote_nodes,
                    # Nothing filtered out means we are a handoff.
                    'delete': len(remote_nodes) >= len(part_nodes),
                    'policy_idx': policy.idx,
                    'partition': partition,
                    'object_ring': obj_ring,
                })
            except (ValueError, OSError):
                continue
def run_forever(self, *args, **kwargs):
    """
    Run the updater continuously.

    After an initial random sleep of up to ``self.interval`` seconds,
    loops forever: each sweep forks one child process per mounted
    device (never more than ``self.concurrency`` at a time), and each
    child runs ``object_sweep`` on its device, logs its result and
    exits.  The parent waits for all children, records the sweep time
    via ``dump_recon_cache`` and sleeps out the rest of the interval.

    :param args: ignored
    :param kwargs: ignored
    """
    time.sleep(random() * self.interval)
    while True:
        self.logger.info(_('Begin object update sweep'))
        sweep_start = time.time()
        worker_pids = []
        # read from container ring to ensure it's fresh
        self.get_container_ring().get_nodes('')
        for device in self._listdir(self.devices):
            device_path = os.path.join(self.devices, device)
            if self.mount_check and not ismount(device_path):
                self.logger.increment('errors')
                self.logger.warning(
                    _('Skipping %s as it is not mounted'), device)
                continue
            # Throttle: reap finished workers until a slot is free.
            while len(worker_pids) >= self.concurrency:
                worker_pids.remove(os.wait()[0])
            forked = os.fork()
            if forked:
                # Parent: remember the worker and move on.
                worker_pids.append(forked)
                continue
            # Worker: sweep this one device, report, then exit.
            signal.signal(signal.SIGTERM, signal.SIG_DFL)
            patcher.monkey_patch(all=False, socket=True, select=True,
                                 thread=True)
            self.successes = 0
            self.failures = 0
            fork_start = time.time()
            self.object_sweep(device_path)
            fork_elapsed = time.time() - fork_start
            summary = {
                'device': device,
                'elapsed': fork_elapsed,
                'success': self.successes,
                'fail': self.failures,
            }
            self.logger.info(
                _('Object update sweep of %(device)s'
                  ' completed: %(elapsed).02fs, %(success)s successes'
                  ', %(fail)s failures'),
                summary)
            sys.exit()
        while worker_pids:
            worker_pids.remove(os.wait()[0])
        elapsed = time.time() - sweep_start
        self.logger.info(_('Object update sweep completed: %.02fs'),
                         elapsed)
        dump_recon_cache({'object_updater_sweep': elapsed},
                         self.rcache, self.logger)
        if elapsed < self.interval:
            time.sleep(self.interval - elapsed)
def get_ring(server, force_validate=None):
    """
    Load ``/etc/swift/<server>.ring.gz`` and optionally validate it.

    Validation runs when ``VALIDATE_RSYNC`` or ``force_validate`` is
    truthy.  It checks that the ring has 3 replicas and 4 devices, that
    each ring device exists and is mounted under the ``devices`` path of
    the server config bound to the device's port, and that the device
    shows up in the listing of the matching rsync export on localhost.

    :param server: server type used to build ring, config and rsync
                   export names
    :param force_validate: validate even if ``VALIDATE_RSYNC`` is falsy
    :returns: the loaded ``Ring``
    :raises AssertionError: if any validation step fails
    """
    ring = Ring('/etc/swift/%s.ring.gz' % server)
    if not (VALIDATE_RSYNC or force_validate):
        return ring
    # easy sanity checks
    assert 3 == ring.replica_count, '%s has %s replicas instead of 3' % (
        ring.serialized_path, ring.replica_count)
    assert 4 == len(ring.devs), '%s has %s devices instead of 4' % (
        ring.serialized_path, len(ring.devs))
    # map server to config by port
    port_to_config = {}
    for node_id in range(1, 5):
        conf = readconf('/etc/swift/%s-server/%d.conf' % (server, node_id),
                        section_name='%s-replicator' % server)
        port_to_config[int(conf['bind_port'])] = conf
    for dev in ring.devs:
        device = dev['device']
        # verify server is exposing mounted device
        conf = port_to_config[dev['port']]
        if device not in os.listdir(conf['devices']):
            raise AssertionError(
                "unable to find ring device %s under %s's devices (%s)" % (
                    device, server, conf['devices']))
        full_path = path.realpath(path.join(conf['devices'], device))
        assert ismount(full_path), \
            'device %s in %s was not mounted (%s)' % (
                device, conf['devices'], full_path)
        # verify server is exposing rsync device
        rsync_export = '%s%s' % (server, dev['replication_port'])
        cmd = "rsync rsync://localhost/%s" % rsync_export
        p = Popen(cmd, shell=True, stdout=PIPE)
        stdout, _stderr = p.communicate()
        if p.returncode:
            raise AssertionError('unable to connect to rsync '
                                 'export %s (%s)' % (rsync_export, cmd))
        # The last whitespace-separated field of a listing line is the
        # entry name.
        exported = any(line.rsplit(None, 1)[-1] == device
                       for line in stdout.splitlines())
        if not exported:
            raise AssertionError("unable to find ring device %s under "
                                 "rsync's exported devices for %s (%s)" % (
                                     device, rsync_export, cmd))
    return ring
def move(self, old_dict, new_dict, moving_map):
    """
    Run a move pass.

    Resets the per-pass counters and spawns ``update`` through a
    ``GreenPool`` for every job returned by ``collect_jobs``.  Jobs on
    unmounted devices are skipped and plain files found at a job path
    are removed.  Any exception kills the running coroutines and is
    logged; it is also printed when ``self.test`` is set.

    :param old_dict: dictionary with devices from old ring
    :param new_dict: dictionary with devices from new ring
    :param moving_map: the dictionary that contains all the partitions
        that should be moved, their sources and destinations
    """
    self.start = time.time()
    self.replication_count = 0
    self.last_replication_count = -1
    self.partition_times = []

    try:
        self.run_pool = GreenPool(size=self.concurrency)
        for job in self.collect_jobs(old_dict, new_dict, moving_map):
            device_name = job['device']
            if self.mount_check and not ismount(
                    join(self.devices_dir, device_name)):
                self.logger.warn('%s is not mounted' % device_name)
                continue
            try:
                if isfile(job['path']):
                    # Clean up any (probably zero-byte) files where a
                    # partition should be.
                    self.logger.warning(
                        'Removing partition directory '
                        'which was a file: %s', job['path'])
                    os.remove(job['path'])
                    continue
            except OSError:
                continue
            self.run_pool.spawn(self.update, job)
        self.run_pool.waitall()
    except (Exception, Timeout) as err:
        self.kill_coros()
        self.logger.exception(
            "Exception in top-level partition move loop %s" % err)
        if self.test:
            print(err)
def get_ring(server, force_validate=None):
    """
    Load ``/etc/swift/<server>.ring.gz`` and optionally validate it.

    Validation runs when ``VALIDATE_RSYNC`` or ``force_validate`` is
    truthy.  It checks that the ring has 3 replicas and 4 devices, that
    each ring device exists and is mounted under the ``devices`` path of
    the server config bound to the device's port, and that the device
    shows up in the listing of the matching rsync export on localhost.

    :param server: server type used to build ring, config and rsync
                   export names
    :param force_validate: validate even if ``VALIDATE_RSYNC`` is falsy
    :returns: the loaded ``Ring``
    :raises AssertionError: if any validation step fails
    """
    ring = Ring('/etc/swift/%s.ring.gz' % server)
    if not (VALIDATE_RSYNC or force_validate):
        return ring
    # easy sanity checks
    replica_msg = '%s has %s replicas instead of 3' % (
        ring.serialized_path, ring.replica_count)
    assert 3 == ring.replica_count, replica_msg
    device_msg = '%s has %s devices instead of 4' % (
        ring.serialized_path, len(ring.devs))
    assert 4 == len(ring.devs), device_msg
    # map server to config by port
    port_to_config = {}
    for node_id in range(1, 5):
        conf = readconf('/etc/swift/%s-server/%d.conf' % (server, node_id),
                        section_name='%s-replicator' % server)
        port_to_config[int(conf['bind_port'])] = conf
    for dev in ring.devs:
        wanted = dev['device']
        # verify server is exposing mounted device
        conf = port_to_config[dev['port']]
        devices_root = conf['devices']
        if wanted not in os.listdir(devices_root):
            raise AssertionError(
                "unable to find ring device %s under %s's devices (%s)" % (
                    wanted, server, devices_root))
        full_path = path.realpath(path.join(devices_root, wanted))
        assert ismount(full_path), \
            'device %s in %s was not mounted (%s)' % (
                wanted, devices_root, full_path)
        # verify server is exposing rsync device
        rsync_export = '%s%s' % (server, dev['replication_port'])
        cmd = "rsync rsync://localhost/%s" % rsync_export
        p = Popen(cmd, shell=True, stdout=PIPE)
        stdout, _stderr = p.communicate()
        if p.returncode:
            raise AssertionError('unable to connect to rsync '
                                 'export %s (%s)' % (rsync_export, cmd))
        # The last whitespace-separated field of a listing line is the
        # entry name.
        exported = any(line.rsplit(None, 1)[-1] == wanted
                       for line in stdout.splitlines())
        if not exported:
            raise AssertionError("unable to find ring device %s under "
                                 "rsync's exported devices for %s (%s)" % (
                                     wanted, rsync_export, cmd))
    return ring
def run_once(self, *args, **kwargs):
    """
    Run one replication pass over the local devices listed in the ring.

    Stats are reset first.  Every ring device that ``is_local_device``
    accepts for this host's IPs (looked up with ``self.bind_ip``) and
    ``self.port`` is examined: for an unmounted device (when
    ``mount_check`` is set) failure stats are recorded for every ring
    device and the device is skipped; otherwise its ``tmp`` directory is
    passed to ``unlink_older_than`` and an existing data directory is
    queued.  If no ring device is local an error is logged, but the
    pass still continues with an empty work list.  Every entry yielded
    by ``roundrobin_datadirs`` is replicated through the coroutine pool
    and stats are reported at the end.

    Returns early, after logging an error, if no local IPs are found.

    :param args: ignored
    :param kwargs: ignored
    """
    self._zero_stats()
    my_ips = whataremyips(self.bind_ip)
    if not my_ips:
        self.logger.error(_('ERROR Failed to get my own IPs?'))
        return
    self._local_device_ids = set()
    datadirs = []
    saw_local_device = False
    # Walk every device in the ring
    for dev in self.ring.devs:
        if not dev:
            continue
        if not is_local_device(my_ips, self.port,
                               dev['replication_ip'],
                               dev['replication_port']):
            continue
        saw_local_device = True
        dev_root = os.path.join(self.root, dev['device'])
        if self.mount_check and not ismount(dev_root):
            failed = [(other['replication_ip'], other['device'])
                      for other in self.ring.devs if other]
            self._add_failure_stats(failed)
            self.logger.warning(
                _('Skipping %(device)s as it is not mounted') % dev)
            continue
        unlink_older_than(os.path.join(dev_root, 'tmp'),
                          time.time() - self.reclaim_age)
        datadir = os.path.join(dev_root, self.datadir)
        if not os.path.isdir(datadir):
            continue
        self._local_device_ids.add(dev['id'])
        datadirs.append((datadir, dev['id']))
    if not saw_local_device:
        self.logger.error("Can't find itself %s with port %s in ring "
                          "file, not replicating",
                          ", ".join(my_ips), self.port)

    self.logger.info(_('Beginning replication run'))
    for part, object_file, node_id in roundrobin_datadirs(datadirs):
        self.cpool.spawn_n(
            self._replicate_object, part, object_file, node_id)
    self.cpool.waitall()
    self.logger.info(_('Replication run OVER'))
    self._report_stats()
def get_paths(self):
    """
    Collect the paths of all partitions on each drive to be processed.

    Unmounted devices are skipped with a warning when ``mount_check``
    is set, as are devices that have no ``DATADIR`` directory.  The
    resulting list is shuffled in place before it is returned.

    :returns: a list of paths
    """
    partition_paths = []
    for device in self._listdir(self.devices):
        dev_path = os.path.join(self.devices, device)
        if self.mount_check and not ismount(dev_path):
            self.logger.warning(_('%s is not mounted'), device)
            continue
        con_path = os.path.join(dev_path, DATADIR)
        if os.path.exists(con_path):
            partition_paths.extend(
                os.path.join(con_path, partition)
                for partition in self._listdir(con_path))
    shuffle(partition_paths)
    return partition_paths
def process_repl(self, policy, jobs, ips):
    """
    Append replication jobs for one replication-type storage policy.

    Helper for ``collect_jobs``.  A device of the policy's object ring
    is local when its ``replication_ip`` is in ``ips`` and its
    ``replication_port`` equals ``self.port``.  Every entry of a local
    device's data directory becomes a job dict appended to ``jobs``;
    entries whose name is not an integer are skipped.

    :param policy: storage policy; ``policy.idx`` selects ring and data
                   dir, ``int(policy)`` selects the tmp dir
    :param jobs: list that is extended in place with job dicts (keys
                 ``path``, ``device``, ``nodes``, ``delete``,
                 ``policy_idx``, ``partition`` and ``object_ring``)
    :param ips: IPs of this host
    """
    obj_ring = self.get_object_ring(policy.idx)
    data_dir = get_data_dir(policy.idx)
    local_devs = [
        dev for dev in obj_ring.devs
        if dev and dev["replication_ip"] in ips and
        dev["replication_port"] == self.port]
    for local_dev in local_devs:
        device_name = local_dev["device"]
        dev_path = join(self.devices_dir, device_name)
        obj_path = join(dev_path, data_dir)
        tmp_path = join(dev_path, get_tmp_dir(int(policy)))
        if self.mount_check and not ismount(dev_path):
            self.logger.warn(_("%s is not mounted"), device_name)
            continue
        unlink_older_than(tmp_path, time.time() - self.reclaim_age)
        if not os.path.exists(obj_path):
            try:
                mkdirs(obj_path)
            except Exception:
                self.logger.exception("ERROR creating %s" % obj_path)
            # A directory that was just created holds no partitions.
            continue
        for partition in os.listdir(obj_path):
            try:
                job_path = join(obj_path, partition)
                part_nodes = obj_ring.get_part_nodes(int(partition))
                remote_nodes = [peer for peer in part_nodes
                                if peer["id"] != local_dev["id"]]
                jobs.append({
                    "path": job_path,
                    "device": device_name,
                    "nodes": remote_nodes,
                    # Nothing filtered out means we are a handoff.
                    "delete": len(remote_nodes) >= len(part_nodes),
                    "policy_idx": policy.idx,
                    "partition": partition,
                    "object_ring": obj_ring,
                })
            except ValueError:
                continue
def check_drive(root, drive, mount_check):
    """
    Validate that ``root/drive`` is a usable device path.

    The drive name must be unchanged by ``urllib.parse.quote_plus``.
    With ``mount_check`` the path must be a mount point; without it the
    path only has to be an existing directory.

    :param root: base path where the devices are mounted
    :param drive: drive name to be checked
    :param mount_check: additionally require path is mounted
    :returns: full path to the device, or None if drive fails to
              validate
    """
    if urllib.parse.quote_plus(drive) != drive:
        return None
    candidate = os.path.join(root, drive)
    usable = utils.ismount(candidate) if mount_check else isdir(candidate)
    return candidate if usable else None
def check_drive(root, drive, mount_check):
    """
    Validate that ``root/drive`` is a usable device path.

    The drive name must be unchanged by ``urllib.parse.quote_plus``.
    With ``mount_check`` the path must be a mount point; without it the
    path only has to be an existing directory.

    :param root: base path where the devices are mounted
    :param drive: drive name to be checked
    :param mount_check: additionally require path is mounted
    :returns: full path to the device
    :raises ValueError: if drive fails to validate
    """
    if urllib.parse.quote_plus(drive) != drive:
        raise ValueError('%s is not a valid drive name' % drive)
    candidate = os.path.join(root, drive)
    if mount_check and not utils.ismount(candidate):
        raise ValueError('%s is not mounted' % candidate)
    if not mount_check and not isdir(candidate):
        raise ValueError('%s is not a directory' % candidate)
    return candidate
def dispatch(self, replicate_args, args):
    """
    Route one replication request to the handler named by its operation.

    The first element of ``args`` is popped and used as the operation
    name.  ``rsync_then_merge`` and ``complete_rsync`` are called with
    the drive, the database file path and the remaining args.  For any
    other operation the drive's ``tmp`` directory is created first, and
    the method named by the operation is called with a broker for the
    database file and the remaining args.

    :param replicate_args: ``(drive, partition, hash)`` triple
    :param args: request arguments; must provide a ``pop`` method
    :returns: the handler's result, or an ``HTTPBadRequest`` when
              ``args`` has no ``pop``, a 507 ``Response`` when the drive
              is not mounted (with ``mount_check`` set), or an
              ``HTTPNotFound`` when the database file does not exist
    """
    if not hasattr(args, 'pop'):
        return HTTPBadRequest(body='Invalid object type')
    op = args.pop(0)
    drive, partition, hsh = replicate_args
    drive_path = os.path.join(self.root, drive)
    if self.mount_check and not ismount(drive_path):
        return Response(status='507 %s is not mounted' % drive)
    db_file = os.path.join(
        drive_path,
        storage_directory(self.datadir, partition, hsh),
        hsh + '.db')
    if op == 'rsync_then_merge':
        return self.rsync_then_merge(drive, db_file, args)
    if op == 'complete_rsync':
        return self.complete_rsync(drive, db_file, args)
    # someone might be about to rsync a db to us,
    # make sure there's a tmp dir to receive it.
    mkdirs(os.path.join(drive_path, 'tmp'))
    if not os.path.exists(db_file):
        return HTTPNotFound()
    handler = getattr(self, op)
    return handler(self.broker_class(db_file), args)
def run_once(self, *args, **kwargs):
    """
    Run the updater once, sweeping every device in this process.

    Each entry of ``self.devices`` is passed to ``object_sweep`` unless
    ``mount_check`` is set and the entry is not a mount point, in which
    case the ``errors`` counter is incremented and the device skipped.
    The elapsed time and the success/failure counts are logged and the
    elapsed time is recorded via ``dump_recon_cache``.

    :param args: ignored
    :param kwargs: ignored
    """
    self.logger.info(_('Begin object update single threaded sweep'))
    sweep_start = time.time()
    self.successes = 0
    self.failures = 0
    for device in os.listdir(self.devices):
        device_path = os.path.join(self.devices, device)
        if self.mount_check and not ismount(device_path):
            self.logger.increment('errors')
            self.logger.warn(
                _('Skipping %s as it is not mounted'), device)
            continue
        self.object_sweep(device_path)
    elapsed = time.time() - sweep_start
    summary = {
        'elapsed': elapsed,
        'success': self.successes,
        'fail': self.failures,
    }
    self.logger.info(
        _('Object update single threaded sweep completed: '
          '%(elapsed).02fs, %(success)s successes, %(fail)s failures'),
        summary)
    dump_recon_cache({'object_updater_sweep': elapsed},
                     self.rcache, self.logger)
def run_once(self, *args, **kwargs):
    """
    Perform one single-threaded sweep over all devices.

    Counters are reset, each entry returned by ``self._listdir`` for
    ``self.devices`` is swept with ``object_sweep`` unless the mount check
    rejects it, and the elapsed time is logged and passed to
    ``dump_recon_cache``.

    :param args: ignored
    :param kwargs: ignored
    """
    self.logger.info(_('Begin object update single threaded sweep'))
    begin = time.time()
    self.successes = 0
    self.failures = 0

    for device in self._listdir(self.devices):
        dev_path = os.path.join(self.devices, device)
        if not self.mount_check or ismount(dev_path):
            self.object_sweep(os.path.join(self.devices, device))
        else:
            # Unmounted devices are counted as errors and skipped.
            self.logger.increment('errors')
            self.logger.warning(
                _('Skipping %s as it is not mounted'), device)

    elapsed = time.time() - begin
    summary = {
        'elapsed': elapsed,
        'success': self.successes,
        'fail': self.failures,
    }
    self.logger.info(
        _('Object update single threaded sweep completed: '
          '%(elapsed).02fs, %(success)s successes, %(fail)s failures'),
        summary)
    dump_recon_cache({'object_updater_sweep': elapsed},
                     self.rcache, self.logger)
def run_once(self, *args, **kwargs):
    """
    Main entry point when running the reaper in 'once' mode, where it will
    do a single pass over all accounts on the server. This is called
    repeatedly by :func:`run_forever`. This will call :func:`reap_device`
    once for each device on the server.

    Any exception (or ``Timeout``) raised during the pass is logged and
    ends the pass; it is not propagated to the caller.
    """
    self.logger.debug('Begin devices pass: %s', self.devices)
    begin = time()
    try:
        for device in os.listdir(self.devices):
            mounted = not self.mount_check or ismount(
                os.path.join(self.devices, device))
            if mounted:
                self.reap_device(device)
            else:
                self.logger.increment('errors')
                self.logger.debug(
                    _('Skipping %s as it is not mounted'), device)
    except (Exception, Timeout):
        self.logger.exception(_("Exception in top-level account reaper "
                                "loop"))
    elapsed = time() - begin
    self.logger.info(_('Devices pass completed: %.02fs'), elapsed)
def build_replication_jobs(self, policy, ips, old_dict, new_dict,
                           moving_map):
    """
    Helper function for collect_jobs to build jobs for replication
    using replication style storage policy

    :param policy: swift policy object
    :param ips: the local server ips
    :param old_dict: dictionary with devices from old ring (not read here)
    :param new_dict: dictionary with devices from new ring; maps a
                     destination id to ``(replication_ip, device)``
    :param moving_map: the dictionary that contains all the partitions
                       that should be moved, their sources and
                       destinations; keyed ``"<dev id>_<partition>"`` with
                       ``(partition, source id, destination id)`` values
    :returns: list of job dicts, one per local partition that has to move
    """
    jobs = []
    data_dir = get_data_dir(policy)
    # Builtin sets replace the deprecated ``sets.Set`` wrapper.
    devices = set(entry[1] for entry in moving_map.values())
    partitions = set(entry[0] for entry in moving_map.values())
    for local_dev in [dev for dev in policy.object_ring.devs
                      if (dev and
                          is_local_device(ips, self.port,
                                          dev['replication_ip'],
                                          dev['replication_port']))]:
        if self.test:
            print(local_dev['id'])
        # Only devices that are the source of at least one move matter.
        if unicode(local_dev['id']) not in devices:
            continue
        dev_path = join(self.devices_dir, local_dev['device'])
        obj_path = join(dev_path, data_dir)
        tmp_path = join(dev_path, get_tmp_dir(policy))
        if self.mount_check and not ismount(dev_path):
            self.logger.warning('%s is not mounted', local_dev['device'])
            continue
        unlink_older_than(tmp_path, time.time() - self.reclaim_age)
        for partition in os.listdir(obj_path):
            partition = unicode(partition)
            if partition not in partitions:
                continue
            try:
                key = "%s_%s" % (local_dev['id'], partition)
                if key not in moving_map:
                    continue
                job_path = join(obj_path, partition)
                _part, source_id, dest_id = moving_map[key]
                if source_id != unicode(local_dev['id']):
                    continue
                replication_ip, replication_device = new_dict[dest_id]
                node = {
                    'replication_ip': replication_ip,
                    'device': replication_device,
                }
                remote_path = os.path.join(self.devices_dir,
                                           node['device'],
                                           self.mover_tmp_dir)
                jobs.append(
                    dict(path=job_path,
                         device=local_dev['device'],
                         obj_path=obj_path,
                         node=node,
                         policy=policy,
                         partition=partition,
                         remote_path=remote_path))
            except ValueError:
                continue
            except Exception as e:
                # Best effort: log and carry on with the next partition.
                self.logger.exception(
                    "an %s exception occurred at build_replication_jobs",
                    e)
                if self.test:
                    print(e)
    return jobs
def replicate(self, override_devices=None, override_partitions=None,
              override_policies=None):
    """
    Run a replication pass

    Resets the per-pass counters, starts the heartbeat and lockup-detector
    greenthreads, then spawns ``update`` or ``update_deleted`` for every
    job returned by ``collect_jobs``. Failures in the top-level loop are
    recorded through ``_add_failure_stats`` and logged rather than raised.

    :param override_devices: if truthy, jobs on other devices are skipped
    :param override_partitions: if truthy, jobs for other partitions are
                                skipped
    :param override_policies: passed through to ``collect_jobs``
    """
    def failure_pairs(nodes):
        # (replication_ip, device) pairs in the form _add_failure_stats
        # is given everywhere in this method.
        return [(failure_dev['replication_ip'], failure_dev['device'])
                for failure_dev in nodes]

    self.start = time.time()
    self.suffix_count = 0
    self.suffix_sync = 0
    self.suffix_hash = 0
    self.replication_count = 0
    self.last_replication_count = -1
    self.replication_cycle = (self.replication_cycle + 1) % 10
    self.partition_times = []
    self.my_replication_ips = self._get_my_replication_ips()
    self.all_devs_info = set()
    self.handoffs_remaining = 0

    stats = eventlet.spawn(self.heartbeat)
    lockup_detector = eventlet.spawn(self.detect_lockups)
    eventlet.sleep()  # Give spawns a cycle

    current_nodes = None
    try:
        self.run_pool = GreenPool(size=self.concurrency)
        jobs = self.collect_jobs(override_devices=override_devices,
                                 override_partitions=override_partitions,
                                 override_policies=override_policies)
        for job in jobs:
            current_nodes = job['nodes']
            if override_devices and job['device'] not in override_devices:
                continue
            if override_partitions and \
                    job['partition'] not in override_partitions:
                continue
            dev_path = join(self.devices_dir, job['device'])
            if self.mount_check and not ismount(dev_path):
                self._add_failure_stats(failure_pairs(job['nodes']))
                self.logger.warning(_('%s is not mounted'), job['device'])
                continue
            # in handoffs first mode, we won't process primary
            # partitions until rebalance was successful!
            if (self.handoffs_first and not job['delete'] and
                    self.handoffs_remaining):
                self.logger.warning(
                    _("Handoffs first mode still has handoffs "
                      "remaining.  Aborting current "
                      "replication pass."))
                break
            if not self.check_ring(job['policy'].object_ring):
                self.logger.info(
                    _("Ring change detected. Aborting "
                      "current replication pass."))
                return
            try:
                if isfile(job['path']):
                    # Clean up any (probably zero-byte) files where a
                    # partition should be.
                    self.logger.warning(
                        'Removing partition directory '
                        'which was a file: %s', job['path'])
                    os.remove(job['path'])
                    continue
            except OSError:
                continue
            worker = self.update_deleted if job['delete'] else self.update
            self.run_pool.spawn(worker, job)
        current_nodes = None
        with Timeout(self.lockup_timeout):
            self.run_pool.waitall()
    except (Exception, Timeout):
        if current_nodes:
            self._add_failure_stats(failure_pairs(current_nodes))
        else:
            self._add_failure_stats(self.all_devs_info)
        self.logger.exception(_("Exception in top-level replication loop"))
        self.kill_coros()
    finally:
        stats.kill()
        lockup_detector.kill()
        self.stats_line()
        self.stats['attempted'] = self.replication_count
def build_replication_jobs(self, policy, ips, override_devices=None,
                           override_partitions=None):
    """
    Helper function for collect_jobs to build jobs for replication
    using replication style storage policy

    :param policy: storage policy whose ``object_ring`` is walked
    :param ips: local IPs handed to ``is_local_device``
    :param override_devices: if not None, only local devices whose name
                             is in this collection are considered
    :param override_partitions: if not None, only partition directory
                                names in this collection are considered
    :returns: list of job dicts, one per partition directory accepted
    """
    def ring_failures():
        # Every populated ring slot as a (replication_ip, device) pair.
        return [(failure_dev['replication_ip'], failure_dev['device'])
                for failure_dev in policy.object_ring.devs
                if failure_dev]

    jobs = []
    self.all_devs_info.update([(dev['replication_ip'], dev['device'])
                               for dev in policy.object_ring.devs if dev])
    data_dir = get_data_dir(policy)
    local_devs = [
        dev for dev in policy.object_ring.devs
        if (dev and
            is_local_device(ips, self.port,
                            dev['replication_ip'],
                            dev['replication_port']) and
            (override_devices is None or
             dev['device'] in override_devices))]
    found_local = bool(local_devs)
    for local_dev in local_devs:
        dev_path = join(self.devices_dir, local_dev['device'])
        obj_path = join(dev_path, data_dir)
        tmp_path = join(dev_path, get_tmp_dir(policy))
        if self.mount_check and not ismount(dev_path):
            self._add_failure_stats(ring_failures())
            self.logger.warning(_('%s is not mounted'),
                                local_dev['device'])
            continue
        unlink_older_than(tmp_path, time.time() - self.reclaim_age)
        if not os.path.exists(obj_path):
            # A freshly created data dir has no partitions to visit.
            try:
                mkdirs(obj_path)
            except Exception:
                self.logger.exception('ERROR creating %s' % obj_path)
            continue
        for partition in os.listdir(obj_path):
            filtered_out = (override_partitions is not None and
                            partition not in override_partitions)
            # ignore auditor status files
            auditor_file = (not filtered_out and
                            partition.startswith('auditor_status_') and
                            partition.endswith('.json'))
            if filtered_out or auditor_file:
                continue
            part_nodes = None
            try:
                job_path = join(obj_path, partition)
                part_nodes = policy.object_ring.get_part_nodes(
                    int(partition))
                nodes = [node for node in part_nodes
                         if node['id'] != local_dev['id']]
                jobs.append({
                    'path': job_path,
                    'device': local_dev['device'],
                    'obj_path': obj_path,
                    'nodes': nodes,
                    'delete': len(nodes) > len(part_nodes) - 1,
                    'policy': policy,
                    'partition': partition,
                    'region': local_dev['region'],
                })
            except ValueError:
                if part_nodes:
                    self._add_failure_stats(
                        [(failure_dev['replication_ip'],
                          failure_dev['device'])
                         for failure_dev in nodes])
                else:
                    self._add_failure_stats(ring_failures())
                continue
    if not found_local:
        self.logger.error(
            "Can't find itself in policy with index %d with"
            " ips %s and with port %s in ring file, not"
            " replicating",
            int(policy), ", ".join(ips), self.port)
    return jobs
def build_replication_jobs(self, policy, ips, old_dict, new_dict,
                           moving_map):
    """
    Helper function for collect_jobs to build jobs for replication
    using replication style storage policy

    :param policy: swift policy object
    :param ips: the local server ips
    :param old_dict: dictionary with devices from old ring (not read here)
    :param new_dict: dictionary with devices from new ring; maps a
                     destination id to ``(replication_ip, device)``
    :param moving_map: the dictionary that contains all the partitions
                       that should be moved, their sources and
                       destinations; keyed ``"<dev id>_<partition>"`` with
                       ``(partition, source id, destination id)`` values
    :returns: list of job dicts, one per local partition that has to move
    """
    jobs = []
    data_dir = get_data_dir(policy)
    # Builtin sets replace the deprecated ``sets.Set`` wrapper.
    devices = set(entry[1] for entry in moving_map.values())
    partitions = set(entry[0] for entry in moving_map.values())
    local_devs = [
        dev for dev in policy.object_ring.devs
        if (dev and is_local_device(ips, self.port,
                                    dev['replication_ip'],
                                    dev['replication_port']))
    ]
    for local_dev in local_devs:
        if self.test:
            print(local_dev['id'])
        # Only devices that are the source of at least one move matter.
        if unicode(local_dev['id']) not in devices:
            continue
        dev_path = join(self.devices_dir, local_dev['device'])
        obj_path = join(dev_path, data_dir)
        tmp_path = join(dev_path, get_tmp_dir(policy))
        if self.mount_check and not ismount(dev_path):
            self.logger.warning('%s is not mounted', local_dev['device'])
            continue
        unlink_older_than(tmp_path, time.time() - self.reclaim_age)
        for partition in os.listdir(obj_path):
            partition = unicode(partition)
            if partition not in partitions:
                continue
            try:
                key = "%s_%s" % (local_dev['id'], partition)
                if key not in moving_map:
                    continue
                job_path = join(obj_path, partition)
                _part, source_id, dest_id = moving_map[key]
                if source_id != unicode(local_dev['id']):
                    continue
                replication_ip, replication_device = new_dict[dest_id]
                node = {
                    'replication_ip': replication_ip,
                    'device': replication_device,
                }
                remote_path = os.path.join(self.devices_dir,
                                           node['device'],
                                           self.mover_tmp_dir)
                jobs.append(
                    dict(path=job_path,
                         device=local_dev['device'],
                         obj_path=obj_path,
                         node=node,
                         policy=policy,
                         partition=partition,
                         remote_path=remote_path))
            except ValueError:
                continue
            except Exception as e:
                # Best effort: log and carry on with the next partition.
                self.logger.exception(
                    "an %s exception occurred at build_replication_jobs",
                    e)
                if self.test:
                    print(e)
    return jobs
def collect_parts(self, override_devices=None, override_partitions=None):
    """
    Helper for yielding partitions in the top level reconstructor

    Walks every EC policy, and for each local device of that policy's
    ring yields one dict per partition directory found in its data dir.
    Entries in the data dir that are not all-digit directories are logged
    and removed with ``remove_file``.

    :param override_devices: if non-empty, only devices named here are
                             visited
    :param override_partitions: if non-empty, only these (integer)
                                partitions are yielded
    :returns: generator of dicts with keys ``local_dev``, ``policy``,
              ``partition`` (int) and ``part_path``
    """
    override_devices = override_devices or []
    override_partitions = override_partitions or []
    ips = whataremyips()
    for policy in POLICIES:
        if policy.policy_type != EC_POLICY:
            continue
        self._diskfile_mgr = self._df_router[policy]
        self.load_object_ring(policy)
        data_dir = get_data_dir(policy)
        # Lazy filter, as before; a generator expression is used because
        # ``itertools.ifilter`` does not exist on Python 3.
        local_devices = (
            dev for dev in policy.object_ring.devs
            if dev and is_local_device(
                ips, self.port,
                dev['replication_ip'], dev['replication_port']))
        for local_dev in local_devices:
            if override_devices and (local_dev['device'] not in
                                     override_devices):
                continue
            dev_path = join(self.devices_dir, local_dev['device'])
            obj_path = join(dev_path, data_dir)
            tmp_path = join(dev_path, get_tmp_dir(int(policy)))
            if self.mount_check and not ismount(dev_path):
                self.logger.warning(_('%s is not mounted'),
                                    local_dev['device'])
                continue
            unlink_older_than(tmp_path, time.time() - self.reclaim_age)
            if not os.path.exists(obj_path):
                # A freshly created data dir has no partitions to yield.
                try:
                    mkdirs(obj_path)
                except Exception:
                    self.logger.exception(
                        'Unable to create %s' % obj_path)
                continue
            try:
                partitions = os.listdir(obj_path)
            except OSError:
                self.logger.exception(
                    'Unable to list partitions in %r' % obj_path)
                continue
            for partition in partitions:
                part_path = join(obj_path, partition)
                if not (partition.isdigit() and
                        os.path.isdir(part_path)):
                    self.logger.warning(
                        'Unexpected entity in data dir: %r' % part_path)
                    remove_file(part_path)
                    continue
                partition = int(partition)
                if override_partitions and (partition not in
                                            override_partitions):
                    continue
                part_info = {
                    'local_dev': local_dev,
                    'policy': policy,
                    'partition': partition,
                    'part_path': part_path,
                }
                yield part_info
def collect_jobs(self):
    """
    Returns a sorted list of jobs (dictionaries) that specify the
    partitions, nodes, etc to be synced.

    Jobs are shuffled; when ``self.handoffs_first`` is set, jobs whose
    ``delete`` flag is true are then moved to the front. The number of
    jobs is stored in ``self.job_count``.

    :returns: list of job dicts with keys ``path``, ``device``, ``nodes``,
              ``delete`` and ``partition``
    """
    jobs = []
    ips = whataremyips()
    for local_dev in [dev for dev in self.object_ring.devs
                      if dev and dev['replication_ip'] in ips and
                      dev['replication_port'] == self.port]:
        dev_path = join(self.devices_dir, local_dev['device'])
        obj_path = join(dev_path, 'objects')
        tmp_path = join(dev_path, 'tmp')
        if self.mount_check and not ismount(dev_path):
            self.logger.warning(_('%s is not mounted'),
                                local_dev['device'])
            continue
        unlink_older_than(tmp_path, time.time() - self.reclaim_age)
        if not os.path.exists(obj_path):
            # A freshly created objects dir has no partitions to visit.
            try:
                mkdirs(obj_path)
            except Exception:
                self.logger.exception('ERROR creating %s' % obj_path)
            continue
        for partition in os.listdir(obj_path):
            try:
                job_path = join(obj_path, partition)
                if isfile(job_path):
                    # Clean up any (probably zero-byte) files where a
                    # partition should be.
                    self.logger.warning('Removing partition directory '
                                        'which was a file: %s', job_path)
                    os.remove(job_path)
                    continue
                part_nodes = \
                    self.object_ring.get_part_nodes(int(partition))
                # Every node assigned to the partition except this device.
                nodes = [node for node in part_nodes
                         if node['id'] != local_dev['id']]
                # NOTE(review): this goes to the root ``logging`` logger,
                # not ``self.logger`` -- confirm that is intended.
                logging.info("===Replication nodes===%s", str(nodes))
                jobs.append(
                    dict(path=job_path,
                         device=local_dev['device'],
                         nodes=nodes,
                         delete=len(nodes) > len(part_nodes) - 1,
                         partition=partition))
            except (ValueError, OSError):
                continue
    random.shuffle(jobs)
    if self.handoffs_first:
        # Move the handoff parts to the front of the list
        jobs.sort(key=lambda job: not job['delete'])
    self.job_count = len(jobs)
    return jobs
def process_repl(self, policy, ips, override_devices=None,
                 override_partitions=None):
    """
    Helper function for collect_jobs to build jobs for replication
    using replication style storage policy

    Nodes whose device name appears in the spin-down list file are left
    out of each job's ``nodes``.

    :param policy: storage policy; ``policy.idx`` selects ring and data dir
    :param ips: local IPs handed to ``is_local_device``
    :param override_devices: if not None, only local devices whose name
                             is in this collection are considered
    :param override_partitions: if not None, only partition directory
                                names in this collection are considered
    :returns: list of job dicts, one per partition directory accepted
    """
    jobs = []
    obj_ring = self.get_object_ring(policy.idx)
    data_dir = get_data_dir(policy.idx)
    for local_dev in [dev for dev in obj_ring.devs
                      if (dev and
                          is_local_device(ips, self.port,
                                          dev['replication_ip'],
                                          dev['replication_port']) and
                          (override_devices is None or
                           dev['device'] in override_devices))]:
        dev_path = join(self.devices_dir, local_dev['device'])
        obj_path = join(dev_path, data_dir)
        tmp_path = join(dev_path, get_tmp_dir(int(policy)))
        if self.mount_check and not ismount(dev_path):
            self.logger.warning(_('%s is not mounted'),
                                local_dev['device'])
            continue
        unlink_older_than(tmp_path, time.time() - self.reclaim_age)
        if not os.path.exists(obj_path):
            # A freshly created data dir has no partitions to visit.
            try:
                mkdirs(obj_path)
            except Exception:
                self.logger.exception('ERROR creating %s' % obj_path)
            continue
        for partition in os.listdir(obj_path):
            if (override_partitions is not None
                    and partition not in override_partitions):
                continue
            try:
                job_path = join(obj_path, partition)
                part_nodes = obj_ring.get_part_nodes(int(partition))
                # The list is re-read for every partition, as before; the
                # context manager closes the handle even if read() fails.
                with open("/home/hduser/swift/swift/proxy/controllers/"
                          "spindowndevices") as f:
                    downlist = f.read().split("\n")
                nodes = [node for node in part_nodes
                         if node['id'] != local_dev['id']
                         and node['device'] not in downlist]
                print("===Replication nodes===", nodes)
                # NOTE(review): filtering spun-down devices shrinks
                # ``nodes``, so ``delete`` may come out False on a handoff
                # device -- confirm this is intended.
                jobs.append(
                    dict(path=job_path,
                         device=local_dev['device'],
                         obj_path=obj_path,
                         nodes=nodes,
                         delete=len(nodes) > len(part_nodes) - 1,
                         policy_idx=policy.idx,
                         partition=partition,
                         object_ring=obj_ring,
                         region=local_dev['region']))
            except ValueError:
                continue
    return jobs
def build_replication_jobs(self, policy, ips, override_devices=None,
                           override_partitions=None):
    """
    Helper function for collect_jobs to build jobs for replication
    using replication style storage policy

    :param policy: storage policy whose ``object_ring`` is walked
    :param ips: local IPs handed to ``is_local_device``
    :param override_devices: if not None, only local devices whose name
                             is in this collection are considered
    :param override_partitions: if not None, only partition directory
                                names in this collection are considered
    :returns: list of job dicts, one per partition directory accepted
    """
    def pairs(devs):
        # (replication_ip, device) for each populated entry of ``devs``.
        return [(d["replication_ip"], d["device"]) for d in devs if d]

    jobs = []
    self.all_devs_info.update(pairs(policy.object_ring.devs))
    data_dir = get_data_dir(policy)
    candidates = [
        dev for dev in policy.object_ring.devs
        if (dev
            and is_local_device(ips, self.port,
                                dev["replication_ip"],
                                dev["replication_port"])
            and (override_devices is None
                 or dev["device"] in override_devices))
    ]
    found_local = len(candidates) > 0
    for local_dev in candidates:
        dev_path = join(self.devices_dir, local_dev["device"])
        obj_path = join(dev_path, data_dir)
        tmp_path = join(dev_path, get_tmp_dir(policy))
        if self.mount_check and not ismount(dev_path):
            self._add_failure_stats(pairs(policy.object_ring.devs))
            self.logger.warning(_("%s is not mounted"),
                                local_dev["device"])
            continue
        unlink_older_than(tmp_path, time.time() - self.reclaim_age)
        if not os.path.exists(obj_path):
            # A freshly created data dir has no partitions to visit.
            try:
                mkdirs(obj_path)
            except Exception:
                self.logger.exception("ERROR creating %s" % obj_path)
            continue
        for partition in os.listdir(obj_path):
            wanted = (override_partitions is None
                      or partition in override_partitions)
            if not wanted:
                continue
            part_nodes = None
            try:
                job_path = join(obj_path, partition)
                part_nodes = policy.object_ring.get_part_nodes(
                    int(partition))
                nodes = [node for node in part_nodes
                         if node["id"] != local_dev["id"]]
                job = dict(
                    path=job_path,
                    device=local_dev["device"],
                    obj_path=obj_path,
                    nodes=nodes,
                    delete=len(nodes) > len(part_nodes) - 1,
                    policy=policy,
                    partition=partition,
                    region=local_dev["region"],
                )
                jobs.append(job)
            except ValueError:
                if part_nodes:
                    self._add_failure_stats(
                        [(failure_dev["replication_ip"],
                          failure_dev["device"])
                         for failure_dev in nodes])
                else:
                    self._add_failure_stats(
                        pairs(policy.object_ring.devs))
                continue
    if not found_local:
        self.logger.error(
            "Can't find itself %s with port %s in ring "
            "file, not replicating",
            ", ".join(ips), self.port)
    return jobs
def replicate(self, override_devices=None, override_partitions=None,
              override_policies=None):
    """
    Run a replication pass

    Resets the per-pass counters, starts the heartbeat and lockup-detector
    greenthreads, then spawns ``update`` or ``update_deleted`` for every
    job returned by ``collect_jobs``. Failures in the top-level loop are
    recorded through ``_add_failure_stats`` and logged rather than raised.

    :param override_devices: if truthy, jobs on other devices are skipped
    :param override_partitions: if truthy, jobs for other partitions are
                                skipped
    :param override_policies: passed through to ``collect_jobs``
    """
    self.start = time.time()
    self.suffix_count = 0
    self.suffix_sync = 0
    self.suffix_hash = 0
    self.replication_count = 0
    self.last_replication_count = -1
    self.partition_times = []
    self.my_replication_ips = self._get_my_replication_ips()
    self.all_devs_info = set()

    stats = eventlet.spawn(self.heartbeat)
    lockup_detector = eventlet.spawn(self.detect_lockups)
    eventlet.sleep()  # Give spawns a cycle

    # Tracks the nodes of the job in flight so a failure in the loop can
    # be attributed to them; None once the loop has finished.
    current_nodes = None
    try:
        self.run_pool = GreenPool(size=self.concurrency)
        jobs = self.collect_jobs(override_devices=override_devices,
                                 override_partitions=override_partitions,
                                 override_policies=override_policies)
        for job in jobs:
            current_nodes = job['nodes']
            if override_devices and job['device'] not in override_devices:
                continue
            if override_partitions and \
                    job['partition'] not in override_partitions:
                continue
            dev_path = join(self.devices_dir, job['device'])
            if self.mount_check and not ismount(dev_path):
                self._add_failure_stats([(failure_dev['replication_ip'],
                                          failure_dev['device'])
                                         for failure_dev in job['nodes']])
                # ``warning`` is the supported spelling; ``warn`` is a
                # deprecated alias in the logging API.
                self.logger.warning(_('%s is not mounted'), job['device'])
                continue
            if not self.check_ring(job['policy'].object_ring):
                self.logger.info(_("Ring change detected. Aborting "
                                   "current replication pass."))
                return
            try:
                if isfile(job['path']):
                    # Clean up any (probably zero-byte) files where a
                    # partition should be.
                    self.logger.warning(
                        'Removing partition directory '
                        'which was a file: %s', job['path'])
                    os.remove(job['path'])
                    continue
            except OSError:
                continue
            if job['delete']:
                self.run_pool.spawn(self.update_deleted, job)
            else:
                self.run_pool.spawn(self.update, job)
        current_nodes = None
        with Timeout(self.lockup_timeout):
            self.run_pool.waitall()
    except (Exception, Timeout):
        if current_nodes:
            self._add_failure_stats([(failure_dev['replication_ip'],
                                      failure_dev['device'])
                                     for failure_dev in current_nodes])
        else:
            self._add_failure_stats(self.all_devs_info)
        self.logger.exception(_("Exception in top-level replication loop"))
        self.kill_coros()
    finally:
        # Runs on every exit path, including the early ``return`` above.
        stats.kill()
        lockup_detector.kill()
        self.stats_line()
        self.stats['attempted'] = self.replication_count
def build_replication_jobs(self, policy, ips, override_devices=None,
                           override_partitions=None):
    """
    Helper function for collect_jobs to build jobs for replication
    using replication style storage policy

    :param policy: storage policy whose ``object_ring`` is walked; also
                   selects the diskfile manager supplying ``reclaim_age``
    :param ips: local IPs handed to ``is_local_device``
    :param override_devices: if not None, only local devices whose name
                             is in this collection are considered
    :param override_partitions: if not None, only partition directory
                                names in this collection are considered
    :returns: list of job dicts, one per partition directory accepted
    """
    def is_candidate(dev):
        # Populated ring slot, local to this server, and not excluded
        # by the device override.
        return (dev and
                is_local_device(ips, self.port,
                                dev['replication_ip'],
                                dev['replication_port']) and
                (override_devices is None or
                 dev['device'] in override_devices))

    def all_ring_pairs():
        return [(failure_dev['replication_ip'], failure_dev['device'])
                for failure_dev in policy.object_ring.devs
                if failure_dev]

    jobs = []
    df_mgr = self._df_router[policy]
    self.all_devs_info.update(
        [(dev['replication_ip'], dev['device'])
         for dev in policy.object_ring.devs if dev])
    data_dir = get_data_dir(policy)
    found_local = False
    for local_dev in [dev for dev in policy.object_ring.devs
                      if is_candidate(dev)]:
        found_local = True
        dev_path = join(self.devices_dir, local_dev['device'])
        obj_path = join(dev_path, data_dir)
        tmp_path = join(dev_path, get_tmp_dir(policy))
        if self.mount_check and not ismount(dev_path):
            self._add_failure_stats(all_ring_pairs())
            self.logger.warning(
                _('%s is not mounted'), local_dev['device'])
            continue
        unlink_older_than(tmp_path, time.time() - df_mgr.reclaim_age)
        if not os.path.exists(obj_path):
            # A freshly created data dir has no partitions to visit.
            try:
                mkdirs(obj_path)
            except Exception:
                self.logger.exception('ERROR creating %s' % obj_path)
            continue
        for partition in os.listdir(obj_path):
            if (override_partitions is not None
                    and partition not in override_partitions):
                continue
            # ignore auditor status files
            if (partition.startswith('auditor_status_') and
                    partition.endswith('.json')):
                continue
            part_nodes = None
            try:
                job_path = join(obj_path, partition)
                part_nodes = policy.object_ring.get_part_nodes(
                    int(partition))
                nodes = [node for node in part_nodes
                         if node['id'] != local_dev['id']]
                is_handoff = len(nodes) > len(part_nodes) - 1
                jobs.append(
                    dict(path=job_path,
                         device=local_dev['device'],
                         obj_path=obj_path,
                         nodes=nodes,
                         delete=is_handoff,
                         policy=policy,
                         partition=partition,
                         region=local_dev['region']))
            except ValueError:
                if part_nodes:
                    self._add_failure_stats(
                        [(failure_dev['replication_ip'],
                          failure_dev['device'])
                         for failure_dev in nodes])
                else:
                    self._add_failure_stats(all_ring_pairs())
                continue
    if not found_local:
        self.logger.error("Can't find itself in policy with index %d with"
                          " ips %s and with port %s in ring file, not"
                          " replicating",
                          int(policy), ", ".join(ips), self.port)
    return jobs