Example no. 1
0
 def run_once(self, *args, **kwargs):
     """Run a replication pass once."""
     self._zero_stats()
     dirs = []
     ips = whataremyips()
     if not ips:
         self.logger.error(_('ERROR Failed to get my own IPs?'))
         return
     self._local_device_ids = set()
     for node in self.ring.devs:
         if node and is_local_device(ips, self.port,
                                     node['replication_ip'],
                                     node['replication_port']):
             if self.mount_check and not ismount(
                     os.path.join(self.root, node['device'])):
                 self.logger.warn(
                     _('Skipping %(device)s as it is not mounted') % node)
                 continue
             unlink_older_than(
                 os.path.join(self.root, node['device'], 'tmp'),
                 time.time() - self.reclaim_age)
             datadir = os.path.join(self.root, node['device'], self.datadir)
             if os.path.isdir(datadir):
                 self._local_device_ids.add(node['id'])
                 dirs.append((datadir, node['id']))
     self.logger.info(_('Beginning replication run'))
     for part, object_file, node_id in roundrobin_datadirs(dirs):
         self.cpool.spawn_n(
             self._replicate_object, part, object_file, node_id)
     self.cpool.waitall()
     self.logger.info(_('Replication run OVER'))
     self._report_stats()
Example no. 2
0
 def run_once(self, *args, **kwargs):
     """Run a replication pass once."""
     self._zero_stats()
     dirs = []
     ips = whataremyips()
     if not ips:
         self.logger.error(_('ERROR Failed to get my own IPs?'))
         return
     for node in self.ring.devs:
         if (node and node['replication_ip'] in ips
                 and node['replication_port'] == self.port):
             if self.mount_check and not ismount(
                     os.path.join(self.root, node['device'])):
                 self.logger.warn(
                     _('Skipping %(device)s as it is not mounted') % node)
                 continue
             unlink_older_than(
                 os.path.join(self.root, node['device'], 'tmp'),
                 time.time() - self.reclaim_age)
             datadir = os.path.join(self.root, node['device'], self.datadir)
             if os.path.isdir(datadir):
                 dirs.append((datadir, node['id']))
     self.logger.info(_('Beginning replication run'))
     for part, object_file, node_id in roundrobin_datadirs(dirs):
         self.cpool.spawn_n(self._replicate_object, part, object_file,
                            node_id)
     self.cpool.waitall()
     self.logger.info(_('Replication run OVER'))
     self._report_stats()
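For comparison, Example no. 1 filters local devices through the is_local_device() helper while this version inlines the membership test; a minimal sketch of what such a check boils down to (a simplified assumption, not the actual Swift helper, which handles more edge cases):

def is_local_device_sketch(my_ips, my_port, dev_ip, dev_port):
    # Hypothetical simplified check: a ring device is "local" when its
    # replication IP is one of this host's IPs and its replication port
    # matches the port this daemon serves.
    return dev_ip in my_ips and dev_port == my_port

assert is_local_device_sketch(['10.0.0.1'], 6000, '10.0.0.1', 6000)
assert not is_local_device_sketch(['10.0.0.1'], 6000, '10.0.0.2', 6000)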
Example no. 3
0
 def collect_parts(self, override_devices=None, override_partitions=None):
     """
     Helper for yielding partitions in the top level reconstructor
     """
     override_devices = override_devices or []
     override_partitions = override_partitions or []
     ips = whataremyips()
     for policy in POLICIES:
         if policy.policy_type != EC_POLICY:
             continue
         self._diskfile_mgr = self._df_router[policy]
         self.load_object_ring(policy)
         data_dir = get_data_dir(policy)
         local_devices = itertools.ifilter(
             lambda dev: dev and is_local_device(ips, self.port, dev[
                 'replication_ip'], dev['replication_port']),
             policy.object_ring.devs)
         for local_dev in local_devices:
             if override_devices and (local_dev['device']
                                      not in override_devices):
                 continue
             dev_path = join(self.devices_dir, local_dev['device'])
             obj_path = join(dev_path, data_dir)
             tmp_path = join(dev_path, get_tmp_dir(int(policy)))
             if self.mount_check and not ismount(dev_path):
                 self.logger.warn(_('%s is not mounted'),
                                  local_dev['device'])
                 continue
             unlink_older_than(tmp_path, time.time() - self.reclaim_age)
             if not os.path.exists(obj_path):
                 try:
                     mkdirs(obj_path)
                 except Exception:
                     self.logger.exception('Unable to create %s' % obj_path)
                 continue
             try:
                 partitions = os.listdir(obj_path)
             except OSError:
                 self.logger.exception('Unable to list partitions in %r' %
                                       obj_path)
                 continue
             for partition in partitions:
                 part_path = join(obj_path, partition)
                 if not (partition.isdigit() and os.path.isdir(part_path)):
                     self.logger.warning(
                         'Unexpected entity in data dir: %r' % part_path)
                     remove_file(part_path)
                     continue
                 partition = int(partition)
                 if override_partitions and (partition
                                             not in override_partitions):
                     continue
                 part_info = {
                     'local_dev': local_dev,
                     'policy': policy,
                     'partition': partition,
                     'part_path': part_path,
                 }
                 yield part_info
Example no. 4
0
 def collect_jobs(self):
     """
     Returns a sorted list of jobs (dictionaries) that specify the
     partitions, nodes, etc to be synced.
     """
     jobs = []
     ips = whataremyips()
     for local_dev in [
             dev for dev in self.object_ring.devs
             if dev and dev['replication_ip'] in ips
             and dev['replication_port'] == self.port
     ]:
         dev_path = join(self.devices_dir, local_dev['device'])
         obj_path = join(dev_path, 'objects')
         tmp_path = join(dev_path, 'tmp')
         if self.mount_check and not ismount(dev_path):
             self.logger.warn(_('%s is not mounted'), local_dev['device'])
             continue
         unlink_older_than(tmp_path, time.time() - self.reclaim_age)
         if not os.path.exists(obj_path):
             try:
                 mkdirs(obj_path)
             except Exception:
                 self.logger.exception('ERROR creating %s' % obj_path)
             continue
         for partition in os.listdir(obj_path):
             try:
                 job_path = join(obj_path, partition)
                 if isfile(job_path):
                     # Clean up any (probably zero-byte) files where a
                     # partition should be.
                     self.logger.warning(
                         'Removing partition directory '
                         'which was a file: %s', job_path)
                     os.remove(job_path)
                     continue
                 part_nodes = \
                     self.object_ring.get_part_nodes(int(partition))
                 nodes = [
                     node for node in part_nodes
                     if node['id'] != local_dev['id']
                 ]
                 jobs.append(
                     dict(path=job_path,
                          device=local_dev['device'],
                          nodes=nodes,
                          delete=len(nodes) > len(part_nodes) - 1,
                          partition=partition))
             except (ValueError, OSError):
                 continue
     random.shuffle(jobs)
     if self.handoffs_first:
         # Move the handoff parts to the front of the list
         jobs.sort(key=lambda job: not job['delete'])
     self.job_count = len(jobs)
     return jobs
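The handoffs_first branch relies on Python's stable sort and on False ordering before True: jobs with delete=True (handoff partitions this device is not a primary for) move to the front while the shuffled order is preserved within each group. A small illustrative sketch with made-up jobs:

import random

jobs = [{'partition': str(p), 'delete': p % 2 == 0} for p in range(6)]
random.shuffle(jobs)
# key is False (sorts first) for handoff jobs and True for primary jobs;
# sort() is stable, so the shuffled order survives inside each group.
jobs.sort(key=lambda job: not job['delete'])
assert all(job['delete'] for job in jobs[:3])
assert not any(job['delete'] for job in jobs[3:])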
Example no. 5
0
    def replicate(self,
                  override_devices=None,
                  override_partitions=None,
                  override_policies=None):
        """Run a replication pass"""
        self.start = time.time()
        self.suffix_count = 0
        self.suffix_sync = 0
        self.suffix_hash = 0
        self.replication_count = 0
        self.last_replication_count = -1
        self.partition_times = []

        stats = eventlet.spawn(self.heartbeat)
        lockup_detector = eventlet.spawn(self.detect_lockups)
        eventlet.sleep()  # Give spawns a cycle

        try:
            self.run_pool = GreenPool(size=self.concurrency)
            jobs = self.collect_jobs(override_devices=override_devices,
                                     override_partitions=override_partitions,
                                     override_policies=override_policies)
            for job in jobs:
                dev_path = join(self.devices_dir, job['device'])
                if self.mount_check and not ismount(dev_path):
                    self.logger.warn(_('%s is not mounted'), job['device'])
                    continue
                if not self.check_ring(job['policy'].object_ring):
                    self.logger.info(
                        _("Ring change detected. Aborting "
                          "current replication pass."))
                    return
                try:
                    if isfile(job['path']):
                        # Clean up any (probably zero-byte) files where a
                        # partition should be.
                        self.logger.warning(
                            'Removing partition directory '
                            'which was a file: %s', job['path'])
                        os.remove(job['path'])
                        continue
                except OSError:
                    continue
                if job['delete']:
                    self.run_pool.spawn(self.update_deleted, job)
                else:
                    self.run_pool.spawn(self.update, job)
            with Timeout(self.lockup_timeout):
                self.run_pool.waitall()
        except (Exception, Timeout):
            self.logger.exception(_("Exception in top-level replication loop"))
            self.kill_coros()
        finally:
            stats.kill()
            lockup_detector.kill()
            self.stats_line()
Example no. 6
0
    def replicate(self, override_devices=None, override_partitions=None):
        """Run a replication pass"""
        self.start = time.time()
        self.suffix_count = 0
        self.suffix_sync = 0
        self.suffix_hash = 0
        self.replication_count = 0
        self.last_replication_count = -1
        self.partition_times = []

        if override_devices is None:
            override_devices = []
        if override_partitions is None:
            override_partitions = []

        stats = eventlet.spawn(self.heartbeat)
        lockup_detector = eventlet.spawn(self.detect_lockups)
        eventlet.sleep()  # Give spawns a cycle

        try:
            self.run_pool = GreenPool(size=self.concurrency)
            jobs = self.collect_jobs()
            for job in jobs:
                if override_devices and job["device"] not in override_devices:
                    continue
                if override_partitions and job["partition"] not in override_partitions:
                    continue
                dev_path = join(self.devices_dir, job["device"])
                if self.mount_check and not ismount(dev_path):
                    self.logger.warn(_("%s is not mounted"), job["device"])
                    continue
                if not self.check_ring(job["object_ring"]):
                    self.logger.info(_("Ring change detected. Aborting " "current replication pass."))
                    return
                try:
                    if isfile(job["path"]):
                        # Clean up any (probably zero-byte) files where a
                        # partition should be.
                        self.logger.warning("Removing partition directory " "which was a file: %s", job["path"])
                        os.remove(job["path"])
                        continue
                except OSError:
                    continue
                if job["delete"]:
                    self.run_pool.spawn(self.update_deleted, job)
                else:
                    self.run_pool.spawn(self.update, job)
            with Timeout(self.lockup_timeout):
                self.run_pool.waitall()
        except (Exception, Timeout):
            self.logger.exception(_("Exception in top-level replication loop"))
            self.kill_coros()
        finally:
            stats.kill()
            lockup_detector.kill()
            self.stats_line()
Example no. 7
0
    def build_replication_jobs(self,
                               policy,
                               ips,
                               override_devices=None,
                               override_partitions=None):
        """
        Helper function for collect_jobs to build jobs for replication
        using replication style storage policy
        """
        jobs = []
        data_dir = get_data_dir(policy)
        for local_dev in [
                dev for dev in policy.object_ring.devs if
            (dev and is_local_device(ips, self.port, dev['replication_ip'],
                                     dev['replication_port']) and
             (override_devices is None or dev['device'] in override_devices))
        ]:
            dev_path = join(self.devices_dir, local_dev['device'])
            obj_path = join(dev_path, data_dir)
            tmp_path = join(dev_path, get_tmp_dir(policy))
            if self.mount_check and not ismount(dev_path):
                self.logger.warn(_('%s is not mounted'), local_dev['device'])
                continue
            unlink_older_than(tmp_path, time.time() - self.reclaim_age)
            if not os.path.exists(obj_path):
                try:
                    mkdirs(obj_path)
                except Exception:
                    self.logger.exception('ERROR creating %s' % obj_path)
                continue
            for partition in os.listdir(obj_path):
                if (override_partitions is not None
                        and partition not in override_partitions):
                    continue

                try:
                    job_path = join(obj_path, partition)
                    part_nodes = policy.object_ring.get_part_nodes(
                        int(partition))
                    nodes = [
                        node for node in part_nodes
                        if node['id'] != local_dev['id']
                    ]
                    jobs.append(
                        dict(path=job_path,
                             device=local_dev['device'],
                             obj_path=obj_path,
                             nodes=nodes,
                             delete=len(nodes) > len(part_nodes) - 1,
                             policy=policy,
                             partition=partition,
                             region=local_dev['region']))
                except ValueError:
                    continue
        return jobs
    def collect_jobs(self):
        """
        Returns a sorted list of jobs (dictionaries) that specify the
        partitions, nodes, etc to be synced.
        """
        jobs = []
        ips = whataremyips()
        for local_dev in [dev for dev in self.object_ring.devs
                          if dev and dev['replication_ip'] in ips and
                          dev['replication_port'] == self.port]:
            dev_path = join(self.devices_dir, local_dev['device'])
            obj_path = join(dev_path, 'objects')
            tmp_path = join(dev_path, 'tmp')
            if self.mount_check and not ismount(dev_path):
                self.logger.warn(_('%s is not mounted'), local_dev['device'])
                continue
            unlink_older_than(tmp_path, time.time() - self.reclaim_age)
            if not os.path.exists(obj_path):
                try:
                    mkdirs(obj_path)
                except Exception:
                    self.logger.exception('ERROR creating %s' % obj_path)
                continue
            for partition in os.listdir(obj_path):
                try:
                    job_path = join(obj_path, partition)
                    if isfile(job_path):
                        # Clean up any (probably zero-byte) files where a
                        # partition should be.
                        self.logger.warning('Removing partition directory '
                                            'which was a file: %s', job_path)
                        os.remove(job_path)
                        continue
                    part_nodes = \
                        self.object_ring.get_part_nodes(int(partition))
                    #MODIFIED LightSync
                    for mypos in range(len(part_nodes)):
                        if part_nodes[mypos]['id'] == local_dev['id']:
                            break
                    nodes = part_nodes[mypos+1:]+part_nodes[:mypos]
                    ##
                    jobs.append(
                        dict(path=job_path,
                             device=local_dev['device'],
                             nodes=nodes,
                             delete=len(nodes) > len(part_nodes) - 1,
                             partition=partition))
                except (ValueError, OSError):
                    continue
        random.shuffle(jobs)
        if self.handoffs_first:
            # Move the handoff parts to the front of the list
            jobs.sort(key=lambda job: not job['delete'])
        self.job_count = len(jobs)
        return jobs
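The "MODIFIED LightSync" block above replaces the usual "every primary except myself" peer list with a rotation that starts at the node directly after the local device in the partition's primary list. A standalone sketch of that slicing, with hypothetical node ids:

part_nodes = [{'id': 10}, {'id': 11}, {'id': 12}, {'id': 13}]
local_id = 12
# Locate this device's position among the partition's primary nodes.
for mypos in range(len(part_nodes)):
    if part_nodes[mypos]['id'] == local_id:
        break
# Rotate the list of primaries so peers are ordered starting just after us.
nodes = part_nodes[mypos + 1:] + part_nodes[:mypos]
assert [n['id'] for n in nodes] == [13, 10, 11]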
Example no. 9
0
    def process_repl(self, policy, ips, override_devices=None,
                     override_partitions=None):
        """
        Helper function for collect_jobs to build jobs for replication
        using replication style storage policy
        """
        jobs = []
        obj_ring = self.get_object_ring(policy.idx)
        data_dir = get_data_dir(policy.idx)
        for local_dev in [dev for dev in obj_ring.devs
                          if (dev
                              and is_local_device(ips,
                                                  self.port,
                                                  dev['replication_ip'],
                                                  dev['replication_port'])
                              and (override_devices is None
                                   or dev['device'] in override_devices))]:
            dev_path = join(self.devices_dir, local_dev['device'])
            obj_path = join(dev_path, data_dir)
            tmp_path = join(dev_path, get_tmp_dir(int(policy)))
            if self.mount_check and not ismount(dev_path):
                self.logger.warn(_('%s is not mounted'), local_dev['device'])
                continue
            unlink_older_than(tmp_path, time.time() - self.reclaim_age)
            if not os.path.exists(obj_path):
                try:
                    mkdirs(obj_path)
                except Exception:
                    self.logger.exception('ERROR creating %s' % obj_path)
                continue
            for partition in os.listdir(obj_path):
                if (override_partitions is not None
                        and partition not in override_partitions):
                    continue

                try:
                    job_path = join(obj_path, partition)
                    part_nodes = obj_ring.get_part_nodes(int(partition))
                    nodes = [node for node in part_nodes
                             if node['id'] != local_dev['id']]
                    jobs.append(
                        dict(path=job_path,
                             device=local_dev['device'],
                             obj_path=obj_path,
                             nodes=nodes,
                             delete=len(nodes) > len(part_nodes) - 1,
                             policy_idx=policy.idx,
                             partition=partition,
                             object_ring=obj_ring,
                             region=local_dev['region']))
                except ValueError:
                    continue
        return jobs
Example no. 10
0
    def process_repl(self, policy, jobs, ips):
        """
        Helper function for collect_jobs to build jobs for replication
        using replication style storage policy
        """
        obj_ring = self.get_object_ring(policy.idx)
        data_dir = get_data_dir(policy.idx)
        for local_dev in [
                dev for dev in obj_ring.devs
                if dev and dev['replication_ip'] in ips
                and dev['replication_port'] == self.port
        ]:
            dev_path = join(self.devices_dir, local_dev['device'])
            obj_path = join(dev_path, data_dir)
            tmp_path = join(dev_path, 'tmp')
            if self.mount_check and not ismount(dev_path):
                self.logger.warn(_('%s is not mounted'), local_dev['device'])
                continue
            unlink_older_than(tmp_path, time.time() - self.reclaim_age)
            if not os.path.exists(obj_path):
                try:
                    mkdirs(obj_path)
                except Exception:
                    self.logger.exception('ERROR creating %s' % obj_path)
                continue
            for partition in os.listdir(obj_path):
                try:
                    job_path = join(obj_path, partition)
                    if isfile(job_path):
                        # Clean up any (probably zero-byte) files where a
                        # partition should be.
                        self.logger.warning(
                            'Removing partition directory '
                            'which was a file: %s', job_path)
                        os.remove(job_path)
                        continue
                    part_nodes = obj_ring.get_part_nodes(int(partition))
                    nodes = [
                        node for node in part_nodes
                        if node['id'] != local_dev['id']
                    ]
                    jobs.append(
                        dict(path=job_path,
                             device=local_dev['device'],
                             nodes=nodes,
                             delete=len(nodes) > len(part_nodes) - 1,
                             policy_idx=policy.idx,
                             partition=partition,
                             object_ring=obj_ring))

                except (ValueError, OSError):
                    continue
Example no. 11
0
 def run_forever(self, *args, **kwargs):
     """Run the updater continuously."""
     time.sleep(random() * self.interval)
     while True:
         self.logger.info(_('Begin object update sweep'))
         begin = time.time()
         pids = []
         # read from container ring to ensure it's fresh
         self.get_container_ring().get_nodes('')
         for device in self._listdir(self.devices):
             if self.mount_check and \
                     not ismount(os.path.join(self.devices, device)):
                 self.logger.increment('errors')
                 self.logger.warning(_('Skipping %s as it is not mounted'),
                                     device)
                 continue
             while len(pids) >= self.concurrency:
                 pids.remove(os.wait()[0])
             pid = os.fork()
             if pid:
                 pids.append(pid)
             else:
                 signal.signal(signal.SIGTERM, signal.SIG_DFL)
                 patcher.monkey_patch(all=False,
                                      socket=True,
                                      select=True,
                                      thread=True)
                 self.successes = 0
                 self.failures = 0
                 forkbegin = time.time()
                 self.object_sweep(os.path.join(self.devices, device))
                 elapsed = time.time() - forkbegin
                 self.logger.info(
                     _('Object update sweep of %(device)s'
                       ' completed: %(elapsed).02fs, %(success)s successes'
                       ', %(fail)s failures'), {
                           'device': device,
                           'elapsed': elapsed,
                           'success': self.successes,
                           'fail': self.failures
                       })
                 sys.exit()
         while pids:
             pids.remove(os.wait()[0])
         elapsed = time.time() - begin
         self.logger.info(_('Object update sweep completed: %.02fs'),
                          elapsed)
         dump_recon_cache({'object_updater_sweep': elapsed}, self.rcache,
                          self.logger)
         if elapsed < self.interval:
             time.sleep(self.interval - elapsed)
Example no. 12
0
    def replicate(self, override_devices=None, override_partitions=None):
        """Run a replication pass"""
        self.start = time.time()
        self.suffix_count = 0
        self.suffix_sync = 0
        self.suffix_hash = 0
        self.replication_count = 0
        self.last_replication_count = -1
        self.partition_times = []

        if override_devices is None:
            override_devices = []
        if override_partitions is None:
            override_partitions = []

        stats = eventlet.spawn(self.heartbeat)
        lockup_detector = eventlet.spawn(self.detect_lockups)
        eventlet.sleep()  # Give spawns a cycle

        try:
            self.run_pool = GreenPool(size=self.concurrency)
            jobs = self.collect_jobs()
            for job in jobs:
                if override_devices and job['device'] not in override_devices:
                    continue
                if override_partitions and \
                        job['partition'] not in override_partitions:
                    continue
                dev_path = join(self.devices_dir, job['device'])
                if self.mount_check and not ismount(dev_path):
                    self.logger.warn(_('%s is not mounted'), job['device'])
                    continue
                if not self.check_ring(job['object_ring']):
                    self.logger.info(
                        _("Ring change detected. Aborting "
                          "current replication pass."))
                    return
                if job['delete']:
                    self.run_pool.spawn(self.update_deleted, job)
                else:
                    self.run_pool.spawn(self.update, job)
            with Timeout(self.lockup_timeout):
                self.run_pool.waitall()
        except (Exception, Timeout):
            self.logger.exception(_("Exception in top-level replication loop"))
            self.kill_coros()
        finally:
            stats.kill()
            lockup_detector.kill()
            self.stats_line()
    def replicate(self, override_devices=None, override_partitions=None):
        """Run a replication pass"""
        self.start = time.time()
        self.suffix_count = 0
        self.suffix_sync = 0
        self.suffix_hash = 0
        self.replication_count = 0
        self.last_replication_count = -1
        self.partition_times = []

        if override_devices is None:
            override_devices = []
        if override_partitions is None:
            override_partitions = []

        stats = eventlet.spawn(self.heartbeat)
        lockup_detector = eventlet.spawn(self.detect_lockups)
        eventlet.sleep()  # Give spawns a cycle

        try:
            self.run_pool = GreenPool(size=self.concurrency)
            jobs = self.collect_jobs()
            for job in jobs:
                if override_devices and job['device'] not in override_devices:
                    continue
                if override_partitions and \
                        job['partition'] not in override_partitions:
                    continue
                dev_path = join(self.devices_dir, job['device'])
                if self.mount_check and not ismount(dev_path):
                    self.logger.warn(_('%s is not mounted'), job['device'])
                    continue
                if not self.check_ring():
                    self.logger.info(_("Ring change detected. Aborting "
                                       "current replication pass."))
                    return
                if job['delete']:
                    self.run_pool.spawn(self.update_deleted, job)
                else:
                    self.run_pool.spawn(self.update, job)
            with Timeout(self.lockup_timeout):
                self.run_pool.waitall()
        except (Exception, Timeout):
            self.logger.exception(_("Exception in top-level replication loop"))
            self.kill_coros()
        finally:
            stats.kill()
            lockup_detector.kill()
            self.stats_line()
Example no. 14
0
def check_mount(root, drive):
    """
    Verify that the path to the device is a mount point and mounted.  This
    allows us to fast fail on drives that have been unmounted because of
    issues, and also prevents us from accidentally filling up the root
    partition.

    :param root:  base path where the devices are mounted
    :param drive: drive name to be checked
    :returns: True if it is a valid mounted device, False otherwise
    """
    if not (urllib.quote_plus(drive) == drive):
        return False
    path = os.path.join(root, drive)
    return utils.ismount(path)
Example no. 15
0
def check_mount(root, drive):
    """
    Verify that the path to the device is a mount point and mounted.  This
    allows us to fast fail on drives that have been unmounted because of
    issues, and also prevents us from accidentally filling up the root
    partition.

    :param root:  base path where the devices are mounted
    :param drive: drive name to be checked
    :returns: True if it is a valid mounted device, False otherwise
    """
    if not (urllib.quote_plus(drive) == drive):
        return False
    path = os.path.join(root, drive)
    return utils.ismount(path)
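In both check_mount() versions the quote_plus(drive) == drive guard rejects any drive name that URL-encoding would alter, which keeps path separators and whitespace out of the os.path.join below. A small sketch of the effect (the import is written to run on either Python 2 or 3):

try:
    from urllib import quote_plus          # Python 2, as in the code above
except ImportError:
    from urllib.parse import quote_plus    # Python 3

def looks_like_valid_drive(drive):
    # Names containing characters that quote_plus() would escape are rejected.
    return quote_plus(drive) == drive

assert looks_like_valid_drive('sdb1')
assert not looks_like_valid_drive('sdb1/../etc')   # '/' would be escaped
assert not looks_like_valid_drive('sd b1')         # ' ' would be escaped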
Example no. 16
0
    def process_repl(self, policy, jobs, ips):
        """
        Helper function for collect_jobs to build jobs for replication
        using replication style storage policy
        """
        obj_ring = self.get_object_ring(policy.idx)
        data_dir = get_data_dir(policy.idx)
        for local_dev in [dev for dev in obj_ring.devs
                          if dev and dev['replication_ip'] in ips and
                          dev['replication_port'] == self.port]:
            dev_path = join(self.devices_dir, local_dev['device'])
            obj_path = join(dev_path, data_dir)
            tmp_path = join(dev_path, get_tmp_dir(int(policy)))
            if self.mount_check and not ismount(dev_path):
                self.logger.warn(_('%s is not mounted'), local_dev['device'])
                continue
            unlink_older_than(tmp_path, time.time() - self.reclaim_age)
            if not os.path.exists(obj_path):
                try:
                    mkdirs(obj_path)
                except Exception:
                    self.logger.exception('ERROR creating %s' % obj_path)
                continue
            for partition in os.listdir(obj_path):
                try:
                    job_path = join(obj_path, partition)
                    if isfile(job_path):
                        # Clean up any (probably zero-byte) files where a
                        # partition should be.
                        self.logger.warning(
                            'Removing partition directory '
                            'which was a file: %s', job_path)
                        os.remove(job_path)
                        continue
                    part_nodes = obj_ring.get_part_nodes(int(partition))
                    nodes = [node for node in part_nodes
                             if node['id'] != local_dev['id']]
                    jobs.append(
                        dict(path=job_path,
                             device=local_dev['device'],
                             nodes=nodes,
                             delete=len(nodes) > len(part_nodes) - 1,
                             policy_idx=policy.idx,
                             partition=partition,
                             object_ring=obj_ring))

                except (ValueError, OSError):
                    continue
Example no. 17
0
 def run_forever(self, *args, **kwargs):
     """Run the updater continuously."""
     time.sleep(random() * self.interval)
     while True:
         self.logger.info(_('Begin object update sweep'))
         begin = time.time()
         pids = []
         # read from container ring to ensure it's fresh
         self.get_container_ring().get_nodes('')
         for device in self._listdir(self.devices):
             if self.mount_check and \
                     not ismount(os.path.join(self.devices, device)):
                 self.logger.increment('errors')
                 self.logger.warning(
                     _('Skipping %s as it is not mounted'), device)
                 continue
             while len(pids) >= self.concurrency:
                 pids.remove(os.wait()[0])
             pid = os.fork()
             if pid:
                 pids.append(pid)
             else:
                 signal.signal(signal.SIGTERM, signal.SIG_DFL)
                 patcher.monkey_patch(all=False, socket=True, select=True,
                                      thread=True)
                 self.successes = 0
                 self.failures = 0
                 forkbegin = time.time()
                 self.object_sweep(os.path.join(self.devices, device))
                 elapsed = time.time() - forkbegin
                 self.logger.info(
                     _('Object update sweep of %(device)s'
                       ' completed: %(elapsed).02fs, %(success)s successes'
                       ', %(fail)s failures'),
                     {'device': device, 'elapsed': elapsed,
                      'success': self.successes, 'fail': self.failures})
                 sys.exit()
         while pids:
             pids.remove(os.wait()[0])
         elapsed = time.time() - begin
         self.logger.info(_('Object update sweep completed: %.02fs'),
                          elapsed)
         dump_recon_cache({'object_updater_sweep': elapsed},
                          self.rcache, self.logger)
         if elapsed < self.interval:
             time.sleep(self.interval - elapsed)
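The run_forever() loops above bound their concurrency by reaping finished children with os.wait() before forking another worker. A self-contained sketch of that fork-per-device pattern (sweep() is a stand-in for object_sweep(); POSIX only):

import os
import sys
import time

def sweep(device):
    # Stand-in for object_sweep(); the real updater walks the device here.
    time.sleep(0.1)

def run_sweeps(devices, concurrency):
    pids = []
    for device in devices:
        while len(pids) >= concurrency:   # pool full: wait for any child
            pids.remove(os.wait()[0])
        pid = os.fork()
        if pid:
            pids.append(pid)              # parent: remember the child pid
        else:
            sweep(device)                 # child: handle one device
            sys.exit()                    # then exit so the parent can reap it
    while pids:                           # drain the remaining children
        pids.remove(os.wait()[0])

run_sweeps(['sda1', 'sdb1', 'sdc1'], concurrency=2)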
Example no. 18
0
def get_ring(server, force_validate=None):
    ring = Ring('/etc/swift/%s.ring.gz' % server)
    if not VALIDATE_RSYNC and not force_validate:
        return ring
    # easy sanity checks
    assert 3 == ring.replica_count, '%s has %s replicas instead of 3' % (
        ring.serialized_path, ring.replica_count)
    assert 4 == len(
        ring.devs), '%s has %s devices instead of 4' % (ring.serialized_path,
                                                        len(ring.devs))
    # map server to config by port
    port_to_config = {}
    for node_id in range(1, 5):
        conf = readconf('/etc/swift/%s-server/%d.conf' % (server, node_id),
                        section_name='%s-replicator' % server)
        port_to_config[int(conf['bind_port'])] = conf
    for dev in ring.devs:
        # verify server is exposing mounted device
        conf = port_to_config[dev['port']]
        for device in os.listdir(conf['devices']):
            if device == dev['device']:
                full_path = path.realpath(path.join(conf['devices'], device))
                assert ismount(full_path), \
                    'device %s in %s was not mounted (%s)' % (
                        device, conf['devices'], full_path)
                break
        else:
            raise AssertionError(
                "unable to find ring device %s under %s's devices (%s)" %
                (dev['device'], server, conf['devices']))
        # verify server is exposing rsync device
        rsync_export = '%s%s' % (server, dev['replication_port'])
        cmd = "rsync rsync://localhost/%s" % rsync_export
        p = Popen(cmd, shell=True, stdout=PIPE)
        stdout, _stderr = p.communicate()
        if p.returncode:
            raise AssertionError('unable to connect to rsync '
                                 'export %s (%s)' % (rsync_export, cmd))
        for line in stdout.splitlines():
            if line.rsplit(None, 1)[-1] == dev['device']:
                break
        else:
            raise AssertionError("unable to find ring device %s under rsync's "
                                 "exported devices for %s (%s)" %
                                 (dev['device'], rsync_export, cmd))
    return ring
Example no. 19
0
    def move(self, old_dict, new_dict, moving_map):
        """Run a move pass.

        :param old_dict: dictionary with devices from old ring
        :param new_dict: dictionary with devices from new ring
        :param moving_map: the dictionary that contains all the partitions
            that should be moved, their sources and destinations
        """

        self.start = time.time()
        self.replication_count = 0
        self.last_replication_count = -1
        self.partition_times = []

        try:
            self.run_pool = GreenPool(size=self.concurrency)
            jobs = self.collect_jobs(old_dict, new_dict, moving_map)
            for job in jobs:
                dev_path = join(self.devices_dir, job['device'])
                if self.mount_check and not ismount(dev_path):
                    self.logger.warn('%s is not mounted' % job['device'])
                    continue

                try:
                    if isfile(job['path']):
                        # Clean up any (probably zero-byte) files where a
                        # partition should be.
                        self.logger.warning(
                            'Removing partition directory '
                            'which was a file: %s', job['path'])
                        os.remove(job['path'])
                        continue
                except OSError:
                    continue

                self.run_pool.spawn(self.update, job)

            self.run_pool.waitall()

        except (Exception, Timeout) as e:
            self.kill_coros()
            self.logger.exception(
                "Exception in top-level partition move loop %s" % e)
            if self.test:
                print e
Example no. 20
0
    def move(self, old_dict, new_dict, moving_map):
        """Run a move pass.

        :param old_dict: dictionary with devices from old ring
        :param new_dict: dictionary with devices from new ring
        :param moving_map: the dictionary that contains all the partitions
            that should be moved, their sources and destinations
        """

        self.start = time.time()
        self.replication_count = 0
        self.last_replication_count = -1
        self.partition_times = []

        try:
            self.run_pool = GreenPool(size=self.concurrency)
            jobs = self.collect_jobs(old_dict, new_dict, moving_map)
            for job in jobs:
                dev_path = join(self.devices_dir, job['device'])
                if self.mount_check and not ismount(dev_path):
                    self.logger.warn('%s is not mounted' % job['device'])
                    continue

                try:
                    if isfile(job['path']):
                        # Clean up any (probably zero-byte) files where a
                        # partition should be.
                        self.logger.warning(
                            'Removing partition directory '
                            'which was a file: %s', job['path'])
                        os.remove(job['path'])
                        continue
                except OSError:
                    continue

                self.run_pool.spawn(self.update, job)

            self.run_pool.waitall()

        except (Exception, Timeout) as e:
            self.kill_coros()
            self.logger.exception(
                "Exception in top-level partition move loop %s" % e)
            if self.test:
                print e
Example no. 21
0
def get_ring(server, force_validate=None):
    ring = Ring('/etc/swift/%s.ring.gz' % server)
    if not VALIDATE_RSYNC and not force_validate:
        return ring
    # easy sanity checks
    assert 3 == ring.replica_count, '%s has %s replicas instead of 3' % (
        ring.serialized_path, ring.replica_count)
    assert 4 == len(ring.devs), '%s has %s devices instead of 4' % (
        ring.serialized_path, len(ring.devs))
    # map server to config by port
    port_to_config = {}
    for node_id in range(1, 5):
        conf = readconf('/etc/swift/%s-server/%d.conf' % (server, node_id),
                        section_name='%s-replicator' % server)
        port_to_config[int(conf['bind_port'])] = conf
    for dev in ring.devs:
        # verify server is exposing mounted device
        conf = port_to_config[dev['port']]
        for device in os.listdir(conf['devices']):
            if device == dev['device']:
                full_path = path.realpath(path.join(conf['devices'], device))
                assert ismount(full_path), \
                    'device %s in %s was not mounted (%s)' % (
                        device, conf['devices'], full_path)
                break
        else:
            raise AssertionError(
                "unable to find ring device %s under %s's devices (%s)" % (
                    dev['device'], server, conf['devices']))
        # verify server is exposing rsync device
        rsync_export = '%s%s' % (server, dev['replication_port'])
        cmd = "rsync rsync://localhost/%s" % rsync_export
        p = Popen(cmd, shell=True, stdout=PIPE)
        stdout, _stderr = p.communicate()
        if p.returncode:
            raise AssertionError('unable to connect to rsync '
                                 'export %s (%s)' % (rsync_export, cmd))
        for line in stdout.splitlines():
            if line.rsplit(None, 1)[-1] == dev['device']:
                break
        else:
            raise AssertionError("unable to find ring device %s under rsync's "
                                 "exported devices for %s (%s)" % (
                                     dev['device'], rsync_export, cmd))
    return ring
Example no. 22
0
 def run_once(self, *args, **kwargs):
     """Run a replication pass once."""
     self._zero_stats()
     dirs = []
     ips = whataremyips(self.bind_ip)
     if not ips:
         self.logger.error(_('ERROR Failed to get my own IPs?'))
         return
     self._local_device_ids = set()
     found_local = False
     ### iterate over the ring devices
     for node in self.ring.devs:
         if node and is_local_device(ips, self.port,
                                     node['replication_ip'],
                                     node['replication_port']):
             found_local = True
             if self.mount_check and not ismount(
                     os.path.join(self.root, node['device'])):
                 self._add_failure_stats(
                     [(failure_dev['replication_ip'],
                       failure_dev['device'])
                      for failure_dev in self.ring.devs if failure_dev])
                 self.logger.warning(
                     _('Skipping %(device)s as it is not mounted') % node)
                 continue
             unlink_older_than(
                 os.path.join(self.root, node['device'], 'tmp'),
                 time.time() - self.reclaim_age)
             datadir = os.path.join(self.root, node['device'], self.datadir)
             if os.path.isdir(datadir):
                 self._local_device_ids.add(node['id'])
                 dirs.append((datadir, node['id']))
     if not found_local:
         self.logger.error("Can't find itself %s with port %s in ring "
                           "file, not replicating",
                           ", ".join(ips), self.port)
     self.logger.info(_('Beginning replication run'))
     for part, object_file, node_id in roundrobin_datadirs(dirs):
         self.cpool.spawn_n(
             self._replicate_object, part, object_file, node_id)
     self.cpool.waitall()
     self.logger.info(_('Replication run OVER'))
     self._report_stats()
    def get_paths(self):
        """
        Get paths to all of the partitions on each drive to be processed.

        :returns: a list of paths
        """
        paths = []
        for device in self._listdir(self.devices):
            dev_path = os.path.join(self.devices, device)
            if self.mount_check and not ismount(dev_path):
                self.logger.warning(_('%s is not mounted'), device)
                continue
            con_path = os.path.join(dev_path, DATADIR)
            if not os.path.exists(con_path):
                continue
            for partition in self._listdir(con_path):
                paths.append(os.path.join(con_path, partition))
        shuffle(paths)
        return paths
Example no. 24
0
 def process_repl(self, policy, jobs, ips):
     """
     Helper function for collect_jobs to build jobs for replication
     using replication style storage policy
     """
     obj_ring = self.get_object_ring(policy.idx)
     data_dir = get_data_dir(policy.idx)
     for local_dev in [
         dev
         for dev in obj_ring.devs
         if dev and dev["replication_ip"] in ips and dev["replication_port"] == self.port
     ]:
         dev_path = join(self.devices_dir, local_dev["device"])
         obj_path = join(dev_path, data_dir)
         tmp_path = join(dev_path, get_tmp_dir(int(policy)))
         if self.mount_check and not ismount(dev_path):
             self.logger.warn(_("%s is not mounted"), local_dev["device"])
             continue
         unlink_older_than(tmp_path, time.time() - self.reclaim_age)
         if not os.path.exists(obj_path):
             try:
                 mkdirs(obj_path)
             except Exception:
                 self.logger.exception("ERROR creating %s" % obj_path)
             continue
         for partition in os.listdir(obj_path):
             try:
                 job_path = join(obj_path, partition)
                 part_nodes = obj_ring.get_part_nodes(int(partition))
                 nodes = [node for node in part_nodes if node["id"] != local_dev["id"]]
                 jobs.append(
                     dict(
                         path=job_path,
                         device=local_dev["device"],
                         nodes=nodes,
                         delete=len(nodes) > len(part_nodes) - 1,
                         policy_idx=policy.idx,
                         partition=partition,
                         object_ring=obj_ring,
                     )
                 )
             except ValueError:
                 continue
Example no. 25
0
def check_drive(root, drive, mount_check):
    """
    Validate the path given by root and drive is a valid existing directory.

    :param root:  base path where the devices are mounted
    :param drive: drive name to be checked
    :param mount_check: additionally require path is mounted

    :returns: full path to the device, or None if drive fails to validate
    """
    if not (urllib.parse.quote_plus(drive) == drive):
        return None
    path = os.path.join(root, drive)
    if mount_check:
        if utils.ismount(path):
            return path
    else:
        if isdir(path):
            return path
    return None
Example no. 26
0
def check_drive(root, drive, mount_check):
    """
    Validate the path given by root and drive is a valid existing directory.

    :param root:  base path where the devices are mounted
    :param drive: drive name to be checked
    :param mount_check: additionally require path is mounted

    :returns: full path to the device, or None if drive fails to validate
    """
    if not (urllib.parse.quote_plus(drive) == drive):
        return None
    path = os.path.join(root, drive)
    if mount_check:
        if utils.ismount(path):
            return path
    else:
        if isdir(path):
            return path
    return None
Example no. 27
0
def check_drive(root, drive, mount_check):
    """
    Validate the path given by root and drive is a valid existing directory.

    :param root:  base path where the devices are mounted
    :param drive: drive name to be checked
    :param mount_check: additionally require path is mounted

    :returns: full path to the device
    :raises ValueError: if drive fails to validate
    """
    if not (urllib.parse.quote_plus(drive) == drive):
        raise ValueError('%s is not a valid drive name' % drive)
    path = os.path.join(root, drive)
    if mount_check:
        if not utils.ismount(path):
            raise ValueError('%s is not mounted' % path)
    else:
        if not isdir(path):
            raise ValueError('%s is not a directory' % path)
    return path
Example no. 28
0
def check_drive(root, drive, mount_check):
    """
    Validate the path given by root and drive is a valid existing directory.

    :param root:  base path where the devices are mounted
    :param drive: drive name to be checked
    :param mount_check: additionally require path is mounted

    :returns: full path to the device
    :raises ValueError: if drive fails to validate
    """
    if not (urllib.parse.quote_plus(drive) == drive):
        raise ValueError('%s is not a valid drive name' % drive)
    path = os.path.join(root, drive)
    if mount_check:
        if not utils.ismount(path):
            raise ValueError('%s is not mounted' % path)
    else:
        if not isdir(path):
            raise ValueError('%s is not a directory' % path)
    return path
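Examples no. 25-26 signal failure by returning None, while examples no. 27-28 raise ValueError with a reason string; the caller-side difference looks roughly like this (a hedged sketch; the wrapper and logger names are illustrative, not from Swift):

def device_path_or_none(root, drive, mount_check):
    # Style of examples no. 25/26: test the return value.
    path = check_drive(root, drive, mount_check)
    if path is None:
        return None
    return path

def device_path_logged(root, drive, mount_check, logger):
    # Style of examples no. 27/28: catch ValueError and log its message
    # ('... is not mounted', '... is not a directory', ...).
    try:
        return check_drive(root, drive, mount_check)
    except ValueError as err:
        logger.warning('%s', err)
        return None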
Example no. 29
0
 def dispatch(self, replicate_args, args):
     if not hasattr(args, 'pop'):
         return HTTPBadRequest(body='Invalid object type')
     op = args.pop(0)
     drive, partition, hsh = replicate_args
     if self.mount_check and not ismount(os.path.join(self.root, drive)):
         return Response(status='507 %s is not mounted' % drive)
     db_file = os.path.join(self.root, drive,
                            storage_directory(self.datadir, partition, hsh),
                            hsh + '.db')
     if op == 'rsync_then_merge':
         return self.rsync_then_merge(drive, db_file, args)
     if op == 'complete_rsync':
         return self.complete_rsync(drive, db_file, args)
     else:
         # someone might be about to rsync a db to us,
         # make sure there's a tmp dir to receive it.
         mkdirs(os.path.join(self.root, drive, 'tmp'))
         if not os.path.exists(db_file):
             return HTTPNotFound()
         return getattr(self, op)(self.broker_class(db_file), args)
Example no. 30
0
 def dispatch(self, replicate_args, args):
     if not hasattr(args, 'pop'):
         return HTTPBadRequest(body='Invalid object type')
     op = args.pop(0)
     drive, partition, hsh = replicate_args
     if self.mount_check and not ismount(os.path.join(self.root, drive)):
         return Response(status='507 %s is not mounted' % drive)
     db_file = os.path.join(self.root, drive,
                            storage_directory(self.datadir, partition, hsh),
                            hsh + '.db')
     if op == 'rsync_then_merge':
         return self.rsync_then_merge(drive, db_file, args)
     if op == 'complete_rsync':
         return self.complete_rsync(drive, db_file, args)
     else:
         # someone might be about to rsync a db to us,
         # make sure there's a tmp dir to receive it.
         mkdirs(os.path.join(self.root, drive, 'tmp'))
         if not os.path.exists(db_file):
             return HTTPNotFound()
         return getattr(self, op)(self.broker_class(db_file), args)
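Apart from the two rsync operations, dispatch() forwards any other op straight to a broker method of the same name via getattr. A tiny illustration of that dispatch style with a hypothetical broker class:

class FakeBroker(object):
    # Hypothetical stand-in for self.broker_class(db_file).
    def merge_items(self, args):
        return 'merged %d rows' % len(args)

def dispatch_op(broker, args):
    op = args.pop(0)                  # first element names the method
    return getattr(broker, op)(args)  # remaining args go to that method

print(dispatch_op(FakeBroker(), ['merge_items', {'name': 'o1'}, {'name': 'o2'}]))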
Example no. 31
0
 def run_once(self, *args, **kwargs):
     """Run the updater once."""
     self.logger.info(_('Begin object update single threaded sweep'))
     begin = time.time()
     self.successes = 0
     self.failures = 0
     for device in os.listdir(self.devices):
         if self.mount_check and \
                 not ismount(os.path.join(self.devices, device)):
             self.logger.increment('errors')
             self.logger.warn(
                 _('Skipping %s as it is not mounted'), device)
             continue
         self.object_sweep(os.path.join(self.devices, device))
     elapsed = time.time() - begin
     self.logger.info(
         _('Object update single threaded sweep completed: '
           '%(elapsed).02fs, %(success)s successes, %(fail)s failures'),
         {'elapsed': elapsed, 'success': self.successes,
          'fail': self.failures})
     dump_recon_cache({'object_updater_sweep': elapsed},
                      self.rcache, self.logger)
Example no. 32
0
 def run_once(self, *args, **kwargs):
     """Run the updater once."""
     self.logger.info(_('Begin object update single threaded sweep'))
     begin = time.time()
     self.successes = 0
     self.failures = 0
     for device in self._listdir(self.devices):
         if self.mount_check and \
                 not ismount(os.path.join(self.devices, device)):
             self.logger.increment('errors')
             self.logger.warning(
                 _('Skipping %s as it is not mounted'), device)
             continue
         self.object_sweep(os.path.join(self.devices, device))
     elapsed = time.time() - begin
     self.logger.info(
         _('Object update single threaded sweep completed: '
           '%(elapsed).02fs, %(success)s successes, %(fail)s failures'),
         {'elapsed': elapsed, 'success': self.successes,
          'fail': self.failures})
     dump_recon_cache({'object_updater_sweep': elapsed},
                      self.rcache, self.logger)
Example no. 33
0
 def run_once(self, *args, **kwargs):
     """
     Main entry point when running the reaper in 'once' mode, where it will
     do a single pass over all accounts on the server. This is called
     repeatedly by :func:`run_forever`. This will call :func:`reap_device`
     once for each device on the server.
     """
     self.logger.debug('Begin devices pass: %s', self.devices)
     begin = time()
     try:
         for device in os.listdir(self.devices):
             if self.mount_check and not ismount(
                     os.path.join(self.devices, device)):
                 self.logger.increment('errors')
                 self.logger.debug(
                     _('Skipping %s as it is not mounted'), device)
                 continue
             self.reap_device(device)
     except (Exception, Timeout):
         self.logger.exception(_("Exception in top-level account reaper "
                                 "loop"))
     elapsed = time() - begin
     self.logger.info(_('Devices pass completed: %.02fs'), elapsed)
Example no. 34
0
    def build_replication_jobs(self, policy, ips, old_dict,
                               new_dict, moving_map):
        """
        Helper function for collect_jobs to build jobs for replication
        using replication style storage policy

        :param policy: swift policy object
        :param ips: the local server ips
        :param old_dict: dictionary with devices from old ring
        :param new_dict: dictionary with devices from new ring
        :param moving_map: the dictionary that contains all the partitions
            that should be moved, their sources and destinations
        """

        jobs = []
        data_dir = get_data_dir(policy)
        devices = Set(map(lambda x: x[1], moving_map.values()))
        partitions = Set(map(lambda x: x[0], moving_map.values()))

        for local_dev in [dev for dev in policy.object_ring.devs
                          if (dev
                              and is_local_device(ips,
                                                  self.port,
                                                  dev['replication_ip'],
                                                  dev['replication_port'])
                              )]:

            if self.test:
                print local_dev['id']

            if unicode(local_dev['id']) not in devices:
                continue

            dev_path = join(self.devices_dir, local_dev['device'])
            obj_path = join(dev_path, data_dir)
            tmp_path = join(dev_path, get_tmp_dir(policy))
            if self.mount_check and not ismount(dev_path):
                self.logger.warn('%s is not mounted' % local_dev['device'])
                continue
            unlink_older_than(tmp_path, time.time() - self.reclaim_age)

            for partition in os.listdir(obj_path):
                partition = unicode(partition)

                if (partition not in partitions):
                    continue

                try:

                    key = "%s_%s" % (local_dev['id'], partition)
                    if key not in moving_map:
                        continue

                    job_path = join(obj_path, partition)

                    _, source_id, dest_id = moving_map[key]

                    if source_id != unicode(local_dev['id']):
                        continue

                    node = {}
                    replication_ip, replication_device = new_dict[dest_id]
                    node['replication_ip'] = replication_ip
                    node['device'] = replication_device

                    remote_path = os.path.join(self.devices_dir,
                                               node['device'],
                                               self.mover_tmp_dir)

                    jobs.append(
                        dict(path=job_path,
                             device=local_dev['device'],
                             obj_path=obj_path,
                             node=node,
                             policy=policy,
                             partition=partition,
                             remote_path=remote_path))

                except ValueError:
                    continue
                except Exception as e:
                    self.logger.exception(
                        "an %s exception accure at build_replication_jobs" % e)
                    if self.test:
                        print e
        return jobs
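The loop above only emits a job when the local device is the source of a planned move: it looks up `moving_map` with a key of the form "<device_id>_<partition>" and expects a `(partition, source_id, dest_id)` tuple, while `new_dict` maps a destination id to its `(replication_ip, device)` pair. Below is a minimal sketch of inputs with that shape; the ids, device name and IP are hypothetical, not taken from any real ring.

# Hypothetical inputs shaped the way build_replication_jobs above expects.
new_dict = {u'7': (u'10.0.0.2', u'sdb')}               # dest_id -> (replication_ip, device)
moving_map = {
    u'3_1022': (u'1022', u'3', u'7'),                  # "<dev_id>_<partition>" ->
}                                                      # (partition, source_id, dest_id)

# The same derivations the method performs on its inputs:
devices = set(v[1] for v in moving_map.values())       # source device ids
partitions = set(v[0] for v in moving_map.values())    # partitions to be moved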
Example no. 35
0
    def replicate(self,
                  override_devices=None,
                  override_partitions=None,
                  override_policies=None):
        """Run a replication pass"""
        self.start = time.time()
        self.suffix_count = 0
        self.suffix_sync = 0
        self.suffix_hash = 0
        self.replication_count = 0
        self.last_replication_count = -1
        self.replication_cycle = (self.replication_cycle + 1) % 10
        self.partition_times = []
        self.my_replication_ips = self._get_my_replication_ips()
        self.all_devs_info = set()
        self.handoffs_remaining = 0

        stats = eventlet.spawn(self.heartbeat)
        lockup_detector = eventlet.spawn(self.detect_lockups)
        eventlet.sleep()  # Give spawns a cycle

        current_nodes = None
        try:
            self.run_pool = GreenPool(size=self.concurrency)
            jobs = self.collect_jobs(override_devices=override_devices,
                                     override_partitions=override_partitions,
                                     override_policies=override_policies)
            for job in jobs:
                current_nodes = job['nodes']
                if override_devices and job['device'] not in override_devices:
                    continue
                if override_partitions and \
                        job['partition'] not in override_partitions:
                    continue
                dev_path = join(self.devices_dir, job['device'])
                if self.mount_check and not ismount(dev_path):
                    self._add_failure_stats([(failure_dev['replication_ip'],
                                              failure_dev['device'])
                                             for failure_dev in job['nodes']])
                    self.logger.warning(_('%s is not mounted'), job['device'])
                    continue
                if self.handoffs_first and not job['delete']:
                    # in handoffs first mode, we won't process primary
                    # partitions until rebalance was successful!
                    if self.handoffs_remaining:
                        self.logger.warning(
                            _("Handoffs first mode still has handoffs "
                              "remaining.  Aborting current "
                              "replication pass."))
                        break
                if not self.check_ring(job['policy'].object_ring):
                    self.logger.info(
                        _("Ring change detected. Aborting "
                          "current replication pass."))
                    return
                try:
                    if isfile(job['path']):
                        # Clean up any (probably zero-byte) files where a
                        # partition should be.
                        self.logger.warning(
                            'Removing partition directory '
                            'which was a file: %s', job['path'])
                        os.remove(job['path'])
                        continue
                except OSError:
                    continue
                if job['delete']:
                    self.run_pool.spawn(self.update_deleted, job)
                else:
                    self.run_pool.spawn(self.update, job)
            current_nodes = None
            with Timeout(self.lockup_timeout):
                self.run_pool.waitall()
        except (Exception, Timeout):
            if current_nodes:
                self._add_failure_stats([(failure_dev['replication_ip'],
                                          failure_dev['device'])
                                         for failure_dev in current_nodes])
            else:
                self._add_failure_stats(self.all_devs_info)
            self.logger.exception(_("Exception in top-level replication loop"))
            self.kill_coros()
        finally:
            stats.kill()
            lockup_detector.kill()
            self.stats_line()
            self.stats['attempted'] = self.replication_count
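For reference, the override arguments restrict a pass to specific devices and partitions; since the job builder takes partitions from os.listdir, the comparison above is against partition names as strings. A hypothetical invocation, assuming `replicator` is an already configured instance of the class above:

# Hypothetical call limiting the pass to one device and two partitions.
replicator.replicate(override_devices=['sdb'],
                     override_partitions=['1022', '2047'])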
Example no. 36
0
    def build_replication_jobs(self,
                               policy,
                               ips,
                               override_devices=None,
                               override_partitions=None):
        """
        Helper function for collect_jobs to build jobs for replication
        using replication style storage policy
        """
        jobs = []
        self.all_devs_info.update([(dev['replication_ip'], dev['device'])
                                   for dev in policy.object_ring.devs if dev])
        data_dir = get_data_dir(policy)
        found_local = False
        for local_dev in [
                dev for dev in policy.object_ring.devs if
            (dev and is_local_device(ips, self.port, dev['replication_ip'],
                                     dev['replication_port']) and
             (override_devices is None or dev['device'] in override_devices))
        ]:
            found_local = True
            dev_path = join(self.devices_dir, local_dev['device'])
            obj_path = join(dev_path, data_dir)
            tmp_path = join(dev_path, get_tmp_dir(policy))
            if self.mount_check and not ismount(dev_path):
                self._add_failure_stats([
                    (failure_dev['replication_ip'], failure_dev['device'])
                    for failure_dev in policy.object_ring.devs if failure_dev
                ])
                self.logger.warning(_('%s is not mounted'),
                                    local_dev['device'])
                continue
            unlink_older_than(tmp_path, time.time() - self.reclaim_age)
            if not os.path.exists(obj_path):
                try:
                    mkdirs(obj_path)
                except Exception:
                    self.logger.exception('ERROR creating %s' % obj_path)
                continue
            for partition in os.listdir(obj_path):
                if (override_partitions is not None
                        and partition not in override_partitions):
                    continue

                if (partition.startswith('auditor_status_')
                        and partition.endswith('.json')):
                    # ignore auditor status files
                    continue

                part_nodes = None
                try:
                    job_path = join(obj_path, partition)
                    part_nodes = policy.object_ring.get_part_nodes(
                        int(partition))
                    nodes = [
                        node for node in part_nodes
                        if node['id'] != local_dev['id']
                    ]
                    jobs.append(
                        dict(path=job_path,
                             device=local_dev['device'],
                             obj_path=obj_path,
                             nodes=nodes,
                             delete=len(nodes) > len(part_nodes) - 1,
                             policy=policy,
                             partition=partition,
                             region=local_dev['region']))
                except ValueError:
                    if part_nodes:
                        self._add_failure_stats([
                            (failure_dev['replication_ip'],
                             failure_dev['device']) for failure_dev in nodes
                        ])
                    else:
                        self._add_failure_stats([
                            (failure_dev['replication_ip'],
                             failure_dev['device'])
                            for failure_dev in policy.object_ring.devs
                            if failure_dev
                        ])
                    continue
        if not found_local:
            self.logger.error(
                "Can't find itself in policy with index %d with"
                " ips %s and with port %s in ring file, not"
                " replicating", int(policy), ", ".join(ips), self.port)
        return jobs
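The `delete` flag above is how handoff partitions are detected: `nodes` excludes the local device, so it can only be longer than `len(part_nodes) - 1` when the local device is not one of the partition's primaries, in which case the partition is a handoff to be removed once it has been pushed to its primaries. A small illustration with hypothetical node ids:

# Illustration of the delete/handoff test used above (hypothetical ids).
part_nodes = [{'id': 1}, {'id': 2}, {'id': 3}]   # primaries for the partition

local_dev = {'id': 2}                            # local device is a primary
nodes = [n for n in part_nodes if n['id'] != local_dev['id']]
print(len(nodes) > len(part_nodes) - 1)          # False: keep and sync

local_dev = {'id': 9}                            # local device holds a handoff
nodes = [n for n in part_nodes if n['id'] != local_dev['id']]
print(len(nodes) > len(part_nodes) - 1)          # True: sync, then delete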
Example no. 37
0
    def build_replication_jobs(self, policy, ips, old_dict, new_dict,
                               moving_map):
        """
        Helper function for collect_jobs to build jobs for replication
        using replication style storage policy

        :param policy: swift policy object
        :param ips: the local server ips
        :param old_dict: dictionary with devices from old ring
        :param new_dict: dictionary with devices from new ring
        :param moving_map: the dictionary that contains all the partitions
            that should be moved, their sources and destinations
        """

        jobs = []
        data_dir = get_data_dir(policy)
        devices = Set(map(lambda x: x[1], moving_map.values()))
        partitions = Set(map(lambda x: x[0], moving_map.values()))

        for local_dev in [
                dev for dev in policy.object_ring.devs if
            (dev and is_local_device(ips, self.port, dev['replication_ip'],
                                     dev['replication_port']))
        ]:

            if self.test:
                print local_dev['id']

            if unicode(local_dev['id']) not in devices:
                continue

            dev_path = join(self.devices_dir, local_dev['device'])
            obj_path = join(dev_path, data_dir)
            tmp_path = join(dev_path, get_tmp_dir(policy))
            if self.mount_check and not ismount(dev_path):
                self.logger.warn('%s is not mounted' % local_dev['device'])
                continue
            unlink_older_than(tmp_path, time.time() - self.reclaim_age)

            for partition in os.listdir(obj_path):
                partition = unicode(partition)

                if (partition not in partitions):
                    continue

                try:

                    key = "%s_%s" % (local_dev['id'], partition)
                    if key not in moving_map:
                        continue

                    job_path = join(obj_path, partition)

                    _, source_id, dest_id = moving_map[key]

                    if source_id != unicode(local_dev['id']):
                        continue

                    node = {}
                    replication_ip, replication_device = new_dict[dest_id]
                    node['replication_ip'] = replication_ip
                    node['device'] = replication_device

                    remote_path = os.path.join(self.devices_dir,
                                               node['device'],
                                               self.mover_tmp_dir)

                    jobs.append(
                        dict(path=job_path,
                             device=local_dev['device'],
                             obj_path=obj_path,
                             node=node,
                             policy=policy,
                             partition=partition,
                             remote_path=remote_path))

                except ValueError:
                    continue
                except Exception as e:
                    self.logger.exception(
                        "an %s exception accure at build_replication_jobs" % e)
                    if self.test:
                        print e
        return jobs
Example no. 38
0
 def collect_parts(self, override_devices=None,
                   override_partitions=None):
     """
     Helper for yielding partitions in the top level reconstructor
     """
     override_devices = override_devices or []
     override_partitions = override_partitions or []
     ips = whataremyips()
     for policy in POLICIES:
         if policy.policy_type != EC_POLICY:
             continue
         self._diskfile_mgr = self._df_router[policy]
         self.load_object_ring(policy)
         data_dir = get_data_dir(policy)
         local_devices = itertools.ifilter(
             lambda dev: dev and is_local_device(
                 ips, self.port,
                 dev['replication_ip'], dev['replication_port']),
             policy.object_ring.devs)
         for local_dev in local_devices:
             if override_devices and (local_dev['device'] not in
                                      override_devices):
                 continue
             dev_path = join(self.devices_dir, local_dev['device'])
             obj_path = join(dev_path, data_dir)
             tmp_path = join(dev_path, get_tmp_dir(int(policy)))
             if self.mount_check and not ismount(dev_path):
                 self.logger.warn(_('%s is not mounted'),
                                  local_dev['device'])
                 continue
             unlink_older_than(tmp_path, time.time() -
                               self.reclaim_age)
             if not os.path.exists(obj_path):
                 try:
                     mkdirs(obj_path)
                 except Exception:
                     self.logger.exception(
                         'Unable to create %s' % obj_path)
                 continue
             try:
                 partitions = os.listdir(obj_path)
             except OSError:
                 self.logger.exception(
                     'Unable to list partitions in %r' % obj_path)
                 continue
             for partition in partitions:
                 part_path = join(obj_path, partition)
                 if not (partition.isdigit() and
                         os.path.isdir(part_path)):
                     self.logger.warning(
                         'Unexpected entity in data dir: %r' % part_path)
                     remove_file(part_path)
                     continue
                 partition = int(partition)
                 if override_partitions and (partition not in
                                             override_partitions):
                     continue
                 part_info = {
                     'local_dev': local_dev,
                     'policy': policy,
                     'partition': partition,
                     'part_path': part_path,
                 }
                 yield part_info
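Each yielded `part_info` dictionary carries the local device, the policy, the integer partition and its path. A hypothetical consumer, assuming `reconstructor` is an already configured instance of the class above:

# Hypothetical consumption of the generator above.
for part_info in reconstructor.collect_parts(override_devices=['sdb']):
    print(part_info['policy'], part_info['partition'], part_info['part_path'])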
Example no. 39
0
    def collect_jobs(self):
        """
        Returns a sorted list of jobs (dictionaries) that specify the
        partitions, nodes, etc to be synced.
        """
        jobs = []
        ips = whataremyips()
        for local_dev in [dev for dev in self.object_ring.devs
                          if dev and dev['replication_ip'] in ips and
                          dev['replication_port'] == self.port]:
            dev_path = join(self.devices_dir, local_dev['device'])
            obj_path = join(dev_path, 'objects')
            tmp_path = join(dev_path, 'tmp')
            if self.mount_check and not ismount(dev_path):
                self.logger.warn(_('%s is not mounted'), local_dev['device'])
                continue
            unlink_older_than(tmp_path, time.time() - self.reclaim_age)
            if not os.path.exists(obj_path):
                try:
                    mkdirs(obj_path)
                except Exception:
                    self.logger.exception('ERROR creating %s' % obj_path)
                continue
            for partition in os.listdir(obj_path):
                try:
                    job_path = join(obj_path, partition)
                    if isfile(job_path):
                        # Clean up any (probably zero-byte) files where a
                        # partition should be.
                        self.logger.warning('Removing partition directory '
                                            'which was a file: %s', job_path)
                        os.remove(job_path)
                        continue
                    part_nodes = \
                        self.object_ring.get_part_nodes(int(partition))
                    #### CHANGED CODE ####
                    #f = open("/home/swift/spindowndevices","r")
                    #sdlist = f.read().strip().split("\n")
                    #logging.info("===Spun down devices===:%s",str(sdlist))
                    #f.close()
                    #sddict =dict()
                    #for i in sdlist:
                    #    logging.info("===sdditc===%s",sddict)
                    #    if(i.split(":")[0] in sddict):
                    #        sddict[i.split(":")[0]].append(i.split(":")[1])
                    #    else:
                    #        sddict[i.split(":")[0]] = []
                    #        sddict[i.split(":")[0]].append(i.split(":")[1])
                    #nodes = []
                    #for node in part_nodes:
                    #    if(node['ip'] not in sddict and node['id']!= local_dev['id']):
                    #        nodes.append(node)
                    #    else:
                    #        if(node['device'] not in sddict[node['ip']] and node['id']!=local_dev['id']):
                    #            nodes.append(node)
                    nodes = [node for node in part_nodes
                            if node['id'] != local_dev['id']]

                    logging.info("===Replication nodes===%s",str(nodes))
#                    logging.info("===sddict===%s",str(sddict))
                    #### END CHANGED CODE ####
                    jobs.append(
                        dict(path=job_path,
                             device=local_dev['device'],
                             nodes=nodes,
                             delete=len(nodes) > len(part_nodes) - 1,
                             partition=partition))
                except (ValueError, OSError):
                    continue
        random.shuffle(jobs)
        if self.handoffs_first:
            # Move the handoff parts to the front of the list
            jobs.sort(key=lambda job: not job['delete'])
        self.job_count = len(jobs)
        return jobs
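The handoffs-first ordering works because the sort key `not job['delete']` evaluates to False for handoff (delete) jobs, False sorts before True, and Python's stable sort keeps the previously shuffled order within each group. A small illustration:

# Illustration of the handoffs-first sort used above.
jobs = [{'partition': '1', 'delete': False},
        {'partition': '2', 'delete': True},
        {'partition': '3', 'delete': False}]
jobs.sort(key=lambda job: not job['delete'])
print([j['partition'] for j in jobs])    # ['2', '1', '3']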
Example no. 40
0
    def process_repl(self, policy, ips, override_devices=None,
                     override_partitions=None):
        """
        Helper function for collect_jobs to build jobs for replication
        using replication style storage policy
        """
        jobs = []
        obj_ring = self.get_object_ring(policy.idx)
        data_dir = get_data_dir(policy.idx)
        for local_dev in [dev for dev in obj_ring.devs
                          if (dev
                              and is_local_device(ips,
                                                  self.port,
                                                  dev['replication_ip'],
                                                  dev['replication_port'])
                              and (override_devices is None
                                   or dev['device'] in override_devices))]:
            dev_path = join(self.devices_dir, local_dev['device'])
            obj_path = join(dev_path, data_dir)
            tmp_path = join(dev_path, get_tmp_dir(int(policy)))
            if self.mount_check and not ismount(dev_path):
                self.logger.warn(_('%s is not mounted'), local_dev['device'])
                continue
            unlink_older_than(tmp_path, time.time() - self.reclaim_age)
            if not os.path.exists(obj_path):
                try:
                    mkdirs(obj_path)
                except Exception:
                    self.logger.exception('ERROR creating %s' % obj_path)
                continue
            for partition in os.listdir(obj_path):
                if (override_partitions is not None
                        and partition not in override_partitions):
                    continue

                try:
                    job_path = join(obj_path, partition)
                    part_nodes = obj_ring.get_part_nodes(int(partition))

######################################  CHANGED_CODE  ########################################################
                    f = open("/home/hduser/swift/swift/proxy/controllers/spindowndevices")
                    downlist = f.read().split("\n")
                    f.close()

                    nodes = [node for node in part_nodes
                             if node['id'] != local_dev['id'] and node['device'] not in downlist]
                    print("===Replication nodes===",nodes)

######################################  CHANGED_CODE  ########################################################

                    jobs.append(
                        dict(path=job_path,
                             device=local_dev['device'],
                             obj_path=obj_path,
                             nodes=nodes,
                             delete=len(nodes) > len(part_nodes) - 1,
                             policy_idx=policy.idx,
                             partition=partition,
                             object_ring=obj_ring,
                             region=local_dev['region']))
                except ValueError:
                    continue
        return jobs
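The changed code above reads a plain-text file with one spun-down device name per line and drops those devices from the replication targets. Here is the filter in isolation, with a hypothetical down list and node data standing in for the spindowndevices file:

# Minimal sketch of the spun-down-device filter; in the code above the
# list is read from the spindowndevices file, here it is hypothetical.
downlist = ['sdd']

part_nodes = [{'id': 1, 'device': 'sdb'},
              {'id': 2, 'device': 'sdc'},
              {'id': 3, 'device': 'sdd'}]
local_dev = {'id': 1}

nodes = [node for node in part_nodes
         if node['id'] != local_dev['id'] and node['device'] not in downlist]
print(nodes)    # [{'id': 2, 'device': 'sdc'}]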
Example no. 41
0
    def build_replication_jobs(self, policy, ips, override_devices=None, override_partitions=None):
        """
        Helper function for collect_jobs to build jobs for replication
        using replication style storage policy
        """
        jobs = []
        self.all_devs_info.update([(dev["replication_ip"], dev["device"]) for dev in policy.object_ring.devs if dev])
        data_dir = get_data_dir(policy)
        found_local = False
        for local_dev in [
            dev
            for dev in policy.object_ring.devs
            if (
                dev
                and is_local_device(ips, self.port, dev["replication_ip"], dev["replication_port"])
                and (override_devices is None or dev["device"] in override_devices)
            )
        ]:
            found_local = True
            dev_path = join(self.devices_dir, local_dev["device"])
            obj_path = join(dev_path, data_dir)
            tmp_path = join(dev_path, get_tmp_dir(policy))
            if self.mount_check and not ismount(dev_path):
                self._add_failure_stats(
                    [
                        (failure_dev["replication_ip"], failure_dev["device"])
                        for failure_dev in policy.object_ring.devs
                        if failure_dev
                    ]
                )
                self.logger.warning(_("%s is not mounted"), local_dev["device"])
                continue
            unlink_older_than(tmp_path, time.time() - self.reclaim_age)
            if not os.path.exists(obj_path):
                try:
                    mkdirs(obj_path)
                except Exception:
                    self.logger.exception("ERROR creating %s" % obj_path)
                continue
            for partition in os.listdir(obj_path):
                if override_partitions is not None and partition not in override_partitions:
                    continue

                part_nodes = None
                try:
                    job_path = join(obj_path, partition)
                    part_nodes = policy.object_ring.get_part_nodes(int(partition))
                    nodes = [node for node in part_nodes if node["id"] != local_dev["id"]]
                    jobs.append(
                        dict(
                            path=job_path,
                            device=local_dev["device"],
                            obj_path=obj_path,
                            nodes=nodes,
                            delete=len(nodes) > len(part_nodes) - 1,
                            policy=policy,
                            partition=partition,
                            region=local_dev["region"],
                        )
                    )
                except ValueError:
                    if part_nodes:
                        self._add_failure_stats(
                            [(failure_dev["replication_ip"], failure_dev["device"]) for failure_dev in nodes]
                        )
                    else:
                        self._add_failure_stats(
                            [
                                (failure_dev["replication_ip"], failure_dev["device"])
                                for failure_dev in policy.object_ring.devs
                                if failure_dev
                            ]
                        )
                    continue
        if not found_local:
            self.logger.error(
                "Can't find itself %s with port %s in ring " "file, not replicating", ", ".join(ips), self.port
            )
        return jobs
Example no. 42
0
    def replicate(self, override_devices=None, override_partitions=None,
                  override_policies=None):
        """Run a replication pass"""
        self.start = time.time()
        self.suffix_count = 0
        self.suffix_sync = 0
        self.suffix_hash = 0
        self.replication_count = 0
        self.last_replication_count = -1
        self.partition_times = []
        self.my_replication_ips = self._get_my_replication_ips()
        self.all_devs_info = set()

        stats = eventlet.spawn(self.heartbeat)
        lockup_detector = eventlet.spawn(self.detect_lockups)
        eventlet.sleep()  # Give spawns a cycle

        current_nodes = None
        try:
            self.run_pool = GreenPool(size=self.concurrency)
            jobs = self.collect_jobs(override_devices=override_devices,
                                     override_partitions=override_partitions,
                                     override_policies=override_policies)
            for job in jobs:
                current_nodes = job['nodes']
                if override_devices and job['device'] not in override_devices:
                    continue
                if override_partitions and \
                        job['partition'] not in override_partitions:
                    continue
                dev_path = join(self.devices_dir, job['device'])
                if self.mount_check and not ismount(dev_path):
                    self._add_failure_stats([(failure_dev['replication_ip'],
                                              failure_dev['device'])
                                             for failure_dev in job['nodes']])
                    self.logger.warn(_('%s is not mounted'), job['device'])
                    continue
                if not self.check_ring(job['policy'].object_ring):
                    self.logger.info(_("Ring change detected. Aborting "
                                       "current replication pass."))
                    return
                try:
                    if isfile(job['path']):
                        # Clean up any (probably zero-byte) files where a
                        # partition should be.
                        self.logger.warning(
                            'Removing partition directory '
                            'which was a file: %s', job['path'])
                        os.remove(job['path'])
                        continue
                except OSError:
                    continue
                if job['delete']:
                    self.run_pool.spawn(self.update_deleted, job)
                else:
                    self.run_pool.spawn(self.update, job)
            current_nodes = None
            with Timeout(self.lockup_timeout):
                self.run_pool.waitall()
        except (Exception, Timeout):
            if current_nodes:
                self._add_failure_stats([(failure_dev['replication_ip'],
                                          failure_dev['device'])
                                         for failure_dev in current_nodes])
            else:
                self._add_failure_stats(self.all_devs_info)
            self.logger.exception(_("Exception in top-level replication loop"))
            self.kill_coros()
        finally:
            stats.kill()
            lockup_detector.kill()
            self.stats_line()
            self.stats['attempted'] = self.replication_count
Example no. 43
0
    def build_replication_jobs(self, policy, ips, override_devices=None,
                               override_partitions=None):
        """
        Helper function for collect_jobs to build jobs for replication
        using replication style storage policy
        """
        jobs = []
        df_mgr = self._df_router[policy]
        self.all_devs_info.update(
            [(dev['replication_ip'], dev['device'])
             for dev in policy.object_ring.devs if dev])
        data_dir = get_data_dir(policy)
        found_local = False
        for local_dev in [dev for dev in policy.object_ring.devs
                          if (dev
                              and is_local_device(ips,
                                                  self.port,
                                                  dev['replication_ip'],
                                                  dev['replication_port'])
                              and (override_devices is None
                                   or dev['device'] in override_devices))]:
            found_local = True
            dev_path = join(self.devices_dir, local_dev['device'])
            obj_path = join(dev_path, data_dir)
            tmp_path = join(dev_path, get_tmp_dir(policy))
            if self.mount_check and not ismount(dev_path):
                self._add_failure_stats(
                    [(failure_dev['replication_ip'],
                      failure_dev['device'])
                     for failure_dev in policy.object_ring.devs
                     if failure_dev])
                self.logger.warning(
                    _('%s is not mounted'), local_dev['device'])
                continue
            unlink_older_than(tmp_path, time.time() -
                              df_mgr.reclaim_age)
            if not os.path.exists(obj_path):
                try:
                    mkdirs(obj_path)
                except Exception:
                    self.logger.exception('ERROR creating %s' % obj_path)
                continue
            for partition in os.listdir(obj_path):
                if (override_partitions is not None
                        and partition not in override_partitions):
                    continue

                if (partition.startswith('auditor_status_') and
                        partition.endswith('.json')):
                    # ignore auditor status files
                    continue

                part_nodes = None
                try:
                    job_path = join(obj_path, partition)
                    part_nodes = policy.object_ring.get_part_nodes(
                        int(partition))
                    nodes = [node for node in part_nodes
                             if node['id'] != local_dev['id']]
                    jobs.append(
                        dict(path=job_path,
                             device=local_dev['device'],
                             obj_path=obj_path,
                             nodes=nodes,
                             delete=len(nodes) > len(part_nodes) - 1,
                             policy=policy,
                             partition=partition,
                             region=local_dev['region']))
                except ValueError:
                    if part_nodes:
                        self._add_failure_stats(
                            [(failure_dev['replication_ip'],
                              failure_dev['device'])
                             for failure_dev in nodes])
                    else:
                        self._add_failure_stats(
                            [(failure_dev['replication_ip'],
                              failure_dev['device'])
                             for failure_dev in policy.object_ring.devs
                             if failure_dev])
                    continue
        if not found_local:
            self.logger.error("Can't find itself in policy with index %d with"
                              " ips %s and with port %s in ring file, not"
                              " replicating",
                              int(policy), ", ".join(ips), self.port)
        return jobs