Example #1
    def _commit_copies(self, cycle_number, inventory, copy_list, comment):
        """
        @param cycle_number  Cycle number.
        @param inventory     Dynamo inventory.
        @param copy_list     List of dataset replicas to be created. This is a proposal;
                             the final state depends on whether the copy scheduling succeeds.
        @param comment       Comment to be passed to the copy interface.
        """

        signal_blocker = SignalBlocker(logger = LOG)

        by_site = collections.defaultdict(list)
        for replica in copy_list:
            by_site[replica.site].append(replica)

        for site in sorted(by_site.iterkeys(), key = lambda s: s.name):
            replicas = by_site[site]

            LOG.info('Scheduling copy of %d replicas to %s.', len(replicas), site.name)

            with signal_blocker:
                history_record = self.history.make_cycle_entry(cycle_number, site)

                scheduled_replicas = self.copy_op[site.name].schedule_copies(replicas, history_record.operation_id, comments = comment)

                for replica in scheduled_replicas:
                    history_record.replicas.append(CopiedReplica(replica.dataset.name, replica.size(physical = False), HistoryRecord.ST_ENROUTE))

                    inventory.update(replica)
                    for block_replica in replica.block_replicas:
                        inventory.update(block_replica)

                self.history.update_entry(history_record)

                total_size = sum(r.size for r in history_record.replicas)
                LOG.info('Scheduled copy of %.1f TB to %s.', total_size * 1.e-12, site.name)
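
The copy interface behind self.copy_op is not shown in these examples. Judging from the call above, a stub satisfying the same contract might look like the following sketch; DummyCopyInterface and its body are hypothetical, only the signature is taken from the call site.

    class DummyCopyInterface(object):
        """Hypothetical stand-in for the per-site copy interface used above."""

        def schedule_copies(self, replicas, operation_id, comments = ''):
            # A real implementation would submit transfer requests to the storage
            # backend here and return only the replicas it managed to schedule
            scheduled = []
            for replica in replicas:
                scheduled.append(replica)
            return scheduled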
Example #2
    def _update_inventory(self, update_commands):
        # Apply the updates to this server's own inventory first
        self.manager.set_status(ServerHost.STAT_UPDATING)

        with SignalBlocker():
            self._exec_updates(update_commands)

        self.manager.set_status(ServerHost.STAT_ONLINE)

        # Then propagate the updates to the other servers
        self.manager.send_updates(update_commands)
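
SignalBlocker itself is not defined in these examples. A minimal sketch of such a context manager, assuming it simply defers SIGINT and SIGTERM until the protected block exits, could be:

    import os
    import signal

    class SignalBlocker(object):
        """Defer SIGINT/SIGTERM while inside the context (minimal sketch;
        the actual Dynamo implementation may differ)."""

        def __init__(self, signals = (signal.SIGINT, signal.SIGTERM), logger = None):
            self.signals = signals
            self.logger = logger
            self._received = []
            self._original = {}

        def _handler(self, signum, frame):
            # Record the signal instead of acting on it immediately
            if self.logger is not None:
                self.logger.info('Deferring signal %d until the critical section exits.', signum)
            self._received.append(signum)

        def __enter__(self):
            self._received = []
            for signum in self.signals:
                self._original[signum] = signal.signal(signum, self._handler)
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            # Restore the original handlers, then re-deliver anything deferred
            for signum, handler in self._original.items():
                signal.signal(signum, handler)
            for signum in self._received:
                os.kill(os.getpid(), signum)
            return False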
Example #3
    def _commit_copies(self, cycle_number, inventory, copy_list, comment):
        """
        @param cycle_number  Cycle number.
        @param inventory     Dynamo inventory.
        @param copy_list     Flat list of dataset or block replicas.
        @param comment       Comment to be passed to the copy interface.
        """

        signal_blocker = SignalBlocker(logger = LOG)

        group = inventory.groups[self.policy.group_name]

        by_site = collections.defaultdict(list)
        for replica in copy_list:
            by_site[replica.site].append(replica)

        for site in sorted(by_site.iterkeys(), key = lambda s: s.name):
            replicas = by_site[site]

            LOG.info('Scheduling copy of %d replicas to %s.', len(replicas), site.name)

            with signal_blocker:
                copy_mapping = self.copy_op.schedule_copies(replicas, comments = comment)
        
                # It would be better if the mapping from replicas to items were kept somewhere;
                # then we could avoid creating yet another replica object below, and each
                # plugin could decide which group to create its replicas in.
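                # copy_mapping maps copy_id -> (approved, site, items); items are
                # Dataset or Block objects (structure inferred from this loop)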
                for copy_id, (approved, site, items) in copy_mapping.iteritems():
                    dataset_list = set()
                    size = 0
                    for item in items:
                        size += item.size

                        if type(item) is Dataset:
                            dataset_list.add(item)
                            if approved:
                                replica = new_replica_from_dataset(item, site, group)
                                inventory.update(replica)

                        else:
                            dataset_list.add(item.dataset)
                            if approved:
                                if site.find_dataset_replica(item.dataset) is None:
                                    replica = new_replica_from_dataset(item.dataset, site, group)
                                    inventory.update(replica)

                                replica = new_replica_from_block(item, site, group)
                                inventory.update(replica)
    
                    self.history.make_copy_entry(cycle_number, site, copy_id, approved, dataset_list, size)
Example #4
    def _commit_deletions(self, cycle_number, inventory, deleted, comment):
        """
        @param cycle_number  Cycle number.
        @param inventory     Global (original) inventory
        @param deleted       {dataset_replica: {condition_id: set(block_replicas)}}
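                             Maps each dataset replica proposed for deletion to the
                             block replicas matched by each policy condition.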
        @param comment       Comment to be passed to the deletion interface.
        """

        signal_blocker = SignalBlocker(logger=LOG)

        # get the original replicas from the inventory and organize them by site
        deletions_by_site = collections.defaultdict(
            list)  # {site: [(dataset_replica, block_replicas)]}

        for replica, matches in deleted.iteritems():
            site = inventory.sites[replica.site.name]

            original_replica = inventory.datasets[
                replica.dataset.name].find_replica(site)
            original_block_replicas = dict(
                (br.block.name, br) for br in original_replica.block_replicas)

            all_block_replicas = set()
            for block_replicas in matches.itervalues():
                for block_replica in block_replicas:
                    all_block_replicas.add(
                        original_block_replicas[block_replica.block.name])

            if not replica.growing and all_block_replicas == original_replica.block_replicas:
                # if we are deleting all block replicas and the replica is marked as not growing, delete the DatasetReplica
                deletions_by_site[site].append((original_replica, None))
            else:
                # otherwise delete only the BlockReplicas
                deletions_by_site[site].append(
                    (original_replica, list(all_block_replicas)))

        # now schedule deletions for each site
        for site in sorted(deletions_by_site.iterkeys(), key=lambda s: s.name):
            site_deletion_list = deletions_by_site[site]

            LOG.info('Deleting %d replicas from %s.', len(site_deletion_list),
                     site.name)

            null_group = inventory.groups[None]

            # Block interruptions until deletion is executed and recorded
            with signal_blocker:
                history_record = self.history.make_cycle_entry(
                    cycle_number, site)

                scheduled_replicas = self.deletion_op.schedule_deletions(
                    site_deletion_list,
                    history_record.operation_id,
                    comments=comment)

                for replica, block_replicas in scheduled_replicas:
                    deleted_size = 0

                    if block_replicas is None:
                        replica.growing = False
                        replica.group = null_group
                        # replica is a clone -> use inventory.update instead of inventory.register_update
                        inventory.update(replica)

                        original_replica = replica.site.find_dataset_replica(
                            replica.dataset)
                        for block_replica in original_replica.block_replicas:
                            block_replica.group = null_group
                            inventory.register_update(block_replica)

                        deleted_size += original_replica.size()

                    else:
                        for block_replica in block_replicas:
                            block_replica.group = null_group
                            inventory.update(block_replica)

                            deleted_size += block_replica.size

                    history_record.replicas.append(
                        DeletedReplica(replica.dataset.name, deleted_size))

                self.history.update_entry(history_record)

                total_size = sum(r.size for r in history_record.replicas)
                LOG.info('Done deleting %.1f TB from %s.', total_size * 1.e-12,
                         site.name)
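
The branching above reduces to a small rule: a whole DatasetReplica may be deleted only when it is not growing and every one of its block replicas is requested. A hypothetical helper (deletion_target is not part of the source) states the rule in isolation:

    def deletion_target(replica, requested_block_replicas):
        # A replica still marked as growing must never be deleted wholesale,
        # because new blocks may yet be attached to it
        if not replica.growing and set(requested_block_replicas) == set(replica.block_replicas):
            return (replica, None)  # delete the entire DatasetReplica
        else:
            return (replica, list(requested_block_replicas))  # delete only these blocks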
Example #5
    def run(self):
        """
        Infinite-loop main body of the daemon.
        Step 1: Poll the registry for one uploaded script.
        Step 2: If a script is found, check the authorization of the script.
        Step 3: Spawn a child process for the script.
        Step 4: Collect updates from the write-enabled child process.
        Step 5: Collect completed child processes.
        Step 6: Sleep briefly before the next iteration.
        """

        LOG.info('Started dynamo daemon.')

        child_processes = []

        # There can only be one child process with write access at a time. We pass it a Queue to communicate back.
        # writing_process is a tuple (proc, queue) when some process is writing
        writing_process = (None, None)
        # We need to buffer updated and deleted objects from the child process to avoid filling up the pipe
        updated_objects = []
        deleted_objects = []

        signal_blocker = SignalBlocker(logger=LOG)

        try:
            LOG.info('Start polling for executables.')

            first_wait = True
            sleep_time = 0

            while True:
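                # Release the `action` table lock acquired near the bottom of the previous iteration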
                self.registry.backend.query('UNLOCK TABLES')

                ## Step 4 (easier to do here because we use "continue"s)
                if writing_process[1] is not None:
                    terminated = self.collect_updates(writing_process[1],
                                                      updated_objects,
                                                      deleted_objects)
                    if terminated:
                        writing_process[1].close()
                        writing_process = (writing_process[0], None)

                ## Step 5 (easier to do here because we use "continue"s)
                completed_processes = self.collect_processes(child_processes)

                for proc, status in completed_processes:
                    if proc is not writing_process[0]:
                        continue

                    # drain the queue
                    if writing_process[1] is not None:
                        self.collect_updates(writing_process[1],
                                             updated_objects,
                                             deleted_objects,
                                             drain=True)
                        writing_process[1].close()

                    writing_process = (None, None)

                    if status != 'done':
                        # Drop any buffered updates from a process that did not finish cleanly
                        updated_objects = []
                        deleted_objects = []
                        continue

                    # The child process may send us the list of updated/deleted objects
                    # Block system signals and get update done
                    with signal_blocker:
                        for obj in updated_objects:
                            self.inventory.update(obj,
                                                  write=True,
                                                  changelog=CHANGELOG)
                        for obj in deleted_objects:
                            CHANGELOG.info('Deleting %s', str(obj))
                            self.inventory.delete(obj, write=True)

                    updated_objects = []
                    deleted_objects = []

                ## Step 6 (easier to do here because we use "continue"s)
                time.sleep(sleep_time)

                ## Step 1: Poll
                LOG.debug('Polling for executables.')

                # UNLOCK statement at the top of the while loop
                self.registry.backend.query('LOCK TABLES `action` WRITE')

                sql = 'SELECT s.`id`, s.`write_request`, s.`title`, s.`path`, s.`args`, s.`user_id`, u.`name`'
                sql += ' FROM `action` AS s INNER JOIN `users` AS u ON u.`id` = s.`user_id`'
                sql += ' WHERE s.`status` = \'new\''
                if writing_process[0] is not None:
                    # don't launch another write-requesting executable while one is already running
                    sql += ' AND s.`write_request` = 0'
                sql += ' ORDER BY s.`timestamp` LIMIT 1'
                result = self.registry.backend.query(sql)

                if len(result) == 0:
                    if len(child_processes) == 0 and first_wait:
                        LOG.info('Waiting for executables.')
                        first_wait = False

                    sleep_time = 0.5

                    LOG.debug('No executable found, sleeping for %.1f seconds.',
                              sleep_time)

                    continue

                ## Step 2: If a script is found, check the authorization of the script.
                exec_id, write_request, title, path, args, user_id, user_name = result[0]

                first_wait = True
                sleep_time = 0

                if not os.path.exists(path + '/exec.py'):
                    LOG.info(
                        'Executable %s from user %s (write request: %s) not found.',
                        title, user_name, write_request)
                    self.registry.backend.query(
                        'UPDATE `action` SET `status` = %s WHERE `id` = %s',
                        'notfound', exec_id)
                    continue

                LOG.info(
                    'Found executable %s from user %s (write request: %s)',
                    title, user_name, write_request)

                proc_args = (path, args)

                if write_request:
                    if not self.check_write_auth(title, user_id, path):
                        LOG.warning(
                            'Executable %s from user %s is not authorized for write access.',
                            title, user_name)
                        # TODO: send a notification message to the user

                        self.registry.backend.query(
                            'UPDATE `action` SET `status` = %s WHERE `id` = %s',
                            'authfailed', exec_id)
                        continue

                    queue = multiprocessing.Queue()
                    proc_args += (queue, )

                ## Step 3: Spawn a child process for the script
                self.registry.backend.query(
                    'UPDATE `action` SET `status` = %s WHERE `id` = %s', 'run',
                    exec_id)

                proc = multiprocessing.Process(target=self._run_one,
                                               name=title,
                                               args=proc_args)
                child_processes.append((exec_id, proc, user_name, path))

                proc.daemon = True
                proc.start()

                if write_request:
                    writing_process = (proc, proc_args[-1])

                LOG.info('Started executable %s (%s) from user %s (PID %d).',
                         title, path, user_name, proc.pid)

        except KeyboardInterrupt:
            LOG.info('Server process was interrupted.')

        except:
            # log the exception
            LOG.warning(
                'Exception in server process. Terminating all child processes.'
            )
            raise

        finally:
            # If the main process was interrupted by Ctrl+C:
            # Ctrl+C will pass SIGINT to all child processes (if this process is the head of the
            # foreground process group). In this case calling terminate() will duplicate signals
            # in the child. Child processes must therefore always ignore SIGINT and be killed
            # only by the SIGTERM sent from the terminate() call below.

            self.registry.backend.query('UNLOCK TABLES')

            for exec_id, proc, user_name, path in child_processes:
                LOG.warning('Terminating %s (%s) requested by %s (PID %d)',
                            proc.name, path, user_name, proc.pid)
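                # Temporarily regain root privileges so the signal reaches children
                # running under other users' accounts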
                uid = os.geteuid()
                os.seteuid(0)
                proc.terminate()
                os.seteuid(uid)
                proc.join(5)
                if proc.is_alive():
                    LOG.warning(
                        'Child process %d did not return after 5 seconds.',
                        proc.pid)

                self.registry.backend.query(
                    'UPDATE `action` SET `status` = \'killed\' WHERE `id` = %s',
                    exec_id)

            if writing_process[1] is not None:
                writing_process[1].close()
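
collect_updates and collect_processes are defined elsewhere. A sketch of what collect_updates is assumed to do, given how it is called above; the ('update' / 'delete' / 'done', obj) message format is hypothetical:

    import Queue  # "queue" in Python 3

    def collect_updates(self, update_queue, updated_objects, deleted_objects, drain = False):
        """Pull buffered messages off the writing child's queue; return True
        once the child signals completion (minimal sketch)."""
        while True:
            try:
                cmd, obj = update_queue.get(block = False)
            except Queue.Empty:
                # With drain = True the child has already exited, so an empty
                # queue means there is nothing more to collect
                return drain
            if cmd == 'update':
                updated_objects.append(obj)
            elif cmd == 'delete':
                deleted_objects.append(obj)
            elif cmd == 'done':
                return True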
Example #6
    def _commit_deletions(self, cycle_number, inventory, deleted, comment):
        """
        @param cycle_number  Cycle number.
        @param inventory     Global (original) inventory
        @param deleted       {dataset_replica: {condition_id: set(block_replicas)}}
        @param comment       Comment to be passed to the deletion interface.
        """

        signal_blocker = SignalBlocker(logger=LOG)

        # organize the replicas by site
        deletions_by_site = collections.defaultdict(
            list)  # {site: [(dataset_replica, block_replicas)]}
        for replica, matches in deleted.iteritems():
            all_block_replicas = set()
            for condition_id, block_replicas in matches.iteritems():
                all_block_replicas.update(block_replicas)

            deletions_by_site[replica.site].append(
                (replica, all_block_replicas))

        # now schedule deletions for each site
        for site in sorted(deletions_by_site.iterkeys(), key=lambda s: s.name):
            site_deletion_list = deletions_by_site[site]

            LOG.info('Deleting %d replicas from %s.', len(site_deletion_list),
                     site.name)

            flat_list = []
            for replica, block_replicas in site_deletion_list:
                if set(block_replicas) == replica.block_replicas:
                    flat_list.append(replica)
                else:
                    flat_list.extend(block_replicas)

            # Block interruptions until deletion is executed and recorded
            with signal_blocker:
                deletion_mapping = self.deletion_op.schedule_deletions(
                    flat_list, comments=comment)

                total_size = 0

                for deletion_id, (approved, site,
                                  items) in deletion_mapping.iteritems():
                    # Reset ownership of the block replicas in the approved deletions.
                    # Because the replicas in partition_repository have already been modified
                    # during the iterative deletion, we look up the original replicas in the
                    # global inventory.

                    size = 0
                    datasets = set()
                    for item in items:
                        if type(item) is Dataset:
                            dataset = inventory.datasets[item.name]
                            replica = dataset.find_replica(site.name)
                            for block_replica in replica.block_replicas:
                                size += block_replica.size
                                if approved:
                                    block_replica.group = inventory.groups[
                                        None]
                                    inventory.update(block_replica)
                        else:
                            dataset = inventory.datasets[item.dataset.name]
                            block = dataset.find_block(item.name)
                            replica = block.find_replica(site.name)
                            if replica is None:
                                LOG.error('Could not find %s:%s in inventory',
                                          site.name, block.full_name())
                                raise RuntimeError('Inventory inconsistency: block replica not found')

                            size += replica.size
                            if approved:
                                replica.group = inventory.groups[None]
                                inventory.update(replica)

                        datasets.add(dataset)

                    self.history.make_deletion_entry(cycle_number, site,
                                                     deletion_id, approved,
                                                     datasets, size)
                    total_size += size

            LOG.info('Done deleting %.1f TB from %s.', total_size * 1.e-12,
                     site.name)