Example #1
    def _set_sentinel_monitoring_file(self, action_pid):
        """
        Create a sentinel monitoring file at the beginning of message processing, so that
        it is possible to see how long an agent has been processing a message.
        """
        os.makedirs(self._uida_conf_vars['RABBIT_MONITORING_DIR'],
                    exist_ok=True)
        try:
            with open(self._sentinel_monitoring_file, 'w') as f:
                f.write(
                    '# a sentinel file for monitoring message process duration of an IDA postprocessing agent.\n'
                    '# this file is automatically destroyed by the agent when message processing has ended.\n'
                    'HOST="%s"\n'
                    'AGENT="%s"\n'
                    'PROCESS_ID=%d\n'
                    'ACTION_PID="%s"\n'
                    'PROCESSING_STARTED="%s"\n' %
                    (self._hostname, self.__class__.__name__,
                     self._process_pid, action_pid, current_time()))
        except PermissionError:
            st = os.stat(self._sentinel_monitoring_file)
            owner = pwd.getpwuid(st.st_uid).pw_name
            raise MonitoringFilePermissionError(
                'Unable to create sentinel file at %s: Operation not permitted. The file already existed, '
                'and is owned by: %s. The agent will refuse to process messages until it can create a '
                'monitoring file.' % (self._sentinel_monitoring_file, owner))
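The sentinel file is written as simple KEY="value" lines, so a monitoring job outside the agent can inspect it while a message is being processed. Below is a minimal sketch of such a reader; the read_sentinel_file helper and the use of the file's mtime as the processing start time are illustrative assumptions, not part of the agent.

import os
import time

def read_sentinel_file(path):
    """Parse the KEY="value" lines written by _set_sentinel_monitoring_file."""
    info = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            # skip the explanatory comment lines and any blank lines
            if not line or line.startswith('#'):
                continue
            key, _, value = line.partition('=')
            info[key] = value.strip('"')
    # the file is created when message processing starts, so its mtime
    # approximates how long the agent has been busy with the current message.
    info['elapsed_seconds'] = time.time() - os.path.getmtime(path)
    return info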
Example #2
    def _copy_to_replication_location(self, node, timestamp=None):
        """
        Copy a single node from frozen location to replication location.

        As an extra precaution, checksums are re-calculated for files after copy,
        and compared with the checksums of the initial checksum generation phase.

        Note that for efficiency's sake, during repair of a project, the file will not be re-copied
        if a replication already exists and the file size is the same for both the frozen file and
        the already replicated file.
        """
        # evaluate the default at call time; a default of current_time() in the signature
        # would be evaluated only once, when the method is defined.
        if timestamp is None:
            timestamp = current_time()

        src_path = construct_file_path(self._uida_conf_vars, node)
        dest_path = construct_file_path(self._uida_conf_vars,
                                        node,
                                        replication=True)

        if os.path.exists(dest_path):
            if os.stat(src_path).st_size == os.stat(dest_path).st_size:
                self._logger.debug('Skipping already replicated file: %s' %
                                   dest_path)
                # If the file has no replicated timestamp defined, set it to the frozen timestamp
                if not node.get('replicated', None):
                    self._logger.debug(
                        'Fixing missing replicated timestamp: %s' %
                        node['frozen'])
                    node['replicated'] = node['frozen']
                    node['_updated'] = True
                return

        try:
            shutil.copy(src_path, dest_path)
        except IOError as e:
            # ENOENT(2): file does not exist, raised also on missing dest parent dir
            if e.errno != errno.ENOENT:
                raise
            os.makedirs(os.path.dirname(dest_path), exist_ok=True)
            shutil.copy(src_path, dest_path)

        try:
            replicated_checksum = self._get_file_checksum(dest_path)
        except Exception as e:
            raise Exception(
                'Error generating checksum for file: %s, pathname: %s, error: %s'
                % (node['pid'], node['pathname'], str(e)))

        # Remove any sha256: URI prefix
        node['checksum'] = self._get_checksum_value(node['checksum'])

        if node['checksum'] != replicated_checksum:
            raise Exception(
                'Checksum mismatch after replication for file: %s, pathname: %s, frozen_checksum: %s, replicated_checksum: %s'
                % (node['pid'], node['pathname'], node['checksum'],
                   replicated_checksum))

        node['replicated'] = timestamp
        node['_updated'] = True
        node['_copied'] = True
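The copy relies on two helpers that are not included in these examples: self._get_file_checksum and self._get_checksum_value. A minimal sketch of what they could look like, assuming the checksums are sha256 hex digests (as the 'sha256:' prefix handling above suggests); the standalone function names are illustrative.

import hashlib

def get_checksum_value(checksum):
    """Strip a possible 'sha256:' URI prefix, leaving only the hex digest."""
    prefix = 'sha256:'
    return checksum[len(prefix):] if checksum.startswith(prefix) else checksum

def get_file_checksum(pathname, block_size=65536):
    """Compute the sha256 hex digest of a file, reading it in chunks."""
    digest = hashlib.sha256()
    with open(pathname, 'rb') as f:
        for block in iter(lambda: f.read(block_size), b''):
            digest.update(block)
    return digest.hexdigest()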
Example #3
    def _process_replication(self, action):
        """
        Process replication for nodes in action.

        Replication is essentially a regular file copy from the frozen location to the replication location.
        """
        self._logger.info('Processing replication...')

        self._check_replication_root_is_mounted()
        nodes = self._get_nodes_associated_with_action(action)
        replication_start_time = current_time()
        files_copied = 0

        for node in nodes:

            if not node.get('checksum', None):
                raise Exception(
                    'Node %s did not have a checksum generated before starting replication.'
                    % node['pid'])

            if node.get('replicated', None) and action['action'] != 'repair':
                self._logger.debug('Node %s already copied, skipping...' %
                                   node['pid'])
                continue

            try:
                self._copy_to_replication_location(node,
                                                   replication_start_time)
            except Exception:
                # on any error during file copy, check whether the error occurred because
                # the mount point disappeared. if so, the method below raises a
                # ReplicationRootNotMounted exception, and the error will not count
                # towards retry limits.
                self._check_replication_root_is_mounted()

                # if code still executes here, the error was because of something else...
                # send the error down the usual error handling route.
                raise

            # save each individual successfully replicated node to IDA db immediately,
            # in case the replication process fails later. this way, already replicated
            # files will not be replicated again during a retry.
            self._save_nodes_to_db([node],
                                   fields=['replicated'],
                                   updated_only=True)
            if node.get('_copied', False):
                files_copied += 1

        self.last_number_of_files_replicated = files_copied
        self._save_action_completion_timestamp(action, 'replication')
        self._save_action_completion_timestamp(action, 'completed')
        self._logger.info('Replication processing OK')
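self._check_replication_root_is_mounted is called above but not shown in these examples. A minimal sketch of the idea, assuming the replication root is a dedicated mount point whose path comes from configuration; the standalone function signature is illustrative, while the ReplicationRootNotMounted name comes from the comment in the except block.

import os

class ReplicationRootNotMounted(Exception):
    """Raised when the replication root directory is not a mount point."""
    pass

def check_replication_root_is_mounted(replication_root):
    # if the mount has disappeared, refuse to continue; the caller treats this
    # error separately so that it does not count towards retry limits.
    if not os.path.ismount(replication_root):
        raise ReplicationRootNotMounted(
            'Replication root %s is not mounted.' % replication_root)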
Example #4
    def _save_action_failed_timestamp(self, action, exception):
        """
        The action has completely failed despite retries: Update the action
        being processed with a failed-timestamp and error description, and save to db.

        The action will no longer be automatically retried.
        """
        self._logger.debug('Saving failed-timestamp to ida db...')
        error_data = {'failed': current_time(), 'error': str(exception)}
        self._update_action_to_db(action, error_data)
        self._logger.info('Marked action with pid %s as failed.' %
                          action['pid'])
        self.last_failed_action = {
            'action_pid': action['pid'],
            'data': error_data
        }
        return True
Example #5
    def _save_action_completion_timestamp(self, action, sub_action_name):
        """
        Update the action being processed with a sub-action's completion timestamp,
        and save to db.
        """
        self._logger.debug('Saving completion timestamp to ida db for: %s...' %
                           sub_action_name)

        # update existing action record in memory with the timestamp as well for convenience,
        # although not strictly necessary.
        action[sub_action_name] = current_time()

        self._update_action_to_db(action,
                                  {sub_action_name: action[sub_action_name]})
        self.last_completed_sub_action = {
            'action_pid': action['pid'],
            sub_action_name: action[sub_action_name],
        }
Example #6
    def _process_metadata_repair(self, action, nodes):
        """
        Re-aggregate technical metadata for the nodes associated with a repair action,
        repair the metadata, and save the new metadata timestamps to the db.
        """
        self._logger.info('Processing metadata repair...')

        if not nodes:
            nodes = self._get_nodes_associated_with_action(action)

        metadata_start_time = current_time()

        for node in nodes:
            node['metadata'] = metadata_start_time

        technical_metadata = self._aggregate_technical_metadata(action, nodes)

        self._repair_metadata(technical_metadata, action)

        self._save_nodes_to_db(nodes, fields=['metadata'])
        self._save_action_completion_timestamp(action, 'metadata')
        self._logger.info('Metadata repair OK')
Example #7
    def _republish_action(self, sub_action_name, exception, queue):
        """
        An action has information about a sub-action's retries saved under the key
        '<sub_action_name>_retry_info'. If the key is not present, the sub-action has
        never been retried.

        Note that this retry information is never saved into the db, only in the action that is
        currently being circulated in the rabbitmq exchanges.

        Returns:
        - True when the action is successfully republished.
        - False when republishing fails. The message is then not ack'ed, and it
          will return to its original queue.
        """
        self._logger.debug('Republishing failed action...')
        sub_action_retry_info = '%s_retry_info' % sub_action_name

        action = self.rabbitmq_message

        if sub_action_retry_info not in action:
            action[sub_action_retry_info] = {}

        action[sub_action_retry_info]['previous_error'] = str(exception)
        action[sub_action_retry_info]['previous_attempt'] = current_time()
        retry_interval = self._settings['retry_policy'][sub_action_name][
            'retry_interval']

        if isinstance(exception, HttpApiNotResponding):
            # api-not-responding errors do not count towards retries, so that they may be
            # retried an infinite number of times.
            self._logger.debug(
                'Republishing to %s-failed-waiting due to failed HTTP request.'
                % sub_action_name)
        else:
            try:
                action[sub_action_retry_info]['retry'] += 1
            except KeyError:
                action[sub_action_retry_info]['retry'] = 1

            self._logger.debug(
                'Next retry #: %d, retry interval of %s-failed-waiting: %d seconds.'
                % (action[sub_action_retry_info]['retry'], sub_action_name,
                   retry_interval))

        try:
            # Publish the action to e.g. metadata-failed-waiting or batch-metadata-failed-waiting,
            # from where it will be dead-lettered to a queue called metadata-failed or batch-metadata-failed
            # once its specified retry_interval has expired.
            if queue in ('replication', 'replication-failed', 'metadata', 'metadata-failed'):
                self.publish_message(action,
                                     routing_key='%s-failed-waiting' %
                                     sub_action_name,
                                     exchange='actions-failed')
            elif queue in ('batch-replication', 'batch-replication-failed',
                           'batch-metadata', 'batch-metadata-failed'):
                self.publish_message(action,
                                     routing_key='batch-%s-failed-waiting' %
                                     sub_action_name,
                                     exchange='batch-actions-failed')

        except Exception:
            # could not publish? doesn't matter, the message will return to its queue and be retried
            # at some point.
            self._logger.warning(
                'Action republish failed. Message will return to original queue and be retried in the future.'
            )
            return False

        self._logger.info(
            'Successfully republished action with pid %s with a delay of %d seconds.'
            % (action['pid'], retry_interval))
        return True
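The delayed retry described in the comment above is a standard RabbitMQ dead-letter pattern: a *-failed-waiting queue holds the message for retry_interval seconds via a per-queue TTL, after which RabbitMQ dead-letters it onwards. Below is a minimal sketch of declaring such a queue with pika; the connection details, the 'actions' target exchange and the 'metadata' sub-action are illustrative assumptions, while 'actions-failed' and the '%s-failed-waiting' routing key appear in the code above.

import pika

connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
channel = connection.channel()

retry_interval = 300  # seconds, e.g. settings['retry_policy']['metadata']['retry_interval']

channel.exchange_declare(exchange='actions-failed', exchange_type='direct', durable=True)
channel.exchange_declare(exchange='actions', exchange_type='direct', durable=True)

# messages published with routing key 'metadata-failed-waiting' sit in this queue
# until the TTL expires, then RabbitMQ dead-letters them to the 'actions' exchange
# with routing key 'metadata-failed', where a consumer picks them up for retry.
channel.queue_declare(
    queue='metadata-failed-waiting',
    durable=True,
    arguments={
        'x-message-ttl': retry_interval * 1000,  # TTL is given in milliseconds
        'x-dead-letter-exchange': 'actions',
        'x-dead-letter-routing-key': 'metadata-failed',
    })
channel.queue_bind(queue='metadata-failed-waiting',
                   exchange='actions-failed',
                   routing_key='metadata-failed-waiting')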