def _set_sentinel_monitoring_file(self, action_pid):
    """
    Create a sentinel monitoring file at the beginning of message processing,
    to know how long an agent is processing a message.
    """
    os.makedirs(self._uida_conf_vars['RABBIT_MONITORING_DIR'], exist_ok=True)

    try:
        with open(self._sentinel_monitoring_file, 'w') as f:
            f.write(
                '# a sentinel file for monitoring message process duration of an IDA postprocessing agent.\n'
                '# this file is automatically destroyed by the agent when message processing has ended.\n'
                'HOST="%s"\n'
                'AGENT="%s"\n'
                'PROCESS_ID=%d\n'
                'ACTION_PID="%s"\n'
                'PROCESSING_STARTED="%s"\n'
                % (self._hostname, self.__class__.__name__, self._process_pid, action_pid, current_time())
            )
    except PermissionError:
        st = os.stat(self._sentinel_monitoring_file)
        owner = pwd.getpwuid(st.st_uid).pw_name
        raise MonitoringFilePermissionError(
            'Unable to create sentinel file at %s: Operation not permitted. The file already existed, '
            'and is owned by: %s. The agent will refuse to process messages until it can create a '
            'monitoring file.' % (self._sentinel_monitoring_file, owner))
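
# For reference, a sentinel file produced by the write above would look roughly like the
# following on disk (all values below are hypothetical examples, not real hosts or pids):
#
#   # a sentinel file for monitoring message process duration of an IDA postprocessing agent.
#   # this file is automatically destroyed by the agent when message processing has ended.
#   HOST="ida.example.org"
#   AGENT="MetadataAgent"
#   PROCESS_ID=12345
#   ACTION_PID="abc123"
#   PROCESSING_STARTED="2024-01-01T00:00:00Z"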

def _copy_to_replication_location(self, node, timestamp=None):
    """
    Copy a single node from the frozen location to the replication location.

    As an extra precaution, checksums are re-calculated for files after copy, and
    compared with the checksums from the initial checksum generation phase.

    Note that for efficiency's sake, during repair of a project the file will not be
    re-copied if a replication already exists and the file size is the same for both
    the frozen file and the already replicated file.
    """
    # resolve the default timestamp at call time; a default of current_time() in the
    # signature would be evaluated only once, when the module is imported.
    if timestamp is None:
        timestamp = current_time()

    src_path = construct_file_path(self._uida_conf_vars, node)
    dest_path = construct_file_path(self._uida_conf_vars, node, replication=True)

    if os.path.exists(dest_path):
        if os.stat(src_path).st_size == os.stat(dest_path).st_size:
            self._logger.debug('Skipping already replicated file: %s' % dest_path)

            # If the file has no replicated timestamp defined, set it to the frozen timestamp
            if not node.get('replicated', None):
                self._logger.debug('Fixing missing replicated timestamp: %s' % node['frozen'])
                node['replicated'] = node['frozen']
                node['_updated'] = True

            return

    try:
        shutil.copy(src_path, dest_path)
    except IOError as e:
        # ENOENT(2): file does not exist, raised also on missing dest parent dir
        if e.errno != errno.ENOENT:
            raise
        os.makedirs(os.path.dirname(dest_path), exist_ok=True)
        shutil.copy(src_path, dest_path)

    try:
        replicated_checksum = self._get_file_checksum(dest_path)
    except Exception as e:
        raise Exception(
            'Error generating checksum for file: %s, pathname: %s, error: %s'
            % (node['pid'], node['pathname'], str(e)))

    # Remove any sha256: URI prefix
    node['checksum'] = self._get_checksum_value(node['checksum'])

    if node['checksum'] != replicated_checksum:
        raise Exception(
            'Checksum mismatch after replication for file: %s, pathname: %s, '
            'frozen_checksum: %s, replicated_checksum: %s'
            % (node['pid'], node['pathname'], node['checksum'], replicated_checksum))

    node['replicated'] = timestamp
    node['_updated'] = True
    node['_copied'] = True
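
# A rough sketch of the node dict fields this copy routine reads and writes, inferred
# from the code above (the field values shown are illustrative only):
#
#   node = {
#       'pid': 'abc123',                       # read: used in log and error messages
#       'pathname': '/Project/some/file.dat',  # read: used in error messages
#       'checksum': 'sha256:...',              # read/written: normalized to a bare checksum value
#       'frozen': '2024-01-01T00:00:00Z',      # read: fallback for a missing replicated timestamp
#       'replicated': None,                    # written: set to the replication timestamp
#       # '_updated' and '_copied' are transient bookkeeping flags set during the copy
#   }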

def _process_replication(self, action):
    """
    Process replication for the nodes in an action. Replication basically means just a
    regular file copy from place a to place b.
    """
    self._logger.info('Processing replication...')

    self._check_replication_root_is_mounted()

    nodes = self._get_nodes_associated_with_action(action)
    replication_start_time = current_time()
    files_copied = 0

    for node in nodes:

        if not node.get('checksum', None):
            raise Exception(
                'Node %s did not have a checksum generated before starting replication.' % node['pid'])

        if node.get('replicated', None) and action['action'] != 'repair':
            self._logger.debug('Node %s already copied, skipping...' % node['pid'])
            continue

        try:
            self._copy_to_replication_location(node, replication_start_time)
        except Exception:
            # on any error during file copy, check whether the error is because the mount
            # point disappeared. if the mount is gone, the method below will raise a
            # ReplicationRootNotMounted exception, and the error will not count towards
            # retry limits.
            self._check_replication_root_is_mounted()

            # if code still executes here, the error was caused by something else...
            # send the error down the usual error handling route.
            raise

        # save each individual successfully replicated node to the IDA db immediately,
        # in case the replication process fails later. this way, already replicated
        # files will not be replicated again during a retry.
        self._save_nodes_to_db([node], fields=['replicated'], updated_only=True)

        if node.get('_copied', False):
            files_copied += 1

    self.last_number_of_files_replicated = files_copied
    self._save_action_completion_timestamp(action, 'replication')
    self._save_action_completion_timestamp(action, 'completed')
    self._logger.info('Replication processing OK')

def _save_action_failed_timestamp(self, action, exception):
    """
    The action has completely failed despite retries: Update the action being processed
    with a failed-timestamp and error description, and save to db. The action will no
    longer be automatically retried.
    """
    self._logger.debug('Saving failed-timestamp to ida db...')

    error_data = {'failed': current_time(), 'error': str(exception)}

    self._update_action_to_db(action, error_data)

    self._logger.info('Marked action with pid %s as failed.' % action['pid'])

    self.last_failed_action = {
        'action_pid': action['pid'],
        'data': error_data
    }

    return True

def _save_action_completion_timestamp(self, action, sub_action_name):
    """
    Update the action being processed with a sub-action's completion timestamp,
    and save to db.
    """
    self._logger.debug('Saving completion timestamp to ida db for: %s...' % sub_action_name)

    # update the existing action record in memory with the timestamp as well for
    # convenience, although not strictly necessary.
    action[sub_action_name] = current_time()

    self._update_action_to_db(action, {sub_action_name: action[sub_action_name]})

    self.last_completed_sub_action = {
        'action_pid': action['pid'],
        sub_action_name: action[sub_action_name],
    }

def _process_metadata_repair(self, action, nodes):
    """
    Re-aggregate technical metadata for the nodes in an action and repair the stored
    metadata, updating the metadata timestamp for each node.
    """
    self._logger.info('Processing metadata repair...')

    if not nodes:
        nodes = self._get_nodes_associated_with_action(action)

    metadata_start_time = current_time()

    for node in nodes:
        node['metadata'] = metadata_start_time

    technical_metadata = self._aggregate_technical_metadata(action, nodes)
    self._repair_metadata(technical_metadata, action)
    self._save_nodes_to_db(nodes, fields=['metadata'])
    self._save_action_completion_timestamp(action, 'metadata')
    self._logger.info('Metadata repair OK')

def _republish_action(self, sub_action_name, exception, queue):
    """
    An action has information about a sub-action's retries saved under the key
    'sub_action_name + _retry_info'. If the key is not present, the sub-action has
    never been retried yet. Note that this retry information is never saved into the
    db, only in the action message that is currently being circulated in the rabbitmq
    exchanges.

    Returns:
    - True when the action is successfully republished
    - False when the republish fails. Results in the message not being ack'ed, and
      the message will return to its original queue.
    """
    self._logger.debug('Republishing failed action...')

    sub_action_retry_info = '%s_retry_info' % sub_action_name
    action = self.rabbitmq_message

    if sub_action_retry_info not in action:
        action[sub_action_retry_info] = {}

    action[sub_action_retry_info]['previous_error'] = str(exception)
    action[sub_action_retry_info]['previous_attempt'] = current_time()

    retry_interval = self._settings['retry_policy'][sub_action_name]['retry_interval']

    if isinstance(exception, HttpApiNotResponding):
        # api-not-responding errors do not count towards retries, so that they may be
        # retried an infinite number of times.
        self._logger.debug(
            'Republishing to %s-failed-waiting due to failed HTTP request.' % sub_action_name)
    else:
        try:
            action[sub_action_retry_info]['retry'] += 1
        except KeyError:
            action[sub_action_retry_info]['retry'] = 1

        self._logger.debug(
            'Next retry #: %d, retry interval of %s-failed-waiting: %d seconds.'
            % (action[sub_action_retry_info]['retry'], sub_action_name, retry_interval))

    try:
        # Publish the action to e.g. checksums-failed-waiting or batch-checksums-failed-waiting,
        # from where it will be dead-lettered to a queue called e.g. metadata-failed or
        # batch-metadata-failed once its specified retry_interval has expired.
        if queue in ('replication', 'replication-failed', 'metadata', 'metadata-failed'):
            self.publish_message(
                action,
                routing_key='%s-failed-waiting' % sub_action_name,
                exchange='actions-failed')
        elif queue in ('batch-replication', 'batch-replication-failed', 'batch-metadata', 'batch-metadata-failed'):
            self.publish_message(
                action,
                routing_key='batch-%s-failed-waiting' % sub_action_name,
                exchange='batch-actions-failed')
    except Exception:
        # could not publish? doesn't matter, the message will return to its queue and be
        # retried at some point.
        self._logger.warning(
            'Action republish failed. Message will return to original queue and be retried in the future.')
        return False

    self._logger.info(
        'Successfully republished action with pid %s with a delay of %d seconds.'
        % (action['pid'], retry_interval))

    return True
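
# For illustration, after a couple of failed attempts the in-flight action message would
# carry retry bookkeeping along these lines (the sub-action name and values are
# hypothetical; as noted in the docstring above, this data is never persisted to the db):
#
#   action['replication_retry_info'] = {
#       'previous_error': 'Checksum mismatch after replication for file: ...',
#       'previous_attempt': '2024-01-01T00:00:00Z',
#       'retry': 2,
#   }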