def process_logs(self, logs_to_process, processed_files):
    """
    :param logs_to_process: list of logs to process
    :param processed_files: set of processed files
    :returns: a list of rows of processed data. The first row is the
              column headers; the rest of the rows contain hourly
              aggregate data for the account specified in the row.

    Files processed are added to the processed_files set.

    When a large data structure is no longer needed, it is deleted in
    an effort to conserve memory.
    """
    # map
    processor_args = (self.total_conf, self.logger)
    results = multiprocess_collate(LogProcessor, processor_args,
                                   'process_one_file', logs_to_process,
                                   self.worker_count)

    # reduce
    aggr_data = self.get_aggregate_data(processed_files, results)
    del results

    # group
    # reduce a large number of keys in aggr_data[k] to a small
    # number of output keys
    final_info = self.get_final_info(aggr_data)
    del aggr_data

    # output
    return self.get_output(final_info)
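# A minimal sketch of what the multiprocess_collate helper used above
# might look like, assuming a queue-based fan-out over worker processes.
# The names in_queue/out_queue and the None-sentinel protocol are
# illustrative assumptions, not the actual slogging internals; it also
# assumes a fork-based start method so the worker closure need not pickle.
import multiprocessing


def multiprocess_collate_sketch(processor_klass, processor_args,
                                processor_method, items_to_process,
                                worker_count):
    """Fan items out to worker processes; yield (item, data) results."""
    in_queue = multiprocessing.Queue()
    out_queue = multiprocessing.Queue()

    def worker():
        # each worker builds its own processor instance
        processor = processor_klass(*processor_args)
        method = getattr(processor, processor_method)
        while True:
            item = in_queue.get()
            if item is None:          # sentinel: no more work
                break
            try:
                out_queue.put((item, method(*item)))
            except Exception:
                pass                  # failed items are simply skipped
        out_queue.put(None)           # signal this worker is finished

    workers = [multiprocessing.Process(target=worker)
               for _ in range(worker_count)]
    for w in workers:
        w.start()
    for item in items_to_process:
        in_queue.put(item)
    for _ in workers:
        in_queue.put(None)
    finished = 0
    while finished < worker_count:    # drain until every worker signs off
        result = out_queue.get()
        if result is None:
            finished += 1
        else:
            yield result
    for w in workers:
        w.join()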
def test_multiprocess_collate_errors(self):
    def get_object_data(*a, **kw):
        raise log_common.BadFileDownload()
    orig_get_object_data = log_processor.LogProcessor.get_object_data
    try:
        log_processor.LogProcessor.get_object_data = get_object_data
        proxy_config = self.proxy_config.copy()
        proxy_config.update({
            'log-processor-access': {
                'source_filename_format': '%Y%m%d%H*',
                'class_path':
                    'slogging.access_processor.AccessLogProcessor'
            }})
        processor_args = (proxy_config, DumbLogger())
        item = ('access', 'a', 'c', 'o')
        logs_to_process = [item]
        processor_klass = log_processor.LogProcessor
        results = log_common.multiprocess_collate(processor_klass,
                                                  processor_args,
                                                  'process_one_file',
                                                  logs_to_process, 1,
                                                  DumbLogger())
        results = list(results)
        expected = []
        self.assertEquals(results, expected)
    finally:
        log_processor.LogProcessor._internal_proxy = None
        log_processor.LogProcessor.get_object_data = orig_get_object_data
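# The test above depends on a DumbLogger fixture. A minimal sketch of
# what such a stub might look like (the real test helper may differ):
# a logger whose methods all accept anything and do nothing, so worker
# code can log freely without wiring up real logging.
class DumbLogger(object):
    def _no_op(self, *args, **kwargs):
        pass

    def __getattr__(self, name):
        # info(), error(), debug(), exception(), ... all become no-ops
        return self._no_op


# usage: any call is silently discarded
DumbLogger().error('ignored %s', 'message')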
def run_once(self, *a, **kw):
    self.logger.info(_("Beginning log processing"))
    start = time.time()
    already_processed_files = \
        self.log_processor.load_already_processed_files()
    lookback_hours = kw.get('lookback_hours')
    if lookback_hours:
        self.log_processor.lookback_hours = lookback_hours
    lookback_window = kw.get('lookback_window')
    if lookback_window:
        self.log_processor.lookback_window = lookback_window
    logs_to_process = \
        self.log_processor.get_logs_to_process(already_processed_files)
    if not logs_to_process:
        self.logger.info(_("Log processing done (%0.2f minutes)") %
                         ((time.time() - start) / 60))
        return

    # map
    processor_args = (self.conf, self.logger)
    results = multiprocess_collate(AccessLogDelivery, processor_args,
                                   'process_one_file', logs_to_process,
                                   self.worker_count)

    # reduce
    processed_files = already_processed_files
    files_to_upload = set()
    for item, data in results:
        a, c, o = item
        processed_files.add(o)
        if data:
            files_to_upload.update(data)
    len_working_dir = len(self.working_dir) + 1  # +1 for the trailing '/'
    for filename in files_to_upload:
        target_name = filename[len_working_dir:]
        account, target_name = target_name.split('/', 1)
        some_id = uuid4().hex
        target_name = '%s/%s.log.gz' % (target_name, some_id)
        success = self.log_processor.internal_proxy.upload_file(
            filename, account, self.target_container, target_name)
        if success:
            os.unlink(filename)
            self.logger.debug('Uploaded %s to account %s' %
                              (filename, account))
        else:
            self.logger.error('Could not upload %s to account %s' %
                              (filename, account))

    # cleanup
    success = self.log_processor.save_processed_files(processed_files)
    if not success:
        self.logger.error('Error uploading updated processed files log')
    self.logger.info(_("Log processing done (%0.2f minutes)") %
                     ((time.time() - start) / 60))
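# Worked example of the upload naming scheme in the loop above: the
# account is the first path component under the working directory, and
# the uploaded object name is the remaining path plus a random hex id.
# The working_dir and filename values here are illustrative only.
import os
from uuid import uuid4

working_dir = '/var/log/slogging/working'
filename = os.path.join(working_dir,
                        'AUTH_test/2012/01/15/10/access-00.log')

len_working_dir = len(working_dir) + 1       # +1 for the trailing '/'
target_name = filename[len_working_dir:]     # strip the working dir prefix
account, target_name = target_name.split('/', 1)
target_name = '%s/%s.log.gz' % (target_name, uuid4().hex)

print(account)      # AUTH_test
print(target_name)  # 2012/01/15/10/access-00.log/<32-hex-uuid>.log.gz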