Code example #1
    def process_logs(self, logs_to_process, processed_files):
        """
        :param logs_to_process: list of logs to process
        :param processed_files: set of processed files

        :returns: returns a list of rows of processed data.

            The first row is the column headers. The rest of the rows contain
            hourly aggregate data for the account specified in the row.

            Files processed are added to the processed_files set.

            When a large data structure is no longer needed, it is deleted in
            an effort to conserve memory.
        """

        # map
        processor_args = (self.total_conf, self.logger)
        results = multiprocess_collate(LogProcessor, processor_args,
                                       'process_one_file', logs_to_process,
                                       self.worker_count)

        # reduce
        aggr_data = self.get_aggregate_data(processed_files, results)
        del results

        # group
        # reduce a large number of keys in aggr_data[k] to a small
        # number of output keys
        final_info = self.get_final_info(aggr_data)
        del aggr_data

        # output
        return self.get_output(final_info)
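
The docstring describes the return value as a list of rows, with the column headers first and one hourly aggregate row per account after that. As a minimal sketch of consuming that output (write_report and the list-of-lists shape are assumptions based on the docstring, not code from the project), the rows could be written out as CSV:

import csv

def write_report(rows, path):
    # rows[0] is assumed to be the column headers; the remaining rows are
    # the per-account hourly aggregates returned by process_logs()
    with open(path, 'w', newline='') as f:
        csv.writer(f).writerows(rows)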
Code example #2
File: test_log_common.py  Project: leoh0/slogging
    def test_multiprocess_collate_errors(self):
        def get_object_data(*a, **kw):
            raise log_common.BadFileDownload()

        orig_get_object_data = log_processor.LogProcessor.get_object_data
        try:
            log_processor.LogProcessor.get_object_data = get_object_data
            proxy_config = self.proxy_config.copy()
            proxy_config.update({
                'log-processor-access': {
                    'source_filename_format': '%Y%m%d%H*',
                    'class_path':
                    'slogging.access_processor.AccessLogProcessor'
                }
            })
            processor_args = (proxy_config, DumbLogger())
            item = ('access', 'a', 'c', 'o')
            logs_to_process = [item]
            processor_klass = log_processor.LogProcessor
            results = log_common.multiprocess_collate(processor_klass,
                                                      processor_args,
                                                      'process_one_file',
                                                      logs_to_process, 1,
                                                      DumbLogger())
            results = list(results)
            expected = []
            self.assertEquals(results, expected)
        finally:
            log_processor.LogProcessor._internal_proxy = None
            log_processor.LogProcessor.get_object_data = orig_get_object_data
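
This test stubs out LogProcessor.get_object_data with a function that raises BadFileDownload and then checks that multiprocess_collate yields nothing for the failed item; the try/finally restores the original method (and resets _internal_proxy) so later tests are unaffected. The same swap-and-restore pattern can be written with the standard-library mock helper. This is only a sketch of the pattern for the same test class, not project code, and it assumes Python 3's unittest.mock:

from unittest import mock

def test_multiprocess_collate_errors_with_mock(self):
    # patch.object swaps the attribute for the duration of the with-block
    # and restores the original automatically, replacing the try/finally
    with mock.patch.object(log_processor.LogProcessor, 'get_object_data',
                           side_effect=log_common.BadFileDownload()):
        results = list(log_common.multiprocess_collate(
            log_processor.LogProcessor, (self.proxy_config, DumbLogger()),
            'process_one_file', [('access', 'a', 'c', 'o')], 1, DumbLogger()))
    # a failed download contributes no rows to the collated results
    self.assertEqual(results, [])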
Code example #3
File: access_log_delivery.py  Project: leoh0/slogging
    def run_once(self, *a, **kw):
        self.logger.info(_("Beginning log processing"))
        start = time.time()
        already_processed_files = \
            self.log_processor.load_already_processed_files()
        lookback_hours = kw.get('lookback_hours')
        if lookback_hours:
            self.log_processor.lookback_hours = lookback_hours
        lookback_window = kw.get('lookback_window')
        if lookback_window:
            self.log_processor.lookback_window = lookback_window
        logs_to_process = \
            self.log_processor.get_logs_to_process(already_processed_files)
        if not logs_to_process:
            self.logger.info(_("Log processing done (%0.2f minutes)") %
                        ((time.time() - start) / 60))
            return

        # map
        processor_args = (self.conf, self.logger)
        results = multiprocess_collate(AccessLogDelivery, processor_args,
                                       'process_one_file', logs_to_process,
                                       self.worker_count)

        # reduce
        processed_files = already_processed_files
        files_to_upload = set()
        for item, data in results:
            a, c, o = item
            processed_files.add(o)
            if data:
                files_to_upload.update(data)
        len_working_dir = len(self.working_dir) + 1  # +1 for the trailing '/'
        for filename in files_to_upload:
            target_name = filename[len_working_dir:]
            account, target_name = target_name.split('/', 1)
            some_id = uuid4().hex
            target_name = '%s/%s.log.gz' % (target_name, some_id)
            success = self.log_processor.internal_proxy.upload_file(filename,
                            account,
                            self.target_container,
                            target_name)
            if success:
                os.unlink(filename)
                self.logger.debug('Uploaded %s to account %s' % (filename,
                                                                 account))
            else:
                self.logger.error('Could not upload %s to account %s' % (
                                    filename, account))

        # cleanup
        success = self.log_processor.save_processed_files(processed_files)
        if not success:
            self.logger.error('Error uploading updated processed files log')
        self.logger.info(_("Log processing done (%0.2f minutes)") %
                    ((time.time() - start) / 60))
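
Inside the upload loop above, each file path under working_dir is converted into an account name plus an object name that ends in a unique .log.gz suffix before internal_proxy.upload_file is called. A standalone sketch of just that naming step (build_upload_target and the example path are hypothetical, for illustration only):

from uuid import uuid4

def build_upload_target(working_dir, filename):
    # strip '<working_dir>/' from the front, split off the account segment,
    # then append a unique suffix so repeated runs never collide
    target_name = filename[len(working_dir) + 1:]   # +1 for the trailing '/'
    account, target_name = target_name.split('/', 1)
    return account, '%s/%s.log.gz' % (target_name, uuid4().hex)

# e.g. build_upload_target('/srv/slogging/working',
#                          '/srv/slogging/working/AUTH_test/2011/01/01/00')
# might return ('AUTH_test', '2011/01/01/00/<32-char-hex>.log.gz')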