Example #1
0
 def __init__(self, options, metadata):
     super(BaseFilter, self).__init__(options, metadata)
     self.check_options()
     self.logger = FilterLogger({
         'log_level': options.get('log_level'),
         'logger_name': options.get('logger_name')
     })
     self.set_metadata('filtered_out', 0)
     self.total = 0
Example #2
0
class BaseFilter(BasePipelineItem):
    """
    This module receives a batch, filter it according to some parameters, and returns it.
    """
    log_at_every = 1000

    def __init__(self, options, metadata):
        super(BaseFilter, self).__init__(options, metadata)
        self.check_options()
        self.logger = FilterLogger({
            'log_level': options.get('log_level'),
            'logger_name': options.get('logger_name')
        })
        self.set_metadata('filtered_out', 0)
        self.total = 0

    def _log_progress(self):
        if self.total % self.log_at_every == 0:
            self.logger.info('Filtered out %d records from %d total' %
                             (self.get_metadata('filtered_out'), self.total))

    def filter_batch(self, batch):
        """
        Receives the batch, filters it, and returns it.
        """
        for item in batch:
            if self.filter(item):
                yield item
            else:
                self.set_metadata('filtered_out',
                                  self.get_metadata('filtered_out') + 1)

            self.total += 1
            self._log_progress()

    def filter(self, item):
        """
        It receives an item and returns True if the filter must be included, or False if not
        """
        raise NotImplementedError

    def set_metadata(self, key, value, module='filter'):
        super(BaseFilter, self).set_metadata(key, value, module)

    def update_metadata(self, data, module='filter'):
        super(BaseFilter, self).update_metadata(data, module)

    def get_metadata(self, key, module='filter'):
        return super(BaseFilter, self).get_metadata(key, module)

    def get_all_metadata(self, module='filter'):
        return super(BaseFilter, self).get_all_metadata(module)
Example #3
0
class BaseFilter(BasePipelineItem):
    """
    This module receives a batch, filter it according to some parameters, and returns it.
    """
    log_at_every = 1000

    def __init__(self, options, metadata):
        super(BaseFilter, self).__init__(options, metadata)
        self.check_options()
        self.logger = FilterLogger(
            {'log_level': options.get('log_level'), 'logger_name': options.get('logger_name')})
        self.set_metadata('filtered_out', 0)
        self.total = 0

    def _log_progress(self):
        if self.total % self.log_at_every == 0:
            self.logger.info('Filtered out %d records from %d total' %
                             (self.get_metadata('filtered_out'), self.total))

    def filter_batch(self, batch):
        """
        Receives the batch, filters it, and returns it.
        """
        for item in batch:
            if self.filter(item):
                yield item
            else:
                self.set_metadata('filtered_out',
                                  self.get_metadata('filtered_out') + 1)

            self.total += 1
            self._log_progress()

    def filter(self, item):
        """
        It receives an item and returns True if the filter must be included, or False if not
        """
        raise NotImplementedError

    def set_metadata(self, key, value, module='filter'):
        super(BaseFilter, self).set_metadata(key, value, module)

    def update_metadata(self, data, module='filter'):
        super(BaseFilter, self).update_metadata(data, module)

    def get_metadata(self, key, module='filter'):
        return super(BaseFilter, self).get_metadata(key, module)

    def get_all_metadata(self, module='filter'):
        return super(BaseFilter, self).get_all_metadata(module)
Example #4
0
 def __init__(self, options, metadata):
     super(BaseFilter, self).__init__(options, metadata)
     self.check_options()
     self.logger = FilterLogger(
         {'log_level': options.get('log_level'), 'logger_name': options.get('logger_name')})
     self.set_metadata('filtered_out', 0)
     self.total = 0
Example #5
0
 def __init__(self, options, metadata=None):
     super(BaseGrouper, self).__init__(options, metadata)
     self.logger = FilterLogger({
         'log_level': self.options.get('log_level'),
         'logger_name': self.options.get('logger_name')
     })