Example #1
 def execute(self):
     # We can't count items on streamed bypasses
     self.valid_total_count = False
     self.bypass_state = StreamBypassState(self.config, self.metadata)
     module_loader = ModuleLoader()
     reader = module_loader.load_reader(self.config.reader_options,
                                        self.metadata)
     writer = module_loader.load_writer(self.config.writer_options,
                                        self.metadata)
     with closing(reader), closing(writer):
         for stream in reader.get_read_streams():
             if stream not in self.bypass_state.skipped:
                 file_obj = cohere_stream(reader.open_stream(stream))
                 logging.log(
                     logging.INFO,
                     'Starting to copy file {}'.format(stream.filename))
                 try:
                     writer.write_stream(stream, file_obj)
                 finally:
                     file_obj.close()
                 logging.log(
                     logging.INFO,
                     'Finished copying file {}'.format(stream.filename))
                 self.bypass_state.commit_copied(stream)
             else:
                 logging.log(logging.INFO,
                             'Skip file {}'.format(stream.filename))
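
For orientation, a minimal sketch of the option dicts this bypass feeds to load_reader/load_writer; the reader/writer module paths and bucket options are placeholders, the 'name'/'options' layout follows the configurations used in the tests further down, and metadata stands for the ExportMeta instance built in the BaseExporter examples.

reader_options = {
    'name': 'exporters.readers.s3_reader.S3Reader',      # placeholder reader path
    'options': {'bucket': 'source-bucket', 'prefix': 'dump/'},
}
writer_options = {
    'name': 'exporters.writers.s3_writer.S3Writer',      # placeholder writer path
    'options': {'bucket': 'target-bucket', 'filebase': 'copy/'},
}
module_loader = ModuleLoader()
reader = module_loader.load_reader(reader_options, metadata)   # metadata: ExportMeta instance
writer = module_loader.load_writer(writer_options, metadata)
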
Example #2
 def meets_conditions(cls, config):
     if not config.filter_before_options['name'].endswith('NoFilter'):
         cls._log_skip_reason('custom filter configured')
         return False
     if not config.filter_after_options['name'].endswith('NoFilter'):
         cls._log_skip_reason('custom filter configured')
         return False
     if not config.transform_options['name'].endswith('NoTransform'):
         cls._log_skip_reason('custom transform configured')
         return False
     if not config.grouper_options['name'].endswith('NoGrouper'):
         cls._log_skip_reason('custom grouper configured')
         return False
     if config.writer_options.get('options', {}).get('items_limit'):
         cls._log_skip_reason('items limit configuration (items_limit)')
         return False
     if config.writer_options.get('options',
                                  {}).get('items_per_buffer_write'):
         cls._log_skip_reason(
             'buffer limit configuration (items_per_buffer_write)')
         return False
     if config.writer_options.get('options',
                                  {}).get('size_per_buffer_write'):
         cls._log_skip_reason(
             'buffer limit configuration (size_per_buffer_write)')
         return False
     write_buffer = config.writer_options['options'].get('write_buffer')
     if write_buffer and not write_buffer.endswith('base.WriteBuffer'):
         cls._log_skip_reason('custom write buffer configuration')
         return False
     module_loader = ModuleLoader()
     try:
         with closing(
                 module_loader.load_class(
                     config.reader_options['name'])) as reader:
             pass
         with closing(
                 module_loader.load_class(
                     config.writer_options['name'])) as writer:
             pass
     except:
         cls._log_skip_reason("Can't load reader and/or writer")
         return False
     if not callable(getattr(reader, 'get_read_streams', None)) or\
        not callable(getattr(reader, 'open_stream', None)):
         cls._log_skip_reason(
             "Reader doesn't support get_read_streams()/open_stream()")
         return False
     if not hasattr(writer, 'write_stream'):
         cls._log_skip_reason("Writer doesn't support write_stream()")
         return False
     return True
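
To make the checks above concrete, a rough sketch of a configuration that would pass them, using a bare namespace instead of the real ExporterConfig; the grouper and reader/writer paths are placeholders, while the NoFilter/NoTransform paths match the ones used in the tests below.

from types import SimpleNamespace

config = SimpleNamespace(
    filter_before_options={'name': 'exporters.filters.no_filter.NoFilter'},
    filter_after_options={'name': 'exporters.filters.no_filter.NoFilter'},
    transform_options={'name': 'exporters.transform.no_transform.NoTransform'},
    grouper_options={'name': 'exporters.groupers.no_grouper.NoGrouper'},            # placeholder path
    reader_options={'name': 'exporters.readers.s3_reader.S3Reader', 'options': {}},  # placeholder path
    writer_options={'name': 'exporters.writers.s3_writer.S3Writer', 'options': {}},  # placeholder path
)
# With no custom filters, transform or grouper and no buffer/items limits set,
# meets_conditions() comes down to whether the reader and writer classes load and
# expose get_read_streams()/open_stream() and write_stream() respectively.
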
Example #3
 def __init__(self, config, metadata):
     module_loader = ModuleLoader()
     self.state = module_loader.load_persistence(config.persistence_options, metadata)
     self.state_position = self.state.get_last_position()
     if not self.state_position:
         self.done = []
         self.skipped = []
         self.stats = {'bytes_copied': 0}
         self.state.commit_position(self._get_state())
     else:
         self.done = []
         self.skipped = self.state_position['done']
         self.stats = self.state_position.get('stats', {'bytes_copied': 0})
Example #4
 def __init__(self, config, metadata):
     module_loader = ModuleLoader()
     self.state = module_loader.load_persistence(config.persistence_options,
                                                 metadata)
     self.state_position = self.state.get_last_position()
     if not self.state_position:
         self.done = []
         self.skipped = []
         self.stats = {'bytes_copied': 0}
         self.state.commit_position(self._get_state())
     else:
         self.done = []
         self.skipped = self.state_position['done']
         self.stats = self.state_position.get('stats', {'bytes_copied': 0})
Example #5
class NotifiersList(object):
    """
    This class is only used to support a list of notification modules.
    """
    def __init__(self, options, metadata):
        self.options = options
        self.module_loader = ModuleLoader()
        self.notifiers = self._populate_notifiers(metadata)

    def _populate_notifiers(self, metadata):
        notifiers_list = []
        for notifier in self.options:
            notifier_object = self.module_loader.load_notifier(
                notifier, metadata)
            notifiers_list.append(notifier_object)
        return notifiers_list

    def notify_start_dump(self, receivers=None):
        if receivers is None:
            receivers = []
        for notifier in self.notifiers:
            notifier.notify_start_dump(receivers)

    def notify_complete_dump(self, receivers=None):
        if receivers is None:
            receivers = []
        for notifier in self.notifiers:
            notifier.notify_complete_dump(receivers)

    def notify_failed_job(self, msg, stack_strace, receivers=None):
        if receivers is None:
            receivers = []
        for notifier in self.notifiers:
            notifier.notify_failed_job(msg, stack_strace, receivers)
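
A short usage sketch for NotifiersList, assuming notifier entries follow the same 'name'/'options' dict convention as the other modules; the mail notifier path and address are placeholders, and metadata is again an ExportMeta instance.

notifier_options = [
    {
        'name': 'exporters.notifications.s3_mail_notifier.S3MailNotifier',   # placeholder path
        'options': {'team_mails': ['team@example.com']},
    },
]
notifiers = NotifiersList(notifier_options, metadata)
notifiers.notify_start_dump()
notifiers.notify_complete_dump()
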
Example #6
 def __init__(self, configuration):
     self.config = ExporterConfig(configuration)
     self.logger = ExportManagerLogger(self.config.log_options)
     self.module_loader = ModuleLoader()
     metadata = ExportMeta(configuration)
     self.metadata = metadata
     self.reader = self.module_loader.load_reader(
         self.config.reader_options, metadata)
     self.filter_before = self.module_loader.load_filter(
         self.config.filter_before_options, metadata)
     self.filter_after = self.module_loader.load_filter(
         self.config.filter_after_options, metadata)
     self.transform = self.module_loader.load_transform(
         self.config.transform_options, metadata)
     self.export_formatter = self.module_loader.load_formatter(
         self.config.formatter_options, metadata)
     self.writer = self.module_loader.load_writer(
         self.config.writer_options, metadata, export_formatter=self.export_formatter)
     self.persistence = self.module_loader.load_persistence(
         self.config.persistence_options, metadata)
     self.grouper = self.module_loader.load_grouper(
         self.config.grouper_options, metadata)
     self.notifiers = NotifiersList(self.config.notifiers, metadata)
     if self.config.disable_retries:
         disable_retries()
     self.logger.debug('{} has been initiated'.format(self.__class__.__name__))
     self.stats_manager = self.module_loader.load_stats_manager(
         self.config.stats_options, metadata)
     self.bypass_cases = []
Example #7
class NotifiersList(object):
    """
    This class is only used to support a list of notification modules.
    """

    def __init__(self, options, metadata):
        self.options = options
        self.module_loader = ModuleLoader()
        self.notifiers = self._populate_notifiers(metadata)

    def _populate_notifiers(self, metadata):
        notifiers_list = []
        for notifier in self.options:
            notifier_object = self.module_loader.load_notifier(notifier, metadata)
            notifiers_list.append(notifier_object)
        return notifiers_list

    def notify_start_dump(self, receivers=None):
        if receivers is None:
            receivers = []
        for notifier in self.notifiers:
            notifier.notify_start_dump(receivers)

    def notify_complete_dump(self, receivers=None):
        if receivers is None:
            receivers = []
        for notifier in self.notifiers:
            notifier.notify_complete_dump(receivers)

    def notify_failed_job(self, msg, stack_strace, receivers=None):
        if receivers is None:
            receivers = []
        for notifier in self.notifiers:
            notifier.notify_failed_job(msg, stack_strace, receivers)
Example #8
 def __init__(self, config, metadata, aws_key, aws_secret):
     self.config = config
     module_loader = ModuleLoader()
     self.state = module_loader.load_persistence(config.persistence_options, metadata)
     self.state_position = self.state.get_last_position()
     if not self.state_position:
         self.pending = S3BucketKeysFetcher(
             self.config.reader_options['options'], aws_key, aws_secret).pending_keys()
         self.done = []
         self.skipped = []
         self.stats = {'total_count': 0}
         self.state.commit_position(self._get_state())
     else:
         self.pending = self.state_position['pending']
         self.done = []
         self.skipped = self.state_position['done']
         self.keys = self.pending
         self.stats = self.state_position.get('stats', {'total_count': 0})
Example #9
 def __init__(self, config, metadata, aws_key, aws_secret):
     self.config = config
     module_loader = ModuleLoader()
     self.state = module_loader.load_persistence(config.persistence_options,
                                                 metadata)
     self.state_position = self.state.get_last_position()
     if not self.state_position:
         self.pending = S3BucketKeysFetcher(
             self.config.reader_options['options'], aws_key,
             aws_secret).pending_keys()
         self.done = []
         self.skipped = []
         self.stats = {'total_count': 0}
         self.state.commit_position(self._get_state())
     else:
         self.pending = self.state_position['pending']
         self.done = []
         self.skipped = self.state_position['done']
         self.keys = self.pending
         self.stats = self.state_position.get('stats', {'total_count': 0})
Example #10
 def execute(self):
     # We can't count items on streamed bypasses
     self.valid_total_count = False
     self.bypass_state = StreamBypassState(self.config, self.metadata)
     module_loader = ModuleLoader()
     reader = module_loader.load_reader(self.config.reader_options, self.metadata)
     writer = module_loader.load_writer(self.config.writer_options, self.metadata)
     with closing(reader), closing(writer):
         for stream in reader.get_read_streams():
             if stream not in self.bypass_state.skipped:
                 file_obj = cohere_stream(reader.open_stream(stream))
                 logging.log(logging.INFO, 'Starting to copy file {}'.format(stream.filename))
                 try:
                     writer.write_stream(stream, file_obj)
                 finally:
                     file_obj.close()
                 logging.log(logging.INFO, 'Finished copying file {}'.format(stream.filename))
                 self.bypass_state.commit_copied(stream)
             else:
                 logging.log(logging.INFO, 'Skip file {}'.format(stream.filename))
Example #11
    def _get_write_buffer(self):
        module_loader = ModuleLoader()

        write_buffer_module = self.read_option('write_buffer')
        write_buffer_class = module_loader.load_class(write_buffer_module)
        write_buffer_options = {
            'name': self.read_option('write_buffer'),
            'options': self.read_option('write_buffer_options'),
        }

        file_handler = self._items_group_files_handler(write_buffer_class,
                                                       **write_buffer_options['options'])
        kwargs = {
             'items_per_buffer_write': self.read_option('items_per_buffer_write'),
             'size_per_buffer_write': self.read_option('size_per_buffer_write'),
             'items_group_files_handler': file_handler,
             'compression_format': self.compression_format,
             'hash_algorithm': self.hash_algorithm,
        }
        return module_loader.load_write_buffer(write_buffer_options, self.metadata, **kwargs)
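
For reference, a sketch of the writer options this method reads, assuming read_option() resolves against the writer's 'options' block; the key names come from the calls above, the writer path is taken from the tests, the write buffer path and the limit values are assumptions (the bypass check above only requires the buffer path to end in base.WriteBuffer).

writer_options = {
    'name': 'exporters.writers.console_writer.ConsoleWriter',
    'options': {
        'write_buffer': 'exporters.write_buffers.base.WriteBuffer',   # assumed default path
        'write_buffer_options': {},
        'items_per_buffer_write': 1000,        # placeholder limit
        'size_per_buffer_write': 100000000,    # placeholder limit, in bytes
    },
}
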
Example #12
 def __init__(self, configuration):
     self.config = ExporterConfig(configuration)
     self.threaded = self.config.exporter_options.get('threaded', False)
     self.logger = ExportManagerLogger(self.config.log_options)
     self.module_loader = ModuleLoader()
     metadata = ExportMeta(configuration)
     self.metadata = metadata
     self.reader = self.module_loader.load_reader(
         self.config.reader_options, metadata)
     if is_stream_reader(self.reader):
         deserializer = self.module_loader.load_deserializer(
             self.config.deserializer_options, metadata)
         decompressor = self.module_loader.load_decompressor(
             self.config.decompressor_options, metadata)
         self.reader.deserializer = deserializer
         self.reader.decompressor = decompressor
     self.filter_before = self.module_loader.load_filter(
         self.config.filter_before_options, metadata)
     self.filter_after = self.module_loader.load_filter(
         self.config.filter_after_options, metadata)
     self.transform = self.module_loader.load_transform(
         self.config.transform_options, metadata)
     self.export_formatter = self.module_loader.load_formatter(
         self.config.formatter_options, metadata)
     self.writer = self.module_loader.load_writer(
         self.config.writer_options,
         metadata,
         export_formatter=self.export_formatter)
     self.persistence = self.module_loader.load_persistence(
         self.config.persistence_options, metadata)
     self.grouper = self.module_loader.load_grouper(
         self.config.grouper_options, metadata)
     self.notifiers = NotifiersList(self.config.notifiers, metadata)
     if self.config.disable_retries:
         disable_retries()
     self.logger.debug('{} has been initiated'.format(
         self.__class__.__name__))
     self.stats_manager = self.module_loader.load_stats_manager(
         self.config.stats_options, metadata)
     self.bypass_cases = []
Example #13
 def meets_conditions(cls, config):
     if not config.filter_before_options['name'].endswith('NoFilter'):
         cls._log_skip_reason('custom filter configured')
         return False
     if not config.filter_after_options['name'].endswith('NoFilter'):
         cls._log_skip_reason('custom filter configured')
         return False
     if not config.transform_options['name'].endswith('NoTransform'):
         cls._log_skip_reason('custom transform configured')
         return False
     if not config.grouper_options['name'].endswith('NoGrouper'):
         cls._log_skip_reason('custom grouper configured')
         return False
     if config.writer_options.get('options', {}).get('items_limit'):
         cls._log_skip_reason('items limit configuration (items_limit)')
         return False
     if config.writer_options.get('options', {}).get('items_per_buffer_write'):
         cls._log_skip_reason('buffer limit configuration (items_per_buffer_write)')
         return False
     if config.writer_options.get('options', {}).get('size_per_buffer_write'):
         cls._log_skip_reason('buffer limit configuration (size_per_buffer_write)')
         return False
     module_loader = ModuleLoader()
     try:
         with closing(module_loader.load_class(config.reader_options['name'])) as reader:
             pass
         with closing(module_loader.load_class(config.writer_options['name'])) as writer:
             pass
     except:
         cls._log_skip_reason("Can't load reader and/or writer")
         return False
     if not callable(getattr(reader, 'get_read_streams', None)) or\
        not callable(getattr(reader, 'open_stream', None)):
         cls._log_skip_reason("Reader doesn't support get_read_streams()/open_stream()")
         return False
     if not hasattr(writer, 'write_stream'):
         cls._log_skip_reason("Writer doesn't support write_stream()")
         return False
     return True
Example #14
 def setUp(self):
     self.module_loader = ModuleLoader()
Example #15
 def __init__(self, options, metadata):
     self.options = options
     self.module_loader = ModuleLoader()
     self.notifiers = self._populate_notifiers(metadata)
Example #16
 def __init__(self, options, metadata):
     self.options = options
     self.module_loader = ModuleLoader()
     self.notifiers = self._populate_notifiers(metadata)
Example #17
 def test_transform_batch(self):
     reader = ModuleLoader().load_reader(self.options['reader'], meta())
     # FIXME inline batch, without a reader
     batch = reader.get_next_batch()
     self.assertEquals(self.transform.transform_batch(batch), batch)
Example #18
class ModuleLoaderTest(unittest.TestCase):
    def setUp(self):
        self.module_loader = ModuleLoader()

    def test_reader_valid_class(self):
        options = valid_config_with_updates({
            'exporter_options': {
                'LOG_LEVEL': 'DEBUG',
                'LOGGER_NAME': 'export-pipeline'
            },
            'reader': {
                'name': 'exporters.transform.no_transform.NoTransform',
                'options': {}
            }
        })
        with self.assertRaises(TypeError):
            o = ExporterConfig(options)
            self.module_loader.load_reader(o.reader_options)

    def test_writer_valid_class(self):
        options = valid_config_with_updates({
            'exporter_options': {
                'LOG_LEVEL': 'DEBUG',
                'LOGGER_NAME': 'export-pipeline'
            },
            'writer': {
                'name': 'exporters.readers.random_reader.RandomReader',
                'options': {
                    'number_of_items': 1000,
                    'batch_size': 100
                }
            },
        })
        with self.assertRaises(TypeError):
            o = ExporterConfig(options)
            self.module_loader.load_writer(o.writer_options)

    def test_persistence_valid_class(self):
        options = valid_config_with_updates({
            'exporter_options': {
                'LOG_LEVEL': 'DEBUG',
                'LOGGER_NAME': 'export-pipeline'
            },
            'persistence': {
                'name': 'exporters.transform.no_transform.NoTransform',
                'options': {}
            }
        })
        o = ExporterConfig(options)
        with self.assertRaises(TypeError):
            self.module_loader.load_persistence(o.persistence_options)

    def test_formatter_valid_class(self):
        options = valid_config_with_updates({
            'exporter_options': {
                'LOG_LEVEL': 'DEBUG',
                'LOGGER_NAME': 'export-pipeline',
                "EXPORTER": 'exporters.writers.console_writer.ConsoleWriter',
            },
            'formatter': {
                'name': 'exporters.transform.no_transform.NoTransform',
                'options': {}
            },
        })
        with self.assertRaises(TypeError):
            o = ExporterConfig(options)
            self.module_loader.load_formatter(o.reader_options)

    def test_notifier_valid_class(self):
        options = valid_config_with_updates({
            'exporter_options': {
                'LOG_LEVEL': 'DEBUG',
                'LOGGER_NAME': 'export-pipeline'
            },
            'notifier': {
                'name': 'exporters.transform.no_transform.NoTransform',
                'options': {}
            }
        })
        with self.assertRaises(TypeError):
            o = ExporterConfig(options)
            self.module_loader.load_notifier(o.notifiers)

    def test_grouper_valid_class(self):
        options = valid_config_with_updates({
            'exporter_options': {
                'LOG_LEVEL': 'DEBUG',
                'LOGGER_NAME': 'export-pipeline'
            },
            'grouper': {
                'name': 'exporters.transform.no_transform.NoTransform',
                'options': {}
            }
        })
        with self.assertRaises(TypeError):
            o = ExporterConfig(options)
            self.module_loader.load_grouper(o.grouper_options)

    def test_filter_valid_class(self):
        options = valid_config_with_updates({
            'exporter_options': {
                'LOG_LEVEL': 'DEBUG',
                'LOGGER_NAME': 'export-pipeline'
            },
            'filter': {
                'name': 'exporters.transform.no_transform.NoTransform',
                'options': {}
            },
        })
        with self.assertRaises(TypeError):
            o = ExporterConfig(options)
            self.module_loader.load_filter(o.filter_before_options)

    def test_transform_valid_class(self):
        options = valid_config_with_updates({
            'exporter_options': {
                'LOG_LEVEL': 'DEBUG',
                'LOGGER_NAME': 'export-pipeline'
            },
            'transform': {
                'name': 'exporters.filters.no_filter.NoFilter',
                'options': {}
            }
        })
        with self.assertRaises(TypeError):
            o = ExporterConfig(options)
            self.module_loader.load_transform(o.transform_options)

    def test_load_grouper(self):
        grouper = {
            'name': 'exporters.groupers.file_key_grouper.FileKeyGrouper',
            'options': {
                'keys': ['country_code', 'state', 'city']
            }
        }
        self.assertIsInstance(self.module_loader.load_grouper(grouper, None),
                              BaseGrouper)
Example #19
 def setUp(self):
     self.module_loader = ModuleLoader()
Example #20
class ModuleLoaderTest(unittest.TestCase):
    def setUp(self):
        self.module_loader = ModuleLoader()

    def test_reader_valid_class(self):
        options = valid_config_with_updates({
            'exporter_options': {
                'LOG_LEVEL': 'DEBUG',
                'LOGGER_NAME': 'export-pipeline'
            },
            'reader': {
                'name': 'exporters.transform.no_transform.NoTransform',
                'options': {
                }
            }
        })
        with self.assertRaises(TypeError):
            o = ExporterConfig(options)
            self.module_loader.load_reader(o.reader_options)

    def test_writer_valid_class(self):
        options = valid_config_with_updates({
            'exporter_options': {
                'LOG_LEVEL': 'DEBUG',
                'LOGGER_NAME': 'export-pipeline'
            },
            'writer': {
                'name': 'exporters.readers.random_reader.RandomReader',
                'options': {
                    'number_of_items': 1000,
                    'batch_size': 100
                }
            },
        })
        with self.assertRaises(TypeError):
            o = ExporterConfig(options)
            self.module_loader.load_writer(o.writer_options)

    def test_persistence_valid_class(self):
        options = valid_config_with_updates({
            'exporter_options': {
                'LOG_LEVEL': 'DEBUG',
                'LOGGER_NAME': 'export-pipeline'
            },
            'persistence': {
                'name': 'exporters.transform.no_transform.NoTransform',
                'options': {
                }
            }
        })
        o = ExporterConfig(options)
        with self.assertRaises(TypeError):
            self.module_loader.load_persistence(o.persistence_options)

    def test_formatter_valid_class(self):
        options = valid_config_with_updates({
            'exporter_options': {
                'LOG_LEVEL': 'DEBUG',
                'LOGGER_NAME': 'export-pipeline',
                "EXPORTER": 'exporters.writers.console_writer.ConsoleWriter',
            },
            'formatter': {
                'name': 'exporters.transform.no_transform.NoTransform',
                'options': {
                }
            },
        })
        with self.assertRaises(TypeError):
            o = ExporterConfig(options)
            self.module_loader.load_formatter(o.reader_options)

    def test_notifier_valid_class(self):
        options = valid_config_with_updates({
            'exporter_options': {
                'LOG_LEVEL': 'DEBUG',
                'LOGGER_NAME': 'export-pipeline'
            },
            'notifier': {
                'name': 'exporters.transform.no_transform.NoTransform',
                'options': {
                }
            }
        })
        with self.assertRaises(TypeError):
            o = ExporterConfig(options)
            self.module_loader.load_notifier(o.notifiers)

    def test_grouper_valid_class(self):
        options = valid_config_with_updates({
            'exporter_options': {
                'LOG_LEVEL': 'DEBUG',
                'LOGGER_NAME': 'export-pipeline'
            },
            'grouper': {
                'name': 'exporters.transform.no_transform.NoTransform',
                'options': {
                }
            }
        })
        with self.assertRaises(TypeError):
            o = ExporterConfig(options)
            self.module_loader.load_grouper(o.grouper_options)

    def test_filter_valid_class(self):
        options = valid_config_with_updates({
            'exporter_options': {
                'LOG_LEVEL': 'DEBUG',
                'LOGGER_NAME': 'export-pipeline'
            },
            'filter': {
                'name': 'exporters.transform.no_transform.NoTransform',
                'options': {
                }
            },
        })
        with self.assertRaises(TypeError):
            o = ExporterConfig(options)
            self.module_loader.load_filter(o.filter_before_options)

    def test_transform_valid_class(self):
        options = valid_config_with_updates({
            'exporter_options': {
                'LOG_LEVEL': 'DEBUG',
                'LOGGER_NAME': 'export-pipeline'
            },
            'transform': {
                'name': 'exporters.filters.no_filter.NoFilter',
                'options': {
                }
            }
        })
        with self.assertRaises(TypeError):
            o = ExporterConfig(options)
            self.module_loader.load_transform(o.transform_options)

    def test_load_grouper(self):
        grouper = {
            'name': 'exporters.groupers.file_key_grouper.FileKeyGrouper',
            'options': {
                    'keys': ['country_code', 'state', 'city']
            }
        }
        self.assertIsInstance(self.module_loader.load_grouper(grouper, None),
                              BaseGrouper)
Example #21
class BaseExporter(object):
    def __init__(self, configuration):
        self.config = ExporterConfig(configuration)
        self.logger = ExportManagerLogger(self.config.log_options)
        self.module_loader = ModuleLoader()
        metadata = ExportMeta(configuration)
        self.metadata = metadata
        self.reader = self.module_loader.load_reader(
            self.config.reader_options, metadata)
        if is_stream_reader(self.reader):
            deserializer = self.module_loader.load_deserializer(
                self.config.deserializer_options, metadata)
            decompressor = self.module_loader.load_decompressor(
                self.config.decompressor_options, metadata)
            self.reader.deserializer = deserializer
            self.reader.decompressor = decompressor
        self.filter_before = self.module_loader.load_filter(
            self.config.filter_before_options, metadata)
        self.filter_after = self.module_loader.load_filter(
            self.config.filter_after_options, metadata)
        self.transform = self.module_loader.load_transform(
            self.config.transform_options, metadata)
        self.export_formatter = self.module_loader.load_formatter(
            self.config.formatter_options, metadata)
        self.writer = self.module_loader.load_writer(
            self.config.writer_options, metadata, export_formatter=self.export_formatter)
        self.persistence = self.module_loader.load_persistence(
            self.config.persistence_options, metadata)
        self.grouper = self.module_loader.load_grouper(
            self.config.grouper_options, metadata)
        self.notifiers = NotifiersList(self.config.notifiers, metadata)
        if self.config.disable_retries:
            disable_retries()
        self.logger.debug('{} has been initiated'.format(self.__class__.__name__))
        self.stats_manager = self.module_loader.load_stats_manager(
            self.config.stats_options, metadata)
        self.bypass_cases = []

    def _run_pipeline_iteration(self):
        times = OrderedDict([('started', datetime.datetime.now())])
        self.logger.debug('Getting new batch')
        if self.config.exporter_options.get('forced_reads'):
            next_batch = list(self.reader.get_next_batch())
        else:
            next_batch = self.reader.get_next_batch()
        times.update(read=datetime.datetime.now())
        next_batch = self.filter_before.filter_batch(next_batch)
        times.update(filtered=datetime.datetime.now())
        next_batch = self.transform.transform_batch(next_batch)
        times.update(transformed=datetime.datetime.now())
        next_batch = self.filter_after.filter_batch(next_batch)
        times.update(filtered_after=datetime.datetime.now())
        next_batch = self.grouper.group_batch(next_batch)
        times.update(grouped=datetime.datetime.now())
        try:
            self.writer.write_batch(batch=next_batch)
            times.update(written=datetime.datetime.now())
            last_position = self._get_last_position()
            self.persistence.commit_position(last_position)
            times.update(persisted=datetime.datetime.now())
        except ItemsLimitReached:
            # we have written some amount of records up to the limit
            times.update(written=datetime.datetime.now())
            self._iteration_stats_report(times)
            raise
        else:
            self._iteration_stats_report(times)

    def _get_last_position(self):
        last_position = self.reader.get_last_position()
        last_position['writer_metadata'] = self.writer.get_all_metadata()
        return last_position

    def _init_export_job(self):
        self.notifiers.notify_start_dump(receivers=[CLIENTS, TEAM])
        last_position = self.persistence.get_last_position()
        if last_position is not None:
            self.writer.update_metadata(last_position.get('writer_metadata'))
            self.metadata.accurate_items_count = last_position.get('accurate_items_count', False)
        self.reader.set_last_position(last_position)

    def _clean_export_job(self):
        try:
            self.reader.close()
        except:
            raise
        finally:
            self.writer.close()

    def _finish_export_job(self):
        self.writer.finish_writing()
        self.metadata.end_time = datetime.datetime.now()

    def bypass_exporter(self, bypass_class):
        self.logger.info('Executing bypass {}.'.format(bypass_class.__name__))
        self.notifiers.notify_start_dump(receivers=[CLIENTS, TEAM])
        if not self.config.exporter_options.get('resume'):
            self.persistence.close()
            self.persistence.delete()
        with closing(bypass_class(self.config, self.metadata)) as bypass:
            bypass.execute()
        if not bypass.valid_total_count:
            self.metadata.accurate_items_count = False
            self.logger.warning('No accurate items count info can be retrieved')
        self.writer.set_metadata(
            'items_count', self.writer.get_metadata('items_count') + bypass.total_items)
        self.logger.info(
            'Finished executing bypass {}.'.format(bypass_class.__name__))
        self.notifiers.notify_complete_dump(receivers=[CLIENTS, TEAM])

    def bypass(self):
        if self.config.prevent_bypass:
            return False
        for bypass_class in self.bypass_cases:
            if bypass_class.meets_conditions(self.config):
                try:
                    self.bypass_exporter(bypass_class)
                    return True
                finally:
                    self._clean_export_job()
        return False

    def _handle_export_exception(self, exception):
        self.logger.error(traceback.format_exc(exception))
        self.logger.error(str(exception))
        self.notifiers.notify_failed_job(
            str(exception), str(traceback.format_exc(exception)), receivers=[TEAM])

    def _iteration_stats_report(self, times):
        try:
            self.stats_manager.iteration_report(times)
        except Exception as e:
            import traceback
            traceback.print_exc()
            self.logger.error('Error making stats report: {}'.format(str(e)))

    def _final_stats_report(self):
        try:
            self.stats_manager.final_report()
        except Exception as e:
            self.logger.error('Error making final stats report: {}'.format(str(e)))

    def _run_pipeline(self):
        while not self.reader.is_finished():
            try:
                self._run_pipeline_iteration()
            except ItemsLimitReached as e:
                self.logger.info('{!r}'.format(e))
                break
        self.writer.flush()

    def export(self):
        if not self.bypass():
            try:
                self._init_export_job()
                self._run_pipeline()
                self._finish_export_job()
                self._final_stats_report()
                self.persistence.close()
                self.notifiers.notify_complete_dump(receivers=[CLIENTS, TEAM])
            except Exception as e:
                self._handle_export_exception(e)
                raise e
            finally:
                self._clean_export_job()
        else:
            self.metadata.bypassed_pipeline = True
Example #22
class BaseExporter(object):
    def __init__(self, configuration):
        self.config = ExporterConfig(configuration)
        self.threaded = self.config.exporter_options.get('threaded', False)
        self.queue_size = self.config.exporter_options.get('thread_queue_size', 100)
        self.logger = ExportManagerLogger(self.config.log_options)
        self.module_loader = ModuleLoader()
        metadata = ExportMeta(configuration)
        self.metadata = metadata
        self.reader = self.module_loader.load_reader(
            self.config.reader_options, metadata)
        if is_stream_reader(self.reader):
            deserializer = self.module_loader.load_deserializer(
                self.config.deserializer_options, metadata)
            decompressor = self.module_loader.load_decompressor(
                self.config.decompressor_options, metadata)
            self.reader.deserializer = deserializer
            self.reader.decompressor = decompressor
        self.filter_before = self.module_loader.load_filter(
            self.config.filter_before_options, metadata)
        self.filter_after = self.module_loader.load_filter(
            self.config.filter_after_options, metadata)
        self.transform = self.module_loader.load_transform(
            self.config.transform_options, metadata)
        self.export_formatter = self.module_loader.load_formatter(
            self.config.formatter_options, metadata)
        self.writer = self.module_loader.load_writer(
            self.config.writer_options, metadata, export_formatter=self.export_formatter)
        self.persistence = self.module_loader.load_persistence(
            self.config.persistence_options, metadata)
        self.grouper = self.module_loader.load_grouper(
            self.config.grouper_options, metadata)
        self.notifiers = NotifiersList(self.config.notifiers, metadata)
        if self.config.disable_retries:
            disable_retries()
        self.logger.debug('{} has been initiated'.format(self.__class__.__name__))
        self.stats_manager = self.module_loader.load_stats_manager(
            self.config.stats_options, metadata)
        self.bypass_cases = []

    def _run_pipeline_iteration(self):
        times = OrderedDict([('started', datetime.datetime.now())])
        self.logger.debug('Getting new batch')
        if self.config.exporter_options.get('forced_reads'):
            next_batch = list(self.reader.get_next_batch())
        else:
            next_batch = self.reader.get_next_batch()
        times.update(read=datetime.datetime.now())
        next_batch = self.filter_before.filter_batch(next_batch)
        times.update(filtered=datetime.datetime.now())
        next_batch = self.transform.transform_batch(next_batch)
        times.update(transformed=datetime.datetime.now())
        next_batch = self.filter_after.filter_batch(next_batch)
        times.update(filtered_after=datetime.datetime.now())
        next_batch = self.grouper.group_batch(next_batch)
        times.update(grouped=datetime.datetime.now())
        try:
            self.writer.write_batch(batch=next_batch)
            times.update(written=datetime.datetime.now())
            last_position = self._get_last_position()
            self.persistence.commit_position(last_position)
            times.update(persisted=datetime.datetime.now())
        except ItemsLimitReached:
            # we have written some amount of records up to the limit
            times.update(written=datetime.datetime.now())
            self._iteration_stats_report(times)
            raise
        else:
            self._iteration_stats_report(times)

    def _get_last_position(self):
        last_position = self.reader.get_last_position()
        last_position['writer_metadata'] = self.writer.get_all_metadata()
        return last_position

    def _init_export_job(self):
        self.notifiers.notify_start_dump(receivers=[CLIENTS, TEAM])
        last_position = self.persistence.get_last_position()
        if last_position is not None:
            self.writer.update_metadata(last_position.get('writer_metadata'))
            self.metadata.accurate_items_count = last_position.get('accurate_items_count', False)
        self.reader.set_last_position(last_position)

    def _clean_export_job(self):
        try:
            self.reader.close()
        except:
            raise
        finally:
            self.writer.close()

    def _finish_export_job(self):
        self.writer.finish_writing()
        self.metadata.end_time = datetime.datetime.now()

    def bypass_exporter(self, bypass_class):
        self.logger.info('Executing bypass {}.'.format(bypass_class.__name__))
        self.notifiers.notify_start_dump(receivers=[CLIENTS, TEAM])
        if not self.config.exporter_options.get('resume'):
            self.persistence.close()
            self.persistence.delete()
        with closing(bypass_class(self.config, self.metadata)) as bypass:
            bypass.execute()
        if not bypass.valid_total_count:
            self.metadata.accurate_items_count = False
            self.logger.warning('No accurate items count info can be retrieved')
        self.writer.set_metadata(
            'items_count', self.writer.get_metadata('items_count') + bypass.total_items)
        self.logger.info(
            'Finished executing bypass {}.'.format(bypass_class.__name__))
        self._final_stats_report()
        self.notifiers.notify_complete_dump(receivers=[CLIENTS, TEAM])

    def bypass(self):
        if self.config.prevent_bypass:
            return False
        for bypass_class in self.bypass_cases:
            if bypass_class.meets_conditions(self.config):
                try:
                    self.bypass_exporter(bypass_class)
                    return True
                finally:
                    self._clean_export_job()
        return False

    def _handle_export_exception(self, exception):
        self.logger.error(traceback.format_exc(exception))
        self.logger.error(str(exception))
        self.notifiers.notify_failed_job(
            str(exception), str(traceback.format_exc(exception)), receivers=[TEAM])

    def _iteration_stats_report(self, times):
        try:
            self.stats_manager.iteration_report(times)
        except Exception as e:
            import traceback
            traceback.print_exc()
            self.logger.error('Error making stats report: {}'.format(str(e)))

    def _final_stats_report(self):
        try:
            self.stats_manager.final_report()
        except Exception as e:
            self.logger.error('Error making final stats report: {}'.format(str(e)))

    def _run_pipeline(self):
        while not self.reader.is_finished():
            try:
                self._run_pipeline_iteration()
            except ItemsLimitReached as e:
                self.logger.info('{!r}'.format(e))
                break
        self.writer.flush()

    def _reader_thread(self):
        self.logger.info('Starting reader thread')
        while not self.reader.is_finished():
            self.process_queue.put(list(self.reader.get_next_batch()))
            qsize = self.process_queue.qsize()
            if qsize > 0.5*self.queue_size:
                # Queues are getting full, throttle the reader so the processor/writer can keep up
                time.sleep((qsize*10.0 / self.queue_size) - 5)
        self.reader_finished = True

    def _process_thread(self):
        self.logger.info('Starting processing thread')
        while not self.reader_finished or not self.process_queue.empty():
            next_batch = self.process_queue.get()
            next_batch = self.filter_before.filter_batch(next_batch)
            next_batch = self.transform.transform_batch(next_batch)
            next_batch = self.filter_after.filter_batch(next_batch)
            next_batch = self.grouper.group_batch(next_batch)
            self.writer_queue.put(next_batch)
        self.process_finished = True

    def _writer_thread(self):
        self.logger.info('Starting writer thread')
        while not self.process_finished or not self.writer_queue.empty():
            batch = self.writer_queue.get()
            self.writer.write_batch(batch=batch)
        self.writer.finish_writing()
        self.writer.flush()

    def _run_threads(self):
        self.reader_finished = False
        self.process_finished = False
        self.process_queue = Queue(self.queue_size)
        self.writer_queue = Queue(self.queue_size)
        reader_thread = Thread(target=self._reader_thread)
        process_thread = Thread(target=self._process_thread)
        writer_thread = Thread(target=self._writer_thread)
        reader_thread.start()
        process_thread.start()
        writer_thread.start()
        reader_thread.join()
        process_thread.join()
        writer_thread.join()

    def export(self):
        if not self.bypass():
            try:
                self._init_export_job()
                if self.threaded:
                    self._run_threads()
                else:
                    self._run_pipeline()
                    self._finish_export_job()
                self._final_stats_report()
                self.persistence.close()
                self.notifiers.notify_complete_dump(receivers=[CLIENTS, TEAM])
            except Exception as e:
                self._handle_export_exception(e)
                raise
            finally:
                self._clean_export_job()
        else:
            self.metadata.bypassed_pipeline = True
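
The threaded variant is switched on through exporter_options; a minimal configuration fragment, assuming the two flags read in __init__ above are set alongside the usual logging options (the reader, writer and remaining sections are elided).

configuration = {
    'exporter_options': {
        'LOG_LEVEL': 'DEBUG',
        'LOGGER_NAME': 'export-pipeline',
        'threaded': True,          # export() then runs _run_threads() instead of _run_pipeline()
        'thread_queue_size': 200,  # bound for the reader->processor and processor->writer queues (default 100)
    },
    # 'reader': {...}, 'writer': {...} and the remaining sections omitted
}
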