def execute(self):
    # We can't count items on streamed bypasses
    self.valid_total_count = False
    self.bypass_state = StreamBypassState(self.config, self.metadata)
    module_loader = ModuleLoader()
    reader = module_loader.load_reader(self.config.reader_options, self.metadata)
    writer = module_loader.load_writer(self.config.writer_options, self.metadata)
    with closing(reader), closing(writer):
        for stream in reader.get_read_streams():
            if stream not in self.bypass_state.skipped:
                file_obj = cohere_stream(reader.open_stream(stream))
                logging.log(logging.INFO,
                            'Starting to copy file {}'.format(stream.filename))
                try:
                    writer.write_stream(stream, file_obj)
                finally:
                    file_obj.close()
                logging.log(logging.INFO,
                            'Finished copying file {}'.format(stream.filename))
                self.bypass_state.commit_copied(stream)
            else:
                logging.log(logging.INFO, 'Skip file {}'.format(stream.filename))
def meets_conditions(cls, config):
    if not config.filter_before_options['name'].endswith('NoFilter'):
        cls._log_skip_reason('custom filter configured')
        return False
    if not config.filter_after_options['name'].endswith('NoFilter'):
        cls._log_skip_reason('custom filter configured')
        return False
    if not config.transform_options['name'].endswith('NoTransform'):
        cls._log_skip_reason('custom transform configured')
        return False
    if not config.grouper_options['name'].endswith('NoGrouper'):
        cls._log_skip_reason('custom grouper configured')
        return False
    if config.writer_options.get('options', {}).get('items_limit'):
        cls._log_skip_reason('items limit configuration (items_limit)')
        return False
    if config.writer_options.get('options', {}).get('items_per_buffer_write'):
        cls._log_skip_reason('buffer limit configuration (items_per_buffer_write)')
        return False
    if config.writer_options.get('options', {}).get('size_per_buffer_write'):
        cls._log_skip_reason('buffer limit configuration (size_per_buffer_write)')
        return False
    write_buffer = config.writer_options['options'].get('write_buffer')
    if write_buffer and not write_buffer.endswith('base.WriteBuffer'):
        cls._log_skip_reason('custom write buffer configuration')
        return False
    module_loader = ModuleLoader()
    try:
        with closing(module_loader.load_class(config.reader_options['name'])) as reader:
            pass
        with closing(module_loader.load_class(config.writer_options['name'])) as writer:
            pass
    except:
        cls._log_skip_reason("Can't load reader and/or writer")
        return False
    if not callable(getattr(reader, 'get_read_streams', None)) or \
            not callable(getattr(reader, 'open_stream', None)):
        cls._log_skip_reason(
            "Reader doesn't support get_read_streams()/open_stream()")
        return False
    if not hasattr(writer, 'write_stream'):
        cls._log_skip_reason("Writer doesn't support write_stream()")
        return False
    return True
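For reference, a minimal sketch of a configuration that would get past these checks, assuming a reader and writer that expose the stream interface; the module paths and option names below are illustrative, not taken from the snippet above.

# Illustrative only: a configuration expected to satisfy meets_conditions().
# The reader/writer module paths are assumptions; any classes providing
# get_read_streams()/open_stream() and write_stream() would qualify.
bypassable_config = {
    'reader': {
        'name': 'exporters.readers.s3_reader.S3Reader',   # assumed stream-capable reader
        'options': {'bucket': 'source-bucket', 'prefix': 'export/'},
    },
    'writer': {
        'name': 'exporters.writers.s3_writer.S3Writer',   # assumed stream-capable writer
        # no items_limit / items_per_buffer_write / size_per_buffer_write,
        # so the limit checks above do not reject the bypass
        'options': {'bucket': 'target-bucket', 'filebase': 'export/'},
    },
    # filter, transform and grouper are left at their defaults
    # (NoFilter / NoTransform / NoGrouper), as the first four checks require
}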
def __init__(self, config, metadata):
    module_loader = ModuleLoader()
    self.state = module_loader.load_persistence(config.persistence_options, metadata)
    self.state_position = self.state.get_last_position()
    if not self.state_position:
        self.done = []
        self.skipped = []
        self.stats = {'bytes_copied': 0}
        self.state.commit_position(self._get_state())
    else:
        self.done = []
        self.skipped = self.state_position['done']
        self.stats = self.state_position.get('stats', {'bytes_copied': 0})
class NotifiersList(object):
    """
    This class is only used to support a list of notification modules.
    """

    def __init__(self, options, metadata):
        self.options = options
        self.module_loader = ModuleLoader()
        self.notifiers = self._populate_notifiers(metadata)

    def _populate_notifiers(self, metadata):
        notifiers_list = []
        for notifier in self.options:
            notifier_object = self.module_loader.load_notifier(notifier, metadata)
            notifiers_list.append(notifier_object)
        return notifiers_list

    def notify_start_dump(self, receivers=None):
        if receivers is None:
            receivers = []
        for notifier in self.notifiers:
            notifier.notify_start_dump(receivers)

    def notify_complete_dump(self, receivers=None):
        if receivers is None:
            receivers = []
        for notifier in self.notifiers:
            notifier.notify_complete_dump(receivers)

    def notify_failed_job(self, msg, stack_strace, receivers=None):
        if receivers is None:
            receivers = []
        for notifier in self.notifiers:
            notifier.notify_failed_job(msg, stack_strace, receivers)
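A minimal usage sketch, assuming `metadata` is the ExportMeta object built by the exporter; the notifier module path and receiver address are hypothetical examples, not part of the snippet above.

# Illustrative usage; the notifier path below is hypothetical.
notifiers_config = [
    {'name': 'exporters.notifications.webhook_notifier.WebhookNotifier',  # hypothetical module
     'options': {'url': 'https://example.com/hook'}},
]
notifiers = NotifiersList(notifiers_config, metadata)
notifiers.notify_start_dump(receivers=['client@example.com'])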
def __init__(self, configuration):
    self.config = ExporterConfig(configuration)
    self.logger = ExportManagerLogger(self.config.log_options)
    self.module_loader = ModuleLoader()
    metadata = ExportMeta(configuration)
    self.metadata = metadata
    self.reader = self.module_loader.load_reader(
        self.config.reader_options, metadata)
    self.filter_before = self.module_loader.load_filter(
        self.config.filter_before_options, metadata)
    self.filter_after = self.module_loader.load_filter(
        self.config.filter_after_options, metadata)
    self.transform = self.module_loader.load_transform(
        self.config.transform_options, metadata)
    self.export_formatter = self.module_loader.load_formatter(
        self.config.formatter_options, metadata)
    self.writer = self.module_loader.load_writer(
        self.config.writer_options, metadata,
        export_formatter=self.export_formatter)
    self.persistence = self.module_loader.load_persistence(
        self.config.persistence_options, metadata)
    self.grouper = self.module_loader.load_grouper(
        self.config.grouper_options, metadata)
    self.notifiers = NotifiersList(self.config.notifiers, metadata)
    if self.config.disable_retries:
        disable_retries()
    self.logger.debug('{} has been initiated'.format(self.__class__.__name__))
    self.stats_manager = self.module_loader.load_stats_manager(
        self.config.stats_options, metadata)
    self.bypass_cases = []
def __init__(self, config, metadata, aws_key, aws_secret):
    self.config = config
    module_loader = ModuleLoader()
    self.state = module_loader.load_persistence(config.persistence_options, metadata)
    self.state_position = self.state.get_last_position()
    if not self.state_position:
        self.pending = S3BucketKeysFetcher(
            self.config.reader_options['options'],
            aws_key, aws_secret).pending_keys()
        self.done = []
        self.skipped = []
        self.stats = {'total_count': 0}
        self.state.commit_position(self._get_state())
    else:
        self.pending = self.state_position['pending']
        self.done = []
        self.skipped = self.state_position['done']
        self.keys = self.pending
        self.stats = self.state_position.get('stats', {'total_count': 0})
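For orientation, a sketch of the persisted position this constructor reads back on resume; only the keys accessed in the else branch above can be inferred from the snippet, the sample values are made up and _get_state() itself is not shown here.

# Illustrative shape of a resumed state position (values are invented).
state_position = {
    'pending': ['dataset/part-000.jl.gz', 'dataset/part-001.jl.gz'],  # keys still to copy
    'done': ['dataset/part-002.jl.gz'],   # becomes self.skipped on resume
    'stats': {'total_count': 12345},
}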
def _get_write_buffer(self):
    module_loader = ModuleLoader()
    write_buffer_module = self.read_option('write_buffer')
    write_buffer_class = module_loader.load_class(write_buffer_module)
    write_buffer_options = {
        'name': self.read_option('write_buffer'),
        'options': self.read_option('write_buffer_options'),
    }
    file_handler = self._items_group_files_handler(
        write_buffer_class, **write_buffer_options['options'])
    kwargs = {
        'items_per_buffer_write': self.read_option('items_per_buffer_write'),
        'size_per_buffer_write': self.read_option('size_per_buffer_write'),
        'items_group_files_handler': file_handler,
        'compression_format': self.compression_format,
        'hash_algorithm': self.hash_algorithm,
    }
    return module_loader.load_write_buffer(write_buffer_options, self.metadata, **kwargs)
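A sketch of the writer options this method consumes, under the assumption that the default write buffer lives at the path implied by the 'base.WriteBuffer' check in meets_conditions(); the concrete values are illustrative.

# Illustrative only: options read via read_option() in _get_write_buffer().
writer_options = {
    'write_buffer': 'exporters.write_buffers.base.WriteBuffer',  # assumed default path
    'write_buffer_options': {},            # forwarded to the files handler
    'items_per_buffer_write': 500000,      # flush after this many items
    'size_per_buffer_write': 4 * 10 ** 9,  # or after roughly this many bytes
}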
def __init__(self, configuration):
    self.config = ExporterConfig(configuration)
    self.threaded = self.config.exporter_options.get('threaded', False)
    self.logger = ExportManagerLogger(self.config.log_options)
    self.module_loader = ModuleLoader()
    metadata = ExportMeta(configuration)
    self.metadata = metadata
    self.reader = self.module_loader.load_reader(
        self.config.reader_options, metadata)
    if is_stream_reader(self.reader):
        deserializer = self.module_loader.load_deserializer(
            self.config.deserializer_options, metadata)
        decompressor = self.module_loader.load_decompressor(
            self.config.decompressor_options, metadata)
        self.reader.deserializer = deserializer
        self.reader.decompressor = decompressor
    self.filter_before = self.module_loader.load_filter(
        self.config.filter_before_options, metadata)
    self.filter_after = self.module_loader.load_filter(
        self.config.filter_after_options, metadata)
    self.transform = self.module_loader.load_transform(
        self.config.transform_options, metadata)
    self.export_formatter = self.module_loader.load_formatter(
        self.config.formatter_options, metadata)
    self.writer = self.module_loader.load_writer(
        self.config.writer_options, metadata,
        export_formatter=self.export_formatter)
    self.persistence = self.module_loader.load_persistence(
        self.config.persistence_options, metadata)
    self.grouper = self.module_loader.load_grouper(
        self.config.grouper_options, metadata)
    self.notifiers = NotifiersList(self.config.notifiers, metadata)
    if self.config.disable_retries:
        disable_retries()
    self.logger.debug('{} has been initiated'.format(self.__class__.__name__))
    self.stats_manager = self.module_loader.load_stats_manager(
        self.config.stats_options, metadata)
    self.bypass_cases = []
def setUp(self):
    self.module_loader = ModuleLoader()
def __init__(self, options, metadata):
    self.options = options
    self.module_loader = ModuleLoader()
    self.notifiers = self._populate_notifiers(metadata)
def test_transform_batch(self):
    # FIXME inline batch, without a reader
    reader = ModuleLoader().load_reader(self.options['reader'], meta())
    batch = reader.get_next_batch()
    self.assertEqual(self.transform.transform_batch(batch), batch)
class ModuleLoaderTest(unittest.TestCase):

    def setUp(self):
        self.module_loader = ModuleLoader()

    def test_reader_valid_class(self):
        options = valid_config_with_updates({
            'exporter_options': {
                'LOG_LEVEL': 'DEBUG',
                'LOGGER_NAME': 'export-pipeline'
            },
            'reader': {
                'name': 'exporters.transform.no_transform.NoTransform',
                'options': {}
            }
        })
        with self.assertRaises(TypeError):
            o = ExporterConfig(options)
            self.module_loader.load_reader(o.reader_options)

    def test_writer_valid_class(self):
        options = valid_config_with_updates({
            'exporter_options': {
                'LOG_LEVEL': 'DEBUG',
                'LOGGER_NAME': 'export-pipeline'
            },
            'writer': {
                'name': 'exporters.readers.random_reader.RandomReader',
                'options': {
                    'number_of_items': 1000,
                    'batch_size': 100
                }
            },
        })
        with self.assertRaises(TypeError):
            o = ExporterConfig(options)
            self.module_loader.load_writer(o.writer_options)

    def test_persistence_valid_class(self):
        options = valid_config_with_updates({
            'exporter_options': {
                'LOG_LEVEL': 'DEBUG',
                'LOGGER_NAME': 'export-pipeline'
            },
            'persistence': {
                'name': 'exporters.transform.no_transform.NoTransform',
                'options': {}
            }
        })
        o = ExporterConfig(options)
        with self.assertRaises(TypeError):
            self.module_loader.load_persistence(o.persistence_options)

    def test_formatter_valid_class(self):
        options = valid_config_with_updates({
            'exporter_options': {
                'LOG_LEVEL': 'DEBUG',
                'LOGGER_NAME': 'export-pipeline',
                'EXPORTER': 'exporters.writers.console_writer.ConsoleWriter',
            },
            'formatter': {
                'name': 'exporters.transform.no_transform.NoTransform',
                'options': {}
            },
        })
        with self.assertRaises(TypeError):
            o = ExporterConfig(options)
            self.module_loader.load_formatter(o.reader_options)

    def test_notifier_valid_class(self):
        options = valid_config_with_updates({
            'exporter_options': {
                'LOG_LEVEL': 'DEBUG',
                'LOGGER_NAME': 'export-pipeline'
            },
            'notifier': {
                'name': 'exporters.transform.no_transform.NoTransform',
                'options': {}
            }
        })
        with self.assertRaises(TypeError):
            o = ExporterConfig(options)
            self.module_loader.load_notifier(o.notifiers)

    def test_grouper_valid_class(self):
        options = valid_config_with_updates({
            'exporter_options': {
                'LOG_LEVEL': 'DEBUG',
                'LOGGER_NAME': 'export-pipeline'
            },
            'grouper': {
                'name': 'exporters.transform.no_transform.NoTransform',
                'options': {}
            }
        })
        with self.assertRaises(TypeError):
            o = ExporterConfig(options)
            self.module_loader.load_grouper(o.grouper_options)

    def test_filter_valid_class(self):
        options = valid_config_with_updates({
            'exporter_options': {
                'LOG_LEVEL': 'DEBUG',
                'LOGGER_NAME': 'export-pipeline'
            },
            'filter': {
                'name': 'exporters.transform.no_transform.NoTransform',
                'options': {}
            },
        })
        with self.assertRaises(TypeError):
            o = ExporterConfig(options)
            self.module_loader.load_filter(o.filter_before_options)

    def test_transform_valid_class(self):
        options = valid_config_with_updates({
            'exporter_options': {
                'LOG_LEVEL': 'DEBUG',
                'LOGGER_NAME': 'export-pipeline'
            },
            'transform': {
                'name': 'exporters.filters.no_filter.NoFilter',
                'options': {}
            }
        })
        with self.assertRaises(TypeError):
            o = ExporterConfig(options)
            self.module_loader.load_transform(o.transform_options)

    def test_load_grouper(self):
        grouper = {
            'name': 'exporters.groupers.file_key_grouper.FileKeyGrouper',
            'options': {
                'keys': ['country_code', 'state', 'city']
            }
        }
        self.assertIsInstance(self.module_loader.load_grouper(grouper, None), BaseGrouper)
class BaseExporter(object):

    def __init__(self, configuration):
        self.config = ExporterConfig(configuration)
        self.logger = ExportManagerLogger(self.config.log_options)
        self.module_loader = ModuleLoader()
        metadata = ExportMeta(configuration)
        self.metadata = metadata
        self.reader = self.module_loader.load_reader(
            self.config.reader_options, metadata)
        if is_stream_reader(self.reader):
            deserializer = self.module_loader.load_deserializer(
                self.config.deserializer_options, metadata)
            decompressor = self.module_loader.load_decompressor(
                self.config.decompressor_options, metadata)
            self.reader.deserializer = deserializer
            self.reader.decompressor = decompressor
        self.filter_before = self.module_loader.load_filter(
            self.config.filter_before_options, metadata)
        self.filter_after = self.module_loader.load_filter(
            self.config.filter_after_options, metadata)
        self.transform = self.module_loader.load_transform(
            self.config.transform_options, metadata)
        self.export_formatter = self.module_loader.load_formatter(
            self.config.formatter_options, metadata)
        self.writer = self.module_loader.load_writer(
            self.config.writer_options, metadata,
            export_formatter=self.export_formatter)
        self.persistence = self.module_loader.load_persistence(
            self.config.persistence_options, metadata)
        self.grouper = self.module_loader.load_grouper(
            self.config.grouper_options, metadata)
        self.notifiers = NotifiersList(self.config.notifiers, metadata)
        if self.config.disable_retries:
            disable_retries()
        self.logger.debug('{} has been initiated'.format(self.__class__.__name__))
        self.stats_manager = self.module_loader.load_stats_manager(
            self.config.stats_options, metadata)
        self.bypass_cases = []

    def _run_pipeline_iteration(self):
        times = OrderedDict([('started', datetime.datetime.now())])
        self.logger.debug('Getting new batch')
        if self.config.exporter_options.get('forced_reads'):
            next_batch = list(self.reader.get_next_batch())
        else:
            next_batch = self.reader.get_next_batch()
        times.update(read=datetime.datetime.now())
        next_batch = self.filter_before.filter_batch(next_batch)
        times.update(filtered=datetime.datetime.now())
        next_batch = self.transform.transform_batch(next_batch)
        times.update(transformed=datetime.datetime.now())
        next_batch = self.filter_after.filter_batch(next_batch)
        times.update(filtered_after=datetime.datetime.now())
        next_batch = self.grouper.group_batch(next_batch)
        times.update(grouped=datetime.datetime.now())
        try:
            self.writer.write_batch(batch=next_batch)
            times.update(written=datetime.datetime.now())
            last_position = self._get_last_position()
            self.persistence.commit_position(last_position)
            times.update(persisted=datetime.datetime.now())
        except ItemsLimitReached:
            # we have written some amount of records up to the limit
            times.update(written=datetime.datetime.now())
            self._iteration_stats_report(times)
            raise
        else:
            self._iteration_stats_report(times)

    def _get_last_position(self):
        last_position = self.reader.get_last_position()
        last_position['writer_metadata'] = self.writer.get_all_metadata()
        return last_position

    def _init_export_job(self):
        self.notifiers.notify_start_dump(receivers=[CLIENTS, TEAM])
        last_position = self.persistence.get_last_position()
        if last_position is not None:
            self.writer.update_metadata(last_position.get('writer_metadata'))
            self.metadata.accurate_items_count = last_position.get(
                'accurate_items_count', False)
            self.reader.set_last_position(last_position)

    def _clean_export_job(self):
        try:
            self.reader.close()
        except:
            raise
        finally:
            self.writer.close()

    def _finish_export_job(self):
        self.writer.finish_writing()
        self.metadata.end_time = datetime.datetime.now()

    def bypass_exporter(self, bypass_class):
        self.logger.info('Executing bypass {}.'.format(bypass_class.__name__))
        self.notifiers.notify_start_dump(receivers=[CLIENTS, TEAM])
        if not self.config.exporter_options.get('resume'):
            self.persistence.close()
            self.persistence.delete()
        with closing(bypass_class(self.config, self.metadata)) as bypass:
            bypass.execute()
        if not bypass.valid_total_count:
            self.metadata.accurate_items_count = False
            self.logger.warning('No accurate items count info can be retrieved')
        self.writer.set_metadata(
            'items_count',
            self.writer.get_metadata('items_count') + bypass.total_items)
        self.logger.info(
            'Finished executing bypass {}.'.format(bypass_class.__name__))
        self.notifiers.notify_complete_dump(receivers=[CLIENTS, TEAM])

    def bypass(self):
        if self.config.prevent_bypass:
            return False
        for bypass_class in self.bypass_cases:
            if bypass_class.meets_conditions(self.config):
                try:
                    self.bypass_exporter(bypass_class)
                    return True
                finally:
                    self._clean_export_job()
        return False

    def _handle_export_exception(self, exception):
        self.logger.error(traceback.format_exc(exception))
        self.logger.error(str(exception))
        self.notifiers.notify_failed_job(
            str(exception), str(traceback.format_exc(exception)), receivers=[TEAM])

    def _iteration_stats_report(self, times):
        try:
            self.stats_manager.iteration_report(times)
        except Exception as e:
            import traceback
            traceback.print_exc()
            self.logger.error('Error making stats report: {}'.format(str(e)))

    def _final_stats_report(self):
        try:
            self.stats_manager.final_report()
        except Exception as e:
            self.logger.error('Error making final stats report: {}'.format(str(e)))

    def _run_pipeline(self):
        while not self.reader.is_finished():
            try:
                self._run_pipeline_iteration()
            except ItemsLimitReached as e:
                self.logger.info('{!r}'.format(e))
                break
        self.writer.flush()

    def export(self):
        if not self.bypass():
            try:
                self._init_export_job()
                self._run_pipeline()
                self._finish_export_job()
                self._final_stats_report()
                self.persistence.close()
                self.notifiers.notify_complete_dump(receivers=[CLIENTS, TEAM])
            except Exception as e:
                self._handle_export_exception(e)
                raise e
            finally:
                self._clean_export_job()
        else:
            self.metadata.bypassed_pipeline = True
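A minimal sketch of how a concrete exporter could register bypass cases: BaseExporter.bypass() iterates bypass_cases and runs the first class whose meets_conditions() passes. The subclass name is hypothetical, and StreamBypass stands in for any class implementing the meets_conditions()/execute() pair shown earlier in this section.

# Illustrative subclass; names are assumptions, not taken from the code above.
class ExportManager(BaseExporter):
    def __init__(self, configuration):
        super(ExportManager, self).__init__(configuration)
        # Candidate bypasses, checked in order by BaseExporter.bypass()
        self.bypass_cases = [StreamBypass]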
class BaseExporter(object):

    def __init__(self, configuration):
        self.config = ExporterConfig(configuration)
        self.threaded = self.config.exporter_options.get('threaded', False)
        self.queue_size = self.config.exporter_options.get('thread_queue_size', 100)
        self.logger = ExportManagerLogger(self.config.log_options)
        self.module_loader = ModuleLoader()
        metadata = ExportMeta(configuration)
        self.metadata = metadata
        self.reader = self.module_loader.load_reader(
            self.config.reader_options, metadata)
        if is_stream_reader(self.reader):
            deserializer = self.module_loader.load_deserializer(
                self.config.deserializer_options, metadata)
            decompressor = self.module_loader.load_decompressor(
                self.config.decompressor_options, metadata)
            self.reader.deserializer = deserializer
            self.reader.decompressor = decompressor
        self.filter_before = self.module_loader.load_filter(
            self.config.filter_before_options, metadata)
        self.filter_after = self.module_loader.load_filter(
            self.config.filter_after_options, metadata)
        self.transform = self.module_loader.load_transform(
            self.config.transform_options, metadata)
        self.export_formatter = self.module_loader.load_formatter(
            self.config.formatter_options, metadata)
        self.writer = self.module_loader.load_writer(
            self.config.writer_options, metadata,
            export_formatter=self.export_formatter)
        self.persistence = self.module_loader.load_persistence(
            self.config.persistence_options, metadata)
        self.grouper = self.module_loader.load_grouper(
            self.config.grouper_options, metadata)
        self.notifiers = NotifiersList(self.config.notifiers, metadata)
        if self.config.disable_retries:
            disable_retries()
        self.logger.debug('{} has been initiated'.format(self.__class__.__name__))
        self.stats_manager = self.module_loader.load_stats_manager(
            self.config.stats_options, metadata)
        self.bypass_cases = []

    def _run_pipeline_iteration(self):
        times = OrderedDict([('started', datetime.datetime.now())])
        self.logger.debug('Getting new batch')
        if self.config.exporter_options.get('forced_reads'):
            next_batch = list(self.reader.get_next_batch())
        else:
            next_batch = self.reader.get_next_batch()
        times.update(read=datetime.datetime.now())
        next_batch = self.filter_before.filter_batch(next_batch)
        times.update(filtered=datetime.datetime.now())
        next_batch = self.transform.transform_batch(next_batch)
        times.update(transformed=datetime.datetime.now())
        next_batch = self.filter_after.filter_batch(next_batch)
        times.update(filtered_after=datetime.datetime.now())
        next_batch = self.grouper.group_batch(next_batch)
        times.update(grouped=datetime.datetime.now())
        try:
            self.writer.write_batch(batch=next_batch)
            times.update(written=datetime.datetime.now())
            last_position = self._get_last_position()
            self.persistence.commit_position(last_position)
            times.update(persisted=datetime.datetime.now())
        except ItemsLimitReached:
            # we have written some amount of records up to the limit
            times.update(written=datetime.datetime.now())
            self._iteration_stats_report(times)
            raise
        else:
            self._iteration_stats_report(times)

    def _get_last_position(self):
        last_position = self.reader.get_last_position()
        last_position['writer_metadata'] = self.writer.get_all_metadata()
        return last_position

    def _init_export_job(self):
        self.notifiers.notify_start_dump(receivers=[CLIENTS, TEAM])
        last_position = self.persistence.get_last_position()
        if last_position is not None:
            self.writer.update_metadata(last_position.get('writer_metadata'))
            self.metadata.accurate_items_count = last_position.get(
                'accurate_items_count', False)
            self.reader.set_last_position(last_position)

    def _clean_export_job(self):
        try:
            self.reader.close()
        except:
            raise
        finally:
            self.writer.close()

    def _finish_export_job(self):
        self.writer.finish_writing()
        self.metadata.end_time = datetime.datetime.now()

    def bypass_exporter(self, bypass_class):
        self.logger.info('Executing bypass {}.'.format(bypass_class.__name__))
        self.notifiers.notify_start_dump(receivers=[CLIENTS, TEAM])
        if not self.config.exporter_options.get('resume'):
            self.persistence.close()
            self.persistence.delete()
        with closing(bypass_class(self.config, self.metadata)) as bypass:
            bypass.execute()
        if not bypass.valid_total_count:
            self.metadata.accurate_items_count = False
            self.logger.warning('No accurate items count info can be retrieved')
        self.writer.set_metadata(
            'items_count',
            self.writer.get_metadata('items_count') + bypass.total_items)
        self.logger.info(
            'Finished executing bypass {}.'.format(bypass_class.__name__))
        self._final_stats_report()
        self.notifiers.notify_complete_dump(receivers=[CLIENTS, TEAM])

    def bypass(self):
        if self.config.prevent_bypass:
            return False
        for bypass_class in self.bypass_cases:
            if bypass_class.meets_conditions(self.config):
                try:
                    self.bypass_exporter(bypass_class)
                    return True
                finally:
                    self._clean_export_job()
        return False

    def _handle_export_exception(self, exception):
        self.logger.error(traceback.format_exc(exception))
        self.logger.error(str(exception))
        self.notifiers.notify_failed_job(
            str(exception), str(traceback.format_exc(exception)), receivers=[TEAM])

    def _iteration_stats_report(self, times):
        try:
            self.stats_manager.iteration_report(times)
        except Exception as e:
            import traceback
            traceback.print_exc()
            self.logger.error('Error making stats report: {}'.format(str(e)))

    def _final_stats_report(self):
        try:
            self.stats_manager.final_report()
        except Exception as e:
            self.logger.error('Error making final stats report: {}'.format(str(e)))

    def _run_pipeline(self):
        while not self.reader.is_finished():
            try:
                self._run_pipeline_iteration()
            except ItemsLimitReached as e:
                self.logger.info('{!r}'.format(e))
                break
        self.writer.flush()

    def _reader_thread(self):
        self.logger.info('Starting reader thread')
        while not self.reader.is_finished():
            self.process_queue.put(list(self.reader.get_next_batch()))
            qsize = self.process_queue.qsize()
            if qsize > 0.5 * self.queue_size:
                # Queues are getting full, throttle the reader so the
                # processor/writer can keep up
                time.sleep((qsize * 10.0 / self.queue_size) - 5)
        self.reader_finished = True

    def _process_thread(self):
        self.logger.info('Starting processing thread')
        while not self.reader_finished or not self.process_queue.empty():
            next_batch = self.process_queue.get()
            next_batch = self.filter_before.filter_batch(next_batch)
            next_batch = self.transform.transform_batch(next_batch)
            next_batch = self.filter_after.filter_batch(next_batch)
            next_batch = self.grouper.group_batch(next_batch)
            self.writer_queue.put(next_batch)
        self.process_finished = True

    def _writer_thread(self):
        self.logger.info('Starting writer thread')
        while not self.process_finished or not self.writer_queue.empty():
            batch = self.writer_queue.get()
            self.writer.write_batch(batch=batch)
        self.writer.finish_writing()
        self.writer.flush()

    def _run_threads(self):
        self.reader_finished = False
        self.process_finished = False
        self.process_queue = Queue(self.queue_size)
        self.writer_queue = Queue(self.queue_size)
        reader_thread = Thread(target=self._reader_thread)
        process_thread = Thread(target=self._process_thread)
        writer_thread = Thread(target=self._writer_thread)
        reader_thread.start()
        process_thread.start()
        writer_thread.start()
        reader_thread.join()
        process_thread.join()
        writer_thread.join()

    def export(self):
        if not self.bypass():
            try:
                self._init_export_job()
                if self.threaded:
                    self._run_threads()
                else:
                    self._run_pipeline()
                self._finish_export_job()
                self._final_stats_report()
                self.persistence.close()
                self.notifiers.notify_complete_dump(receivers=[CLIENTS, TEAM])
            except Exception as e:
                self._handle_export_exception(e)
                raise
            finally:
                self._clean_export_job()
        else:
            self.metadata.bypassed_pipeline = True
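The threaded pipeline above is driven entirely by exporter_options: 'threaded' selects _run_threads() over _run_pipeline(), and 'thread_queue_size' bounds the two inter-thread queues. A minimal configuration sketch (other sections omitted, values illustrative):

# Illustrative only: keys read by the threaded variant's constructor.
configuration = {
    'exporter_options': {
        'threaded': True,          # run reader/processor/writer in separate threads
        'thread_queue_size': 200,  # bound on process_queue/writer_queue (default 100)
    },
    # reader/writer/filter/... sections omitted; they are loaded exactly as before
}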