def update_to(self, seq):
    kafka_seq = None
    if isinstance(seq, dict):
        assert self.sequence_format == 'json'
        kafka_seq = seq
        seq = kafka_seq_to_str(seq)
    elif isinstance(seq, int):
        seq = str(seq)
    pillow_logging.info(
        "(%s) setting checkpoint: %s" % (self.checkpoint_id, seq)
    )
    with transaction.atomic():
        if kafka_seq:
            for topic_partition, offset in kafka_seq.items():
                KafkaCheckpoint.objects.update_or_create(
                    checkpoint_id=self.checkpoint_id,
                    topic=topic_partition[0],
                    partition=topic_partition[1],
                    defaults={'offset': offset}
                )
        checkpoint = self.get_or_create_wrapped(verify_unchanged=True)
        checkpoint.sequence = seq
        checkpoint.timestamp = datetime.utcnow()
        checkpoint.save()
        self._last_checkpoint = checkpoint

def process_bulk_docs(self, docs, progress_logger):
    if len(docs) == 0:
        return True

    pillow_logging.info("Processing batch of %s docs", len(docs))

    changes = [
        self._doc_to_change(doc) for doc in docs
        if self.process_deletes or not is_deletion(doc.get('doc_type'))
    ]
    error_collector = ErrorCollector()
    bulk_changes = build_bulk_payload(self.index_info, changes, self.doc_transform, error_collector)

    for change, exception in error_collector.errors:
        pillow_logging.error("Error processing doc %s: %s (%s)", change.id, type(exception), exception)

    es_interface = ElasticsearchInterface(self.es)
    try:
        es_interface.bulk_ops(bulk_changes)
    except (ESBulkIndexError, ES2BulkIndexError, ES7BulkIndexError) as e:
        pillow_logging.error("Bulk index errors\n%s", e.errors)
    except Exception:
        pillow_logging.exception("\tException sending payload to ES")
        return False

    return True

def process_bulk_docs(self, docs, progress_logger):
    if not docs:
        return True

    pillow_logging.info("Processing batch of %s docs", len(docs))

    changes = []
    for doc in docs:
        change = self._doc_to_change(doc)
        # de-dupe the is_deletion check
        if self.process_deletes or not change.deleted:
            changes.append(change)

    error_collector = ErrorCollector()
    bulk_changes = build_bulk_payload(changes, self.doc_transform, error_collector)

    for change, exception in error_collector.errors:
        pillow_logging.error("Error processing doc %s: %s (%s)", change.id, type(exception), exception)

    es_interface = ElasticsearchInterface(self.es)
    try:
        es_interface.bulk_ops(self.index_info.alias, self.index_info.type, bulk_changes)
    except BulkIndexError as e:
        pillow_logging.error("Bulk index errors\n%s", e.errors)
    except Exception as exc:
        pillow_logging.exception(
            "Error sending bulk payload to Elasticsearch: %s", exc)
        return False

    return True

def process_bulk_docs(self, docs):
    if len(docs) == 0:
        return True

    pillow_logging.info("Processing batch of %s docs", len(docs))

    changes = [self._doc_to_change(doc) for doc in docs]
    error_collector = ErrorCollector()
    bulk_changes = build_bulk_payload(self.index_info, changes, self.doc_transform, error_collector)

    for change, exception in error_collector.errors:
        pillow_logging.error("Error processing doc %s: %s", change.id, exception)

    payloads = prepare_bulk_payloads(bulk_changes, MAX_PAYLOAD_SIZE)
    if len(payloads) > 1:
        pillow_logging.info("Payload split into %s parts" % len(payloads))

    for payload in payloads:
        success = self._send_payload_with_retries(payload)
        if not success:
            # stop the reindexer if we're unable to send a payload to ES
            return False

    return True

def update_to(self, seq, change=None):
    kafka_seq = None
    if isinstance(seq, dict):
        assert self.sequence_format == 'json'
        kafka_seq = seq
        seq = kafka_seq_to_str(seq)
    elif isinstance(seq, int):
        seq = str(seq)
    pillow_logging.info(
        "(%s) setting checkpoint: %s" % (self.checkpoint_id, seq)
    )
    doc_modification_time = change.metadata.publish_timestamp if change else None
    with transaction.atomic():
        if kafka_seq:
            for topic_partition, offset in kafka_seq.items():
                KafkaCheckpoint.objects.update_or_create(
                    checkpoint_id=self.checkpoint_id,
                    topic=topic_partition[0],
                    partition=topic_partition[1],
                    defaults={'offset': offset, 'doc_modification_time': doc_modification_time}
                )
        checkpoint = self.get_or_create_wrapped(verify_unchanged=True)
        checkpoint.sequence = seq
        checkpoint.timestamp = datetime.utcnow()
        checkpoint.save()
        self._last_checkpoint = checkpoint

def run(self):
    """
    Main entry point for running pillows forever.
    """
    pillow_logging.info("Starting pillow %s" % self.__class__)
    self.process_changes(since=self.get_last_checkpoint_sequence(), forever=True)

def _send_payload_with_retries(self, payload):
    pillow_logging.info("Sending payload to ES")

    retries = 0
    bulk_start = datetime.utcnow()
    success = False
    while retries < MAX_TRIES:
        if retries:
            retry_time = (datetime.utcnow() - bulk_start).seconds + retries * RETRY_TIME_DELAY_FACTOR
            pillow_logging.warning("\tRetrying in %s seconds" % retry_time)
            time.sleep(retry_time)
            pillow_logging.warning("\tRetrying now ...")
            # reset timestamp when looping again
            bulk_start = datetime.utcnow()
        try:
            self.es.bulk(payload)
            success = True
            break
        except Exception:
            retries += 1
            pillow_logging.exception("\tException sending payload to ES")

    return success

def process_bulk_docs(self, docs):
    if len(docs) == 0:
        return True

    pillow_logging.info("Processing batch of %s docs", len(docs))

    changes = [self._doc_to_change(doc) for doc in docs]
    error_collector = ErrorCollector()
    bulk_changes = build_bulk_payload(self.index_info, changes, self.doc_transform, error_collector)

    for change, exception in error_collector.errors:
        pillow_logging.error("Error processing doc %s: %s", change.id, exception)

    max_payload_size = pow(10, 8)  # ~ 100Mb
    payloads = prepare_bulk_payloads(bulk_changes, max_payload_size)
    if len(payloads) > 1:
        pillow_logging.info("Payload split into %s parts" % len(payloads))

    for payload in payloads:
        success = self._send_payload_with_retries(payload)
        if not success:
            # stop the reindexer if we're unable to send a payload to ES
            return False

    return True

def _update_modified_since(self, timestamp):
    """
    Find any data sources that have been modified since the last time this was
    bootstrapped and update the in-memory references.
    """
    for data_source in self.data_source_provider.get_data_sources_modified_since(timestamp):
        pillow_logging.info(f'updating modified registry data source: {data_source.domain}: {data_source._id}')
        self._add_or_update_data_source(data_source)

def update_to(self, seq):
    pillow_logging.info("(%s) setting checkpoint: %s" % (self.checkpoint_id, seq))
    checkpoint = self.get_or_create_wrapped(verify_unchanged=True)
    checkpoint.sequence = seq
    checkpoint.timestamp = datetime.utcnow()
    checkpoint.save()
    self._last_checkpoint = checkpoint

def run(self):
    """
    Main entry point for running pillows forever.
    """
    pillow_logging.info("Starting pillow %s" % self.__class__)
    with configure_scope() as scope:
        scope.set_tag("pillow_name", self.get_name())
        self.process_changes(since=self.get_last_checkpoint_sequence(), forever=True)

def update_to(self, seq):
    pillow_logging.info(
        "(%s) setting checkpoint: %s" % (self.checkpoint_id, seq)
    )
    checkpoint = self.get_or_create_wrapped(verify_unchanged=True).document
    checkpoint.sequence = seq
    checkpoint.timestamp = datetime.utcnow()
    checkpoint.save()
    self._last_checkpoint = checkpoint

def reindex(self):
    for i, change in enumerate(self.change_provider.iter_all_changes(start_from=self.start_from)):
        try:
            self.pillow.process_change(change)
        except Exception:
            pillow_logging.exception("Unable to process change: %s", change.id)

        # log progress once every 1000 docs
        if i % 1000 == 0:
            pillow_logging.info("Processed %s docs", i)

def completely_initialize_pillow_index(pillow):
    """
    This utility can be used to initialize the elastic index and mapping for a pillow
    """
    index_exists = pillow_index_exists(pillow)
    if not index_exists:
        create_index_for_pillow(pillow)
    pillow_logging.info("Pillowtop [%s] Initializing mapping in ES" % pillow.get_name())
    initialize_mapping_if_necessary(pillow)

def update_to(self, seq):
    pillow_logging.info(
        "(%s) setting checkpoint: %s" % (self.checkpoint_id, seq)
    )
    checkpoint = self.get_or_create(verify_unchanged=True).document
    checkpoint['seq'] = seq
    checkpoint['timestamp'] = get_formatted_current_timestamp()
    self._manager.update_checkpoint(self.checkpoint_id, checkpoint)
    self._last_checkpoint = checkpoint

def reindex(self):
    for i, change in enumerate(self.change_provider.iter_all_changes()):
        try:
            # below works because signature is same for pillow and processor
            self.pillow_or_processor.process_change(change)
        except Exception:
            pillow_logging.exception("Unable to process change: %s", change.id)

        # log progress once every 1000 docs
        if i % 1000 == 0:
            pillow_logging.info("Processed %s docs", i)

def initialize_index(es, index_info):
    index = index_info.index
    mapping = index_info.mapping
    mapping['_meta']['created'] = datetime.isoformat(datetime.utcnow())
    meta = copy(index_info.meta)
    meta.update({'mappings': {index_info.type: mapping}})

    pillow_logging.info("Initializing elasticsearch index for [%s]" % index_info.type)

    es.indices.create(index=index, body=meta)
    set_index_normal_settings(es, index)

def _add_data_sources_to_table_adapters(self, new_data_sources):
    for new_data_source in new_data_sources:
        pillow_logging.info(f'updating modified data source: {new_data_source.domain}: {new_data_source._id}')
        domain_adapters = self.table_adapters_by_domain[new_data_source.domain]
        # remove any previous adapters if they existed
        domain_adapters = [
            adapter for adapter in domain_adapters
            if adapter.config._id != new_data_source._id
        ]
        # add a new one
        domain_adapters.append(self._get_indicator_adapter(new_data_source))
        # update dictionary
        self.table_adapters_by_domain[new_data_source.domain] = domain_adapters

def process_bulk(self, changes):
    if not changes:
        return
    self.allow_updates = False
    self.bulk = True
    bstart = datetime.utcnow()
    bulk_payload = '\n'.join(map(simplejson.dumps, self.bulk_builder(changes))) + "\n"
    pillow_logging.info(
        "%s,prepare_bulk,%s" % (self.get_name(), str(ms_from_timedelta(datetime.utcnow() - bstart) / 1000.0)))
    send_start = datetime.utcnow()
    self.send_bulk(bulk_payload)
    pillow_logging.info(
        "%s,send_bulk,%s" % (self.get_name(), str(ms_from_timedelta(datetime.utcnow() - send_start) / 1000.0)))

def update_checkpoint(self, change, context):
    if self.should_update_checkpoint(context):
        context.reset()
        self.checkpoint.update_to(self.get_new_seq(change))
        self.last_update = datetime.utcnow()
        if self.checkpoint_callback:
            self.checkpoint_callback.checkpoint_updated()
        return True
    elif (datetime.utcnow() - self.last_log).total_seconds() > 10:
        self.last_log = datetime.utcnow()
        pillow_logging.info("Heartbeat: %s", self.get_new_seq(change))

    return False

def update_to(self, seq):
    if isinstance(seq, dict):
        seq = json.dumps(seq)
    elif isinstance(seq, int):
        seq = str(seq)
    pillow_logging.info(
        "(%s) setting checkpoint: %s" % (self.checkpoint_id, seq)
    )
    checkpoint = self.get_or_create_wrapped(verify_unchanged=True)
    checkpoint.sequence = seq
    checkpoint.timestamp = datetime.utcnow()
    checkpoint.save()
    self._last_checkpoint = checkpoint

def run(self):
    """
    Main entry point for running pillows forever.
    """
    pillow_logging.info("Starting pillow %s" % self.__class__)
    with configure_scope() as scope:
        scope.set_tag("pillow_name", self.get_name())
        if self.is_dedicated_migration_process:
            for processor in self.processors:
                processor.bootstrap_if_needed()
            time.sleep(10)
        else:
            self.process_changes(since=self.get_last_checkpoint_sequence(), forever=True)

def initialize_index(es, index_info):
    index = index_info.index
    mapping = index_info.mapping
    mapping['_meta']['created'] = datetime.isoformat(datetime.utcnow())
    meta = copy(index_info.meta)
    if settings.ELASTICSEARCH_MAJOR_VERSION == 7:
        mapping = transform_for_es7(mapping)
        meta.update({'mappings': mapping})
    else:
        meta.update({'mappings': {index_info.type: mapping}})

    pillow_logging.info("Initializing elasticsearch index for [%s]" % index_info.type)

    es.indices.create(index=index, body=meta)
    set_index_normal_settings(es, index)

def update_to(self, seq):
    kafka_seq = seq
    seq = kafka_seq_to_str(seq)
    pillow_logging.info(
        "(%s) setting checkpoint: %s" % (self.checkpoint_id, seq)
    )
    with transaction.atomic():
        if kafka_seq:
            for topic_partition, offset in kafka_seq.items():
                KafkaCheckpoint.objects.update_or_create(
                    checkpoint_id=self.checkpoint_id,
                    topic=topic_partition[0],
                    partition=topic_partition[1],
                    defaults={'offset': offset}
                )

def update_to(self, seq, change=None):
    if isinstance(seq, six.string_types):
        kafka_seq = str_to_kafka_seq(seq)
    else:
        kafka_seq = seq
        seq = kafka_seq_to_str(seq)
    pillow_logging.info(
        "(%s) setting checkpoint: %s" % (self.checkpoint_id, seq)
    )
    doc_modification_time = change.metadata.publish_timestamp if change else None
    with transaction.atomic():
        if kafka_seq:
            for topic_partition, offset in kafka_seq.items():
                KafkaCheckpoint.objects.update_or_create(
                    checkpoint_id=self.checkpoint_id,
                    topic=topic_partition[0],
                    partition=topic_partition[1],
                    defaults={'offset': offset, 'doc_modification_time': doc_modification_time}
                )

def update_to(self, seq, change=None):
    if isinstance(seq, str):
        kafka_seq = str_to_kafka_seq(seq)
    else:
        kafka_seq = seq
        seq = kafka_seq_to_str(seq)
    pillow_logging.info(
        "(%s) setting checkpoint: %s" % (self.checkpoint_id, seq)
    )
    doc_modification_time = change.metadata.publish_timestamp if change else None
    with transaction.atomic():
        if kafka_seq:
            for topic_partition, offset in kafka_seq.items():
                KafkaCheckpoint.objects.update_or_create(
                    checkpoint_id=self.checkpoint_id,
                    topic=topic_partition[0],
                    partition=topic_partition[1],
                    defaults={'offset': offset, 'doc_modification_time': doc_modification_time}
                )

def _rebuild_sql_tables(self, adapters):
    tables_by_engine = defaultdict(dict)
    all_adapters = []
    for adapter in adapters:
        if getattr(adapter, 'all_adapters', None):
            all_adapters.extend(adapter.all_adapters)
        else:
            all_adapters.append(adapter)
    for adapter in all_adapters:
        tables_by_engine[adapter.engine_id][adapter.get_table().name] = adapter

    _assert = soft_assert(notify_admins=True)
    _notify_rebuild = lambda msg, obj: _assert(False, msg, obj)

    for engine_id, table_map in tables_by_engine.items():
        table_names = list(table_map)
        engine = connection_manager.get_engine(engine_id)

        diffs = get_table_diffs(engine, table_names, get_metadata(engine_id))

        tables_to_act_on = get_tables_rebuild_migrate(diffs)
        for table_name in tables_to_act_on.rebuild:
            pillow_logging.info("[rebuild] Rebuilding table: %s", table_name)
            sql_adapter = table_map[table_name]
            table_diffs = [
                diff for diff in diffs if diff.table_name == table_name
            ]
            if not sql_adapter.config.is_static:
                try:
                    self.rebuild_table(sql_adapter, table_diffs)
                except TableRebuildError as e:
                    _notify_rebuild(str(e), sql_adapter.config.to_json())
            else:
                self.rebuild_table(sql_adapter, table_diffs)

        self.migrate_tables(engine, diffs, tables_to_act_on.migrate, table_map)

def migrate_tables(self, engine, diffs, table_names, adapters_by_table):
    migration_diffs = [diff for diff in diffs if diff.table_name in table_names]
    for table in table_names:
        adapter = adapters_by_table[table]
        pillow_logging.info("[rebuild] Using config: %r", adapter.config)
        pillow_logging.info("[rebuild] sqlalchemy metadata: %r", get_metadata(adapter.engine_id).tables[table])
        pillow_logging.info("[rebuild] sqlalchemy table: %r", adapter.get_table())

    changes = migrate_tables(engine, migration_diffs)
    for table, diffs in changes.items():
        adapter = adapters_by_table[table]
        pillow_logging.info(
            "[rebuild] Migrating table: %s, from config %s at rev %s",
            table, adapter.config._id, adapter.config._rev
        )
        adapter.log_table_migrate(source='pillowtop', diffs=diffs)

def process_bulk_docs(self, docs):
    if len(docs) == 0:
        return True

    pillow_logging.info("Processing batch of %s docs", len(docs))

    changes = [self._doc_to_change(doc) for doc in docs]
    error_collector = ErrorCollector()
    bulk_changes = build_bulk_payload(self.index_info, changes, self.doc_transform, error_collector)

    for change, exception in error_collector.errors:
        pillow_logging.error("Error processing doc %s: %s (%s)", change.id, type(exception), exception)

    es_interface = ElasticsearchInterface(self.es)
    try:
        es_interface.bulk_ops(bulk_changes)
    except Exception:
        pillow_logging.exception("\tException sending payload to ES")
        return False

    return True

def _send_payload_with_retries(self, payload):
    pillow_logging.info("Sending payload to ES")

    retries = 0
    bulk_start = datetime.utcnow()
    success = False
    while retries < MAX_TRIES:
        if retries:
            retry_time = (datetime.utcnow() - bulk_start).seconds + retries * RETRY_TIME_DELAY_FACTOR
            pillow_logging.warning("\tRetrying in %s seconds" % retry_time)
            time.sleep(retry_time)
            pillow_logging.warning("\tRetrying now ...")
            # reset timestamp when looping again
            bulk_start = datetime.utcnow()
        try:
            self.es.bulk(payload)
            success = True
            break
        except Exception:
            retries += 1
            pillow_logging.exception("\tException sending payload to ES")

    return success

def run(self):
    """
    Main entry point for running pillows forever.
    """
    pillow_logging.info("Starting pillow %s" % self.__class__)
    with configure_scope() as scope:
        scope.set_tag("pillow_name", self.get_name())
        since = self.get_last_checkpoint_sequence()
        while True:
            pillow_logging.info(
                f"Processing from change feed starting at {since}")
            self.process_changes(since=since)
            since = self.get_last_checkpoint_sequence()
            pillow_logging.info(
                f"Change feed ended at {since}. Pausing until next message.")
            self.wait_for_change(since)
            pillow_logging.info("Next message arrived.")

def initialize_mapping_if_necessary(es, index_info):
    """
    Initializes the elasticsearch mapping for this pillow if it is not found.
    """
    if not mapping_exists(es, index_info):
        pillow_logging.info("Initializing elasticsearch mapping for [%s]" % index_info.type)
        mapping = copy(index_info.mapping)
        mapping['_meta']['created'] = datetime.isoformat(datetime.utcnow())
        mapping_res = es.indices.put_mapping(index_info.type, {index_info.type: mapping}, index=index_info.index)
        if mapping_res.get('ok', False) and mapping_res.get('acknowledged', False):
            # API confirms OK, trust it.
            pillow_logging.info("Mapping set: [%s] %s" % (index_info.type, mapping_res))
    else:
        pillow_logging.info("Elasticsearch mapping for [%s] was already present." % index_info.type)

def initialize_mapping_if_necessary(pillow):
    """
    Initializes the elasticsearch mapping for this pillow if it is not found.
    """
    es = pillow.get_es_new()
    if not pillow_mapping_exists(pillow):
        pillow_logging.info("Initializing elasticsearch mapping for [%s]" % pillow.es_type)
        mapping = copy(pillow.default_mapping)
        mapping['_meta']['created'] = datetime.isoformat(datetime.utcnow())
        mapping_res = es.indices.put_mapping(pillow.es_index, pillow.es_type, {pillow.es_type: mapping})
        if mapping_res.get('ok', False) and mapping_res.get('acknowledged', False):
            # API confirms OK, trust it.
            pillow_logging.info("Mapping set: [%s] %s" % (pillow.es_type, mapping_res))
    else:
        pillow_logging.info("Elasticsearch mapping for [%s] was already present." % pillow.es_type)

def process_bulk(self, changes):
    if not changes:
        return
    self.allow_updates = False
    self.bulk = True
    bstart = datetime.utcnow()
    bulk_changes = self.bulk_builder(changes)

    max_payload_size = pow(10, 8)  # ~ 100Mb
    payloads = prepare_bulk_payloads(bulk_changes, max_payload_size)
    if len(payloads) > 1:
        pillow_logging.info("%s,payload split into %s parts" % (self.get_name(), len(payloads)))

    pillow_logging.info(
        "%s,prepare_bulk,%s" % (self.get_name(), str(ms_from_timedelta(datetime.utcnow() - bstart) / 1000.0)))
    send_start = datetime.utcnow()

    for payload in payloads:
        self.send_bulk(payload)

    pillow_logging.info(
        "%s,send_bulk,%s" % (self.get_name(), str(ms_from_timedelta(datetime.utcnow() - send_start) / 1000.0)))

def _set_checkpoint(pillow):
    checkpoint_value = pillow.get_change_feed().get_checkpoint_value()
    pillow_logging.info('setting checkpoint to {}'.format(checkpoint_value))
    pillow.checkpoint.update_to(checkpoint_value)

def process_change(self, change, is_retry_attempt=False):
    # do nothing
    if self._changes_processed % KAFKA_CHECKPOINT_FREQUENCY == 0:
        # only log a small amount to avoid clogging up supervisor
        pillow_logging.info('Processed change {}: {}'.format(self._changes_processed, change))
    self._changes_processed += 1