def run(self, mongo_collection_name, page_size):
    """Copy every document of a mongo collection into elastic, page by page.

    :param mongo_collection_name: name of the resource; used both to look up
        the resource service and as the elastic resource name.
    :param page_size: number of items per page; any falsy value falls back
        to ``self.default_page_size``.
    :return: a human-readable completion message.
    :raises BulkIndexError: as soon as a page reports failed items.
    """
    if page_size:
        batch_size = int(page_size)
    else:
        batch_size = self.default_page_size

    print('Indexing data from mongo/{} to elastic/{}'.format(
        mongo_collection_name, mongo_collection_name))

    resource_service = superdesk.get_resource_service(mongo_collection_name)
    total = resource_service.get_from_mongo(None, {}).count()

    # One offset per page: 0, batch_size, 2 * batch_size, ...
    offsets = range(0, total, batch_size)
    print('Number of items to index: {}, pages={}'.format(total, len(offsets)))

    for page_number, offset in enumerate(offsets, start=1):
        print('Page : {}, skip: {}'.format(page_number, offset))

        # A fresh cursor is opened for every page and windowed with skip/limit.
        page_cursor = resource_service.get_from_mongo(None, {})
        page_cursor.skip(offset)
        page_cursor.limit(batch_size)
        page_items = list(page_cursor)

        print('Inserting {} items'.format(len(page_items)))
        backend = superdesk.app.data._search_backend(mongo_collection_name)
        inserted, errors = backend.bulk_insert(mongo_collection_name, page_items)
        print('Inserted {} items'.format(inserted))

        if not errors:
            continue

        print('Failed to do bulk insert of items {}. Errors: {}'.format(
            len(errors), errors))
        raise BulkIndexError(resource=mongo_collection_name, errors=errors)

    return 'Finished indexing collection {}'.format(mongo_collection_name)
def copy_resource(cls, resource, page_size):
    """Bulk-insert all batches of a resource from mongo into elastic.

    Batches are obtained from ``cls.get_mongo_items(resource, page_size)``.
    Each batch is attempted up to three times; an attempt that raises is
    logged and followed by a 10 second pause before the next attempt.

    :param resource: resource name, used for both the search backend lookup
        and the bulk insert.
    :param page_size: forwarded unchanged to ``cls.get_mongo_items``.
    :return: a human-readable completion message.
    :raises BulkIndexError: when a bulk insert reports failed items.
    """
    for items in cls.get_mongo_items(resource, page_size):
        print('{} Inserting {} items'.format(time.strftime('%X %x %Z'), len(items)))
        started = time.time()

        # Defaults used when every attempt raises: the batch is then reported
        # as 0 inserted and processing moves on (best effort).
        success, failed = 0, 0

        for _attempt in range(1, 4):
            try:
                success, failed = superdesk.app.data._search_backend(
                    resource).bulk_insert(resource, items)
            except Exception as ex:
                # The placeholder was previously never filled in, so the
                # message printed a literal "{}" followed by the exception.
                print('Exception thrown on insert to elastic {}'.format(ex))
                time.sleep(10)
            else:
                break

        print('{} Inserted {} items in {:.3f} seconds'.format(
            time.strftime('%X %x %Z'), success, time.time() - started))

        if failed:
            print('Failed to do bulk insert of items {}. Errors: {}'.format(
                len(failed), failed))
            raise BulkIndexError(resource=resource, errors=failed)

    return 'Finished indexing collection {}'.format(resource)
def index_elastic_from_mongo(hours=None, collection=None):
    """Index mongo documents into elastic for one or all elastic resources.

    Batches come from ``_get_mongo_items(resource, hours)``. Each batch is
    attempted up to three times; an attempt that raises is logged and
    followed by a 10 second pause before the next attempt.

    :param hours: forwarded unchanged to ``_get_mongo_items``.
    :param collection: when given, only this resource is indexed; otherwise
        every resource returned by ``app.data.get_elastic_resources()``.
    :raises SystemExit: when ``collection`` is not a known elastic resource.
    :raises BulkIndexError: when a bulk insert reports failed items.
    """
    print('Starting indexing from mongodb for "{}" collection hours={}'.format(collection, hours))

    resources = app.data.get_elastic_resources()
    if collection:
        if collection not in resources:
            raise SystemExit('Cannot find collection: {}'.format(collection))
        resources = [collection]

    for resource in resources:
        print('Starting indexing collection {}'.format(resource))

        for items in _get_mongo_items(resource, hours):
            print('{} Inserting {} items'.format(time.strftime('%X %x %Z'), len(items)))
            started = time.time()

            # Reset per batch. Without this, three failed attempts either
            # raised UnboundLocalError (first batch) or silently reported the
            # previous batch's result for the current one.
            success, failed = 0, 0

            for _attempt in range(1, 4):
                try:
                    success, failed = superdesk.app.data._search_backend(resource).bulk_insert(resource, items)
                except Exception as ex:
                    # The placeholder was previously never filled in.
                    print('Exception thrown on insert to elastic {}'.format(ex))
                    time.sleep(10)
                else:
                    break

            print('{} Inserted {} items in {:.3f} seconds'.format(
                time.strftime('%X %x %Z'), success, time.time() - started))

            if failed:
                print('Failed to do bulk insert of items {}. Errors: {}'.format(len(failed), failed))
                raise BulkIndexError(resource=resource, errors=failed)

        print('Finished indexing collection {}'.format(resource))
def index_elastic_from_mongo_from_timestamp(collection, timestamp_str, direction):
    """Index one collection from mongo into elastic relative to a timestamp.

    Batches come from
    ``_get_mongo_items_from_timestamp(collection, timestamp, direction)``.
    Each batch is attempted up to three times; an attempt that raises is
    logged and followed by a 10 second pause before the next attempt.

    :param collection: name of the elastic resource to index (required).
    :param timestamp_str: timestamp in ``%Y-%m-%dT%H:%M`` format,
        e.g. ``2019-05-20T05:00`` (required).
    :param direction: either ``'older'`` or ``'newer'``; forwarded together
        with the parsed timestamp to ``_get_mongo_items_from_timestamp``.
    :raises SystemExit: on a missing or invalid argument, or an unknown
        collection.
    :raises BulkIndexError: when a bulk insert reports failed items.
    """
    if not collection:
        raise SystemExit('Collection not provided')
    elif not timestamp_str:
        raise SystemExit('Timestamp not provided')
    elif direction not in ['older', 'newer']:
        raise SystemExit('Direction can only be "older" or "newer", not {}'.format(direction))

    try:
        timestamp = datetime.strptime(timestamp_str, '%Y-%m-%dT%H:%M')
    except ValueError as e:
        raise SystemExit('Timestamp in incorrect format (e.g. 2019-05-20T05:00). {}'.format(e)) from e

    print('Starting indexing from mongodb for "{}" collection, timestamp={}, direction={}'.format(
        collection,
        timestamp,
        direction
    ))

    resources = app.data.get_elastic_resources()
    if collection not in resources:
        raise SystemExit('Cannot find collection: {}'.format(collection))

    print('Starting indexing collection {}'.format(collection))

    for items in _get_mongo_items_from_timestamp(collection, timestamp, direction):
        print('{} Inserting {} items'.format(time.strftime('%X %x %Z'), len(items)))
        started = time.time()

        # Reset per batch. Without this, three failed attempts either raised
        # UnboundLocalError (first batch) or silently reported the previous
        # batch's result for the current one.
        success, failed = 0, 0

        for _attempt in range(1, 4):
            try:
                success, failed = superdesk.app.data._search_backend(collection).bulk_insert(collection, items)
            except Exception as ex:
                # The placeholder was previously never filled in.
                print('Exception thrown on insert to elastic {}'.format(ex))
                time.sleep(10)
            else:
                break

        print('{} Inserted {} items in {:.3f} seconds'.format(
            time.strftime('%X %x %Z'), success, time.time() - started))

        if failed:
            print('Failed to do bulk insert of items {}. Errors: {}'.format(len(failed), failed))
            raise BulkIndexError(resource=collection, errors=failed)

    print('Finished indexing collection {}'.format(collection))