def reindex(session, es, request):
    """Reindex all annotations into a new index, and update the alias.

    Creates a fresh index, records its name in the settings service so
    concurrent writers can dual-write, bulk-indexes every annotation into
    it (retrying failures once), then atomically points the alias at the
    new index. The temporary setting is always cleaned up.

    :param session: the database session to read annotations from
    :param es: the Elasticsearch client
    :param request: the Pyramid request (provides services and the
        transaction manager)
    :raises RuntimeError: if the current index is not accessed via an alias
        (in which case an atomic swap is impossible)
    """
    if get_aliased_index(es) is None:
        raise RuntimeError('cannot reindex if current index is not aliased')

    settings = request.find_service(name='settings')

    new_index = configure_index(es)

    try:
        settings.put(SETTING_NEW_INDEX, new_index)
        # Commit so other workers see the new-index setting immediately.
        request.tm.commit()

        indexer = BatchIndexer(session, es, request,
                               target_index=new_index, op_type='create')

        errored = indexer.index()
        if errored:
            # Transient failures are common during a bulk reindex; retry
            # the failed batch once before giving up.
            # Use lazy %-style args (not str.format) so the message is only
            # built when the log level is enabled.
            log.debug('failed to index %d annotations, retrying...',
                      len(errored))
            errored = indexer.index(errored)
            if errored:
                # log.warn is deprecated; use log.warning.
                log.warning('failed to index %d annotations: %r',
                            len(errored), errored)

        update_aliased_index(es, new_index)
    finally:
        # Always remove the temporary setting, even if indexing failed.
        settings.delete(SETTING_NEW_INDEX)
        request.tm.commit()
def reindex_user_annotations(userid):
    """Re-index all of one user's annotations into Elasticsearch.

    :param userid: the userid whose annotations should be re-indexed
    """
    request = celery.request
    id_query = request.db.query(models.Annotation.id).filter_by(userid=userid)
    annotation_ids = [row.id for row in id_query]

    failed = BatchIndexer(request.db, request.es, request).index(annotation_ids)
    if failed:
        log.warning('Failed to re-index annotations %s', failed)
def reindex_annotations_in_date_range(start_date, end_date, max_annotations=250000):
    """Re-index annotations from Postgres to Elasticsearch in a date range.

    :param start_date: Begin at this time (greater or equal)
    :param end_date: End at this time (less than or equal)
    :param max_annotations: Maximum number of items to process overall
    """
    log.info(f"Re-indexing from {start_date} to {end_date}...")

    db = celery.request.db
    matching_ids = (
        db.query(Annotation.id)
        .filter(Annotation.updated >= start_date)
        .filter(Annotation.updated <= end_date)
        .limit(max_annotations)
    )

    indexer = BatchIndexer(db, celery.request.es, celery.request)
    failed = indexer.index(row.id for row in matching_ids)
    if failed:
        log.warning("Failed to re-index annotations into ES6 %s", failed)

    log.info("Re-index from %s to %s complete.", start_date, end_date)
def move_uri(ctx, old, new):
    """
    Move annotations and document equivalence data from one URL to another.

    This will **replace** the annotation's ``target_uri`` and all the
    document uri's ``claimant``, plus the matching ``uri`` for self-claim
    and canonical uris.
    """
    request = ctx.obj["bootstrap"]()

    annotations = _fetch_annotations(request.db, old)
    claimant_rows = _fetch_document_uri_claimants(request.db, old)
    uri_rows = _fetch_document_uri_canonical_self_claim(request.db, old)

    # Confirm before touching anything: this is a bulk, destructive update.
    prompt = (
        f"Changing all annotations and document data matching:\n"
        f'"{old}"\nto:\n"{new}"\n'
        f"This will affect {len(annotations)} annotations, "
        f"{len(claimant_rows)} document uri claimants, and "
        f"{len(uri_rows)} document uri self-claims or canonical uris.\n"
        "Are you sure? [y/N]"
    )
    answer = click.prompt(prompt, default="n", show_default=False)
    if answer != "y":
        print("Aborted")
        return

    for ann in annotations:
        ann.target_uri = new
    for row in claimant_rows:
        row.claimant = new
    for row in uri_rows:
        row.uri = new

    if annotations:
        indexer = BatchIndexer(request.db, request.es, request)
        indexer.index([ann.id for ann in annotations])

    request.db.flush()

    # Moving URIs may leave several Document rows pointing at the same new
    # URI; merge them into one.
    documents = models.Document.find_by_uris(request.db, [new])
    if documents.count() > 1:
        merge_documents(request.db, documents)

    request.tm.commit()
def move_uri(ctx, old, new):
    """
    Move annotations and document equivalence data from one URL to another.

    This will **replace** the annotation's ``target_uri`` and all the
    document uri's ``claimant``, plus the matching ``uri`` for self-claim
    and canonical uris.
    """
    request = ctx.obj["bootstrap"]()

    annotations = _fetch_annotations(request.db, old)
    claimant_docuris = _fetch_document_uri_claimants(request.db, old)
    canonical_docuris = _fetch_document_uri_canonical_self_claim(request.db, old)

    # Ask for explicit confirmation before the bulk rewrite.
    template = (
        "Changing all annotations and document data matching:\n"
        '"{old}"\nto:\n"{new}"\n'
        "This will affect {ann_count} annotations, {doc_claimant} "
        "document uri claimants, and {doc_uri} document uri self-claims "
        "or canonical uris.\n"
        "Are you sure? [y/N]"
    )
    confirmation = click.prompt(
        template.format(
            old=old,
            new=new,
            ann_count=len(annotations),
            doc_claimant=len(claimant_docuris),
            doc_uri=len(canonical_docuris),
        ),
        default="n",
        show_default=False,
    )
    if confirmation != "y":
        print("Aborted")
        return

    for record in annotations:
        record.target_uri = new
    for record in claimant_docuris:
        record.claimant = new
    for record in canonical_docuris:
        record.uri = new

    if annotations:
        indexer = BatchIndexer(request.db, request.es, request)
        indexer.index([record.id for record in annotations])

    request.db.flush()

    # After the rewrite several Document rows may share the new URI;
    # collapse them into a single document.
    matching_documents = models.Document.find_by_uris(request.db, [new])
    if matching_documents.count() > 1:
        merge_documents(request.db, matching_documents)

    request.tm.commit()
def reindex(session, es, request):
    """Reindex all annotations into a new index, and update the alias.

    Creates a fresh index, records its name in the settings service,
    bulk-indexes every annotation into it (retrying failures once), then
    atomically swaps the alias to the new index and deletes the old one.
    The temporary setting is always cleaned up.

    :param session: the database session to read annotations from
    :param es: the Elasticsearch client
    :param request: the Pyramid request (provides services and the
        transaction manager)
    :raises RuntimeError: if the current index is not accessed via an alias
    """
    current_index = get_aliased_index(es)
    if current_index is None:
        raise RuntimeError('cannot reindex if current index is not aliased')

    settings = request.find_service(name='settings')

    # Preload userids of shadowbanned users.
    nipsa_svc = request.find_service(name='nipsa')
    nipsa_svc.fetch_all_flagged_userids()

    new_index = configure_index(es)
    log.info('configured new index %s', new_index)

    # NOTE: the previous `if es.version < (2,)` check assigned the same
    # value in both branches, so it was dead code and has been removed.
    setting_name = 'reindex.new_index'

    try:
        settings.put(setting_name, new_index)
        # Commit so other workers see the new-index setting immediately.
        request.tm.commit()

        log.info('reindexing annotations into new index %s', new_index)
        indexer = BatchIndexer(session, es, request,
                               target_index=new_index, op_type='create')

        errored = indexer.index()
        if errored:
            # Retry failed batches once before reporting them.
            log.debug('failed to index %d annotations, retrying...',
                      len(errored))
            errored = indexer.index(errored)
            if errored:
                # log.warn is deprecated; use log.warning.
                log.warning('failed to index %d annotations: %r',
                            len(errored), errored)

        log.info('making new index %s current', new_index)
        update_aliased_index(es, new_index)

        log.info('removing previous index %s', current_index)
        delete_index(es, current_index)
    finally:
        # Always remove the temporary setting, even if indexing failed.
        settings.delete(setting_name)
        request.tm.commit()
def test_it_does_not_error_if_annotations_already_indexed(
    self, db_session, es_client, factories, pyramid_request
):
    """index() reports real failures but ignores "already exists" errors.

    Only the annotation with a genuine error should be returned as
    errored; the "document already exists" response is treated as success.
    """
    annotations = factories.Annotation.create_batch(3)
    expected_errored_ids = {annotations[1].id}

    # Patch streaming_bulk inside a context manager so the real function
    # is restored after this test. The previous bare assignment
    # (`elasticsearch.helpers.streaming_bulk = mock.Mock()`) leaked the
    # mock into every subsequent test in the process.
    with mock.patch.object(
        elasticsearch.helpers, "streaming_bulk"
    ) as streaming_bulk:
        streaming_bulk.return_value = [
            (True, {}),
            (
                False,
                {"create": {"error": "some error", "_id": annotations[1].id}},
            ),
            (
                False,
                {
                    "create": {
                        "error": "document already exists",
                        "_id": annotations[2].id,
                    }
                },
            ),
        ]

        errored = BatchIndexer(
            db_session, es_client, pyramid_request, es_client.index, "create"
        ).index()

    assert errored == expected_errored_ids
def reindex(session, es, request):
    """Reindex all annotations into a new index, and update the alias.

    Creates a fresh index, records its name in the settings service,
    bulk-indexes every annotation into it (retrying failures once), then
    atomically swaps the alias to the new index and deletes the old one.
    The temporary setting is always cleaned up.

    :param session: the database session to read annotations from
    :param es: the Elasticsearch client
    :param request: the Pyramid request (provides services and the
        transaction manager)
    :raises RuntimeError: if the current index is not accessed via an alias
    """
    current_index = get_aliased_index(es)
    if current_index is None:
        raise RuntimeError("cannot reindex if current index is not aliased")

    settings = request.find_service(name="settings")

    # Preload userids of shadowbanned users.
    nipsa_svc = request.find_service(name="nipsa")
    nipsa_svc.fetch_all_flagged_userids()

    new_index = configure_index(es)
    # Use lazy %-style log args (not str.format) so messages are only
    # built when the log level is enabled — consistent with the other
    # reindex implementation in this file.
    log.info("configured new index %s", new_index)

    setting_name = "reindex.new_index"
    try:
        settings.put(setting_name, new_index)
        # Commit so other workers see the new-index setting immediately.
        request.tm.commit()

        log.info("reindexing annotations into new index %s", new_index)
        indexer = BatchIndexer(
            session, es, request, target_index=new_index, op_type="create"
        )

        errored = indexer.index()
        if errored:
            # Retry failed batches once before reporting them.
            log.debug("failed to index %d annotations, retrying...", len(errored))
            errored = indexer.index(errored)
            if errored:
                log.warning(
                    "failed to index %d annotations: %r", len(errored), errored
                )

        log.info("making new index %s current", new_index)
        update_aliased_index(es, new_index)

        log.info("removing previous index %s", current_index)
        delete_index(es, current_index)
    finally:
        # Always remove the temporary setting, even if indexing failed.
        settings.delete(setting_name)
        request.tm.commit()
def test_it_accepts_different_indexes(self, target_index, es_client):
    """BatchIndexer targets `target_index` when given, otherwise the
    client's default index."""
    indexer = BatchIndexer(
        session=sentinel.db,
        es_client=es_client,
        request=sentinel.request,
        target_index=target_index,
    )

    # Bug fix: the original assertion parsed as
    # `(indexer._target_index == target_index) if target_index else es_client.index`
    # because `==` binds tighter than the conditional expression — so with
    # a falsy target_index it only checked that es_client.index was truthy.
    # Parenthesizing the conditional makes both cases compare equality.
    assert indexer._target_index == (
        target_index if target_index else es_client.index
    )
def reindex(session, es, request):
    """Reindex all annotations into a new index, and update the alias."""
    old_index = get_aliased_index(es)
    if old_index is None:
        raise RuntimeError("cannot reindex if current index is not aliased")

    settings_service = request.find_service(name="settings")

    # Preload userids of shadowbanned users.
    request.find_service(name="nipsa").fetch_all_flagged_userids()

    fresh_index = configure_index(es)
    log.info("configured new index %s", fresh_index)

    setting_name = "reindex.new_index"
    try:
        settings_service.put(setting_name, fresh_index)
        request.tm.commit()

        log.info("reindexing annotations into new index %s", fresh_index)
        batch_indexer = BatchIndexer(
            session, es, request, target_index=fresh_index, op_type="create"
        )

        # Index everything once, then retry any failures a single time.
        failed = batch_indexer.index()
        if failed:
            log.debug("failed to index %d annotations, retrying...", len(failed))
            failed = batch_indexer.index(failed)
            if failed:
                log.warning("failed to index %d annotations: %r", len(failed), failed)

        log.info("making new index %s current", fresh_index)
        update_aliased_index(es, fresh_index)

        log.info("removing previous index %s", old_index)
        delete_index(es, old_index)
    finally:
        # Always clean up the temporary setting, success or failure.
        settings_service.delete(setting_name)
        request.tm.commit()
def factory(_context, request):
    """Create a SearchIndexService."""
    db = request.db
    es = request.es

    annotation_queue = Queue(
        db=db,
        es=es,
        batch_indexer=BatchIndexer(db, es, request),
    )

    return SearchIndexService(
        request=request,
        es_client=es,
        session=db,
        settings=request.find_service(name="settings"),
        queue=annotation_queue,
    )
def batch_indexer(db_session, es_client, pyramid_request, moderation_service):
    # Fixture returning a BatchIndexer wired to the test DB session, ES
    # client and request.
    #
    # `moderation_service` is not used directly: requesting the fixture
    # pulls it in for its side effect — presumably registering the service
    # with the test app (TODO: confirm against the fixture's definition).
    return BatchIndexer(db_session, es_client, pyramid_request)
def batch_indexer(  # pylint:disable=unused-argument
    db_session, es_client, pyramid_request, moderation_service
):
    """Return a BatchIndexer wired to the test DB, ES client and request.

    ``moderation_service`` is requested only for its fixture side effect,
    hence the unused-argument suppression.
    """
    indexer = BatchIndexer(db_session, es_client, pyramid_request)
    return indexer