def migrate_chunk(chunk, broken_output=None, dry_run=False):
    """Create records from a chunk of raw records and bulk-index them.

    The record-modification and citation signal receivers are disconnected
    while the chunk is processed and reconnected in the ``finally`` clause,
    where the database session is also closed.

    :param chunk: iterable of raw records; each one is handed to
        ``create_record``.
    :param broken_output: optional path of a file; every record whose
        processing raised an exception is appended to it.
    :param dry_run: forwarded unchanged to ``create_record``.
    """
    from flask_sqlalchemy import models_committed
    from invenio_records.receivers import record_modification
    from invenio_records.tasks.index import get_record_index
    from invenio.base.globals import cfg
    from elasticsearch.helpers import bulk as es_bulk
    from inspirehep.modules.citations.receivers import (
        catch_citations_insert,
        add_citation_count_on_insert_or_update,
        catch_citations_update
    )
    from invenio_records.signals import before_record_index, after_record_insert

    # Silence the per-record receivers for the duration of the migration.
    models_committed.disconnect(record_modification)
    after_record_insert.disconnect(catch_citations_insert)
    before_record_index.disconnect(add_citation_count_on_insert_or_update)
    before_record_index.disconnect(catch_citations_update)

    records_to_index = []
    try:
        for record in chunk:
            # Pre-set so the error log below works even if create_record fails.
            recid = json = None
            try:
                recid, json = create_record(record, force=True, dry_run=dry_run)
                index = get_record_index(json) or \
                    cfg['SEARCH_ELASTIC_DEFAULT_INDEX']
                before_record_index.send(recid, json=json, index=index)
                # Metadata keys consumed by the elasticsearch bulk helper.
                json.update({'_index': index, '_type': 'record',
                             '_id': recid, 'citation_count': 0})
                records_to_index.append(json)
            except Exception as err:
                # Best effort: log the failure and carry on with the chunk.
                logger.error("ERROR with record {} and json {}".format(recid, json))
                logger.exception(err)
                if broken_output:
                    # Context manager closes the handle; previously it leaked
                    # one open file per failed record.
                    with open(broken_output, "a") as broken_output_fd:
                        print(record, file=broken_output_fd)

        logger.info("Committing chunk")
        db.session.commit()
        logger.info("Sending chunk to elasticsearch")
        # NOTE(review): this bulk call also runs when dry_run is true --
        # confirm that is intended.
        es_bulk(es, records_to_index, request_timeout=60)
    finally:
        # Always restore the receivers and release the session.
        models_committed.connect(record_modification)
        after_record_insert.connect(catch_citations_insert)
        before_record_index.connect(add_citation_count_on_insert_or_update)
        before_record_index.connect(catch_citations_update)
        db.session.close()
def index_holdingpen_record(sender, **kwargs):
    """Push a Holding Pen object into its Elasticsearch index.

    Objects without a workflow, objects in their initial version and objects
    whose workflow is not found in the registry are skipped.
    """
    from invenio_ext.es import es
    from invenio_records.api import Record
    from invenio_records.signals import before_record_index
    from invenio_records.recordext.functions.get_record_collections import (
        get_record_collections,
    )
    from invenio_records.tasks.index import get_record_index
    from invenio_workflows.registry import workflows

    # Nothing to index when no workflow is attached yet, or for the
    # initial version of the object.
    if not sender.workflow or sender.version == ObjectVersion.INITIAL:
        return

    workflow_definition = workflows.get(sender.workflow.name)
    if not workflow_definition:
        current_app.logger.info(
            "Workflow {0} not found for sender: {1}".format(
                sender.workflow.name, sender.id
            )
        )
        return

    # Make sure the payload attributes are available on the object.
    if not hasattr(sender, 'data'):
        sender.data = sender.get_data()
    if not hasattr(sender, 'extra_data'):
        sender.extra_data = sender.get_extra_data()

    document = Record({})
    base_fields = (
        ("version", ObjectVersion.name_from_version(sender.version)),
        ("type", sender.data_type),
        ("status", sender.status),
        ("created", sender.created.isoformat()),
        ("modified", sender.modified.isoformat()),
        ("uri", sender.uri),
        ("id_workflow", sender.id_workflow),
        ("id_user", sender.id_user),
        ("id_parent", sender.id_parent),
        ("workflow", sender.workflow.name),
    )
    for field_name, field_value in base_fields:
        document[field_name] = field_value

    # Workflow-specific data is best effort: a failure is logged and the
    # remaining steps still run.
    for extractor_name in ('get_record', 'get_sort_data'):
        try:
            document.update(getattr(workflow_definition, extractor_name)(sender))
        except Exception as err:
            current_app.logger.exception(err)

    # The collections must be present before the index is resolved.
    document["_collections"] = get_record_collections(document)

    base_index = get_record_index(document)
    if not base_index:
        base_index = current_app.config["SEARCH_ELASTIC_DEFAULT_INDEX"]

    # Let any before_record_index receivers adjust the document.
    before_record_index.send(sender.id, json=document, index=base_index)

    if not base_index:
        return

    prefix = current_app.config['WORKFLOWS_HOLDING_PEN_ES_PREFIX']
    es.index(
        index=prefix + base_index,
        doc_type=current_app.config["WORKFLOWS_HOLDING_PEN_DOC_TYPE"],
        body=dict(document),
        id=sender.id
    )
def decorated(recid, *args, **kwargs):
    """Load the record for ``recid``, check access, then call the view ``f``.

    The record must exist in the database (``get_record``) and be retrievable
    from Elasticsearch; otherwise the request is aborted with 404. The record
    and its first matching collection are stored on ``g``. If
    ``check_user_can_view_record`` returns a truthy code, the message is
    flashed and the request is aborted with ``apache.HTTP_UNAUTHORIZED``.
    A record matching the deleted-field query redirects to its merged master
    record when one is set, and is a 404 otherwise.

    :param recid: record identifier; converted with ``int()``.
    :returns: the result of ``f(recid, *args, **kwargs)``, or a redirect
        response for a merged record.
    """
    from invenio_collections.models import Collection
    from .api import get_record
    from .access import check_user_can_view_record
    from invenio_records.tasks.index import get_record_index
    from invenio_records.api import Record
    from invenio.base.globals import cfg
    from invenio_ext.es import es
    from elasticsearch import TransportError

    # ensure recid to be integer
    recid = int(recid)

    # get record from db and the one from es
    db_record = get_record(recid)
    if db_record is None:
        abort(404)

    index = get_record_index(db_record) or \
        cfg['SEARCH_ELASTIC_DEFAULT_INDEX']
    try:
        es_record = es.get(index=index, doc_type='record', id=recid)
    except TransportError:
        abort(404)

    g.record = record = Record(data=es_record['_source'])
    g.collection = collection = Collection.query.filter(
        Collection.name.in_(record['_collections'])).first()

    (auth_code, auth_msg) = check_user_can_view_record(current_user, record)

    # only superadmins can use verbose parameter for obtaining debug
    # information
    if not current_user.is_super_admin and 'verbose' in kwargs:
        kwargs['verbose'] = 0

    if auth_code:
        flash(auth_msg, 'error')
        abort(apache.HTTP_UNAUTHORIZED)

    if Query(cfg['RECORDS_DELETED_FIELD_QUERY']).match(record):
        # Record is deleted. Check for referred master recid if merged or 404
        master_recid = record.get(cfg['RECORDS_MERGED_MASTER_RECID_KEY'])
        if master_recid:
            # ``__name__`` instead of the Python-2-only ``func_name``; it is
            # the same value and matches the signal name built below.
            return redirect(url_for('.' + f.__name__, recid=master_recid))
        abort(404)

    title = record.get(cfg.get('RECORDS_BREADCRUMB_TITLE_KEY'), '')
    tabs = []

    def _format_record(record, of='hd', user_info=current_user, *args,
                       **kwargs):
        from invenio_formatter import format_record
        return format_record(record, of, user_info=user_info, *args, **kwargs)

    @register_template_context_processor
    def record_context():
        # from invenio.modules.comments.api import get_mini_reviews
        return dict(recid=recid,
                    record=record,
                    tabs=tabs,
                    title=title,
                    get_mini_reviews=lambda *args, **kwargs: '',
                    # FIXME get_mini_reviews,
                    collection=collection,
                    format_record=_format_record
                    )

    pre_template_render.send(
        "%s.%s" % (blueprint.name, f.__name__),
        recid=recid,
    )
    return f(recid, *args, **kwargs)
def decorated(recid, *args, **kwargs):
    """Resolve ``recid`` to a record, enforce access and call the view ``f``.

    Aborts with 404 when the record is missing from the database or cannot
    be fetched from Elasticsearch, and with ``apache.HTTP_UNAUTHORIZED`` when
    the access check returns a truthy code. The record and its first
    matching collection are exposed on ``g`` and in the template context.
    """
    from invenio_collections.models import Collection
    from .api import get_record
    from .access import check_user_can_view_record
    from invenio_records.tasks.index import get_record_index
    from invenio_records.api import Record
    from invenio.base.globals import cfg
    from invenio_ext.es import es
    from elasticsearch import TransportError

    # The identifier is always handled as an integer from here on.
    recid = int(recid)

    # The database copy decides existence and which index to query.
    stored = get_record(recid)
    if stored is None:
        abort(404)

    es_index = get_record_index(stored)
    if not es_index:
        es_index = cfg['SEARCH_ELASTIC_DEFAULT_INDEX']

    try:
        hit = es.get(index=es_index, doc_type='record', id=recid)
    except TransportError:
        abort(404)

    record = Record(data=hit['_source'])
    g.record = record

    collection_query = Collection.query.filter(
        Collection.name.in_(record['_collections'])
    )
    collection = collection_query.first()
    g.collection = collection

    auth_code, auth_msg = check_user_can_view_record(current_user, record)

    # The verbose debugging parameter is reserved for superadmins; it is
    # reset to 0 for everyone else.
    if not current_user.is_super_admin and 'verbose' in kwargs:
        kwargs['verbose'] = 0

    if auth_code:
        flash(auth_msg, 'error')
        abort(apache.HTTP_UNAUTHORIZED)

    # TODO check record status (exists, merged, deleted)

    title_key = cfg.get('RECORDS_BREADCRUMB_TITLE_KEY')
    title = record.get(title_key, '')
    tabs = []

    def _format_record(record, of='hd', user_info=current_user, *args,
                       **kwargs):
        from invenio_formatter import format_record
        return format_record(record, of, user_info=user_info, *args, **kwargs)

    @register_template_context_processor
    def record_context():
        # from invenio.modules.comments.api import get_mini_reviews
        return dict(
            recid=recid,
            record=record,
            tabs=tabs,
            title=title,
            # FIXME get_mini_reviews,
            get_mini_reviews=lambda *args, **kwargs: '',
            collection=collection,
            format_record=_format_record,
        )

    signal_sender = "%s.%s" % (blueprint.name, f.__name__)
    pre_template_render.send(signal_sender, recid=recid)

    return f(recid, *args, **kwargs)
def migrate_chunk(chunk, broken_output=None, dry_run=False):
    """Migrate a chunk of raw MARC records and bulk-index the results.

    Each raw record is parsed with ``marc_create_record`` and its recid is
    read from field ``001``. Unless ``dry_run`` is set, an
    ``InspireProdRecords`` row is merged per record with its MARCXML,
    validation errors and success flag, and the created records are sent to
    Elasticsearch in one bulk request. Signal receivers are disconnected
    during the run and restored in the ``finally`` clause, where the session
    is also closed.

    :param chunk: iterable of raw records.
    :param broken_output: not used by this function body.
    :param dry_run: forwarded to ``create_record``; when true nothing is
        recorded in ``InspireProdRecords`` and no bulk request is made.
    """
    from flask_sqlalchemy import models_committed
    from invenio_records.receivers import record_modification
    from invenio_records.tasks.index import get_record_index
    from invenio.base.globals import cfg
    from elasticsearch.helpers import bulk as es_bulk
    from inspirehep.modules.citations.receivers import (
        catch_citations_insert,
        add_citation_count_on_insert_or_update,
        catch_citations_update
    )
    from invenio_records.signals import before_record_index, after_record_insert

    # Receivers are switched off while the chunk is processed.
    models_committed.disconnect(record_modification)
    after_record_insert.disconnect(catch_citations_insert)
    before_record_index.disconnect(add_citation_count_on_insert_or_update)
    before_record_index.disconnect(catch_citations_update)

    records_to_index = []
    try:
        for raw_record in chunk:
            record_json = None
            record = marc_create_record(raw_record, keep_singletons=False)
            recid = int(record['001'])

            if not dry_run:
                # Bookkeeping row describing the outcome for this record.
                prod_record = InspireProdRecords(recid=recid)
                prod_record.marcxml = raw_record

            try:
                # A failing record only rolls back its own nested transaction.
                with db.session.begin_nested():
                    errors, recid, record_json = create_record(
                        recid,
                        record,
                        force=True,
                        dry_run=dry_run,
                        validation=True
                    )

                    if dry_run:
                        continue

                    prod_record.valid = not errors
                    prod_record.errors = errors

                    es_index = get_record_index(record_json)
                    if not es_index:
                        es_index = cfg['SEARCH_ELASTIC_DEFAULT_INDEX']
                    before_record_index.send(
                        recid, json=record_json, index=es_index
                    )

                    bulk_metadata = {
                        '_index': es_index,
                        '_type': 'record',
                        '_id': recid,
                        'citation_count': 0,
                    }
                    record_json.update(bulk_metadata)
                    records_to_index.append(record_json)

                    prod_record.successful = True
                    db.session.merge(prod_record)
            except Exception as exc:
                logger.error(
                    "ERROR with record {} and json {}".format(recid, record_json)
                )
                logger.exception(exc)
                if not dry_run:
                    prod_record.successful = False
                    db.session.merge(prod_record)

        logger.info("Committing chunk")
        db.session.commit()
        logger.info("Sending chunk to elasticsearch")
        if not dry_run:
            es_bulk(es, records_to_index, request_timeout=60)
    finally:
        # Restore the receivers and release the session in every case.
        models_committed.connect(record_modification)
        after_record_insert.connect(catch_citations_insert)
        before_record_index.connect(add_citation_count_on_insert_or_update)
        before_record_index.connect(catch_citations_update)
        db.session.close()