def migrate_chunk(chunk, broken_output=None, dry_run=False):
    """Migrate a chunk of raw records and bulk-index them in Elasticsearch.

    Per-record signal receivers (citation counting, modification tracking)
    are disconnected for the duration of the chunk so the bulk migration is
    not slowed down by per-record side effects, and are always reconnected
    in the ``finally`` block, even when the chunk fails.

    :param chunk: iterable of raw records to migrate.
    :param broken_output: optional path of a file to which records that
        fail to migrate are appended, one per line.
    :param dry_run: forwarded to ``create_record``; note that the session
        commit and the Elasticsearch bulk call still run regardless.
    """
    from flask_sqlalchemy import models_committed
    from invenio_records.receivers import record_modification
    from invenio_records.tasks.index import get_record_index
    from invenio.base.globals import cfg
    from elasticsearch.helpers import bulk as es_bulk
    from inspirehep.modules.citations.receivers import (
        catch_citations_insert,
        add_citation_count_on_insert_or_update,
        catch_citations_update
    )
    from invenio_records.signals import before_record_index, after_record_insert

    models_committed.disconnect(record_modification)
    after_record_insert.disconnect(catch_citations_insert)
    before_record_index.disconnect(add_citation_count_on_insert_or_update)
    before_record_index.disconnect(catch_citations_update)

    records_to_index = []
    try:
        for record in chunk:
            recid = json = None
            try:
                recid, json = create_record(record, force=True,
                                             dry_run=dry_run)
                index = get_record_index(json) or \
                    cfg['SEARCH_ELASTIC_DEFAULT_INDEX']
                before_record_index.send(recid, json=json, index=index)
                json.update({'_index': index, '_type': 'record',
                             '_id': recid, 'citation_count': 0})
                records_to_index.append(json)
            except Exception as err:
                logger.error("ERROR with record {} and json {}".format(recid, json))
                logger.exception(err)
                if broken_output:
                    # Context manager so the descriptor is not leaked on
                    # every failing record (the original never closed it).
                    with open(broken_output, "a") as broken_output_fd:
                        print(record, file=broken_output_fd)
        logger.info("Committing chunk")
        db.session.commit()
        logger.info("Sending chunk to elasticsearch")
        es_bulk(es, records_to_index, request_timeout=60)
    finally:
        # Restore the receivers unconditionally so later, non-migration
        # inserts behave normally again.
        models_committed.connect(record_modification)
        after_record_insert.connect(catch_citations_insert)
        before_record_index.connect(add_citation_count_on_insert_or_update)
        before_record_index.connect(catch_citations_update)
        db.session.close()
def signals():
    """Fixture connecting record signals and counting how often each fires.

    Yields a dict mapping a signal name to the number of times it was
    sent. All listeners are disconnected again on teardown.

    Note: the original version created, connected and disconnected the
    ``before_record_insert`` listener twice (copy-paste duplication); the
    duplicates are removed so each signal is counted exactly once.
    """
    called = {}

    def _listener(signal_name, sender, *args, **kwargs):
        # Lazily initialise the per-signal counter on first delivery.
        if signal_name not in called:
            called[signal_name] = 0
        called[signal_name] += 1

    after_record_delete_listener = partial(_listener, 'after_record_delete')
    after_record_insert_listener = partial(_listener, 'after_record_insert')
    after_record_revert_listener = partial(_listener, 'after_record_revert')
    after_record_update_listener = partial(_listener, 'after_record_update')
    before_record_delete_listener = partial(_listener, 'before_record_delete')
    before_record_insert_listener = partial(_listener, 'before_record_insert')
    before_record_revert_listener = partial(_listener, 'before_record_revert')
    before_record_update_listener = partial(_listener, 'before_record_update')

    after_record_delete.connect(after_record_delete_listener)
    after_record_insert.connect(after_record_insert_listener)
    after_record_revert.connect(after_record_revert_listener)
    after_record_update.connect(after_record_update_listener)
    before_record_delete.connect(before_record_delete_listener)
    before_record_insert.connect(before_record_insert_listener)
    before_record_revert.connect(before_record_revert_listener)
    before_record_update.connect(before_record_update_listener)

    yield called

    after_record_delete.disconnect(after_record_delete_listener)
    after_record_insert.disconnect(after_record_insert_listener)
    after_record_revert.disconnect(after_record_revert_listener)
    after_record_update.disconnect(after_record_update_listener)
    before_record_delete.disconnect(before_record_delete_listener)
    before_record_insert.disconnect(before_record_insert_listener)
    before_record_revert.disconnect(before_record_revert_listener)
    before_record_update.disconnect(before_record_update_listener)
def create_author(profile):
    """Create a new author profile based on a given signature.

    The method receives a dictionary representing an author. Based on the
    values, it creates a dictionary in the invenio_records format. After
    all the fields are processed, the method creates the new record.

    :param profile:
        A signature representing an author's to be created as a profile.

        Example:
            profile = {u'affiliations': [{u'value': u'Yerevan Phys. Inst.'}],
                       u'alternative_name': None,
                       u'curated_relation': False,
                       u'email': None,
                       u'full_name': u'Chatrchyan, Serguei',
                       u'inspire_id': None,
                       u'orcid': None,
                       u'profile': u'',
                       u'recid': None,
                       u'role': None,
                       u'uuid': u'd63537a8-1df4-4436-b5ed-224da5b5028c'}

    :return:
        A recid, where the new profile can be accessed.

        Example:
            "1234"
    """
    name = profile.get('full_name')

    # Template of an initial record.
    record = {'collections': [{'primary': 'HEPNAMES'}],
              'name': {'value': name},
              '$schema': _get_author_schema()}

    # The author's email address.
    # Unfortunately the method will not correlate a given e-mail address
    # with an affiliation.
    if 'email' in profile:
        email = profile.get('email')
        record['positions'] = []
        record['positions'].append({'email': email})

    # The author can be a member of more than one affiliation.
    if 'affiliations' in profile:
        affiliations = profile.get('affiliations')
        if 'positions' not in record:
            record['positions'] = []
        for affiliation in affiliations:
            name = affiliation.get('value')
            recid = affiliation.get('recid', None)
            if recid:
                record['positions'].append(
                    {'institution': {'name': name, 'recid': recid}})
            else:
                record['positions'].append(
                    {'institution': {'name': name}})

    # FIXME: The method should also collect the useful data
    #        from the publication, like category field, subject,
    #        etc.

    # Disconnect the signal on insert of a new record; use try/finally so
    # a failure during creation cannot leave it disconnected process-wide.
    after_record_insert.disconnect(append_new_record_to_queue)
    try:
        # Create a new author profile.
        record = InspireRecord.create(record, id_=None)

        # Create Inspire recid.
        record_pid = inspire_recid_minter(record.id, record)

        # Extend the new record with Inspire recid and self key.
        record['control_number'] = record_pid.pid_value
        record['self'] = inspire_dojson_utils.get_record_ref(
            record_pid.pid_value, 'authors')

        # Apply the changes.
        record.commit()
        db.session.commit()
    finally:
        # Reconnect the disconnected signal.
        after_record_insert.connect(append_new_record_to_queue)

    # Report.
    logger.info("Created profile: %s", record_pid.pid_value)

    # Return the recid of new profile to which signatures will point to.
    return record_pid.pid_value
def create_author(profile):
    """Create a new author profile based on a given signature.

    The method receives a dictionary representing an author. Based on the
    values, it creates a dictionary in the invenio_records format. After
    all the fields are processed, the method creates and indexes the new
    record.

    :param profile:
        A signature representing an author's to be created as a profile.

        Example:
            profile = {u'affiliations': [{u'value': u'Yerevan Phys. Inst.'}],
                       u'alternative_name': None,
                       u'curated_relation': False,
                       u'email': None,
                       u'full_name': u'Chatrchyan, Serguei',
                       u'inspire_id': None,
                       u'orcid': None,
                       u'profile': u'',
                       u'recid': None,
                       u'role': None,
                       u'uuid': u'd63537a8-1df4-4436-b5ed-224da5b5028c'}

    :return:
        A recid, where the new profile can be accessed.

        Example:
            "1234"
    """
    name = profile.get('full_name')

    # Template of an initial record.
    record = {'collections': [{'primary': 'HEPNAMES'}],
              'name': {'value': name},
              '$schema': _get_author_schema()}

    # The author's email address.
    # Unfortunately the method will not correlate a given e-mail address
    # with an affiliation.
    if 'email' in profile:
        email = profile.get('email')
        record['positions'] = []
        record['positions'].append({'email': email})

    # The author can be a member of more than one affiliation.
    if 'affiliations' in profile:
        affiliations = profile.get('affiliations')
        if 'positions' not in record:
            record['positions'] = []
        for affiliation in affiliations:
            name = affiliation.get('value')
            recid = affiliation.get('recid', None)
            if recid:
                record['positions'].append(
                    {'institution': {'name': name, 'recid': recid}})
            else:
                record['positions'].append(
                    {'institution': {'name': name}})

    # FIXME: The method should also collect the useful data
    #        from the publication, like category field, subject,
    #        etc.

    # Disconnect the signal on insert of a new record; use try/finally so
    # a failure during creation cannot leave it disconnected process-wide.
    after_record_insert.disconnect(append_new_record_to_queue)
    try:
        # Create a new author profile.
        record = Record.create(record, id_=None)

        # Create Inspire recid.
        record_pid = inspire_recid_minter(record.id, record)

        # Extend the new record with Inspire recid and self key.
        record['control_number'] = record_pid.pid_value
        record['self'] = inspire_dojson_utils.get_record_ref(
            record_pid.pid_value, 'authors')

        # Apply the changes.
        record.commit()
        db.session.commit()

        # Add the record to Elasticsearch.
        indexer = RecordIndexer()
        indexer.index_by_id(record_pid.object_uuid)
    finally:
        # Reconnect the disconnected signal.
        after_record_insert.connect(append_new_record_to_queue)

    # Report.
    logger.info("Created profile: %s", record_pid.pid_value)

    # Return the recid of new profile to which signatures will point to.
    return record_pid.pid_value
def migrate_chunk(chunk, broken_output=None, dry_run=False):
    """Migrate a chunk of MARCXML records and bulk-index them.

    Per-record signal receivers (citation counting, modification tracking)
    are disconnected for the duration of the chunk and always reconnected
    in the ``finally`` block, even when the chunk fails.

    :param chunk: iterable of raw MARCXML record strings to migrate.
    :param broken_output: optional path of a file to which raw records
        that fail to migrate are appended, one per line. (The original
        accepted this parameter but never used it.)
    :param dry_run: when True, records are validated but nothing is
        persisted to the database or sent to Elasticsearch.
    """
    from flask_sqlalchemy import models_committed
    from invenio_records.receivers import record_modification
    from invenio_records.tasks.index import get_record_index
    from invenio.base.globals import cfg
    from elasticsearch.helpers import bulk as es_bulk
    from inspirehep.modules.citations.receivers import (
        catch_citations_insert,
        add_citation_count_on_insert_or_update,
        catch_citations_update
    )
    from invenio_records.signals import before_record_index, after_record_insert

    models_committed.disconnect(record_modification)
    after_record_insert.disconnect(catch_citations_insert)
    before_record_index.disconnect(add_citation_count_on_insert_or_update)
    before_record_index.disconnect(catch_citations_update)

    records_to_index = []
    try:
        for raw_record in chunk:
            json = None
            record = marc_create_record(raw_record, keep_singletons=False)
            recid = int(record['001'])
            if not dry_run:
                prod_record = InspireProdRecords(recid=recid)
                prod_record.marcxml = raw_record
            try:
                with db.session.begin_nested():
                    errors, recid, json = create_record(
                        recid, record, force=True,
                        dry_run=dry_run, validation=True
                    )
                    if dry_run:
                        continue
                    prod_record.valid = not errors
                    prod_record.errors = errors
                    index = get_record_index(json) or \
                        cfg['SEARCH_ELASTIC_DEFAULT_INDEX']
                    before_record_index.send(recid, json=json, index=index)
                    json.update({'_index': index, '_type': 'record',
                                 '_id': recid, 'citation_count': 0})
                    records_to_index.append(json)
                    prod_record.successful = True
                    db.session.merge(prod_record)
            except Exception as err:
                logger.error("ERROR with record {} and json {}".format(recid, json))
                logger.exception(err)
                if broken_output:
                    # Persist the failing raw record for later inspection;
                    # the parameter was previously accepted but ignored.
                    with open(broken_output, "a") as broken_output_fd:
                        print(raw_record, file=broken_output_fd)
                if not dry_run:
                    prod_record.successful = False
                    db.session.merge(prod_record)
        logger.info("Committing chunk")
        db.session.commit()
        logger.info("Sending chunk to elasticsearch")
        if not dry_run:
            es_bulk(es, records_to_index, request_timeout=60)
    finally:
        # Restore the receivers unconditionally so later, non-migration
        # inserts behave normally again.
        models_committed.connect(record_modification)
        after_record_insert.connect(catch_citations_insert)
        before_record_index.connect(add_citation_count_on_insert_or_update)
        before_record_index.connect(catch_citations_update)
        db.session.close()