def match_references(references):
    """Match references to their respective records in INSPIRE.

    Every reference is passed through ``match_reference`` together with the
    recid matched for the previous reference. The ``record.$ref`` value is
    read before and after matching to keep track of which links changed.

    Args:
        references (list): the list of references.

    Returns:
        dict: the match result, with the keys ``matched_references``,
        ``any_link_modified``, ``added_recids`` and ``removed_recids``.
    """
    matched, added, removed = [], [], []
    link_changed = False
    last_recid = None

    for original in references:
        ref_before = get_value(original, "record.$ref")
        matched_ref = match_reference(original, last_recid)
        ref_after = get_value(matched_ref, "record.$ref")

        if ref_before != ref_after:
            link_changed = True
            # The old link (if any) counts as removed, the new one as added.
            for url, bucket in ((ref_before, removed), (ref_after, added)):
                if url:
                    bucket.append(get_recid_from_ref({"$ref": url}))

        matched.append(matched_ref)
        if "record" in matched_ref:
            last_recid = get_recid_from_ref(matched_ref["record"])

    return {
        "matched_references": matched,
        "any_link_modified": link_changed,
        "added_recids": added,
        "removed_recids": removed,
    }
def get_conference_record(record, default=None):
    """Return the first Conference record associated with a record.

    Queries the database to fetch the first Conference record referenced
    in the ``publication_info`` of the record.

    Args:
        record(InspireRecord): a record.
        default: value to be returned if no conference record is
            present in the record or found by the lookup.

    Returns:
        InspireRecord: the first Conference record associated with the
        record, or ``default``.

    Examples:
        >>> record = {
        ...     'publication_info': [
        ...         {
        ...             'conference_record': {
        ...                 '$ref': '/api/conferences/972464',
        ...             },
        ...         },
        ...     ],
        ... }
        >>> conference_record = get_conference_record(record)
        >>> conference_record['control_number']
        972464

    """
    conference_ref = get_value(record, 'publication_info.conference_record[0]')
    if not conference_ref:
        return default

    conferences = get_db_records([('con', get_recid_from_ref(conference_ref))])
    # An empty lookup result must fall back to ``default`` instead of
    # raising IndexError, as promised by the documented contract.
    return next(iter(conferences), default)
def record_insert_or_replace(json, skip_files=False):
    """Insert a new record or replace the content of an existing one.

    The record is looked up through the PID built from the ``$schema`` and
    ``control_number`` of ``json``. An existing record is cleared, updated
    and committed; otherwise a new record is created and its recid minted.
    A record flagged as ``deleted`` that has no resolvable ``new_record``
    is deleted.

    Args:
        json (dict): the record metadata.
        skip_files (bool): forwarded to the record ``update``/``create``.

    Returns:
        InspireRecord: the updated or newly created record.
    """
    def _apply_legacy_creation_date(target):
        # Keep the creation date coming from legacy, when provided.
        legacy_date = json.get('legacy_creation_date')
        if legacy_date:
            target.model.created = datetime.strptime(legacy_date, '%Y-%m-%d')

    pid_type = get_pid_type_from_schema(json['$schema'])
    control_number = json['control_number']

    try:
        pid = PersistentIdentifier.get(pid_type, control_number)
        record = InspireRecord.get_record(pid.object_uuid)
        record.clear()
        record.update(json, skip_files=skip_files)
        _apply_legacy_creation_date(record)
        record.commit()
    except PIDDoesNotExistError:
        record = InspireRecord.create(json, id_=None, skip_files=skip_files)
        _apply_legacy_creation_date(record)
        inspire_recid_minter(str(record.id), json)

    if json.get('deleted') and not get_recid_from_ref(json.get('new_record')):
        record.delete()

    return record
def merge_merged_records():
    """Redirect the PIDs of deleted records to the record they merged into.

    For every record returned by ``get_merged_records`` each recid listed
    in ``deleted_records`` is looked up; a PID that does not exist yet is
    created and registered first. Every such PID is then redirected to the
    PID of the merged record. The session is committed once at the end.
    """
    def _get_or_create_pid(recid, schema):
        pid = PersistentIdentifier.query.filter_by(
            pid_value=str(recid)
        ).one_or_none()
        if not pid:
            pid = PersistentIdentifier.create(
                pid_type=get_pid_type_from_schema(schema),
                pid_value=recid,
                object_type='rec'
            )
            pid.register()
            db.session.add(pid)
        return pid

    for merged in get_merged_records():
        target_pid = PersistentIdentifier.query.filter_by(
            object_uuid=merged.id
        ).one()
        recids = [get_recid_from_ref(ref) for ref in merged['deleted_records']]

        for recid in recids:
            _get_or_create_pid(recid, merged['$schema']).redirect(target_pid)

    db.session.commit()
def create_or_update(cls, data, **kwargs):
    """Create or update a record.

    Looks for a record registered with the same ``control_number`` and
    ``pid_type`` as ``data``. When one exists it is cleared and updated
    with ``data``, otherwise a new record is created.

    Keyword Args:
        files_src_records(List[InspireRecord]): if passed, it will try to
            get the files for the documents and figures from the first
            record in the list that has them in its files iterator before
            downloading them, for example to merge existing records.
        skip_files(bool): if ``True`` it will skip the files retrieval
            described above. Note also that, if not passed, it will fall
            back to the value of the ``RECORDS_SKIP_FILES`` configuration
            variable.

    Examples:
        >>> record = {
        ...     '$schema': 'hep.json',
        ... }
        >>> record = InspireRecord.create_or_update(record)
        >>> record.commit()
    """
    def _apply_legacy_creation_date(target):
        legacy_date = data.get('legacy_creation_date')
        if legacy_date:
            target.model.created = datetime.strptime(legacy_date, '%Y-%m-%d')

    pid_type = get_pid_type_from_schema(data['$schema'])
    control_number = data.get('control_number')
    src_records = kwargs.pop('files_src_records', [])
    skip_files = kwargs.pop(
        'skip_files', current_app.config.get('RECORDS_SKIP_FILES'))

    try:
        pid = PersistentIdentifier.get(pid_type, control_number)
        record = super(InspireRecord, cls).get_record(pid.object_uuid)
        record.clear()
        record.update(data, skip_files=skip_files, **kwargs)
        _apply_legacy_creation_date(record)
    except PIDDoesNotExistError:
        record = cls.create(data, skip_files=skip_files, **kwargs)
        _apply_legacy_creation_date(record)

    # A deleted record that does not point to a new record is removed.
    if data.get('deleted') and not get_recid_from_ref(data.get('new_record')):
        record.delete()

    if not skip_files:
        record.download_documents_and_figures(src_records=src_records)

    return record
def get_authors(record):
    """Return the authors of a record with their names split.

    The affiliations of each author are reduced to those whose recid has
    a truthy entry in the map returned by ``_get_hal_id_map``.

    Args:
        record: a record.

    Returns:
        list(dict): one dict per author, with the keys ``affiliations``
        (a list of ``{'hal_id': ...}`` dicts), ``first_name`` and
        ``last_name``.
    """
    hal_id_map = _get_hal_id_map(record)

    authors = []
    for author in record.get('authors', []):
        first_name, last_name = _split_full_name(author['full_name'])
        hal_ids = (
            hal_id_map.get(get_recid_from_ref(affiliation.get('record')))
            for affiliation in author.get('affiliations', [])
        )
        authors.append({
            'affiliations': [{'hal_id': hal_id} for hal_id in hal_ids if hal_id],
            'first_name': first_name,
            'last_name': last_name,
        })

    return authors
def merge_merged_records():
    """Merge all records that were marked as merged.

    The PID of every recid found in the ``deleted_records`` of a merged
    record is redirected to the PID of that merged record; missing PIDs
    are created and registered on the fly. A single commit is issued after
    all records have been processed.
    """
    pid_query = PersistentIdentifier.query

    for merged_record in get_merged_records():
        merged_pid = pid_query.filter_by(object_uuid=merged_record.id).one()
        old_recids = [
            get_recid_from_ref(old_ref)
            for old_ref in merged_record['deleted_records']
        ]

        for old_recid in old_recids:
            old_pid = pid_query.filter_by(pid_value=str(old_recid)).one_or_none()

            if not old_pid:
                old_pid = PersistentIdentifier.create(
                    pid_type=get_pid_type_from_schema(merged_record['$schema']),
                    pid_value=old_recid,
                    object_type='rec')
                old_pid.register()
                db.session.add(old_pid)

            old_pid.redirect(merged_pid)

    db.session.commit()
def get_orcids_for_push(record):
    """Obtain the ORCIDs associated to the authors of a Literature record.

    ORCIDs present in the ``ids`` of an author are used directly. For an
    author without them, whose ``curated_relation`` is ``True`` and which
    links to a ``record``, the ORCIDs are read from the linked Author
    record instead.

    Args:
        record(dict): metadata from a Literature record

    Returns:
        Iterator[str]: all ORCIDs associated to these authors
    """
    direct_orcids = []
    claimed_author_recids = []

    for author in record.get('authors', []):
        found = get_values_for_schema(author.get('ids', []), 'ORCID')
        if found:
            direct_orcids.extend(found)
            continue
        if author.get('curated_relation') is True and 'record' in author:
            claimed_author_recids.append(get_recid_from_ref(author['record']))

    claimed_authors = get_db_records(
        ('aut', recid) for recid in claimed_author_recids
    )
    claimed_orcids = chain.from_iterable(
        get_values_for_schema(claimed.get('ids', []), 'ORCID')
        for claimed in claimed_authors
    )

    return chain(direct_orcids, claimed_orcids)
def match_reference_control_numbers_with_relaxed_journal_titles(reference):
    """Match a reference and return the matched ``control_number`` values.

    A reference with a truthy ``curated_relation`` is not matched again:
    the recid of its ``record`` is returned as is. Otherwise the reference
    is matched against every config built with relaxed title matching.

    Args:
        reference (dict): the metadata of a reference.

    Returns:
        list: at most five matched recids, or ``None`` when a curated
        reference raises ``KeyError`` while reading its ``record``.
    """
    if reference.get("curated_relation"):
        try:
            curated_recid = get_recid_from_ref(reference["record"])
        except KeyError:
            return None
        return [curated_recid]

    found = set()
    for config in match_reference_config(reference, use_relaxed_titles_matching=True):
        found.update(
            hit["_source"]["control_number"] for hit in match(reference, config)
        )

    return list(found)[:5]
def get_orcids_for_push(record):
    """Obtain the ORCIDs associated to the authors of a Literature record.

    The ORCIDs are looked up both in the ``ids`` of the ``authors`` and in
    the Author records that have claimed the paper (authors without own
    ORCIDs, with ``curated_relation`` set to ``True`` and a ``record``).

    Args:
        record(dict): metadata from a Literature record

    Returns:
        Iterator[str]: all ORCIDs associated to these authors
    """
    orcids_from_literature = []
    claiming_recids = []

    for author in record.get("authors", []):
        own_orcids = get_values_for_schema(author.get("ids", []), "ORCID")
        if own_orcids:
            orcids_from_literature.extend(own_orcids)
            continue
        has_claim = author.get("curated_relation") is True and "record" in author
        if has_claim:
            claiming_recids.append(get_recid_from_ref(author["record"]))

    claiming_authors = AuthorsRecord.get_records_by_pids(
        ("aut", str(recid)) for recid in claiming_recids)
    orcids_from_authors = chain.from_iterable(
        get_values_for_schema(claiming.get("ids", []), "ORCID")
        for claiming in claiming_authors)

    return chain(orcids_from_literature, orcids_from_authors)
def record_insert_or_replace(json, skip_files=False):
    """Insert or replace a record.

    An existing record (found through the PID made of the schema PID type
    and the ``control_number``) is cleared, updated and committed. When no
    such PID exists a record is created and its recid is minted. In both
    cases a ``legacy_creation_date`` overrides the model creation date.

    Args:
        json (dict): the record metadata.
        skip_files (bool): forwarded to the record ``update``/``create``.

    Returns:
        InspireRecord: the resulting record.
    """
    date_format = '%Y-%m-%d'
    pid_type = get_pid_type_from_schema(json['$schema'])
    control_number = json['control_number']

    try:
        pid = PersistentIdentifier.get(pid_type, control_number)
        record = InspireRecord.get_record(pid.object_uuid)
        record.clear()
        record.update(json, skip_files=skip_files)
        created_on = json.get('legacy_creation_date')
        if created_on:
            record.model.created = datetime.strptime(created_on, date_format)
        record.commit()
    except PIDDoesNotExistError:
        record = InspireRecord.create(json, id_=None, skip_files=skip_files)
        created_on = json.get('legacy_creation_date')
        if created_on:
            record.model.created = datetime.strptime(created_on, date_format)
        inspire_recid_minter(str(record.id), json)

    if json.get('deleted'):
        # Only records that were not merged into another one are deleted.
        replaced_by = get_recid_from_ref(json.get('new_record'))
        if not replaced_by:
            record.delete()

    return record
def generate_entries_for_table(self):
    """Build the ``StudentsAdvisors`` entries for the advisors of this record.

    Advisors without a ``record`` are logged and skipped. For the others
    the ``object_uuid`` of the ``aut`` PID matching the advisor recid is
    queried and used as ``advisor_id``.

    Returns:
        list: the ``StudentsAdvisors`` instances, one per linked advisor.
    """
    student_uuid = self.id
    entries = []

    for advisor in self.get_value("advisors", []):
        if "record" not in advisor:
            LOGGER.info(
                f"Skipping creating entries in "
                f"{StudentsAdvisors.__tablename__} table. Advisor record is missing",
                recid=self.get("control_number"),
                uuid=str(self.id),
            )
            continue

        recid = get_recid_from_ref(advisor["record"])
        uuid_query = PersistentIdentifier.query.with_entities(
            PersistentIdentifier.object_uuid
        )
        advisor_uuid = uuid_query.filter_by(
            pid_value=str(recid), pid_type="aut"
        ).scalar()

        entries.append(
            StudentsAdvisors(
                advisor_id=advisor_uuid,
                student_id=student_uuid,
                degree_type=advisor.get("degree_type"),
            )
        )

    return entries
def create_or_update(cls, data, **kwargs):
    """Create or update a record.

    It checks whether a record is registered with the same
    ``control_number`` and ``pid_type``. If so, that record is cleared and
    updated with ``data``; otherwise a new one is created.

    Keyword Args:
        files_src_records(List[InspireRecord]): if passed, it will try to
            get the files for the documents and figures from the first
            record in the list that has them in its files iterator before
            downloading them, for example to merge existing records.
        skip_files(bool): if ``True`` it will skip the files retrieval
            described above. Note also that, if not passed, it will fall
            back to the value of the ``RECORDS_SKIP_FILES`` configuration
            variable.

    Examples:
        >>> record = {
        ...     '$schema': 'hep.json',
        ... }
        >>> record = InspireRecord.create_or_update(record)
        >>> record.commit()
    """
    date_format = '%Y-%m-%d'
    pid_type = get_pid_type_from_schema(data['$schema'])
    control_number = data.get('control_number')
    source_records = kwargs.pop('files_src_records', [])
    skip_files = kwargs.pop(
        'skip_files', current_app.config.get('RECORDS_SKIP_FILES'))

    try:
        pid = PersistentIdentifier.get(pid_type, control_number)
        record = super(InspireRecord, cls).get_record(pid.object_uuid)
        record.clear()
        record.update(data, skip_files=skip_files, **kwargs)
        created_on = data.get('legacy_creation_date')
        if created_on:
            record.model.created = datetime.strptime(created_on, date_format)
    except PIDDoesNotExistError:
        record = cls.create(data, skip_files=skip_files, **kwargs)
        created_on = data.get('legacy_creation_date')
        if created_on:
            record.model.created = datetime.strptime(created_on, date_format)

    if data.get('deleted'):
        replaced_by = get_recid_from_ref(data.get('new_record'))
        if not replaced_by:
            record.delete()

    if not skip_files:
        record.download_documents_and_figures(
            src_records=source_records,
        )

    return record
def get_resolved_record_or_experiment(self, experiment_records_map, experiment):
    """Return the resolved experiment record, falling back to ``experiment``.

    The recid of the ``record`` of ``experiment`` is looked up in
    ``experiment_records_map``. A resolved record lacking ``legacy_name``
    gets the one of ``experiment`` copied into it (in place).
    """
    recid = get_recid_from_ref(experiment.get("record"))
    resolved = experiment_records_map.get(recid)
    if not resolved:
        return experiment

    if "legacy_name" not in resolved:
        resolved["legacy_name"] = experiment.get("legacy_name")
    return resolved
def _get_hal_id_map(record):
    """Map the recid of each affiliated institution to its HAL id.

    The institution records are fetched with ``get_db_records`` using the
    recids of all ``authors.affiliations.record`` references, and the
    value for each one is computed by ``_get_hal_id``.
    """
    refs_per_author = get_value(record, 'authors.affiliations.record', default=[])
    pids = [
        ('ins', get_recid_from_ref(ref))
        for ref in chain.from_iterable(refs_per_author)
    ]

    hal_ids = {}
    for institution in get_db_records(pids):
        hal_ids[institution['control_number']] = _get_hal_id(institution)
    return hal_ids
def _get_hal_id_map(record):
    """Map the recid of each affiliated institution to its HAL id.

    The institution records are fetched with
    ``InspireRecord.get_records_by_pids`` using the (stringified) recids of
    all ``authors.affiliations.record`` references; the value for each one
    is computed by ``_get_hal_id``.
    """
    refs_per_author = record.get_value("authors.affiliations.record", default=[])
    pids = [
        ("ins", str(get_recid_from_ref(ref)))
        for ref in chain.from_iterable(refs_per_author)
    ]

    hal_ids = {}
    for institution in InspireRecord.get_records_by_pids(pids):
        hal_ids[institution["control_number"]] = _get_hal_id(institution)
    return hal_ids
def _get_hal_id_map(record):
    """Map the recid of each affiliated institution to its HAL id.

    The institutions are fetched with ``get_es_records``; a
    ``RequestError`` raised by that lookup results in an empty map.
    """
    refs_per_author = get_value(record, 'authors.affiliations.record', default=[])
    recids = [
        get_recid_from_ref(ref) for ref in chain.from_iterable(refs_per_author)
    ]

    try:
        institutions = get_es_records('ins', recids)
    except RequestError:
        institutions = []

    hal_ids = {}
    for institution in institutions:
        hal_ids[institution['control_number']] = _get_hal_id(institution)
    return hal_ids
def get_resolved_record_or_experiment(self, experiment_records_map, experiment):
    """Return the resolved experiment record, falling back to ``experiment``.

    The recid of the ``record`` reference of ``experiment`` is looked up
    in ``experiment_records_map``. A resolved record is completed in place
    with ``legacy_name`` (taken from ``experiment``) and ``self`` (the
    ``record`` reference) when those keys are missing.
    """
    reference = experiment.get("record")
    resolved = experiment_records_map.get(get_recid_from_ref(reference))
    if not resolved:
        return experiment

    fallbacks = (
        ("legacy_name", experiment.get("legacy_name")),
        ("self", reference),
    )
    for key, fallback in fallbacks:
        if key not in resolved:
            resolved[key] = fallback
    return resolved
def _get_hal_id_map(record):
    """Build a ``{institution recid: HAL id}`` map for the record's affiliations.

    Recids are extracted from every ``authors.affiliations.record``
    reference and the institutions are read through ``get_es_records``.
    If that call raises ``RequestError`` no institution is considered.
    """
    nested_refs = get_value(record, 'authors.affiliations.record', default=[])
    institution_recids = []
    for ref in chain.from_iterable(nested_refs):
        institution_recids.append(get_recid_from_ref(ref))

    try:
        found = get_es_records('ins', institution_recids)
    except RequestError:
        found = []

    return {
        institution['control_number']: _get_hal_id(institution)
        for institution in found
    }
def get_authors(record):
    """Return the authors of a record.

    Queries the Institution records linked from the authors affiliations
    to add, whenever it exists, the HAL identifier of the institution to
    the affiliation. Affiliations without a HAL identifier are dropped.

    Args:
        record(InspireRecord): a record.

    Returns:
        list(dict): the authors of the record, each with the keys
        ``affiliations`` (a list of ``{'hal_id': ...}`` dicts),
        ``first_name`` and ``last_name``.

    Raises:
        KeyError: if an author has no ``full_name``.

    Examples:
        >>> record = {
        ...     'authors': [
        ...         {
        ...             'full_name': 'Doe, John',
        ...             'affiliations': [
        ...                 {
        ...                     'record': {
        ...                         '$ref': 'http://localhost:5000/api/institutions/902725',
        ...                     },
        ...                 },
        ...             ],
        ...         },
        ...     ],
        ... }
        >>> authors = get_authors(record)
        >>> authors[0]['affiliations'][0]['hal_id']
        '300037'

    """
    hal_id_map = _get_hal_id_map(record)

    result = []
    for author in record.get('authors', []):
        parsed_name = ParsedName.loads(author['full_name'])

        affiliations = []
        for affiliation in author.get('affiliations', []):
            recid = get_recid_from_ref(affiliation.get('record'))
            # A single lookup covers both "unknown recid" and "no HAL id".
            hal_id = hal_id_map.get(recid)
            if hal_id:
                affiliations.append({'hal_id': hal_id})

        result.append({
            'affiliations': affiliations,
            'first_name': parsed_name.first,
            'last_name': parsed_name.last,
        })

    return result
def _get_lit_authors_names_recids_dict(authors, last_names_only=False):
    """Map the recid of every linked author to its name.

    Authors without a (truthy) ``record`` are ignored. With
    ``last_names_only`` the name is cut at the first comma of
    ``full_name``; otherwise the whole ``full_name`` is used.
    """
    names_by_recid = {}
    for author in authors:
        record_ref = author.get("record")
        if not record_ref:
            continue

        name = author["full_name"]
        if last_names_only:
            name = name.split(",")[0]
        names_by_recid[get_recid_from_ref(record_ref)] = name

    return names_by_recid
def _recursive_find_refs(json_root): if isinstance(json_root, list): items = enumerate(json_root) elif isinstance(json_root, dict): # Note that items have to be generated before altering the dict. # In this case, iteritems might break during iteration. items = json_root.items() else: items = [] for key, value in items: if (isinstance(json_root, dict) and isinstance(value, dict) and '$ref' in value): # Append '_recid' and remove 'record' from the key name. key_basename = key.replace('record', '').rstrip('_') new_key = '{}_recid'.format(key_basename).lstrip('_') json_root[new_key] = get_recid_from_ref(value) elif (isinstance(json_root, dict) and isinstance(value, list) and key in list_ref_fields_translations): new_list = [get_recid_from_ref(v) for v in value] new_key = list_ref_fields_translations[key] json_root[new_key] = new_list else: _recursive_find_refs(value)
def match_references(references):
    """Match references to their respective records in INSPIRE.

    Each reference is matched with ``match_reference``, which also
    receives the recid of the most recent matched reference that carried
    a ``record``.

    Args:
        references (list): the list of references.

    Returns:
        list: the matched references.
    """
    matched = []
    last_recid = None

    for reference in references:
        matched_ref = match_reference(reference, last_recid)
        matched.append(matched_ref)
        if 'record' in matched_ref:
            last_recid = get_recid_from_ref(matched_ref['record'])

    return matched
def _find_matching_author_in_lit_record(author_parsed_name, lit_recid):
    """Find the author of a literature record matching a parsed name.

    Runs the ES query generated by ``author_parsed_name`` (with inner hits
    enabled) restricted to the literature record ``lit_recid``.

    Args:
        author_parsed_name: object providing ``generate_es_query()``.
        lit_recid: control number of the literature record to search in.

    Returns:
        The recid extracted from the ``record`` of the matched author, or
        ``None`` unless exactly one literature hit with exactly one
        matching author is found.
    """
    author_name_query = author_parsed_name.generate_es_query()
    author_name_query["nested"]["inner_hits"] = {}
    query = {
        "bool": {
            "must": [
                author_name_query,
                {"match": {"control_number": lit_recid}},
            ]
        }
    }
    hits = LiteratureSearch().query(query).execute()

    # Check the number of hits before indexing: ``hits[0]`` on an empty
    # response would raise IndexError.
    if len(hits) != 1:
        return None

    # The ``authors`` inner hits may be absent; treat that as no match.
    authors_matched = hits[0].meta["inner_hits"].to_dict().get("authors") or []
    if len(authors_matched) != 1:
        return None

    author_record = authors_matched[0]["record"].to_dict()
    return get_recid_from_ref(author_record)
def merge_merged_records():
    """Merge every record marked as merged into its ``new_record``.

    For each record returned by ``get_records_to_merge`` the record
    referenced by ``new_record`` is loaded, merged into the former and the
    result committed, all inside a nested transaction. The session is
    committed afterwards.
    """
    def _load_by_recid(recid):
        # The PID type is not known upfront, so it is read from the PID.
        pid = PersistentIdentifier.query.filter_by(pid_value=str(recid)).one()
        return get_db_record(pid.pid_type, recid)

    to_merge = get_records_to_merge()

    with db.session.begin_nested():
        for merged in to_merge:
            recid = merged['control_number']
            other_recid = get_recid_from_ref(merged['new_record'])
            other = _load_by_recid(other_recid)
            logger.info('Merged records: %d and %d', recid, other_recid)
            merged.merge(other)
            merged.commit()

    db.session.commit()
def test_merge_record_with_non_existing_pid(api_client, merged_records):
    """``merge_merged_records`` recreates the PID of a deleted record."""
    deleted_recid = 222

    def _lookup_pid(recid):
        return PersistentIdentifier.query.filter_by(
            pid_value=str(recid)).one_or_none()

    merged_record = get_db_record('lit', 111)
    assert get_recid_from_ref(merged_record['deleted_records'][0]) == deleted_recid

    # Drop the PID of the deleted record so it does not exist anymore.
    db.session.delete(_lookup_pid(deleted_recid))
    assert _lookup_pid(deleted_recid) is None

    merge_merged_records()

    # A new PID is created for the non-existing deleted record.
    assert _lookup_pid(deleted_recid) is not None
def record_insert_or_replace(json):
    """Insert or replace a record.

    The control number is read from ``control_number``, falling back to
    ``recid``. Without one nothing is done. Otherwise an existing record
    is cleared, updated and committed, or a new record is created and its
    persistent identifier minted. A ``deleted`` record without a
    resolvable ``new_record`` is deleted.

    Returns:
        The resulting record, or ``None`` when there is no control number.
    """
    control_number = json.get('control_number', json.get('recid'))
    if not control_number:
        return None

    pid_type = get_pid_type_from_schema(json['$schema'])
    try:
        pid = PersistentIdentifier.get(pid_type, control_number)
        record = InspireRecord.get_record(pid.object_uuid)
        record.clear()
        record.update(json)
        record.commit()
    except PIDDoesNotExistError:
        record = InspireRecord.create(json, id_=None)
        # Create persistent identifier.
        inspire_recid_minter(str(record.id), json)

    if json.get('deleted') and not get_recid_from_ref(json.get('new_record')):
        record.delete()

    return record
def test_get_recid_from_ref_returns_none_on_empty_object():
    """An empty dict has no ``$ref`` to extract a recid from."""
    empty_ref = {}

    recid = get_recid_from_ref(empty_ref)

    assert recid is None
def get_resolved_record_or_experiment(self, experiment_records_map, experiment):
    """Return the record resolved for ``experiment``, or ``experiment`` itself.

    The lookup key is the recid of the ``record`` reference of
    ``experiment``; a missing or falsy entry in ``experiment_records_map``
    makes the original ``experiment`` be returned.
    """
    recid = get_recid_from_ref(experiment.get('record'))
    resolved = experiment_records_map.get(recid)
    if resolved:
        return resolved
    return experiment
def test_assign_without_to_author(inspire_app):
    """Assigning papers without a target author creates a stub author.

    Both literature records must end up linked, with a curated relation,
    to the stub author returned by the endpoint.
    """
    cataloger = create_user(role="cataloger")
    from_author = create_record("aut")
    from_author_ref = {
        "$ref": f"http://localhost:5000/api/authors/{from_author['control_number']}"
    }

    literature1 = create_record(
        "lit",
        data={
            "authors": [
                {
                    "curated_relation": False,
                    "full_name": "Urhan, Harun",
                    "record": dict(from_author_ref),
                }
            ]
        },
    )
    literature2 = create_record(
        "lit",
        data={
            "authors": [
                {
                    "curated_relation": False,
                    "full_name": "Urhan, H",
                    "record": dict(from_author_ref),
                }
            ]
        },
    )

    with inspire_app.test_client() as client:
        login_user_via_session(client, email=cataloger.email)
        response = client.post(
            "/assign/author",
            data=orjson.dumps(
                {
                    "literature_recids": [
                        literature1["control_number"],
                        literature2["control_number"],
                    ],
                    "from_author_recid": from_author["control_number"],
                }
            ),
            content_type="application/json",
        )
        response_status_code = response.status_code

    assert response_status_code == 200
    stub_author_id = response.json["stub_author_id"]

    literature1_after = LiteratureRecord.get_record_by_pid_value(
        literature1["control_number"]
    )
    literature1_author = literature1_after["authors"][0]
    literature1_author_recid = get_recid_from_ref(literature1_author["record"])
    assert literature1_author_recid != from_author["control_number"]
    assert literature1_author_recid == stub_author_id
    assert literature1_author["curated_relation"] is True

    # Load the SECOND record here: reading literature1 again would leave
    # literature2 unverified.
    literature2_after = LiteratureRecord.get_record_by_pid_value(
        literature2["control_number"]
    )
    literature2_author = literature2_after["authors"][0]
    literature2_author_recid = get_recid_from_ref(literature2_author["record"])
    assert literature2_author_recid != from_author["control_number"]
    assert literature2_author_recid == stub_author_id
    assert literature2_author["curated_relation"] is True

    author = AuthorsRecord.get_record_by_pid_value(stub_author_id)
    assert author["stub"] is True
    assert author["name"] == {
        "value": "Urhan, Harun",
        "name_variants": ["Urhan, H"],
    }
def get_author_by_recid(literature_record, author_recid):
    """Return the first author of the record linked to ``author_recid``.

    Raises:
        StopIteration: if no author of ``literature_record`` has a
            ``record`` reference resolving to ``author_recid``.
    """
    def _is_linked_to_recid(author):
        return get_recid_from_ref(author.get("record")) == author_recid

    matching_authors = filter(_is_linked_to_recid, literature_record.get("authors"))
    return next(matching_authors)
def get_recid(self, data):
    """Return the recid of the ``record`` of ``data``, or ``missing``.

    ``missing`` is defined outside this block; presumably it is the
    serializer's "no value" sentinel -- confirm against the imports.
    """
    # FIXME: missing from everywhere
    if "record" not in data:
        return missing
    return get_recid_from_ref(data["record"])
def test_get_recid_from_ref_returns_none_on_ref_malformed():
    """A ``$ref`` URL without a recid yields ``None``."""
    malformed_ref = {'$ref': 'http://bad_url'}

    recid = get_recid_from_ref(malformed_ref)

    assert recid is None
def test_get_recid_from_ref_returns_none_on_ref_a_simple_string():
    """A ``$ref`` holding a plain string (not a URL) yields ``None``."""
    plain_string_ref = {'$ref': 'a_string'}

    recid = get_recid_from_ref(plain_string_ref)

    assert recid is None
def test_get_recid_from_ref_returns_none_on_simple_strings():
    """Passing a bare string instead of a reference object yields ``None``."""
    not_a_ref = 'a_string'

    recid = get_recid_from_ref(not_a_ref)

    assert recid is None
def test_get_recid_from_ref_returns_none_on_object_with_wrong_key():
    """A dict without the ``$ref`` key yields ``None``."""
    wrong_key_ref = {'bad_key': 'some_val'}

    recid = get_recid_from_ref(wrong_key_ref)

    assert recid is None
def get_reference_record_id(self, data):
    """Return the recid extracted from the ``record`` reference of ``data``."""
    record_ref = data.get('record')
    return get_recid_from_ref(record_ref)