def curation_ticket_context(user, obj):
    recid = obj.extra_data.get('recid')
    record_url = obj.extra_data.get('url')

    arxiv_ids = get_value(obj.data, 'arxiv_eprints.value') or []
    for index, arxiv_id in enumerate(arxiv_ids):
        if arxiv_id and is_arxiv_post_2007(arxiv_id):
            arxiv_ids[index] = 'arXiv:{0}'.format(arxiv_id)

    report_numbers = get_value(obj.data, 'report_numbers.value') or []
    dois = [
        "doi:{0}".format(doi)
        for doi in get_value(obj.data, 'dois.value') or []
    ]
    link_to_pdf = obj.extra_data.get('formdata', {}).get('url')

    subject = ' '.join(filter(
        lambda x: x is not None,
        arxiv_ids + dois + report_numbers + ['(#{0})'.format(recid)]
    ))

    references = obj.extra_data.get('formdata', {}).get('references')
    user_comment = obj.extra_data.get('formdata', {}).get('extra_comments', '')

    return dict(
        recid=recid,
        record_url=record_url,
        link_to_pdf=link_to_pdf,
        email=user.email if user else '',
        references=references,
        user_comment=user_comment,
        subject=subject,
    )

def get_name(self, item):
    institution = get_value(item, 'institutions[0].value')
    accelerator = get_value(item, 'accelerator.value')
    experiment = get_value(item, 'experiment.value')
    if institution and accelerator and experiment:
        return u'{}-{}-{}'.format(institution, accelerator, experiment)
    return item.get('legacy_name')

def populate_affiliation_suggest(record):
    """Populate the ``affiliation_suggest`` field of Institution records."""
    ICN = record.get('ICN', [])
    institution_acronyms = get_value(record, 'institution_hierarchy.acronym', default=[])
    institution_names = get_value(record, 'institution_hierarchy.name', default=[])
    legacy_ICN = record.get('legacy_ICN', '')
    name_variants = force_list(get_value(record, 'name_variants.value', default=[]))
    postal_codes = force_list(get_value(record, 'addresses.postal_code', default=[]))

    # XXX: this is needed by the curators to search with only the numbers
    extract_numbers_from_umr = []
    for name in name_variants:
        match = re.match(r'UMR\s', name, re.IGNORECASE)
        if match:
            umr_number = name.replace(match.group(0), '')
            extract_numbers_from_umr.append(umr_number)

    input_values = []
    input_values.extend(ICN)
    input_values.extend(institution_acronyms)
    input_values.extend(institution_names)
    input_values.append(legacy_ICN)
    input_values.extend(name_variants)
    input_values.extend(postal_codes)
    input_values.extend(extract_numbers_from_umr)
    input_values = [el for el in input_values if el]

    record['affiliation_suggest'] = {
        'input': input_values,
    }

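# A minimal usage sketch for ``populate_affiliation_suggest`` above; the record
# is invented for illustration. Note how '8607' is pulled out of the
# 'UMR 8607' name variant so curators can search by the number alone.
def _demo_populate_affiliation_suggest():
    record = {
        'ICN': ['Orsay, LAL'],
        'legacy_ICN': 'LAL, Orsay',
        'name_variants': [{'value': 'UMR 8607'}],
        'addresses': [{'postal_code': '91898'}],
    }
    populate_affiliation_suggest(record)
    assert record['affiliation_suggest'] == {
        'input': ['Orsay, LAL', 'LAL, Orsay', 'UMR 8607', '91898', '8607'],
    }
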
def populate_bookautocomplete(sender, json, *args, **kwargs):
    """Populate the ``bookautocomplete`` field of Literature records."""
    if not is_hep(json):
        return

    if 'book' not in json.get('document_type', []):
        return

    paths = [
        'imprints.date',
        'imprints.publisher',
        'isbns.value',
    ]

    authors = force_list(get_value(json, 'authors.full_name', default=[]))
    titles = force_list(get_value(json, 'titles.title', default=[]))

    input_values = list(chain.from_iterable(
        force_list(get_value(json, path, default=[])) for path in paths))
    input_values.extend(authors)
    input_values.extend(titles)
    input_values = [el for el in input_values if el]

    ref = get_value(json, 'self.$ref')

    json.update({
        'bookautocomplete': {
            'input': input_values,
            'payload': {
                'authors': authors,
                'id': ref,
                'title': titles,
            },
        },
    })

def populate_affiliation_suggest(sender, json, *args, **kwargs):
    """Populate the ``affiliation_suggest`` field of Institution records."""
    if 'institutions.json' not in json.get('$schema'):
        return

    ICN = json.get('ICN', [])
    institution_acronyms = get_value(json, 'institution_hierarchy.acronym', default=[])
    institution_names = get_value(json, 'institution_hierarchy.name', default=[])
    legacy_ICN = json.get('legacy_ICN', '')
    name_variants = force_list(get_value(json, 'name_variants.value', default=[]))
    postal_codes = force_list(get_value(json, 'addresses.postal_code', default=[]))

    input_values = []
    input_values.extend(ICN)
    input_values.extend(institution_acronyms)
    input_values.extend(institution_names)
    input_values.append(legacy_ICN)
    input_values.extend(name_variants)
    input_values.extend(postal_codes)
    input_values = [el for el in input_values if el]

    json.update({
        'affiliation_suggest': {
            'input': input_values,
            'output': legacy_ICN,
            'payload': {
                '$ref': get_value(json, 'self.$ref'),
                'ICN': ICN,
                'institution_acronyms': institution_acronyms,
                'institution_names': institution_names,
                'legacy_ICN': legacy_ICN,
            },
        },
    })

def check_unlinked_references():
    """Return two lists of unlinked references that have a DOI or an arXiv id.

    If a reference has a DOI or an arXiv id, it is stored in the corresponding
    data structure. Once all the data is read, it is ordered from most
    relevant to least relevant.
    """
    result_doi, result_arxiv = defaultdict(lambda: (0, 0)), defaultdict(lambda: (0, 0))
    linked_ids = defaultdict(lambda: (0, 0))

    data = get_all_unlinked_references()
    for reference in data:
        dois = get_value(reference, 'reference.reference.dois', [])
        arxiv_id = get_value(reference, 'reference.reference.arxiv_eprint')
        if arxiv_id and len(dois) > 0:
            for doi in dois:
                increase_cited_count(linked_ids, (doi, arxiv_id), reference["core"])
        for doi in dois:
            increase_cited_count(result_doi, doi, reference["core"])
        if arxiv_id:
            increase_cited_count(result_arxiv, arxiv_id, reference["core"])

    add_linked_ids(result_doi, result_arxiv, linked_ids)

    result_doi = order_dictionary_into_list(result_doi)
    result_arxiv = order_dictionary_into_list(result_arxiv)

    return result_doi, result_arxiv

def prepare_magpie_payload(record, corpus):
    """Prepare payload to send to the Magpie API."""
    payload = dict(text="", corpus=corpus)
    titles = filter(None, get_value(record, "titles.title", []))
    abstracts = filter(None, get_value(record, "abstracts.value", []))
    # Note: relies on Python 2 semantics, where ``filter`` returns a list
    # that can be concatenated with ``+``.
    payload["text"] = ". ".join(
        [part.encode('utf-8') for part in titles + abstracts])
    return payload

def get_address(data, doc_type):
    conference = get_conference_record(data, default={})
    pubinfo_city = get_value(conference, 'address[0].cities[0]')
    pubinfo_country_code = get_value(conference, 'address[0].country_code')

    if pubinfo_city and pubinfo_country_code:
        return pubinfo_city + ', ' + get_country_name_by_code(
            pubinfo_country_code, default=pubinfo_country_code)
    return get_value(data, 'imprints[0].place')

def title_translation(self):
    """Translated title.

    Returns:
        Tuple[string, string]: translated title and the language code of the
        translation, if available.
    """
    title = get_value(self.record, 'title_translations[0].title')
    language_code = get_value(self.record, 'title_translations[0].language')
    if title and language_code:
        return title, language_code

def is_arxiv_paper(obj, *args, **kwargs):
    """Check if the record is from arXiv."""
    method = get_value(obj.data, 'acquisition_source.method')
    source = get_value(obj.data, 'acquisition_source.source', default='')

    is_submission_with_arxiv = method == 'submitter' and 'arxiv_eprints' in obj.data
    is_harvested_from_arxiv = method == 'hepcrawl' and source.lower() == 'arxiv'

    return is_submission_with_arxiv or is_harvested_from_arxiv

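# A quick sketch of the two paths above (the workflow object is stubbed out;
# the data is invented for illustration):
def _demo_is_arxiv_paper():
    class StubSubmission(object):
        data = {
            'arxiv_eprints': [{'value': '1612.08928'}],
            'acquisition_source': {'method': 'submitter'},
        }

    class StubHarvested(object):
        data = {'acquisition_source': {'method': 'hepcrawl', 'source': 'arXiv'}}

    assert is_arxiv_paper(StubSubmission())  # submission carrying arxiv_eprints
    assert is_arxiv_paper(StubHarvested())   # harvested from arXiv
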
def year_validator(form, field):
    """Validate that the field contains a year in an acceptable range."""
    hep = load_schema('hep')
    min_year = get_value(hep, 'properties.publication_info.items.properties.year.minimum')
    max_year = get_value(hep, 'properties.publication_info.items.properties.year.maximum')
    message = 'Please, provide a year between {} and {}.'.format(min_year, max_year)

    if field.data and not min_year <= int(field.data) <= max_year:
        raise StopValidation(message)

def get_dois(self, data):
    dois = data.get('dois', None)
    control_number = data.get('control_number')
    if dois and not control_number:
        data['dois'] = force_list(
            {'value': get_value(data, 'dois[0]', default=missing)})
    elif dois:
        data['dois'] = force_list(
            {'value': get_value(data, 'dois[0].value', default=missing)})
    return data.get('dois', missing)

def _parse_json_on_failure(self, failure):
    """Parse a JSON article entry."""
    original_response = failure.request.meta['original_response']
    record = HEPLoader(item=HEPRecord(), response=original_response)
    article = failure.request.meta['json_article']

    doi = get_value(article, 'identifiers.doi', default='')
    record.add_dois(dois_values=[doi])
    record.add_value('page_nr', str(article.get('numPages', '')))

    record.add_value('abstract', get_value(article, 'abstract.value', default=''))
    record.add_value('title', get_value(article, 'title.value', default=''))
    # record.add_value('subtitle', '')

    authors, collaborations = self._get_authors_and_collab(article)
    record.add_value('authors', authors)
    record.add_value('collaborations', collaborations)

    # record.add_value('free_keywords', free_keywords)
    # record.add_value('classification_numbers', classification_numbers)

    record.add_value('journal_title', get_value(article, 'journal.abbreviatedName', default=''))
    record.add_value('journal_issue', get_value(article, 'issue.number', default=''))
    record.add_value('journal_volume', get_value(article, 'volume.number', default=''))
    # record.add_value('journal_artid', )

    published_date = article.get('date', '')
    record.add_value('journal_year', int(published_date[:4]))
    record.add_value('date_published', published_date)
    record.add_value('copyright_holder', get_value(article, 'rights.copyrightHolders.name[0]', default=''))
    record.add_value('copyright_year', str(get_value(article, 'rights.copyrightYear', default='')))
    record.add_value('copyright_statement', get_value(article, 'rights.rightsStatement', default=''))
    record.add_value('copyright_material', 'publication')

    license = get_licenses(
        license_url=get_value(article, 'rights.licenses.url[0]', default='')
    )
    record.add_value('license', license)

    record.add_value('collections', ['HEP', 'Citeable', 'Published'])
    return ParsedItem(
        record=record.load_item(),
        record_format='hepcrawl',
    )

def get_arxiv_id(record):
    """Return the first arXiv identifier of a record.

    Args:
        record (InspireRecord): a record.

    Returns:
        str: the first arXiv identifier of the record.

    Examples:
        >>> record = {
        ...     'arxiv_eprints': [
        ...         {
        ...             'categories': [
        ...                 'hep-th',
        ...                 'hep-ph',
        ...             ],
        ...             'value': '1612.08928',
        ...         },
        ...     ],
        ... }
        >>> get_arxiv_id(record)
        '1612.08928'

    """
    return get_value(record, 'arxiv_eprints.value[0]', default='')

def get_arxiv_categories(record):
    """Return all the arXiv categories of a record.

    Args:
        record (InspireRecord): a record.

    Returns:
        list(str): all the arXiv categories of the record.

    Examples:
        >>> record = {
        ...     'arxiv_eprints': [
        ...         {
        ...             'categories': [
        ...                 'hep-th',
        ...                 'hep-ph',
        ...             ],
        ...             'value': '1612.08928',
        ...         },
        ...     ],
        ... }
        >>> get_arxiv_categories(record)
        ['hep-th', 'hep-ph']

    """
    return list(chain.from_iterable(
        get_value(record, 'arxiv_eprints.categories', default=[])))

def populate_authors_name_variations(record):
    """Generate name variations for an Author record."""
    author_name = get_value(record, 'name.value')

    if author_name:
        name_variations = generate_name_variations(author_name)
        record['name_variations'] = name_variations

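# A hedged sketch of the helper above; ``generate_name_variations`` comes from
# inspire_utils.name, and the exact variations it emits are not reproduced here.
def _demo_populate_authors_name_variations():
    record = {'name': {'value': 'Smith, John'}}
    populate_authors_name_variations(record)
    # record['name_variations'] now holds whatever variations
    # generate_name_variations('Smith, John') produced.
    return record
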
def bibtex_document_type(doc_type, obj):
    """Return the BibTeX entry type.

    Maps the INSPIRE ``document_type`` to a BibTeX entry type. Also checks
    ``thesis_info.degree_type`` in case it's a thesis, as it stores the
    information on which kind of thesis we're dealing with.

    Args:
        doc_type (text_type): INSPIRE document type.
        obj (dict): literature record.

    Returns:
        text_type: bibtex document type for the given INSPIRE entry.
    """
    DOCUMENT_TYPE_MAP = {
        'article': 'article',
        'book': 'book',
        'book chapter': 'inbook',
        'conference paper': 'inproceedings',
        'proceedings': 'proceedings',
        'report': 'techreport',
        'note': 'article',
        # theses handled separately due to masters/phd distinction
    }
    if doc_type in DOCUMENT_TYPE_MAP:
        return DOCUMENT_TYPE_MAP[doc_type]
    # Theses need special treatment, because bibtex differentiates between their types:
    elif doc_type == 'thesis' and get_value(obj, 'thesis_info.degree_type') in ('phd', 'habilitation'):
        return 'phdthesis'
    # Other types of theses (other, bachelor, laurea) don't have separate types in bibtex:
    # We will use the type field (see `get_type`) to indicate the type of diploma.
    elif doc_type == 'thesis':
        return 'mastersthesis'
    return 'misc'

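# A few hedged examples of the mapping above (inputs invented for illustration):
def _demo_bibtex_document_type():
    assert bibtex_document_type('conference paper', {}) == 'inproceedings'
    assert bibtex_document_type(
        'thesis', {'thesis_info': {'degree_type': 'phd'}}) == 'phdthesis'
    # Any other degree type falls through to mastersthesis:
    assert bibtex_document_type(
        'thesis', {'thesis_info': {'degree_type': 'bachelor'}}) == 'mastersthesis'
    # Unknown document types end up as misc:
    assert bibtex_document_type('activity report', {}) == 'misc'
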
def newreview():
    """View for INSPIRE author new form review by a cataloger."""
    objectid = request.values.get('objectid', 0, type=int)
    if not objectid:
        abort(400)

    workflow_metadata = WorkflowUIRecord.get_record(objectid)['metadata']

    # Converting json to populate form
    workflow_metadata['extra_comments'] = get_value(
        workflow_metadata,
        '_private_notes[0].value'
    )
    convert_for_form(workflow_metadata)

    form = AuthorUpdateForm(
        data=workflow_metadata,
        is_review=True)
    ctx = {
        "action": url_for('.reviewhandler', objectid=objectid),
        "name": "authorUpdateForm",
        "id": "authorUpdateForm",
        "objectid": objectid,
    }

    return render_template('authors/forms/review_form.html', form=form, **ctx)

def get_conference_record(record, default=None):
    """Return the first Conference record associated with a record.

    Queries the database to fetch the first Conference record referenced
    in the ``publication_info`` of the record.

    Args:
        record (InspireRecord): a record.
        default: value to be returned if no conference record present/found.

    Returns:
        InspireRecord: the first Conference record associated with the record.

    Examples:
        >>> record = {
        ...     'publication_info': [
        ...         {
        ...             'conference_record': {
        ...                 '$ref': '/api/conferences/972464',
        ...             },
        ...         },
        ...     ],
        ... }
        >>> conference_record = get_conference_record(record)
        >>> conference_record['control_number']
        972464

    """
    replaced = replace_refs(get_value(record, 'publication_info.conference_record[0]'), 'db')
    if replaced:
        return replaced
    else:
        return default

def get_note(data, doc_type):
    """Write addendum/erratum information to the BibTeX note field.

    Traverses ``publication_info`` looking for erratum and addendum entries in
    the ``publication_info.material`` field and builds a string of references
    to those publication entries.

    Returns:
        string: formatted list of the errata and addenda available for a
        given record.
    """
    notices = ('erratum', 'addendum')
    entries = [entry for entry in get_value(data, 'publication_info', [])
               if entry.get('material') in notices]

    if not entries:
        return None

    note_strings = [
        text_type('{field}: {journal} {volume}, {pages} {year}').format(
            field=entry['material'].title(),
            journal=entry.get('journal_title'),
            volume=entry.get('journal_volume'),
            pages=get_page_artid_for_publication_info(entry, '--'),
            year='({})'.format(entry['year']) if 'year' in entry else '',
        ).strip()
        for entry in entries
    ]

    note_string = '[' + ', '.join(note_strings) + ']'
    note_string = re.sub(' +', ' ', note_string)  # Remove possible multiple spaces
    return re.sub(',,', ',', note_string)         # ... and double commas

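# A hedged sketch of the note format produced above; the record is invented,
# and the exact page/artid rendering depends on
# ``get_page_artid_for_publication_info``.
def _demo_get_note():
    data = {
        'publication_info': [
            {'material': 'erratum', 'journal_title': 'Phys.Rev.D',
             'journal_volume': '95', 'artid': '019903', 'year': 2017},
        ],
    }
    # Expected shape: '[Erratum: Phys.Rev.D 95, 019903 (2017)]'
    return get_note(data, 'article')
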
def match_reference(reference, previous_matched_recid=None):
    """Match a reference using inspire-matcher.

    Args:
        reference (dict): the metadata of a reference.
        previous_matched_recid (int): the record id of the last matched
            reference from the list of references.

    Returns:
        dict: the matched reference.
    """
    if reference.get('curated_relation'):
        return reference

    config_unique_identifiers = config.REFERENCE_MATCHER_UNIQUE_IDENTIFIERS_CONFIG
    config_default_publication_info = config.REFERENCE_MATCHER_DEFAULT_PUBLICATION_INFO_CONFIG
    config_jcap_and_jhep_publication_info = config.REFERENCE_MATCHER_JHEP_AND_JCAP_PUBLICATION_INFO_CONFIG
    config_data = config.REFERENCE_MATCHER_DATA_CONFIG

    journal_title = get_value(reference, 'reference.publication_info.journal_title')
    config_publication_info = config_jcap_and_jhep_publication_info if \
        journal_title in ['JCAP', 'JHEP'] else config_default_publication_info

    configs = [config_unique_identifiers, config_publication_info, config_data]

    matches = (match_reference_with_config(reference, config, previous_matched_recid)
               for config in configs)
    matches = (matched_record for matched_record in matches
               if 'record' in matched_record)
    reference = next(matches, reference)

    return reference

def import_legacy_orcid_tokens(self):
    """Celery task to import OAUTH ORCID tokens from legacy.

    Note: bind=True for compatibility with @time_execution.
    """
    if get_value(current_app.config, 'ORCID_APP_CREDENTIALS.consumer_key') is None:
        return

    for user_data in legacy_orcid_arrays():
        try:
            orcid, token, email, name = user_data
            if push_access_tokens.is_access_token_invalid(token):
                continue
            orcid_to_push = _register_user(name, email, orcid, token)
            if orcid_to_push:
                LOGGER.info(
                    'allow_push now enabled on %s, will push all works now',
                    orcid_to_push,
                )
                recids = get_literature_recids_for_orcid(orcid_to_push)
                for recid in recids:
                    orcid_push.apply_async(
                        queue='orcid_push_legacy_tokens',
                        kwargs={
                            'orcid': orcid_to_push,
                            'rec_id': recid,
                            'oauth_token': token,
                        },
                    )
        except SQLAlchemyError as ex:
            LOGGER.exception(ex)

    db.session.commit()

def map_refextract_to_schema(extracted_references, source=None):
    """Convert refextract output to the schema using the builder."""
    result = []

    for reference in extracted_references:
        rb = ReferenceBuilder()
        mapping = [
            ('author', rb.add_refextract_authors_str),
            ('collaboration', rb.add_collaboration),
            ('doi', rb.add_uid),
            ('hdl', rb.add_uid),
            ('isbn', rb.add_uid),
            ('journal_reference', rb.set_pubnote),
            ('linemarker', rb.set_label),
            ('misc', rb.add_misc),
            ('publisher', rb.set_publisher),
            ('raw_ref', lambda raw_ref: rb.add_raw_reference(raw_ref, source=source)),
            ('reportnumber', rb.add_report_number),
            ('texkey', rb.set_texkey),
            ('title', rb.add_title),
            ('url', rb.add_url),
            ('year', rb.set_year),
        ]

        for field, method in mapping:
            for el in force_list(reference.get(field)):
                if el:
                    method(el)

        if get_value(rb.obj, 'reference.urls'):
            rb.obj['reference']['urls'] = dedupe_list_of_dicts(rb.obj['reference']['urls'])

        result.append(rb.obj)

    return result

def get_coauthors_neighborhood(signature, radius=10):
    authors = get_value(signature, 'publication.authors', default=[])
    try:
        center = authors.index(signature['author_name'])
        return ' '.join(authors[max(0, center - radius):min(len(authors), center + radius)])
    except ValueError:
        return ' '.join(authors)

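# A small sketch of the windowing above (data invented). Note that the right
# edge of the slice is exclusive, so with radius=1 only the neighbor to the
# left makes it into the neighborhood:
def _demo_get_coauthors_neighborhood():
    signature = {
        'author_name': 'c',
        'publication': {'authors': ['a', 'b', 'c', 'd', 'e']},
    }
    assert get_coauthors_neighborhood(signature, radius=1) == 'b c'
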
def get_conference_record(record, default=None):
    """Return the first Conference record associated with a record.

    Queries the database to fetch the first Conference record referenced
    in the ``publication_info`` of the record.

    Args:
        record (InspireRecord): a record.
        default: value to be returned if no conference record present/found.

    Returns:
        InspireRecord: the first Conference record associated with the record.

    Examples:
        >>> record = {
        ...     'publication_info': [
        ...         {
        ...             'conference_record': {
        ...                 '$ref': '/api/conferences/972464',
        ...             },
        ...         },
        ...     ],
        ... }
        >>> conference_record = get_conference_record(record)
        >>> conference_record['control_number']
        972464

    """
    pub_info = get_value(record, 'publication_info.conference_record[0]')
    if not pub_info:
        return default
    conferences = get_db_records([('con', get_recid_from_ref(pub_info))])
    return list(conferences)[0]

def test_fuzzy_match_returns_true_if_something_matched_with_earliest_date(mock_match, enable_fuzzy_matcher):
    schema = load_schema('hep')
    titles_schema = schema['properties']['titles']

    matched_record = {
        'control_number': 1472986,
        'titles': [
            {
                'title': 'title',
            },
        ],
        'earliest_date': '2016-06-29',
    }

    assert validate(matched_record['titles'], titles_schema) is None

    mock_match.return_value = iter([{'_source': matched_record}])

    data = {}
    extra_data = {}

    obj = MockObj(data, extra_data)
    eng = MockEng()

    assert fuzzy_match(obj, eng)
    assert 'matches' in obj.extra_data

    expected = [{
        'control_number': 1472986,
        'title': 'title',
        'earliest_date': '2016-06-29',
    }]
    result = get_value(obj.extra_data, 'matches.fuzzy')

    assert expected == result

def test_refextract_url(log_in_as_cataloger, api_client):
    schema = load_schema('hep')
    subschema = schema['properties']['references']

    with requests_mock.Mocker() as requests_mocker:
        requests_mocker.register_uri(
            'GET', 'https://arxiv.org/pdf/1612.06414.pdf',
            content=pkg_resources.resource_string(
                __name__, os.path.join('fixtures', '1612.06414.pdf')),
        )
        requests_mocker.register_uri(
            'GET', 'http://test-indexer:9200/records-hep/hep/_search?_source=control_number',
            content=pkg_resources.resource_string(
                __name__, os.path.join('fixtures', 'es_response.json')),
        )

        response = api_client.post(
            '/editor/refextract/url',
            content_type='application/json',
            data=json.dumps({
                'url': 'https://arxiv.org/pdf/1612.06414.pdf',
            }),
        )

    references = json.loads(response.data)

    assert response.status_code == 200
    assert validate(references, subschema) is None
    assert get_value({'references': references}, 'references.reference.publication_info.journal_title')

def parse(self, response):
    """Parse an APS record into a HEP record.

    Attempts to parse an XML JATS full text first, if available, and falls
    back to parsing JSON if it is not available.
    """
    aps_response = json.loads(response.body_as_unicode())

    for article in aps_response['data']:
        doi = get_value(article, 'identifiers.doi', default='')

        if doi:
            request = Request(url='{}/{}'.format(self.aps_base_url, doi),
                              headers={'Accept': 'text/xml'},
                              callback=self._parse_jats,
                              errback=self._parse_json_on_failure)
            request.meta['json_article'] = article
            request.meta['original_response'] = response
            yield request

    # Pagination support. Will yield until no more "next" pages are found.
    if 'Link' in response.headers:
        links = link_header.parse(response.headers['Link'])
        next = links.links_by_attr_pairs([('rel', 'next')])
        if next:
            next_url = next[0].href
            yield Request(next_url)

def force_each_collaboration_to_be_object(self, data):
    if not data.get('record'):
        collaborations = get_value(data, 'reference.collaborations')
        if collaborations:
            data['reference']['collaborations'] = [
                {'value': collaboration} for collaboration in collaborations
            ]
    return data

def get_linked_records_in_field(record, field_path):
    """Get all linked records in a given field.

    Args:
        record (dict): the record containing the links.
        field_path (string): a dotted field path specification understandable
            by ``get_value``, containing a json reference to another record.

    Returns:
        Iterator[dict]: an iterator over the linked records.

    Warning:
        Currently, the order in which the linked records are yielded is
        different from the order in which they appear in the record.

    Example:
        >>> record = {'references': [
        ...     {'record': {'$ref': 'https://labs.inspirehep.net/api/literature/1234'}},
        ...     {'record': {'$ref': 'https://labs.inspirehep.net/api/data/421'}},
        ... ]}
        >>> get_linked_records_in_field(record, 'references.record')
        [...]

    """
    full_path = '.'.join([field_path, '$ref'])
    pids = force_list([get_pid_from_record_uri(rec)
                       for rec in get_value(record, full_path, [])])
    return get_db_records(pids)

def build_experiment(self, data):
    if get_value(data, "legacy_name") and get_value(data, "project_type"):
        return {
            "_collections": ["Experiments"],
            "$schema": url_for(
                "invenio_jsonschemas.get_schema",
                schema_path="records/experiments.json",
                _external=True,
            ),
            "legacy_name": get_value(data, "legacy_name"),
            "project_type": get_value(data, "project_type"),
        }
    raise InvalidDataError("Experiment is missing a value or values.")

def test_set_exact_match_as_approved_in_extradata():
    data = {}
    extra_data = {'matches': {'exact': [1, 2]}}

    obj = MockObj(data, extra_data)
    eng = MockEng()

    set_exact_match_as_approved_in_extradata(obj, eng)

    assert get_value(obj.extra_data, 'matches.approved') == 1

def match_by_doi(record):
    """Match by DOIs."""
    dois = get_value(record, 'dois.value', [])

    result = set()
    for doi in dois:
        query = '0247:"{0}"'.format(doi)
        result.update(search(query))

    return list(result)

def get_isbn(data, doc_type):
    def hyphenate_if_possible(no_hyphens):
        try:
            return normalize_isbn(no_hyphens)
        except ISBNError:
            return no_hyphens

    isbns = get_value(data, 'isbns.value', [])
    if isbns:
        return ', '.join(hyphenate_if_possible(isbn) for isbn in isbns)

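# A hedged sketch of the helper above, assuming ``normalize_isbn`` hyphenates
# a bare ISBN (isbnlib-style masking) and raises ISBNError on invalid input:
def _demo_get_isbn():
    data = {'isbns': [{'value': '9781316535783'}, {'value': 'not-an-isbn'}]}
    # The valid ISBN gets hyphenated, the invalid one is kept verbatim,
    # e.g. '978-1-316-53578-3, not-an-isbn'.
    return get_isbn(data, doc_type='book')
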
def get_isbn(self, data):
    def hyphenate_if_possible(no_hyphens):
        try:
            return normalize_isbn(no_hyphens)
        except ISBNError:
            return no_hyphens

    isbns = get_value(data, "isbns.value", [])
    if isbns:
        return ", ".join(hyphenate_if_possible(isbn) for isbn in isbns)

def build_conference(self, data) -> dict:
    """Process data from the form and build a conference record.

    Args:
        data (ConferenceRecord): record to serialize.

    Returns:
        dict: a conference record.
    """
    builder = ConferenceBuilder()
    builder.add_title(title=data.get("name"), subtitle=data.get("subtitle"))
    builder.set_short_description(value=data.get("description", ""))
    builder.set_opening_date(get_value(data, "dates[0]"))
    builder.set_closing_date(get_value(data, "dates[1]"))
    builder.add_inspire_categories(data.get("field_of_interest", []))
    builder.add_public_note(value=data.get("additional_info", ""))
    builder.add_series(
        name=data.get("series_name"), number=data.get("series_number")
    )
    for address in data.get("addresses"):
        builder.add_address(
            cities=[address.get("city")],
            state=address.get("state"),
            place_name=address.get("venue"),
            country_code=country_name_to_code(address.get("country")),
        )
    for contact in data.get("contacts", []):
        builder.add_contact(**contact)
    for acr in data.get("acronyms", []):
        builder.add_acronym(acr)
    for website in data.get("websites", []):
        builder.add_url(website)
    for keyword in data.get("keywords", []):
        builder.add_keyword(value=keyword)
    builder.record["$schema"] = url_for(
        "invenio_jsonschemas.get_schema",
        schema_path="records/conferences.json",
        _external=True,
    )
    return builder.record

def has_cern_collaboration(record):
    """Check if the record should be part of `CERN:arXiv` set."""
    collaborations = get_value(record, "collaborations.value", default=[])
    collaboration_regex = re.compile(r"NA\W+\d", re.IGNORECASE)
    return any(
        collaboration
        for collaboration in collaborations
        if collaboration.lower() in COLLABORATIONS
        or collaboration_regex.match(collaboration)
    )

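# The regex above only matches NA-style names that have a non-word separator
# before the number; a self-contained check:
import re

def _demo_na_collaboration_regex():
    collaboration_regex = re.compile(r"NA\W+\d", re.IGNORECASE)
    assert collaboration_regex.match("NA 62")
    assert collaboration_regex.match("na-48")
    # No separator, so 'NA62' has to come from the COLLABORATIONS list instead:
    assert not collaboration_regex.match("NA62")
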
def get_best_publication_info(data):
    publication_info = get_value(data, "publication_info", [])
    only_publications = [
        entry
        for entry in publication_info
        if entry.get("material", "publication") == "publication"
    ]
    if not only_publications:
        return {}

    return sorted(only_publications, key=len, reverse=True)[0]

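# A sketch of the len-based choice above (entries invented): the 'publication'
# entry carrying the most fields wins, and errata are ignored.
def _demo_get_best_publication_info():
    data = {'publication_info': [
        {'journal_title': 'JHEP'},
        {'journal_title': 'JHEP', 'journal_volume': '06', 'year': 2020},
        {'material': 'erratum', 'journal_title': 'JHEP'},
    ]}
    assert get_best_publication_info(data) == {
        'journal_title': 'JHEP', 'journal_volume': '06', 'year': 2020,
    }
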
def normalize_journal_title(self, reference):
    try:
        journal_title = get_value(
            reference, "reference.publication_info.journal_title")
        reference["reference"]["publication_info"][
            "journal_title"] = JournalsSearch().normalize_title(journal_title)
    except KeyError:
        pass
    return reference

def test_fuzzy_match_returns_true_if_something_matched_with_publication_info(mock_match, enable_fuzzy_matcher):
    schema = load_schema('hep')
    publication_info_schema = schema['properties']['publication_info']
    titles_schema = schema['properties']['titles']

    matched_record = {
        'control_number': 1472986,
        'titles': [
            {
                'title': 'title',
            },
        ],
        'publication_info': [
            {
                'artid': '054021',
                'journal_issue': '5',
                'journal_title': 'Phys.Rev.D',
                'journal_volume': '94',
                'pubinfo_freetext': 'Phys. Rev. D94 (2016) 054021',
                'year': 2016,
            },
        ],
    }
    assert validate(matched_record['titles'], titles_schema) is None
    assert validate(matched_record['publication_info'], publication_info_schema) is None

    mock_match.return_value = iter([{'_source': matched_record}])

    data = {}
    extra_data = {}

    obj = MockObj(data, extra_data)
    eng = MockEng()

    assert fuzzy_match(obj, eng)
    assert 'matches' in obj.extra_data

    expected = [{
        'control_number': 1472986,
        'title': 'title',
        'publication_info': [
            {
                'artid': '054021',
                'journal_issue': '5',
                'journal_title': 'Phys.Rev.D',
                'journal_volume': '94',
                'pubinfo_freetext': 'Phys. Rev. D94 (2016) 054021',
                'year': 2016,
            },
        ],
    }]
    result = get_value(obj.extra_data, 'matches.fuzzy')

    assert expected == result

def match_literature_author(author, record):
    configs = [
        current_app.config["AUTHOR_MATCHER_NAME_CONFIG"],
        current_app.config["AUTHOR_MATCHER_NAME_INITIALS_CONFIG"],
    ]
    validators = [(collaboration_validator, affiliations_validator), None]
    parsed_name = ParsedName.loads(author.get("full_name"))
    author_matcher_data = {
        "first_name": parsed_name.first,
        "last_name": parsed_name.last,
        "full_name": author.get("full_name"),
        "collaborations": get_value(record, "collaborations.value", []),
        "affiliations": get_value(author, "affiliations.value", []),
    }
    for config, validator in zip(configs, validators):
        matched_records = match_literature_author_with_config(
            author_matcher_data, config
        )
        matched_author_data = (
            get_reference_and_bai_if_unambiguous_literature_author_match(
                matched_records
            )
        )
        if not matched_author_data and validator:
            for validator_function in validator:
                valid_matches = (
                    match
                    for match in matched_records
                    if validator_function(author_matcher_data, match)
                )
                matched_author_data = (
                    get_reference_and_bai_if_unambiguous_literature_author_match(
                        valid_matches
                    )
                )
                if matched_author_data:
                    break
        if matched_author_data:
            return matched_author_data

def _get_hep_record_brief(hep_record):
    brief = {
        'control_number': hep_record['control_number'],
        'title': get_value(hep_record, 'titles[0].title'),
    }

    abstract = get_value(hep_record, 'abstracts[0].value')
    if abstract is not None:
        brief['abstract'] = abstract

    arxiv_eprint = get_value(hep_record, 'arxiv_eprints[0].value')
    if arxiv_eprint is not None:
        brief['arxiv_eprint'] = arxiv_eprint

    number_of_pages = get_value(hep_record, 'number_of_pages')
    if number_of_pages is not None:
        brief['number_of_pages'] = number_of_pages

    earliest_date = get_value(hep_record, 'earliest_date')
    if earliest_date is not None:
        brief['earliest_date'] = earliest_date

    authors = hep_record.get('authors')
    if authors is not None:
        brief['authors_count'] = len(authors)
        author_briefs = []
        for author in authors[:3]:
            author_briefs.append({'full_name': author['full_name']})
        brief['authors'] = author_briefs

    public_notes = hep_record.get('public_notes')
    if public_notes is not None:
        public_notes_value = []
        for public_note in public_notes:
            public_notes_value.append({'value': public_note['value']})
        brief['public_notes'] = public_notes_value

    publication_info = hep_record.get('publication_info')
    if publication_info is not None:
        brief['publication_info'] = publication_info

    return brief

def keep_only_update_source_in_field(field, root, head, update):
    """Remove elements from root and head where ``source`` matches the update.

    This is useful if the update needs to overwrite all elements with the
    same source.

    .. note::
        If the update doesn't contain exactly one source in ``field``, the
        records are returned with no modifications.

    Args:
        field (str): the field to filter out.
        root (pmap): the root record, whose ``field`` will be cleaned.
        head (pmap): the head record, whose ``field`` will be cleaned.
        update (pmap): the update record, from which the ``source`` is read.

    Returns:
        tuple: ``(root, head, update)`` with some elements filtered out from
        ``root`` and ``head``.
    """
    update_thawed = thaw(update)
    update_sources = {
        source.lower()
        for source in get_value(update_thawed, '.'.join([field, 'source']), [])
    }
    if not update_sources:
        # If there is no field or source, then fall back to
        # ``acquisition_source.source`` for the source.
        source = get_value(update_thawed, "acquisition_source.source")
        if source:
            update_sources = {source.lower()}
    if len(update_sources) != 1:
        return root, head, update
    source = update_sources.pop()

    if field in root:
        root = root.set(field, remove_elements_with_source(source, root[field]))
    if field in head:
        head = head.set(field, remove_elements_with_source(source, head[field]))

    return root, head, update

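# A hedged sketch of the filtering above, using pyrsistent structures as in
# the signature (data invented; the exact behavior of
# ``remove_elements_with_source`` is assumed to be a case-insensitive filter):
from pyrsistent import freeze

def _demo_keep_only_update_source_in_field():
    root = freeze({'abstracts': [{'source': 'arXiv', 'value': 'old'}]})
    head = freeze({'abstracts': [{'source': 'arXiv', 'value': 'edited'},
                                 {'source': 'Elsevier', 'value': 'kept'}]})
    update = freeze({'abstracts': [{'source': 'arXiv', 'value': 'new'}]})
    root, head, update = keep_only_update_source_in_field(
        'abstracts', root, head, update)
    # Both 'arXiv'-sourced abstracts are gone from root and head;
    # the 'Elsevier' one in head survives, and update is untouched.
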
def _classify_paper(obj, eng):
    from flask import current_app
    params = dict(
        taxonomy_name=taxonomy or current_app.config['HEP_ONTOLOGY_FILE'],
        output_mode='dict',
        output_limit=output_limit,
        spires=spires,
        match_mode=match_mode,
        no_cache=no_cache,
        with_author_keywords=with_author_keywords,
        rebuild_cache=rebuild_cache,
        only_core_tags=only_core_tags,
        extract_acronyms=extract_acronyms,
    )

    fulltext_used = True
    with get_document_in_workflow(obj) as tmp_document:
        try:
            if tmp_document:
                result = get_keywords_from_local_file(tmp_document, **params)
            else:
                data = get_value(obj.data, 'titles.title', [])
                data.extend(get_value(obj.data, 'titles.subtitle', []))
                data.extend(get_value(obj.data, 'abstracts.value', []))
                data.extend(get_value(obj.data, 'keywords.value', []))
                if not data:
                    obj.log.error("No classification done due to missing data.")
                    return
                result = get_keywords_from_text(data, **params)
                fulltext_used = False
        except ClassifierException as e:
            obj.log.exception(e)
            return

    result['complete_output'] = clean_instances_from_data(
        result.get("complete_output", {}))
    result["fulltext_used"] = fulltext_used

    # Check that the output is not empty before adding
    if any(result.get("complete_output", {}).values()):
        obj.extra_data['classifier_results'] = result

def _match_with_invenio_matcher(obj, eng):
    from invenio_matcher.api import match as _match

    if queries is None:
        queries_ = [{
            'type': 'exact',
            'match': 'dois.value',
        }, {
            'type': 'exact',
            'match': 'arxiv_eprints.value',
        }]
    else:
        queries_ = queries

    record_matches = {
        "recids": [],
        "records": [],
        "base_url": os.path.join(current_app.config["SERVER_NAME"], 'record'),
    }

    record = {}
    record['dois.value'] = get_value(obj.data, 'dois.value')
    record['arxiv_eprints.value'] = get_value(obj.data, 'arxiv_eprints.value')

    for matched_record in _match(record, queries=queries_, index=index, doc_type=doc_type):
        matched_recid = matched_record.record.get('id')
        record_matches['recids'].append(matched_recid)
        record_matches['records'].append({
            "source": matched_record.record.dumps(),
            "score": matched_record.score,
        })

    if len(record_matches['recids']) > 0:
        obj.extra_data["record_matches"] = record_matches
        return True

    return False

def _link_user_and_token(user, name, orcid, token):
    """Create a link between a user and a token, if possible.

    Args:
        user (invenio_oauthclient.models.User): an existing user object to
            connect the token to.
        name (string): user's full name.
        orcid (string): user's ORCID identifier.
        token (string): OAUTH token for the user.

    Returns:
        str: the ORCID associated with the new token if we created one, or
        the ORCID associated with the token whose ``allow_push`` flag changed
        state.
    """
    result = None

    try:
        # Link user and ORCID
        oauth_link_external_id(user, {
            'id': orcid,
            'method': 'orcid',
        })
    except AlreadyLinkedError:
        # User already has their ORCID linked
        pass

    # Check whether there are already tokens associated with this
    # ORCID identifier.
    tokens = RemoteToken.query.join(RemoteAccount).join(User)\
        .join(UserIdentity).filter(UserIdentity.id == orcid).all()

    if tokens:
        # Force the allow_push.
        with db.session.begin_nested():
            for token in tokens:
                if not token.remote_account.extra_data['allow_push']:
                    result = orcid
                    token.remote_account.extra_data['allow_push'] = True
    else:
        # If not, create and put the token entry
        with db.session.begin_nested():
            result = orcid
            RemoteToken.create(
                user_id=user.id,
                client_id=get_value(current_app.config, 'ORCID_APP_CREDENTIALS.consumer_key'),
                token=token,
                secret=None,
                extra_data={
                    'orcid': orcid,
                    'full_name': name,
                    'allow_push': True,
                },
            )

    return result

def has_cern_accelerator_experiment(record):
    """Check if the record should be part of `CERN:arXiv` set."""
    accelerator_experiments = get_value(
        record, "accelerator_experiments.legacy_name", default=[]
    )
    return any(
        experiment
        for experiment in accelerator_experiments
        if experiment.lower() in ACCELERATOR_EXPERIMENTS_NAMES
        or experiment.lower().startswith("cern")
    )

def generate_bai(cls, data):
    name = get_value(data, "name.value")
    bai = ".".join(format_name(name, initials_only=True).split())
    bai = unidecode(bai)
    bai = "".join(
        filter(lambda x: x in set(string.ascii_letters + "."), bai))
    bai = re.sub(r"\.+", ".", bai).lstrip(".")
    if not bai.endswith("."):
        bai = f"{bai}."
    next_bai_number = cls.next_bai_number(bai)
    return f"{bai}{next_bai_number}"

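# A hedged walk-through of the normalization above, assuming ``format_name``
# renders 'Smith, John' as 'J. Smith' when initials_only=True and that
# ``next_bai_number`` returns 1 for an unused stem:
#
#     'J. Smith' -> 'J..Smith' (joined with dots) -> 'J.Smith' (dots collapsed)
#     -> 'J.Smith.' (trailing dot ensured) -> 'J.Smith.1'
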
def _get_hal_id_map(record):
    affiliation_records = chain.from_iterable(
        get_value(record, 'authors.affiliations.record', default=[]))
    affiliation_recids = [get_recid_from_ref(el) for el in affiliation_records]

    try:
        institutions = get_es_records('ins', affiliation_recids)
    except RequestError:
        institutions = []

    return {el['control_number']: _get_hal_id(el) for el in institutions}

def add_arxiv_categories(record, blob):
    if not record.get('arxiv_eprints') or not blob.get('65017'):
        return record

    for category in force_list(get_value(blob, '65017')):
        if category.get('2') == 'arXiv' and category.get('a'):
            record['arxiv_eprints'][0]['categories'].append(
                normalize_arxiv_category(category['a'])
            )

    return record

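# A minimal MARC-style blob to exercise the branch above (data invented;
# assumes ``normalize_arxiv_category`` returns 'hep-ph' unchanged):
def _demo_add_arxiv_categories():
    record = {'arxiv_eprints': [{'value': '1712.01234', 'categories': []}]}
    blob = {'65017': [{'2': 'arXiv', 'a': 'hep-ph'},
                      {'2': 'INSPIRE', 'a': 'Phenomenology-HEP'}]}
    add_arxiv_categories(record, blob)
    # Only the field tagged with source 'arXiv' is kept:
    assert record['arxiv_eprints'][0]['categories'] == ['hep-ph']
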
def populate_bookautocomplete(record):
    """Populate the ``bookautocomplete`` field of Literature records."""
    paths = [
        'imprints.date',
        'imprints.publisher',
        'isbns.value',
    ]

    authors = force_list(get_value(record, 'authors.full_name', default=[]))
    titles = force_list(get_value(record, 'titles.title', default=[]))

    input_values = list(chain.from_iterable(
        force_list(get_value(record, path, default=[])) for path in paths))
    input_values.extend(authors)
    input_values.extend(titles)
    input_values = [el for el in input_values if el]

    record['bookautocomplete'] = {
        'input': input_values,
    }

def get_arxiv_eprints(self, data):
    arxiv_eprint = data.pop("arxiv_eprint", None)
    arxiv_eprints = data.get("arxiv_eprints")
    if arxiv_eprint:
        data["arxiv_eprint"] = force_list({"value": arxiv_eprint})
    elif arxiv_eprints:
        data["arxiv_eprint"] = force_list(
            {"value": get_value(data, "arxiv_eprints[0].value", default=missing)}
        )
    data.pop("arxiv_eprints", None)
    return data.get("arxiv_eprint", missing)

def get_journal_coverage(obj, eng):
    """Return the journal coverage that this article belongs to."""
    journals = replace_refs(get_value(obj.data, 'publication_info.journal_record'), 'db')

    if not journals:
        return

    if any(journal['_harvesting_info'].get('coverage') == 'full' for journal in journals):
        obj.extra_data['journal_coverage'] = 'full'
    else:
        obj.extra_data['journal_coverage'] = 'partial'

def test_get_value_works_on_lists():
    record = [
        {
            'foo': 'bar',
        },
    ]

    expected = ['bar']
    result = get_value(record, 'foo')

    assert expected == result

def before_dump(self, data):
    family_name, given_name = self.get_name_splitted(data)
    return {
        'advisors': get_value(data, 'advisors', default=missing),
        'acquisition_source': get_value(data, 'acquisition_source', default=missing),
        'arxiv_categories': get_value(data, 'arxiv_categories', default=missing),
        'blog': self.get_first_or_missing(
            self.get_value_by_description_key(data.get('urls', []), 'blog')),
        'display_name': get_value(data, 'name.preferred_name', default=missing),
        'family_name': self.get_value_or_missing(family_name),
        'given_name': self.get_value_or_missing(given_name),
        'linkedin': self.get_first_or_missing(
            get_values_for_schema(data.get('ids', []), 'LINKEDIN')),
        'native_name': get_value(data, 'name.native_names[0]', default=missing),
        'orcid': self.get_first_or_missing(
            get_values_for_schema(data.get('ids', []), 'ORCID')),
        'positions': get_value(data, 'positions', default=missing),
        'project_membership': get_value(data, 'project_membership', default=missing),
        'public_emails': get_value(data, 'email_addresses.value', default=missing),
        'status': get_value(data, 'status', default=missing),
        'twitter': self.get_first_or_missing(
            get_values_for_schema(data.get('ids', []), 'TWITTER')),
        'websites': get_value(data, 'urls.value', default=missing),
    }

def publication_date(self):
    """(Partial) date of publication.

    Returns:
        partial_date (inspire_utils.date.PartialDate): publication date.
    """
    try:
        return PartialDate.loads(
            get_value(self.record, 'imprints.date[0]') or
            get_publication_date(self.record))
    except ValueError:
        return None

def orcid_role_for_inspire_author(self, author):
    """ORCID role for an INSPIRE author field.

    Args:
        author (dict): an author field from an INSPIRE literature record.

    Returns:
        string: ORCID role of a person.
    """
    inspire_roles = sorted(get_value(author, 'inspire_roles', ['author']))
    if inspire_roles:
        return self.INSPIRE_TO_ORCID_ROLES_MAP[inspire_roles[0]]

def update_moved_orcid(old_orcid, new_orcid):
    author_record = AuthorsRecord.get_record_by_pid_value(old_orcid, "orcid")
    if new_orcid not in get_value(author_record, "ids.value", []):
        new_author_ids = [
            {"schema": "ORCID", "value": new_orcid},
            *author_record["ids"],
        ]
        author_record["ids"] = new_author_ids
        author_record.update(dict(author_record))
    remove_access_token_for_orcid_account(old_orcid, new_orcid)
    db.session.commit()
    LOGGER.info("ORCID updated", new_orcid=new_orcid, old_orcid=old_orcid)

def get_linked_book(self, data):
    parent = get_parent_record(data)
    if parent and "titles" in parent and "control_number" in parent:
        endpoint = PidStoreBase.get_endpoint_from_pid_type(
            PidStoreBase.get_pid_type_from_schema(data["$schema"]))
        endpoint_item = f"invenio_records_rest.{endpoint}_item"
        ref = get_value(parent, "self.$ref") or url_for(
            endpoint_item, pid_value=parent["control_number"], _external=True)
        return {**parent["titles"][0], "record": {"$ref": ref}}
    return None

def publication_date(self):
    """(Partial) date of publication.

    Returns:
        partial_date (inspire_utils.date.PartialDate): publication date.
    """
    try:
        return PartialDate.loads(
            get_value(self.record, "imprints.date[0]") or
            LiteratureReader(self.record).publication_date)
    except ValueError:
        return None