Example no. 1
def curation_ticket_context(user, obj):
    recid = obj.extra_data.get('recid')
    record_url = obj.extra_data.get('url')

    arxiv_ids = get_value(obj.data, 'arxiv_eprints.value') or []
    for index, arxiv_id in enumerate(arxiv_ids):
        if arxiv_id and is_arxiv_post_2007(arxiv_id):
            arxiv_ids[index] = 'arXiv:{0}'.format(arxiv_id)

    report_numbers = get_value(obj.data, 'report_numbers.value') or []
    dois = [
        "doi:{0}".format(doi)
        for doi in get_value(obj.data, 'dois.value') or []
    ]
    link_to_pdf = obj.extra_data.get('formdata', {}).get('url')

    subject = ' '.join(filter(
        lambda x: x is not None,
        arxiv_ids + dois + report_numbers + ['(#{0})'.format(recid)]
    ))

    references = obj.extra_data.get('formdata', {}).get('references')
    user_comment = obj.extra_data.get('formdata', {}).get('extra_comments', '')

    return dict(
        recid=recid,
        record_url=record_url,
        link_to_pdf=link_to_pdf,
        email=user.email if user else '',
        references=references,
        user_comment=user_comment,
        subject=subject
    )
Example no. 2
    def get_name(self, item):
        institution = get_value(item, 'institutions[0].value')
        accelerator = get_value(item, 'accelerator.value')
        experiment = get_value(item, 'experiment.value')
        if institution and accelerator and experiment:
            return u'{}-{}-{}'.format(institution, accelerator, experiment)
        return item.get('legacy_name')
Example no. 3
def populate_affiliation_suggest(record):
    """Populate the ``affiliation_suggest`` field of Institution records."""
    ICN = record.get('ICN', [])
    institution_acronyms = get_value(record, 'institution_hierarchy.acronym', default=[])
    institution_names = get_value(record, 'institution_hierarchy.name', default=[])
    legacy_ICN = record.get('legacy_ICN', '')
    name_variants = force_list(get_value(record, 'name_variants.value', default=[]))
    postal_codes = force_list(get_value(record, 'addresses.postal_code', default=[]))

    # XXX: this is needed by the curators to search with numbers only
    extract_numbers_from_umr = []
    for name in name_variants:
        match = re.match(r'UMR\s', name, re.IGNORECASE)
        if match:
            umr_number = name.replace(match.group(0), '')
            extract_numbers_from_umr.append(umr_number)

    input_values = []
    input_values.extend(ICN)
    input_values.extend(institution_acronyms)
    input_values.extend(institution_names)
    input_values.append(legacy_ICN)
    input_values.extend(name_variants)
    input_values.extend(postal_codes)
    input_values.extend(extract_numbers_from_umr)
    input_values = [el for el in input_values if el]

    record['affiliation_suggest'] = {
        'input': input_values,
    }
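A minimal sketch of the helper in action (assuming ``inspire_utils`` provides ``get_value`` and ``force_list`` as used above, and that ``populate_affiliation_suggest`` is in scope; the sample record is made up):

# Hypothetical record, for illustration only.
record = {
    'ICN': ['CERN'],
    'legacy_ICN': 'CERN',
    'name_variants': [{'value': 'UMR 8607'}],
    'addresses': [{'postal_code': '1211'}],
}

populate_affiliation_suggest(record)
print(record['affiliation_suggest'])
# {'input': ['CERN', 'CERN', 'UMR 8607', '1211', '8607']}
# Note how the bare number is extracted from the 'UMR ...' name variant.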
Example no. 4
def populate_bookautocomplete(sender, json, *args, **kwargs):
    """Populate the ```bookautocomplete`` field of Literature records."""
    if not is_hep(json):
        return

    if 'book' not in json.get('document_type', []):
        return

    paths = [
        'imprints.date',
        'imprints.publisher',
        'isbns.value',
    ]

    authors = force_list(get_value(json, 'authors.full_name', default=[]))
    titles = force_list(get_value(json, 'titles.title', default=[]))

    input_values = list(chain.from_iterable(
        force_list(get_value(json, path, default=[])) for path in paths))
    input_values.extend(authors)
    input_values.extend(titles)
    input_values = [el for el in input_values if el]

    ref = get_value(json, 'self.$ref')

    json.update({
        'bookautocomplete': {
            'input': input_values,
            'payload': {
                'authors': authors,
                'id': ref,
                'title': titles,
            },
        },
    })
Example no. 5
def populate_affiliation_suggest(sender, json, *args, **kwargs):
    """Populate the ``affiliation_suggest`` field of Institution records."""
    if 'institutions.json' not in json.get('$schema', ''):
        return

    ICN = json.get('ICN', [])
    institution_acronyms = get_value(json, 'institution_hierarchy.acronym', default=[])
    institution_names = get_value(json, 'institution_hierarchy.name', default=[])
    legacy_ICN = json.get('legacy_ICN', '')
    name_variants = force_list(get_value(json, 'name_variants.value', default=[]))
    postal_codes = force_list(get_value(json, 'addresses.postal_code', default=[]))

    input_values = []
    input_values.extend(ICN)
    input_values.extend(institution_acronyms)
    input_values.extend(institution_names)
    input_values.append(legacy_ICN)
    input_values.extend(name_variants)
    input_values.extend(postal_codes)
    input_values = [el for el in input_values if el]

    json.update({
        'affiliation_suggest': {
            'input': input_values,
            'output': legacy_ICN,
            'payload': {
                '$ref': get_value(json, 'self.$ref'),
                'ICN': ICN,
                'institution_acronyms': institution_acronyms,
                'institution_names': institution_names,
                'legacy_ICN': legacy_ICN,
            },
        },
    })
Example no. 6
def check_unlinked_references():
    """Return two lists with the unlinked references that have a doi or an arxiv id.

    If the reference read has a doi or an arxiv id, it is stored in the data structure.
    Once all the data is read, it is ordered by most relevant to less relevant."""

    result_doi, result_arxiv = defaultdict(lambda: (0, 0)), defaultdict(lambda: (0, 0))
    linked_ids = defaultdict(lambda: (0, 0))

    data = get_all_unlinked_references()

    for reference in data:
        dois = get_value(reference, 'reference.reference.dois', [])
        arxiv_id = get_value(reference, 'reference.reference.arxiv_eprint')

        if arxiv_id and len(dois) > 0:
            for doi in dois:
                increase_cited_count(linked_ids, (doi, arxiv_id), reference["core"])

        for doi in dois:
            increase_cited_count(result_doi, doi, reference["core"])

        if arxiv_id:
            increase_cited_count(result_arxiv, arxiv_id, reference["core"])

    add_linked_ids(result_doi, result_arxiv, linked_ids)

    result_doi = order_dictionary_into_list(result_doi)
    result_arxiv = order_dictionary_into_list(result_arxiv)

    return result_doi, result_arxiv
Example no. 7
def prepare_magpie_payload(record, corpus):
    """Prepare payload to send to Magpie API."""
    payload = dict(text="", corpus=corpus)
    titles = filter(None, get_value(record, "titles.title", []))
    abstracts = filter(None, get_value(record, "abstracts.value", []))
    payload["text"] = ". ".join(
        [part.encode('utf-8') for part in titles + abstracts])
    return payload
Example no. 8
def get_address(data, doc_type):
    conference = get_conference_record(data, default={})
    pubinfo_city = get_value(conference, 'address[0].cities[0]')
    pubinfo_country_code = get_value(conference, 'address[0].country_code')

    if pubinfo_city and pubinfo_country_code:
        return pubinfo_city + ', ' + get_country_name_by_code(pubinfo_country_code, default=pubinfo_country_code)
    return get_value(data, 'imprints[0].place')
Example no. 9
    def title_translation(self):
        """Translated title.

        Returns:
            Tuple[string, string]: translated title and the language code of the translation, if available
        """
        title = get_value(self.record, 'title_translations[0].title')
        language_code = get_value(self.record, 'title_translations[0].language')
        if title and language_code:
            return title, language_code
Example no. 10
def is_arxiv_paper(obj, *args, **kwargs):
    """Check if the record is from arXiv."""

    method = get_value(obj.data, 'acquisition_source.method')
    source = get_value(obj.data, 'acquisition_source.source', default='')

    is_submission_with_arxiv = method == 'submitter' and 'arxiv_eprints' in obj.data
    is_harvested_from_arxiv = method == 'hepcrawl' and source.lower() == 'arxiv'

    return is_submission_with_arxiv or is_harvested_from_arxiv
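A quick sketch of the two accepted cases (the workflow object is faked here with ``SimpleNamespace``, since only its ``data`` attribute is read):

from types import SimpleNamespace

harvested = SimpleNamespace(data={
    'acquisition_source': {'method': 'hepcrawl', 'source': 'arXiv'},
})
submitted = SimpleNamespace(data={
    'acquisition_source': {'method': 'submitter'},
    'arxiv_eprints': [{'value': '1612.08928'}],
})

assert is_arxiv_paper(harvested)   # harvested from arXiv
assert is_arxiv_paper(submitted)   # submission carrying arxiv_eprints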
Example no. 11
def year_validator(form, field):
    """Validate that the field contains an year in an acceptable range."""
    hep = load_schema('hep')
    min_year = get_value(hep, 'properties.publication_info.items.properties.year.minimum')
    max_year = get_value(hep, 'properties.publication_info.items.properties.year.maximum')

    message = 'Please provide a year between {} and {}.'.format(min_year, max_year)

    if field.data and not min_year <= int(field.data) <= max_year:
        raise StopValidation(message)
Example no. 12
    def get_dois(self, data):
        dois = data.get('dois', None)
        control_number = data.get('control_number')
        if dois and not control_number:
            data['dois'] = force_list(
                {'value': get_value(data, 'dois[0]', default=missing)})
        elif dois:
            data['dois'] = force_list(
                {'value': get_value(data, 'dois[0].value', default=missing)})
        return data.get('dois', missing)
Example no. 13
    def _parse_json_on_failure(self, failure):
        """Parse a JSON article entry."""
        original_response = failure.request.meta['original_response']
        record = HEPLoader(item=HEPRecord(), response=original_response)
        article = failure.request.meta['json_article']

        doi = get_value(article, 'identifiers.doi', default='')
        record.add_dois(dois_values=[doi])
        record.add_value('page_nr', str(article.get('numPages', '')))

        record.add_value('abstract', get_value(article, 'abstract.value', default=''))
        record.add_value('title', get_value(article, 'title.value', default=''))
        # record.add_value('subtitle', '')

        authors, collaborations = self._get_authors_and_collab(article)
        record.add_value('authors', authors)
        record.add_value('collaborations', collaborations)

        # record.add_value('free_keywords', free_keywords)
        # record.add_value('classification_numbers', classification_numbers)

        record.add_value('journal_title',
                         get_value(article, 'journal.abbreviatedName', default=''))
        record.add_value('journal_issue',
                         get_value(article, 'issue.number', default=''))
        record.add_value('journal_volume',
                         get_value(article, 'volume.number', default=''))
        # record.add_value('journal_artid', )

        published_date = article.get('date', '')
        if published_date:
            record.add_value('journal_year', int(published_date[:4]))
        record.add_value('date_published', published_date)
        record.add_value('copyright_holder',
                         get_value(article, 'rights.copyrightHolders.name[0]', default=''))
        record.add_value('copyright_year',
                         str(get_value(article, 'rights.copyrightYear', default='')))
        record.add_value('copyright_statement',
                         get_value(article, 'rights.rightsStatement', default=''))
        record.add_value('copyright_material', 'publication')

        license = get_licenses(
            license_url=get_value(article, 'rights.licenses.url[0]', default='')
        )
        record.add_value('license', license)

        record.add_value('collections', ['HEP', 'Citeable', 'Published'])

        return ParsedItem(
            record=record.load_item(),
            record_format='hepcrawl',
        )
Example no. 14
def get_arxiv_id(record):
    """Return the first arXiv identifier of a record.

    Args:
        record (InspireRecord): a record.

    Returns:
        str: the first arXiv identifier of the record.

    Examples:
        >>> record = {
        ...     'arxiv_eprints': [
        ...         {
        ...             'categories': [
        ...                 'hep-th',
        ...                 'hep-ph',
        ...             ],
        ...             'value': '1612.08928',
        ...         },
        ...     ],
        ... }
        >>> get_arxiv_id(record)
        '1612.08928'

    """
    return get_value(record, 'arxiv_eprints.value[0]', default='')
Example no. 15
def get_arxiv_categories(record):
    """Return all the arXiv categories of a record.

    Args:
        record (InspireRecord): a record.

    Returns:
        list(str): all the arXiv categories of the record.

    Examples:
        >>> record = {
        ...     'arxiv_eprints': [
        ...         {
        ...             'categories': [
        ...                 'hep-th',
        ...                 'hep-ph',
        ...             ],
        ...             'value': '1612.08928',
        ...         },
        ...     ],
        ... }
        >>> get_arxiv_categories(record)
        ['hep-th', 'hep-ph']

    """
    return list(chain.from_iterable(
        get_value(record, 'arxiv_eprints.categories', default=[])))
Example no. 16
def populate_authors_name_variations(record):
    """Generate name variations for an Author record."""
    author_name = get_value(record, 'name.value')

    if author_name:
        name_variations = generate_name_variations(author_name)
        record['name_variations'] = name_variations
Example no. 17
def bibtex_document_type(doc_type, obj):
    """Return the BibTeX entry type.

    Maps the INSPIRE ``document_type`` to a BibTeX entry type. Also checks
    ``thesis_info.degree_type`` in case it's a thesis, as it stores the
    information on which kind of thesis we're dealing with.

    Args:
        doc_type (text_type): INSPIRE document type.
        obj (dict): literature record.

    Returns:
        text_type: bibtex document type for the given INSPIRE entry.
    """
    DOCUMENT_TYPE_MAP = {
        'article': 'article',
        'book': 'book',
        'book chapter': 'inbook',
        'conference paper': 'inproceedings',
        'proceedings': 'proceedings',
        'report': 'techreport',
        'note': 'article',
        # theses handled separately due to masters/phd distinction
    }
    if doc_type in DOCUMENT_TYPE_MAP:
        return DOCUMENT_TYPE_MAP[doc_type]
    # Theses need special treatment, because bibtex differentiates between their types:
    elif doc_type == 'thesis' and get_value(obj, 'thesis_info.degree_type') in ('phd', 'habilitation'):
        return 'phdthesis'
    # Other types of theses (other, bachelor, laurea) don't have separate types in bibtex:
    # We will use the type field (see `get_type`) to indicate the type of diploma.
    elif doc_type == 'thesis':
        return 'mastersthesis'
    return 'misc'
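Expected behaviour, as a sketch (assuming the function above is in scope; the inputs are illustrative):

assert bibtex_document_type('conference paper', {}) == 'inproceedings'
assert bibtex_document_type('thesis', {'thesis_info': {'degree_type': 'phd'}}) == 'phdthesis'
# Other degree types fall through to mastersthesis:
assert bibtex_document_type('thesis', {'thesis_info': {'degree_type': 'bachelor'}}) == 'mastersthesis'
# Unknown document types default to misc:
assert bibtex_document_type('activity report', {}) == 'misc'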
Example no. 18
def newreview():
    """View for INSPIRE author new form review by a cataloger."""
    objectid = request.values.get('objectid', 0, type=int)
    if not objectid:
        abort(400)

    workflow_metadata = WorkflowUIRecord.get_record(objectid)['metadata']

    # Converting json to populate form
    workflow_metadata['extra_comments'] = get_value(
        workflow_metadata,
        '_private_notes[0].value'
    )
    convert_for_form(workflow_metadata)

    form = AuthorUpdateForm(
        data=workflow_metadata, is_review=True)
    ctx = {
        "action": url_for('.reviewhandler', objectid=objectid),
        "name": "authorUpdateForm",
        "id": "authorUpdateForm",
        "objectid": objectid
    }

    return render_template('authors/forms/review_form.html', form=form, **ctx)
Example no. 19
def get_conference_record(record, default=None):
    """Return the first Conference record associated with a record.

    Queries the database to fetch the first Conference record referenced
    in the ``publication_info`` of the record.

    Args:
        record(InspireRecord): a record.
        default: value to be returned if no conference record present/found

    Returns:
        InspireRecord: the first Conference record associated with the record.

    Examples:
        >>> record = {
        ...     'publication_info': [
        ...         {
        ...             'conference_record': {
        ...                 '$ref': '/api/conferences/972464',
        ...             },
        ...         },
        ...     ],
        ... }
        >>> conference_record = get_conference_record(record)
        >>> conference_record['control_number']
        972464

    """
    replaced = replace_refs(get_value(record, 'publication_info.conference_record[0]'), 'db')
    if replaced:
        return replaced
    else:
        return default
Example no. 20
def get_note(data, doc_type):
    """Write and addendum/errata information to the BibTeX note field.

    Traverse publication_info looking for erratum and addendum in `publication_info.material`
    field and build a string of references to those publication entries.

    Returns:
        string: formatted list of the errata and addenda available for a given record

    """
    notices = ('erratum', 'addendum')
    entries = [entry for entry in get_value(data, 'publication_info', []) if entry.get('material') in notices]

    if not entries:
        return None

    note_strings = [
        text_type('{field}: {journal} {volume}, {pages} {year}').format(
            field=entry['material'].title(),
            journal=entry.get('journal_title'),
            volume=entry.get('journal_volume'),
            pages=get_page_artid_for_publication_info(entry, '--'),
            year='({})'.format(entry['year']) if 'year' in entry else ''
        ).strip()
        for entry in entries
    ]

    note_string = '[' + ', '.join(note_strings) + ']'
    note_string = re.sub(' +', ' ', note_string)  # Remove possible multiple spaces
    return re.sub(',,', ',', note_string)         # ... and commas
Example no. 21
def match_reference(reference, previous_matched_recid=None):
    """Match a reference using inspire-matcher.

    Args:
        reference (dict): the metadata of a reference.
        previous_matched_recid (int): the record id of the last matched
            reference from the list of references.

    Returns:
        dict: the matched reference.
    """
    if reference.get('curated_relation'):
        return reference

    config_unique_identifiers = config.REFERENCE_MATCHER_UNIQUE_IDENTIFIERS_CONFIG
    config_default_publication_info = config.REFERENCE_MATCHER_DEFAULT_PUBLICATION_INFO_CONFIG
    config_jcap_and_jhep_publication_info = config.REFERENCE_MATCHER_JHEP_AND_JCAP_PUBLICATION_INFO_CONFIG
    config_data = config.REFERENCE_MATCHER_DATA_CONFIG

    journal_title = get_value(reference, 'reference.publication_info.journal_title')
    config_publication_info = (
        config_jcap_and_jhep_publication_info
        if journal_title in ('JCAP', 'JHEP')
        else config_default_publication_info
    )

    configs = [config_unique_identifiers, config_publication_info, config_data]

    matches = (match_reference_with_config(reference, config, previous_matched_recid) for config in configs)
    matches = (matched_record for matched_record in matches if 'record' in matched_record)
    reference = next(matches, reference)

    return reference
Example no. 22
def import_legacy_orcid_tokens(self):
    """
    Celery task to import OAUTH ORCID tokens from legacy.
    Note: bind=True for compatibility with @time_execution.
    """
    if get_value(current_app.config, 'ORCID_APP_CREDENTIALS.consumer_key') is None:
        return

    for user_data in legacy_orcid_arrays():
        try:
            orcid, token, email, name = user_data
            if push_access_tokens.is_access_token_invalid(token):
                continue
            orcid_to_push = _register_user(name, email, orcid, token)
            if orcid_to_push:
                LOGGER.info(
                    'allow_push now enabled on %s, will push all works now',
                    orcid_to_push
                )
                recids = get_literature_recids_for_orcid(orcid_to_push)
                for recid in recids:
                    orcid_push.apply_async(
                        queue='orcid_push_legacy_tokens',
                        kwargs={
                            'orcid': orcid_to_push,
                            'rec_id': recid,
                            'oauth_token': token,
                        },
                    )
        except SQLAlchemyError as ex:
            LOGGER.exception(ex)

    db.session.commit()
Example no. 23
def map_refextract_to_schema(extracted_references, source=None):
    """Convert refextract output to the schema using the builder."""
    result = []

    for reference in extracted_references:
        rb = ReferenceBuilder()
        mapping = [
            ('author', rb.add_refextract_authors_str),
            ('collaboration', rb.add_collaboration),
            ('doi', rb.add_uid),
            ('hdl', rb.add_uid),
            ('isbn', rb.add_uid),
            ('journal_reference', rb.set_pubnote),
            ('linemarker', rb.set_label),
            ('misc', rb.add_misc),
            ('publisher', rb.set_publisher),
            ('raw_ref', lambda raw_ref: rb.add_raw_reference(raw_ref, source=source)),
            ('reportnumber', rb.add_report_number),
            ('texkey', rb.set_texkey),
            ('title', rb.add_title),
            ('url', rb.add_url),
            ('year', rb.set_year),
        ]

        for field, method in mapping:
            for el in force_list(reference.get(field)):
                if el:
                    method(el)

        if get_value(rb.obj, 'reference.urls'):
            rb.obj['reference']['urls'] = dedupe_list_of_dicts(rb.obj['reference']['urls'])

        result.append(rb.obj)

    return result
Example no. 24
def get_coauthors_neighborhood(signature, radius=10):
    authors = get_value(signature, 'publication.authors', default=[])
    try:
        center = authors.index(signature['author_name'])
        return ' '.join(authors[max(0, center - radius):min(len(authors), center + radius)])
    except ValueError:
        return ' '.join(authors)
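Note that the slice is asymmetric: ``authors[center - radius:center + radius]`` keeps up to ``radius`` names before the author but only ``radius - 1`` after. A small sketch (illustrative data):

signature = {
    'publication': {'authors': ['Aab, A.', 'Bianchi, L.', 'Curie, M.', 'Dirac, P.']},
    'author_name': 'Curie, M.',
}

get_coauthors_neighborhood(signature, radius=1)                       # 'Bianchi, L. Curie, M.'
get_coauthors_neighborhood({'publication': {}, 'author_name': 'X'})   # '' (name not found)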
Example no. 25
def get_conference_record(record, default=None):
    """Return the first Conference record associated with a record.

    Queries the database to fetch the first Conference record referenced
    in the ``publication_info`` of the record.

    Args:
        record(InspireRecord): a record.
        default: value to be returned if no conference record present/found

    Returns:
        InspireRecord: the first Conference record associated with the record.

    Examples:
        >>> record = {
        ...     'publication_info': [
        ...         {
        ...             'conference_record': {
        ...                 '$ref': '/api/conferences/972464',
        ...             },
        ...         },
        ...     ],
        ... }
        >>> conference_record = get_conference_record(record)
        >>> conference_record['control_number']
        972464

    """
    pub_info = get_value(record, 'publication_info.conference_record[0]')
    if not pub_info:
        return default

    conferences = get_db_records([('con', get_recid_from_ref(pub_info))])
    return list(conferences)[0]
Example no. 26
def test_fuzzy_match_returns_true_if_something_matched_with_earliest_date(mock_match, enable_fuzzy_matcher):
    schema = load_schema('hep')
    titles_schema = schema['properties']['titles']

    matched_record = {
        'control_number': 1472986,
        'titles': [
            {
                'title': 'title',
            },
        ],
        'earliest_date': '2016-06-29',
    }

    assert validate(matched_record['titles'], titles_schema) is None

    mock_match.return_value = iter([{'_source': matched_record}])

    data = {}
    extra_data = {}

    obj = MockObj(data, extra_data)
    eng = MockEng()

    assert fuzzy_match(obj, eng)
    assert 'matches' in obj.extra_data

    expected = [{
        'control_number': 1472986,
        'title': 'title',
        'earliest_date': '2016-06-29',
    }]
    result = get_value(obj.extra_data, 'matches.fuzzy')

    assert expected == result
Example no. 27
def test_refextract_url(log_in_as_cataloger, api_client):
    schema = load_schema('hep')
    subschema = schema['properties']['references']

    with requests_mock.Mocker() as requests_mocker:
        requests_mocker.register_uri(
            'GET', 'https://arxiv.org/pdf/1612.06414.pdf',
            content=pkg_resources.resource_string(
                __name__, os.path.join('fixtures', '1612.06414.pdf')),
        )
        requests_mocker.register_uri(
            'GET', 'http://test-indexer:9200/records-hep/hep/_search?_source=control_number',
            content=pkg_resources.resource_string(
                __name__, os.path.join('fixtures', 'es_response.json')),
        )

        response = api_client.post(
            '/editor/refextract/url',
            content_type='application/json',
            data=json.dumps({
                'url': 'https://arxiv.org/pdf/1612.06414.pdf',
            }),
        )
        references = json.loads(response.data)

    assert response.status_code == 200
    assert validate(references, subschema) is None
    assert get_value({'references': references}, 'references.reference.publication_info.journal_title')
Example no. 28
    def parse(self, response):
        """Parse a APS record into a HEP record.

        Attempts to parse an XML JATS full text first, if available, and falls
        back to parsing JSON if such is not available.
        """
        aps_response = json.loads(response.body_as_unicode())

        for article in aps_response['data']:
            doi = get_value(article, 'identifiers.doi', default='')

            if doi:
                request = Request(
                    url='{}/{}'.format(self.aps_base_url, doi),
                    headers={'Accept': 'text/xml'},
                    callback=self._parse_jats,
                    errback=self._parse_json_on_failure,
                )
                request.meta['json_article'] = article
                request.meta['original_response'] = response
                yield request

        # Pagination support. Will yield until no more "next" pages are found
        if 'Link' in response.headers:
            links = link_header.parse(response.headers['Link'])
            next = links.links_by_attr_pairs([('rel', 'next')])
            if next:
                next_url = next[0].href
                yield Request(next_url)
Example no. 29
    def force_each_collaboration_to_be_object(self, data):
        if not data.get('record'):
            collaborations = get_value(data, 'reference.collaborations')
            if collaborations:
                data['reference']['collaborations'] = [
                    {'value': collaboration} for collaboration in collaborations
                ]
        return data
Example no. 30
def get_linked_records_in_field(record, field_path):
    """Get all linked records in a given field.

    Args:
        record (dict): the record containing the links
        field_path (string): a dotted field path specification understandable
            by ``get_value``, containing a json reference to another record.

    Returns:
        Iterator[dict]: an iterator on the linked record.

    Warning:
        Currently, the order in which the linked records are yielded is
        different from the order in which they appear in the record.

    Example:
        >>> record = {'references': [
        ...     {'record': {'$ref': 'https://labs.inspirehep.net/api/literature/1234'}},
        ...     {'record': {'$ref': 'https://labs.inspirehep.net/api/data/421'}},
        ... ]}
        >>> get_linked_records_in_field(record, 'references.record')
        [...]
    """
    full_path = '.'.join([field_path, '$ref'])
    pids = force_list([get_pid_from_record_uri(rec) for rec in get_value(record, full_path, [])])
    return get_db_records(pids)
Example no. 31
    def build_experiment(self, data):
        if get_value(data, "legacy_name") and get_value(data, "project_type"):
            return {
                "_collections": ["Experiments"],
                "$schema": url_for(
                    "invenio_jsonschemas.get_schema",
                    schema_path="records/experiments.json",
                    _external=True,
                ),
                "legacy_name": get_value(data, "legacy_name"),
                "project_type": get_value(data, "project_type"),
            }

        raise InvalidDataError("Experiment is missing a value or values.")
Example no. 32
def test_set_exact_match_as_approved_in_extradata():
    data = {}
    extra_data = {'matches': {'exact': [1, 2]}}

    obj = MockObj(data, extra_data)
    eng = MockEng()

    set_exact_match_as_approved_in_extradata(obj, eng)

    assert get_value(obj.extra_data, 'matches.approved') == 1
Example no. 33
def match_by_doi(record):
    """Match by DOIs."""
    dois = get_value(record, 'dois.value', [])

    result = set()
    for doi in dois:
        query = '0247:"{0}"'.format(doi)
        result.update(search(query))

    return list(result)
Example no. 34
def get_isbn(data, doc_type):
    def hyphenate_if_possible(no_hyphens):
        try:
            return normalize_isbn(no_hyphens)
        except ISBNError:
            return no_hyphens

    isbns = get_value(data, 'isbns.value', [])
    if isbns:
        return ', '.join(hyphenate_if_possible(isbn) for isbn in isbns)
Example no. 35
    def get_isbn(self, data):
        def hyphenate_if_possible(no_hyphens):
            try:
                return normalize_isbn(no_hyphens)
            except ISBNError:
                return no_hyphens

        isbns = get_value(data, "isbns.value", [])
        if isbns:
            return ", ".join(hyphenate_if_possible(isbn) for isbn in isbns)
Example no. 36
    def build_conference(self, data) -> dict:
        """Process data from form and build a conference record.

        Args:
            data (ConferenceRecord): record to serialize

        Return:
            dict: a conference record

        """
        builder = ConferenceBuilder()
        builder.add_title(title=data.get("name"), subtitle=data.get("subtitle"))
        builder.set_short_description(value=data.get("description", ""))
        builder.set_opening_date(get_value(data, "dates[0]"))
        builder.set_closing_date(get_value(data, "dates[1]"))
        builder.add_inspire_categories(data.get("field_of_interest", []))
        builder.add_public_note(value=data.get("additional_info", ""))
        builder.add_series(
            name=data.get("series_name"), number=data.get("series_number")
        )
        for address in data.get("addresses", []):
            builder.add_address(
                cities=[address.get("city")],
                state=address.get("state"),
                place_name=address.get("venue"),
                country_code=country_name_to_code(address.get("country")),
            )
        for contact in data.get("contacts", []):
            builder.add_contact(**contact)
        for acr in data.get("acronyms", []):
            builder.add_acronym(acr)
        for website in data.get("websites", []):
            builder.add_url(website)
        for keyword in data.get("keywords", []):
            builder.add_keyword(value=keyword)

        builder.record["$schema"] = url_for(
            "invenio_jsonschemas.get_schema",
            schema_path="records/conferences.json",
            _external=True,
        )

        return builder.record
Example no. 37
def has_cern_collaboration(record):
    """Check if the record should be part of `CERN:arXiv` set."""
    collaborations = get_value(record, "collaborations.value", default=[])
    collaboration_regex = re.compile(r"NA\W+\d", re.IGNORECASE)
    return any(
        collaboration
        for collaboration in collaborations
        if collaboration.lower() in COLLABORATIONS
        or collaboration_regex.match(collaboration)
    )
Example no. 38
    def get_best_publication_info(data):
        publication_info = get_value(data, "publication_info", [])
        only_publications = [
            entry for entry in publication_info
            if entry.get("material", "publication") == "publication"
        ]
        if not only_publications:
            return {}

        return sorted(only_publications, key=len, reverse=True)[0]
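Because a missing ``material`` key defaults to ``"publication"`` and the surviving entries are sorted by number of fields, the richest publication entry wins. A sketch (assuming the method above is callable as a plain function; the data is illustrative):

data = {'publication_info': [
    {'journal_title': 'JHEP', 'material': 'erratum'},
    {'journal_title': 'JHEP', 'journal_volume': '05', 'year': 2020},
]}

get_best_publication_info(data)
# {'journal_title': 'JHEP', 'journal_volume': '05', 'year': 2020}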
Example no. 39
    def normalize_journal_title(self, reference):
        try:
            journal_title = get_value(
                reference, "reference.publication_info.journal_title")
            reference["reference"]["publication_info"]["journal_title"] = \
                JournalsSearch().normalize_title(journal_title)
        except KeyError:
            pass
        return reference
Example no. 40
def test_fuzzy_match_returns_true_if_something_matched_with_publication_info(mock_match, enable_fuzzy_matcher):
    schema = load_schema('hep')
    publication_info_schema = schema['properties']['publication_info']
    titles_schema = schema['properties']['titles']

    matched_record = {
        'control_number': 1472986,
        'titles': [
            {
                'title': 'title',
            },
        ],
        'publication_info': [
            {
                'artid': '054021',
                'journal_issue': '5',
                'journal_title': 'Phys.Rev.D',
                'journal_volume': '94',
                'pubinfo_freetext': 'Phys. Rev. D94 (2016) 054021',
                'year': 2016
            },
        ],
    }

    assert validate(matched_record['titles'], titles_schema) is None
    assert validate(matched_record['publication_info'], publication_info_schema) is None

    mock_match.return_value = iter([{'_source': matched_record}])

    data = {}
    extra_data = {}

    obj = MockObj(data, extra_data)
    eng = MockEng()

    assert fuzzy_match(obj, eng)
    assert 'matches' in obj.extra_data

    expected = [{
        'control_number': 1472986,
        'title': 'title',
        'publication_info': [
            {
                'artid': '054021',
                'journal_issue': '5',
                'journal_title': 'Phys.Rev.D',
                'journal_volume': '94',
                'pubinfo_freetext': 'Phys. Rev. D94 (2016) 054021',
                'year': 2016
            },
        ],
    }]
    result = get_value(obj.extra_data, 'matches.fuzzy')

    assert expected == result
Example no. 41
def match_literature_author(author, record):
    configs = [
        current_app.config["AUTHOR_MATCHER_NAME_CONFIG"],
        current_app.config["AUTHOR_MATCHER_NAME_INITIALS_CONFIG"],
    ]

    validators = [(collaboration_validator, affiliations_validator), None]

    parsed_name = ParsedName.loads(author.get("full_name"))
    author_matcher_data = {
        "first_name": parsed_name.first,
        "last_name": parsed_name.last,
        "full_name": author.get("full_name"),
        "collaborations": get_value(record, "collaborations.value", []),
        "affiliations": get_value(author, "affiliations.value", []),
    }

    for config, validator in zip(configs, validators):
        matched_records = match_literature_author_with_config(
            author_matcher_data, config
        )
        matched_author_data = (
            get_reference_and_bai_if_unambiguous_literature_author_match(
                matched_records
            )
        )
        if not matched_author_data and validator:
            for validator_function in validator:
                valid_matches = (
                    match
                    for match in matched_records
                    if validator_function(author_matcher_data, match)
                )
                matched_author_data = (
                    get_reference_and_bai_if_unambiguous_literature_author_match(
                        valid_matches
                    )
                )
                if matched_author_data:
                    break
        if matched_author_data:
            return matched_author_data
Example no. 42
def _get_hep_record_brief(hep_record):
    brief = {
        'control_number': hep_record['control_number'],
        'title': get_value(hep_record, 'titles[0].title'),
    }

    abstract = get_value(hep_record, 'abstracts[0].value')
    if abstract is not None:
        brief['abstract'] = abstract

    arxiv_eprint = get_value(hep_record, 'arxiv_eprints[0].value')
    if arxiv_eprint is not None:
        brief['arxiv_eprint'] = arxiv_eprint

    number_of_pages = get_value(hep_record, 'number_of_pages')
    if number_of_pages is not None:
        brief['number_of_pages'] = number_of_pages

    earliest_date = get_value(hep_record, 'earliest_date')
    if earliest_date is not None:
        brief['earliest_date'] = earliest_date

    authors = hep_record.get('authors')
    if authors is not None:
        brief['authors_count'] = len(authors)
        author_briefs = []
        for author in authors[:3]:
            author_briefs.append({'full_name': author['full_name']})
        brief['authors'] = author_briefs

    public_notes = hep_record.get('public_notes')
    if public_notes is not None:
        public_notes_value = []
        for public_note in public_notes:
            public_notes_value.append({'value': public_note['value']})
        brief['public_notes'] = public_notes_value

    publication_info = hep_record.get('publication_info')
    if publication_info is not None:
        brief['publication_info'] = publication_info

    return brief
Example no. 43
def keep_only_update_source_in_field(field, root, head, update):
    """Remove elements from root and head where ``source`` matches the update.

    This is useful if the update needs to overwrite all elements with the same
    source.

    .. note::
        If the update doesn't contain exactly one source in ``field``, the
        records are returned with no modifications.

    Args:
        field (str): the field to filter out.
        root (pmap): the root record, whose ``field`` will be cleaned.
        head (pmap): the head record, whose ``field`` will be cleaned.
        update (pmap): the update record, from which the ``source`` is read.

    Returns:
        tuple: ``(root, head, update)`` with some elements filtered out from
            ``root`` and ``head``.
    """
    update_thawed = thaw(update)
    update_sources = {
        source.lower()
        for source in get_value(update_thawed, '.'.join([field, 'source']), [])
    }
    if not update_sources:
        # If there is no field or source then fall back to ``acquisition_source.source``
        source = get_value(update_thawed, "acquisition_source.source")
        if source:
            update_sources = {source.lower()}
    if len(update_sources) != 1:
        return root, head, update
    source = update_sources.pop()

    if field in root:
        root = root.set(field,
                        remove_elements_with_source(source, root[field]))
    if field in head:
        head = head.set(field,
                        remove_elements_with_source(source, head[field]))

    return root, head, update
Example no. 44
    def _classify_paper(obj, eng):
        from flask import current_app
        params = dict(
            taxonomy_name=taxonomy or current_app.config['HEP_ONTOLOGY_FILE'],
            output_mode='dict',
            output_limit=output_limit,
            spires=spires,
            match_mode=match_mode,
            no_cache=no_cache,
            with_author_keywords=with_author_keywords,
            rebuild_cache=rebuild_cache,
            only_core_tags=only_core_tags,
            extract_acronyms=extract_acronyms,
        )

        fulltext_used = True
        with get_document_in_workflow(obj) as tmp_document:
            try:
                if tmp_document:
                    result = get_keywords_from_local_file(
                        tmp_document, **params)
                else:
                    data = get_value(obj.data, 'titles.title', [])
                    data.extend(get_value(obj.data, 'titles.subtitle', []))
                    data.extend(get_value(obj.data, 'abstracts.value', []))
                    data.extend(get_value(obj.data, 'keywords.value', []))
                    if not data:
                        obj.log.error(
                            "No classification done due to missing data.")
                        return
                    result = get_keywords_from_text(data, **params)
                    fulltext_used = False
            except ClassifierException as e:
                obj.log.exception(e)
                return

        result['complete_output'] = clean_instances_from_data(
            result.get("complete_output", {}))
        result["fulltext_used"] = fulltext_used

        # Check if it is not empty output before adding
        if any(result.get("complete_output", {}).values()):
            obj.extra_data['classifier_results'] = result
Example no. 45
    def _match_with_invenio_matcher(obj, eng):
        from invenio_matcher.api import match as _match

        if queries is None:
            queries_ = [{
                'type': 'exact',
                'match': 'dois.value'
            }, {
                'type': 'exact',
                'match': 'arxiv_eprints.value'
            }]
        else:
            queries_ = queries

        record_matches = {
            "recids": [],
            "records": [],
            "base_url": os.path.join(current_app.config["SERVER_NAME"],
                                     'record')
        }

        record = {}
        record['dois.value'] = get_value(obj.data, 'dois.value')
        record['arxiv_eprints.value'] = get_value(obj.data,
                                                  'arxiv_eprints.value')
        for matched_record in _match(record,
                                     queries=queries_,
                                     index=index,
                                     doc_type=doc_type):
            matched_recid = matched_record.record.get('id')
            record_matches['recids'].append(matched_recid)
            record_matches['records'].append({
                "source": matched_record.record.dumps(),
                "score": matched_record.score,
            })

        if len(record_matches['recids']) > 0:
            obj.extra_data["record_matches"] = record_matches
            return True
        return False
Example no. 46
def _link_user_and_token(user, name, orcid, token):
    """Create a link between a user and token, if possible.

    Args:
        user (invenio_oauthclient.models.User): an existing user object to connect the token to
        name (string): user's full name
        orcid (string): user's ORCID identifier
        token (string): OAuth token for the user

    Returns:
        str: the ORCID associated with the new token if we created one, or the
        ORCID associated with the token whose ``allow_push`` flag changed state.

    """
    result = None

    try:
        # Link user and ORCID
        oauth_link_external_id(user, {
            'id': orcid,
            'method': 'orcid'
        })
    except AlreadyLinkedError:
        # User already has their ORCID linked
        pass

    # Check whether there are already tokens associated with this
    # ORCID identifier.
    tokens = RemoteToken.query.join(RemoteAccount).join(User)\
        .join(UserIdentity).filter(UserIdentity.id == orcid).all()

    if tokens:
        # Force the allow_push.
        with db.session.begin_nested():
            for token in tokens:
                if not token.remote_account.extra_data['allow_push']:
                    result = orcid
                token.remote_account.extra_data['allow_push'] = True
    else:
        # If not, create and put the token entry
        with db.session.begin_nested():
            result = orcid
            RemoteToken.create(
                user_id=user.id,
                client_id=get_value(current_app.config, 'ORCID_APP_CREDENTIALS.consumer_key'),
                token=token,
                secret=None,
                extra_data={
                    'orcid': orcid,
                    'full_name': name,
                    'allow_push': True,
                }
            )

    return result
Example no. 47
def has_cern_accelerator_experiment(record):
    """Check if the record should be part of `CERN:arXiv` set."""
    accelerator_experiments = get_value(
        record, "accelerator_experiments.legacy_name", default=[]
    )
    return any(
        experiment
        for experiment in accelerator_experiments
        if experiment.lower() in ACCELERATOR_EXPERIMENTS_NAMES
        or experiment.lower().startswith("cern")
    )
Example no. 48
    def generate_bai(cls, data):
        name = get_value(data, "name.value")
        bai = ".".join(format_name(name, initials_only=True).split())
        bai = unidecode(bai)
        bai = "".join(
            filter(lambda x: x in set(string.ascii_letters + "."), bai))
        bai = re.sub(r"\.+", ".", bai).lstrip(".")
        if not bai.endswith("."):
            bai = f"{bai}."
        next_bai_number = cls.next_bai_number(bai)
        return f"{bai}{next_bai_number}"
Example no. 49
def _get_hal_id_map(record):
    affiliation_records = chain.from_iterable(
        get_value(record, 'authors.affiliations.record', default=[]))
    affiliation_recids = [get_recid_from_ref(el) for el in affiliation_records]

    try:
        institutions = get_es_records('ins', affiliation_recids)
    except RequestError:
        institutions = []

    return {el['control_number']: _get_hal_id(el) for el in institutions}
Example no. 50
def add_arxiv_categories(record, blob):
    if not record.get('arxiv_eprints') or not blob.get('65017'):
        return record

    for category in force_list(get_value(blob, '65017')):
        if category.get('2') == 'arXiv' and category.get('a'):
            record['arxiv_eprints'][0]['categories'].append(
                normalize_arxiv_category(category['a'])
            )

    return record
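A sketch of the merge (assuming ``normalize_arxiv_category`` returns an already-normalized category unchanged; the MARC blob is illustrative):

record = {'arxiv_eprints': [{'categories': ['hep-ph'], 'value': '1612.08928'}]}
blob = {'65017': [{'2': 'arXiv', 'a': 'hep-th'}]}

add_arxiv_categories(record, blob)
# record['arxiv_eprints'][0]['categories'] == ['hep-ph', 'hep-th']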
Example no. 51
def populate_bookautocomplete(record):
    """Populate the ```bookautocomplete`` field of Literature records."""
    paths = [
        'imprints.date',
        'imprints.publisher',
        'isbns.value',
    ]

    authors = force_list(get_value(record, 'authors.full_name', default=[]))
    titles = force_list(get_value(record, 'titles.title', default=[]))

    input_values = list(chain.from_iterable(
        force_list(get_value(record, path, default=[])) for path in paths))
    input_values.extend(authors)
    input_values.extend(titles)
    input_values = [el for el in input_values if el]

    record['bookautocomplete'] = {
        'input': input_values,
    }
Example no. 52
    def get_arxiv_eprints(self, data):
        arxiv_eprint = data.pop("arxiv_eprint", None)
        arxiv_eprints = data.get("arxiv_eprints")
        if arxiv_eprint:
            data["arxiv_eprint"] = force_list({"value": arxiv_eprint})
        elif arxiv_eprints:
            data["arxiv_eprint"] = force_list(
                {"value": get_value(data, "arxiv_eprints[0].value", default=missing)}
            )
        data.pop("arxiv_eprints", None)
        return data.get("arxiv_eprint", missing)
Example no. 53
def get_journal_coverage(obj, eng):
    """Return the journal coverage that this article belongs to."""
    journals = replace_refs(get_value(obj.data, 'publication_info.journal_record'), 'db')

    if not journals:
        return

    if any(journal.get('_harvesting_info', {}).get('coverage') == 'full' for journal in journals):
        obj.extra_data['journal_coverage'] = 'full'
    else:
        obj.extra_data['journal_coverage'] = 'partial'
Example no. 54
def test_get_value_works_on_lists():
    record = [
        {
            'foo': 'bar'
        },
    ]

    expected = ['bar']
    result = get_value(record, 'foo')

    assert expected == result
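The path syntax used throughout these examples combines dotted keys with indexing; a few more cases for reference, matching the behaviour exercised by the snippets above:

record = {'titles': [{'title': 'first'}, {'title': 'second'}]}

get_value(record, 'titles.title')                 # ['first', 'second']
get_value(record, 'titles[0].title')              # 'first'
get_value(record, 'titles.title[0]')              # 'first'
get_value(record, 'titles.subtitle', default=[])  # []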
Example no. 55
    def before_dump(self, data):
        family_name, given_name = self.get_name_splitted(data)
        return {
            'advisors': get_value(data, 'advisors', default=missing),
            'acquisition_source': get_value(data, 'acquisition_source', default=missing),
            'arxiv_categories': get_value(data, 'arxiv_categories', default=missing),
            'blog': self.get_first_or_missing(
                self.get_value_by_description_key(data.get('urls', []), 'blog')),
            'display_name': get_value(data, 'name.preferred_name', default=missing),
            'family_name': self.get_value_or_missing(family_name),
            'given_name': self.get_value_or_missing(given_name),
            'linkedin': self.get_first_or_missing(
                get_values_for_schema(data.get('ids', []), 'LINKEDIN')),
            'native_name': get_value(data, 'name.native_names[0]', default=missing),
            'orcid': self.get_first_or_missing(
                get_values_for_schema(data.get('ids', []), 'ORCID')),
            'positions': get_value(data, 'positions', default=missing),
            'project_membership': get_value(data, 'project_membership', default=missing),
            'public_emails': get_value(data, 'email_addresses.value', default=missing),
            'status': get_value(data, 'status', default=missing),
            'twitter': self.get_first_or_missing(
                get_values_for_schema(data.get('ids', []), 'TWITTER')),
            'websites': get_value(data, 'urls.value', default=missing),
        }
Example no. 56
    def publication_date(self):
        """(Partial) date of publication.

        Returns:
            partial_date (inspire_utils.date.PartialDate): publication date
        """
        try:
            return PartialDate.loads(
                get_value(self.record, 'imprints.date[0]')
                or get_publication_date(self.record))
        except ValueError:
            return None
Example no. 57
    def orcid_role_for_inspire_author(self, author):
        """ORCID role for an INSPIRE author field.

        Args:
            author (dict): an author field from INSPIRE literature record

        Returns:
            string: ORCID role of a person
        """
        inspire_roles = sorted(get_value(author, 'inspire_roles', ['author']))
        if inspire_roles:
            return self.INSPIRE_TO_ORCID_ROLES_MAP[inspire_roles[0]]
Example no. 58
def update_moved_orcid(old_orcid, new_orcid):
    author_record = AuthorsRecord.get_record_by_pid_value(old_orcid, "orcid")
    if new_orcid not in get_value(author_record, "ids.value", []):
        new_author_ids = [
            {"schema": "ORCID", "value": new_orcid},
            *author_record["ids"],
        ]
        author_record["ids"] = new_author_ids
        author_record.update(dict(author_record))
    remove_access_token_for_orcid_account(old_orcid, new_orcid)
    db.session.commit()
    LOGGER.info("ORCID updated", new_orcid=new_orcid, old_orcid=old_orcid)
Example no. 59
    def get_linked_book(self, data):
        parent = get_parent_record(data)
        if parent and "titles" in parent and "control_number" in parent:
            endpoint = PidStoreBase.get_endpoint_from_pid_type(
                PidStoreBase.get_pid_type_from_schema(data["$schema"]))
            endpoint_item = f"invenio_records_rest.{endpoint}_item"
            ref = get_value(parent, "self.$ref") or url_for(
                endpoint_item,
                pid_value=parent["control_number"],
                _external=True)
            return {**parent["titles"][0], "record": {"$ref": ref}}
        return None
Example no. 60
    def publication_date(self):
        """(Partial) date of publication.

        Returns:
            partial_date (inspire_utils.date.PartialDate): publication date
        """
        try:
            return PartialDate.loads(
                get_value(self.record, "imprints.date[0]")
                or LiteratureReader(self.record).publication_date)
        except ValueError:
            return None