Example #1
def get_orcids_for_push(record):
    """Obtain the ORCIDs associated to the list of authors in the Literature record.

    The ORCIDs are looked up both in the ``ids`` of the ``authors`` and in the
    Author records that have claimed the paper.

    Args:
        record (dict): metadata from a Literature record

    Returns:
        Iterator[str]: all ORCIDs associated with these authors
    """
    orcids_on_record = []
    author_recids_with_claims = []

    for author in record.get('authors', []):
        orcids_in_author = get_values_for_schema(author.get('ids', []),
                                                 'ORCID')
        if orcids_in_author:
            orcids_on_record.extend(orcids_in_author)
        elif author.get('curated_relation') is True and 'record' in author:
            author_recids_with_claims.append(
                get_recid_from_ref(author['record']))

    author_records = get_db_records('aut', author_recids_with_claims)
    all_ids = (author.get('ids', []) for author in author_records)
    orcids_in_authors = chain.from_iterable(
        get_values_for_schema(ids, 'ORCID') for ids in all_ids)

    return chain(orcids_on_record, orcids_in_authors)
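For illustration, here is a minimal sketch of the id filtering this loop relies on, assuming (as the name suggests) that ``get_values_for_schema`` returns the ``value`` of every id whose ``schema`` matches. The stub and the sample record below are illustrative, not the library implementation:

# Illustrative stand-in for inspire_utils' get_values_for_schema, assumed
# to filter id dicts on their 'schema' key and return the matching values.
def get_values_for_schema(ids, schema):
    return [id_['value'] for id_ in ids if id_.get('schema') == schema]

sample_record = {'authors': [
    # ORCID present directly in the author's ids.
    {'ids': [{'schema': 'ORCID', 'value': '0000-0002-1825-0097'}]},
    # No ORCID here, but the author has claimed the paper, so the ORCID
    # would be fetched from the linked Author record instead.
    {'curated_relation': True,
     'record': {'$ref': 'https://labs.inspirehep.net/api/authors/983059'}},
]}

print(get_values_for_schema(sample_record['authors'][0]['ids'], 'ORCID'))
# ['0000-0002-1825-0097']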
Example #2
def get_linked_records_in_field(record, field_path):
    """Get all linked records in a given field.

    Args:
        record (dict): the record containing the links
        field_path (string): a dotted field path specification understandable
            by ``get_value``, containing a JSON reference to another record.

    Returns:
        Iterator[dict]: an iterator over the linked records.

    Warning:
        Currently, the order in which the linked records are yielded is
        different from the order in which they appear in the record.

    Example:
        >>> record = {'references': [
        ...     {'record': {'$ref': 'https://labs.inspirehep.net/api/literature/1234'}},
        ...     {'record': {'$ref': 'https://labs.inspirehep.net/api/data/421'}},
        ... ]}
        >>> get_linked_records_in_field(record, 'references.record')
        [...]
    """
    full_path = '.'.join([field_path, '$ref'])
    pids = force_list([
        get_pid_from_record_uri(uri)
        for uri in get_value(record, full_path, [])
    ])
    return get_db_records(pids)
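The URI-to-pid translation is delegated to ``get_pid_from_record_uri``. A hedged sketch of what such a mapping could look like, with a hypothetical ``ENDPOINT_TO_PID_TYPE`` table standing in for the real logic in the inspirehep codebase:

from urllib.parse import urlparse

# Hypothetical endpoint-to-pid-type table; illustrative values only.
ENDPOINT_TO_PID_TYPE = {'literature': 'lit', 'data': 'dat', 'authors': 'aut'}

def pid_from_record_uri_sketch(uri):
    # e.g. 'https://labs.inspirehep.net/api/literature/1234' -> ('lit', '1234')
    parts = urlparse(uri).path.rstrip('/').split('/')
    endpoint, recid = parts[-2], parts[-1]
    return ENDPOINT_TO_PID_TYPE[endpoint], recid

print(pid_from_record_uri_sketch('https://labs.inspirehep.net/api/literature/1234'))
# ('lit', '1234')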
Example #3
def get_orcids_for_push(record):
    """Obtain the ORCIDs associated to the list of authors in the Literature record.

    The ORCIDs are looked up both in the ``ids`` of the ``authors`` and in the
    Author records that have claimed the paper.

    Args:
        record (dict): metadata from a Literature record

    Returns:
        Iterator[str]: all ORCIDs associated with these authors
    """
    orcids_on_record = []
    author_recids_with_claims = []

    for author in record.get('authors', []):
        orcids_in_author = get_values_for_schema(author.get('ids', []), 'ORCID')
        if orcids_in_author:
            orcids_on_record.extend(orcids_in_author)
        elif author.get('curated_relation') is True and 'record' in author:
            author_recids_with_claims.append(get_recid_from_ref(author['record']))

    author_records = get_db_records(('aut', recid) for recid in author_recids_with_claims)
    all_ids = (author.get('ids', []) for author in author_records)
    orcids_in_authors = chain.from_iterable(get_values_for_schema(ids, 'ORCID') for ids in all_ids)

    return chain(orcids_on_record, orcids_in_authors)
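Note that, unlike Example #1, this version passes ``get_db_records`` a single iterable of ``(pid_type, recid)`` tuples rather than a pid type plus a list of recids; the tests below exercise both calling conventions.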
Example #4
def get_linked_records_in_field(record, field_path):
    """Get all linked records in a given field.

    Args:
        record (dict): the record containing the links
        field_path (string): a dotted field path specification understandable
            by ``get_value``, containing a JSON reference to another record.

    Returns:
        Iterator[dict]: an iterator over the linked records.

    Warning:
        Currently, the order in which the linked records are yielded is
        different from the order in which they appear in the record.

    Example:
        >>> record = {'references': [
        ...     {'record': {'$ref': 'https://labs.inspirehep.net/api/literature/1234'}},
        ...     {'record': {'$ref': 'https://labs.inspirehep.net/api/data/421'}},
        ... ]}
        >>> get_linked_records_in_field(record, 'references.record')
        [...]
    """
    full_path = '.'.join([field_path, '$ref'])
    pids = force_list([get_pid_from_record_uri(uri) for uri in get_value(record, full_path, [])])
    return get_db_records(pids)
Example #5
def test_get_db_records_finds_right_results(app):
    literature = [1498175, 1090628]
    authors = [983059]

    results = list(get_db_records('lit', literature + authors))
    recids = {result['control_number'] for result in results}

    assert len(results) == len(literature)
    assert recids == set(literature)
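Note that the author recid is looked up under the ``lit`` pid type here, so no Literature record matches it and it is silently dropped: only the two literature recids come back.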
Example #6
def test_get_db_records_accept_multiple_pid_types(app):
    records = [('lit', 1498175), ('lit', 1090628), ('aut', 983059)]

    results = list(get_db_records(records))

    assert len(results) == 3
Example #7
def test_get_db_records_accepts_string_pid_values(app):
    records = list(get_db_records([('lit', '4328')]))

    assert len(records) == 1
Example #8
def test_get_db_records_accepts_integer_pid_values(app):
    records = list(get_db_records([('lit', 4328)]))

    assert len(records) == 1
Example #9
def test_get_db_records_handles_empty_lists(app):
    assert list(get_db_records([])) == []
Example #10
def find_arxiv_duplicates(starting_date=None):
    """Finds arXiv paper created more than once.

    After duplicates are detected, merged/deleted records are filtered out.

    Example:
        >>> from datetime import datetime
        >>> date = datetime(2019, 1, 10)
        >>> dups = find_arxiv_duplicates(date)
        >>> dups
        {'1812.10562': {'recids': [1711898, 1711919], 'wf_ids': [1387687, 1388915]}}

    Args:
        starting_date (datetime): date from which starting to search workflows

    Returns:
        dict: a dictionary keyed by arXiv id, where each value is a
        dictionary with two keys: `recids`, the list of control numbers
        of the papers referring to that arXiv id, and `wf_ids`, the list
        of ids of the workflows that created the related records.
    """
    query = WorkflowObjectModel.query.\
        with_entities(WorkflowObjectModel.id).\
        filter(WorkflowObjectModel.status == ObjectStatus.COMPLETED).\
        filter(cast(WorkflowObjectModel.data, JSONB).has_key('arxiv_eprints')).\
        filter(cast(WorkflowObjectModel.data, JSONB).has_key('control_number'))   # noqa: W601

    if starting_date:
        query = query.filter(WorkflowObjectModel.created > starting_date)

    workflows_ids = [wf_id for (wf_id,) in query.all()]
    duplicates = dict()

    with click.progressbar(workflows_ids, label='Processing workflows') as wf_iterator:
        for wf_id in wf_iterator:
            wf = workflow_object_class.get(wf_id)
            recid = wf.data.get('control_number')
            arxiv_id = wf.data['arxiv_eprints'][0]['value']

            if arxiv_id not in duplicates:
                duplicates[arxiv_id] = {'recids': [], 'wf_ids': []}

            if recid not in duplicates[arxiv_id]['recids']:
                duplicates[arxiv_id]['recids'].append(recid)
                duplicates[arxiv_id]['wf_ids'].append(wf_id)

    duplicates = {k: v for (k, v) in duplicates.items() if len(v['recids']) > 1}

    with click.progressbar(duplicates, label='Removing merged records') as rec_iterator:
        for arxiv_id in rec_iterator:
            pids = [('lit', _id) for _id in duplicates[arxiv_id]['recids']]
            records = get_db_records(pids)

            for rec in records:
                if rec.get('deleted'):
                    idx = duplicates[arxiv_id]['recids'].index(rec['control_number'])
                    del duplicates[arxiv_id]['recids'][idx]
                    del duplicates[arxiv_id]['wf_ids'][idx]

    click.secho('Found %d arXiv duplicates' % len(duplicates))
    click.secho(json.dumps(duplicates))
    return duplicates
Example #11
def find_arxiv_duplicates(starting_date=None):
    """Finds arXiv paper created more than once.

    After duplicates are detected, merged/deleted records are filtered out.

    Example:
        >>> from datetime import datetime
        >>> date = datetime(2019, 1, 10)
        >>> dups = find_arxiv_duplicates(date)
        >>> dups
        {'1812.10562': {'recids': [1711898, 1711919], 'wf_ids': [1387687, 1388915]}}

    Args:
        starting_date (datetime): date from which starting to search workflows

    Returns:
        dict: a dictionary keyed by arXiv id, where each value is a
        dictionary with two keys: `recids`, the list of control numbers
        of the papers referring to that arXiv id, and `wf_ids`, the list
        of ids of the workflows that created the related records.
    """
    query = WorkflowObjectModel.query.\
        with_entities(WorkflowObjectModel.id).\
        filter(WorkflowObjectModel.status == ObjectStatus.COMPLETED).\
        filter(cast(WorkflowObjectModel.data, JSONB).has_key('arxiv_eprints')).\
        filter(cast(WorkflowObjectModel.data, JSONB).has_key('control_number'))   # noqa: W601

    if starting_date:
        query = query.filter(WorkflowObjectModel.created > starting_date)

    workflows_ids = [wf_id for (wf_id, ) in query.all()]
    duplicates = dict()

    with click.progressbar(workflows_ids,
                           label='Processing workflows') as wf_iterator:
        for wf_id in wf_iterator:
            wf = workflow_object_class.get(wf_id)
            recid = wf.data.get('control_number')
            arxiv_id = wf.data['arxiv_eprints'][0]['value']

            if arxiv_id not in duplicates:
                duplicates[arxiv_id] = {'recids': [], 'wf_ids': []}

            if recid not in duplicates[arxiv_id]['recids']:
                duplicates[arxiv_id]['recids'].append(recid)
                duplicates[arxiv_id]['wf_ids'].append(wf_id)

    duplicates = {
        k: v
        for (k, v) in duplicates.items() if len(v['recids']) > 1
    }

    with click.progressbar(duplicates,
                           label='Removing merged records') as rec_iterator:
        for arxiv_id in rec_iterator:
            pids = [('lit', _id) for _id in duplicates[arxiv_id]['recids']]
            records = get_db_records(pids)

            for rec in records:
                if rec.get('deleted'):
                    idx = duplicates[arxiv_id]['recids'].index(
                        rec['control_number'])
                    del duplicates[arxiv_id]['recids'][idx]
                    del duplicates[arxiv_id]['wf_ids'][idx]

    click.secho('Found %d arXiv duplicates' % len(duplicates))
    click.secho(json.dumps(duplicates))
    return duplicates
Example #12
def test_get_db_records_accepts_lists_of_strings(app):
    records = list(get_db_records('lit', ['4328']))

    assert len(records) == 1
Example #13
def test_get_db_records_accepts_lists_of_integers(app):
    records = list(get_db_records('lit', [4328]))

    assert len(records) == 1
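Taken together, these tests suggest that ``get_db_records`` accepts either a single iterable of ``(pid_type, recid)`` tuples or, in an older form, a pid type plus a list of recids given as strings or integers. A minimal stub reproducing just that calling convention, with an in-memory dict standing in for the database:

# In-memory stand-in for the database; keys are (pid_type, str(recid)).
FAKE_DB = {('lit', '4328'): {'control_number': 4328}}

def get_db_records_stub(pids_or_pid_type, recids=None):
    """Mirror the two calling conventions the tests above exercise."""
    if recids is None:
        # Newer form: a single iterable of (pid_type, recid) tuples.
        pids = pids_or_pid_type
    else:
        # Older form: a pid type plus a list of recids.
        pids = ((pids_or_pid_type, recid) for recid in recids)
    for pid_type, recid in pids:
        record = FAKE_DB.get((pid_type, str(recid)))
        if record is not None:
            yield record

assert len(list(get_db_records_stub([('lit', 4328)]))) == 1  # tuple form
assert len(list(get_db_records_stub('lit', ['4328']))) == 1  # legacy form
assert list(get_db_records_stub([])) == []                   # empty input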