def test_merging_full_name_field_keeps_longest_name():
    """Merging three spellings of one author yields head's (longest) name.

    The merged record must equal ``head`` and no conflicts are expected.
    """
    def record_with_author(full_name):
        # Build a fresh single-author record on every call so that root,
        # head and update never share mutable state.
        return {'authors': [{'full_name': full_name}]}

    root = record_with_author('Pitts Kevin')
    head = record_with_author('Pitts, Kevin John')
    update = record_with_author('Pitts, Kevin')

    result, conflicts = merge(root, head, update, head_source='arxiv')

    # The expectation is the head record itself (same object as passed in).
    assert result == head
    assert_ordered_conflicts(conflicts, [])
    validate_subschema(result)
def test_comparing_publication_info():
    """Head and update share journal title/volume; update also has ``artid``.

    The merged record is expected to equal the update, with no conflicts.
    """
    journal_fields = {
        'journal_title': 'J. Testing',
        'journal_volume': '42',
    }
    root = {}
    # ``dict(...)`` copies, so head and update hold independent entries.
    head = {'publication_info': [dict(journal_fields)]}
    update = {'publication_info': [dict(journal_fields, artid='foo')]}

    # ``update`` is passed twice on purpose: it is also the expected result.
    root, head, update, expected = add_arxiv_source(root, head, update, update)
    result, conflicts = merge(root, head, update, head_source='arxiv')

    result = add_arxiv_source(result)
    assert result == expected
    assert_ordered_conflicts(conflicts, [])
    validate_subschema(result)
def test_merging_inspire_categories_field():
    """Head's ``inspire_categories`` are expected to survive the merge as-is.

    Root, head and update each carry categories from a different source;
    the merged record must equal ``head`` and no conflicts are expected.
    """
    def categories(source, *terms):
        # One category entry per term, all attributed to the same source.
        return {
            'inspire_categories': [
                {'source': source, 'term': term} for term in terms
            ]
        }

    root = categories('INSPIRE', 'Theory-HEP')
    head = categories('curator', 'Theory-HEP', 'Theory-Nucl')
    update = categories('arxiv', 'Computing', 'Other')

    result, conflicts = merge(root, head, update, head_source='arxiv')

    assert result == head
    assert_ordered_conflicts(conflicts, [])
    validate_subschema(result)
# Example #4
# 0
def test_comparing_authors_unicode_name():
    """An accented author name in head and its unaccented form in update.

    The merged record is expected to equal head (the accented spelling),
    with no conflicts.
    """
    accented = {'authors': [{'full_name': 'Ortín, Tomás'}]}
    unaccented = {'authors': [{'full_name': 'Ortin, Tomas'}]}

    # Arguments are (root, head, update, expected); head doubles as the
    # expected record, so the same object is passed twice.
    root, head, update, expected = add_arxiv_source(
        {}, accented, unaccented, accented)
    result, conflicts = merge(root, head, update, head_source='arxiv')

    result = add_arxiv_source(result)
    assert result == expected
    assert_ordered_conflicts(conflicts, [])
    validate_subschema(result)
# Example #5
# 0
def test_comparing_publication_info_with_cnum():
    """Conference entries with the same cnum; update adds ``conference_record``.

    The merged record is expected to equal the update, with no conflicts.
    """
    head_entry = {
        "artid": "WEPAB127",
        "cnum": "C21-05-24.3",
        "conf_acronym": "IPAC2021",
        "year": 2021,
    }
    update_entry = {
        "artid": "WEPAB127",
        "cnum": "C21-05-24.3",
        "conf_acronym": "IPAC2021",
        "conference_record": {
            "$ref": "https://inspirehep.net/api/conferences/1853162",
        },
        "year": 2021,
    }
    root = {}
    head = {'publication_info': [head_entry]}
    update = {'publication_info': [update_entry]}

    # ``update`` is passed twice on purpose: it is also the expected result.
    root, head, update, expected = add_arxiv_source(
        root, head, update, update)
    result, conflicts = merge(root, head, update, head_source='arxiv')

    result = add_arxiv_source(result)
    assert result == expected
    assert_ordered_conflicts(conflicts, [])
    validate_subschema(result)
# Example #6
# 0
def test_merging_same_documents_arxiv_on_arxiv(fake_get_config):
    """Merging one record against itself returns that record unchanged.

    The very same object is used as root, head and update; the result must
    equal it and no conflicts are expected.
    """
    pdf_document = {
        "key": "pdf1.pdf",
        "description": "paper",
        "source": "arXiv",
        "fulltext": True,
        "url": "http://example.com/files/1234-1234-1234-1234/pdf1.pdf",
    }
    tex_document = {
        "key": "pdf.tex",
        "description": "latex version",
        "source": "arXiv",
        "url": "http://example.com/files/1234-1234-1234-1234/pdf.tex",
    }
    record = {"documents": [pdf_document, tex_document]}

    # root, head and update are intentionally one and the same object.
    result, conflicts = merge(record, record, record)

    assert result == record
    assert_ordered_conflicts(conflicts, [])
    validate_subschema(result)
# Example #7
# 0
def test_merging_acquisition_source_publisher_on_arxiv(fake_get_config):
    """Update's ``acquisition_source`` is expected to replace the arXiv one.

    Root and head hold equal arXiv acquisition data; the update comes from
    another source. The merged record must equal the update, no conflicts.
    """
    def arxiv_record():
        # Fresh dict per call: root and head are equal but distinct objects.
        return {
            "acquisition_source": {
                "datetime": "2021-05-11T02:35:43.387350",
                "method": "hepcrawl",
                "source": "arXiv",
                "submission_number": "c8a0e3e0b20011eb8d930a580a6402c0",
            }
        }

    root = arxiv_record()
    head = arxiv_record()
    update = {
        "acquisition_source": {
            "datetime": "2021-05-12T02:35:43.387350",
            "method": "beard",
            "source": "other source",
            "submission_number": "c8a0e3e0b20011eb8d930a580a6402c1",
        }
    }

    result, conflicts = merge(root, head, update)

    assert result == update
    assert_ordered_conflicts(conflicts, [])
    validate_subschema(result)
# Example #8
# 0
def test_merging_dois_field_handles_repeated_values():
    """DOIs that repeat a value with different metadata are all kept.

    The expected result lists head's two entries followed by update's two
    entries, and no conflicts are expected.
    """
    doi_a = '10.1023/A:1026654312961'
    doi_b = '10.1023/B:1026654312961'

    root = {'dois': [{'material': 'preprint', 'value': doi_a}]}
    head = {
        'dois': [
            {'material': 'publication', 'value': doi_a},
            {'source': 'nowhere', 'value': doi_b},
        ]
    }
    update = {
        'dois': [
            {'material': 'erratum', 'value': doi_a},
            {'material': 'erratum', 'source': 'nowhere', 'value': doi_b},
        ]
    }

    # Spelled out with fresh dicts rather than reusing head/update entries,
    # so the expectation stays independent of the inputs.
    wanted = {
        'dois': [
            {'material': 'publication', 'value': doi_a},
            {'source': 'nowhere', 'value': doi_b},
            {'material': 'erratum', 'value': doi_a},
            {'material': 'erratum', 'source': 'nowhere', 'value': doi_b},
        ]
    }

    result, conflicts = merge(root, head, update, head_source='arxiv')

    assert result == wanted
    assert_ordered_conflicts(conflicts, [])
    validate_subschema(result)
def test_merging_titles_field():
    """A differing update title is reported as a conflict; head's title stays.

    Head adds a subtitle to root's title while update carries another title.
    The merged record is expected to hold head's entry only, and update's
    title is expected as an ``INSERT`` conflict at ``/titles/0``.
    """
    english_title = (
        'ANTARES: An observatory at the seabed '
        'to the confines of the Universe'
    )
    italian_title = 'ANTARES: Un osservatorio foo bar'
    curator_subtitle = 'this subtitle has been added by a curator'

    # record: 1519935
    root = {'titles': [{'source': 'arXiv', 'title': english_title}]}
    head = {
        'titles': [{
            'source': 'arXiv',
            'subtitle': curator_subtitle,
            'title': english_title,
        }]
    }
    update = {'titles': [{'source': 'arXiv', 'title': italian_title}]}

    wanted = {
        'titles': [{
            'source': 'arXiv',
            'subtitle': curator_subtitle,
            'title': english_title,
        }]
    }
    wanted_conflicts = [{
        'path': '/titles/0',
        'op': 'add',
        'value': {'source': 'arXiv', 'title': italian_title},
        '$type': 'INSERT',
    }]

    result, conflicts = merge(root, head, update, head_source='arxiv')

    assert result == wanted
    assert_ordered_conflicts(conflicts, wanted_conflicts)
    validate_subschema(result)
def test_comparing_references_field_different_dois():
    """References with different DOIs are expected to be kept side by side.

    The expected result lists head's reference followed by update's, with
    no conflicts.
    """
    def reference(doi, **extra_fields):
        # ``dois`` goes first, then any extra fields, mirroring the layout
        # of the literal records this helper replaces.
        body = {'dois': [doi]}
        body.update(extra_fields)
        return {'reference': body}

    root = {}
    head = {'references': [reference('10.1099/bar')]}
    update = {
        'references': [reference('10.1099/foo', document_type='article')]
    }
    wanted = {
        'references': [
            reference('10.1099/bar'),
            reference('10.1099/foo', document_type='article'),
        ]
    }

    root, head, update, wanted = add_arxiv_source(root, head, update, wanted)
    result, conflicts = merge(root, head, update, head_source='arxiv')

    result = add_arxiv_source(result)
    assert result == wanted
    assert_ordered_conflicts(conflicts, [])
    validate_subschema(result)
def test_figures():
    """Update's figures are expected to replace head's, without conflicts."""
    def figure(key, caption, url):
        # Every figure in this test is attributed to arXiv.
        return {
            'key': key,
            'caption': caption,
            'source': 'arXiv',
            'url': url,
        }

    root = {}
    head = {
        'figures': [
            # NOTE(review): this URL lacks the '/' after 'example.com'; it
            # looks like a fixture typo but is kept as-is — confirm.
            figure(
                'figure1.png', 'Figure 1',
                'http://example.comfiles/1234-1234-1234-1234/figure1.png'),
            figure(
                'figure2.png', 'Figure 2',
                'http://example.com/files/1234-1234-1234-1234/figure2.png'),
        ]
    }
    update = {
        'figures': [
            figure(
                'new_figure1.png', 'Figure 1',
                'http://example.com/files/5678-5678-5678-5678/figure1.png'),
            figure(
                'new_figure2.png', 'Figure 2',
                'http://example.com/files/5678-5678-5678-5678/figure2.png'),
        ]
    }

    result, conflicts = merge(root, head, update, head_source='arxiv')

    # The expectation is the update record itself.
    assert result == update
    assert_ordered_conflicts(conflicts, [])
    validate_subschema(result)
def test_ordering_conflicts():
    """Merge JSON fixtures with duplicated authors and compare the outcome.

    This exercises deliberately broken input (authors are duplicated).
    Merged authors are compared after sorting by ``uuid``; conflicts are
    compared with ``assert_ordered_conflicts``.
    """
    root = load_test_data("test_data/root.json")
    head = load_test_data("test_data/head.json")
    update = load_test_data("test_data/update.json")
    expected_conflicts = load_test_data("test_data/conflicts.json")
    expected_merged = load_test_data("test_data/merged.json")

    merged, conflicts = merge(root, head, update)

    # Sort both author lists by uuid so the comparison ignores list order.
    by_uuid = itemgetter('uuid')
    actual_authors = sorted(merged['authors'], key=by_uuid)
    wanted_authors = sorted(expected_merged['authors'], key=by_uuid)
    assert actual_authors == wanted_authors
    assert_ordered_conflicts(conflicts, expected_conflicts)
def test_documents():
    """Update's documents are expected to replace head's, without conflicts."""
    old_files = 'http://example.com/files/1234-1234-1234-1234/'
    new_files = 'http://example.com/files/5678-5678-5678-5678/'

    root = {}
    head = {
        'documents': [
            {
                'key': 'pdf1.pdf',
                'description': 'paper',
                'source': 'arXiv',
                'fulltext': True,
                'url': old_files + 'pdf1.pdf',
            },
            {
                'key': 'pdf.tex',
                'description': 'latex version',
                'source': 'arXiv',
                'url': old_files + 'pdf.tex',
            },
        ]
    }
    update = {
        'documents': [
            {
                'key': 'pdf.pdf',
                'description': 'paper',
                'source': 'arXiv',
                'url': new_files + 'pdf.pdf',
            },
            {
                'key': 'foo.xml',
                'description': 'some xml files',
                'source': 'arXiv',
                'url': new_files + 'foo.xml',
            },
        ]
    }

    result, conflicts = merge(root, head, update, head_source='arxiv')

    # The expectation is the update record itself.
    assert result == update
    assert_ordered_conflicts(conflicts, [])
    validate_subschema(result)
def test_merging_acquisition_source_field():
    """Update's ``acquisition_source`` is expected to win over head's.

    Both come from ``arxiv`` but use different methods; the merged record
    must equal the update and no conflicts are expected.
    """
    def acquired_by(method):
        return {'acquisition_source': {'method': method, 'source': 'arxiv'}}

    root = {}
    # record_id: 1517095
    head = acquired_by('submitter')
    update = acquired_by('batchuploader')

    result, conflicts = merge(root, head, update, head_source='arxiv')

    assert result == update
    assert_ordered_conflicts(conflicts, [])
    validate_subschema(result)
# Example #15
# 0
def test_comparing_keywords():
    """Keywords from head and update are combined without duplicates.

    The expected result holds update's two keywords followed by head's
    keyword that update lacks; no conflicts are expected.
    """
    def keyword(value, schema):
        return {'value': value, 'schema': schema}

    root = {}
    head = {
        'keywords': [
            keyword('shielding', 'JACOW'),
            keyword('test', 'JACOW'),
        ]
    }
    update = {
        'keywords': [
            keyword('shielding', 'INSPIRE'),
            keyword('shielding', 'JACOW'),
        ]
    }
    wanted = {
        'keywords': [
            keyword('shielding', 'INSPIRE'),
            keyword('shielding', 'JACOW'),
            keyword('test', 'JACOW'),
        ]
    }

    result, conflicts = merge(root, head, update, head_source='arxiv')

    assert result == wanted
    assert_ordered_conflicts(conflicts, [])
    validate_subschema(result)
def test_merging_license_field():
    """Update's license text is applied and head's extra license is kept.

    Expected result: the Elsevier entry with update's text, followed by the
    arXiv entry that only head has; no conflicts are expected.
    """
    def license_entry(imposing, text):
        # All entries in this test share the same licence URL.
        return {
            'imposing': imposing,
            'url': 'http://creativecommons.org/licenses/by/4.0/',
            'license': text,
        }

    root = {'license': [license_entry('Elsevier', 'elsevier foo bar')]}
    head = {
        'license': [
            license_entry('Elsevier', 'elsevier foo bar'),
            license_entry('arXiv', 'arxiv foo bar'),
        ]
    }
    update = {
        'license': [license_entry('Elsevier', 'elsevier foo bar updated!')]
    }
    wanted = {
        'license': [
            license_entry('Elsevier', 'elsevier foo bar updated!'),
            license_entry('arXiv', 'arxiv foo bar'),
        ]
    }

    result, conflicts = merge(root, head, update, head_source='arxiv')

    assert result == wanted
    assert_ordered_conflicts(conflicts, [])
    validate_subschema(result)
def test_merging_report_numbers_field_repeated_values():
    """Head's repeated report numbers are expected to survive the merge.

    Head lists the same value twice (once hidden with a source, once bare);
    root and update equal each other. The merged record must equal head,
    with no conflicts.
    """
    report_value = 'CERN-CMS-2018-001'

    def sourced_only():
        # Fresh record per call: root and update are equal but distinct.
        return {
            'report_numbers': [{'source': 'arXiv', 'value': report_value}]
        }

    root = sourced_only()  # record: 1598022
    head = {
        'report_numbers': [
            {'hidden': True, 'source': 'arXiv', 'value': report_value},
            {'value': report_value},
        ]
    }
    update = sourced_only()

    result, conflicts = merge(root, head, update, head_source='arxiv')

    # The expectation is the head record itself.
    assert result == head
    assert_ordered_conflicts(conflicts, [])
    validate_subschema(result)
def test_merging_raw_affiliations_field():
    """Update's ``raw_affiliations`` are expected to replace head's.

    Same author in head and update, with differing affiliation lists; the
    merged record must equal the update and no conflicts are expected.
    """
    def author_record(*affiliation_values):
        # One arxiv-sourced raw affiliation per given value.
        return {
            'authors': [{
                'full_name': 'Pitts, Kevin T',
                'raw_affiliations': [
                    {'source': 'arxiv', 'value': value}
                    for value in affiliation_values
                ],
            }]
        }

    root = {}
    head = author_record(
        'Department of Physics, Indiana University, Bloomington, IN 47405, USA',
    )
    update = author_record(
        'Department of Physics, Indiana University, Bloomington, IN 47405, US',
        'Padua U',
    )

    result, conflicts = merge(root, head, update, head_source='arxiv')

    # The expectation is the update record itself.
    assert result == update
    assert_ordered_conflicts(conflicts, [])
    validate_subschema(result)
def test_merging_publication_info_field():
    """Head's ``journal_record`` is combined with update's publication data.

    Head adds a ``journal_record`` to root's entry; update supplies a fuller
    entry for the same journal and volume. The expected result carries
    update's fields plus head's ``journal_record``, with no conflicts.
    """
    journal = 'Adv.Theor.Math.Phys.'
    journal_ref = {'$ref': 'http://labs.inspirehep.net/api/journals/1212914'}

    # record 697133
    root = {
        'publication_info': [{
            'hidden': True,
            'journal_title': journal,
            'journal_volume': '12',
            'page_end': '979',
            'page_start': '948',
            'year': 2008,
        }]
    }
    head = {
        'publication_info': [{
            'hidden': True,
            'journal_title': journal,
            # Copied so the expectation below never shares this dict.
            'journal_record': dict(journal_ref),
            'journal_volume': '12',
            'page_end': '979',
            'page_start': '948',
            'year': 2008,
        }]
    }
    update = {
        'publication_info': [{
            'artid': '948-979',
            'curated_relation': True,
            'journal_issue': '1',
            'journal_title': journal,
            'journal_volume': '12',
            'year': 2008,
            'cnum': 'C12-03-10',
            'material': 'erratum',
            'page_end': '042',
            'page_start': '032',
            'parent_isbn': '9780521467025',
            'parent_report_number': 'CERN-PH-TH-2012-115',
        }]
    }

    wanted = {
        'publication_info': [{
            'artid': '948-979',
            'cnum': 'C12-03-10',
            'curated_relation': True,
            'journal_title': journal,
            'journal_volume': '12',
            'journal_issue': '1',
            'journal_record': dict(journal_ref),
            'material': 'erratum',
            'page_end': '042',
            'page_start': '032',
            'parent_isbn': '9780521467025',
            'parent_report_number': 'CERN-PH-TH-2012-115',
            'year': 2008,
        }]
    }

    result, conflicts = merge(root, head, update, head_source='arxiv')

    assert result == wanted
    assert_ordered_conflicts(conflicts, [])
    validate_subschema(result)