Exemple #1
0
def test_merging_same_documents_arxiv_on_arxiv(fake_get_config):
    """Merging three identical ``documents`` lists must change nothing.

    ``root``, ``head`` and ``update`` carry the same content but are built
    as independent objects, and so is the expectation.  That keeps the
    equality check meaningful even if ``merge`` were to modify one of its
    arguments in place.  ``fake_get_config`` is requested as a fixture only;
    the body never reads it.
    """

    def make_record():
        # Return a fresh, unshared record on every call.
        return {
            "documents": [
                {
                    "key": "pdf1.pdf",
                    "description": "paper",
                    "source": "arXiv",
                    "fulltext": True,
                    "url": "http://example.com/files/1234-1234-1234-1234/pdf1.pdf",
                },
                {
                    "key": "pdf.tex",
                    "description": "latex version",
                    "source": "arXiv",
                    "url": "http://example.com/files/1234-1234-1234-1234/pdf.tex",
                },
            ]
        }

    root = make_record()
    head = make_record()
    update = make_record()
    expected_merged = make_record()
    expected_conflict = []

    merged, conflict = merge(root, head, update)

    assert merged == expected_merged
    assert_ordered_conflicts(conflict, expected_conflict)
    validate_subschema(merged)
Exemple #2
0
def test_merging_dois_field_handles_repeated_values():
    """DOI values repeated with different ``material``/``source`` are kept.

    With ``head_source='arxiv'`` the merged ``dois`` list is expected to
    contain the two head entries followed by the two update entries, and
    no conflicts are expected.
    """
    doi_a = '10.1023/A:1026654312961'
    doi_b = '10.1023/B:1026654312961'

    root = {'dois': [{'material': 'preprint', 'value': doi_a}]}
    head = {
        'dois': [
            {'material': 'publication', 'value': doi_a},
            {'source': 'nowhere', 'value': doi_b},
        ]
    }
    update = {
        'dois': [
            {'material': 'erratum', 'value': doi_a},
            {'material': 'erratum', 'source': 'nowhere', 'value': doi_b},
        ]
    }

    wanted = {
        'dois': [
            {'material': 'publication', 'value': doi_a},
            {'source': 'nowhere', 'value': doi_b},
            {'material': 'erratum', 'value': doi_a},
            {'material': 'erratum', 'source': 'nowhere', 'value': doi_b},
        ]
    }
    wanted_conflicts = []

    merged, conflicts = merge(root, head, update, head_source='arxiv')

    assert merged == wanted
    assert_ordered_conflicts(conflicts, wanted_conflicts)
    validate_subschema(merged)
def test_merging_inspire_categories_field():
    """``inspire_categories`` from head are expected to survive untouched.

    The update brings two different arXiv-sourced terms; the expected merge
    result is ``head`` itself, with no conflicts.
    """
    root = {
        'inspire_categories': [
            {'source': 'INSPIRE', 'term': 'Theory-HEP'},
        ]
    }
    head = {
        'inspire_categories': [
            {'source': 'curator', 'term': 'Theory-HEP'},
            {'source': 'curator', 'term': 'Theory-Nucl'},
        ]
    }
    update = {
        'inspire_categories': [
            {'source': 'arxiv', 'term': 'Computing'},
            {'source': 'arxiv', 'term': 'Other'},
        ]
    }

    # The expectation is the head object itself, not a copy.
    wanted = head
    wanted_conflicts = []

    merged, conflicts = merge(root, head, update, head_source='arxiv')

    assert merged == wanted
    assert_ordered_conflicts(conflicts, wanted_conflicts)
    validate_subschema(merged)
Exemple #4
0
def test_merging_acquisition_source_publisher_on_arxiv(fake_get_config):
    """The update's ``acquisition_source`` is expected to replace head's.

    ``root`` and ``head`` hold equal arXiv acquisition data, the update
    comes from ``"other source"``; the expected result is the update with
    no conflicts.  ``fake_get_config`` is requested as a fixture only.
    """
    root = {
        "acquisition_source": {
            "datetime": "2021-05-11T02:35:43.387350",
            "method": "hepcrawl",
            "source": "arXiv",
            "submission_number": "c8a0e3e0b20011eb8d930a580a6402c0",
        }
    }
    head = {
        "acquisition_source": {
            "datetime": "2021-05-11T02:35:43.387350",
            "method": "hepcrawl",
            "source": "arXiv",
            "submission_number": "c8a0e3e0b20011eb8d930a580a6402c0",
        }
    }
    update = {
        "acquisition_source": {
            "datetime": "2021-05-12T02:35:43.387350",
            "method": "beard",
            "source": "other source",
            "submission_number": "c8a0e3e0b20011eb8d930a580a6402c1",
        }
    }

    # The expectation is the update object itself, not a copy.
    wanted = update
    wanted_conflicts = []

    merged, conflicts = merge(root, head, update)

    assert merged == wanted
    assert_ordered_conflicts(conflicts, wanted_conflicts)
    validate_subschema(merged)
def test_merging_full_name_field_keeps_longest_name():
    """Head's longer ``full_name`` is expected to win over the update's.

    The expected merge result is ``head`` itself, with no conflicts.
    """
    root = {'authors': [{'full_name': 'Pitts Kevin'}]}
    head = {'authors': [{'full_name': 'Pitts, Kevin John'}]}
    update = {'authors': [{'full_name': 'Pitts, Kevin'}]}

    # The expectation is the head object itself, not a copy.
    wanted = head
    wanted_conflicts = []

    merged, conflicts = merge(root, head, update, head_source='arxiv')

    assert merged == wanted
    assert_ordered_conflicts(conflicts, wanted_conflicts)
    validate_subschema(merged)
def test_get_head_source_arxiv_dois_and_freetext(
        rec_dois, rec_publication_info):
    """``get_head_source`` is expected to return ``'arxiv'`` here.

    The ``rec_dois`` fixture is modified in place: its first DOI is marked
    with source ``'arxiv'`` and it receives the ``publication_info`` of the
    ``rec_publication_info`` fixture.
    """
    first_doi = rec_dois.get('dois')[0]
    first_doi['source'] = 'arxiv'
    rec_dois['publication_info'] = rec_publication_info['publication_info']

    validate_subschema(rec_dois)
    assert get_head_source(rec_dois) == 'arxiv'
def test_get_acquisition_source_non_arxiv():
    """``get_acquisition_source`` is expected to return the stored source."""
    rec = {'acquisition_source': {'source': 'foo'}}

    found = get_acquisition_source(rec)

    assert found == 'foo'
    validate_subschema(rec)
def test_merging_titles_field():
    """Head's curated title is kept; the update's title becomes a conflict.

    The expected merge result contains only the head title (including its
    subtitle), and a single ``INSERT`` conflict at ``/titles/0`` carrying
    the update's title.
    """
    antares = (
        'ANTARES: An observatory at the seabed to the confines of the Universe'
    )

    # record: 1519935
    root = {'titles': [{'source': 'arXiv', 'title': antares}]}
    head = {
        'titles': [
            {
                'source': 'arXiv',
                'subtitle': 'this subtitle has been added by a curator',
                'title': antares,
            },
        ]
    }
    update = {
        'titles': [
            {'source': 'arXiv', 'title': 'ANTARES: Un osservatorio foo bar'},
        ]
    }

    wanted = {
        'titles': [
            {
                'source': 'arXiv',
                'subtitle': 'this subtitle has been added by a curator',
                'title': antares,
            },
        ]
    }
    wanted_conflicts = [
        {
            'path': '/titles/0',
            'op': 'add',
            'value': {
                'source': 'arXiv',
                'title': 'ANTARES: Un osservatorio foo bar',
            },
            '$type': 'INSERT',
        },
    ]

    merged, conflicts = merge(root, head, update, head_source='arxiv')

    assert merged == wanted
    assert_ordered_conflicts(conflicts, wanted_conflicts)
    validate_subschema(merged)
def test_figures():
    """The update's ``figures`` are expected to replace head's entirely.

    ``root`` is empty, head and update each carry two figures with
    different keys and URLs; the expected result is ``update`` itself,
    with no conflicts.
    """
    root = {}
    head = {
        'figures': [
            {
                'key': 'figure1.png',
                'caption': 'Figure 1',
                'source': 'arXiv',
                # Fixed: the "/" between host and path was missing here,
                # unlike every other URL in this test.
                'url': 'http://example.com/files/1234-1234-1234-1234/figure1.png',
            },
            {
                'key': 'figure2.png',
                'caption': 'Figure 2',
                'source': 'arXiv',
                'url': 'http://example.com/files/1234-1234-1234-1234/figure2.png',
            },
        ]
    }
    update = {
        'figures': [
            {
                'key': 'new_figure1.png',
                'caption': 'Figure 1',
                'source': 'arXiv',
                'url': 'http://example.com/files/5678-5678-5678-5678/figure1.png',
            },
            {
                'key': 'new_figure2.png',
                'caption': 'Figure 2',
                'source': 'arXiv',
                'url': 'http://example.com/files/5678-5678-5678-5678/figure2.png',
            },
        ]
    }

    # The expectation is the update object itself, not a copy.
    expected_merged = update
    expected_conflict = []

    merged, conflict = merge(root, head, update, head_source='arxiv')

    assert merged == expected_merged
    assert_ordered_conflicts(conflict, expected_conflict)
    validate_subschema(merged)
def test_documents():
    """The update's ``documents`` are expected to replace head's entirely.

    ``root`` is empty, head and update each carry two documents with
    different keys and URLs; the expected result is ``update`` itself,
    with no conflicts.
    """
    head_files = 'http://example.com/files/1234-1234-1234-1234/'
    update_files = 'http://example.com/files/5678-5678-5678-5678/'

    root = {}
    head = {
        'documents': [
            {
                'key': 'pdf1.pdf',
                'description': 'paper',
                'source': 'arXiv',
                'fulltext': True,
                'url': head_files + 'pdf1.pdf',
            },
            {
                'key': 'pdf.tex',
                'description': 'latex version',
                'source': 'arXiv',
                'url': head_files + 'pdf.tex',
            },
        ]
    }
    update = {
        'documents': [
            {
                'key': 'pdf.pdf',
                'description': 'paper',
                'source': 'arXiv',
                'url': update_files + 'pdf.pdf',
            },
            {
                'key': 'foo.xml',
                'description': 'some xml files',
                'source': 'arXiv',
                'url': update_files + 'foo.xml',
            },
        ]
    }

    # The expectation is the update object itself, not a copy.
    wanted = update
    wanted_conflicts = []

    merged, conflicts = merge(root, head, update, head_source='arxiv')

    assert merged == wanted
    assert_ordered_conflicts(conflicts, wanted_conflicts)
    validate_subschema(merged)
def test_head_curates_author_no_duplicate():
    """Check the merge when head respells an author and adds an affiliation.

    ``root`` and ``update`` hold the same author name; head holds a
    hyphenated spelling plus an affiliation.  The expected result lists the
    update's author first and head's curated author second, together with
    one ``REMOVE_FIELD`` conflict on ``/authors/1``.
    """
    # https://labs.inspirehep.net/api/holdingpen/1268973
    root = {'authors': [{"full_name": "Li, Zhengxiang"}]}
    head = {
        "authors": [
            {
                "affiliations": [{"value": "Beijing Normal U."}],
                "full_name": "Li, Zheng-Xiang",
            },
        ]
    }
    update = {'authors': [{"full_name": "Li, Zhengxiang"}]}

    wanted = {
        'authors': [
            {'full_name': 'Li, Zhengxiang'},
            {
                'full_name': 'Li, Zheng-Xiang',
                'affiliations': [{'value': 'Beijing Normal U.'}],
            },
        ]
    }
    wanted_conflicts = [
        {
            'path': '/authors/1',
            'op': 'remove',
            'value': None,
            '$type': 'REMOVE_FIELD',
        },
    ]

    merged, conflicts = merge(root, head, update, head_source='arxiv')

    assert merged == wanted
    # Compared with plain equality here, unlike the sibling tests.
    assert conflicts == wanted_conflicts
    validate_subschema(merged)
def test_merging_acquisition_source_field():
    """The update's ``acquisition_source`` is expected to replace head's.

    ``root`` is empty; the expected result is ``update`` itself, with no
    conflicts.
    """
    root = {}
    # record_id: 1517095
    head = {
        'acquisition_source': {
            'method': 'submitter',
            'source': 'arxiv',
        }
    }
    update = {
        'acquisition_source': {
            'method': 'batchuploader',
            'source': 'arxiv',
        }
    }

    # The expectation is the update object itself, not a copy.
    wanted = update
    wanted_conflicts = []

    merged, conflicts = merge(root, head, update, head_source='arxiv')

    assert merged == wanted
    assert_ordered_conflicts(conflicts, wanted_conflicts)
    validate_subschema(merged)
def test_merging_license_field():
    """An updated licence text is merged while head's extra licence stays.

    The update changes the Elsevier ``license`` text; head additionally
    holds an arXiv licence.  The expected result has the updated Elsevier
    entry followed by the arXiv entry, with no conflicts.
    """
    cc_by = 'http://creativecommons.org/licenses/by/4.0/'

    root = {
        'license': [
            {'imposing': 'Elsevier', 'url': cc_by, 'license': 'elsevier foo bar'},
        ]
    }
    head = {
        'license': [
            {'imposing': 'Elsevier', 'url': cc_by, 'license': 'elsevier foo bar'},
            {'imposing': 'arXiv', 'url': cc_by, 'license': 'arxiv foo bar'},
        ]
    }
    update = {
        'license': [
            {
                'imposing': 'Elsevier',
                'url': cc_by,
                'license': 'elsevier foo bar updated!',
            },
        ]
    }

    wanted = {
        'license': [
            {
                'imposing': 'Elsevier',
                'url': cc_by,
                'license': 'elsevier foo bar updated!',
            },
            {'imposing': 'arXiv', 'url': cc_by, 'license': 'arxiv foo bar'},
        ]
    }
    wanted_conflicts = []

    merged, conflicts = merge(root, head, update, head_source='arxiv')

    assert merged == wanted
    assert_ordered_conflicts(conflicts, wanted_conflicts)
    validate_subschema(merged)
def test_merging_raw_affiliations_field():
    """The update's ``raw_affiliations`` are expected to replace head's.

    ``root`` is empty; head and update describe the same author with
    different raw affiliations.  The expected result is ``update`` itself,
    with no conflicts.
    """
    root = {}
    head = {
        'authors': [
            {
                'full_name': 'Pitts, Kevin T',
                'raw_affiliations': [
                    {
                        'source': 'arxiv',
                        'value': 'Department of Physics, Indiana University, Bloomington, IN 47405, USA',
                    },
                ],
            },
        ]
    }
    update = {
        'authors': [
            {
                'full_name': 'Pitts, Kevin T',
                'raw_affiliations': [
                    {
                        'source': 'arxiv',
                        'value': 'Department of Physics, Indiana University, Bloomington, IN 47405, US',
                    },
                    {
                        'source': 'arxiv',
                        'value': 'Padua U',
                    },
                ],
            },
        ]
    }

    # The expectation is the update object itself, not a copy.
    wanted = update
    wanted_conflicts = []

    merged, conflicts = merge(root, head, update, head_source='arxiv')

    assert merged == wanted
    assert_ordered_conflicts(conflicts, wanted_conflicts)
    validate_subschema(merged)
def test_merging_report_numbers_field_repeated_values():
    """Head's ``report_numbers`` are expected to be kept as they are.

    Head holds the same report number twice (once hidden with a source,
    once bare); the update equals ``root``.  The expected result is
    ``head`` itself, with no conflicts.
    """
    number = 'CERN-CMS-2018-001'

    # record: 1598022
    root = {'report_numbers': [{'source': 'arXiv', 'value': number}]}
    head = {
        'report_numbers': [
            {'hidden': True, 'source': 'arXiv', 'value': number},
            {'value': number},
        ]
    }
    update = {'report_numbers': [{'source': 'arXiv', 'value': number}]}

    # The expectation is the head object itself, not a copy.
    wanted = head
    wanted_conflicts = []

    merged, conflicts = merge(root, head, update, head_source='arxiv')

    assert merged == wanted
    assert_ordered_conflicts(conflicts, wanted_conflicts)
    validate_subschema(merged)
def test_get_head_source_no_arxiv_dois_and_no_freetext(
        rec_dois, rec_publication_info):
    """``get_head_source`` is expected to return ``'publisher'`` here.

    The ``rec_dois`` fixture is modified in place: it receives the
    ``publication_info`` of the ``rec_publication_info`` fixture.
    """
    rec_dois['publication_info'] = rec_publication_info['publication_info']

    validate_subschema(rec_dois)
    assert get_head_source(rec_dois) == 'publisher'
def test_get_head_source_arxiv_dois_no_eprint(rec_dois):
    """Without ``arxiv_eprints`` the head source should be ``'publisher'``.

    The ``rec_dois`` fixture is modified in place by dropping its
    ``arxiv_eprints`` key before the check.
    """
    # record has dois without arxiv source but no arxiv_eprint, no publication_info
    del rec_dois['arxiv_eprints']

    validate_subschema(rec_dois)
    assert get_head_source(rec_dois) == 'publisher'
def test_merging_publication_info_field():
    """Head and update ``publication_info`` are combined into one entry.

    Head adds a ``journal_record`` reference to the root entry; the update
    brings a richer entry with different page numbers.  The expected result
    is a single entry with the update's fields plus head's
    ``journal_record`` and without ``hidden``; no conflicts are expected.
    """
    journal_ref = {"$ref": "http://labs.inspirehep.net/api/journals/1212914"}

    # record 697133
    root = {
        'publication_info': [
            {
                "hidden": True,
                "journal_title": "Adv.Theor.Math.Phys.",
                "journal_volume": "12",
                "page_end": "979",
                "page_start": "948",
                "year": 2008,
            },
        ]
    }
    head = {
        'publication_info': [
            {
                "hidden": True,
                "journal_title": "Adv.Theor.Math.Phys.",
                "journal_record": dict(journal_ref),
                "journal_volume": "12",
                "page_end": "979",
                "page_start": "948",
                "year": 2008,
            },
        ]
    }
    update = {
        'publication_info': [
            {
                'artid': '948-979',
                'curated_relation': True,
                'journal_issue': '1',
                'journal_title': 'Adv.Theor.Math.Phys.',
                'journal_volume': '12',
                'year': 2008,
                'cnum': 'C12-03-10',
                'material': 'erratum',
                'page_end': '042',
                'page_start': '032',
                'parent_isbn': '9780521467025',
                'parent_report_number': 'CERN-PH-TH-2012-115',
            },
        ]
    }

    wanted = {
        'publication_info': [
            {
                'artid': '948-979',
                'cnum': 'C12-03-10',
                'curated_relation': True,
                'journal_title': 'Adv.Theor.Math.Phys.',
                'journal_volume': '12',
                'journal_issue': '1',
                'journal_record': dict(journal_ref),
                'material': 'erratum',
                'page_end': '042',
                'page_start': '032',
                'parent_isbn': '9780521467025',
                'parent_report_number': 'CERN-PH-TH-2012-115',
                'year': 2008,
            },
        ]
    }
    wanted_conflicts = []

    merged, conflicts = merge(root, head, update, head_source='arxiv')

    assert merged == wanted
    assert_ordered_conflicts(conflicts, wanted_conflicts)
    validate_subschema(merged)
def test_get_head_source_arxiv_dois(rec_dois):
    """An arXiv-sourced first DOI should give head source ``'arxiv'``.

    The ``rec_dois`` fixture is modified in place before the check.
    """
    # record has dois with arxiv source and arxiv_eprint, no publication_info
    first_doi = rec_dois.get('dois')[0]
    first_doi['source'] = 'arxiv'

    validate_subschema(rec_dois)
    assert get_head_source(rec_dois) == 'arxiv'
def test_get_head_source_no_arxiv_dois(rec_dois):
    """The unmodified ``rec_dois`` fixture should give ``'publisher'``.

    The result is compared by value (``==``).  An identity check against a
    string literal only passes when the interpreter happens to reuse the
    same string object, and it triggers a ``SyntaxWarning`` on recent
    Python versions.
    """
    # record has dois without arxiv source, no publication_info
    validate_subschema(rec_dois)
    assert get_head_source(rec_dois) == 'publisher'
def test_get_head_source_no_freetext_pub_info(rec_publication_info):
    """Without ``pubinfo_freetext`` the head source should be ``'publisher'``.

    The ``rec_publication_info`` fixture is modified in place: the
    ``pubinfo_freetext`` key of its first ``publication_info`` entry is
    removed before the check.
    """
    # record has no pubinfo_freetext, no dois
    first_pub_info = rec_publication_info['publication_info'][0]
    del first_pub_info['pubinfo_freetext']

    validate_subschema(rec_publication_info)
    assert get_head_source(rec_publication_info) == 'publisher'
def test_get_head_source_freetext_pub_info_with_no_eprint(
        rec_publication_info):
    """Without ``arxiv_eprints`` the head source should be ``'publisher'``.

    The ``rec_publication_info`` fixture is modified in place by dropping
    its ``arxiv_eprints`` key before the check.
    """
    # record has pubinfo_freetext but not arxiv_eprints, no dois
    del rec_publication_info['arxiv_eprints']

    validate_subschema(rec_publication_info)
    assert get_head_source(rec_publication_info) == 'publisher'
def test_get_head_source_freetext_pub_info_with_eprint(rec_publication_info):
    """The unmodified ``rec_publication_info`` fixture should give ``'arxiv'``.

    The result is compared by value (``==``).  An identity check against a
    string literal only passes when the interpreter happens to reuse the
    same string object, and it triggers a ``SyntaxWarning`` on recent
    Python versions.
    """
    # record has pubinfo_freetext and arxiv_eprints, no dois
    validate_subschema(rec_publication_info)
    assert get_head_source(rec_publication_info) == 'arxiv'