def test_filter_documents_same_source_multiple_sources_in_update():
    """All three records come back unchanged when the update mixes sources.

    The update carries documents from both 'arXiv' and 'publisher', while the
    head holds a single 'arXiv' document; the test expects
    ``filter_documents_same_source`` to leave root, head and update as given.
    """
    root = {}
    old_arxiv_document = {
        'source': 'arXiv',
        'key': 'old_file.pdf',
        'url': '/files/5678-5678-5678-5678/old_file.pdf',
    }
    head = {'documents': [old_arxiv_document]}

    first_arxiv_document = {
        'source': 'arXiv',
        'key': 'file1.pdf',
        'url': '/files/1234-1234-1234-1234/file1.pdf',
    }
    second_arxiv_document = {
        'source': 'arXiv',
        'key': 'file2.pdf',
        'url': '/files/1234-1234-1234-1234/file2.pdf',
    }
    publisher_document = {
        'source': 'publisher',
        'key': 'file3.pdf',
        'url': '/files/1234-1234-1234-1234/file3.pdf',
    }
    update = {
        'documents': [
            first_arxiv_document,
            second_arxiv_document,
            publisher_document,
        ],
    }

    result = filter_records(
        root, head, update, filters=[filter_documents_same_source]
    )

    assert result == (root, head, update)
def test_filter_publisher_references_keeps_update_if_no_refs_in_head():
    """The update's references are expected to survive when head is empty."""
    root, head = {}, {}
    update = {
        'references': [
            {'reference': {'arxiv_eprint': '1810.56789'}},
        ],
    }
    # Built independently of ``update`` so the comparison does not rely on
    # object identity.
    expected_update = {
        'references': [
            {'reference': {'arxiv_eprint': '1810.56789'}},
        ],
    }

    result = filter_records(
        root, head, update, filters=[filter_publisher_references]
    )

    assert result == (root, head, expected_update)
def test_filter_curated_references_takes_update_if_not_curated():
    """Head references are expected to be dropped in favour of the update's.

    Root is empty and the head reference carries nothing beyond an
    ``arxiv_eprint``; the test expects head to end up empty and the update to
    keep its references.
    """
    root = {}
    head = {
        'references': [
            {'reference': {'arxiv_eprint': '1810.12345'}},
        ],
    }
    update = {
        'references': [
            {'reference': {'arxiv_eprint': '1810.56789'}},
        ],
    }

    expected_head = {}
    expected_update = {
        'references': [
            {'reference': {'arxiv_eprint': '1810.56789'}},
        ],
    }

    result = filter_records(
        root, head, update, filters=[filter_curated_references]
    )

    assert result == (root, expected_head, expected_update)
def test_filter_curated_references_keeps_update_if_head_almost_equal_to_root():
    """Update references are expected to win when head barely differs from root.

    Compared with root, the head reference only adds ``misc``, ``authors`` and
    ``raw_refs`` and lacks ``curated_relation``; the test expects references
    to be removed from both root and head and kept in the update.
    """
    root = {
        'references': [
            {
                'reference': {'arxiv_eprint': '1810.12345'},
                'curated_relation': False,
            },
        ],
    }
    head_raw_ref = {
        'source': 'arXiv',
        'schema': 'text',
        'value': 'foo 1810.12345',
    }
    head = {
        'references': [
            {
                'reference': {
                    'arxiv_eprint': '1810.12345',
                    'misc': ['foo'],
                    'authors': ['Smith, J.'],
                },
                'raw_refs': [head_raw_ref],
            },
        ],
    }
    update = {
        'references': [
            {'reference': {'arxiv_eprint': '1810.56789'}},
        ],
    }

    expected_root = {}
    expected_head = {}
    expected_update = {
        'references': [
            {'reference': {'arxiv_eprint': '1810.56789'}},
        ],
    }

    result = filter_records(
        root, head, update, filters=[filter_curated_references]
    )

    assert result == (expected_root, expected_head, expected_update)
def merge(root, head, update, head_source=None):
    """Merge ``update`` into ``head`` using ``root`` as their common ancestor.

    A configuration is selected with ``get_configuration`` from ``head``,
    ``update`` and ``head_source``. The configuration's pre-filters are applied
    to the three records, a ``Merger`` is instantiated with the configured
    operations and comparators, and the merge is run.

    Args:
        root (dict): the last common parent JSON of head and update.
        head (dict): the last version of a record in INSPIRE.
        update (dict): the update coming from outside INSPIRE to merge.
        head_source (string): the source of the head record. If ``None``,
            heuristics are used to derive it from the metadata. This is
            useful if the HEAD came from legacy and the acquisition_source
            does not reflect the state of the record.

    Returns:
        tuple: the merged record in JSON format, and a flat list with the
        JSON form of all conflicts left after applying the configuration's
        conflict filters.
    """
    configuration = get_configuration(head, update, head_source)
    root, head, update = filter_records(
        root, head, update, filters=configuration.pre_filters
    )

    merger = Merger(
        root=root,
        head=head,
        update=update,
        default_dict_merge_op=configuration.default_dict_merge_op,
        default_list_merge_op=configuration.default_list_merge_op,
        list_dict_ops=configuration.list_dict_ops,
        list_merge_ops=configuration.list_merge_ops,
        comparators=configuration.comparators,
    )

    # A MergeError is not fatal here: the conflicts it carries are collected
    # and the merged root is still read from the merger afterwards.
    raw_conflicts = []
    try:
        merger.merge()
    except MergeError as error:
        raw_conflicts = error.content

    remaining_conflicts = filter_conflicts(
        raw_conflicts, configuration.conflict_filters
    )

    # Each conflict's JSON form is iterated and its items are appended to a
    # single flat list.
    flat_conflicts_as_json = []
    for conflict in remaining_conflicts:
        flat_conflicts_as_json.extend(json.loads(conflict.to_json()))

    return merger.merged_root, flat_conflicts_as_json
def test_filter_missing_figures_on_update_are_properly_handled():
    """Figures sharing the update's source are expected to be dropped.

    The update has an 'arXiv' acquisition source and no figures. Root and head
    each hold one figure with source 'arxiv' and one with source 'other'; the
    test expects only the 'other' figure to remain in both.
    """
    arxiv_figure_in_root = {
        'caption': 'CC',
        'key': 'w0_bflow.png',
        'label': 'fig:bflow',
        'material': 'preprint',
        'source': 'arxiv',
        'url': '/api/files/8e2b4d59-6870-4517-8580-35822bf12edb/w0_bflow.png',
    }
    other_figure = {
        'caption': 'CC2',
        'key': 'w1_bflow.png',
        'label': 'fig2:bflow',
        'material': 'preprint',
        'source': 'other',
        'url': '/api/files/8e2b4d59-6870-4517-8888-35822bf12edb/w1_bflow.png',
    }
    # Same content as the root's arxiv figure except for the key.
    arxiv_figure_in_head = {
        'caption': 'CC',
        'key': '627d2caea8059d8875281ebed455a714',
        'label': 'fig:bflow',
        'material': 'preprint',
        'source': 'arxiv',
        'url': '/api/files/8e2b4d59-6870-4517-8580-35822bf12edb/w0_bflow.png',
    }

    root = {'figures': [arxiv_figure_in_root, other_figure]}
    head = {'figures': [other_figure, arxiv_figure_in_head]}
    update = {'acquisition_source': {'source': 'arXiv'}}

    new_root, new_head, new_update = filter_records(
        root, head, update, filters=[filter_figures_same_source]
    )

    assert new_root == {'figures': [other_figure]}
    assert new_head == {'figures': [other_figure]}
    assert new_update == update
def test_filter_curated_references_keeps_head_if_differs_from_root():
    """Head references are expected to be kept when they diverge from root.

    The head reference adds ``dois`` to the root's reference; the test expects
    references to be removed from root and update while head keeps its own.
    """
    root = {
        'references': [
            {'reference': {'arxiv_eprint': '1810.12345'}},
        ],
    }
    head = {
        'references': [
            {
                'reference': {
                    'arxiv_eprint': '1810.12345',
                    'dois': ['10.1234/5678'],
                },
            },
        ],
    }
    update = {
        'references': [
            {'reference': {'arxiv_eprint': '1810.56789'}},
        ],
    }

    expected_root = {}
    expected_head = {
        'references': [
            {
                'reference': {
                    'arxiv_eprint': '1810.12345',
                    'dois': ['10.1234/5678'],
                },
            },
        ],
    }
    expected_update = {}

    result = filter_records(
        root, head, update, filters=[filter_curated_references]
    )

    assert result == (expected_root, expected_head, expected_update)
def test_filter_documents_same_source_is_case_insensitive_on_source():
    """Source matching is expected to ignore letter case.

    The head holds two 'arXiv' documents and one without a source, while the
    update's only document has source 'arxiv'. The test expects just the
    source-less document to remain in head.
    """
    root = {}
    head = {
        'documents': [
            {
                'source': 'arXiv',
                'key': 'file1.pdf',
                'url': '/files/1234-1234-1234-1234/file1.pdf',
            },
            {
                'source': 'arXiv',
                'key': 'file2.pdf',
                'url': '/files/1234-1234-1234-1234/file2.pdf',
            },
            {
                'key': 'file3.pdf',
                'url': '/files/1234-1234-1234-1234/file3.pdf',
            },
        ],
    }
    lowercase_source_document = {
        'source': 'arxiv',
        'key': 'new_file.pdf',
        'url': '/files/5678-5678-5678-5678/new_file.pdf',
    }
    update = {'documents': [lowercase_source_document]}

    expected_head = {
        'documents': [
            {
                'key': 'file3.pdf',
                'url': '/files/1234-1234-1234-1234/file3.pdf',
            },
        ],
    }

    result = filter_records(
        root, head, update, filters=[filter_documents_same_source]
    )

    assert result == (root, expected_head, update)
def test_filter_curated_references_keeps_head_if_legacy_curated():
    """Head references flagged ``legacy_curated`` are expected to be kept.

    Root is empty and the head reference has ``legacy_curated`` set to True;
    the test expects head to be preserved and the update's references removed.
    """
    root = {}
    head = {
        'references': [
            {
                'legacy_curated': True,
                'reference': {'arxiv_eprint': '1810.12345'},
            },
        ],
    }
    update = {
        'references': [
            {'reference': {'arxiv_eprint': '1810.56789'}},
        ],
    }

    expected_head = {
        'references': [
            {
                'legacy_curated': True,
                'reference': {'arxiv_eprint': '1810.12345'},
            },
        ],
    }
    expected_update = {}

    result = filter_records(
        root, head, update, filters=[filter_curated_references]
    )

    assert result == (root, expected_head, expected_update)