def test_legacy_export_marcxml(self):
    """Record - legacy export as MARCXML.

    Parses a MARCXML blob directly and via ``Record.legacy_export_as_marc()``
    and checks that both record structures are identical when subfield
    order is ignored.
    """
    # FIXME: use a better way to compare
    from invenio_record.models import Record
    from invenio.legacy.bibrecord import create_record, records_identical

    blob = (
        ' <record>'
        ' <controlfield tag="001">8</controlfield>'
        ' <datafield tag="100" ind1=" " ind2=" ">'
        ' <subfield code="a">Efstathiou, G P</subfield>'
        ' <subfield code="u">Cambridge University</subfield>'
        ' </datafield>'
        ' <datafield tag="245" ind1=" " ind2=" ">'
        ' <subfield code="a">Title</subfield>'
        ' <subfield code="b">SubTitle</subfield>'
        ' </datafield>'
        ' <datafield tag="700" ind1=" " ind2=" ">'
        ' <subfield code="a">Lasenby, A N</subfield>'
        ' </datafield>'
        ' <datafield tag="980" ind1=" " ind2=" ">'
        ' <subfield code="a">Articles</subfield>'
        ' </datafield>'
        ' </record> '
    )

    json_record = Record.create(
        blob, master_format='marc', namespace='testsuite')

    # Reference structure, parsed straight from the XML.
    reference_struct, _, _ = create_record(blob)
    # Structure obtained after the JSON -> legacy MARC export.
    exported_struct, _, _ = create_record(
        json_record.legacy_export_as_marc())

    identical = records_identical(
        exported_struct, reference_struct, ignore_subfield_order=True)
    self.assertTrue(identical)
def setUp(self):
    """Load the HEP MARCXML fixtures and build the expected CV-LaTeX values.

    Two fixture files are read from the ``tests`` package, parsed with
    ``create_record`` and converted with ``hep.do``. The expected values
    are stored on ``self.sample_cv_latex`` and
    ``self.sample_cv_latex_publi_info``.
    """
    self.marcxml = pkg_resources.resource_string(
        'tests', os.path.join('fixtures', 'test_hep_formats.xml'))
    self.marcxml_publi_info = pkg_resources.resource_string(
        'tests', os.path.join('fixtures', 'test_hep_publi_info.xml'))

    record = create_record(self.marcxml)
    record_publi_info = create_record(self.marcxml_publi_info)

    self.hep_record = hep.do(record)
    self.hep_record_publi_info = hep.do(record_publi_info)

    # Backslashes are written as ``\\`` throughout: ``\s`` and ``\ `` are
    # invalid escape sequences in a non-raw literal. The runtime values are
    # unchanged (a single backslash followed by the character).
    self.sample_cv_latex = {
        'author': 'G.~Aad',
        'title': (
            "{\\bf ``\n"
            "Search for supersymmetry in events containing a same-flavour "
            "opposite-sign dilepton pair, jets, and large missing transverse "
            "momentum in $\\sqrt{s}=8$ TeV $pp$ collisions with the ATLAS "
            "detector\n"
            "''}"
        ),
        'publi_info': [
            'Eur.\\ Phys.\\ J.\\ C {\\bf 75}, no. 7, 318 (2015)',
            '[Eur.\\ Phys.\\ J.\\ C {\\bf 75}, no. 10, 463 (2015)]',
        ],
        'url': cfg['CFG_SITE_URL'] + '/record/1351762',
        'date': 'Mar 11, 2015',
    }

    self.sample_cv_latex_publi_info = {
        'publi_info': ['Class.\\ Quant.\\ Grav.\\ {\\bf 15}, 2153 (1998)'],
    }
def test_image(self):
    """Test image model from XML into JSON.

    Checks that ``query_matcher`` selects the image model for the
    ``CDS_IMAGE`` fixture and that a few converted fields have the
    expected values.
    """
    from dojson.contrib.marc21.utils import create_record
    from cds_dojson.marc21.models.image import model as marc21

    matched_model = query_matcher(create_record(CDS_IMAGE))
    assert isinstance(matched_model, marc21.__class__)

    blob = create_record(CDS_IMAGE)
    data = marc21.do(blob)

    # Control number (doJSON)
    control_number = data.get('control_number')
    assert control_number == '1782445'

    # Parent album (CDSImage)
    first_parent = data['album_parent'][0]
    assert first_parent['album_id'] == '2054964'

    # Imprint (CDSMarc21)
    first_imprint = data['imprint'][0]
    assert first_imprint['place_of_publication'] == 'Geneva'

    # No field may be left without a matching rule in the model
    assert marc21.missing(blob) == []
def test_identity_check():
    """Test that MARC21 -> JSON -> MARC21 is the identity for video records.

    Runs the round trip for the ``CDS_VIDEO_PROJECT`` and ``CDS_VIDEO_CLIP``
    fixtures, in that order.
    """
    for fixture in (CDS_VIDEO_PROJECT, CDS_VIDEO_CLIP):
        blob = create_record(fixture)
        round_tripped = to_marc21.do(marc21.do(blob))
        assert blob == round_tripped
def setUp(self):
    """Load the HEP MARCXML fixtures and build the expected LaTeX values.

    Two fixture files are read from the ``tests`` package, parsed with
    ``create_record`` and converted with ``hep.do``. ``Latex`` objects are
    created for the ``latex_eu`` and ``latex_us`` formats, and the expected
    values are stored on ``self.sample_latex_eu``,
    ``self.sample_latex_publi_info`` and ``self.sample_latex_us``.
    """
    self.marcxml = pkg_resources.resource_string(
        'tests', os.path.join('fixtures', 'test_hep_formats.xml'))
    self.marcxml_publi_info = pkg_resources.resource_string(
        'tests', os.path.join('fixtures', 'test_hep_publi_info.xml'))

    record = create_record(self.marcxml)
    record_publi_info = create_record(self.marcxml_publi_info)

    self.hep_record = hep.do(record)
    self.hep_record_publi_info = hep.do(record_publi_info)

    self.latex_eu = Latex(self.hep_record, 'latex_eu')
    self.latex_us = Latex(self.hep_record, 'latex_us')
    self.latex_eu_publi_info = Latex(
        self.hep_record_publi_info, 'latex_eu')

    # Backslashes are written as ``\\`` throughout: ``\s`` and ``\ `` are
    # invalid escape sequences in a non-raw literal. The runtime values are
    # unchanged (a single backslash followed by the character).
    self.sample_latex_eu = {
        'citation_key': 'Aad:2015wqa',
        'author': 'G.~Aad',
        'title': (
            '\nSearch for supersymmetry in events containing a same-flavour '
            'opposite-sign dilepton pair, jets, and large missing transverse '
            'momentum in $\\sqrt{s}=8$ TeV pp collisions with the ATLAS '
            'detector\n'
        ),
        'publi_info': [
            'Eur.\\ Phys.\\ J.\\ C {\\bf 75} (2015) 7, 318',
            '[Eur.\\ Phys.\\ J.\\ C {\\bf 75} (2015) 10, 463]',
        ],
        'arxiv': 'arXiv:1503.03290 [hep-ex]',
        'report_number': '',
        'SLACcitation': '%%CITATION = ARXIV:1503.03290;%%',
    }

    self.sample_latex_publi_info = {
        'publi_info': ['Class.\\ Quant.\\ Grav.\\ {\\bf 15} (1998) 2153'],
    }

    self.sample_latex_us = {
        'citation_key': 'Aad:2015wqa',
        'author': 'G.~Aad',
        'title': (
            '\nSearch for supersymmetry in events containing a same-flavour '
            'opposite-sign dilepton pair, jets, and large missing transverse '
            'momentum in $\\sqrt{s}=8$ TeV pp collisions with the ATLAS '
            'detector\n'
        ),
        'publi_info': [
            'Eur.\\ Phys.\\ J.\\ C {\\bf 75}, no. 7, 318 (2015)',
            '[Eur.\\ Phys.\\ J.\\ C {\\bf 75}, no. 10, 463 (2015)]',
        ],
        'arxiv': 'arXiv:1503.03290 [hep-ex]',
        'report_number': '',
        'SLACcitation': '%%CITATION = ARXIV:1503.03290;%%',
    }
def test_marc21_856_indicators():
    """Test MARC21 856 field special indicator values.

    With ``ind1="4"`` the access method is expected to be ``'HTTP'``; with
    ``ind1="7"`` it is expected to come from subfield ``2``. Both cases must
    also survive the conversion back to MARC21 unchanged.
    """
    from dojson.contrib.marc21 import marc21
    from dojson.contrib.marc21.utils import create_record
    from dojson.contrib.to_marc21 import to_marc21

    RECORD_8564 = (
        ' <datafield tag="856" ind1="4" ind2=" ">'
        ' <subfield code="s">272681</subfield>'
        ' <subfield code="u">https://zenodo.org/record/17575/files/...</subfield>'
        ' <subfield code="z">0</subfield>'
        ' </datafield> '
    )
    RECORD_8567 = (
        ' <datafield tag="856" ind1="7" ind2=" ">'
        ' <subfield code="s">272681</subfield>'
        ' <subfield code="u">https://zenodo.org/record/17575/files/...</subfield>'
        ' <subfield code="z">0</subfield>'
        ' <subfield code="2">Awesome access method</subfield>'
        ' </datafield> '
    )

    expected_8564 = {
        'electronic_location_and_access': [
            {
                'public_note': ('0',),
                'access_method': 'HTTP',
                'uniform_resource_identifier': (
                    'https://zenodo.org/record/17575/files/...',
                ),
                'file_size': ('272681',),
            },
        ],
    }
    expected_8567 = {
        'electronic_location_and_access': [
            {
                'public_note': ('0',),
                'access_method': 'Awesome access method',
                'uniform_resource_identifier': (
                    'https://zenodo.org/record/17575/files/...',
                ),
                'file_size': ('272681',),
            },
        ],
    }

    cases = (
        (RECORD_8564, expected_8564),
        (RECORD_8567, expected_8567),
    )
    for xml, expected in cases:
        blob = create_record(xml)
        data = marc21.do(blob)
        assert expected == data

        back_blob = to_marc21.do(data)
        assert blob == back_blob
def test_urls_from_marcxml_multiple_8564():
    """Two 856 (ind1=4) datafields are converted into two ``urls`` entries."""
    marcxml = (
        '<record>'
        ' <datafield tag="856" ind1="4" ind2="">'
        ' <subfield code="u">http://www.physics.unlv.edu/labastro/</subfield>'
        ' <subfield code="y">Conference web page</subfield>'
        ' </datafield>'
        ' <datafield tag="856" ind1="4" ind2="">'
        ' <subfield code="u">http://www.cern.ch/</subfield>'
        ' <subfield code="y">CERN web page</subfield>'
        ' </datafield>'
        '</record>'
    )

    expected_urls = [
        {
            'description': 'Conference web page',
            'value': 'http://www.physics.unlv.edu/labastro/',
        },
        {
            'description': 'CERN web page',
            'value': 'http://www.cern.ch/',
        },
    ]

    converted = hep.do(create_record(marcxml))
    cleaned = clean_record(converted)

    assert expected_urls == cleaned['urls']
def test_field_from_marcxml_650_with_two_2():
    """One 650 datafield carrying two '2' subfields.

    The expected result uses the first '2' value (``arXiv``) as ``_scheme``.
    """
    marcxml = (
        '<record>'
        ' <datafield tag="650" ind1="1" ind2="7">'
        ' <subfield code="2">arXiv</subfield>'
        ' <subfield code="2">INSPIRE</subfield>'
        ' <subfield code="a">hep-ex</subfield>'
        ' </datafield>'
        '</record>'
    )

    expected_categories = [
        {
            '_scheme': 'arXiv',
            'scheme': 'INSPIRE',
            '_term': 'hep-ex',
            'term': 'Experiment-HEP',
        },
    ]

    converted = hepnames.do(create_record(marcxml))
    cleaned = clean_record(converted)

    assert expected_categories == cleaned['field_categories']
def records():
    """Load records.

    Reads the ``data/marc21/bibliographic.xml`` demo file shipped in the
    ``invenio_records`` package, converts every record with the MARC21
    dojson model, mints a PID for it, creates the record and indexes it.
    """
    # Imports are local to the function, as in the original.
    import pkg_resources
    import uuid
    from dojson.contrib.marc21 import marc21
    from dojson.contrib.marc21.utils import create_record, split_blob
    from invenio_pidstore import current_pidstore
    from invenio_records.api import Record

    # pkg resources the demodata
    data_path = pkg_resources.resource_filename(
        'invenio_records', 'data/marc21/bibliographic.xml'
    )
    with open(data_path) as source:
        indexer = RecordIndexer()
        # NOTE(review): the block nesting below was reconstructed from a
        # whitespace-collapsed source -- confirm that the commit belongs
        # after the nested transaction rather than inside the loop.
        with db.session.begin_nested():
            # ``index`` is not used in the loop body.
            for index, data in enumerate(split_blob(source.read()), start=1):
                # create uuid
                rec_uuid = uuid.uuid4()
                # do translate
                record = marc21.do(create_record(data))
                # create PID
                current_pidstore.minters['recid_minter'](
                    rec_uuid, record
                )
                # create record
                indexer.index(Record.create(record, id_=rec_uuid))
        db.session.commit()
def test_field_from_multiple_marcxml_650():
    """Two 650 datafields, one labeled arXiv and one labeled INSPIRE.

    Both terms are arXiv field codes; the second datafield is labeled
    INSPIRE in the input. Each datafield yields one category entry.
    """
    marcxml = (
        '<record>'
        ' <datafield tag="650" ind1="1" ind2="7">'
        ' <subfield code="2">arXiv</subfield>'
        ' <subfield code="a">HEP-PH</subfield>'
        ' </datafield>'
        ' <datafield tag="650" ind1="1" ind2="7">'
        ' <subfield code="2">INSPIRE</subfield>'
        ' <subfield code="a">astro-ph.IM</subfield>'
        ' </datafield>'
        '</record>'
    )

    expected_categories = [
        {
            '_scheme': 'arXiv',
            'scheme': 'INSPIRE',
            '_term': 'HEP-PH',
            'term': 'Phenomenology-HEP',
        },
        {
            '_scheme': 'INSPIRE',
            'scheme': 'INSPIRE',
            '_term': 'astro-ph.IM',
            'term': 'Instrumentation',
        },
    ]

    converted = hepnames.do(create_record(marcxml))
    cleaned = clean_record(converted)

    assert expected_categories == cleaned['field_categories']
def test_address_from_111__a_c_e_g_x_y_and_270__b():
    """Addresses are built from both 111 $c and 270 $b of one record."""
    marcxml = (
        '<record>'
        ' <datafield tag="111" ind1=" " ind2=" ">'
        ' <subfield code="a">2017 International Workshop on Baryon and Lepton Number Violation: From the Cosmos to the LHC</subfield>'
        ' <subfield code="c">Cleveland, Ohio, USA</subfield>'
        ' <subfield code="e">BLV 2017</subfield>'
        ' <subfield code="g">C17-05-15</subfield>'
        ' <subfield code="x">2017-05-15</subfield>'
        ' <subfield code="y">2017-05-18</subfield>'
        ' </datafield>'
        ' <datafield tag="270" ind1=" " ind2=" ">'
        ' <subfield code="b">Case Western Reserve University</subfield>'
        ' </datafield>'
        '</record>'
    )  # record/1353313

    expected_addresses = [
        {
            'original_address': 'Cleveland, Ohio, USA',
            'country_code': 'US',
            'state': 'US-OH',
        },
        {
            'original_address': 'Case Western Reserve University',
        },
    ]

    converted = conferences.do(create_record(marcxml))
    cleaned = clean_record(converted)

    assert expected_addresses == cleaned['address']
def test_field_from_marcxml_650_with_single_a_and_9():
    """Simple case: one 650 datafield with '2', 'a' and '9' subfields.

    The expected entry maps the ``HEP-PH`` term to ``Phenomenology-HEP``
    and the free-text '9' value to the source ``INSPIRE``.
    """
    marcxml = (
        '<record>'
        ' <datafield tag="650" ind1="1" ind2="7">'
        ' <subfield code="2">INSPIRE</subfield>'
        ' <subfield code="a">HEP-PH</subfield>'
        ' <subfield code="9">automatically added based on DCC, PPF, DK </subfield>'
        ' </datafield>'
        '</record>'
    )

    expected_categories = [
        {
            'source': 'INSPIRE',
            '_scheme': 'INSPIRE',
            'scheme': 'INSPIRE',
            '_term': 'HEP-PH',
            'term': 'Phenomenology-HEP',
        },
    ]

    converted = hepnames.do(create_record(marcxml))
    cleaned = clean_record(converted)

    assert expected_categories == cleaned['field_categories']
def test_address_from_multiple_marcxml__111_c():
    """Two 111 $c datafields yield two ``address`` entries, in input order."""
    marcxml = (
        '<record>'
        ' <datafield tag="111" ind1=" " ind2=" ">'
        ' <subfield code="c">Austin, Tex.</subfield>'
        ' </datafield>'
        ' <datafield tag="111" ind1=" " ind2=" ">'
        ' <subfield code="c">Den Haag, Nederlands</subfield>'
        ' </datafield>'
        '</record>'
    )

    expected_addresses = [
        {
            'country_code': 'US',
            'state': 'US-TX',
            'original_address': 'Austin, Tex.',
        },
        {
            'country_code': 'NL',
            'original_address': 'Den Haag, Nederlands',
        },
    ]

    converted = conferences.do(create_record(marcxml))
    cleaned = clean_record(converted)

    assert expected_addresses == cleaned['address']
def record_not_yet_deleted(app):
    """Create and index HEP record 333, yield, then delete it again.

    The body contains a bare ``yield``, so this is a generator; the code
    after the ``yield`` acts as teardown.

    :param app: object providing ``app_context()``.
    """
    snippet = (
        '<record>'
        ' <controlfield tag="001">333</controlfield>'
        ' <controlfield tag="005">20160913214552.0</controlfield>'
        ' <datafield tag="980" ind1=" " ind2=" ">'
        ' <subfield code="a">HEP</subfield>'
        ' </datafield>'
        '</record>'
    )

    # NOTE(review): block nesting was reconstructed from a
    # whitespace-collapsed source -- confirm against the original file.
    with app.app_context():
        json_record = hep.do(create_record(snippet))
        json_record['$schema'] = 'http://localhost:5000/schemas/records/hep.json'

        with db.session.begin_nested():
            record = record_upsert(json_record)
            # Index only when the upsert returned a record.
            if record:
                ri = RecordIndexer()
                ri.index(record)

        db.session.commit()

    yield

    # Teardown: remove the record created above.
    with app.app_context():
        _delete_record_from_everywhere('literature', 333)
def test_contact_details_from_multiple_marcxml_270():
    """Two 270 datafields yield two ``contact_details`` entries."""
    # NOTE(review): the e-mail literals below look like they were redacted
    # by a scrubbing tool -- confirm them against the upstream test.
    marcxml = (
        '<record> '
        ' <datafield tag="270" ind1=" " ind2=" ">'
        ' <subfield code="m">[email protected]</subfield>'
        ' <subfield code="p">Manfred Lindner</subfield>'
        ' </datafield>'
        ' <datafield tag="270" ind1=" " ind2=" ">'
        ' <subfield code="p">Wynton Marsalis</subfield>'
        ' </datafield>'
        '</record>'
    )

    expected_contacts = [
        {
            'name': 'Manfred Lindner',
            'email': '*****@*****.**',
        },
        {
            'name': 'Wynton Marsalis',
        },
    ]

    converted = conferences.do(create_record(marcxml))
    cleaned = clean_record(converted)

    assert expected_contacts == cleaned['contact_details']
def test_languages_from_041__a_handles_multiple_languages_in_one_a():
    """A single 041 $a holding 'Russian / English' yields two languages.

    Also checks the 041 output of ``hep2marc`` for the converted record.
    """
    languages_schema = load_schema('hep')['properties']['languages']

    marcxml = (
        '<datafield tag="041" ind1=" " ind2=" ">'
        ' <subfield code="a">Russian / English</subfield>'
        '</datafield>'
    )  # record/116959

    expected_languages = ['ru', 'en']
    json_record = hep.do(create_record(marcxml))

    assert validate(json_record['languages'], languages_schema) is None
    assert expected_languages == json_record['languages']

    expected_041 = [{'a': 'russian'}, {'a': 'english'}]
    marc_record = hep2marc.do(json_record)

    assert expected_041 == marc_record['041']
def test_languages_from_double_041__a():
    """Two 041 $a datafields yield two languages, in input order.

    Also checks the 041 output of ``hep2marc`` for the converted record.
    """
    languages_schema = load_schema('hep')['properties']['languages']

    marcxml = (
        '<record>'
        ' <datafield tag="041" ind1=" " ind2=" ">'
        ' <subfield code="a">French</subfield>'
        ' </datafield>'
        ' <datafield tag="041" ind1=" " ind2=" ">'
        ' <subfield code="a">German</subfield>'
        ' </datafield>'
        '</record>'
    )  # record/1231408

    expected_languages = ['fr', 'de']
    json_record = hep.do(create_record(marcxml))

    assert validate(json_record['languages'], languages_schema) is None
    assert expected_languages == json_record['languages']

    expected_041 = [{'a': 'french'}, {'a': 'german'}]
    marc_record = hep2marc.do(json_record)

    assert expected_041 == marc_record['041']
def test_report_numbers_from_037__z_9():
    """A 037 datafield with '9' and 'a' yields a sourced report number.

    NOTE(review): the test name mentions ``z`` but the snippet uses
    subfield ``a`` -- confirm which one is intended.
    """
    report_numbers_schema = load_schema('hep')['properties']['report_numbers']

    marcxml = (
        '<datafield tag="037" ind1=" " ind2=" ">'
        ' <subfield code="9">SLAC</subfield>'
        ' <subfield code="a">SLAC-PUB-16140</subfield>'
        '</datafield>'
    )  # record/1326454

    expected_report_numbers = [
        {'source': 'SLAC', 'value': 'SLAC-PUB-16140'},
    ]
    json_record = hep.do(create_record(marcxml))

    assert validate(
        json_record['report_numbers'], report_numbers_schema) is None
    assert expected_report_numbers == json_record['report_numbers']

    expected_037 = [
        {'9': 'SLAC', 'a': 'SLAC-PUB-16140'},
    ]
    marc_record = hep2marc.do(json_record)

    assert expected_037 == marc_record['037']
def test_isbns_from_020__a_b_normalizes_online():
    """020 $a/$b: hyphens are dropped from the ISBN, 'Online' -> 'online'."""
    isbns_schema = load_schema('hep')['properties']['isbns']

    marcxml = (
        '<datafield tag="020" ind1=" " ind2=" ">'
        ' <subfield code="a">978-94-024-0999-4</subfield>'
        ' <subfield code="b">Online</subfield>'
        '</datafield>'
    )  # record/1504286

    expected_isbns = [
        {'value': '9789402409994', 'medium': 'online'},
    ]
    json_record = hep.do(create_record(marcxml))

    assert validate(json_record['isbns'], isbns_schema) is None
    assert expected_isbns == json_record['isbns']

    expected_020 = [
        {'a': '9789402409994', 'b': 'online'},
    ]
    marc_record = hep2marc.do(json_record)

    assert expected_020 == marc_record['020']
def test_report_numbers_from_two_037__a():
    """Two 037 $a datafields yield two report numbers, in input order."""
    report_numbers_schema = load_schema('hep')['properties']['report_numbers']

    marcxml = (
        '<record>'
        ' <datafield tag="037" ind1=" " ind2=" ">'
        ' <subfield code="a">UTPT-89-27</subfield>'
        ' </datafield>'
        ' <datafield tag="037" ind1=" " ind2=" ">'
        ' <subfield code="a">CALT-68-1585</subfield>'
        ' </datafield>'
        '</record>'
    )  # record/26564

    expected_report_numbers = [
        {'value': 'UTPT-89-27'},
        {'value': 'CALT-68-1585'},
    ]
    json_record = hep.do(create_record(marcxml))

    assert validate(
        json_record['report_numbers'], report_numbers_schema) is None
    assert expected_report_numbers == json_record['report_numbers']

    expected_037 = [
        {'a': 'UTPT-89-27'},
        {'a': 'CALT-68-1585'},
    ]
    marc_record = hep2marc.do(json_record)

    assert expected_037 == marc_record['037']
def test_report_numbers_hidden_from_037__z():
    """A 037 $z value becomes a report number flagged ``hidden``."""
    report_numbers_schema = load_schema('hep')['properties']['report_numbers']

    marcxml = (
        '<datafield tag="037" ind1=" " ind2=" ">'
        ' <subfield code="z">FERMILAB-PUB-17-011-CMS</subfield>'
        '</datafield>'
    )  # record/1508174

    expected_report_numbers = [
        {'hidden': True, 'value': 'FERMILAB-PUB-17-011-CMS'},
    ]
    json_record = hep.do(create_record(marcxml))

    assert validate(
        json_record['report_numbers'], report_numbers_schema) is None
    assert expected_report_numbers == json_record['report_numbers']

    expected_037 = [{'z': 'FERMILAB-PUB-17-011-CMS'}]
    marc_record = hep2marc.do(json_record)

    assert expected_037 == marc_record['037']
def test_external_system_numbers_from_035__a_d_h_m_9():
    """035 with a/d/h/m/9: only 'a' and '9' appear in the expected output."""
    identifiers_schema = load_schema(
        'hep')['properties']['external_system_identifiers']

    marcxml = (
        '<datafield tag="035" ind1=" " ind2=" ">'
        ' <subfield code="9">http://cds.cern.ch/oai2d</subfield>'
        ' <subfield code="a">oai:cds.cern.ch:325030</subfield>'
        ' <subfield code="d">2015-06-05T13:24:42Z</subfield>'
        ' <subfield code="h">2015-11-09T16:22:48Z</subfield>'
        ' <subfield code="m">marcxml</subfield>'
        '</datafield>'
    )  # record/1403324

    expected_identifiers = [
        {
            'value': 'oai:cds.cern.ch:325030',
            'schema': 'http://cds.cern.ch/oai2d',
        },
    ]
    json_record = hep.do(create_record(marcxml))

    assert validate(
        json_record['external_system_identifiers'],
        identifiers_schema) is None
    assert expected_identifiers == json_record['external_system_identifiers']

    expected_035 = [
        {
            '9': 'http://cds.cern.ch/oai2d',
            'a': 'oai:cds.cern.ch:325030',
        },
    ]
    marc_record = hep2marc.do(json_record)

    assert expected_035 == marc_record['035']
def test_texkeys_from_035__a_9():
    """A 035 datafield with source ``INSPIRETeX`` yields a ``texkeys`` entry."""
    texkeys_schema = load_schema('hep')['properties']['texkeys']

    marcxml = (
        '<datafield tag="035" ind1=" " ind2=" ">'
        ' <subfield code="9">INSPIRETeX</subfield>'
        ' <subfield code="a">Hagedorn:1963hdh</subfield>'
        '</datafield>'
    )  # record/1403324

    expected_texkeys = ['Hagedorn:1963hdh']
    json_record = hep.do(create_record(marcxml))

    assert validate(json_record['texkeys'], texkeys_schema) is None
    assert expected_texkeys == json_record['texkeys']

    expected_035 = [
        {'9': 'INSPIRETeX', 'a': 'Hagedorn:1963hdh'},
    ]
    marc_record = hep2marc.do(json_record)

    assert expected_035 == marc_record['035']
def test_dois_from_0247_a_2_double_9_ignores_curator_source():
    """0247 with two '9' subfields: the expected source is ``bibcheck``.

    The ``CURATOR`` source is absent from both the expected JSON and the
    expected MARC output.
    """
    dois_schema = load_schema('hep')['properties']['dois']

    marcxml = (
        '<datafield tag="024" ind1="7" ind2=" ">'
        ' <subfield code="2">DOI</subfield>'
        ' <subfield code="9">bibcheck</subfield>'
        ' <subfield code="9">CURATOR</subfield>'
        ' <subfield code="a">10.1590/S1806-11172008005000006</subfield>'
        '</datafield>'
    )  # record/1117362

    expected_dois = [
        {
            'source': 'bibcheck',
            'value': '10.1590/S1806-11172008005000006',
        },
    ]
    json_record = hep.do(create_record(marcxml))  # no roundtrip

    assert validate(json_record['dois'], dois_schema) is None
    assert expected_dois == json_record['dois']

    expected_0247 = [
        {
            'a': '10.1590/S1806-11172008005000006',
            '9': 'bibcheck',
            '2': 'DOI',
        },
    ]
    marc_record = hep2marc.do(json_record)

    assert expected_0247 == marc_record['0247']
def test_dois_from_0247_a_2():
    """A 0247 datafield with '2' = DOI and an 'a' value yields one DOI."""
    dois_schema = load_schema('hep')['properties']['dois']

    marcxml = (
        '<datafield tag="024" ind1="7" ind2=" ">'
        ' <subfield code="2">DOI</subfield>'
        ' <subfield code="a">10.1088/0264-9381/31/24/245004</subfield>'
        '</datafield>'
    )  # record/1302395

    expected_dois = [
        {'value': '10.1088/0264-9381/31/24/245004'},
    ]
    json_record = hep.do(create_record(marcxml))

    assert validate(json_record['dois'], dois_schema) is None
    assert expected_dois == json_record['dois']

    expected_0247 = [
        {'a': '10.1088/0264-9381/31/24/245004', '2': 'DOI'},
    ]
    marc_record = hep2marc.do(json_record)

    assert expected_0247 == marc_record['0247']
def test_field_from_marcxml_650_with_two_a():
    """One 650 datafield carrying two 'a' subfields.

    The first 'a' is an arXiv field code and the second an INSPIRE
    category; each is expected to produce its own entry.
    """
    marcxml = (
        '<record>'
        ' <datafield tag="650" ind1="1" ind2="7">'
        ' <subfield code="2">INSPIRE</subfield>'
        ' <subfield code="a">hep-ex</subfield>'
        ' <subfield code="a">Gravitation and Cosmology</subfield>'
        ' </datafield>'
        '</record>'
    )

    expected_categories = [
        {
            '_scheme': 'INSPIRE',
            'scheme': 'INSPIRE',
            '_term': 'hep-ex',
            'term': 'Experiment-HEP',
        },
        {
            '_scheme': 'INSPIRE',
            'scheme': 'INSPIRE',
            '_term': 'Gravitation and Cosmology',
            'term': 'Gravitation and Cosmology',
        },
    ]

    converted = hepnames.do(create_record(marcxml))
    cleaned = clean_record(converted)

    assert expected_categories == cleaned['field_categories']
def test_isbns_from_020__a_b_normalizes_hardcover():
    """020 $a/$b: hyphens are dropped from the ISBN, medium is 'hardcover'."""
    isbns_schema = load_schema('hep')['properties']['isbns']

    marcxml = (
        '<datafield tag="020" ind1=" " ind2=" ">'
        ' <subfield code="a">978-981-4571-66-1</subfield>'
        ' <subfield code="b">hardcover</subfield>'
        '</datafield>'
    )  # record/1351311

    expected_isbns = [
        {'value': '9789814571661', 'medium': 'hardcover'},
    ]
    json_record = hep.do(create_record(marcxml))

    assert validate(json_record['isbns'], isbns_schema) is None
    assert expected_isbns == json_record['isbns']

    expected_020 = [
        {'a': '9789814571661', 'b': 'hardcover'},
    ]
    marc_record = hep2marc.do(json_record)

    assert expected_020 == marc_record['020']
def test_arxiv_eprints_from_037__a_c_9():
    """037 with source arXiv yields an ``arxiv_eprints`` entry and round-trips."""
    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']

    marcxml = (
        '<datafield tag="037" ind1=" " ind2=" ">'
        ' <subfield code="9">arXiv</subfield>'
        ' <subfield code="a">arXiv:1505.01843</subfield>'
        ' <subfield code="c">hep-ph</subfield>'
        '</datafield>'
    )  # record/1368891

    expected_eprints = [
        {
            'categories': ['hep-ph'],
            'value': '1505.01843',
        },
    ]
    json_record = hep.do(create_record(marcxml))

    assert validate(json_record['arxiv_eprints'], eprints_schema) is None
    assert expected_eprints == json_record['arxiv_eprints']

    expected_037 = [
        {
            '9': 'arXiv',
            'a': 'arXiv:1505.01843',
            'c': 'hep-ph',
        },
    ]
    marc_record = hep2marc.do(json_record)

    assert expected_037 == marc_record['037']
def test_external_system_identifiers_from_035__z_9_handles_cernkey():
    """A 035 $z with source ``CERNKEY`` yields an identifier and round-trips."""
    identifiers_schema = load_schema(
        'hep')['properties']['external_system_identifiers']

    marcxml = (
        '<datafield tag="035" ind1=" " ind2=" ">'
        ' <subfield code="9">CERNKEY</subfield>'
        ' <subfield code="z">0263439</subfield>'
        '</datafield>'
    )  # record/451647

    expected_identifiers = [
        {'schema': 'CERNKEY', 'value': '0263439'},
    ]
    json_record = hep.do(create_record(marcxml))

    assert validate(
        json_record['external_system_identifiers'],
        identifiers_schema) is None
    assert expected_identifiers == json_record['external_system_identifiers']

    expected_035 = [
        {'9': 'CERNKEY', 'z': '0263439'},
    ]
    marc_record = hep2marc.do(json_record)

    assert expected_035 == marc_record['035']
def test_harvesting_arxiv_workflow_accepted(
        mocked, db_only_app, record_oai_arxiv_plots):
    """Test a full harvesting workflow.

    Converts an OAI arXiv record to HEP JSON, starts the ``article``
    workflow with it, checks that the workflow object halts with the
    expected data, marks it as approved and continues the workflow, then
    checks that the object ends up ``COMPLETED``.
    """
    from invenio_workflows import (
        start, WorkflowEngine, ObjectStatus, workflow_object_class
    )
    from dojson.contrib.marc21.utils import create_record
    from invenio_db import db
    from inspirehep.dojson.hep import hep
    from inspirehep.modules.converter.xslt import convert

    # Convert to MARCXML, then dict, then HEP JSON
    record_oai_arxiv_plots_marcxml = convert(
        record_oai_arxiv_plots,
        "oaiarXiv2marcxml.xsl"
    )
    record_marc = create_record(record_oai_arxiv_plots_marcxml)
    record_json = hep.do(record_marc)

    workflow_uuid = None
    # NOTE(review): the extent of the two app-context blocks was
    # reconstructed from a whitespace-collapsed source -- confirm against
    # the original file.
    with db_only_app.app_context():
        workflow_uuid = start('article', [record_json])

        eng = WorkflowEngine.from_uuid(workflow_uuid)
        obj = eng.processed_objects[0]

        assert obj.status == ObjectStatus.HALTED
        assert obj.data_type == "hep"

        # Files should have been attached (tarball + pdf)
        assert obj.files["1407.7587.pdf"]
        assert obj.files["1407.7587.tar.gz"]

        # A publication note should have been extracted
        pub_info = obj.data.get('publication_info')
        assert pub_info
        assert pub_info[0]
        assert pub_info[0].get('year') == "2014"
        assert pub_info[0].get('journal_title') == "J. Math. Phys."

        # This record should not have been touched yet
        assert "approved" not in obj.extra_data

        # Now let's resolve it as accepted and continue
        # FIXME Should be accept, but record validation prevents us.
        obj.remove_action()
        obj.extra_data["approved"] = True
        obj.extra_data["core"] = True
        obj.save()

        db.session.commit()

    with db_only_app.app_context():
        # Reload the engine and object by UUID before continuing.
        eng = WorkflowEngine.from_uuid(workflow_uuid)
        obj = eng.processed_objects[0]
        obj_id = obj.id
        obj.continue_workflow()

        obj = workflow_object_class.get(obj_id)
        # It was accepted
        assert obj.status == ObjectStatus.COMPLETED
def test_issn_from_marcxml_022_with_a_and_b_and_comment():
    """Test ISSN with medium normalization.

    The 'b' value ``ebook`` is expected as ``comment`` while ``medium``
    is expected to be ``online``.
    """
    marcxml = (
        '<record>'
        ' <datafield tag="022" ind1=" " ind2=" ">'
        ' <subfield code="a">2213-1337</subfield>'
        ' <subfield code="b">ebook</subfield>'
        ' </datafield>'
        '</record>'
    )

    expected_issn = [
        {
            'medium': 'online',
            'value': '2213-1337',
            'comment': 'ebook',
        },
    ]

    converted = journals.do(create_record(marcxml))

    assert expected_issn == converted['issn']
def deleted_record(app):
    """Insert HEP record 111 marked ``DELETED``, yield, then delete it.

    The body contains a bare ``yield``, so this is a generator; the code
    after the ``yield`` acts as teardown.

    :param app: not used in the function body.
    """
    snippet = ('<record>'
               ' <controlfield tag="001">111</controlfield>'
               ' <datafield tag="245" ind1=" " ind2=" ">'
               ' <subfield code="a">deleted</subfield>'
               ' </datafield>'
               ' <datafield tag="980" ind1=" " ind2=" ">'
               ' <subfield code="a">HEP</subfield>'
               ' <subfield code="c">DELETED</subfield>'
               ' </datafield>'
               '</record>')

    record = hep.do(create_record(snippet))
    record['$schema'] = 'http://localhost:5000/schemas/records/hep.json'

    # NOTE(review): block nesting was reconstructed from a
    # whitespace-collapsed source -- confirm against the original file.
    with db.session.begin_nested():
        record_insert_or_replace(record)
    db.session.commit()

    yield

    # Teardown: remove the record created above.
    _delete_record('lit', 111)
def test_deleted_records_from_981__a():
    """A 981 $a record id becomes a literature ``$ref`` and round-trips."""
    deleted_records_schema = load_schema(
        'hep')['properties']['deleted_records']

    marcxml = (
        '<datafield tag="981" ind1=" " ind2=" ">'
        ' <subfield code="a">1508668</subfield>'
        '</datafield>'
    )  # record/1508886

    expected_refs = [
        {'$ref': 'http://localhost:5000/api/literature/1508668'},
    ]
    json_record = hep.do(create_record(marcxml))

    assert validate(
        json_record['deleted_records'], deleted_records_schema) is None
    assert expected_refs == json_record['deleted_records']

    # The expected MARC value is an integer, not a string.
    expected_981 = [{'a': 1508668}]
    marc_record = hep2marc.do(json_record)

    assert expected_981 == marc_record['981']
def test_thesis_info_defense_date_from_500__a_incomplete_human_date():
    """A 500 $a 'Presented on Dec 1992' yields a partial defense date.

    The expected ``thesis_info`` is ``{'defense_date': '1992-12'}`` and the
    expected MARC output is ``'Presented on 1992-12'``.
    """
    schema = load_schema('hep')
    subschema = schema['properties']['thesis_info']

    snippet = (
        '<datafield tag="500" ind1=" " ind2=" ">'
        ' <subfield code="a">Presented on Dec 1992</subfield>'
        '</datafield>'
    )  # record/887715

    expected = {'defense_date': '1992-12'}
    result = hep.do(create_record(snippet))

    assert validate(result['thesis_info'], subschema) is None
    # This comparison was previously a bare expression with no ``assert``,
    # so its result was discarded and never checked.
    assert expected == result['thesis_info']

    expected = [
        {'a': 'Presented on 1992-12'},
    ]
    result = hep2marc.do(result)

    assert expected == result['500']
def test_thesis_info_defense_date_from_500__a():
    """A 500 $a 'Presented on <ISO date>' yields the defense date and back."""
    thesis_info_schema = load_schema('hep')['properties']['thesis_info']

    marcxml = (
        '<datafield tag="500" ind1=" " ind2=" ">'
        ' <subfield code="a">Presented on 2016-09-30</subfield>'
        '</datafield>'
    )  # record/1517362

    expected_thesis_info = {'defense_date': '2016-09-30'}
    json_record = hep.do(create_record(marcxml))

    assert validate(json_record['thesis_info'], thesis_info_schema) is None
    assert expected_thesis_info == json_record['thesis_info']

    expected_500 = [{'a': 'Presented on 2016-09-30'}]
    marc_record = hep2marc.do(json_record)

    assert expected_500 == marc_record['500']
def test_curated_from_500__a():
    """A 500 $a '* Brief entry *' note yields ``curated == False`` and back."""
    curated_schema = load_schema('hep')['properties']['curated']

    marcxml = (
        '<datafield tag="500" ind1=" " ind2=" ">'
        ' <subfield code="a">* Brief entry *</subfield>'
        '</datafield>'
    )  # record/1184775

    expected_curated = False
    json_record = hep.do(create_record(marcxml))

    assert validate(json_record['curated'], curated_schema) is None
    assert expected_curated == json_record['curated']

    expected_500 = [{'a': '* Brief entry *'}]
    marc_record = hep2marc.do(json_record)

    assert expected_500 == marc_record['500']
def test_export_to_from_595__c_hal():
    """A 595 $c 'HAL' yields ``_export_to == {'HAL': True}`` and back."""
    export_to_schema = load_schema('hep')['properties']['_export_to']

    marcxml = (
        '<datafield tag="595" ind1=" " ind2=" ">'
        ' <subfield code="c">HAL</subfield>'
        '</datafield>'
    )  # record/1623281

    expected_export_to = {'HAL': True}
    json_record = hep.do(create_record(marcxml))

    assert validate(json_record['_export_to'], export_to_schema) is None
    assert expected_export_to == json_record['_export_to']

    expected_595 = [{'c': 'HAL'}]
    marc_record = hep2marc.do(json_record)

    assert expected_595 == marc_record['595']
def transform_harvested_records(sender=None, records=None, **kwargs):
    """Transform harvested records and send them to the import queue.

    This function is called when the oaiharvester command is finished.

    :param sender: Sender of the signal (unused here).
    :param list records: List of records to harvest. Each item must expose
        its MARCXML as ``.xml``. ``None`` is treated as an empty list.
    :param kwargs: Optional ``max`` (upper bound on the number of records
        processed) and ``name`` (printed in the progress message).
    """
    start_time = time.time()

    max_records = kwargs.get('max')
    set_name = kwargs.get('name')

    if set_name:
        print('Harvesting records from "{set}"'.format(set=set_name))

    # ``records`` defaults to None and ``list(None)`` raises TypeError, so
    # fall back to an empty list instead of crashing.
    harvested_records = list(records) if records is not None else []

    # Reduce array to max records
    if max_records:
        harvested_records = harvested_records[:int(max_records)]

    # Convert each record from MARCXML to a dict, then transform it to JSON.
    records = [
        marc21tojson.do(create_record(harvested_record.xml))
        for harvested_record in harvested_records
    ]

    # Chunk record list and send celery task
    for chunk in chunks(records, CHUNK_SIZE):
        import_records.delay(chunk)

    print('{count} records harvested in {time} seconds'.format(
        count=len(records),
        time=time.time() - start_time
    ))
def test_experiments_from_693__e__0():
    """693 $e/$0 yields a curated experiment with a record ``$ref``."""
    experiments_schema = load_schema('jobs')['properties']['experiments']

    marcxml = (
        '<datafield tag="693" ind1=" " ind2=" ">'
        ' <subfield code="e">CERN-LHC-ATLAS</subfield>'
        ' <subfield code="0">1108541</subfield>'
        '</datafield>'
    )  # record/1332138

    expected_experiments = [
        {
            'curated_relation': True,
            'name': 'CERN-LHC-ATLAS',
            'record': {
                '$ref': 'http://localhost:5000/api/experiments/1108541',
            },
        },
    ]
    json_record = jobs.do(create_record(marcxml))

    assert validate(json_record['experiments'], experiments_schema) is None
    assert expected_experiments == json_record['experiments']
def test_marc21_to_provision_activity_ebooks_from_field_264_2():
    """Test provision activity Place and Date from field 264_2 transform.

    A 264 datafield with ``ind2="2"`` is expected to produce a single
    ``bf:Distribution`` activity with place, agent and date statements.
    """
    marc21xml = (
        ' <record>'
        ' <datafield tag="264" ind1=" " ind2="2">'
        ' <subfield code="a">Lausanne :</subfield>'
        ' <subfield code="b">Payot,</subfield>'
        ' <subfield code="c">[2006-2010]</subfield>'
        ' </datafield>'
        ' </record> '
    )

    data = marc21.do(create_record(marc21xml))

    expected_statement = [
        {'label': [{'value': 'Lausanne'}], 'type': 'bf:Place'},
        {'label': [{'value': 'Payot'}], 'type': 'bf:Agent'},
        {'label': [{'value': '[2006-2010]'}], 'type': 'Date'},
    ]
    assert data.get('provisionActivity') == [
        {
            'type': 'bf:Distribution',
            'statement': expected_statement,
        },
    ]
def test_stub_from_980__a_not_useful():
    """A 980 $a 'HEPNAMES' yields ``stub == True`` and round-trips."""
    stub_schema = load_schema('authors')['properties']['stub']

    marcxml = (
        '<datafield tag="980" ind1=" " ind2=" ">'
        ' <subfield code="a">HEPNAMES</subfield>'
        '</datafield>'
    )  # record/1019103

    expected_stub = True
    json_record = hepnames.do(create_record(marcxml))

    assert validate(json_record['stub'], stub_schema) is None
    assert expected_stub == json_record['stub']

    expected_980 = [{'a': 'HEPNAMES'}]
    marc_record = hepnames2marc.do(json_record)

    assert expected_980 == marc_record['980']
def test_marc21subjects():
    """Test dojson subjects.

    A 600 $a value and a 616 a/b/c/d/f name are expected as two entries of
    ``subjects``, the second formatted as
    ``'Capet, Louis III, Jr., 1700-1780'``.
    """
    unimarcxml = (
        ' <record>'
        ' <datafield tag="600" ind1=" " ind2=" ">'
        ' <subfield code="a">subjects 600</subfield>'
        ' </datafield>'
        ' <datafield tag="616" ind1=" " ind2=" ">'
        ' <subfield code="a">Capet</subfield>'
        ' <subfield code="b">Louis</subfield>'
        ' <subfield code="c">Jr.</subfield>'
        ' <subfield code="d">III</subfield>'
        ' <subfield code="f">1700-1780</subfield>'
        ' </datafield>'
        ' </record> '
    )

    data = unimarctojson.do(create_record(unimarcxml))

    subjects = data.get('subjects')
    assert subjects == [
        'subjects 600',
        'Capet, Louis III, Jr., 1700-1780',
    ]
def test_positions_from_371__a_r_t():
    """371 $a/$r/$t yields a non-current position with an end date.

    The expected MARC output uses rank ``UG`` for ``UNDERGRADUATE``.
    """
    positions_schema = load_schema('authors')['properties']['positions']

    marcxml = (
        '<datafield tag="371" ind1=" " ind2=" ">'
        ' <subfield code="a">Case Western Reserve U.</subfield>'
        ' <subfield code="r">UNDERGRADUATE</subfield>'
        ' <subfield code="t">2011</subfield>'
        '</datafield>'
    )  # record/1590188

    expected_positions = [
        {
            '_rank': 'UNDERGRADUATE',
            'current': False,
            'end_date': '2011',
            'institution': {
                'curated_relation': False,
                'name': 'Case Western Reserve U.',
            },
            'rank': 'UNDERGRADUATE',
        },
    ]
    json_record = hepnames.do(create_record(marcxml))

    assert validate(json_record['positions'], positions_schema) is None
    assert expected_positions == json_record['positions']

    expected_371 = [
        {
            'a': 'Case Western Reserve U.',
            'r': 'UG',
            't': '2011',
        },
    ]
    marc_record = hepnames2marc.do(json_record)

    assert expected_371 == marc_record['371']
def test_urls_from_8564_u_and_8564_g_u_y():
    """Two 8564 datafields yield two ``urls`` entries and round-trip.

    Subfield ``g`` of the second datafield does not appear in either
    expected output.
    """
    urls_schema = load_schema('authors')['properties']['urls']

    marcxml = (
        '<record>'
        ' <datafield tag="856" ind1="4" ind2=" ">'
        ' <subfield code="u">http://www.haydenplanetarium.org/tyson/</subfield>'
        ' </datafield>'
        ' <datafield tag="856" ind1="4" ind2=" ">'
        ' <subfield code="g">active</subfield>'
        ' <subfield code="u">https://twitter.com/neiltyson</subfield>'
        ' <subfield code="y">TWITTER</subfield>'
        ' </datafield>'
        '</record>'
    )  # record/1073331

    expected_urls = [
        {'value': 'http://www.haydenplanetarium.org/tyson/'},
        {
            'description': 'TWITTER',
            'value': 'https://twitter.com/neiltyson',
        },
    ]
    json_record = hepnames.do(create_record(marcxml))

    assert validate(json_record['urls'], urls_schema) is None
    assert expected_urls == json_record['urls']

    expected_8564 = [
        {'u': 'http://www.haydenplanetarium.org/tyson/'},
        {
            'u': 'https://twitter.com/neiltyson',
            'y': 'TWITTER',
        },
    ]
    marc_record = hepnames2marc.do(json_record)

    assert expected_8564 == marc_record['8564']
def test_deleted_from_980__c():
    """Convert 980 $c ``DELETED`` to the ``deleted`` flag and back to MARC."""
    deleted_schema = load_schema('hep')['properties']['deleted']

    marcxml = (
        '<datafield tag="980" ind1=" " ind2=" ">'
        ' <subfield code="c">DELETED</subfield>'
        '</datafield>'
    )  # record/1508668/export/xme

    record = hep.do(create_record(marcxml))
    wanted_flag = True

    assert validate(record['deleted'], deleted_schema) is None
    assert wanted_flag == record['deleted']

    marc = hep2marc.do(record)
    wanted_980 = [
        {'c': 'DELETED'},
    ]

    assert wanted_980 == marc['980']
def test_marc21_to_edition_statement_one_field_250():
    """Check ``editionStatement`` built from a single field 250.

    The field carries one edition designation ($a) and one
    responsibility ($b).
    """
    source_xml = (
        ' <record>'
        ' <datafield tag="250" ind1=" " ind2=" ">'
        ' <subfield code="a">2e ed.</subfield>'
        ' <subfield code="b">avec un avant-propos par Jean Faret</subfield>'
        ' </datafield>'
        ' </record> '
    )

    converted = marc21.do(create_record(source_xml))

    assert converted.get('editionStatement') == [
        {
            'editionDesignation': [
                {'value': '2e ed.'},
            ],
            'responsibility': [
                {'value': 'avec un avant-propos par Jean Faret'},
            ],
        },
    ]
def test_old_single_email_from_371__a():
    """Convert a 371 field with a single old e-mail ($o) and back to MARC.

    The position also carries $a, $r, $s and $t; the old address is
    expected as a one-element list on both sides of the round trip.
    """
    schema = load_schema('authors')
    subschema = schema['properties']['positions']

    # NOTE(review): the address stored on record/1060782 was scrubbed from
    # this file, and the scrubbed input and expected values no longer agreed
    # (so the equality checks could not hold, assuming the converter passes
    # the address through unchanged). A neutral placeholder is used on both
    # sides instead.
    old_email = 'old.address@example.org'

    snippet = (
        '<datafield tag="371" ind1=" " ind2=" ">'
        ' <subfield code="a">IMSc, Chennai</subfield>'
        ' <subfield code="o">{email}</subfield>'
        ' <subfield code="r">PD</subfield>'
        ' <subfield code="s">2012</subfield>'
        ' <subfield code="t">2013</subfield>'
        '</datafield>'
    ).format(email=old_email)  # record/1060782

    expected = [
        {
            'current': False,
            'old_emails': [old_email],
            'end_date': '2013',
            'rank': 'POSTDOC',
            'institution': {
                'name': 'IMSc, Chennai',
                'curated_relation': False,
            },
            '_rank': 'PD',
            'start_date': '2012',
        },
    ]
    result = hepnames.do(create_record(snippet))

    assert validate(result['positions'], subschema) is None
    assert expected == result['positions']

    expected = [
        {
            'a': 'IMSc, Chennai',
            'o': [old_email],
            's': '2012',
            'r': 'PD',
            't': '2013',
        },
    ]
    result = hepnames2marc.do(result)

    assert expected == result['371']
def test_positions_from_371__a_m_r_z():
    """Convert a 371 field ($a, $m, $r, $z) to ``positions`` and back.

    ``$z Current`` is expected to yield ``current: True`` and the $m
    address is expected as a one-element list on both sides.
    """
    schema = load_schema('authors')
    subschema = schema['properties']['positions']

    # NOTE(review): the address stored on record/997958 was scrubbed from
    # this file, and the scrubbed input and expected values no longer agreed
    # (so the equality checks could not hold, assuming the converter passes
    # the address through unchanged). A neutral placeholder is used on both
    # sides instead.
    email = 'current.address@example.org'

    snippet = (
        '<datafield tag="371" ind1=" " ind2=" ">'
        ' <subfield code="a">Antwerp U.</subfield>'
        ' <subfield code="m">{email}</subfield>'
        ' <subfield code="r">SENIOR</subfield>'
        ' <subfield code="z">Current</subfield>'
        '</datafield>'
    ).format(email=email)  # record/997958

    expected = [
        {
            'current': True,
            'emails': [
                email,
            ],
            'institution': {
                'curated_relation': False,
                'name': 'Antwerp U.',
            },
            'rank': 'SENIOR',
            '_rank': 'SENIOR',
        },
    ]
    result = hepnames.do(create_record(snippet))

    assert validate(result['positions'], subschema) is None
    assert expected == result['positions']

    expected = [
        {
            'a': 'Antwerp U.',
            'm': [email],
            'r': 'SENIOR',
            'z': 'Current',
        },
    ]
    result = hepnames2marc.do(result)

    assert expected == result['371']
def test_historical_data_from_6781_multiple_a():
    """Convert a 6781 field with four $a subfields to ``historical_data``."""
    history_schema = load_schema('institutions')['properties']['historical_data']

    marcxml = (
        '<datafield tag="678" ind1="1" ind2=" ">'
        ' <subfield code="a">Conseil européen pour la Recherche Nucléaire (1952-1954)</subfield>'
        ' <subfield code="a">Organisation européenne pour la Recherche nucléaire (1954-now)</subfield>'
        ' <subfield code="a">Sub title: Laboratoire européen pour la Physique des Particules (1984-now)</subfield>'
        ' <subfield code="a">Sub title: European Laboratory for Particle Physics (1984-now)</subfield>'
        '</datafield>'
    )  # record/902725

    record = institutions.do(create_record(marcxml))

    # One entry per $a subfield, in document order.
    wanted_history = [
        u'Conseil européen pour la Recherche Nucléaire (1952-1954)',
        u'Organisation européenne pour la Recherche nucléaire (1954-now)',
        u'Sub title: Laboratoire européen pour la Physique des Particules (1984-now)',
        u'Sub title: European Laboratory for Particle Physics (1984-now)',
    ]

    assert validate(record['historical_data'], history_schema) is None
    assert wanted_history == record['historical_data']
def test_publication_info_from_7731_c_p_v_y():
    """Convert a 7731 field ($c, $p, $v, $y) to ``publication_info`` and back."""
    pubinfo_schema = load_schema('hep')['properties']['publication_info']

    marcxml = (
        '<datafield tag="773" ind1="1" ind2=" ">'
        ' <subfield code="c">948-979</subfield>'
        ' <subfield code="p">Adv.Theor.Math.Phys.</subfield>'
        ' <subfield code="v">12</subfield>'
        ' <subfield code="y">2008</subfield>'
        '</datafield>'
    )  # record/697133

    record = hep.do(create_record(marcxml))

    # The page range is expected split into start/end and the entry is
    # expected to be flagged as hidden.
    wanted_pubinfo = [
        {
            'hidden': True,
            'journal_title': 'Adv.Theor.Math.Phys.',
            'journal_volume': '12',
            'page_end': '979',
            'page_start': '948',
            'year': 2008,
        },
    ]

    assert validate(record['publication_info'], pubinfo_schema) is None
    assert wanted_pubinfo == record['publication_info']

    marc = hep2marc.do(record)
    wanted_7731 = [
        {
            'c': [
                '948-979',
            ],
            'p': 'Adv.Theor.Math.Phys.',
            'v': '12',
            'y': 2008,
        },
    ]

    assert wanted_7731 == marc['7731']
def test_collaboration_from_710__g_0():
    """Convert a 710 field ($g, $0) to the ``collaboration`` object."""
    collaboration_schema = load_schema('experiments')['properties']['collaboration']

    marcxml = (
        '<datafield tag="710" ind1=" " ind2=" ">'
        ' <subfield code="g">DarkSide</subfield>'
        ' <subfield code="0">1108199</subfield>'
        '</datafield>'
    )  # record/1108199

    record = experiments.do(create_record(marcxml))

    # $0 is expected as a curated reference, $g as the value.
    wanted_collaboration = {
        'curated_relation': True,
        'record': {
            '$ref': 'http://localhost:5000/api/experiments/1108199',
        },
        'value': 'DarkSide',
    }

    assert validate(record['collaboration'], collaboration_schema) is None
    assert wanted_collaboration == record['collaboration']
def test_related_records_from_double_510__a_w_0_accepts_predecessors():
    """Convert two 510 fields with ``$w a`` to ``predecessor`` relations."""
    related_schema = load_schema('experiments')['properties']['related_records']

    marcxml = (
        '<record>'
        ' <datafield tag="510" ind1=" " ind2=" ">'
        ' <subfield code="0">1108293</subfield>'
        ' <subfield code="a">XENON</subfield>'
        ' <subfield code="w">a</subfield>'
        ' </datafield>'
        ' <datafield tag="510" ind1=" " ind2=" ">'
        ' <subfield code="0">1386527</subfield>'
        ' <subfield code="a">XENON100</subfield>'
        ' <subfield code="w">a</subfield>'
        ' </datafield>'
        '</record>'
    )  # record/1386519

    record = experiments.do(create_record(marcxml))

    # One relation per 510 field, each pointing at the record id from $0.
    wanted_related = [
        {
            'curated_relation': True,
            'record': {
                '$ref': 'http://localhost:5000/api/experiments/1108293',
            },
            'relation': 'predecessor',
        },
        {
            'curated_relation': True,
            'record': {
                '$ref': 'http://localhost:5000/api/experiments/1386527',
            },
            'relation': 'predecessor',
        },
    ]

    assert validate(record['related_records'], related_schema) is None
    assert wanted_related == record['related_records']
def test_address_from_111__a_double_c_d_e_g_x_y():
    """Convert a 111 field holding two $c subfields to two ``address`` items."""
    address_schema = load_schema('conferences')['properties']['address']

    marcxml = (
        '<datafield tag="111" ind1=" " ind2=" ">'
        ' <subfield code="a">16th High-Energy Physics International Conference in Quantum Chromodynamics</subfield>'
        ' <subfield code="c">QCD 12</subfield>'
        ' <subfield code="c">Montpellier, France</subfield>'
        ' <subfield code="d">2-7 Jul 2012</subfield>'
        ' <subfield code="e">QCD 12</subfield>'
        ' <subfield code="g">C12-07-02</subfield>'
        ' <subfield code="x">2012-07-02</subfield>'
        ' <subfield code="y">2012-07-07</subfield>'
        '</datafield>'
    )  # record/1085463

    record = conferences.do(create_record(marcxml))

    # XXX: The first entry is wrong, but the best we can do.
    first_address = {
        'cities': [
            'QCD 12',
        ],
        'postal_address': [
            'QCD 12',
        ],
    }
    second_address = {
        'cities': [
            'Montpellier',
        ],
        'country_code': 'FR',
        'postal_address': [
            'Montpellier, France',
        ],
    }
    wanted_addresses = [first_address, second_address]

    assert validate(record['address'], address_schema) is None
    assert wanted_addresses == record['address']
def test_texkeys_from_035__z_9_and_035__a_9():
    """Convert two 035 fields (one $z, one $a) to ``texkeys`` and back."""
    texkeys_schema = load_schema('hep')['properties']['texkeys']

    marcxml = (
        '<record>'
        ' <datafield tag="035" ind1=" " ind2=" ">'
        ' <subfield code="9">SPIRESTeX</subfield>'
        ' <subfield code="z">N.Cartiglia:2015cn</subfield>'
        ' </datafield>'
        ' <datafield tag="035" ind1=" " ind2=" ">'
        ' <subfield code="9">INSPIRETeX</subfield>'
        ' <subfield code="a">Akiba:2016ofq</subfield>'
        ' </datafield>'
        '</record>'
    )  # record/1498308

    record = hep.do(create_record(marcxml))

    # XXX: the first one is the one coming from the "a" field.
    wanted_texkeys = [
        'Akiba:2016ofq',
        'N.Cartiglia:2015cn',
    ]

    assert validate(record['texkeys'], texkeys_schema) is None
    assert wanted_texkeys == record['texkeys']

    # Both keys are expected back with the INSPIRETeX source.
    marc = hep2marc.do(record)
    wanted_035 = [
        {
            '9': 'INSPIRETeX',
            'a': 'Akiba:2016ofq',
        },
        {
            '9': 'INSPIRETeX',
            'z': 'N.Cartiglia:2015cn',
        },
    ]

    assert wanted_035 == marc['035']
def test_alternative_titles_from_711__a_b():
    """Convert a 711 field ($a, $b) to two ``alternative_titles`` entries."""
    titles_schema = load_schema('conferences')['properties']['alternative_titles']

    marcxml = (
        '<datafield tag="711" ind1=" " ind2=" ">'
        ' <subfield code="a">XX Riunione Nazionale di Elettromagnetismo</subfield>'
        ' <subfield code="b">Padova</subfield>'
        '</datafield>'
    )  # record/1403856

    record = conferences.do(create_record(marcxml))

    # $a and $b are each expected as a title of their own.
    wanted_titles = [
        {'title': 'XX Riunione Nazionale di Elettromagnetismo'},
        {'title': 'Padova'},
    ]

    assert validate(record['alternative_titles'], titles_schema) is None
    assert wanted_titles == record['alternative_titles']
def test_unimarc_edition():
    """Check ``editionStatement`` built from a single field 205.

    The field carries one edition designation ($a) and one
    responsibility ($f).
    """
    source_xml = (
        ' <record>'
        ' <datafield tag="205" ind1=" " ind2=" ">'
        ' <subfield code="a">2e ed.</subfield>'
        ' <subfield code="f">avec un avant-propos par Jean Faret</subfield>'
        ' </datafield>'
        ' </record> '
    )

    converted = unimarc.do(create_record(source_xml))

    assert converted.get('editionStatement') == [
        {
            'editionDesignation': [
                {'value': '2e ed.'},
            ],
            'responsibility': [
                {'value': 'avec un avant-propos par Jean Faret'},
            ],
        },
    ]
def test_private_notes_from_595__double_a_9():
    """Convert a 595 field (two $a, one $9) to ``_private_notes`` and back."""
    notes_schema = load_schema('hep')['properties']['_private_notes']

    marcxml = (
        '<datafield tag="595" ind1=" " ind2=" ">'
        ' <subfield code="9">SPIRES-HIDDEN</subfield>'
        ' <subfield code="a">TeXtitle from script</subfield>'
        ' <subfield code="a">no affiliation (not clear pn the fulltext)</subfield>'
        '</datafield>'
    )  # record/109310

    record = hep.do(create_record(marcxml))

    # One note per $a, each expected to share the $9 source.
    wanted_notes = [
        {
            'source': 'SPIRES-HIDDEN',
            'value': 'TeXtitle from script',
        },
        {
            'source': 'SPIRES-HIDDEN',
            'value': 'no affiliation (not clear pn the fulltext)',
        },
    ]

    assert validate(record['_private_notes'], notes_schema) is None
    assert wanted_notes == record['_private_notes']

    marc = hep2marc.do(record)
    wanted_595 = [
        {
            '9': 'SPIRES-HIDDEN',
            'a': 'TeXtitle from script',
        },
        {
            '9': 'SPIRES-HIDDEN',
            'a': 'no affiliation (not clear pn the fulltext)',
        },
    ]

    assert wanted_595 == marc['595']
def test_multiple_title_variants_from_marcxml_730():
    """Convert two 730 fields to two ``title_variants`` entries."""
    marcxml = (
        '<record>'
        ' <datafield tag="730" ind1=" " ind2=" ">'
        ' <subfield code="a">PHYS REV SPECIAL TOPICS ACCELERATORS BEAMS</subfield>'
        ' </datafield>'
        ' <datafield tag="730" ind1=" " ind2=" ">'
        ' <subfield code="a">PHYSICS REVIEW ST ACCEL BEAMS</subfield>'
        ' </datafield>'
        '</record>'
    )

    record = journals.do(create_record(marcxml))

    # One variant per 730 field, in document order.
    wanted_variants = [
        {'title': 'PHYS REV SPECIAL TOPICS ACCELERATORS BEAMS'},
        {'title': 'PHYSICS REVIEW ST ACCEL BEAMS'},
    ]

    assert wanted_variants == record['title_variants']
def test_public_notes_from_500__double_a_9():
    """Convert a 500 field (two $a, one $9) to ``public_notes`` and back."""
    notes_schema = load_schema('hep')['properties']['public_notes']

    marcxml = (
        '<datafield tag="500" ind1=" " ind2=" ">'
        ' <subfield code="9">arXiv</subfield>'
        ' <subfield code="a">11 pages, 8 figures. Submitted to MNRAS</subfield>'
        ' <subfield code="a">preliminary entry</subfield>'
        '</datafield>'
    )  # record/1380257

    record = hep.do(create_record(marcxml))

    # One note per $a, each expected to share the $9 source.
    wanted_notes = [
        {
            'source': 'arXiv',
            'value': '11 pages, 8 figures. Submitted to MNRAS',
        },
        {
            'source': 'arXiv',
            'value': 'preliminary entry',
        },
    ]

    assert validate(record['public_notes'], notes_schema) is None
    assert wanted_notes == record['public_notes']

    marc = hep2marc.do(record)
    wanted_500 = [
        {
            '9': 'arXiv',
            'a': '11 pages, 8 figures. Submitted to MNRAS',
        },
        {
            '9': 'arXiv',
            'a': 'preliminary entry',
        },
    ]

    assert wanted_500 == marc['500']
def test_name_variants_from_410__double_a():
    """Convert a 410 field with two $a subfields to ``name_variants``."""
    variants_schema = load_schema('institutions')['properties']['name_variants']

    marcxml = (
        '<datafield tag="410" ind1=" " ind2=" ">'
        ' <subfield code="a">Theoretische Teilchenphysik und Kosmologie</subfield>'
        ' <subfield code="a">Elementarteilchenphysik</subfield>'
        '</datafield>'
    )  # record/902624

    record = institutions.do(create_record(marcxml))

    # One variant per $a subfield, in document order.
    wanted_variants = [
        {'value': 'Theoretische Teilchenphysik und Kosmologie'},
        {'value': 'Elementarteilchenphysik'},
    ]

    assert validate(record['name_variants'], variants_schema) is None
    assert wanted_variants == record['name_variants']