def merge_records_xml(marcxml_obj):
    """Merge the different flavors of every record found in a MARCXML object.

    The libxml2 object is converted with ``create_record_from_libxml_obj``
    into groups of records (one group per "collection" tag, each flavor of
    the record being a "record" tag) and every group is merged with
    ``merge_multiple_records``.  A failure while merging one group is logged
    and recorded; it does not stop the remaining groups from being merged.

    :param marcxml_obj: libxml2 object holding the MARCXML to merge.
    :return: tuple ``(merged_records, records_with_merging_probl)`` where
        ``merged_records`` lists the successfully merged records and
        ``records_with_merging_probl`` lists ``(bibcode, error message)``
        pairs for the groups that could not be merged.
    """
    logger.info(' Merger started.')
    # Get the bibrecord structures from the libxml2 object.
    all_records = create_record_from_libxml_obj(marcxml_obj, logger)
    merged_records = []
    records_with_merging_probl = []
    for records in all_records:
        # Best effort: the bibcode is only used for log messages, so any
        # lookup failure falls back to a placeholder instead of aborting.
        # ``Exception`` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        try:
            system_number_fields = records[0][FIELD_TO_MARC['system number']]
            bibcode = bibrecord.field_get_subfield_values(
                system_number_fields[0], SYSTEM_NUMBER_SUBFIELD)[0]
        except Exception:
            bibcode = 'Unknown'
        logger.warning(' Merging bibcode "%s".', bibcode)
        # Get the merged record.
        try:
            merged_records.append(merge_multiple_records(records))
        except Exception as error:
            # Only the exception type name is needed, so no traceback object
            # is kept alive in a local variable.
            str_error_to_print = (type(error).__name__ + '\t' + str(error)
                                  + ' (Merger error)')
            logger.error(' Impossible to merge the record "%s" \t %s',
                         bibcode, str_error_to_print)
            records_with_merging_probl.append((bibcode, str_error_to_print))
    # NOTE(review): callers index the result with [0] to obtain the merged
    # records; confirm that this tuple is the shape they expect.
    return merged_records, records_with_merging_probl
def test_01_merge_two_records_one_field(self):
    """PRIORITY: 2 records, 1 field, 2 origins.

    Both input records carry a 300 field; the expected merged record keeps
    the one whose origin is "A&A" (value 10) together with the 980 field of
    the first record.
    """
    # Adjacent literals are concatenated; the trailing spaces keep the
    # document text unchanged.  A bare '&' is not well-formed XML, so the
    # "A&A" origin is written with the &amp; entity for libxml2.parseDoc.
    marcxml = (
        '<collections><collection> '
        '<record> '
        '<datafield tag="300" ind1=" " ind2=" "> '
        '<subfield code="a">10</subfield> '
        '<subfield code="7">A&amp;A</subfield> '
        '</datafield> '
        '<datafield tag="980" ind1="" ind2=""> '
        '<subfield code="a">ASTRONOMY</subfield> '
        '<subfield code="7">ADS metadata</subfield> '
        '</datafield> '
        '</record> '
        '<record> '
        '<datafield tag="300" ind1=" " ind2=" "> '
        '<subfield code="a">15</subfield> '
        '<subfield code="7">NED</subfield> '
        '</datafield> '
        '</record> '
        '</collection></collections>'
    )
    expected = (
        '<collections><collection><record> '
        '<datafield tag="300" ind1=" " ind2=" "> '
        '<subfield code="a">10</subfield> '
        '<subfield code="7">A&amp;A</subfield> '
        '</datafield> '
        '<datafield tag="980" ind1="" ind2=""> '
        '<subfield code="a">ASTRONOMY</subfield> '
        '<subfield code="7">ADS metadata</subfield> '
        '</datafield> '
        '</record></collection></collections>'
    )
    merged_record = m.merge_records_xml(libxml2.parseDoc(marcxml))[0]
    expected_record = create_record_from_libxml_obj(
        libxml2.parseDoc(expected), logger)[0]
    self.assertEqual(merged_record, expected_record)
def test_04_merge_three_records_two_fields(self):
    """3 records, 6 fields, 6 origins.

    Each input record carries a 300 and a 773 field with a different origin.
    The expected merged record keeps the 300 field from "ADS metadata"
    (value 5), the 773 field from "AAS" ("Le Monde") and the 980 field of
    the first record.
    """
    # Adjacent literals are concatenated; the trailing spaces keep the
    # document text unchanged.  A bare '&' is not well-formed XML, so the
    # "A&A" origin is written with the &amp; entity for libxml2.parseDoc.
    marcxml = (
        '<collections><collection> '
        '<record> '
        '<datafield tag="300" ind1=" " ind2=" "> '
        '<subfield code="a">10</subfield> '
        '<subfield code="7">A&amp;A</subfield> '
        '</datafield> '
        '<datafield tag="773" ind1=" " ind2=" "> '
        '<subfield code="a">Libération</subfield> '
        '<subfield code="7">STI</subfield> '
        '</datafield> '
        '<datafield tag="980" ind1="" ind2=""> '
        '<subfield code="a">ASTRONOMY</subfield> '
        '<subfield code="7">ADS metadata</subfield> '
        '</datafield> '
        '</record> '
        '<record> '
        '<datafield tag="773" ind1=" " ind2=" "> '
        '<subfield code="a">Le Monde</subfield> '
        '<subfield code="7">AAS</subfield> '
        '</datafield> '
        '<datafield tag="300" ind1=" " ind2=" "> '
        '<subfield code="a">15</subfield> '
        '<subfield code="7">NED</subfield> '
        '</datafield> '
        '</record> '
        '<record> '
        '<datafield tag="300" ind1=" " ind2=" "> '
        '<subfield code="a">5</subfield> '
        '<subfield code="7">ADS metadata</subfield> '
        '</datafield> '
        '<datafield tag="773" ind1=" " ind2=" "> '
        '<subfield code="a">L\'Express</subfield> '
        '<subfield code="7">OCR</subfield> '
        '</datafield> '
        '</record> '
        '</collection></collections>'
    )
    expected = (
        '<collections><collection><record> '
        '<datafield tag="300" ind1=" " ind2=" "> '
        '<subfield code="a">5</subfield> '
        '<subfield code="7">ADS metadata</subfield> '
        '</datafield> '
        '<datafield tag="773" ind1=" " ind2=" "> '
        '<subfield code="a">Le Monde</subfield> '
        '<subfield code="7">AAS</subfield> '
        '</datafield> '
        '<datafield tag="980" ind1="" ind2=""> '
        '<subfield code="a">ASTRONOMY</subfield> '
        '<subfield code="7">ADS metadata</subfield> '
        '</datafield> '
        '</record></collection></collections>'
    )
    merged_record = m.merge_records_xml(libxml2.parseDoc(marcxml))[0]
    expected_record = create_record_from_libxml_obj(
        libxml2.parseDoc(expected), logger)[0]
    self.assertEqual(merged_record, expected_record)
def get_result_invenio_xmltransformer(xmlstring):
    """Transform *xmlstring* to MARCXML and build records from it in two ways.

    The input is parsed with libxml2 and transformed with the
    ``../misc/AdsXML2MarcXML_v2.xsl`` stylesheet.  Records are then created
    once with ``x.create_record_from_libxml_obj`` on the transformed
    document, and once with ``bibrecord.create_records`` on every
    ``<collection>...</collection>`` chunk of the serialized output.

    :param xmlstring: XML text to transform.
    :return: tuple ``(result of the internal function, result of invenio)``.
    """
    source_doc = libxml2.parseDoc(xmlstring)
    stylesheet_doc = libxml2.parseFile('../misc/AdsXML2MarcXML_v2.xsl')
    transformer = libxslt.parseStylesheetDoc(stylesheet_doc)
    transformed_doc = transformer.applyStylesheet(source_doc, None)
    serialized = transformed_doc.serialize(encoding='utf-8')

    # Result with the internal function.
    internal_records = x.create_record_from_libxml_obj(transformed_doc, logger)

    # Result with the function from invenio, one list per collection chunk.
    collection_chunks = re.findall(
        '<collection>.*?</collection>', serialized, re.DOTALL)
    invenio_records = []
    for chunk in collection_chunks:
        parsed = bibrecord.create_records(chunk)
        invenio_records.append([entry[0] for entry in parsed])

    return (internal_records, invenio_records)
def test_02_merge_two_records_additional_subfield(self):
    """AUTHORS: 2 records, 1 additional subfield.

    The second record's 100 field has an extra "u" subfield; the expected
    merged 100 field keeps the "A&A" author values and gains that "u"
    subfield.  Only the first 100 field is compared, non-strictly.
    """
    # Adjacent literals are concatenated; the trailing spaces keep the
    # document text unchanged.  A bare '&' is not well-formed XML, so the
    # "A&A" origin is written with the &amp; entity for libxml2.parseDoc.
    marcxml = (
        '<collections><collection> '
        '<record> '
        '<datafield tag="100" ind1=" " ind2=" "> '
        '<subfield code="a">Di Milia, Giovanni</subfield> '
        '<subfield code="b">Di Milia, G</subfield> '
        '<subfield code="7">A&amp;A</subfield> '
        '</datafield> '
        '<datafield tag="980" ind1="" ind2=""> '
        '<subfield code="a">ASTRONOMY</subfield> '
        '<subfield code="7">ADS metadata</subfield> '
        '</datafield> '
        '</record> '
        '<record> '
        '<datafield tag="100" ind1=" " ind2=" "> '
        '<subfield code="a">Di Milia, Giancarlo</subfield> '
        '<subfield code="b">Di Milia, G</subfield> '
        '<subfield code="u">Center for astrophysics</subfield> '
        '<subfield code="7">ARXIV</subfield> '
        '</datafield> '
        '</record> '
        '</collection></collections>'
    )
    expected = (
        '<collections><collection><record> '
        '<datafield tag="100" ind1=" " ind2=" "> '
        '<subfield code="a">Di Milia, Giovanni</subfield> '
        '<subfield code="b">Di Milia, G</subfield> '
        '<subfield code="u">Center for astrophysics</subfield> '
        '<subfield code="7">A&amp;A</subfield> '
        '</datafield> '
        '<datafield tag="980" ind1="" ind2=""> '
        '<subfield code="a">ASTRONOMY</subfield> '
        '<subfield code="7">ADS metadata</subfield> '
        '</datafield> '
        '</record></collection></collections>'
    )
    expected_record = create_record_from_libxml_obj(
        libxml2.parseDoc(expected), logger)[0]
    merged_record = m.merge_records_xml(libxml2.parseDoc(marcxml))[0]
    merged_author = merged_record[0]['100'][0]
    expected_author = expected_record[0]['100'][0]
    self.assertTrue(
        b._compare_fields(merged_author, expected_author, strict=False))
def test_01_merge_two_records_one_field(self):
    """AUTHORS: 2 records, priority.

    Both records list the same three authors (one 100 field, two 700
    fields) under different origins; the expected merged record keeps the
    author fields whose origin is "A&A", plus the 980 field.
    """
    # Adjacent literals are concatenated; the trailing spaces keep the
    # document text unchanged.  A bare '&' is not well-formed XML, so the
    # "A&A" origin is written with the &amp; entity for libxml2.parseDoc.
    marcxml = (
        '<collections><collection> '
        '<record> '
        '<datafield tag="100" ind1=" " ind2=" "> '
        '<subfield code="a">Di Milia, Giovanni</subfield> '
        '<subfield code="b">Di Milia, G</subfield> '
        '<subfield code="7">A&amp;A</subfield> '
        '</datafield> '
        '<datafield tag="700" ind1=" " ind2=" "> '
        '<subfield code="a">Luker, Jay</subfield> '
        '<subfield code="b">Luker, J</subfield> '
        '<subfield code="7">A&amp;A</subfield> '
        '</datafield> '
        '<datafield tag="700" ind1=" " ind2=" "> '
        '<subfield code="a">Henneken, Edwin</subfield> '
        '<subfield code="b">Henneken, E</subfield> '
        '<subfield code="7">A&amp;A</subfield> '
        '</datafield> '
        '<datafield tag="980" ind1="" ind2=""> '
        '<subfield code="a">ASTRONOMY</subfield> '
        '<subfield code="7">ADS metadata</subfield> '
        '</datafield> '
        '</record> '
        '<record> '
        '<datafield tag="100" ind1=" " ind2=" "> '
        '<subfield code="a">Dimilia, Giovanni</subfield> '
        '<subfield code="b">Dimilia, G</subfield> '
        '<subfield code="7">ARXIV</subfield> '
        '</datafield> '
        '<datafield tag="700" ind1=" " ind2=" "> '
        '<subfield code="a">Luker, Jay</subfield> '
        '<subfield code="b">Luker, J</subfield> '
        '<subfield code="7">ARXIV</subfield> '
        '</datafield> '
        '<datafield tag="700" ind1=" " ind2=" "> '
        '<subfield code="a">Henneken, Edwin</subfield> '
        '<subfield code="b">Henneken, E</subfield> '
        '<subfield code="7">ARXIV</subfield> '
        '</datafield> '
        '</record> '
        '</collection></collections>'
    )
    expected = (
        '<collections><collection><record> '
        '<datafield tag="100" ind1=" " ind2=" "> '
        '<subfield code="a">Di Milia, Giovanni</subfield> '
        '<subfield code="b">Di Milia, G</subfield> '
        '<subfield code="7">A&amp;A</subfield> '
        '</datafield> '
        '<datafield tag="700" ind1=" " ind2=" "> '
        '<subfield code="a">Luker, Jay</subfield> '
        '<subfield code="b">Luker, J</subfield> '
        '<subfield code="7">A&amp;A</subfield> '
        '</datafield> '
        '<datafield tag="700" ind1=" " ind2=" "> '
        '<subfield code="a">Henneken, Edwin</subfield> '
        '<subfield code="b">Henneken, E</subfield> '
        '<subfield code="7">A&amp;A</subfield> '
        '</datafield> '
        '<datafield tag="980" ind1="" ind2=""> '
        '<subfield code="a">ASTRONOMY</subfield> '
        '<subfield code="7">ADS metadata</subfield> '
        '</datafield> '
        '</record></collection></collections>'
    )
    merged_record = m.merge_records_xml(libxml2.parseDoc(marcxml))[0]
    expected_record = create_record_from_libxml_obj(
        libxml2.parseDoc(expected), logger)[0]
    self.assertEqual(merged_record, expected_record)