Exemple #1
0
def get_marc_record_from_ia(identifier):
    """
    Fetch the MARC record stored with an Internet Archive item.

    The item's ``<identifier>_marc.xml`` file is preferred; when it is not
    listed for the item, or cannot be parsed, the binary
    ``<identifier>_meta.mrc`` file is used instead.

    08/2018: currently called by openlibrary/plugins/importapi/code.py
    when the /api/import/ia endpoint is POSTed to.

    :param str identifier: ocaid
    :rtype: MarcXML | MarcBinary | None
    :return: the parsed record, or None when neither file yields one
    """
    available = ia.get_metadata(identifier)['_filenames']
    xml_name = identifier + '_marc.xml'
    bin_name = identifier + '_meta.mrc'
    download_root = '{}{}/'.format(IA_DOWNLOAD_URL, identifier)

    # Preferred source: the MARC XML file.
    if xml_name in available:
        payload = urlopen_keep_trying(download_root + xml_name).read()
        try:
            return MarcXml(etree.fromstring(payload))
        except Exception as e:
            # Report the problem and fall through to the binary record.
            print("Unable to read MarcXML: %s" % e)
            traceback.print_exc()

    # Fallback source: the binary MARC file.
    if bin_name in available:
        payload = urlopen_keep_trying(download_root + bin_name).read()
        return MarcBinary(payload)

    return None
Exemple #2
0
def load_xml(ia):
    """
    Download and parse the ``<ia>_marc.xml`` file of an archive item.

    The URL is built from the module-level ``archive_url`` prefix. When the
    document root is a MARC21 ``collection`` element, its first child is
    used as the record.

    :param str ia: item identifier
    :return: MarcXml wrapping the record element
    """
    marc_filename = ia + '_marc.xml'
    response = urlopen_keep_trying(archive_url + ia + '/' + marc_filename)
    document_root = etree.parse(response).getroot()
    is_collection = (
        document_root.tag == '{http://www.loc.gov/MARC21/slim}collection')
    return MarcXml(document_root[0] if is_collection else document_root)
 def test_subjects_xml(self, item, expected):
     """
     Check the subjects read from the MARC XML sample ``item``.

     The sample is loaded from ``test_data/xml_input`` next to this file.
     If the document root is not itself a record but its first child is,
     the first child is used as the record.
     """
     here = os.path.dirname(__file__)
     sample_path = here + '/test_data/xml_input/' + item + '_marc.xml'
     root = etree.parse(sample_path).getroot()
     wrapped = root.tag != record_tag and root[0].tag == record_tag
     record = MarcXml(root[0] if wrapped else root)
     assert read_subjects(record) == expected
Exemple #4
0
def get_marc_record_from_ia(identifier):
    """Takes IA identifiers and returns MARC record instance.
    11/2017: currently called by openlibrary/plugins/importapi/code.py
    when the /api/import/ia endpoint is POSTed to.

    The item's ``_marc.xml`` file is tried first; if it is not listed for
    the item or cannot be parsed, the binary ``_meta.mrc`` file is tried.

    :param str identifier: IA item identifier
    :return: MarcXml or MarcBinary instance, or None when no usable record
        is found (including a binary record whose leading 5-character
        length field is missing, non-numeric, or does not match the size of
        the downloaded data)
    """
    metadata = ia.get_metadata(identifier)
    filenames = metadata['_filenames']

    marc_xml_filename = identifier + "_marc.xml"
    marc_bin_filename = identifier + "_meta.mrc"

    item_base = base + "/" + identifier + "/"

    # Try marc.xml first
    if marc_xml_filename in filenames:
        data = urlopen_keep_trying(item_base + marc_xml_filename).read()
        try:
            root = etree.fromstring(data)
            return MarcXml(root)
        except Exception as e:
            # Deliberately broad: any failure here only means we fall back
            # to the binary record below.
            # print() with a single argument behaves the same on Python 2
            # and Python 3.
            print("Unable to read MarcXML: %s" % e)
            traceback.print_exc()

    # If that fails, try marc.bin
    if marc_bin_filename in filenames:
        data = urlopen_keep_trying(item_base + marc_bin_filename).read()
        declared_length = data[:5]
        # This checks the reported data length against the actual data length
        # BinaryMARCs with incorrectly converted unicode characters do not match.
        # The isdigit() guard keeps an empty or garbage download from raising
        # ValueError in int(); it is reported as "no record" instead.
        if declared_length.isdigit() and len(data) == int(declared_length):
            return MarcBinary(data)

    return None
Exemple #5
0
def load_xml(ia, host, path):
    """
    Download and parse ``<ia>_marc.xml`` from ``http://<host><path>/``.

    The URL is printed before it is fetched. When the document root is a
    MARC21 ``collection`` element, its first child is used as the record.

    :param str ia: item identifier
    :param str host: host name serving the item
    :param str path: path on the host that contains the item's files
    :return: MarcXml wrapping the record element
    """
    url = ''.join(['http://', host, path, '/', ia, '_marc.xml'])
    print(url)
    document_root = etree.parse(urlopen_keep_trying(url)).getroot()
    is_collection = (
        document_root.tag == '{http://www.loc.gov/MARC21/slim}collection')
    return MarcXml(document_root[0] if is_collection else document_root)
    edition = read_edition(rec)
    assert 'title' in edition
    return edition
Exemple #6
0
def parse_data(data):
    """
    Takes POSTed data and determines the format, and returns an Edition record
    suitable for adding to OL.

    Recognised inputs are RDF, OPDS (Atom entry) and MARCXML documents,
    a JSON object, and a binary MARC record. A MARCXML ``collection`` root is
    accepted as well as a bare ``record``: the first ``record`` child of the
    collection is used.

    :param bytes data: Raw data
    :rtype: (dict, str)
    :return: (Edition record, format (rdf|opds|marcxml|json|marc))
    :raises DataError: ``unrecognized-XML-format`` for an unknown XML root,
        ``no-marc-record`` for a MARCXML collection without a record or a
        binary MARC record whose declared length does not match the data,
        ``unrecognised-import-format`` for anything else
    """
    marc_record_tag = '{http://www.loc.gov/MARC21/slim}record'
    marc_collection_tag = '{http://www.loc.gov/MARC21/slim}collection'

    data = data.strip()
    if b'<?xml' in data[:10]:
        root = etree.fromstring(data)
        if root.tag == '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF':
            edition_builder = import_rdf.parse(root)
            fmt = 'rdf'
        elif root.tag == '{http://www.w3.org/2005/Atom}entry':
            edition_builder = import_opds.parse(root)
            fmt = 'opds'
        elif root.tag in (marc_record_tag, marc_collection_tag):
            if root.tag == marc_collection_tag:
                # Previously this unwrap sat inside a branch that only
                # matched `record` roots, so it could never run.
                root = root.find(marc_record_tag)
                if root is None:
                    raise DataError('no-marc-record')
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(
                init_dict=edition)
            fmt = 'marcxml'
        else:
            raise DataError('unrecognized-XML-format')
    elif data.startswith(b'{') and data.endswith(b'}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=obj)
        fmt = 'json'
    elif data[:MARC_LENGTH_POS].isdigit():
        # Marc Binary: the leading digits declare the total record length.
        if len(data) < MARC_LENGTH_POS or len(data) != int(
                data[:MARC_LENGTH_POS]):
            raise DataError('no-marc-record')
        rec = MarcBinary(data)
        edition = read_edition(rec)
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=edition)
        fmt = 'marc'
    else:
        raise DataError('unrecognised-import-format')

    parse_meta_headers(edition_builder)
    return edition_builder.get_dict(), fmt
 def test_xml(self, i):
     """
     Read an edition from the MARC XML sample ``i`` and compare it with the
     expected data stored under ``xml_expect``.

     The expectation file is loaded as JSON. Keys are compared first, then
     each value, then the whole mapping, so a failure points at the
     narrowest mismatch.
     """
     expect_filename = "%s/xml_expect/%s_marc.xml" % (test_data, i)
     path = "%s/xml_input/%s_marc.xml" % (test_data, i)
     # Context managers close the sample files even when an assertion fails.
     with open(path) as xml_file:
         element = etree.parse(xml_file).getroot()
     # Handle MARC XML collection elements in our test_data expectations:
     if element.tag == collection_tag and element[0].tag == record_tag:
         element = element[0]
     rec = MarcXml(element)
     edition_marc_xml = read_edition(rec)
     assert edition_marc_xml
     with open(expect_filename) as expect_file:
         j = simplejson.load(expect_file)
     assert j, "Unable to open test data: %s" % expect_filename
     assert sorted(edition_marc_xml) == sorted(j)
     for k in edition_marc_xml:
         assert edition_marc_xml[k] == j[k]
     assert edition_marc_xml == j
Exemple #8
0
 def test_xml(self, i):
     """
     Read an edition from the MARC XML sample ``i`` and compare it with the
     expected data stored under ``xml_expect``.

     The expectation file is loaded as JSON. Keys are compared first, then
     each value, then the whole mapping; the assertion messages name the
     expectation file that did not match.
     """
     expect_filename = "%s/xml_expect/%s_marc.xml" % (test_data, i)
     path = "%s/xml_input/%s_marc.xml" % (test_data, i)
     # Context managers close the sample files even when an assertion fails.
     with open(path) as xml_file:
         element = etree.parse(xml_file).getroot()
     # Handle MARC XML collection elements in our test_data expectations:
     if element.tag == collection_tag and element[0].tag == record_tag:
         element = element[0]
     rec = MarcXml(element)
     edition_marc_xml = read_edition(rec)
     assert edition_marc_xml
     with open(expect_filename) as expect_file:
         j = simplejson.load(expect_file)
     assert j, 'Unable to open test data: %s' % expect_filename
     assert sorted(edition_marc_xml) == sorted(j), ('Processed MARCXML fields do '
                                                    'not match expectations in %s'
                                                    % expect_filename)
     for k in edition_marc_xml:
         assert edition_marc_xml[k] == j[k], ('Processed MARCXML values do not '
                                              'match expectations in %s'
                                              % expect_filename)
     assert edition_marc_xml == j
Exemple #9
0
def parse_data(data):
    """
    Detect the format of raw import data and build an edition dict from it.

    Recognised inputs are RDF, OPDS (Atom entry) and MARCXML documents,
    a JSON object, and, as the fallback, a binary MARC record. A MARCXML
    ``collection`` root is accepted as well as a bare ``record``: the first
    ``record`` child of the collection is used.

    :param data: raw import data
    :return: (edition dict, format) where format is one of
        rdf|opds|marcxml|json|marc, or (None, None) when the XML root is not
        recognised, a MARCXML collection holds no record, or the binary MARC
        length field is missing, non-numeric or does not match the data
    """
    marc_record_tag = '{http://www.loc.gov/MARC21/slim}record'
    marc_collection_tag = '{http://www.loc.gov/MARC21/slim}collection'

    data = data.strip()
    if '<?xml' in data[:10]:
        root = etree.fromstring(data)
        if root.tag == marc_collection_tag:
            # Previously this unwrap sat inside a branch that only matched
            # `record` roots, so it could never run. A collection without a
            # record stays a collection and is rejected below.
            first_record = root.find(marc_record_tag)
            if first_record is not None:
                root = first_record

        if root.tag == '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF':
            edition_builder = import_rdf.parse(root)
            format = 'rdf'
        elif root.tag == '{http://www.w3.org/2005/Atom}entry':
            edition_builder = import_opds.parse(root)
            format = 'opds'
        elif root.tag == marc_record_tag:
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(
                init_dict=edition)
            format = 'marcxml'
        else:
            # print() with a single argument behaves the same on Python 2
            # and Python 3.
            print('unrecognized XML format')
            return None, None
    elif data.startswith('{') and data.endswith('}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=obj)
        format = 'json'
    else:
        # Marc Binary: the first five characters declare the record length.
        declared_length = data[:5]
        if not declared_length.isdigit() or len(data) != int(declared_length):
            # Use the same (None, None) failure value as the XML branch so
            # callers that unpack the result keep working; the old code
            # returned a JSON string here and crashed on non-numeric input.
            print('Bad MARC length')
            return None, None

        rec = MarcBinary(data)
        edition = read_edition(rec)
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=edition)
        format = 'marc'

    parse_meta_headers(edition_builder)

    return edition_builder.get_dict(), format
Exemple #10
0
 def test_xml(self, i):
     """
     Read an edition from the MARC XML sample ``i`` and compare it with the
     expected JSON stored under ``xml_expect``.

     List (or tuple) values are compared without regard to order, because a
     list of dicts cannot be sorted. Every other value, including strings
     and dicts, must be equal to the expectation.
     """
     expect_filename = "%s/xml_expect/%s_marc.xml" % (test_data, i)
     path = "%s/xml_input/%s_marc.xml" % (test_data, i)
     # Context managers close the sample files even when an assertion fails.
     with open(path) as xml_file:
         element = etree.parse(xml_file).getroot()
     # Handle MARC XML collection elements in our test_data expectations:
     if element.tag == collection_tag and element[0].tag == record_tag:
         element = element[0]
     rec = MarcXml(element)
     edition_marc_xml = read_edition(rec)
     assert edition_marc_xml
     with open(expect_filename) as expect_file:
         j = json.load(expect_file)
     assert j, 'Unable to open test data: %s' % expect_filename
     assert sorted(edition_marc_xml) == sorted(j), (
         'Processed MARCXML fields do not match expectations in %s' %
         expect_filename)
     msg = ('Processed MARCXML values do not match expectations in %s' %
            expect_filename)
     for key, value in edition_marc_xml.items():
         # Only real sequences get the order-insensitive comparison. The
         # former Iterable check also matched strings (compared as bags of
         # characters) and dicts (compared by keys only), hiding mismatches.
         if isinstance(value, (list, tuple)):  # can not sort a list of dicts
             assert len(value) == len(j[key]), msg
             assert all(item in value for item in j[key]), msg
             assert all(item in j[key] for item in value), msg
         else:
             assert value == j[key], msg
Exemple #11
0
def get_marc_record_from_ia(identifier):
    """Takes IA identifiers and returns MARC record instance.

    The item's ``_marc.xml`` file is used when it is listed for the item and
    its content starts with an XML declaration; otherwise the binary
    ``_meta.mrc`` file is tried.

    :param identifier: IA item identifier
    :return: MarcXml or MarcBinary instance, or None when no usable record
        is found (including a binary record whose leading 5-character
        length field is missing, non-numeric, or does not match the size of
        the downloaded data)
    """
    metadata = ia.get_metadata(identifier)
    filenames = metadata['_filenames']

    marc_xml_filename = identifier + "_marc.xml"
    marc_bin_filename = identifier + "_meta.mrc"

    item_base = base + "/" + identifier + "/"

    # Try marc.xml first
    if marc_xml_filename in filenames:
        data = urlopen_keep_trying(item_base + marc_xml_filename).read()
        if '<?xml' in data[:10]:
            root = etree.fromstring(data)
            return MarcXml(root)

    # If that fails, try marc.bin
    if marc_bin_filename in filenames:
        data = urlopen_keep_trying(item_base + marc_bin_filename).read()
        declared_length = data[:5]
        # The isdigit() guard keeps an empty or garbage download from raising
        # ValueError in int(); it is reported as "no record" instead.
        if declared_length.isdigit() and len(data) == int(declared_length):
            return MarcBinary(data)

    return None
Exemple #12
0
def parse_data(data):
    """
    Takes POSTed data and determines the format, and returns an Edition record
    suitable for adding to OL.

    A MARCXML ``collection`` root is accepted as well as a bare ``record``:
    the first ``record`` child of the collection is used.

    :param str data: Raw data
    :rtype: (dict|None, str|None)
    :return: (Edition record, format (rdf|opds|marcxml|json|marc)) or
        (None, None) when the XML root is not recognised
    :raises DataError: for ``ia:`` input that cannot be imported, and
        ``no-marc-record`` when the binary MARC length field is missing,
        non-numeric or does not match the data
    """
    marc_record_tag = '{http://www.loc.gov/MARC21/slim}record'
    marc_collection_tag = '{http://www.loc.gov/MARC21/slim}collection'

    data = data.strip()
    if '<?xml' in data[:10]:
        root = etree.fromstring(data)
        if root.tag == marc_collection_tag:
            # Previously this unwrap sat inside a branch that only matched
            # `record` roots, so it could never run. A collection without a
            # record stays a collection and is rejected below.
            first_record = root.find(marc_record_tag)
            if first_record is not None:
                root = first_record

        if root.tag == '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF':
            edition_builder = import_rdf.parse(root)
            format = 'rdf'
        elif root.tag == '{http://www.w3.org/2005/Atom}entry':
            edition_builder = import_opds.parse(root)
            format = 'opds'
        elif root.tag == marc_record_tag:
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(init_dict=edition)
            format = 'marcxml'
        else:
            # print() with a single argument behaves the same on Python 2
            # and Python 3.
            print('unrecognized XML format')
            return None, None
    elif data.startswith('{') and data.endswith('}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(init_dict=obj)
        format = 'json'
    else:
        # Special case to load IA records, DEPRECATED: use import/ia endpoint
        # Just passing ia:foo00bar is enough to load foo00bar from IA.
        if data.startswith("ia:"):
            source_records = [data]
            itemid = data[len("ia:"):]

            metadata = ia.get_metadata(itemid)
            if not metadata:
                raise DataError("invalid-ia-identifier")

            # see ia_importapi to address `imagecount` limitations
            status = ia.get_item_status(itemid, metadata)
            if status != 'ok':
                raise DataError(status)

            try:
                rec = get_marc_record_from_ia(itemid)

                # skip serials
                if rec and rec.leader()[7] == 's':
                    raise DataError("item-is-serial")
            except IOError:
                raise DataError("no-marc-record")

            if not rec:
                raise DataError("no-marc-record")
        else:
            source_records = None
            itemid = None

            # Marc Binary: the first five characters declare the record length.
            declared_length = data[:5]
            if not declared_length.isdigit() or len(data) != int(declared_length):
                # Raise like the other MARC failures in this function; the
                # old code returned a JSON string, which callers unpacking
                # (edition, format) cannot handle.
                raise DataError("no-marc-record")

            rec = MarcBinary(data)

        edition = read_edition(rec)
        if source_records:
            edition['source_records'] = source_records
            edition['ocaid'] = itemid
        edition_builder = import_edition_builder.import_edition_builder(init_dict=edition)
        format = 'marc'

    parse_meta_headers(edition_builder)

    return edition_builder.get_dict(), format
Exemple #13
0
def parse_data(data):
    """
    Detect the format of raw import data and build an edition dict from it.

    Recognised inputs are RDF, OPDS (Atom entry) and MARCXML documents,
    a JSON object, an ``ia:<identifier>`` reference whose MARC record is
    fetched through get_marc_record_from_ia, and a binary MARC record.
    A MARCXML ``collection`` root is accepted as well as a bare ``record``:
    the first ``record`` child of the collection is used.

    :param data: raw import data
    :return: (edition dict, format) where format is one of
        rdf|opds|marcxml|json|marc, or (None, None) when the XML root is not
        recognised
    :raises DataError: for ``ia:`` input that cannot be imported, and
        ``no-marc-record`` when the binary MARC length field is missing,
        non-numeric or does not match the data
    """
    marc_record_tag = '{http://www.loc.gov/MARC21/slim}record'
    marc_collection_tag = '{http://www.loc.gov/MARC21/slim}collection'

    data = data.strip()
    if '<?xml' in data[:10]:
        root = etree.fromstring(data)
        if root.tag == marc_collection_tag:
            # Previously this unwrap sat inside a branch that only matched
            # `record` roots, so it could never run. A collection without a
            # record stays a collection and is rejected below.
            first_record = root.find(marc_record_tag)
            if first_record is not None:
                root = first_record

        if root.tag == '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF':
            edition_builder = import_rdf.parse(root)
            format = 'rdf'
        elif root.tag == '{http://www.w3.org/2005/Atom}entry':
            edition_builder = import_opds.parse(root)
            format = 'opds'
        elif root.tag == marc_record_tag:
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(init_dict=edition)
            format = 'marcxml'
        else:
            # print() with a single argument behaves the same on Python 2
            # and Python 3.
            print('unrecognized XML format')
            return None, None
    elif data.startswith('{') and data.endswith('}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(init_dict=obj)
        format = 'json'
    else:
        # Special case to load IA records
        # Just passing ia:foo00bar is enough to load foo00bar from IA.
        if data.startswith("ia:"):
            source_records = [data]
            itemid = data[len("ia:"):]

            metadata = ia.get_metadata(itemid)
            if not metadata:
                raise DataError("invalid-ia-identifier")

            if not ia.edition_from_item_metadata(itemid, metadata):
                raise DataError("item-not-a-book")

            try:
                rec = get_marc_record_from_ia(itemid)

                # skip serials
                # `rec` may be empty when no MARC record was found; that case
                # is reported as no-marc-record below instead of failing on
                # rec.leader().
                if rec and rec.leader()[7] == 's':
                    raise DataError("item-is-serial")
            except IOError:
                raise DataError("no-marc-record")

            if not rec:
                raise DataError("no-marc-record")
        else:
            source_records = None
            itemid = None

            # Marc Binary: the first five characters declare the record length.
            declared_length = data[:5]
            if not declared_length.isdigit() or len(data) != int(declared_length):
                # Raise like the other MARC failures in this function; the
                # old code returned a JSON string, which callers unpacking
                # (edition, format) cannot handle.
                raise DataError("no-marc-record")

            rec = MarcBinary(data)

        edition = read_edition(rec)
        if source_records:
            edition['source_records'] = source_records
            edition['ocaid'] = itemid
        edition_builder = import_edition_builder.import_edition_builder(init_dict=edition)
        format = 'marc'

    parse_meta_headers(edition_builder)

    return edition_builder.get_dict(), format
Exemple #14
0
def parse_data(data):
    """
    Detect the format of raw import data and build an edition dict from it.

    Recognised inputs are RDF, OPDS (Atom entry) and MARCXML documents,
    a JSON object, an ``ia:<identifier>`` reference whose MARC record is
    fetched through get_marc_record_from_ia, and a binary MARC record.
    A MARCXML ``collection`` root is accepted as well as a bare ``record``:
    the first ``record`` child of the collection is used.

    :param data: raw import data
    :return: (edition dict, format) where format is one of
        rdf|opds|marcxml|json|marc, or (None, None) when the XML root is not
        recognised
    :raises DataError: for ``ia:`` input that cannot be imported, and
        ``no-marc-record`` when the binary MARC length field is missing,
        non-numeric or does not match the data
    """
    marc_record_tag = '{http://www.loc.gov/MARC21/slim}record'
    marc_collection_tag = '{http://www.loc.gov/MARC21/slim}collection'

    data = data.strip()
    if '<?xml' in data[:10]:
        root = etree.fromstring(data)
        if root.tag == marc_collection_tag:
            # Previously this unwrap sat inside a branch that only matched
            # `record` roots, so it could never run. A collection without a
            # record stays a collection and is rejected below.
            first_record = root.find(marc_record_tag)
            if first_record is not None:
                root = first_record

        if root.tag == '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF':
            edition_builder = import_rdf.parse(root)
            format = 'rdf'
        elif root.tag == '{http://www.w3.org/2005/Atom}entry':
            edition_builder = import_opds.parse(root)
            format = 'opds'
        elif root.tag == marc_record_tag:
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(
                init_dict=edition)
            format = 'marcxml'
        else:
            # print() with a single argument behaves the same on Python 2
            # and Python 3.
            print('unrecognized XML format')
            return None, None
    elif data.startswith('{') and data.endswith('}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=obj)
        format = 'json'
    else:
        # Special case to load IA records
        # Just passing ia:foo00bar is enough to load foo00bar from IA.
        if data.startswith("ia:"):
            source_records = [data]
            itemid = data[len("ia:"):]

            metadata = ia.get_metadata(itemid)
            if not metadata:
                raise DataError("invalid-ia-identifier")

            if not ia.edition_from_item_metadata(itemid, metadata):
                raise DataError("item-not-a-book")

            try:
                rec = get_marc_record_from_ia(itemid)
            except IOError:
                raise DataError("no-marc-record")

            if not rec:
                raise DataError("no-marc-record")
        else:
            source_records = None
            itemid = None

            # Marc Binary: the first five characters declare the record length.
            declared_length = data[:5]
            if not declared_length.isdigit() or len(data) != int(declared_length):
                # Raise like the other MARC failures in this function; the
                # old code returned a JSON string, which callers unpacking
                # (edition, format) cannot handle.
                raise DataError("no-marc-record")

            rec = MarcBinary(data)

        edition = read_edition(rec)
        if source_records:
            edition['source_records'] = source_records
            edition['ocaid'] = itemid
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=edition)
        format = 'marc'

    parse_meta_headers(edition_builder)

    return edition_builder.get_dict(), format
        data = open(filename).read()
        if len(data) != int(data[:5]):
            data = data.decode('utf-8').encode('raw_unicode_escape')
        rec = MarcBinary(data)
        assert read_subjects(rec) == expected


# Collect the subjects read from every MARC XML sample.
subjects = []
for item, expect in xml_samples:
    filename = '/'.join([
        os.path.dirname(__file__), 'test_data', 'xml_input',
        item + '_marc.xml'])
    element = etree.parse(filename).getroot()
    # When the root is not itself a record but its first child is,
    # continue with that child.
    if element.tag != record_tag:
        if element[0].tag == record_tag:
            element = element[0]
    rec = MarcXml(element)
    subjects.append(read_subjects(rec))

# Collect the subjects read from every binary MARC sample.
for item, expect in bin_samples:
    filename = os.path.dirname(__file__) + '/test_data/bin_input/' + item

    # Read the record as raw bytes and close the file promptly; the
    # previous bare open(...).read() left the handle open and used text mode.
    with open(filename, 'rb') as sample_file:
        data = sample_file.read()
    # The first five characters declare the record length. On a mismatch the
    # data is re-encoded from UTF-8 to raw_unicode_escape before parsing.
    if len(data) != int(data[:5]):
        data = data.decode('utf-8').encode('raw_unicode_escape')
    rec = MarcBinary(data)
    subjects.append(read_subjects(rec))

all_subjects = defaultdict(lambda: defaultdict(int))
for a in subjects:
    for b, c in a.items():
        for d, e in c.items():