Example #1
0
def parse_data(data):
    """
    Detect the format of POSTed import data and build an edition dict from it.

    Recognised inputs are XML (RDF, Atom/OPDS entry, MARCXML record or
    collection), a JSON object, and binary MARC.

    :param str data: Raw data
    :return: the edition dict; None when the XML root element is not
        recognised; a JSON-encoded error string when the binary MARC length
        check fails
    """
    marc_ns = '{http://www.loc.gov/MARC21/slim}'
    data = data.strip()
    if -1 != data[:10].find('<?xml'):
        root = etree.fromstring(data)
        if '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF' == root.tag:
            edition_builder = import_rdf.parse(root)
        elif '{http://www.w3.org/2005/Atom}entry' == root.tag:
            edition_builder = import_opds.parse(root)
        elif root.tag in (marc_ns + 'record', marc_ns + 'collection'):
            # The collection check used to sit under a record-only branch and
            # could never run. For a collection, use its first child element.
            if root.tag == marc_ns + 'collection':
                root = root[0]
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(init_dict=edition)
        else:
            # Single-argument form prints identically on Python 2 and 3.
            print('unrecognized XML format')
            return None
    elif data.startswith('{') and data.endswith('}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(init_dict=obj)
    else:
        # Marc Binary: the first five characters are compared, as an integer,
        # against the total length of the data.
        if len(data) != int(data[:5]):
            # NOTE(review): this path returns a JSON string while the others
            # return a dict or None -- confirm callers expect that.
            return json.dumps({'success':False, 'error':'Bad MARC length'})

        rec = MarcBinary(data)
        edition = read_edition(rec)
        edition_builder = import_edition_builder.import_edition_builder(init_dict=edition)

    parse_meta_headers(edition_builder)

    return edition_builder.get_dict()
Example #2
0
def parse_data(data):
    """
    Takes POSTed data and determines the format, and returns an Edition record
    suitable for adding to OL.

    :param bytes data: Raw data
    :rtype: (dict, str)
    :return: (Edition record, format (rdf|opds|marcxml|json|marc))
    :raises DataError: when the XML root element is not recognised, when the
        binary MARC length check fails, or when no supported format is detected
    """
    marc_ns = '{http://www.loc.gov/MARC21/slim}'
    data = data.strip()
    if b'<?xml' in data[:10]:
        root = etree.fromstring(data)
        if '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF' == root.tag:
            edition_builder = import_rdf.parse(root)
            fmt = 'rdf'
        elif '{http://www.w3.org/2005/Atom}entry' == root.tag:
            edition_builder = import_opds.parse(root)
            fmt = 'opds'
        elif root.tag in (marc_ns + 'record', marc_ns + 'collection'):
            # The collection check used to sit under a record-only branch and
            # could never run. For a collection, use its first child element.
            if root.tag == marc_ns + 'collection':
                root = root[0]
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(
                init_dict=edition)
            fmt = 'marcxml'
        else:
            raise DataError('unrecognized-XML-format')
    elif data.startswith(b'{') and data.endswith(b'}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=obj)
        fmt = 'json'
    elif data[:MARC_LENGTH_POS].isdigit():
        # Marc Binary: the leading digits are compared, as an integer, against
        # the total length of the data.
        if len(data) < MARC_LENGTH_POS or len(data) != int(
                data[:MARC_LENGTH_POS]):
            raise DataError('no-marc-record')
        rec = MarcBinary(data)
        edition = read_edition(rec)
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=edition)
        fmt = 'marc'
    else:
        raise DataError('unrecognised-import-format')

    parse_meta_headers(edition_builder)
    return edition_builder.get_dict(), fmt
Example #3
0
 def test_binary(self, i):
     """
     Parse the binary MARC file ``bin_input/<i>`` and compare the result with
     the JSON expectations stored in ``bin_expect/<i>``.

     When the expectations file is missing, a template is written from the
     parsed record and the test fails so the template gets reviewed.

     :param str i: file name shared by the input and expectations directories
     """
     expect_filename = f'{test_data}/bin_expect/{i}'
     with open(f'{test_data}/bin_input/{i}', 'rb') as f:
         rec = MarcBinary(f.read())
     edition_marc_bin = read_edition(rec)
     assert edition_marc_bin
     if not os.path.exists(expect_filename):
         # Missing test expectations file. Create a template from the input, but fail the current test.
         with open(expect_filename, 'w') as template:
             json.dump(edition_marc_bin, template, indent=2)
         raise AssertionError(
             'Expectations file {} not found: template generated in {}. Please review and commit this file.'.format(
                 expect_filename, '/bin_expect'
             )
         )
     with open(expect_filename) as expected_file:
         j = json.load(expected_file)
     assert j, 'Unable to open test data: %s' % expect_filename
     assert sorted(edition_marc_bin) == sorted(j), (
         'Processed binary MARC fields do not match expectations in %s'
         % expect_filename
     )
     msg = (
         'Processed binary MARC values do not match expectations in %s'
         % expect_filename
     )
     for key, value in edition_marc_bin.items():
         # Strings are Iterable too; comparing them element-wise would accept
         # any same-length string made of the same characters, so they take
         # the plain equality path instead.
         if isinstance(value, Iterable) and not isinstance(value, str):
             # Order-insensitive comparison: can not sort a list of dicts
             assert len(value) == len(j[key]), msg
             assert all(item in value for item in j[key]), msg
         else:
             assert value == j[key], msg
Example #4
0
def test_no_extra_author(mock_site):
    """
    Loading a MARC record onto an existing edition/work must not add a second
    author to either of them.
    """
    add_languages(mock_site)

    author = {
        "name": "Paul  Boothe",
        "key": "/authors/OL2894448A",
        "type": {"key": "/type/author"},
    }
    mock_site.save(author)

    work = {
        "title": "A Separate Pension Plan for Alberta",
        "covers": [1644794],
        "key": "/works/OL8611498W",
        "authors": [{"type": "/type/author_role", "author": {"key": "/authors/OL2894448A"}}],
        "type": {"key": "/type/work"},
    }
    mock_site.save(work)

    edition = {
        "number_of_pages": 90,
        "subtitle": "Analysis and Discussion (Western Studies in Economic Policy, No. 5)",
        "weight": "6.2 ounces",
        "covers": [1644794],
        "latest_revision": 6,
        "title": "A Separate Pension Plan for Alberta",
        "languages": [{"key": "/languages/eng"}],
        "subjects": ["Economics", "Alberta", "Political Science / State & Local Government", "Government policy", "Old age pensions", "Pensions", "Social security"],
        "type": {"key": "/type/edition"},
        "physical_dimensions": "9 x 6 x 0.2 inches",
        "publishers": ["The University of Alberta Press"],
        "physical_format": "Paperback",
        "key": "/books/OL8211505M",
        "authors": [{"key": "/authors/OL2894448A"}],
        "identifiers": {"goodreads": ["4340973"], "librarything": ["5580522"]},
        "isbn_13": ["9780888643513"],
        "isbn_10": ["0888643519"],
        "publish_date": "May 1, 2000",
        "works": [{"key": "/works/OL8611498W"}]
    }
    mock_site.save(edition)

    src = 'v39.i34.records.utf8:186503:1413'
    # Binary MARC: read as bytes and close the handle deterministically.
    with open('test_data/' + src, 'rb') as marc_file:
        marc = MarcBinary(marc_file.read())
    rec = read_edition(marc)
    rec['source_records'] = ['marc:' + src]

    reply = load(rec)
    assert reply['success'] is True

    if 'authors' in reply:
        assert reply['authors'][0]['key'] == author['key']
    assert reply['edition']['key'] == edition['key']
    assert reply['work']['key'] == work['key']

    e = mock_site.get(reply['edition']['key'])
    w = mock_site.get(reply['work']['key'])
    assert 'source_records' in e
    assert len(e['authors']) == 1
    assert len(w['authors']) == 1
Example #5
0
def get_ia(identifier):
    """
    Fetch the MARC record for an archive.org item and parse it.

    :param str identifier: ocaid
    :rtype: dict
    :return: whatever read_edition() produces for the fetched record
    """
    return read_edition(get_marc_record_from_ia(identifier))
Example #6
0
def test_from_marc_fields(mock_site, add_languages):
    """Check which MARC-derived fields end up on the loaded edition and work."""
    ocaid = 'isbn_9781419594069'
    marc_bytes = open_test_data(ocaid + '_meta.mrc').read()
    record = read_edition(MarcBinary(marc_bytes))
    record['source_records'] = ['ia:' + ocaid]
    result = load(record)
    assert result['success'] is True
    # author from 100
    assert result['authors'][0]['name'] == 'Adam Weiner'

    saved_edition = mock_site.get(result['edition']['key'])
    # Publish place, publisher, & publish date - 260$a, $b, $c
    assert saved_edition['publishers'][0] == 'Kaplan Publishing'
    assert saved_edition['publish_date'] == '2007'
    assert saved_edition['publish_places'][0] == 'New York'
    # Pagination 300
    assert saved_edition['number_of_pages'] == 264
    assert saved_edition['pagination'] == 'viii, 264 p.'
    # 8 subjects, 650
    expected_subjects = [
        u'Action and adventure films',
        u'Miscellanea',
        u'Physics',
        u'Cinematography',
        u'Special effects',
        u'Physics in motion pictures',
        u'Science fiction films',
        u'Popular works',
    ]
    assert len(saved_edition['subjects']) == 8
    assert saved_edition['subjects'] == expected_subjects
    # Edition description from 520
    expected_description = 'Explains the basic laws of physics, covering such topics as mechanics, forces, and energy, while deconstructing famous scenes and stunts from motion pictures, including "Apollo 13" and "Titanic," to determine if they are possible.'
    assert isinstance(saved_edition['description'], Text)
    assert saved_edition['description'] == expected_description
    # Work description from 520
    saved_work = mock_site.get(result['work']['key'])
    assert isinstance(saved_work['description'], Text)
    assert saved_work['description'] == expected_description
Example #7
0
 def test_binary(self, i):
     """
     Parse the binary MARC file ``bin_input/<i>`` and compare the result with
     the JSON expectations stored in ``bin_expect/<i>``.

     When the expectations file is missing, a template is written from the
     parsed record and the test fails so the template gets reviewed.

     :param str i: file name shared by the input and expectations directories
     """
     expect_filename = "%s/bin_expect/%s" % (test_data, i)
     # Binary MARC: read as bytes and close the handle deterministically.
     with open("%s/bin_input/%s" % (test_data, i), 'rb') as marc_file:
         data = marc_file.read()
     if len(data) != int(data[:5]):
         #TODO: Why are we fixing this in test expectations? Investigate.
         #      affects histoirereligieu05cr_meta.mrc and zweibchersatir01horauoft_meta.mrc
         data = data.decode('utf-8').encode('raw_unicode_escape')
     assert len(data) == int(data[:5])
     rec = MarcBinary(data)
     edition_marc_bin = read_edition(rec)
     assert edition_marc_bin
     if not os.path.exists(expect_filename):
         # Missing test expectations file. Create a template from the input, but fail the current test.
         with open(expect_filename, 'w') as template:
             simplejson.dump(edition_marc_bin, template, indent=2)
         assert False, 'Expectations file %s not found: template generated in %s. Please review and commit this file.' % (
             expect_filename, '/bin_expect')
     with open(expect_filename) as expected_file:
         j = simplejson.load(expected_file)
     assert j, 'Unable to open test data: %s' % expect_filename
     assert sorted(edition_marc_bin) == sorted(
         j
     ), 'Processed binary MARC fields do not match expectations in %s' % expect_filename
     for k in edition_marc_bin:
         if isinstance(j[k], list):
             # Item-wise comparison first, so a failure points at the first
             # differing element rather than the whole list.
             for item1, item2 in zip(edition_marc_bin[k], j[k]):
                 assert item1 == item2
         assert edition_marc_bin[k] == j[
             k], 'Processed binary MARC values do not match expectations in %s' % expect_filename
     assert edition_marc_bin == j
def load(ia, use_binary=False):
    """
    Read the MARC record for item ``ia``, parse it and write the edition.

    The XML record is tried first unless ``use_binary`` is set; a BadSubtag or
    BlankTag error while loading or parsing it switches to the binary record.

    :param ia: item identifier, stored on the edition as 'ocaid'
    :param bool use_binary: skip the XML record and read the binary one
    """
    print("load", ia)
    need_binary = use_binary
    if not need_binary:
        try:
            record = load_xml(ia, host, path)
            parsed = read_edition(record)
        except (BadSubtag, BlankTag):
            need_binary = True
    if need_binary:
        record = load_binary(ia, host, path)
        parsed = read_edition(record)
    assert 'title' in parsed

    parsed['ocaid'] = ia
    write_edition(ia, parsed, record)
Example #9
0
def parse(f):
    """
    Build an edition dict from the XML record read from ``f``.

    :param f: passed straight to xml_rec()
    :return: the dict filled in by read_edition(), or an empty dict when the
        record has a blank tag or read_edition() returns a falsy value
    """
    record = xml_rec(f)
    if record.has_blank_tag:
        print('has blank tag')
        return {}
    parsed = {}
    if read_edition(record, parsed):
        return parsed
    return {}
Example #10
0
def test_missing_source_records(mock_site):
    """
    Loading a record onto an existing edition that has no 'source_records'
    must leave the saved edition with that field present.
    """
    add_languages(mock_site)

    mock_site.save({
        'key': '/authors/OL592898A',
        'name': 'Michael Robert Marrus',
        'personal_name': 'Michael Robert Marrus',
        'type': {'key': '/type/author'},
    })

    mock_site.save({
        'authors': [{'author': '/authors/OL592898A', 'type': {'key': '/type/author_role'}}],
        'key': '/works/OL16029710W',
        'subjects': ['Nuremberg Trial of Major German War Criminals, Nuremberg, Germany, 1945-1946', 'Protected DAISY', 'Lending library'],
        'title': 'The Nuremberg war crimes trial, 1945-46',
        'type': {'key': '/type/work'},
    })

    mock_site.save({
        "number_of_pages": 276,
        "subtitle": "a documentary history",
        "series": ["The Bedford series in history and culture"],
        "covers": [6649715, 3865334, 173632],
        "lc_classifications": ["D804.G42 N87 1997"],
        "ocaid": "nurembergwarcrim00marr",
        "contributions": ["Marrus, Michael Robert."],
        "uri_descriptions": ["Book review (H-Net)"],
        "title": "The Nuremberg war crimes trial, 1945-46",
        "languages": [{"key": "/languages/eng"}],
        "subjects": ["Nuremberg Trial of Major German War Criminals, Nuremberg, Germany, 1945-1946"],
        "publish_country": "mau",
        "by_statement": "[compiled by] Michael R. Marrus.",
        "type": {"key": "/type/edition"},
        "uris": ["http://www.h-net.org/review/hrev-a0a6c9-aa"],
        "publishers": ["Bedford Books"],
        "ia_box_id": ["IA127618"],
        "key": "/books/OL1023483M",
        "authors": [{"key": "/authors/OL592898A"}],
        "publish_places": ["Boston"],
        "pagination": "xi, 276 p. :",
        "lccn": ["96086777"],
        "notes": {"type": "/type/text", "value": "Includes bibliographical references (p. 262-268) and index."},
        "identifiers": {"goodreads": ["326638"], "librarything": ["1114474"]},
        "url": ["http://www.h-net.org/review/hrev-a0a6c9-aa"],
        "isbn_10": ["031216386X", "0312136919"],
        "publish_date": "1997",
        "works": [{"key": "/works/OL16029710W"}]
    })

    ia = 'nurembergwarcrim1997marr'
    src = ia + '_meta.mrc'
    marc = MarcBinary(open_test_data(src).read())
    rec = read_edition(marc)
    rec['source_records'] = ['ia:' + ia]

    reply = load(rec)
    # Identity check: the flag must be the boolean True, not merely truthy.
    assert reply['success'] is True
    e = mock_site.get(reply['edition']['key'])
    assert 'source_records' in e
Example #11
0
def test_missing_source_records(mock_site):
    """
    Loading a record onto an existing edition that has no 'source_records'
    must leave the saved edition with that field present.
    """
    add_languages(mock_site)

    mock_site.save({
        'key': '/authors/OL592898A',
        'name': 'Michael Robert Marrus',
        'personal_name': 'Michael Robert Marrus',
        'type': {'key': '/type/author'},
    })

    mock_site.save({
        'authors': [{'author': '/authors/OL592898A', 'type': {'key': '/type/author_role'}}],
        'key': '/works/OL16029710W',
        'subjects': ['Nuremberg Trial of Major German War Criminals, Nuremberg, Germany, 1945-1946', 'Protected DAISY', 'Lending library'],
        'title': 'The Nuremberg war crimes trial, 1945-46',
        'type': {'key': '/type/work'},
    })

    mock_site.save({
        "number_of_pages": 276,
        "subtitle": "a documentary history",
        "series": ["The Bedford series in history and culture"],
        "covers": [6649715, 3865334, 173632],
        "lc_classifications": ["D804.G42 N87 1997"],
        "ocaid": "nurembergwarcrim00marr",
        "contributions": ["Marrus, Michael Robert."],
        "uri_descriptions": ["Book review (H-Net)"],
        "title": "The Nuremberg war crimes trial, 1945-46",
        "languages": [{"key": "/languages/eng"}],
        "subjects": ["Nuremberg Trial of Major German War Criminals, Nuremberg, Germany, 1945-1946"],
        "publish_country": "mau",
        "by_statement": "[compiled by] Michael R. Marrus.",
        "type": {"key": "/type/edition"},
        "uris": ["http://www.h-net.org/review/hrev-a0a6c9-aa"],
        "publishers": ["Bedford Books"],
        "ia_box_id": ["IA127618"],
        "key": "/books/OL1023483M",
        "authors": [{"key": "/authors/OL592898A"}],
        "publish_places": ["Boston"],
        "pagination": "xi, 276 p. :",
        "lccn": ["96086777"],
        "notes": {"type": "/type/text", "value": "Includes bibliographical references (p. 262-268) and index."},
        "identifiers": {"goodreads": ["326638"], "librarything": ["1114474"]},
        "url": ["http://www.h-net.org/review/hrev-a0a6c9-aa"],
        "isbn_10": ["031216386X", "0312136919"],
        "publish_date": "1997",
        "works": [{"key": "/works/OL16029710W"}]
    })

    ia = 'nurembergwarcrim1997marr'
    src = ia + '_meta.mrc'
    # Binary MARC: read as bytes and close the handle deterministically.
    with open('test_data/' + src, 'rb') as marc_file:
        marc = MarcBinary(marc_file.read())
    rec = read_edition(marc)
    rec['source_records'] = ['ia:' + ia]

    reply = load(rec)
    assert reply['success'] is True
    e = mock_site.get(reply['edition']['key'])
    assert 'source_records' in e
Example #12
0
def test_no_extra_author(mock_site, add_languages):
    """
    Loading a MARC record onto an existing edition/work must not add a second
    author to either of them.
    """
    author = {
        "name": "Paul Michael Boothe",
        "key": "/authors/OL1A",
        "type": {"key": "/type/author"},
    }
    work = {
        "title": "A Separate Pension Plan for Alberta",
        "covers": [1644794],
        "key": "/works/OL1W",
        "authors": [
            {"type": "/type/author_role", "author": {"key": "/authors/OL1A"}},
        ],
        "type": {"key": "/type/work"},
    }
    edition = {
        "number_of_pages": 90,
        "subtitle": "Analysis and Discussion (Western Studies in Economic Policy, No. 5)",
        "weight": "6.2 ounces",
        "covers": [1644794],
        "latest_revision": 6,
        "title": "A Separate Pension Plan for Alberta",
        "languages": [{"key": "/languages/eng"}],
        "subjects": [
            "Economics",
            "Alberta",
            "Political Science / State & Local Government",
            "Government policy",
            "Old age pensions",
            "Pensions",
            "Social security",
        ],
        "type": {"key": "/type/edition"},
        "physical_dimensions": "9 x 6 x 0.2 inches",
        "publishers": ["The University of Alberta Press"],
        "physical_format": "Paperback",
        "key": "/books/OL1M",
        "authors": [{"key": "/authors/OL2894448A"}],
        "identifiers": {"goodreads": ["4340973"], "librarything": ["5580522"]},
        "isbn_13": ["9780888643513"],
        "isbn_10": ["0888643519"],
        "publish_date": "May 1, 2000",
        "works": [{"key": "/works/OL1W"}],
    }
    # Seed the site in the same order as before: author, work, edition.
    for seeded in (author, work, edition):
        mock_site.save(seeded)

    src = 'v39.i34.records.utf8--186503-1413'
    record = read_edition(MarcBinary(open_test_data(src).read()))
    record['source_records'] = ['marc:' + src]
    result = load(record)
    assert result['success'] is True

    # The fetched author is not inspected; the lookup is kept as-is.
    mock_site.get(result['authors'][0]['key'])

    assert result['authors'][0]['key'] == author['key']
    assert result['edition']['key'] == edition['key']
    assert result['work']['key'] == work['key']

    saved_edition = mock_site.get(result['edition']['key'])
    saved_work = mock_site.get(result['work']['key'])
    assert 'source_records' in saved_edition
    assert len(saved_edition['authors']) == 1
    assert len(saved_work['authors']) == 1
Example #13
0
def parse_data(data):
    """
    Takes POSTed data and determines the format, and returns an Edition record
    suitable for adding to OL.

    :param str data: Raw data
    :rtype: (dict|None, str|None)
    :return: (Edition record, format (rdf|opds|marcxml|json|marc)), or
        (None, None) when the XML root element is not recognised
    :raises DataError: when the binary MARC length check fails
    """
    marc_ns = '{http://www.loc.gov/MARC21/slim}'
    data = data.strip()
    if -1 != data[:10].find('<?xml'):
        root = etree.fromstring(data)
        if '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF' == root.tag:
            edition_builder = import_rdf.parse(root)
            fmt = 'rdf'
        elif '{http://www.w3.org/2005/Atom}entry' == root.tag:
            edition_builder = import_opds.parse(root)
            fmt = 'opds'
        elif root.tag in (marc_ns + 'record', marc_ns + 'collection'):
            # The collection check used to sit under a record-only branch and
            # could never run. For a collection, use its first child element.
            if root.tag == marc_ns + 'collection':
                root = root[0]
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(init_dict=edition)
            fmt = 'marcxml'
        else:
            print('unrecognized XML format')
            return None, None
    elif data.startswith('{') and data.endswith('}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(init_dict=obj)
        fmt = 'json'
    else:
        # Marc Binary: the leading characters are compared, as an integer,
        # against the total length of the data.
        if len(data) < MARC_LENGTH_POS or len(data) != int(data[:MARC_LENGTH_POS]):
            raise DataError('no-marc-record')
        rec = MarcBinary(data)

        edition = read_edition(rec)
        edition_builder = import_edition_builder.import_edition_builder(init_dict=edition)
        fmt = 'marc'

    parse_meta_headers(edition_builder)

    return edition_builder.get_dict(), fmt
Example #14
0
def load(ia, use_binary=False):
    """
    Read the MARC record for item ``ia``, parse it, pretty-print the edition
    and write it.

    The XML record is tried first unless ``use_binary`` is set; a BadSubtag or
    BlankTag error while loading or parsing it switches to the binary record.

    :param ia: item identifier, stored on the edition as 'ocaid'
    :param bool use_binary: skip the XML record and read the binary one
    """
    # Formatted single-argument print: same output as the old print statement,
    # and valid syntax on both Python 2 and Python 3.
    print("load %s" % ia)
    if not use_binary:
        try:
            rec = load_xml(ia, host, path)
            edition = read_edition(rec)
        except (BadSubtag, BlankTag):
            use_binary = True
    if use_binary:
        rec = load_binary(ia, host, path)
        edition = read_edition(rec)
    pprint(edition)
    assert 'title' in edition

    edition['ocaid'] = ia
    write_edition(ia, edition, rec)
Example #15
0
def test_real_example(mock_site, add_languages):
    """
    Load one record twice (the second load must report 'matched'), then load a
    second record, which must report 'modified'.
    """

    def parsed_record(name):
        # Read the named test file and tag the record with its MARC source.
        record = read_edition(MarcBinary(open_test_data(name).read()))
        record['source_records'] = ['marc:' + name]
        return record

    first = parsed_record('v38.i37.records.utf8--16478504-1254')
    outcome = load(first)
    assert outcome['success'] is True
    outcome = load(first)
    assert outcome['success'] is True
    assert outcome['edition']['status'] == 'matched'

    second = parsed_record('v39.i28.records.utf8--5362776-1764')
    outcome = load(second)
    assert outcome['success'] is True
    assert outcome['edition']['status'] == 'modified'
Example #16
0
def get_ia(identifier):
    """
    Fetch the MARC record for an archive.org item and parse it.

    DEPRECATED: call get_marc_record_from_ia() and parse.read_edition()
    directly instead.

    :param str identifier: ocaid
    :rtype: dict
    """
    return parse.read_edition(get_marc_record_from_ia(identifier))
Example #17
0
def get_ia(identifier):
    """
    Fetch the MARC record for an archive.org item and parse it.

    DEPRECATED: call get_marc_record_from_ia() and parse.read_edition()
    directly instead.

    :param str identifier: ocaid
    :rtype: dict
    """
    marc_record = get_marc_record_from_ia(identifier)
    parsed = parse.read_edition(marc_record)
    return parsed
Example #18
0
def load_xml(ia):
    """
    Fetch ``<ia>_marc.xml`` for an item and wrap its root element in MarcXml.

    When the document root is a MARC21 ``collection`` element, its first child
    is used instead.

    :param str ia: item identifier, used to build the URL under archive_url
    :return: MarcXml built from the selected element
    """
    url = archive_url + ia + "/" + ia + "_marc.xml"
    f = urlopen_keep_trying(url)
    root = etree.parse(f).getroot()
    if root.tag == "{http://www.loc.gov/MARC21/slim}collection":
        root = root[0]
    # The statements that used to follow this return were unreachable and
    # referenced an undefined name, so they have been removed.
    return MarcXml(root)
Example #19
0
    def ia_import(cls, identifier, require_marc=True, force_import=False):
        """
        Fetch an archive.org item and its metadata, build an edition data
        dict from it, and load that dict into Open Library.

        :param str identifier: archive.org ocaid
        :param bool require_marc: require archive.org item have MARC record?
        :param bool force_import: force import of this record
        :rtype: dict
        :returns: the data of the imported book or raises  BookImportError
        """
        # Case 1 - Is this a valid Archive.org item?
        metadata = ia.get_metadata(identifier)
        if not metadata:
            raise BookImportError(
                'invalid-ia-identifier', '%s not found' % identifier)

        # Case 2 - Does the item have an openlibrary field specified?
        # The scan operators search OL before loading the book and add the
        # OL key if a match is found. We can trust them and attach the item
        # to that edition.
        is_text_item = metadata.get('mediatype') == 'texts'
        if is_text_item and metadata.get('openlibrary'):
            linked = cls.get_ia_record(metadata)
            linked['openlibrary'] = metadata['openlibrary']
            return cls.load_book(cls.populate_edition_data(linked, identifier))

        # Case 3 - Can the item be loaded into Open Library?
        status = ia.get_item_status(identifier, metadata)
        if not force_import and status != 'ok':
            raise BookImportError(status, 'Prohibited Item %s' % identifier)

        # Case 4 - Does this item have a marc record?
        marc_record = get_marc_record_from_ia(identifier)
        if not marc_record:
            if require_marc:
                raise BookImportError('no-marc-record')
            # No MARC record and none required: build from the IA metadata.
            try:
                edition_data = cls.get_ia_record(metadata)
            except KeyError:
                raise BookImportError('invalid-ia-metadata')
        else:
            if not force_import:
                raise_non_book_marc(marc_record)
            try:
                edition_data = read_edition(marc_record)
            except MarcException as e:
                logger.error('failed to read from MARC record %s: %s',
                             identifier, str(e))
                raise BookImportError('invalid-marc-record')

        # Add IA specific fields: ocaid, source_records, and cover
        return cls.load_book(cls.populate_edition_data(edition_data, identifier))
Example #20
0
def load_xml(ia, host, path):
    """
    Fetch ``<ia>_marc.xml`` for an item and wrap its root element in MarcXml.

    The URL is printed before it is fetched. When the document root is a
    MARC21 ``collection`` element, its first child is used instead.

    :param str ia: item identifier
    :param str host: host name placed after 'http://'
    :param str path: path on the host that holds the item's files
    :return: MarcXml built from the selected element
    """
    url = 'http://' + host + path + '/' + ia + '_marc.xml'
    # Single-argument form prints identically on Python 2 and 3.
    print(url)
    f = urlopen_keep_trying(url)
    root = etree.parse(f).getroot()
    if root.tag == '{http://www.loc.gov/MARC21/slim}collection':
        root = root[0]
    # The statements that used to follow this return were unreachable and
    # referenced an undefined name, so they have been removed.
    return MarcXml(root)
Example #21
0
def test_from_marc(mock_site):
    """
    For each sample MARC file, the first load must create an edition and a
    second load of the same record must match it.
    """
    add_languages(mock_site)
    filenames = (
        'test_data/coursepuremath00hardrich_meta.mrc',
        'test_data/flatlandromanceo00abbouoft_meta.mrc',
    )
    for filename in filenames:
        # Binary MARC: read as bytes and close the handle deterministically.
        with open(filename, 'rb') as marc_file:
            marc = MarcBinary(marc_file.read())
        rec = read_edition(marc)
        reply = load(rec)
        assert reply['success'] is True
        assert reply['edition']['status'] == 'created'
        reply = load(rec)
        assert reply['success'] is True
        assert reply['edition']['status'] == 'matched'
Example #22
0
def test_from_marc_3(mock_site, add_languages):
    """Loading the sample record must create an item of type /type/edition."""
    ocaid = 'treatiseonhistor00dixo'
    marc_bytes = open_test_data(ocaid + '_meta.mrc').read()
    # The first five characters must equal the total length of the data.
    declared_length = int(marc_bytes[:5])
    assert len(marc_bytes) == declared_length
    record = read_edition(MarcBinary(marc_bytes))
    record['source_records'] = ['ia:' + ocaid]
    result = load(record)
    assert result['success'] is True
    assert result['edition']['status'] == 'created'
    saved = mock_site.get(result['edition']['key'])
    assert saved.type.key == '/type/edition'
Example #23
0
def test_real_example(mock_site):
    """
    Load one record twice (the second load must report 'matched'), then load a
    second record, which must report 'modified'.
    """
    add_languages(mock_site)

    src = 'v38.i37.records.utf8:16478504:1254'
    # Binary MARC: read as bytes and close the handle deterministically.
    with open('test_data/' + src, 'rb') as marc_file:
        marc = MarcBinary(marc_file.read())
    rec = read_edition(marc)
    rec['source_records'] = ['marc:' + src]
    reply = load(rec)
    assert reply['success'] is True
    reply = load(rec)
    assert reply['success'] is True
    assert reply['edition']['status'] == 'matched'

    src = 'v39.i28.records.utf8:5362776:1764'
    with open('test_data/' + src, 'rb') as marc_file:
        marc = MarcBinary(marc_file.read())
    rec = read_edition(marc)
    rec['source_records'] = ['marc:' + src]
    reply = load(rec)
    assert reply['success'] is True
    assert reply['edition']['status'] == 'modified'
Example #24
0
def test_real_example(mock_site):
    """
    Load one record twice (the second load must report 'matched'), then load a
    second record, which must report 'modified'.
    """
    add_languages(mock_site)

    def read_marc(name):
        # Binary MARC: read as bytes and close the handle deterministically.
        with open('test_data/' + name, 'rb') as marc_file:
            return MarcBinary(marc_file.read())

    src = 'v38.i37.records.utf8:16478504:1254'
    rec = read_edition(read_marc(src))
    rec['source_records'] = ['marc:' + src]
    reply = load(rec)
    assert reply['success'] is True
    reply = load(rec)
    assert reply['success'] is True
    assert reply['edition']['status'] == 'matched'

    src = 'v39.i28.records.utf8:5362776:1764'
    rec = read_edition(read_marc(src))
    rec['source_records'] = ['marc:' + src]
    reply = load(rec)
    assert reply['success'] is True
    assert reply['edition']['status'] == 'modified'
Example #25
0
def load_xml(ia, host, path):
    """
    Fetch ``<ia>_marc.xml`` for an item and wrap its root element in MarcXml.

    The URL is printed before it is fetched. When the document root is a
    MARC21 ``collection`` element, its first child is used instead.

    :param str ia: item identifier
    :param str host: host name placed after 'http://'
    :param str path: path on the host that holds the item's files
    :return: MarcXml built from the selected element
    """
    url = 'http://' + host + path + '/' + ia + '_marc.xml'
    print(url)
    f = urlopen_keep_trying(url)
    root = etree.parse(f).getroot()
    if root.tag == '{http://www.loc.gov/MARC21/slim}collection':
        root = root[0]
    # The statements that used to follow this return were unreachable and
    # referenced an undefined name, so they have been removed.
    return MarcXml(root)
Example #26
0
def parse_data(data):
    """
    Detect the format of POSTed import data and build an edition dict from it.

    Recognised inputs are XML (RDF, Atom/OPDS entry, MARCXML record or
    collection), a JSON object, and binary MARC.

    :param str data: Raw data
    :return: (edition dict, format (rdf|opds|marcxml|json|marc)); (None, None)
        when the XML root element is not recognised; a JSON-encoded error
        string when the binary MARC length check fails
    """
    marc_ns = '{http://www.loc.gov/MARC21/slim}'
    data = data.strip()
    if -1 != data[:10].find('<?xml'):
        root = etree.fromstring(data)
        if '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF' == root.tag:
            edition_builder = import_rdf.parse(root)
            fmt = 'rdf'
        elif '{http://www.w3.org/2005/Atom}entry' == root.tag:
            edition_builder = import_opds.parse(root)
            fmt = 'opds'
        elif root.tag in (marc_ns + 'record', marc_ns + 'collection'):
            # The collection check used to sit under a record-only branch and
            # could never run. For a collection, use its first child element.
            if root.tag == marc_ns + 'collection':
                root = root[0]
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(
                init_dict=edition)
            fmt = 'marcxml'
        else:
            # Single-argument form prints identically on Python 2 and 3.
            print('unrecognized XML format')
            return None, None
    elif data.startswith('{') and data.endswith('}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=obj)
        fmt = 'json'
    else:
        # Marc Binary: the first five characters are compared, as an integer,
        # against the total length of the data.
        if len(data) != int(data[:5]):
            # NOTE(review): this path returns a single JSON string while every
            # other path returns a 2-tuple -- confirm callers expect that.
            return json.dumps({'success': False, 'error': 'Bad MARC length'})

        rec = MarcBinary(data)
        edition = read_edition(rec)
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=edition)
        fmt = 'marc'

    parse_meta_headers(edition_builder)

    return edition_builder.get_dict(), fmt
Example #27
0
def test_from_marc(mock_site, add_languages):
    """Loading the sample record must store its first author with name and dates."""
    ocaid = 'flatlandromanceo00abbouoft'
    marc_bytes = open_test_data(ocaid + '_meta.mrc').read()
    # The first five characters must equal the total length of the data.
    declared_length = int(marc_bytes[:5])
    assert len(marc_bytes) == declared_length
    record = read_edition(MarcBinary(marc_bytes))
    result = load(record)
    assert result['success'] is True
    first_author_key = result['authors'][0]['key']
    saved_author = mock_site.get(first_author_key)
    assert saved_author.type.key == '/type/author'
    assert saved_author.name == 'Edwin Abbott Abbott'
    assert saved_author.birth_date == '1838'
    assert saved_author.death_date == '1926'
Example #28
0
def test_from_marc(mock_site):
    """
    For each sample MARC file, the first load must create an edition and a
    second load of the same record must match it.
    """
    add_languages(mock_site)
    # Binary MARC: read as bytes and close the handle deterministically.
    with open('test_data/coursepuremath00hardrich_meta.mrc', 'rb') as marc_file:
        marc = MarcBinary(marc_file.read())
    rec = read_edition(marc)
    reply = load(rec)
    assert reply['success'] is True
    assert reply['edition']['status'] == 'created'
    reply = load(rec)
    assert reply['success'] is True
    assert reply['edition']['status'] == 'matched'

    with open('test_data/flatlandromanceo00abbouoft_meta.mrc', 'rb') as marc_file:
        marc = MarcBinary(marc_file.read())

    rec = read_edition(marc)
    reply = load(rec)
    assert reply['success'] is True
    assert reply['edition']['status'] == 'created'
    reply = load(rec)
    assert reply['success'] is True
    assert reply['edition']['status'] == 'matched'
Example #29
0
def author_from_data(loc, data):
    """
    Parse ``data`` into an edition, import its single author and return a
    reference to that author.

    :param loc: not used by this function
    :param data: passed straight to read_edition()
    :return: ``{'key': <author key>}``; the key comes from the imported author
        when it has one, otherwise from a newly created author
    """
    parsed = read_edition(data)
    assert 'authors' in parsed
    eastern_order = east_in_by_statement(parsed)
    assert len(parsed['authors']) == 1
    sole_author = parsed['authors'][0]
    print(repr(sole_author))
    imported = import_author(sole_author, eastern=eastern_order)
    if 'key' not in imported:
        # No key on the imported author: create it and use the returned key.
        new_key = ol.new(imported, comment='new author')
        print('ret:', new_key)
        assert isinstance(new_key, six.string_types)
        return {'key': new_key}
    return {'key': imported['key']}
Example #30
0
def test_author_from_700(mock_site, add_languages):
    """Loading the sample record must store its first author with name and birth date."""
    ocaid = 'sexuallytransmit00egen'
    marc_bytes = open_test_data(ocaid + '_meta.mrc').read()
    record = read_edition(MarcBinary(marc_bytes))
    record['source_records'] = ['ia:' + ocaid]
    result = load(record)
    assert result['success'] is True
    # author from 700
    first_author_key = result['authors'][0]['key']
    saved_author = mock_site.get(first_author_key)
    assert saved_author.type.key == '/type/author'
    assert saved_author.name == 'Laura K. Egendorf'
    assert saved_author.birth_date == '1973'
Example #31
0
def author_from_data(loc, data):
    """Return ``{'key': ...}`` for the single author of the record in *data*.

    The record is parsed with ``read_edition`` and must contain exactly one
    author. If ``import_author`` yields a dict that already has a ``key``,
    that key is returned; otherwise the author is created through
    ``ol.new`` and the resulting key is returned. *loc* is not used.
    """
    parsed = read_edition(data)
    assert 'authors' in parsed
    is_eastern = east_in_by_statement(parsed)
    assert len(parsed['authors']) == 1
    sole_author = parsed['authors'][0]
    print(repr(sole_author))
    author = import_author(sole_author, eastern=is_eastern)
    if 'key' not in author:
        # No existing key came back, so create the author record.
        new_key = ol.new(author, comment='new author')
        print('ret:', new_key)
        assert isinstance(new_key, basestring)
        return {'key': new_key}
    return {'key': author['key']}
Example #32
0
def test_from_marc(mock_site, add_languages):
    """Load two MARC fixtures twice each: first 'created', then 'matched'."""
    for ocaid in ('coursepuremath00hardrich', 'flatlandromanceo00abbouoft'):
        raw_marc = open_test_data(ocaid + '_meta.mrc').read()
        edition = read_edition(MarcBinary(raw_marc))
        edition['source_records'] = ['ia:' + ocaid]

        # The same record is loaded twice; only the reported status differs.
        for expected_status in ('created', 'matched'):
            result = load(edition)
            assert result['success'] is True
            assert result['edition']['status'] == expected_status
Example #33
0
def test_missing_ocaid(mock_site, add_languages, ia_writeback):
    """Load a record from a MARC source, then again with an IA source/ocaid.

    After the second load the stored edition must carry the ocaid and list
    the ``ia:`` source record.
    """
    ocaid = 'descendantsofhug00cham'
    ia_source = 'ia:' + ocaid
    raw_marc = open_test_data(ocaid + '_meta.mrc').read()
    edition = read_edition(MarcBinary(raw_marc))

    # First pass: the record is attributed to a plain MARC source.
    edition['source_records'] = ['marc:testdata.mrc']
    first = load(edition)
    assert first['success'] is True

    # Second pass: same record, now attributed to the IA item.
    edition['source_records'] = [ia_source]
    edition['ocaid'] = ocaid
    second = load(edition)
    assert second['success'] is True

    stored = mock_site.get(second['edition']['key'])
    assert stored.ocaid == ocaid
    assert ia_source in stored.source_records
Example #34
0
def test_from_marc(mock_site):
    """Import the Flatland MARC fixture and verify the created author.

    Checks that the record length declared in the first five bytes matches
    the data read, loads the edition, then verifies the first author's
    type, name, birth date and death date.
    """
    from openlibrary.catalog.marc.marc_binary import MarcBinary
    from openlibrary.catalog.marc.parse import read_edition

    add_languages(mock_site)
    # Context manager closes the fixture handle instead of leaking it.
    with open('test_data/flatlandromanceo00abbouoft_meta.mrc') as f:
        data = f.read()
    # The first five characters of a MARC record hold its total length.
    assert len(data) == int(data[:5])
    rec = read_edition(MarcBinary(data))
    reply = load(rec)
    assert reply['success'] is True
    akey1 = reply['authors'][0]['key']
    a = mock_site.get(akey1)
    assert a.type.key == '/type/author'
    assert a.name == 'Edwin Abbott Abbott'
    assert a.birth_date == '1838'
    assert a.death_date == '1926'
Example #35
0
 def test_xml(self, i):
     """Parse MARC XML fixture *i* and compare it with its expected JSON.

     :param i: fixture name used to build the input and expectation paths
     """
     expect_filename = "%s/xml_expect/%s_marc.xml" % (test_data, i)
     path = "%s/xml_input/%s_marc.xml" % (test_data, i)
     # Close the input file once the tree has been parsed.
     with open(path) as f:
         element = etree.parse(f).getroot()
     # Handle MARC XML collection elements in our test_data expectations:
     if element.tag == collection_tag and element[0].tag == record_tag:
         element = element[0]
     rec = MarcXml(element)
     edition_marc_xml = read_edition(rec)
     assert edition_marc_xml
     # Close the expectation file once it has been decoded.
     with open(expect_filename) as f:
         j = simplejson.load(f)
     assert j, "Unable to open test data: %s" % expect_filename
     # Compare keys first, then per-key values, so a failure points at the
     # smallest differing piece before the final whole-dict comparison.
     assert sorted(edition_marc_xml.keys()) == sorted(j.keys())
     for k in edition_marc_xml.keys():
         assert edition_marc_xml[k] == j[k]
     assert edition_marc_xml == j
Example #36
0
def test_from_marc_2(mock_site):
    """Load 'roadstogreatness00gall' twice: created first, then matched.

    Also verifies that the created edition has type ``/type/edition``.
    """
    add_languages(mock_site)
    ocaid = 'roadstogreatness00gall'

    raw_marc = open_test_data(ocaid + '_meta.mrc').read()
    # The first five characters of a MARC record hold its total length.
    assert len(raw_marc) == int(raw_marc[:5])
    edition = read_edition(MarcBinary(raw_marc))
    edition['source_records'] = ['ia:' + ocaid]

    first = load(edition)
    assert first['success'] == True
    assert first['edition']['status'] == 'created'
    stored = mock_site.get(first['edition']['key'])
    assert stored.type.key == '/type/edition'

    second = load(edition)
    assert second['success'] == True
    assert second['edition']['status'] == 'matched'
Example #37
0
 def test_xml(self, i):
     """Parse MARC XML fixture *i* and compare it with its expected JSON.

     :param i: fixture name used to build the input and expectation paths
     """
     expect_filename = "%s/xml_expect/%s_marc.xml" % (test_data, i)
     path = "%s/xml_input/%s_marc.xml" % (test_data, i)
     # Close the input file once the tree has been parsed.
     with open(path) as f:
         element = etree.parse(f).getroot()
     # Handle MARC XML collection elements in our test_data expectations:
     if element.tag == collection_tag and element[0].tag == record_tag:
         element = element[0]
     rec = MarcXml(element)
     edition_marc_xml = read_edition(rec)
     assert edition_marc_xml
     # Close the expectation file once it has been decoded.
     with open(expect_filename) as f:
         j = simplejson.load(f)
     assert j, "Unable to open test data: %s" % expect_filename
     # Compare keys first, then per-key values, so a failure points at the
     # smallest differing piece before the final whole-dict comparison.
     assert sorted(edition_marc_xml.keys()) == sorted(j.keys())
     for k in edition_marc_xml.keys():
         assert edition_marc_xml[k] == j[k]
     assert edition_marc_xml == j
Example #38
0
def test_from_marc(mock_site):
    """Import the Flatland MARC fixture and verify the created author.

    Checks that the record length declared in the first five bytes matches
    the data read, loads the edition, then verifies the first author's
    type, name, birth date and death date.
    """
    from openlibrary.catalog.marc.marc_binary import MarcBinary
    from openlibrary.catalog.marc.parse import read_edition

    add_languages(mock_site)
    # Context manager closes the fixture handle instead of leaking it.
    with open('test_data/flatlandromanceo00abbouoft_meta.mrc') as f:
        data = f.read()
    # The first five characters of a MARC record hold its total length.
    assert len(data) == int(data[:5])
    rec = read_edition(MarcBinary(data))
    reply = load(rec)
    assert reply['success'] is True
    akey1 = reply['authors'][0]['key']
    a = mock_site.get(akey1)
    assert a.type.key == '/type/author'
    assert a.name == 'Edwin Abbott Abbott'
    assert a.birth_date == '1838'
    assert a.death_date == '1926'
    def test_from_marc_author(self, mock_site, add_languages):
        """Load the Flatland fixture, verify its author, then reload it.

        The first load must create the edition and yield an author with the
        expected type, name and dates; the second load must match.
        """
        ocaid = 'flatlandromanceo00abbouoft'
        raw_marc = open_test_data(ocaid + '_meta.mrc').read()
        edition = read_edition(MarcBinary(raw_marc))
        edition['source_records'] = ['ia:' + ocaid]

        first = load(edition)
        assert first['success'] is True
        assert first['edition']['status'] == 'created'

        author_key = first['authors'][0]['key']
        author = mock_site.get(author_key)
        assert author.type.key == '/type/author'
        assert author.name == 'Edwin Abbott Abbott'
        assert author.birth_date == '1838'
        assert author.death_date == '1926'

        second = load(edition)
        assert second['success'] is True
        assert second['edition']['status'] == 'matched'
Example #40
0
def test_from_marc_2(mock_site):
    """Load 'roadstogreatness00gall' twice: created first, then matched.

    Also verifies that the created edition has type ``/type/edition``.
    """
    add_languages(mock_site)
    ia = 'roadstogreatness00gall'

    # Context manager closes the fixture handle instead of leaking it.
    with open('test_data/' + ia + '_meta.mrc') as f:
        data = f.read()
    # The first five characters of a MARC record hold its total length.
    assert len(data) == int(data[:5])
    rec = read_edition(MarcBinary(data))
    rec['source_records'] = ['ia:' + ia]
    reply = load(rec)
    assert reply['success'] is True
    assert reply['edition']['status'] == 'created'
    e = mock_site.get(reply['edition']['key'])
    assert e.type.key == '/type/edition'

    # Re-importing the identical record must match, not duplicate.
    reply = load(rec)
    assert reply['success'] is True
    assert reply['edition']['status'] == 'matched'
Example #41
0
 def test_binary(self, i):
     """Parse binary MARC fixture *i* and compare it with its expected JSON.

     :param i: fixture file name under ``bin_input`` / ``bin_expect``
     """
     expect_filename = "%s/bin_expect/%s" % (test_data, i)
     # Close the input file as soon as its contents have been read.
     with open("%s/bin_input/%s" % (test_data, i)) as f:
         data = f.read()
     if len(data) != int(data[:5]):
         #TODO: Why are we fixing this in test expectations? Investigate.
         #      affects histoirereligieu05cr_meta.mrc and zweibchersatir01horauoft_meta.mrc
         data = data.decode('utf-8').encode('raw_unicode_escape')
     assert len(data) == int(data[:5])
     rec = MarcBinary(data)
     edition_marc_bin = read_edition(rec)
     assert edition_marc_bin
     # Close the expectation file once it has been decoded.
     with open(expect_filename) as f:
         j = simplejson.load(f)
     assert j, "Unable to open test data: %s" % expect_filename
     assert sorted(edition_marc_bin.keys()) == sorted(j.keys())
     for k in edition_marc_bin.keys():
         if isinstance(j[k], list):
             # Item-by-item comparison first so a failure names the
             # differing element rather than the whole list.
             for item1, item2 in zip(edition_marc_bin[k], j[k]):
                 assert item1 == item2
         assert edition_marc_bin[k] == j[k]
     assert edition_marc_bin == j
Example #42
0
 def test_xml(self, i):
     """Parse MARC XML fixture *i* and compare it with its expected JSON.

     :param i: fixture name used to build the input and expectation paths
     """
     expect_filename = "%s/xml_expect/%s_marc.xml" % (test_data, i)
     path = "%s/xml_input/%s_marc.xml" % (test_data, i)
     # Close the input file once the tree has been parsed.
     with open(path) as f:
         element = etree.parse(f).getroot()
     # Handle MARC XML collection elements in our test_data expectations:
     if element.tag == collection_tag and element[0].tag == record_tag:
         element = element[0]
     rec = MarcXml(element)
     edition_marc_xml = read_edition(rec)
     assert edition_marc_xml
     # Close the expectation file once it has been decoded.
     with open(expect_filename) as f:
         j = simplejson.load(f)
     assert j, 'Unable to open test data: %s' % expect_filename
     assert sorted(edition_marc_xml) == sorted(j), ('Processed MARCXML fields do '
                                                    'not match expectations in %s'
                                                    % expect_filename)
     for k in edition_marc_xml:
         assert edition_marc_xml[k] == j[k], ('Processed MARCXML values do not '
                                              'match expectations in %s'
                                              % expect_filename)
     assert edition_marc_xml == j
Example #43
0
 def test_binary(self, i):
     """Parse binary MARC fixture *i* and compare it with its expected JSON.

     :param i: fixture file name under ``bin_input`` / ``bin_expect``
     """
     expect_filename = "%s/bin_expect/%s" % (test_data, i)
     # Close the input file as soon as its contents have been read.
     with open("%s/bin_input/%s" % (test_data, i)) as f:
         data = f.read()
     if len(data) != int(data[:5]):
         #TODO: Why are we fixing this in test expectations? Investigate.
         #      affects histoirereligieu05cr_meta.mrc and zweibchersatir01horauoft_meta.mrc
         data = data.decode('utf-8').encode('raw_unicode_escape')
     assert len(data) == int(data[:5])
     rec = MarcBinary(data)
     edition_marc_bin = read_edition(rec)
     assert edition_marc_bin
     # Close the expectation file once it has been decoded.
     with open(expect_filename) as f:
         j = simplejson.load(f)
     assert j, "Unable to open test data: %s" % expect_filename
     assert sorted(edition_marc_bin.keys()) == sorted(j.keys())
     for k in edition_marc_bin.keys():
         if isinstance(j[k], list):
             # Item-by-item comparison first so a failure names the
             # differing element rather than the whole list.
             for item1, item2 in zip(edition_marc_bin[k], j[k]):
                 assert item1 == item2
         assert edition_marc_bin[k] == j[k]
     assert edition_marc_bin == j
Example #44
0
def test_extra_author(mock_site):
    """Load a Bancroft edition twice against a pre-existing author and work.

    An author and a work that references it are saved to the mock site
    first; the 'workshuberthowe00racegoog' MARC fixture is then loaded
    twice and both loads must succeed.
    """
    add_languages(mock_site)

    mock_site.save({
        "name": "Hubert Howe Bancroft",
        "death_date": "1918.",
        "alternate_names": ["HUBERT HOWE BANCROFT", "Hubert Howe Bandcroft"],
        "key": "/authors/OL563100A",
        "birth_date": "1832",
        "personal_name": "Hubert Howe Bancroft",
        "type": {"key": "/type/author"},
    })

    mock_site.save({
        "title": "The works of Hubert Howe Bancroft",
        "covers": [6060295, 5551343],
        "first_sentence": {"type": "/type/text", "value": "When it first became known to Europe that a new continent had been discovered, the wise men, philosophers, and especially the learned ecclesiastics, were sorely perplexed to account for such a discovery."},
        "subject_places": ["Alaska", "America", "Arizona", "British Columbia", "California", "Canadian Northwest", "Central America", "Colorado", "Idaho", "Mexico", "Montana", "Nevada", "New Mexico", "Northwest Coast of North America", "Northwest boundary of the United States", "Oregon", "Pacific States", "Texas", "United States", "Utah", "Washington (State)", "West (U.S.)", "Wyoming"],
        "excerpts": [{"excerpt": "When it first became known to Europe that a new continent had been discovered, the wise men, philosophers, and especially the learned ecclesiastics, were sorely perplexed to account for such a discovery."}],
        "first_publish_date": "1882",
        "key": "/works/OL3421434W",
        "authors": [{"type": {"key": "/type/author_role"}, "author": {"key": "/authors/OL563100A"}}],
        "subject_times": ["1540-1810", "1810-1821", "1821-1861", "1821-1951", "1846-1850", "1850-1950", "1859-", "1859-1950", "1867-1910", "1867-1959", "1871-1903", "Civil War, 1861-1865", "Conquest, 1519-1540", "European intervention, 1861-1867", "Spanish colony, 1540-1810", "To 1519", "To 1821", "To 1846", "To 1859", "To 1867", "To 1871", "To 1889", "To 1912", "Wars of Independence, 1810-1821"],
        "type": {"key": "/type/work"},
        "subjects": ["Antiquities", "Archaeology", "Autobiography", "Bibliography", "California Civil War, 1861-1865", "Comparative Literature", "Comparative civilization", "Courts", "Description and travel", "Discovery and exploration", "Early accounts to 1600", "English essays", "Ethnology", "Foreign relations", "Gold discoveries", "Historians", "History", "Indians", "Indians of Central America", "Indians of Mexico", "Indians of North America", "Languages", "Law", "Mayas", "Mexican War, 1846-1848", "Nahuas", "Nahuatl language", "Oregon question", "Political aspects of Law", "Politics and government", "Religion and mythology", "Religions", "Social life and customs", "Spanish", "Vigilance committees", "Writing", "Zamorano 80", "Accessible book", "Protected DAISY"]
    })

    ia = 'workshuberthowe00racegoog'
    src = ia + '_meta.mrc'
    # Context manager closes the fixture handle instead of leaking it.
    with open('test_data/' + src) as f:
        marc = MarcBinary(f.read())
    rec = read_edition(marc)
    rec['source_records'] = ['ia:' + ia]

    reply = load(rec)
    assert reply['success'] is True

    # Fetching the work exercises the lookup of the key returned by load.
    w = mock_site.get(reply['work']['key'])

    reply = load(rec)
    assert reply['success'] is True
    w = mock_site.get(reply['work']['key'])
Example #45
0
def test_extra_author(mock_site):
    """Load a Bancroft edition twice against a pre-existing author and work.

    An author and a work that references it are saved to the mock site
    first; the 'workshuberthowe00racegoog' MARC fixture is then loaded
    twice and both loads must succeed.
    """
    add_languages(mock_site)

    existing_author = {
        "name": "Hubert Howe Bancroft",
        "death_date": "1918.",
        "alternate_names": ["HUBERT HOWE BANCROFT", "Hubert Howe Bandcroft"],
        "key": "/authors/OL563100A",
        "birth_date": "1832",
        "personal_name": "Hubert Howe Bancroft",
        "type": {"key": "/type/author"},
    }
    mock_site.save(existing_author)

    existing_work = {
        "title": "The works of Hubert Howe Bancroft",
        "covers": [6060295, 5551343],
        "first_sentence": {"type": "/type/text", "value": "When it first became known to Europe that a new continent had been discovered, the wise men, philosophers, and especially the learned ecclesiastics, were sorely perplexed to account for such a discovery."},
        "subject_places": ["Alaska", "America", "Arizona", "British Columbia", "California", "Canadian Northwest", "Central America", "Colorado", "Idaho", "Mexico", "Montana", "Nevada", "New Mexico", "Northwest Coast of North America", "Northwest boundary of the United States", "Oregon", "Pacific States", "Texas", "United States", "Utah", "Washington (State)", "West (U.S.)", "Wyoming"],
        "excerpts": [{"excerpt": "When it first became known to Europe that a new continent had been discovered, the wise men, philosophers, and especially the learned ecclesiastics, were sorely perplexed to account for such a discovery."}],
        "first_publish_date": "1882",
        "key": "/works/OL3421434W",
        "authors": [{"type": {"key": "/type/author_role"}, "author": {"key": "/authors/OL563100A"}}],
        "subject_times": ["1540-1810", "1810-1821", "1821-1861", "1821-1951", "1846-1850", "1850-1950", "1859-", "1859-1950", "1867-1910", "1867-1959", "1871-1903", "Civil War, 1861-1865", "Conquest, 1519-1540", "European intervention, 1861-1867", "Spanish colony, 1540-1810", "To 1519", "To 1821", "To 1846", "To 1859", "To 1867", "To 1871", "To 1889", "To 1912", "Wars of Independence, 1810-1821"],
        "type": {"key": "/type/work"},
        "subjects": ["Antiquities", "Archaeology", "Autobiography", "Bibliography", "California Civil War, 1861-1865", "Comparative Literature", "Comparative civilization", "Courts", "Description and travel", "Discovery and exploration", "Early accounts to 1600", "English essays", "Ethnology", "Foreign relations", "Gold discoveries", "Historians", "History", "Indians", "Indians of Central America", "Indians of Mexico", "Indians of North America", "Languages", "Law", "Mayas", "Mexican War, 1846-1848", "Nahuas", "Nahuatl language", "Oregon question", "Political aspects of Law", "Politics and government", "Religion and mythology", "Religions", "Social life and customs", "Spanish", "Vigilance committees", "Writing", "Zamorano 80", "Accessible book", "Protected DAISY"]
    }
    mock_site.save(existing_work)

    ocaid = 'workshuberthowe00racegoog'
    fixture_name = ocaid + '_meta.mrc'
    raw_marc = open_test_data(fixture_name).read()
    edition = read_edition(MarcBinary(raw_marc))
    edition['source_records'] = ['ia:' + ocaid]

    first = load(edition)
    assert first['success'] == True

    # Fetching the work exercises the lookup of the key returned by load.
    work = mock_site.get(first['work']['key'])

    second = load(edition)
    assert second['success'] == True
    work = mock_site.get(second['work']['key'])
Example #46
0
 def test_xml(self, i):
     """Parse MARC XML fixture *i* and compare it with its expected JSON.

     List-like values are compared without regard to order (same length
     and every expected item present). Text and mapping values are
     compared for equality.

     :param i: fixture name used to build the input and expectation paths
     """
     expect_filename = "%s/xml_expect/%s_marc.xml" % (test_data, i)
     path = "%s/xml_input/%s_marc.xml" % (test_data, i)
     # Close the input file once the tree has been parsed.
     with open(path) as f:
         element = etree.parse(f).getroot()
     # Handle MARC XML collection elements in our test_data expectations:
     if element.tag == collection_tag and element[0].tag == record_tag:
         element = element[0]
     rec = MarcXml(element)
     edition_marc_xml = read_edition(rec)
     assert edition_marc_xml
     # Close the expectation file once it has been decoded.
     with open(expect_filename) as f:
         j = json.load(f)
     assert j, 'Unable to open test data: %s' % expect_filename
     assert sorted(edition_marc_xml) == sorted(j), (
         'Processed MARCXML fields do not match expectations in %s' %
         expect_filename)
     msg = ('Processed MARCXML values do not match expectations in %s' %
            expect_filename)
     # Strings and dicts are Iterable too; the order-insensitive check below
     # would only compare their length and character/key membership, letting
     # wrong values pass. Compare those by equality instead.
     # ``type(u'')`` is the text type on both Python 2 and Python 3.
     compared_by_equality = (bytes, type(u''), dict)
     for key, value in edition_marc_xml.items():
         if (isinstance(value, Iterable)
                 and not isinstance(value, compared_by_equality)):
             # can not sort a list of dicts
             assert len(value) == len(j[key]), msg
             assert all(item in value for item in j[key]), msg
         else:
             assert value == j[key], msg
def test_from_marc_fields(mock_site, add_languages):
    """Load 'isbn_9781419594069' and check the fields stored from its MARC.

    Verifies the author name in the reply, then publisher, dates, places,
    pagination, subjects and description on the stored edition, and the
    description on the stored work.
    """
    ocaid = 'isbn_9781419594069'
    raw_marc = open_test_data(ocaid + '_meta.mrc').read()
    record = read_edition(MarcBinary(raw_marc))
    record['source_records'] = ['ia:' + ocaid]
    result = load(record)
    assert result['success'] is True
    # author from 100
    assert result['authors'][0]['name'] == 'Adam Weiner'

    stored_edition = mock_site.get(result['edition']['key'])
    # Publish place, publisher, & publish date - 260$a, $b, $c
    assert stored_edition['publishers'][0] == 'Kaplan Publishing'
    assert stored_edition['publish_date'] == '2007'
    assert stored_edition['publish_places'][0] == 'New York'
    # Pagination 300
    assert stored_edition['number_of_pages'] == 264
    assert stored_edition['pagination'] == 'viii, 264 p.'
    # 8 subjects, 650
    expected_subjects = [
        u'Action and adventure films',
        u'Miscellanea',
        u'Physics',
        u'Cinematography',
        u'Special effects',
        u'Physics in motion pictures',
        u'Science fiction films',
        u'Popular works',
    ]
    assert len(stored_edition['subjects']) == 8
    assert stored_edition['subjects'] == expected_subjects
    # Edition description from 520
    expected_description = 'Explains the basic laws of physics, covering such topics as mechanics, forces, and energy, while deconstructing famous scenes and stunts from motion pictures, including "Apollo 13" and "Titanic," to determine if they are possible.'
    assert isinstance(stored_edition['description'], Text)
    assert stored_edition['description'] == expected_description
    # Work description from 520
    stored_work = mock_site.get(result['work']['key'])
    assert isinstance(stored_work['description'], Text)
    assert stored_work['description'] == expected_description
Example #48
0
def test_don_quixote(mock_site):
    """Disabled test: load MARC records for a list of Don Quixote items.

    The function returns immediately, so nothing below the first statement
    runs and the test is reported as passing rather than skipped.
    NOTE(review): presumably disabled because the body downloads each
    record from archive.org -- confirm before re-enabling.
    """
    # Early exit: everything after this line is currently unreachable.
    return
    dq = [u'lifeexploitsofin01cerv', u'cu31924096224518',
        u'elingeniosedcrit04cerv', u'ingeniousgentlem01cervuoft',
        u'historyofingenio01cerv', u'lifeexploitsofin02cerviala',
        u'elingeniosohidal03cervuoft', u'nybc209000', u'elingeniosohidal11cerv',
        u'elingeniosohidal01cervuoft', u'elingeniosoh01cerv',
        u'donquixotedelama00cerviala', u'1896elingeniosohid02cerv',
        u'ingeniousgentlem04cervuoft', u'cu31924027656978', u'histoiredeladmir01cerv',
        u'donquijotedelama04cerv', u'cu31924027657075', u'donquixotedelama03cervuoft',
        u'aventurasdedonqu00cerv', u'p1elingeniosohid03cerv',
        u'geshikhefundonik01cervuoft', u'historyofvalorou02cerviala',
        u'ingeniousgentlem01cerv', u'donquixotedelama01cervuoft',
        u'ingeniousgentlem0195cerv', u'firstpartofdelig00cervuoft',
        u'p4elingeniosohid02cerv', u'donquijote00cervuoft', u'cu31924008863924',
        u'c2elingeniosohid02cerv', u'historyofvalorou03cerviala',
        u'historyofingenio01cerviala', u'historyadventure00cerv',
        u'elingeniosohidal00cerv', u'lifeexploitsofin01cervuoft',
        u'p2elingeniosohid05cerv', u'nybc203136', u'elingeniosohidal00cervuoft',
        u'donquixotedelama02cervuoft', u'lingnieuxcheva00cerv',
        u'ingeniousgentlem03cerv', u'vidayhechosdeli00siscgoog',
        u'lifeandexploits01jarvgoog', u'elingeniosohida00puiggoog',
        u'elingeniosohida00navagoog', u'donquichottedel02florgoog',
        u'historydonquixo00cogoog', u'vidayhechosdeli01siscgoog',
        u'elingeniosohida28saavgoog', u'historyvalorous00brangoog',
        u'elingeniosohida01goog', u'historyandadven00unkngoog',
        u'historyvalorous01goog', u'ingeniousgentle11saavgoog',
        u'elingeniosohida10saavgoog', u'adventuresdonqu00jarvgoog',
        u'historydonquixo04saavgoog', u'lingnieuxcheval00rouxgoog',
        u'elingeniosohida19saavgoog', u'historyingeniou00lalagoog',
        u'elingeniosohida00ormsgoog', u'historyandadven01smolgoog',
        u'elingeniosohida27saavgoog', u'elingeniosohida21saavgoog',
        u'historyingeniou00mottgoog', u'historyingeniou03unkngoog',
        u'lifeandexploits00jarvgoog', u'ingeniousgentle00conggoog',
        u'elingeniosohida00quixgoog', u'elingeniosohida01saavgoog',
        u'donquixotedelam02saavgoog', u'adventuresdonqu00gilbgoog',
        u'historyingeniou02saavgoog', u'donquixotedelam03saavgoog',
        u'elingeniosohida00ochogoog', u'historyingeniou08mottgoog',
        u'lifeandexploits01saavgoog', u'firstpartdeligh00shelgoog',
        u'elingeniosohida00castgoog', u'elingeniosohida01castgoog',
        u'adventofdonquixo00cerv', u'portablecervante00cerv',
        u'firstpartofdelig14cerv', u'donquixotemanofl00cerv',
        u'firstpartofdelig00cerv']

    add_languages(mock_site)
    # These counters are created but never updated or read in this version.
    edition_status_counts = defaultdict(int)
    work_status_counts = defaultdict(int)
    author_status_counts = defaultdict(int)
    for num, ia in enumerate(dq):
        marc_url = 'http://archive.org/download/%s/%s_meta.mrc' % (ia, ia)
        data = urlopen(marc_url).read()
        # Skip items whose download returned the "page not found" HTML.
        if '<title>Internet Archive: Page Not Found</title>' in data:
            continue
        marc = MarcBinary(data)
        rec = read_edition(marc)
        rec['source_records'] = ['ia:' + ia]
        reply = load(rec)
        # Query for works attributed to the author with key /authors/OL1A.
        q = {
            'type': '/type/work',
            'authors.author': '/authors/OL1A',
        }
        work_keys = list(mock_site.things(q))
        assert work_keys
        
        assert reply['success'] == True
def test_don_quixote(mock_site):
    """
    Every item listed here is by 'Miguel de Cervantes Saavedra', so
    loading them all should create exactly one Author. Some items
    have a bad MARC length and others have no binary MARC at all;
    those raise BadLength / BadMARC and are collected, not loaded.
    """
    pytest.skip("This test make live requests to archive.org")

    items = [
        u'lifeexploitsofin01cerv', u'cu31924096224518',
        u'elingeniosedcrit04cerv', u'ingeniousgentlem01cervuoft',
        u'historyofingenio01cerv', u'lifeexploitsofin02cerviala',
        u'elingeniosohidal03cervuoft', u'nybc209000', u'elingeniosohidal11cerv',
        u'elingeniosohidal01cervuoft', u'elingeniosoh01cerv',
        u'donquixotedelama00cerviala', u'1896elingeniosohid02cerv',
        u'ingeniousgentlem04cervuoft', u'cu31924027656978', u'histoiredeladmir01cerv',
        u'donquijotedelama04cerv', u'cu31924027657075', u'donquixotedelama03cervuoft',
        u'aventurasdedonqu00cerv', u'p1elingeniosohid03cerv',
        u'geshikhefundonik01cervuoft', u'historyofvalorou02cerviala',
        u'ingeniousgentlem01cerv', u'donquixotedelama01cervuoft',
        u'ingeniousgentlem0195cerv', u'firstpartofdelig00cervuoft',
        u'p4elingeniosohid02cerv', u'donquijote00cervuoft', u'cu31924008863924',
        u'c2elingeniosohid02cerv', u'historyofvalorou03cerviala',
        u'historyofingenio01cerviala', u'historyadventure00cerv',
        u'elingeniosohidal00cerv', u'lifeexploitsofin01cervuoft',
        u'p2elingeniosohid05cerv', u'nybc203136', u'elingeniosohidal00cervuoft',
        u'donquixotedelama02cervuoft', u'lingnieuxcheva00cerv',
        u'ingeniousgentlem03cerv', u'vidayhechosdeli00siscgoog',
        u'lifeandexploits01jarvgoog', u'elingeniosohida00puiggoog',
        u'elingeniosohida00navagoog', u'donquichottedel02florgoog',
        u'historydonquixo00cogoog', u'vidayhechosdeli01siscgoog',
        u'elingeniosohida28saavgoog', u'historyvalorous00brangoog',
        u'elingeniosohida01goog', u'historyandadven00unkngoog',
        u'historyvalorous01goog', u'ingeniousgentle11saavgoog',
        u'elingeniosohida10saavgoog', u'adventuresdonqu00jarvgoog',
        u'historydonquixo04saavgoog', u'lingnieuxcheval00rouxgoog',
        u'elingeniosohida19saavgoog', u'historyingeniou00lalagoog',
        u'elingeniosohida00ormsgoog', u'historyandadven01smolgoog',
        u'elingeniosohida27saavgoog', u'elingeniosohida21saavgoog',
        u'historyingeniou00mottgoog', u'historyingeniou03unkngoog',
        u'lifeandexploits00jarvgoog', u'ingeniousgentle00conggoog',
        u'elingeniosohida00quixgoog', u'elingeniosohida01saavgoog',
        u'donquixotedelam02saavgoog', u'adventuresdonqu00gilbgoog',
        u'historyingeniou02saavgoog', u'donquixotedelam03saavgoog',
        u'elingeniosohida00ochogoog', u'historyingeniou08mottgoog',
        u'lifeandexploits01saavgoog', u'firstpartdeligh00shelgoog',
        u'elingeniosohida00castgoog', u'elingeniosohida01castgoog',
        u'adventofdonquixo00cerv', u'portablecervante00cerv',
        u'firstpartofdelig14cerv', u'donquixotemanofl00cerv',
        u'firstpartofdelig00cerv',
    ]

    # Identifiers whose MARC could not be parsed, grouped by failure kind.
    bad_length = []
    bad_marc = []

    add_languages(mock_site)
    edition_status_counts = defaultdict(int)
    work_status_counts = defaultdict(int)
    author_status_counts = defaultdict(int)

    for ocaid in items:
        marc_url = 'https://archive.org/download/%s/%s_meta.mrc' % (ocaid, ocaid)
        raw_marc = urlopen(marc_url).read()
        try:
            marc = MarcBinary(raw_marc)
        except BadLength:
            bad_length.append(ocaid)
            continue
        except BadMARC:
            bad_marc.append(ocaid)
            continue

        record = read_edition(marc)
        record['source_records'] = ['ia:' + ocaid]
        reply = load(record)

        work_query = {
            'type': '/type/work',
            'authors.author': '/authors/OL1A',
        }
        work_keys = list(mock_site.things(work_query))
        author_keys = list(mock_site.things({'type': '/type/author'}))
        print("\nReply for %s: %s" % (ocaid, reply))
        print("Work keys: %s" % work_keys)
        assert author_keys == ['/authors/OL1A']
        assert reply['success'] is True

        # Increment status counters
        edition_status = reply['edition']['status']
        work_status = reply['work']['status']
        edition_status_counts[edition_status] += 1
        work_status_counts[work_status] += 1
        # No author key in response if work is 'matched'
        # No author key in response if edition is 'modified'
        if not (work_status == 'matched' or edition_status == 'modified'):
            author_status_counts[reply['authors'][0]['status']] += 1

    print("BAD MARC LENGTH items: %s" % bad_length)
    print("BAD MARC items: %s" % bad_marc)
    print("Edition status counts: %s" % edition_status_counts)
    print("Work status counts: %s" % work_status_counts)
    print("Author status counts: %s" % author_status_counts)
Example #50
0
 def test_raises_see_also(self):
     """read_edition must raise SeeAlsoAsTitle for talis_see_also.mrc."""
     marc_path = "%s/bin_input/talis_see_also.mrc" % test_data
     with open(marc_path, 'r') as marc_file:
         raw_marc = marc_file.read()
         record = MarcBinary(raw_marc)
     with pytest.raises(SeeAlsoAsTitle):
         read_edition(record)
Example #51
0
 def test_raises_no_title(self):
     """read_edition must raise NoTitle for talis_no_title2.mrc."""
     marc_path = "%s/bin_input/talis_no_title2.mrc" % test_data
     with open(marc_path, 'r') as marc_file:
         raw_marc = marc_file.read()
         record = MarcBinary(raw_marc)
     with pytest.raises(NoTitle):
         read_edition(record)
Example #52
0
def parse_data(data):
    """
    Detect the format of the submitted data and build an edition dict.

    XML input is dispatched on its root tag (RDF, OPDS/Atom entry, or
    MARCXML record/collection). Input wrapped in ``{...}`` is decoded as
    JSON. Input starting with ``ia:`` loads the MARC record of that
    archive.org item. Anything else is treated as a binary MARC record.

    :param data: Raw submitted data
    :return: (edition dict, format) where format is one of
             rdf|opds|marcxml|json|marc, or (None, None) for unrecognized XML
    :raises DataError: for an invalid or non-book IA identifier, a serial,
                       a missing MARC record, or a binary MARC record whose
                       declared length does not match the data
    """
    marc_record_tag = '{http://www.loc.gov/MARC21/slim}record'
    marc_collection_tag = '{http://www.loc.gov/MARC21/slim}collection'

    data = data.strip()
    if -1 != data[:10].find('<?xml'):
        root = etree.fromstring(data)
        if '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF' == root.tag:
            edition_builder = import_rdf.parse(root)
            fmt = 'rdf'
        elif '{http://www.w3.org/2005/Atom}entry' == root.tag:
            edition_builder = import_opds.parse(root)
            fmt = 'opds'
        elif root.tag in (marc_record_tag, marc_collection_tag):
            # A collection wraps one or more records; use the first one.
            # Previously the collection check sat inside a branch that only
            # admitted the record tag, so collections were never accepted.
            if root.tag == marc_collection_tag:
                root = root[0]
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(init_dict=edition)
            fmt = 'marcxml'
        else:
            # Parenthesized single-argument form prints the same text under
            # both the Python 2 print statement and the Python 3 function.
            print('unrecognized XML format')
            return None, None
    elif data.startswith('{') and data.endswith('}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(init_dict=obj)
        fmt = 'json'
    else:
        # Special case to load IA records
        # Just passing ia:foo00bar is enough to load foo00bar from IA.
        if data.startswith("ia:"):
            source_records = [data]
            itemid = data[len("ia:"):]

            metadata = ia.get_metadata(itemid)
            if not metadata:
                raise DataError("invalid-ia-identifier")

            if not ia.edition_from_item_metadata(itemid, metadata):
                raise DataError("item-not-a-book")

            try:
                rec = get_marc_record_from_ia(itemid)

                # Check for a missing record before touching its leader;
                # otherwise a falsy result fails on the attribute access.
                if not rec:
                    raise DataError("no-marc-record")

                # skip serials
                if rec.leader()[7] == 's':
                    raise DataError("item-is-serial")
            except IOError:
                raise DataError("no-marc-record")
        else:
            source_records = None
            itemid = None

            # Marc Binary: the first five characters declare the record length.
            if len(data) != int(data[:5]):
                # Raise instead of returning a JSON string: every other exit
                # of this function yields a 2-tuple, so a bare string broke
                # callers that unpack the result.
                raise DataError('Bad MARC length')

            rec = MarcBinary(data)

        edition = read_edition(rec)
        if source_records:
            edition['source_records'] = source_records
            edition['ocaid'] = itemid
        edition_builder = import_edition_builder.import_edition_builder(init_dict=edition)
        fmt = 'marc'

    parse_meta_headers(edition_builder)

    return edition_builder.get_dict(), fmt
Example #53
0
    def POST(self):
        """
        Import an item named by the ``identifier`` request parameter.

        With ``bulk_marc=true`` the identifier has the form
        ``ocaid/filename:offset:length`` and names one binary MARC record
        inside a bulk file; the record is read, loaded, and the location of
        the next record is added to the response.

        Otherwise the identifier is treated as an archive.org item and goes
        through a series of checks (already loaded, valid item, existing
        ``openlibrary`` metadata field, item status, ``no_ol_import`` flag,
        MARC record availability) before being loaded.

        :return: JSON response produced by ``json.dumps``, ``self.error``,
                 ``self.status_matched`` or ``self.load_book``
        :raises web.HTTPError: 403 when the user may not write
        """
        web.header('Content-Type', 'application/json')

        if not can_write():
            raise web.HTTPError('403 Forbidden')

        i = web.input()

        # MARC is required unless the caller explicitly passes 'false'.
        require_marc = not (i.get('require_marc') == 'false')
        bulk_marc = i.get('bulk_marc') == 'true'

        if 'identifier' not in i:
            return self.error('bad-input', 'identifier not provided')
        identifier = i.identifier

        # First check whether this is a non-book, bulk-marc item
        if bulk_marc:
            # Get binary MARC by identifier = ocaid/filename:offset:length
            # Raw string: '\d' in a plain literal is an invalid escape sequence.
            re_bulk_identifier = re.compile(r"([^/]*)/([^:]*):(\d*):(\d*)")
            match = re_bulk_identifier.match(identifier)
            if not match:
                # Without this guard a malformed identifier failed with an
                # uncaught AttributeError on None.groups().
                return self.error('bad-input', 'invalid bulk MARC identifier: %s' % identifier)
            ocaid, filename, offset, length = match.groups()

            # Defined before the try so the except handler can always expand
            # it, even when the failure happens before the next-record
            # location is known.
            next_data = {}
            try:
                data, next_offset, next_length = get_from_archive_bulk(identifier)
                next_data = {'next_record_offset': next_offset, 'next_record_length': next_length}
                rec = MarcBinary(data)
                edition = read_edition(rec)
            except MarcException as e:
                details = "%s: %s" % (identifier, str(e))
                logger.error("failed to read from bulk MARC record %s", details)
                return self.error('invalid-marc-record', details, **next_data)

            # Use the length declared in the record leader, not the requested one.
            actual_length = int(rec.leader()[:5])
            edition['source_records'] = 'marc:%s/%s:%s:%d' % (ocaid, filename, offset, actual_length)

            #TODO: Look up URN prefixes to support more sources
            prefix = 'trent'
            edition['local_id'] = ['urn:%s:%s' % (prefix, _id) for _id in rec.get_fields('001')]
            result = add_book.load(edition)

            # Add next_data to the response as location of next record:
            result.update(next_data)

            return json.dumps(result)

        # Case 0 - Is the item already loaded
        key = self.find_edition(identifier)
        if key:
            return self.status_matched(key)

        # Case 1 - Is this a valid Archive.org item?
        try:
            item_json = ia.get_item_json(identifier)
            item_server = item_json['server']
            item_path = item_json['dir']
        except KeyError:
            return self.error("invalid-ia-identifier", "%s not found" % identifier)
        metadata = ia.extract_item_metadata(item_json)
        if not metadata:
            return self.error("invalid-ia-identifier")

        # Case 2 - Does the item have an openlibrary field specified?
        # The scan operators search OL before loading the book and add the
        # OL key if a match is found. We can trust them and attach the item
        # to that edition.
        if metadata.get("mediatype") == "texts" and metadata.get("openlibrary"):
            d = {
                "title": metadata['title'],
                "openlibrary": "/books/" + metadata["openlibrary"]
            }
            d = self.populate_edition_data(d, identifier)
            return self.load_book(d)

        # Case 3 - Can the item be loaded into Open Library?
        status = ia.get_item_status(identifier, metadata,
                                    item_server=item_server, item_path=item_path)
        if status != 'ok':
            return self.error(status, "Prohibited Item")

        # Gio - April 2016
        # items with metadata no_ol_import=true will not be imported
        if metadata.get("no_ol_import", '').lower() == 'true':
            return self.error("no-ol-import")

        # Case 4 - Does this item have a marc record?
        marc_record = self.get_marc_record(identifier)
        if marc_record:
            # Is the item a serial instead of a book?
            marc_leaders = marc_record.leader()
            if marc_leaders[7] == 's':
                return self.error("item-is-serial")

            # insider note: follows Archive.org's approach of
            # Item::isMARCXMLforMonograph() which excludes non-books
            if not (marc_leaders[7] == 'm' and marc_leaders[6] == 'a'):
                return self.error("item-not-book")

            try:
                edition_data = read_edition(marc_record)
            except MarcException as e:
                logger.error("failed to read from MARC record %s: %s", identifier, str(e))
                return self.error("invalid-marc-record")

        elif require_marc:
            return self.error("no-marc-record")

        else:
            # No MARC record and none required: build the edition from the
            # item's archive.org metadata instead.
            try:
                edition_data = self.get_ia_record(metadata)
            except KeyError:
                return self.error("invalid-ia-metadata")

        # Add IA specific fields: ocaid, source_records, and cover
        edition_data = self.populate_edition_data(edition_data, identifier)

        return self.load_book(edition_data)
Example #54
0
def parse_data(data):
    """
    Takes POSTed data and determines the format, and returns an Edition record
    suitable for adding to OL.

    Formats are probed in this order: XML (RDF, Atom/OPDS entry, MARCXML
    record or collection), a JSON object, the deprecated ``ia:<identifier>``
    shortcut, and finally binary MARC.

    :param str data: Raw data
    :rtype: (dict|None, str|None)
    :return: (Edition record, format (rdf|opds|marcxml|json|marc)) or
        (None, None) when the XML root element is not a recognized format
    :raises DataError: for the ``ia:`` shortcut when the item has no
        metadata, is not importable, is a serial or has no MARC record; and
        for binary MARC whose declared length is missing or does not match
        the size of the data
    """
    marcxml_record = '{http://www.loc.gov/MARC21/slim}record'
    marcxml_collection = '{http://www.loc.gov/MARC21/slim}collection'

    data = data.strip()
    if '<?xml' in data[:10]:
        root = etree.fromstring(data)
        if root.tag == '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF':
            edition_builder = import_rdf.parse(root)
            fmt = 'rdf'
        elif root.tag == '{http://www.w3.org/2005/Atom}entry':
            edition_builder = import_opds.parse(root)
            fmt = 'opds'
        elif root.tag in (marcxml_record, marcxml_collection):
            # A <collection> wraps one or more <record> elements; only the
            # first record is imported.
            if root.tag == marcxml_collection:
                root = root[0]
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(init_dict=edition)
            fmt = 'marcxml'
        else:
            # Parenthesised form prints the same text on Python 2 and 3.
            print('unrecognized XML format')
            return None, None
    elif data.startswith('{') and data.endswith('}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(init_dict=obj)
        fmt = 'json'
    else:
        # Special case to load IA records, DEPRECATED: use import/ia endpoint
        # Just passing ia:foo00bar is enough to load foo00bar from IA.
        if data.startswith("ia:"):
            source_records = [data]
            itemid = data[len("ia:"):]

            metadata = ia.get_metadata(itemid)
            if not metadata:
                raise DataError("invalid-ia-identifier")

            # see ia_importapi to address `imagecount` limitations
            status = ia.get_item_status(itemid, metadata)
            if status != 'ok':
                raise DataError(status)

            try:
                rec = get_marc_record_from_ia(itemid)

                # skip serials
                if rec and rec.leader()[7] == 's':
                    raise DataError("item-is-serial")
            except IOError:
                raise DataError("no-marc-record")

            if not rec:
                raise DataError("no-marc-record")
        else:
            source_records = None
            itemid = None

            # Marc Binary: the first five characters hold the record length,
            # which must match the size of the data received.
            try:
                declared_length = int(data[:5])
            except ValueError:
                # Not even a numeric length prefix, so this cannot be MARC.
                raise DataError('Bad MARC length')
            if len(data) != declared_length:
                # Raise rather than return a JSON string: callers unpack an
                # (edition, format) tuple from this function.
                raise DataError('Bad MARC length')

            rec = MarcBinary(data)

        edition = read_edition(rec)
        if source_records:
            edition['source_records'] = source_records
            edition['ocaid'] = itemid
        edition_builder = import_edition_builder.import_edition_builder(init_dict=edition)
        fmt = 'marc'

    parse_meta_headers(edition_builder)

    return edition_builder.get_dict(), fmt