Beispiel #1
0
def merge_authors(ol, keys, debug=False):
    """Merge the author records named by ``keys`` into one surviving record.

    Every key is loaded with ``withKey``; records that are already of type
    ``/type/redirect`` are left out of the merge candidates.  The surviving
    key is the single candidate that has an image, or, when no candidate has
    an image, the key built from the smallest ``key_int()`` value.  The
    survivor is normalized with ``do_normalize`` and every other key is
    repointed with ``switch_author`` and, unless it was already a redirect,
    turned into a redirect with ``make_redirect``.

    Args:
        ol: passed through unchanged to ``switch_author`` and
            ``make_redirect``.
        keys: author keys to merge; iterated twice, so it must be a
            re-iterable collection rather than a one-shot iterator.
        debug: when true, print every loaded candidate record.

    Returns:
        None.  Returns early, before any record is changed, when more than
        one candidate has an image.

    Raises:
        AssertionError: if a candidate is not of type ``/type/author`` or
            its name does not match the first candidate's name according to
            ``match_with_bad_chars``.
        IndexError: if every key resolves to a redirect, leaving no
            candidates.
    """
    authors = [
        a for a in (withKey(k) for k in keys)
        if a['type']['key'] != '/type/redirect'
    ]
    not_redirect = {a['key'] for a in authors}
    if debug:
        for a in authors:
            print(a)

    assert all(a['type']['key'] == '/type/author' for a in authors)
    name1 = authors[0]['name']
    for a in authors:
        # repr() accepts exactly one argument, so the key/name pair is
        # passed as a single tuple.
        print(repr((a['key'], a['name'])))
    assert all(match_with_bad_chars(a['name'], name1) for a in authors[1:])

    best_key = pick_best_author(authors)['key']

    # '/a/OL2688880A' is never treated as an image holder; the reason is
    # not visible in this function.
    imgs = [
        a['key'] for a in authors
        if a['key'] != '/a/OL2688880A' and has_image(a['key'])
    ]
    if len(imgs) > 1:
        # Several candidates carry images (e.g. Molière and
        # O. J. O. Ferreira): leave them for manual handling.
        print('imgs:', imgs)
        return  # skip
    if imgs:
        new_key = imgs[0]
    else:
        new_key = "/a/OL%dA" % min(key_int(a) for a in authors)

    print(new_key)
    print(best_key)

    do_normalize(new_key, best_key, authors)
    old_keys = {k for k in keys if k != new_key}
    print('old keys:', old_keys)

    for old in old_keys:
        # /b/OL21291659M
        switch_author(ol, old, new_key, old_keys, debug=True)
        if old in not_redirect:
            make_redirect(ol, old, new_key)
        q = {
            'authors': old,
            'type': '/type/edition',
        }
        # Run the switch a second time if any edition still lists the old
        # author after the first pass.
        if list(get_things(q)):
            switch_author(ol, old, new_key, old_keys, debug=True)
Beispiel #2
0
def follow_redirects(key):
    """Resolve ``key`` through any chain of ``/type/redirect`` records.

    Returns:
        A ``(keys, thing)`` tuple: ``keys`` lists every key that was
        looked up, in order and including the final one; ``thing`` is the
        first record whose type is not ``/type/redirect``.

    Raises:
        AssertionError: if ``withKey`` returns a falsy value for any key
            in the chain.
    """
    visited = []
    while True:
        visited.append(key)
        record = withKey(key)
        assert record
        if record['type']['key'] != '/type/redirect':
            return (visited, record)
        print('following redirect {} => {}'.format(key, record['location']))
        key = record['location']
Beispiel #3
0
def merge_authors(ol, keys, debug=False):
    """Merge the author records named by ``keys`` into one surviving record.

    Every key is loaded with ``withKey``; records that are already of type
    ``/type/redirect`` are left out of the merge candidates.  The surviving
    key is the single candidate that has an image, or, when no candidate has
    an image, the key built from the smallest ``key_int()`` value.  The
    survivor is normalized with ``do_normalize`` and every other key is
    repointed with ``switch_author`` and, unless it was already a redirect,
    turned into a redirect with ``make_redirect``.

    Args:
        ol: passed through unchanged to ``switch_author`` and
            ``make_redirect``.
        keys: author keys to merge; iterated twice, so it must be a
            re-iterable collection rather than a one-shot iterator.
        debug: when true, print every loaded candidate record.

    Returns:
        None.  Returns early, before any record is changed, when more than
        one candidate has an image.

    Raises:
        AssertionError: if a candidate is not of type ``/type/author`` or
            its name does not match the first candidate's name according to
            ``match_with_bad_chars``.
        IndexError: if every key resolves to a redirect, leaving no
            candidates.
    """
    authors = [
        a for a in (withKey(k) for k in keys)
        if a['type']['key'] != '/type/redirect'
    ]
    not_redirect = {a['key'] for a in authors}
    if debug:
        for a in authors:
            print(a)

    assert all(a['type']['key'] == '/type/author' for a in authors)
    name1 = authors[0]['name']
    for a in authors:
        # repr() accepts exactly one argument, so the key/name pair is
        # passed as a single tuple.
        print(repr((a['key'], a['name'])))
    assert all(match_with_bad_chars(a['name'], name1) for a in authors[1:])

    best_key = pick_best_author(authors)['key']

    # '/a/OL2688880A' is never treated as an image holder; the reason is
    # not visible in this function.
    imgs = [
        a['key'] for a in authors
        if a['key'] != '/a/OL2688880A' and has_image(a['key'])
    ]
    if len(imgs) > 1:
        # Several candidates carry images (e.g. Molière and
        # O. J. O. Ferreira): leave them for manual handling.
        print('imgs:', imgs)
        return  # skip
    if imgs:
        new_key = imgs[0]
    else:
        new_key = "/a/OL%dA" % min(key_int(a) for a in authors)

    print(new_key)
    print(best_key)

    do_normalize(new_key, best_key, authors)
    old_keys = {k for k in keys if k != new_key}
    print('old keys:', old_keys)

    for old in old_keys:
        # /b/OL21291659M
        switch_author(ol, old, new_key, old_keys, debug=True)
        if old in not_redirect:
            make_redirect(ol, old, new_key)
        q = {'authors': old, 'type': '/type/edition'}
        # Run the switch a second time if any edition still lists the old
        # author after the first pass.
        if list(get_things(q)):
            switch_author(ol, old, new_key, old_keys, debug=True)
Beispiel #4
0
def try_amazon(thing):
    """Return ``amazon.build_amazon(thing, names)`` for a record with an ISBN-10.

    ``names`` holds the ``'name'`` of every author record referenced by
    ``thing['authors']`` that has one; it is empty when ``thing`` has no
    ``'authors'`` entry.

    Returns:
        None when ``thing`` has no ``'isbn_10'`` entry, otherwise whatever
        ``amazon.build_amazon`` returns.
    """
    if 'isbn_10' not in thing:
        return None

    names = []
    author_refs = thing['authors'] if 'authors' in thing else ()
    for ref in author_refs:
        # This is a hack: the entries of thing['authors'] should all be of
        # one type, but both dicts and plain key strings are accepted.
        if not isinstance(ref, dict):
            assert isinstance(ref, six.string_types)
            author_key = ref
        else:
            author_key = ref['key']
        record = withKey(author_key)
        if 'name' in record:
            names.append(record['name'])
    return amazon.build_amazon(thing, names)
Beispiel #5
0
def try_amazon(thing):
    """Return ``amazon.build_amazon(thing, author_names)`` for a record with an ISBN-10.

    ``author_names`` collects the ``'name'`` of each author record
    referenced from ``thing['authors']``; authors without a name are
    skipped, and the list is empty when ``thing`` has no ``'authors'``.

    Returns:
        None when ``thing`` has no ``'isbn_10'`` entry, otherwise the
        result of ``amazon.build_amazon``.
    """
    if 'isbn_10' not in thing:
        return None
    if 'authors' not in thing:
        return amazon.build_amazon(thing, [])

    author_names = []
    for entry in thing['authors']:
        # This is a hack: every entry of thing['authors'] should have the
        # same type, but dicts and bare key strings both occur.
        if isinstance(entry, dict):
            author_key = entry['key']
        else:
            assert isinstance(entry, basestring)
            author_key = entry
        author = withKey(author_key)
        if 'name' in author:
            author_names.append(author['name'])
    return amazon.build_amazon(thing, author_names)
Beispiel #6
0
def try_merge(e1, edition_key, thing):
    """Check whether record ``e1`` matches the stored edition ``thing``.

    Args:
        e1: the incoming record; only passed on to
            ``source_records_match`` in this function.
        edition_key: key of the stored edition, used for ``get_mc`` and to
            reload the edition after ``fix_source_records``.
        thing: the stored record for ``edition_key``.

    Returns:
        False for a deleted record, for a '_meta.mrc:' source without an
        'ocaid', when ``is_dark_or_bad`` rejects the archive id, or when
        loading the archive record raises an XML parse error.  True when
        an archive id is known but no record could be loaded.  The result
        of ``source_records_match`` when ``thing`` has 'source_records'.
        Otherwise the function falls off the end and returns None.

    Raises:
        AssertionError: if ``thing`` is neither an edition nor deleted, or
            if an HTTP error other than 403/404 occurs.

    NOTE(review): this function appears to be truncated after the archive
    lookup; confirm against the full source before relying on the implicit
    None return.
    """
    thing_type = thing['type']['key']
    if thing_type != '/type/edition':
        # Every print below takes one pre-formatted argument so the output
        # is the same whether print is a statement or a function.
        print('%s is %s' % (thing['key'], thing_type))
    if thing_type == '/type/delete':
        return False
    assert thing_type == '/type/edition'

    if 'source_records' in thing:
        if fix_source_records(edition_key, thing):
            thing = withKey(edition_key)  # reload
        return source_records_match(e1, thing)

    ia = thing.get('ocaid', None)
    print(edition_key)
    mc = get_mc(edition_key)
    print(mc)
    if mc:
        if mc.startswith('ia:'):
            ia = mc[3:]
        elif mc.endswith('.xml') or mc.endswith('.mrc'):
            ia = mc[:mc.find('/')]
        if '_meta.mrc:' in mc:
            print(thing)
            if 'ocaid' not in thing:
                return False
            ia = thing['ocaid']
    rec2 = None
    if ia:
        if is_dark_or_bad(ia):
            return False
        try:
            # get_ia() returns a pair here; only the record is used.
            _loc2, rec2 = get_ia(ia)
        except xml.parsers.expat.ExpatError:
            return False
        except NoMARCXML:
            print('no MARCXML')
        except urllib2.HTTPError as error:
            print(error.code)
            assert error.code in (404, 403)
        if not rec2:
            return True
Beispiel #7
0
def try_merge(e1, edition_key, thing):
    """Check whether record ``e1`` matches the stored edition ``thing``.

    Args:
        e1: the incoming record; only passed on to
            ``source_records_match`` in this function.
        edition_key: key of the stored edition, used for ``get_mc`` and to
            reload the edition after ``fix_source_records``.
        thing: the stored record for ``edition_key``; its 'type' is
            compared against ``Reference`` values.

    Returns:
        False for a deleted record, for a '_meta.mrc:' source without an
        'ocaid', when ``is_dark_or_bad`` rejects the archive id, or when
        loading the archive record raises an XML parse error.  True when
        an archive id is known but no record could be loaded.  The result
        of ``source_records_match`` when ``thing`` has 'source_records'.
        Otherwise the function falls off the end and returns None.

    Raises:
        AssertionError: if ``thing`` is neither an edition nor deleted, or
            if an HTTP error other than 403/404 occurs.

    NOTE(review): this function appears to be truncated after the archive
    lookup; confirm against the full source before relying on the implicit
    None return.
    """
    thing_type = thing['type']
    if thing_type != Reference('/type/edition'):
        # Every print below takes one pre-formatted argument so the output
        # is the same whether print is a statement or a function.
        print('%s is %s' % (thing['key'], str(thing['type'])))
    if thing_type == Reference('/type/delete'):
        return False
    assert thing_type == Reference('/type/edition')

    if 'source_records' in thing:
        if fix_source_records(edition_key, thing):
            thing = withKey(edition_key)  # reload
        return source_records_match(e1, thing)

    ia = thing.get('ocaid', None)
    print(edition_key)
    mc = get_mc(edition_key)
    print(mc)
    if mc:
        if mc.startswith('ia:'):
            ia = mc[3:]
        elif mc.endswith('.xml') or mc.endswith('.mrc'):
            ia = mc[:mc.find('/')]
        if '_meta.mrc:' in mc:
            print(thing)
            if 'ocaid' not in thing:
                return False
            ia = thing['ocaid']
    rec2 = None
    if ia:
        if is_dark_or_bad(ia):
            return False
        try:
            rec2 = get_ia(ia)
        except xml.parsers.expat.ExpatError:
            return False
        except NoMARCXML:
            print('no MARCXML')
        except urllib2.HTTPError as error:
            print(error.code)
            assert error.code in (404, 403)
        if not rec2:
            return True
Beispiel #8
0
def try_merge(e1, edition_key, thing):
    """Check whether record ``e1`` matches the stored edition ``thing``.

    Args:
        e1: the incoming record; only passed on to
            ``source_records_match`` in this function.
        edition_key: key of the stored edition, used for ``get_mc`` and to
            reload the edition after ``fix_source_records``.
        thing: the stored record for ``edition_key``.

    Returns:
        False for a deleted record, when ``is_dark_or_bad`` rejects the
        archive id, or when loading the archive record raises an XML parse
        error.  True when an archive id is known but no record could be
        loaded.  The result of ``source_records_match`` when ``thing`` has
        "source_records".  Otherwise the function falls off the end and
        returns None.

    Raises:
        AssertionError: if ``thing`` is neither an edition nor deleted, if
            a "_meta.mrc:" source has no "ocaid", or if an HTTP error
            other than 403/404 occurs.

    NOTE(review): this function appears to be truncated after the archive
    lookup; confirm against the full source before relying on the implicit
    None return.
    """
    thing_type = thing["type"]["key"]
    if thing_type != "/type/edition":
        # Every print below takes one pre-formatted argument so the output
        # is the same whether print is a statement or a function.
        print("%s is %s" % (thing["key"], thing_type))
    if thing_type == "/type/delete":
        return False
    assert thing_type == "/type/edition"

    if "source_records" in thing:
        if fix_source_records(edition_key, thing):
            thing = withKey(edition_key)  # reload
        return source_records_match(e1, thing)

    ia = thing.get("ocaid", None)
    print(edition_key)
    mc = get_mc(edition_key)
    print(mc)
    if mc:
        if mc.startswith("ia:"):
            ia = mc[3:]
        elif mc.endswith(".xml") or mc.endswith(".mrc"):
            ia = mc[: mc.find("/")]
        if "_meta.mrc:" in mc:
            assert "ocaid" in thing
            ia = thing["ocaid"]
    rec2 = None
    if ia:
        if is_dark_or_bad(ia):
            return False
        try:
            # get_ia() returns a pair here; only the record is used.
            _loc2, rec2 = get_ia(ia)
        except xml.parsers.expat.ExpatError:
            return False
        except NoMARCXML:
            print("no MARCXML")
        except urllib2.HTTPError as error:
            print(error.code)
            assert error.code in (404, 403)
        if not rec2:
            return True
Beispiel #9
0
def try_merge(e1, edition_key, thing):
    """Report whether record ``e1`` matches the stored edition ``thing``.

    The comparison record is taken from the first source that applies:
    the edition's 'source_records', the archive item named by its 'ocaid'
    or by the ``get_mc`` value, an 'amazon:' source, or the data fetched
    with ``get_from_archive``.

    Returns:
        The result of ``source_records_match``, ``amazon.attempt_merge``
        or ``attempt_merge`` when a comparison is made.  False when the
        record is deleted, unusable or cannot be parsed.  True when an
        archive id or source is known but yields no record.

    Raises:
        AssertionError: if ``thing`` is neither an edition nor deleted, or
            if an HTTP error other than 403/404 occurs.
    """
    kind = thing['type']
    if kind != Reference('/type/edition'):
        print(thing['key'], 'is', str(thing['type']))
    if kind == Reference('/type/delete'):
        return False
    assert kind == Reference('/type/edition')

    if 'source_records' in thing:
        if fix_source_records(edition_key, thing):
            # The stored record was changed, so load it again.
            thing = withKey(edition_key)
        return source_records_match(e1, thing)

    archive_id = thing.get('ocaid', None)
    print(edition_key)
    source = get_mc(edition_key)
    print(source)
    if source:
        if source.startswith('ia:'):
            archive_id = source[3:]
        elif source.endswith(('.xml', '.mrc')):
            slash = source.find('/')
            archive_id = source[:slash]
        if '_meta.mrc:' in source:
            print(thing)
            if 'ocaid' not in thing:
                return False
            archive_id = thing['ocaid']

    marc_rec = None
    if archive_id:
        if is_dark_or_bad(archive_id):
            return False
        try:
            marc_rec = get_ia(archive_id)
        except xml.parsers.expat.ExpatError:
            return False
        except NoMARCXML:
            print('no MARCXML')
        except urllib2.HTTPError as error:
            print(error.code)
            assert error.code in (404, 403)
        if not marc_rec:
            return True
    else:
        # No archive id: fall back to the source named by get_mc.
        if not source:
            source = get_mc(thing['key'])
        if not source or source == 'initial import':
            return False
        if source.startswith('amazon:'):
            try:
                amazon_rec = try_amazon(thing)
            except IndexError:
                print(thing['key'])
                raise
            except AttributeError:
                return False
            if not amazon_rec:
                return False
            try:
                return amazon.attempt_merge(amazon_rec, e1, threshold, debug=False)
            except:
                # Print the inputs for debugging, then let the error propagate.
                print(amazon_rec)
                print(e1)
                print(thing['key'])
                raise
        print('mc:', source)
        try:
            assert not source.startswith('ia:')
            data = get_from_archive(source)
            if not data:
                return True
            marc_rec = fast_parse.read_edition(data)
        except (fast_parse.SoundRecording, IndexError, AssertionError):
            print(source)
            print(edition_key)
            return False
        except:
            print(source)
            print(edition_key)
            raise

    if not marc_rec:
        return False
    try:
        e2 = build_marc(marc_rec)
    except TypeError:
        print(marc_rec)
        raise
    return attempt_merge(e1, e2, threshold, debug=False)
Beispiel #10
0
def try_merge(e1, edition_key, thing):
    """Decide whether record ``e1`` and the stored edition ``thing`` match.

    Sources for the comparison record are tried in this order: the
    edition's 'source_records', an archive item (from 'ocaid' or the
    ``get_mc`` value), an 'amazon:' source, and finally the data returned
    by ``get_from_archive``.

    Returns:
        Whatever ``source_records_match``, ``amazon.attempt_merge`` or
        ``attempt_merge`` returns when a comparison is made.  False for a
        deleted, unusable or unparseable record.  True when a source is
        known but produces no record.

    Raises:
        AssertionError: if ``thing`` is neither an edition nor deleted, or
            if an HTTP error other than 403/404 occurs.
    """
    record_type = thing['type']
    if record_type != Reference('/type/edition'):
        print(thing['key'], 'is', str(thing['type']))
    if record_type == Reference('/type/delete'):
        return False
    assert record_type == Reference('/type/edition')

    if 'source_records' in thing:
        # Reload the edition only when fix_source_records reports a change.
        current = withKey(edition_key) if fix_source_records(edition_key, thing) else thing
        return source_records_match(e1, current)

    ocaid = thing.get('ocaid', None)
    print(edition_key)
    origin = get_mc(edition_key)
    print(origin)
    if origin:
        if origin.startswith('ia:'):
            ocaid = origin[3:]
        elif origin.endswith(('.xml', '.mrc')):
            ocaid = origin[:origin.find('/')]
        if '_meta.mrc:' in origin:
            print(thing)
            if 'ocaid' not in thing:
                return False
            ocaid = thing['ocaid']

    parsed = None
    if ocaid:
        if is_dark_or_bad(ocaid):
            return False
        try:
            parsed = get_ia(ocaid)
        except xml.parsers.expat.ExpatError:
            return False
        except NoMARCXML:
            print('no MARCXML')
        except urllib2.HTTPError as error:
            print(error.code)
            assert error.code in (404, 403)
        if not parsed:
            return True
    else:
        # Without an archive id, use the source named by get_mc instead.
        if not origin:
            origin = get_mc(thing['key'])
        if not origin or origin == 'initial import':
            return False
        if origin.startswith('amazon:'):
            try:
                candidate = try_amazon(thing)
            except IndexError:
                print(thing['key'])
                raise
            except AttributeError:
                return False
            if not candidate:
                return False
            try:
                return amazon.attempt_merge(candidate, e1, threshold, debug=False)
            except:
                # Show the inputs before re-raising whatever went wrong.
                print(candidate)
                print(e1)
                print(thing['key'])
                raise
        print('mc:', origin)
        try:
            assert not origin.startswith('ia:')
            raw = get_from_archive(origin)
            if not raw:
                return True
            parsed = fast_parse.read_edition(raw)
        except (fast_parse.SoundRecording, IndexError, AssertionError):
            print(origin)
            print(edition_key)
            return False
        except:
            print(origin)
            print(edition_key)
            raise

    if not parsed:
        return False
    try:
        e2 = build_marc(parsed)
    except TypeError:
        print(parsed)
        raise
    return attempt_merge(e1, e2, threshold, debug=False)
Beispiel #11
0
def load_part(archive_id, part, start_pos=0):
    """Yield the records of one MARC part that match no existing edition.

    Reads ``rc['marc_path']/archive_id/part`` with ``read_marc_file``.
    Records already loaded, records that are not books, and records
    without index fields or a title are skipped.  For every other record
    the candidate editions from ``pool.build`` are tried with
    ``try_merge``; on the first match the record is attached with
    ``add_source_records`` and nothing is yielded for it.

    Args:
        archive_id: directory component of the part's path.
        part: file name of the part inside ``archive_id``.
        start_pos: offset to seek to before reading; also passed to
            ``read_marc_file`` as ``pos``.

    Yields:
        ``(loc, data)`` for every record that has no candidate editions or
        matches none of them.

    Side effects:
        Increments the module-level ``rec_no`` for every record read and
        calls ``progress`` every ``chunk`` records.

    NOTE(review): the file is opened in the default mode, as before;
    confirm whether ``read_marc_file`` needs binary mode.
    """
    global rec_no, t_prev, load_count
    # Every print takes one pre-formatted argument so the output is the
    # same whether print is a statement or a function.
    print('load_part: %s %s' % (archive_id, part))
    full_part = archive_id + "/" + part
    # The context manager closes the file once the generator finishes or
    # is closed.
    with open(rc['marc_path'] + "/" + full_part) as f:
        if start_pos:
            f.seek(start_pos)
        for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
            rec_no += 1
            if rec_no % chunk == 0:
                progress(archive_id, rec_no, start_pos, pos)

            if is_loaded(loc):
                continue
            want = ['001', '003', '010', '020', '035', '245']
            try:
                index_fields = fast_parse.index_fields(data, want)
            except KeyError:
                print(loc)
                print(fast_parse.get_tag_lines(data, ['245']))
                raise
            except AssertionError:
                print(loc)
                raise
            except fast_parse.NotBook:
                continue
            if not index_fields or 'title' not in index_fields:
                continue

            print(loc)
            edition_pool = pool.build(index_fields)

            if not edition_pool:
                yield loc, data
                continue

            rec = fast_parse.read_edition(data)
            e1 = build_marc(rec)

            match = False
            seen = set()
            # Only the candidate key lists are needed, and .values() works
            # on both Python 2 and Python 3 mappings.
            for edition_keys in edition_pool.values():
                for edition_key in edition_keys:
                    if edition_key in seen:
                        continue
                    thing = None
                    # Follow redirects to the live record, marking every
                    # key on the way as seen.
                    while not thing or thing['type']['key'] == '/type/redirect':
                        seen.add(edition_key)
                        thing = withKey(edition_key)
                        assert thing
                        if thing['type']['key'] == '/type/redirect':
                            print('following redirect %s => %s' % (edition_key, thing['location']))
                            edition_key = thing['location']
                    if try_merge(e1, edition_key, thing):
                        add_source_records(edition_key, loc, thing, data)
                        match = True
                        break
                if match:
                    break

            if not match:
                yield loc, data
Beispiel #12
0
def load_part(archive_id, part, start_pos=0):
    """Yield the records of one MARC part that match no existing edition.

    Reads ``rc['marc_path']/archive_id/part`` with ``read_marc_file``.
    Records already loaded, records that are not books, and records
    without index fields or a title are skipped.  For every other record
    the candidate editions from ``pool.build`` are tried with
    ``try_merge``; on the first match the record is attached with
    ``add_source_records`` and nothing is yielded for it.

    Args:
        archive_id: directory component of the part's path.
        part: file name of the part inside ``archive_id``.
        start_pos: offset to seek to before reading; also passed to
            ``read_marc_file`` as ``pos``.

    Yields:
        ``(loc, data)`` for every record that has no candidate editions or
        matches none of them.

    Side effects:
        Increments the module-level ``rec_no`` for every record read and
        calls ``progress`` every ``chunk`` records.

    NOTE(review): the file is opened in the default mode, as before;
    confirm whether ``read_marc_file`` needs binary mode.
    """
    global rec_no, t_prev, load_count
    print('load_part:', archive_id, part)
    full_part = archive_id + "/" + part
    # The context manager closes the file once the generator finishes or
    # is closed.
    with open(rc['marc_path'] + "/" + full_part) as f:
        if start_pos:
            f.seek(start_pos)
        for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
            rec_no += 1
            if rec_no % chunk == 0:
                progress(archive_id, rec_no, start_pos, pos)

            if is_loaded(loc):
                continue
            want = ['001', '003', '010', '020', '035', '245']
            try:
                index_fields = fast_parse.index_fields(data, want)
            except KeyError:
                print(loc)
                print(fast_parse.get_tag_lines(data, ['245']))
                raise
            except AssertionError:
                print(loc)
                raise
            except fast_parse.NotBook:
                continue
            if not index_fields or 'title' not in index_fields:
                continue

            print(loc)
            edition_pool = pool.build(index_fields)

            if not edition_pool:
                yield loc, data
                continue

            rec = fast_parse.read_edition(data)
            e1 = build_marc(rec)

            match = False
            seen = set()
            # dict.iteritems() does not exist on Python 3 and the key was
            # unused, so iterate over the candidate key lists directly.
            for edition_keys in edition_pool.values():
                for edition_key in edition_keys:
                    if edition_key in seen:
                        continue
                    thing = None
                    # Follow redirects to the live record, marking every
                    # key on the way as seen.
                    while not thing or thing['type']['key'] == '/type/redirect':
                        seen.add(edition_key)
                        thing = withKey(edition_key)
                        assert thing
                        if thing['type']['key'] == '/type/redirect':
                            print('following redirect %s => %s' %
                                  (edition_key, thing['location']))
                            edition_key = thing['location']
                    if try_merge(e1, edition_key, thing):
                        add_source_records(edition_key, loc, thing, data)
                        match = True
                        break
                if match:
                    break

            if not match:
                yield loc, data