Example #1
0
def get_work_subjects(w):
    """Gather the subjects of every source record behind the work ``w``.

    Each edition in ``w['editions']`` contributes its ``ia:`` / ``marc:``
    source records; an edition without source records contributes the value
    returned by ``get_mc`` for its old-style ``/b/`` key instead.  Records
    ending in ``'initial import'`` are reported through
    ``bad_source_record`` and ignored.  The subjects read from each distinct
    source are passed to ``combine_subjects``, whose result is returned.
    """
    sources = set()
    for edition in w['editions']:
        records = edition.get('source_records', [])
        if not records:
            # No source records: fall back to the machine comment lookup.
            key_match = re_edition_key.match(edition['key'])
            mc = get_mc('/b/' + key_match.group(1))
            if not mc:
                continue
            if mc.endswith('initial import'):
                bad_source_record(edition, mc)
            elif not (mc.startswith('amazon:') or re_ia_marc.match(mc)):
                sources.add('marc:' + mc)
            continue
        for record in records:
            if record.endswith('initial import'):
                bad_source_record(edition, record)
            elif record.startswith(('ia:', 'marc:')):
                sources.add(record)

    def subjects_for(source):
        # 'marc:ia:<id>' and 'ia:<id>' are both read through the IA helper.
        if source.startswith('marc:ia:'):
            return get_subjects_from_ia(source[8:])
        if source.startswith('marc:'):
            data = get_from_archive(source[5:])
            return read_subjects(MarcBinary(data))
        assert source.startswith('ia:')
        return get_subjects_from_ia(source[3:])

    return combine_subjects([subjects_for(source) for source in sources])
Example #2
0
def get_marc_src(e):
    """Yield ``(kind, location)`` pairs for the records behind edition ``e``.

    ``kind`` is ``'ia'`` or ``'marc'``.  The value from ``get_mc`` is
    reported first (``amazon:`` values are discarded), followed by the
    ``ia:`` / ``marc:`` entries of ``e['source_records']`` that differ from it.
    """
    mc = get_mc(e['key'])
    if mc and mc.startswith('amazon:'):
        mc = None
    if mc:
        if mc.startswith('ia:'):
            yield 'ia', mc[3:]
        else:
            ia_match = re_ia_marc.match(mc)
            if ia_match:
                yield 'ia', ia_match.group(1)
            else:
                yield 'marc', mc

    # Source records that merely repeat the machine comment are skipped.
    same_as_ia = mc if mc else None
    same_as_marc = 'marc:' + mc if mc else None
    for src in e.get('source_records', []) or []:
        if src.startswith('ia:'):
            if src != same_as_ia:
                yield 'ia', src[3:]
        elif src.startswith('marc:'):
            if src != same_as_marc:
                yield 'marc', src[5:]
Example #3
0
def get_work_subjects(w, do_get_mc=True):
    """Gather the subjects of every source record behind the work ``w``.

    Each edition in ``w['editions']`` contributes its ``ia:`` / ``marc:``
    source records.  An edition without source records is looked up through
    ``get_mc`` (old-style ``/b/`` key) only when ``do_get_mc`` is true.
    Values ending in ``'initial import'`` are ignored.  The subjects read
    from each distinct source are passed to ``combine_subjects``.
    """
    sources = set()
    for edition in w['editions']:
        records = edition.get('source_records', [])
        if records:
            for record in records:
                if record.endswith('initial import'):
                    continue
                if record.startswith(('ia:', 'marc:')):
                    sources.add(record)
            continue
        if not do_get_mc:
            continue
        key_match = re_edition_key.match(edition['key'])
        mc = get_mc('/b/' + key_match.group(1))
        if not mc or mc.endswith('initial import'):
            continue
        if not (mc.startswith('amazon:') or re_ia_marc.match(mc)):
            sources.add('marc:' + mc)

    def subjects_for(source):
        # 'marc:ia:<id>' and 'ia:<id>' are both read through the IA helper.
        if source.startswith('marc:ia:'):
            return get_subjects_from_ia(source[8:])
        if source.startswith('marc:'):
            data = get_from_archive(source[5:])
            return read_subjects(MarcBinary(data))
        assert source.startswith('ia:')
        return get_subjects_from_ia(source[3:])

    return combine_subjects([subjects_for(source) for source in sources])
Example #4
0
def get_marc_src(e):
    """Generate ``(kind, location)`` tuples describing edition ``e``'s sources.

    The machine comment from ``get_mc`` comes first unless it is an
    ``amazon:`` value; it is reported as ``'ia'`` or ``'marc'``.  Afterwards
    every ``ia:`` / ``marc:`` entry of ``e['source_records']`` that is not
    the same record again is yielded with its prefix stripped.
    """
    mc = get_mc(e['key'])
    if mc and mc.startswith('amazon:'):
        mc = None
    if mc:
        if mc.startswith('ia:'):
            yield 'ia', mc[3:]
        else:
            ia_match = re_ia_marc.match(mc)
            if ia_match:
                yield 'ia', ia_match.group(1)
            else:
                yield 'marc', mc

    # Pre-compute the spellings of the machine comment to skip below.
    repeat_ia = mc if mc else None
    repeat_marc = 'marc:' + mc if mc else None
    for src in e.get('source_records', []) or []:
        if src.startswith('ia:'):
            if src != repeat_ia:
                yield 'ia', src[3:]
        elif src.startswith('marc:'):
            if src != repeat_marc:
                yield 'marc', src[5:]
Example #5
0
def try_merge(edition, ekey, thing):
    """Decide whether the Amazon ``edition`` matches the stored ``thing``.

    Returns True straight away when ``thing`` (or its machine comment from
    ``get_mc(ekey)``) already names this ASIN as its source.  Otherwise the
    decision is delegated to ``source_records_match`` or, for records without
    source records, to ``amazon_merge.attempt_merge`` against the locally
    stored record.  Returns False when there is no machine comment at all.
    """
    thing_type = thing['type']['key']
    if 'isbn_10' not in edition:
        print(edition)
    asin = edition.get('isbn_10', None) or edition['asin']
    authors = (
        [author['name'] for author in edition['authors']]
        if 'authors' in edition
        else []
    )
    a = amazon_merge.build_amazon(edition, authors)
    assert isinstance(asin, str)
    assert thing_type == '/type/edition'

    amazon_source = 'amazon:' + asin
    if 'source_records' in thing:
        if amazon_source in thing['source_records']:
            return True
        return source_records_match(a, thing)

    # No source records on the stored edition: use its machine comment.
    mc = get_mc(ekey)
    if mc == amazon_source:
        return True
    if not mc:
        return False
    stored = build_marc(fast_parse.read_edition(get_from_local(mc)))
    return amazon_merge.attempt_merge(a, stored, threshold, debug=False)
Example #6
0
def add_source_records(key, ia, v=None):
    new = 'ia:' + ia
    sr = None
    m = re_edition_key.match(key)
    old_style_key = '/b/' + m.group(1)
    key = '/books/' + m.group(1)
    e = ol.get(key, v=v)
    need_update = False
    if 'ocaid' not in e:
        need_update = True
        e['ocaid'] = ia
    if 'source_records' in e:
        if new in e['source_records'] and not need_update:
            return
        e['source_records'].append(new)
    else:
        existing = get_mc(old_style_key)
        print 'get_mc(%s) == %s' % (old_style_key, existing)
        if existing is None:
            sr = []
        elif existing.startswith('ia:') or existing.startswith('amazon:'):
            sr = [existing]
        else:
            m = re_meta_mrc.match(existing)
            sr = ['marc:' + existing if not m else 'ia:' + m.group(1)]
        print 'ocaid:', e['ocaid']
        if 'ocaid' in e and 'ia:' + e['ocaid'] not in sr:
            sr.append('ia:' + e['ocaid'])
        print 'sr:', sr
        print 'ocaid:', e['ocaid']
        if new not in sr:
            e['source_records'] = sr + [new]
        else:
            e['source_records'] = sr
        assert 'source_records' in e

    # fix other bits of the record as well
    new_toc = fix_toc(e)
    if new_toc:
        e['table_of_contents'] = new_toc
    if e.get('subjects', None) and any(has_dot(s) for s in e['subjects']):
        subjects = [s[:-1] if has_dot(s) else s for s in e['subjects']]
        e['subjects'] = subjects
    if 'authors' in e:
        assert not any(a=='None' for a in e['authors'])
        print e['authors']
        authors = [ol.get(akey) for akey in e['authors']]
        authors = [ol.get(a['location']) if a['type'] == '/type/redirect' else a \
                for a in authors]
        for a in authors:
            if a['type'] == '/type/redirect':
                print 'double redirect on:', e['key']
        e['authors'] = [{'key': a['key']} for a in authors]
        undelete_authors(authors)
    print 'saving', key
    assert 'source_records' in e
    print ol.save(key, e, 'found a matching MARC record')
    add_cover_image(key, ia)
Example #7
0
def add_source_records(key, ia, v=None):
    """Record ``ia:<ia>`` as a source of the edition at ``key`` and save it.

    ``key`` must match ``re_edition_key``; the edition is loaded from its
    ``/books/`` key (optionally at revision ``v``).  A missing ``ocaid`` is
    set to ``ia``.  When the edition has no ``source_records`` yet, the list
    is seeded from ``get_mc`` on the old-style ``/b/`` key.  The table of
    contents, trailing dots on subjects and redirected authors are tidied
    before saving, and finally ``add_cover_image`` is called.

    Returns None; returns early without saving when the source record is
    already present and ``ocaid`` was already set.
    """
    new = 'ia:' + ia
    m = re_edition_key.match(key)
    old_style_key = '/b/' + m.group(1)
    key = '/books/' + m.group(1)
    e = ol.get(key, v=v)
    need_update = False
    if 'ocaid' not in e:
        need_update = True
        e['ocaid'] = ia
    if 'source_records' in e:
        if new in e['source_records']:
            if not need_update:
                return
            # Already listed: the record is only saved for the new ocaid,
            # so do not append a duplicate source record.
        else:
            e['source_records'].append(new)
    else:
        existing = get_mc(old_style_key)
        print('get_mc(%s) == %s' % (old_style_key, existing))
        if existing is None:
            sr = []
        elif existing.startswith('ia:') or existing.startswith('amazon:'):
            sr = [existing]
        else:
            # A "<id>_meta.mrc" location is recorded as an ia: source.
            m = re_meta_mrc.match(existing)
            sr = ['marc:' + existing if not m else 'ia:' + m.group(1)]
        print('ocaid:', e['ocaid'])
        if 'ocaid' in e and 'ia:' + e['ocaid'] not in sr:
            sr.append('ia:' + e['ocaid'])
        print('sr:', sr)
        print('ocaid:', e['ocaid'])
        if new not in sr:
            e['source_records'] = sr + [new]
        else:
            e['source_records'] = sr
        assert 'source_records' in e

    # fix other bits of the record as well
    new_toc = fix_toc(e)
    if new_toc:
        e['table_of_contents'] = new_toc
    if e.get('subjects', None) and any(has_dot(s) for s in e['subjects']):
        subjects = [s[:-1] if has_dot(s) else s for s in e['subjects']]
        e['subjects'] = subjects
    if 'authors' in e:
        assert not any(a == 'None' for a in e['authors'])
        print(e['authors'])
        authors = [ol.get(akey) for akey in e['authors']]
        # Follow one level of author redirects.
        authors = [ol.get(a['location']) if a['type'] == '/type/redirect' else a \
                for a in authors]
        for a in authors:
            if a['type'] == '/type/redirect':
                print('double redirect on:', e['key'])
        e['authors'] = [{'key': a['key']} for a in authors]
        undelete_authors(authors)
    print('saving', key)
    assert 'source_records' in e
    print(ol.save(key, e, 'found a matching MARC record'))
    add_cover_image(key, ia)
Example #8
0
def get_marc_source(w):
    """Return the set of MARC record locations behind the work ``w``.

    For each edition in ``w['editions']`` the ``marc:`` entries of its
    ``source_records`` are collected with the prefix removed.  Editions
    without source records fall back to ``get_mc(e['key'])``; that value is
    kept unless it is empty, starts with ``amazon:`` or matches
    ``re_ia_marc``.
    """
    found = set()
    for e in w['editions']:
        # The field is 'source_records' (plural); the singular spelling never
        # matched, so existing source records were silently ignored.
        sr = e.get('source_records', [])
        if sr:
            found.update(i[5:] for i in sr if i.startswith('marc:'))
        else:
            mc = get_mc(e['key'])
            if mc and not mc.startswith('amazon:') and not re_ia_marc.match(mc):
                found.add(mc)
    return found
Example #9
0
def add_source_records(key, new, thing, data):
    """Append source record ``new`` to the edition at ``key`` and save it.

    Returns without saving when ``new`` is already listed.  When the edition
    has no ``source_records`` yet, the list is seeded from ``get_mc(key)``.
    The table of contents, trailing dots on subjects and the author list are
    tidied as part of the same save.  ``data`` is only used to rebuild the
    author (via ``author_from_data``) when the single existing author is the
    string ``'None'``; ``thing`` is not used here.
    """
    e = get_with_retry(key)
    if 'source_records' in e:
        if new in e['source_records']:
            return
        e['source_records'].append(new)
    else:
        existing = get_mc(key)
        amazon = 'amazon:'
        if not existing:
            # No machine comment recorded: start from an empty list instead
            # of failing on ``None.startswith``.
            sr = []
        elif existing.startswith('ia:'):
            sr = [existing]
        elif existing.startswith(amazon):
            sr = amazon_source_records(existing[len(amazon):]) or [existing]
        else:
            # A "<id>_meta.mrc" location is recorded as an ia: source.
            m = re_meta_mrc.match(existing)
            sr = ['marc:' + existing if not m else 'ia:' + m.group(1)]
        assert new not in sr
        e['source_records'] = sr + [new]

    # fix other bits of the record as well
    new_toc = fix_toc(e)
    if new_toc:
        e['table_of_contents'] = new_toc
    if e.get('subjects', None) and any(has_dot(s) for s in e['subjects']):
        subjects = [s[:-1] if has_dot(s) else s for s in e['subjects']]
        e['subjects'] = subjects
    if 'authors' in e:
        if any(a == 'None' for a in e['authors']):
            assert len(e['authors']) == 1
            new_author = author_from_data(new, data)
            e['authors'] = [new_author]
        else:
            print(e['authors'])
            authors = [get_with_retry(akey) for akey in e['authors']]
            # Keep resolving until no author is a redirect any more.
            while any(a['type'] == '/type/redirect' for a in authors):
                print('following redirects')
                authors = [
                    ol.get(a['location'])
                    if a['type'] == '/type/redirect' else a for a in authors
                ]
            e['authors'] = [{'key': a['key']} for a in authors]
            undelete_authors(authors)
    try:
        print(save_with_retry(key, e, 'found a matching MARC record'))
    except:
        # Dump the record for debugging, then let the error propagate.
        print(e)
        raise
    if new_toc:
        new_edition = ol.get(key)
        # [{u'type': <ref: u'/type/toc_item'>}, ...]
        assert 'title' in new_edition['table_of_contents'][0]
Example #10
0
def get_marc_source(w):
    """Return the set of MARC record locations behind the work ``w``.

    ``marc:`` source records of each edition are collected with the prefix
    removed.  An edition without source records is looked up with ``get_mc``
    under its old-style ``/b/`` key; the result is kept unless it is empty,
    an ``amazon:`` value or matches ``re_ia_marc``.
    """
    marc_sources = set()
    for edition in w['editions']:
        records = edition.get('source_records', [])
        if records:
            for record in records:
                if record.startswith('marc:'):
                    marc_sources.add(record[5:])
            continue
        key_match = re_edition_key.match(edition['key'])
        if not key_match:
            # Show the offending key before the lookup below fails on it.
            print(edition['key'])
        mc = get_mc('/b/' + key_match.group(1))
        if not mc or mc.startswith('amazon:') or re_ia_marc.match(mc):
            continue
        marc_sources.add(mc)
    return marc_sources
Example #11
0
def get_marc_source(w):
    found = set()
    for e in w['editions']:
        sr = e.get('source_records', [])
        if sr:
            found.update(i[5:] for i in sr if i.startswith('marc:'))
        else:
            m = re_edition_key.match(e['key'])
            if not m:
                print e['key']
            mc = get_mc('/b/' + m.group(1))
            if mc and not mc.startswith('amazon:') and not re_ia_marc.match(mc):
                found.add(mc)
    return found
Example #12
0
def add_source_records(key, new, thing, data):
    sr = None
    e = get_with_retry(key)
    if 'source_records' in e:
        if new in e['source_records']:
            return
        e['source_records'].append(new)
    else:
        existing = get_mc(key)
        amazon = 'amazon:'
        if existing.startswith('ia:'):
            sr = [existing]
        elif existing.startswith(amazon):
            sr = amazon_source_records(existing[len(amazon):]) or [existing]
        else:
            m = re_meta_mrc.match(existing)
            sr = ['marc:' + existing if not m else 'ia:' + m.group(1)]
        assert new not in sr
        e['source_records'] = sr + [new]

    # fix other bits of the record as well
    new_toc = fix_toc(e)
    if new_toc:
        e['table_of_contents'] = new_toc
    if e.get('subjects', None) and any(has_dot(s) for s in e['subjects']):
        subjects = [s[:-1] if has_dot(s) else s for s in e['subjects']]
        e['subjects'] = subjects
    if 'authors' in e:
        if any(a=='None' for a in e['authors']):
            assert len(e['authors']) == 1
            new_author = author_from_data(new, data)
            e['authors'] = [new_author]
        else:
            print e['authors']
            authors = [get_with_retry(akey) for akey in e['authors']]
            while any(a['type'] == '/type/redirect' for a in authors):
                print 'following redirects'
                authors = [ol.get(a['location']) if a['type'] == '/type/redirect' else a for a in authors]
            e['authors'] = [{'key': a['key']} for a in authors]
            undelete_authors(authors)
    try:
        print save_with_retry(key, e, 'found a matching MARC record')
    except:
        print e
        raise
    if new_toc:
        new_edition = ol.get(key)
        # [{u'type': <ref: u'/type/toc_item'>}, ...]
        assert 'title' in new_edition['table_of_contents'][0]
Example #13
0
def try_merge(e1, edition_key, thing):
    """Check whether record ``e1`` matches the stored edition ``thing``.

    Returns False for deleted things, for Internet Archive items rejected by
    ``is_dark_or_bad`` and when the IA record fails to parse; returns True
    when an IA identifier is known but no record could be loaded for it.
    Things that already carry ``source_records`` are delegated to
    ``source_records_match``.

    NOTE(review): the body visible here stops after the IA lookup, so a
    successfully loaded ``rec2`` falls off the end (returns None) -- this
    looks like a truncated excerpt; confirm against the full source.
    """
    thing_type = thing['type']['key']
    if thing_type != '/type/edition':
        print thing['key'], 'is', thing['type']['key']
    if thing_type == '/type/delete': # deleted records never match
        return False
    assert thing_type == '/type/edition'

    if 'source_records' in thing:
        if fix_source_records(edition_key, thing):
            thing = withKey(edition_key) # reload
        return source_records_match(e1, thing)

    # Work out an Internet Archive identifier: ocaid first, then whatever the
    # machine comment implies.
    ia = thing.get('ocaid', None)
    print edition_key
    mc = get_mc(edition_key)
    print mc
    if mc:
        if mc.startswith('ia:'):
            ia = mc[3:]
        elif mc.endswith('.xml') or mc.endswith('.mrc'):
            # identifier is the part before the first '/'
            ia = mc[:mc.find('/')]
        if '_meta.mrc:' in mc:
            # for _meta.mrc locations only the ocaid is used
            print thing
            if 'ocaid' not in thing:
                return False
            ia = thing['ocaid']
    rec2 = None
    if ia:
        if is_dark_or_bad(ia):
            return False
        try:
            loc2, rec2 = get_ia(ia)
        except xml.parsers.expat.ExpatError:
            return False
        except NoMARCXML:
            print 'no MARCXML'
            pass
        except urllib2.HTTPError, error:
            # only 404/403 are tolerated; anything else trips the assert
            print error.code
            assert error.code in (404, 403)
        if not rec2:
            return True
Example #14
0
def try_merge(e1, edition_key, thing):
    """Check whether record ``e1`` matches the stored edition ``thing``.

    Returns False for deleted things, for Internet Archive items rejected by
    ``is_dark_or_bad`` and when the IA record fails to parse; returns True
    when an IA identifier is known but no record could be loaded for it.
    Things that already carry ``source_records`` are delegated to
    ``source_records_match``.

    NOTE(review): the body visible here stops after the IA lookup, so a
    successfully loaded ``rec2`` falls off the end (returns None) -- this
    looks like a truncated excerpt; confirm against the full source.
    """
    thing_type = thing['type']
    if thing_type != Reference('/type/edition'):
        print thing['key'], 'is', str(thing['type'])
    if thing_type == Reference('/type/delete'):
        # deleted records never match
        return False
    assert thing_type == Reference('/type/edition')

    if 'source_records' in thing:
        if fix_source_records(edition_key, thing):
            thing = withKey(edition_key)  # reload
        return source_records_match(e1, thing)

    # Work out an Internet Archive identifier: ocaid first, then whatever the
    # machine comment implies.
    ia = thing.get('ocaid', None)
    print edition_key
    mc = get_mc(edition_key)
    print mc
    if mc:
        if mc.startswith('ia:'):
            ia = mc[3:]
        elif mc.endswith('.xml') or mc.endswith('.mrc'):
            # identifier is the part before the first '/'
            ia = mc[:mc.find('/')]
        if '_meta.mrc:' in mc:
            # for _meta.mrc locations only the ocaid is used
            print thing
            if 'ocaid' not in thing:
                return False
            ia = thing['ocaid']
    rec2 = None
    if ia:
        if is_dark_or_bad(ia):
            return False
        try:
            rec2 = get_ia(ia)
        except xml.parsers.expat.ExpatError:
            return False
        except NoMARCXML:
            print 'no MARCXML'
            pass
        except urllib2.HTTPError, error:
            # only 404/403 are tolerated; anything else trips the assert
            print error.code
            assert error.code in (404, 403)
        if not rec2:
            return True
Example #15
0
def try_merge(e1, edition_key, thing):
    """Check whether record ``e1`` matches the stored edition ``thing``.

    Returns False for deleted things, for Internet Archive items rejected by
    ``is_dark_or_bad`` and when the IA record fails to parse; returns True
    when an IA identifier is known but no record could be loaded for it.
    Things that already carry ``source_records`` are delegated to
    ``source_records_match``.

    NOTE(review): the body visible here stops after the IA lookup, so a
    successfully loaded ``rec2`` falls off the end (returns None) -- this
    looks like a truncated excerpt; confirm against the full source.
    """
    thing_type = thing["type"]["key"]
    if thing_type != "/type/edition":
        print thing["key"], "is", thing["type"]["key"]
    if thing_type == "/type/delete":  # deleted records never match
        return False
    assert thing_type == "/type/edition"

    if "source_records" in thing:
        if fix_source_records(edition_key, thing):
            thing = withKey(edition_key)  # reload
        return source_records_match(e1, thing)

    # Work out an Internet Archive identifier: ocaid first, then whatever the
    # machine comment implies.
    ia = thing.get("ocaid", None)
    print edition_key
    mc = get_mc(edition_key)
    print mc
    if mc:
        if mc.startswith("ia:"):
            ia = mc[3:]
        elif mc.endswith(".xml") or mc.endswith(".mrc"):
            # identifier is the part before the first '/'
            ia = mc[: mc.find("/")]
        if "_meta.mrc:" in mc:
            # for _meta.mrc locations the ocaid is required and used instead
            assert "ocaid" in thing
            ia = thing["ocaid"]
    rec2 = None
    if ia:
        if is_dark_or_bad(ia):
            return False
        try:
            loc2, rec2 = get_ia(ia)
        except xml.parsers.expat.ExpatError:
            return False
        except NoMARCXML:
            print "no MARCXML"
            pass
        except urllib2.HTTPError, error:
            # only 404/403 are tolerated; anything else trips the assert
            print error.code
            assert error.code in (404, 403)
        if not rec2:
            return True
Example #16
0
def fix_edition(key, e, ol):
    """Normalise edition ``e`` in place and return it.

    Seeds ``e['source_records']`` from ``get_mc(key)`` when the edition has
    none, makes sure ``ia:<ocaid>`` is listed when the edition has an
    ``ocaid``, then runs ``fix_toc``, ``fix_subject`` and ``fix_authors``.
    """
    existing = get_mc(key)
    if 'source_records' not in e and existing:
        prefix = 'amazon:'
        if existing.startswith('ia:'):
            seeded = [existing]
        elif existing.startswith(prefix):
            asin = existing[len(prefix):]
            seeded = amazon_source_records(asin) or [existing]
        else:
            print('existing:', existing)
            meta = re_meta_mrc.search(existing)
            # A "_meta.mrc" location becomes an ia: record, anything else marc:.
            seeded = ['ia:' + meta.group(1) if meta else 'marc:' + existing]
        e['source_records'] = seeded

    if 'ocaid' in e:
        ia = 'ia:' + e['ocaid']
        if 'source_records' in e:
            if ia not in e['source_records']:
                e['source_records'].append(ia)
        else:
            e['source_records'] = [ia]

    fix_toc(e)
    fix_subject(e)
    fix_authors(e, ol)
    return e
Example #17
0
def fix_edition(key, e, ol):
    """Clean up edition ``e`` in place and hand it back.

    When ``e`` has no ``source_records`` and ``get_mc(key)`` returns a value,
    that value is converted into the initial source-record list.  An existing
    ``ocaid`` is guaranteed to appear as ``ia:<ocaid>``.  Finally the table of
    contents, subjects and authors are fixed by their respective helpers.
    """
    existing = get_mc(key)
    if 'source_records' not in e and existing:
        prefix = 'amazon:'
        if existing.startswith('ia:'):
            initial = [existing]
        elif existing.startswith(prefix):
            asin = existing[len(prefix):]
            initial = amazon_source_records(asin) or [existing]
        else:
            print('existing:', existing)
            meta = re_meta_mrc.search(existing)
            # A "_meta.mrc" location becomes an ia: record, anything else marc:.
            initial = ['ia:' + meta.group(1) if meta else 'marc:' + existing]
        e['source_records'] = initial

    if 'ocaid' in e:
        ia = 'ia:' + e['ocaid']
        if 'source_records' in e:
            if ia not in e['source_records']:
                e['source_records'].append(ia)
        else:
            e['source_records'] = [ia]

    fix_toc(e)
    fix_subject(e)
    fix_authors(e, ol)
    return e
Example #18
0
def get_books(akey, query, do_get_mc=True):
    for e in query:
        try:
            if not e.get('title', None):
                continue
        except:
            print e


#        if len(e.get('authors', [])) != 1:
#            continue
        if 'title_prefix' in e and e['title_prefix']:
            prefix = e['title_prefix']
            if prefix[-1] != ' ':
                prefix += ' '
            title = prefix + e['title']
        else:
            title = e['title']

        title = title.strip(' ')
        if has_dot(title):
            title = title[:-1]

        m = re_parens.match(title)
        if m:
            title = m.group(1)

        n = mk_norm(title)

        book = {
            'title': title,
            'norm_title': n,
            'key': e['key'],
        }

        lang = e.get('languages', [])
        if lang:
            book['lang'] = [re_lang_key.match(l['key']).group(1) for l in lang]

        if e.get('table_of_contents', None):
            if isinstance(e['table_of_contents'][0], basestring):
                book['table_of_contents'] = e['table_of_contents']
            else:
                assert isinstance(e['table_of_contents'][0], dict)
                if e['table_of_contents'][0].get('type', None) == '/type/text':
                    book['table_of_contents'] = [
                        i['value'] for i in e['table_of_contents']
                    ]
        if 'subtitle' in e:
            book['subtitle'] = e['subtitle']

        if 'source_records' in e:
            book['source_records'] = e['source_records']

        mc = get_mc(e['key']) if do_get_mc else None
        wt = get_work_title(e, mc)
        if not wt:
            yield book
            continue
        if wt in bad_titles:
            yield book
            continue
        n_wt = mk_norm(wt)
        book['work_title'] = wt
        book['norm_wt'] = n_wt
        yield book
Example #19
0
def try_merge(e1, edition_key, thing):
    """Check whether record ``e1`` matches the stored edition ``thing``.

    The record to compare against is located in this order: the edition's
    ``source_records`` (delegated to ``source_records_match``), an Internet
    Archive item derived from ``ocaid`` or the machine comment, an Amazon
    record, and finally the MARC data at the machine-comment location.

    Returns the result of the relevant merge/match helper, True when the
    comparison record cannot be loaded (no IA record, or no archive data),
    and False when the thing is deleted, the IA item is dark or bad, or no
    usable source is found.
    """
    thing_type = thing['type']
    if thing_type != Reference('/type/edition'):
        print(thing['key'], 'is', str(thing['type']))
    if thing_type == Reference('/type/delete'):
        # deleted records never match
        return False
    assert thing_type == Reference('/type/edition')

    if 'source_records' in thing:
        if fix_source_records(edition_key, thing):
            thing = withKey(edition_key)  # reload
        return source_records_match(e1, thing)

    # Work out an Internet Archive identifier: ocaid first, then whatever the
    # machine comment implies.
    ia = thing.get('ocaid', None)
    print(edition_key)
    mc = get_mc(edition_key)
    print(mc)
    if mc:
        if mc.startswith('ia:'):
            ia = mc[3:]
        elif mc.endswith('.xml') or mc.endswith('.mrc'):
            # identifier is the part before the first '/'
            ia = mc[:mc.find('/')]
        if '_meta.mrc:' in mc:
            # for _meta.mrc locations only the ocaid is used
            print(thing)
            if 'ocaid' not in thing:
                return False
            ia = thing['ocaid']
    rec2 = None
    if ia:
        if is_dark_or_bad(ia):
            return False
        try:
            rec2 = get_ia(ia)
        except xml.parsers.expat.ExpatError:
            return False
        except NoMARCXML:
            print('no MARCXML')
            pass
        except urllib2.HTTPError as error:
            # only 404/403 are tolerated; anything else trips the assert
            print(error.code)
            assert error.code in (404, 403)
        if not rec2:
            return True
    if not rec2:
        # Only reached without an IA identifier: fall back to the machine
        # comment of the thing itself.
        if not mc:
            mc = get_mc(thing['key'])
        if not mc or mc == 'initial import':
            return False
        if mc.startswith('amazon:'):
            try:
                a = try_amazon(thing)
            except IndexError:
                print(thing['key'])
                raise
            except AttributeError:
                return False
            if not a:
                return False
            try:
                return amazon.attempt_merge(a, e1, threshold, debug=False)
            except:
                # dump the inputs for debugging, then propagate
                print(a)
                print(e1)
                print(thing['key'])
                raise
        print('mc:', mc)
        try:
            # a failing assert is caught just below and reported as no match
            assert not mc.startswith('ia:')
            data = get_from_archive(mc)
            if not data:
                return True
            rec2 = fast_parse.read_edition(data)
        except (fast_parse.SoundRecording, IndexError, AssertionError):
            print(mc)
            print(edition_key)
            return False
        except:
            print(mc)
            print(edition_key)
            raise
    if not rec2:
        return False
    try:
        e2 = build_marc(rec2)
    except TypeError:
        print(rec2)
        raise
    return attempt_merge(e1, e2, threshold, debug=False)
Example #20
0
         return False
     try:
         rec2 = get_ia(ia)
     except xml.parsers.expat.ExpatError:
         return False
     except NoMARCXML:
         print 'no MARCXML'
         pass
     except urllib2.HTTPError, error:
         print error.code
         assert error.code in (404, 403)
     if not rec2:
         return True
 if not rec2:
     if not mc:
         mc = get_mc(thing['key'])
     if not mc or mc == 'initial import':
         return False
     if mc.startswith('amazon:'):
         try:
             a = try_amazon(thing)
         except IndexError:
             print thing['key']
             raise
         except AttributeError:
             return False
         if not a:
             return False
         try:
             return amazon.attempt_merge(a, e1, threshold, debug=False)
         except:
Example #21
0
def get_books(akey, query, do_get_mc=True):
    """Yield one summary dict per titled edition in ``query``.

    Each dict carries the cleaned ``title``, its normalised form and the
    edition ``key``, plus languages, table of contents, subtitle and source
    records when the edition has them.  ``work_title`` / ``norm_wt`` are added
    when ``get_work_title`` returns a title that is not in ``bad_titles``.
    Editions without a title are skipped.  ``akey`` is not used here.
    """
    for e in query:
        try:
            if not e.get('title', None):
                continue
        except:
            print(e)

        # Build the display title, joining any prefix with a single space.
        if 'title_prefix' in e and e['title_prefix']:
            prefix = e['title_prefix']
            if prefix[-1] != ' ':
                prefix += ' '
            title = prefix + e['title']
        else:
            title = e['title']

        title = title.strip(' ')
        if has_dot(title):
            title = title[:-1]

        paren_match = re_parens.match(title)
        if paren_match:
            title = paren_match.group(1)

        book = {
            'title': title,
            'norm_title': mk_norm(title),
            'key': e['key'],
        }

        languages = e.get('languages', [])
        if languages:
            book['lang'] = [
                re_lang_key.match(l['key']).group(1) for l in languages
            ]

        toc = e.get('table_of_contents', None)
        if toc:
            first = toc[0]
            if isinstance(first, six.string_types):
                book['table_of_contents'] = toc
            else:
                assert isinstance(first, dict)
                if first.get('type', None) == '/type/text':
                    book['table_of_contents'] = [item['value'] for item in toc]

        if 'subtitle' in e:
            book['subtitle'] = e['subtitle']
        if 'source_records' in e:
            book['source_records'] = e['source_records']

        mc = get_mc(e['key']) if do_get_mc else None
        wt = get_work_title(e, mc)
        if wt and wt not in bad_titles:
            n_wt = mk_norm(wt)
            book['work_title'] = wt
            book['norm_wt'] = n_wt
        yield book
Example #22
0
         return False
     try:
         loc2, rec2 = get_ia(ia)
     except xml.parsers.expat.ExpatError:
         return False
     except NoMARCXML:
         print 'no MARCXML'
         pass
     except urllib2.HTTPError, error:
         print error.code
         assert error.code in (404, 403)
     if not rec2:
         return True
 if not rec2:
     if not mc:
         mc = get_mc(thing['key'])
     if not mc or mc == 'initial import':
         return False
     if mc.startswith('amazon:'):
         try:
             a = try_amazon(thing)
         except IndexError:
             print thing['key']
             raise
         except AttributeError:
             return False
         if not a:
             return False
         try:
             return amazon.attempt_merge(a, e1, threshold, debug=False)
         except:
Example #23
0
def try_merge(e1, edition_key, thing):
    """Check whether record ``e1`` matches the stored edition ``thing``.

    The record to compare against is located in this order: the edition's
    ``source_records`` (delegated to ``source_records_match``), an Internet
    Archive item derived from ``ocaid`` or the machine comment, an Amazon
    record, and finally the MARC data at the machine-comment location.

    Returns the result of the relevant merge/match helper, True when the
    comparison record cannot be loaded (no IA record, or no archive data),
    and False when the thing is deleted, the IA item is dark or bad, or no
    usable source is found.
    """
    thing_type = thing['type']
    if thing_type != Reference('/type/edition'):
        print(thing['key'], 'is', str(thing['type']))
    if thing_type == Reference('/type/delete'):
        # deleted records never match
        return False
    assert thing_type == Reference('/type/edition')

    if 'source_records' in thing:
        if fix_source_records(edition_key, thing):
            thing = withKey(edition_key) # reload
        return source_records_match(e1, thing)

    # Work out an Internet Archive identifier: ocaid first, then whatever the
    # machine comment implies.
    ia = thing.get('ocaid', None)
    print(edition_key)
    mc = get_mc(edition_key)
    print(mc)
    if mc:
        if mc.startswith('ia:'):
            ia = mc[3:]
        elif mc.endswith('.xml') or mc.endswith('.mrc'):
            # identifier is the part before the first '/'
            ia = mc[:mc.find('/')]
        if '_meta.mrc:' in mc:
            # for _meta.mrc locations only the ocaid is used
            print(thing)
            if 'ocaid' not in thing:
                return False
            ia = thing['ocaid']
    rec2 = None
    if ia:
        if is_dark_or_bad(ia):
            return False
        try:
            rec2 = get_ia(ia)
        except xml.parsers.expat.ExpatError:
            return False
        except NoMARCXML:
            print('no MARCXML')
            pass
        except urllib2.HTTPError as error:
            # only 404/403 are tolerated; anything else trips the assert
            print(error.code)
            assert error.code in (404, 403)
        if not rec2:
            return True
    if not rec2:
        # Only reached without an IA identifier: fall back to the machine
        # comment of the thing itself.
        if not mc:
            mc = get_mc(thing['key'])
        if not mc or mc == 'initial import':
            return False
        if mc.startswith('amazon:'):
            try:
                a = try_amazon(thing)
            except IndexError:
                print(thing['key'])
                raise
            except AttributeError:
                return False
            if not a:
                return False
            try:
                return amazon.attempt_merge(a, e1, threshold, debug=False)
            except:
                # dump the inputs for debugging, then propagate
                print(a)
                print(e1)
                print(thing['key'])
                raise
        print('mc:', mc)
        try:
            # a failing assert is caught just below and reported as no match
            assert not mc.startswith('ia:')
            data = get_from_archive(mc)
            if not data:
                return True
            rec2 = fast_parse.read_edition(data)
        except (fast_parse.SoundRecording, IndexError, AssertionError):
            print(mc)
            print(edition_key)
            return False
        except:
            print(mc)
            print(edition_key)
            raise
    if not rec2:
        return False
    try:
        e2 = build_marc(rec2)
    except TypeError:
        print(rec2)
        raise
    return attempt_merge(e1, e2, threshold, debug=False)
Example #24
0
# Authenticate the `ol` client as EdwardBot using the credential from `rc`.
ol.login('EdwardBot', rc['EdwardBot'])

# Directory whose file names encode edition keys ('_' standing in for '/').
test_dir = '/home/edward/ol/test_data'

# Old-style edition key, e.g. /b/OL123M.
re_edition = re.compile('^/b/OL\d+M$')

# Machine comment of the form "<id>_meta.mrc:0:<digits>"; group 1 is <id>.
re_meta_mrc = re.compile('^([^/]*)_meta.mrc:0:\d+$')

#out = open('source_records', 'w')
for f in os.listdir(test_dir):
    key = f.replace('_', '/')
    if not re_edition.match(key):
        continue
    print key
    # NOTE(review): this unconditional `continue` makes the rest of the loop
    # body unreachable, so the script currently only lists matching keys.
    # The code below also writes to `out`, whose open() is commented out.
    continue
    mc = get_mc(key)
    print key, mc
    if not mc:
        continue
    e = ol.get(key)
    if e.get('source_records', []):
        continue
    # Turn the machine comment into a single source record.
    if mc.startswith('ia:') or mc.startswith('amazon:'):
        sr = mc
    else:
        m = re_meta_mrc.match(mc)
        sr = 'marc:' + mc if not m else 'ia:' + m.group(1)
    e['source_records'] = [sr]
    print >> out, (key, sr)
    print ol.save(key, e, 'add source record')
#out.close()
Example #25
0
def add_source_records(key, ia):
    new = 'ia:' + ia
    sr = None
    e = ol.get(key)
    need_update = False
    if 'ocaid' not in e:
        need_update = True
        e['ocaid'] = ia
    if 'source_records' in e:
        if new in e['source_records'] and not need_update:
            return
        e['source_records'].append(new)
    else:
        existing = get_mc(key)
        amazon = 'amazon:'
        if existing is None:
            sr = []
        elif existing.startswith('ia:'):
            sr = [existing]
        elif existing.startswith(amazon):
            sr = amazon_source_records(existing[len(amazon):]) or [existing]
        else:
            m = re_meta_mrc.match(existing)
            sr = ['marc:' + existing if not m else 'ia:' + m.group(1)]
        if 'ocaid' in e and 'ia:' + e['ocaid'] not in sr:
            sr.append('ia:' + e['ocaid'])
        if new not in sr:
            e['source_records'] = sr + [new]

    # fix other bits of the record as well
    new_toc = fix_toc(e)
    if new_toc:
        e['table_of_contents'] = new_toc
    if e.get('subjects', None) and any(has_dot(s) for s in e['subjects']):
        subjects = [s[:-1] if has_dot(s) else s for s in e['subjects']]
        e['subjects'] = subjects
    if 'authors' in e:
        assert not any(a=='None' for a in e['authors'])
        print e['authors']
        authors = [ol.get(akey) for akey in e['authors']]
        authors = [ol.get(a['location']) if a['type'] == '/type/redirect' else a \
                for a in authors]
        e['authors'] = [{'key': a['key']} for a in authors]
        undelete_authors(authors)
    print 'saving', key
    print marshal(e)
    for attempt in range(50):
        try:
            print ol.save(key, e, 'found a matching MARC record')
            break
        except KeyboardInterrupt:
            raise
        except URLError:
            if attempt == 49:
                raise
        except:
            print e
            raise
        print 'attempt %d failed' % attempt
        sleep(30)
    if new_toc:
        new_edition = ol.get(key)
        # [{u'type': <ref: u'/type/toc_item'>}, ...]
        assert 'title' in new_edition['table_of_contents'][0]
# Authenticate the `ol` client as EdwardBot using the credential from `rc`.
ol.login('EdwardBot', rc['EdwardBot']) 

# Directory whose file names encode edition keys ('_' standing in for '/').
test_dir = '/home/edward/ol/test_data'

# Old-style edition key, e.g. /b/OL123M.
re_edition = re.compile('^/b/OL\d+M$')

# Machine comment of the form "<id>_meta.mrc:0:<digits>"; group 1 is <id>.
re_meta_mrc = re.compile('^([^/]*)_meta.mrc:0:\d+$')

#out = open('source_records', 'w')
for f in os.listdir(test_dir):
    key = f.replace('_', '/')
    if not re_edition.match(key):
        continue
    print key
    # NOTE(review): this unconditional `continue` makes the rest of the loop
    # body unreachable, so the script currently only lists matching keys.
    # The code below also writes to `out`, whose open() is commented out.
    continue
    mc = get_mc(key)
    print key, mc
    if not mc:
        continue
    e = ol.get(key)
    if e.get('source_records', []):
        continue
    # Turn the machine comment into a single source record.
    if mc.startswith('ia:') or mc.startswith('amazon:'):
        sr = mc
    else:
        m = re_meta_mrc.match(mc)
        sr = 'marc:' + mc if not m else 'ia:' + m.group(1)
    e['source_records'] = [sr]
    print >> out, (key, sr)
    print ol.save(key, e, 'add source record')
#out.close()
Example #27
0
         return False
     try:
         loc2, rec2 = get_ia(ia)
     except xml.parsers.expat.ExpatError:
         return False
     except NoMARCXML:
         print "no MARCXML"
         pass
     except urllib2.HTTPError, error:
         print error.code
         assert error.code in (404, 403)
     if not rec2:
         return True
 if not rec2:
     if not mc:
         mc = get_mc(thing["key"])
     if not mc or mc == "initial import":
         return False
     if mc.startswith("amazon:"):
         try:
             a = try_amazon(thing)
         except IndexError:
             print thing["key"]
             raise
         except AttributeError:
             return False
         if not a:
             return False
         try:
             return amazon.attempt_merge(a, e1, threshold, debug=False)
         except: