Esempio n. 1
0
def merge_works(works):
    master = works.pop(0)
    master_first_publish_year = get_publish_year(
        master.get('first_publish_date'))
    subtitles = sorted((w['subtitle'] for w in works if w.get('subtitle')),
                       key=lambda s: len(s))
    if subtitles and len(subtitles[-1]) > len(master.get('subtitle', '')):
        master['subtitle'] = subtitles[-1]
    updates = []
    for w in works:
        wkey = w.pop('key')
        q = {'type': '/type/edition', 'works': wkey}
        for ekey in ol.query(q):
            e = ol.get(ekey)
            assert len(e['works']) == 1 and e['works'][0] == wkey
            e['works'] = [Reference(master['key'])]
            updates.append(e)
        assert w['type'] != Reference('/type/redirect')
        updates.append({
            'key': wkey,
            'type': Reference('/type/redirect'),
            'location': master['key'],
        })
        for f in 'covers', 'subjects', 'subject_places', 'subject_people', 'subject_times', 'lc_classifications', 'dewey_number':
            if not w.get(f):
                continue
            assert not isinstance(w[f], basestring)
            for i in w[f]:
                if i not in master.setdefault(f, []):
                    master[f].append(i)

        if w.get('first_sentence') and not master.get('first_sentence'):
            master['first_sentence'] = w['first_sentence']
        if w.get('first_publish_date'):
            if not master.get('first_publish_date'):
                master['first_publish_date'] = w['first_publish_date']
            else:
                publish_year = get_publish_year(w['first_publish_date'])
                if publish_year < master_first_publish_year:
                    master['first_publish_date'] = w['first_publish_date']
                    master_first_publish_year = publish_year

        for excerpt in w.get('exceprts', []):
            master.setdefault('exceprts', []).append(excerpt)

        for f in 'title', 'subtitle', 'created', 'last_modified', 'latest_revision', 'revision', 'number_of_editions', 'type', 'first_sentence', 'authors', 'first_publish_date', 'excerpts', 'covers', 'subjects', 'subject_places', 'subject_people', 'subject_times', 'lc_classifications', 'dewey_number':
            try:
                del w[f]
            except KeyError:
                pass

        print w
        assert not w
    updates.append(master)
    print len(updates), [(doc['key'], doc['type']) for doc in updates]
    # update master
    # update editions to point at master
    # replace works with redirects
    print ol.save_many(updates, 'merge works')
Esempio n. 2
0
def try_merge(e1, edition_key, thing):
    """Decide whether record ``e1`` matches stored edition ``thing``.

    ``edition_key`` is the key of the stored edition.  Returns False for
    deleted records or when a match is ruled out, True when the stored
    record is accepted without a second MARC record to compare against.
    NOTE(review): this copy of the function appears truncated by the
    page extraction -- it ends right after the ``ia`` branch, so it
    falls off the end (returning None) when ``rec2`` was fetched.
    """
    thing_type = thing['type']
    if thing_type != Reference('/type/edition'):
        print thing['key'], 'is', str(thing['type'])
    if thing_type == Reference('/type/delete'):
        return False
    assert thing_type == Reference('/type/edition')

    # When source records exist, delegate matching to them entirely.
    if 'source_records' in thing:
        if fix_source_records(edition_key, thing):
            thing = withKey(edition_key)  # reload
        return source_records_match(e1, thing)

    # Otherwise try to derive an Internet Archive identifier, first from
    # the record's own 'ocaid', then from the MARC source string (mc).
    ia = thing.get('ocaid', None)
    print edition_key
    mc = get_mc(edition_key)
    print mc
    if mc:
        if mc.startswith('ia:'):
            ia = mc[3:]
        elif mc.endswith('.xml') or mc.endswith('.mrc'):
            # mc looks like '<ia-id>/...' here -- take the leading id.
            ia = mc[:mc.find('/')]
        if '_meta.mrc:' in mc:
            print thing
            if 'ocaid' not in thing:
                return False
            ia = thing['ocaid']
    rec2 = None
    if ia:
        if is_dark_or_bad(ia):
            return False
        try:
            rec2 = get_ia(ia)
        except xml.parsers.expat.ExpatError:
            return False
        except NoMARCXML:
            print 'no MARCXML'
            pass
        except urllib2.HTTPError, error:
            # 404/403 from IA are tolerated: fall through without a record.
            print error.code
            assert error.code in (404, 403)
        if not rec2:
            return True
Esempio n. 3
0
def run_merge(ia):
    """Merge the duplicate editions recorded for IA identifier ``ia``.

    Reads the edition list from the ``merge`` DB table, builds a merged
    record, lets the user resolve conflicting fields via the submitted
    form, writes the merged master plus redirects back to Open Library,
    marks the row done, and redirects back to the index page.
    """
    cur = g.db.cursor()
    cur.execute('select editions from merge where ia=%s', ia)
    [ekeys] = cur.fetchone()
    # Normalize the space-separated keys and sort numerically so the
    # lowest OL id becomes the master record.
    ekeys = [
        '/books/OL%dM' % x for x in sorted(
            int(re_edition_key.match(ekey).group(1))
            for ekey in ekeys.split(' '))
    ]
    min_ekey = ekeys[0]

    editions = [ol.get(ekey) for ekey in ekeys]
    # Index by the key with the '/books/' prefix stripped (7 chars).
    editions_by_key = dict((e['key'][7:], e) for e in editions)
    merged = build_merged(editions)

    # Fields build_merged() could not decide (value None) must be picked
    # by the user; the form maps field name -> short edition key.
    missing = []
    for k, v in merged.items():
        if v is not None:
            continue
        use_ekey = request.form.get(k)
        if use_ekey is None:
            missing.append(k)
            continue
        merged[k] = editions_by_key[use_ekey][k]
    if missing:
        flash('please select: ' + ', '.join(missing))
        return redirect(url_for('merge', ia=ia))

    master = ol.get(min_ekey)
    for k, v in merged.items():
        master[k] = v

    updates = []
    updates.append(master)
    # Every non-master edition becomes a redirect to the master.
    for ekey in ekeys:
        if ekey == min_ekey:
            continue
        ol_redirect = {
            'type': Reference('/type/redirect'),
            'location': min_ekey,
            'key': ekey,
        }
        updates.append(ol_redirect)
    #print len(updates), min_ekey
    try:
        ol.save_many(updates, 'merge lending editions')
    except:
        #for i in updates:
        #    print i
        raise
    cur.execute('update merge set done=now() where ia=%s', [ia])

    flash(ia + ' merged')
    return redirect(url_for('index'))
Esempio n. 4
0
def add_work(akey, w):
    """Create a work page for author ``akey`` from ``w`` and yield each
    of its editions updated to reference the new work.

    If creating the work page fails, the request payload is printed
    before the exception propagates.
    """
    work_doc = {
        'authors': [{
            'author': Reference(akey)
        }],
        'type': '/type/work',
        'title': w['title'],
    }
    try:
        wkey = ol.new(work_doc, comment='create work page')
    except:
        print(work_doc)
        raise
    write_log('work', wkey, w['title'])
    assert isinstance(wkey, six.string_types)
    for edition_key in w['editions']:
        edition = ol.get(edition_key)
        fix_edition(edition_key, edition, ol)
        write_log('edition', edition_key, edition.get('title', 'title missing'))
        edition['works'] = [Reference(wkey)]
        yield edition
Esempio n. 5
0
def update_edition(ekey, wkey):
    """Point edition ``ekey`` at work ``wkey``.

    Returns the updated edition dict when the edition had no works
    list; returns None when it already lists a (single) work -- a
    mismatching work is only reported, never rewritten.
    """
    edition = ol.get(ekey)
    fix_edition(ekey, edition, ol)
    write_log('edition', ekey, edition.get('title', 'title missing'))
    existing = edition.get('works', [])
    if existing:
        assert len(existing) == 1
        if existing[0] != wkey:
            # Report the conflict for manual follow-up.
            print('e:', edition)
            print('wkey:', wkey)
            print('ekey:', ekey)
            print('e["works"]:', edition['works'])
        return None
    edition['works'] = [Reference(wkey)]
    return edition
Esempio n. 6
0
def update_work_edition(ekey, wkey, use):
    """On edition ``ekey``, replace work reference ``wkey`` with ``use``
    and drop duplicate entries, saving only if the list changed."""
    print((ekey, wkey, use))
    edition = ol.get(ekey)
    deduped = []
    for work in edition['works']:
        if work == wkey:
            if use not in deduped:
                deduped.append(Reference(use))
        elif work not in deduped:
            deduped.append(work)

    if edition['works'] == deduped:
        return
    print('before:', edition['works'])
    print('after:', deduped)
    edition['works'] = deduped
    print(ol.save(edition['key'], edition, 'remove duplicate work page'))
Esempio n. 7
0
def add_works(works):
    """Create work pages in bulk with a single ``ol.new`` call.

    Each entry in ``works`` must provide ``author`` and ``title``;
    ``subjects`` is copied through when present.  On failure the
    request payload is printed before the exception propagates.
    """
    payload = []
    for work in works:
        doc = {
            'authors': [{
                'author': Reference(work['author'])
            }],
            'type': '/type/work',
            'title': work['title'],
        }
        if 'subjects' in work:
            doc['subjects'] = work['subjects']
        payload.append(doc)
    try:
        return ol.new(payload, comment='create work page')
    except:
        print(payload)
        raise
Esempio n. 8
0
        all_keys.update(k for k, v in e.items() if v)
    for k in 'latest_revision', 'revision', 'created', 'last_modified', 'key', 'type', 'genres':
        if k in all_keys:
            all_keys.remove(k)

    for k in all_keys.copy():
        if k.startswith('subject'):
            all_keys.remove(k)

    for e in editions:  # resolve redirects
        if 'authors' not in e:
            continue
        new_authors = []
        for akey in e['authors']:
            a = ol.get(akey)
            if a['type'] == Reference('/type/redirect'):
                akey = Reference(a['location'])
            else:
                assert a['type'] == Reference('/type/author')
            new_authors.append(akey)
        e['authors'] = new_authors

    k = 'publish_date'
    publish_dates = set(e[k] for e in editions if k in e and len(e[k]) != 4)

    k = 'pagination'
    all_pagination = set(e[k].strip(':.') for e in editions if e.get(k))

    one_item_lists = {}
    for k in 'lc_classifications', 'publishers', 'contributions', 'series':
        one_item_lists[k] = set(e[k][0].strip('.') for e in editions
Esempio n. 9
0
def try_merge(e1, edition_key, thing):
    """Decide whether record ``e1`` matches stored edition ``thing``.

    ``edition_key`` is the key of the stored edition.  Returns True when
    the two are judged to be the same edition, False otherwise.
    Unexpected failures are re-raised after printing context so they
    surface during batch runs.  NOTE(review): ``threshold`` is read from
    the enclosing module's scope.
    """
    thing_type = thing['type']
    if thing_type != Reference('/type/edition'):
        print(thing['key'], 'is', str(thing['type']))
    if thing_type == Reference('/type/delete'):
        return False
    assert thing_type == Reference('/type/edition')

    # When source records exist, delegate matching to them entirely.
    if 'source_records' in thing:
        if fix_source_records(edition_key, thing):
            thing = withKey(edition_key)  # reload
        return source_records_match(e1, thing)

    # Otherwise derive an Internet Archive identifier, first from the
    # record's own 'ocaid', then from the MARC source string (mc).
    ia = thing.get('ocaid', None)
    print(edition_key)
    mc = get_mc(edition_key)
    print(mc)
    if mc:
        if mc.startswith('ia:'):
            ia = mc[3:]
        elif mc.endswith('.xml') or mc.endswith('.mrc'):
            # mc looks like '<ia-id>/...' here -- take the leading id.
            ia = mc[:mc.find('/')]
        if '_meta.mrc:' in mc:
            print(thing)
            if 'ocaid' not in thing:
                return False
            ia = thing['ocaid']
    rec2 = None
    if ia:
        if is_dark_or_bad(ia):
            return False
        try:
            rec2 = get_ia(ia)
        except xml.parsers.expat.ExpatError:
            return False
        except NoMARCXML:
            print('no MARCXML')
            pass
        except urllib2.HTTPError as error:
            # 404/403 from IA are tolerated: fall through without a record.
            print(error.code)
            assert error.code in (404, 403)
        if not rec2:
            return True
    # No MARC record from IA: try the raw archive / amazon sources.
    if not rec2:
        if not mc:
            mc = get_mc(thing['key'])
        if not mc or mc == 'initial import':
            return False
        if mc.startswith('amazon:'):
            try:
                a = try_amazon(thing)
            except IndexError:
                print(thing['key'])
                raise
            except AttributeError:
                return False
            if not a:
                return False
            try:
                return amazon.attempt_merge(a, e1, threshold, debug=False)
            except:
                # Print context before re-raising for batch debugging.
                print(a)
                print(e1)
                print(thing['key'])
                raise
        print('mc:', mc)
        try:
            assert not mc.startswith('ia:')
            data = get_from_archive(mc)
            if not data:
                return True
            rec2 = fast_parse.read_edition(data)
        except (fast_parse.SoundRecording, IndexError, AssertionError):
            print(mc)
            print(edition_key)
            return False
        except:
            print(mc)
            print(edition_key)
            raise
    if not rec2:
        return False
    try:
        e2 = build_marc(rec2)
    except TypeError:
        print(rec2)
        raise
    # Final fuzzy comparison between the two parsed records.
    return attempt_merge(e1, e2, threshold, debug=False)
Esempio n. 10
0
def toc_items(toc_list):
    """Wrap each raw table-of-contents entry in a ``/type/toc_item`` dict."""
    items = []
    for entry in toc_list:
        items.append({
            'title': six.text_type(entry),
            'type': Reference('/type/toc_item')
        })
    return items
Esempio n. 11
0
def toc_items(toc_list):
    """Wrap each raw table-of-contents entry in a ``/type/toc_item``
    dict (Python 2 variant: entries are coerced with ``unicode``)."""
    return [{
        'title': unicode(item),
        'type': Reference('/type/toc_item')
    } for item in toc_list]