Esempio n. 1
0
def find_match(e1, edition_pool):
    """
    Return the key of the first edition in edition_pool that merges with e1.

    :param e1: edition record to match (built by build_marc elsewhere —
        exact shape not visible here)
    :param dict edition_pool: maps index field name -> list of edition keys
    :rtype: str|None
    :return: matching edition key, or None when nothing merges
    """
    seen = set()
    # .items() instead of Python-2-only .iteritems(); the field name itself
    # is not used, only its list of candidate keys.
    for _field, candidates in edition_pool.items():
        for edition_key in candidates:
            if edition_key in seen:
                continue
            thing = None
            found = True
            # Follow redirects until a real edition is loaded or the key is dead.
            while not thing or is_redirect(thing):
                seen.add(edition_key)
                thing = web.ctx.site.get(edition_key)
                if thing is None:
                    found = False
                    break
                if is_redirect(thing):
                    # print() function instead of the Python-2 print statement.
                    print('following redirect %s => %s' % (edition_key,
                                                           thing['location']))
                    edition_key = thing['location']
            if not found:
                continue
            if try_merge(e1, edition_key, thing):
                return edition_key
    return None
Esempio n. 2
0
def find_match(e1, edition_pool):
    """
    Find the best match for e1 in edition_pool and return its key.

    :param dict e1: the new edition we are trying to match
    :param dict edition_pool: maps index field name -> list of candidate
        edition keys (the original docstring said ``list``, but the code
        iterates it with ``.items()`` — it is a dict)
    :rtype: str|None
    :return: None or the edition key '/books/OL...M' of the best edition match for e1 in edition_pool
    """
    seen = set()
    # .items() instead of Python-2-only .iteritems().
    for _field, candidates in edition_pool.items():
        for edition_key in candidates:
            if edition_key in seen:
                continue
            thing = None
            found = True
            # Follow redirects until a real edition is loaded or the key is dead.
            while not thing or is_redirect(thing):
                seen.add(edition_key)
                thing = web.ctx.site.get(edition_key)
                if thing is None:
                    found = False
                    break
                if is_redirect(thing):
                    edition_key = thing['location']
            if not found:
                continue
            if try_merge(e1, edition_key, thing):
                return edition_key
    # Explicit for clarity; the docstring promises None on no match.
    return None
Esempio n. 3
0
def test_try_merge(mock_site):
    """A loaded edition should merge with a record built from its own data."""
    record = {
        'title': 'Test item',
        'lccn': ['123'],
        'authors': [{'name': 'Smith, John', 'birth_date': '1980'}],
        'source_records': ['ia:test_item'],
    }
    edition_key = load(record)['edition']['key']
    existing_edition = mock_site.get(edition_key)

    record['full_title'] = record['title']
    candidate = build_marc(record)
    add_db_name(candidate)

    assert try_merge(candidate, edition_key, existing_edition) is True
Esempio n. 4
0
def test_try_merge(mock_site):
    """A loaded edition (matched via lccn) should merge with its own record."""
    record = {
        'title': 'Test item',
        'lccn': ['123'],
        'authors': [{'name': 'Smith, John', 'birth_date': '1980'}],
    }
    reply = load(record)
    key = reply['edition']['key']
    existing = mock_site.get(key)

    # full_title is title plus optional subtitle.
    full_title = record['title']
    if record.get('subtitle'):
        full_title += ' ' + record['subtitle']
    record['full_title'] = full_title

    candidate = build_marc(record)
    add_db_name(candidate)
    assert try_merge(candidate, key, existing)
Esempio n. 5
0
def test_try_merge(mock_site):
    """try_merge() must match an edition against the record it was loaded from."""
    author = {'name': 'Smith, John', 'birth_date': '1980'}
    rec = {
        'title': 'Test item',
        'lccn': ['123'],
        'authors': [author],
    }
    loaded = load(rec)
    edition_key = loaded['edition']['key']
    edition = mock_site.get(edition_key)

    rec['full_title'] = (
        rec['title'] + ' ' + rec['subtitle']
        if rec.get('subtitle')
        else rec['title']
    )
    marc_rec = build_marc(rec)
    add_db_name(marc_rec)

    assert try_merge(marc_rec, edition_key, edition)
Esempio n. 6
0
def find_match(e1, edition_pool):
    """
    Return the key of the first edition in edition_pool that merges with e1.

    :param e1: edition record to match (built elsewhere — exact shape not
        visible here)
    :param dict edition_pool: maps index field name -> list of edition keys
    :rtype: str|None
    :return: matching edition key, or None when nothing merges
    """
    seen = set()
    # .items() instead of Python-2-only .iteritems().
    for _field, candidates in edition_pool.items():
        for edition_key in candidates:
            if edition_key in seen:
                continue
            thing = None
            found = True
            # Follow redirects until a real edition is loaded or the key is dead.
            while not thing or is_redirect(thing):
                seen.add(edition_key)
                thing = web.ctx.site.get(edition_key)
                if thing is None:
                    found = False
                    break
                if is_redirect(thing):
                    # print() function instead of the Python-2 print statement.
                    print('following redirect %s => %s' % (edition_key, thing['location']))
                    edition_key = thing['location']
            if not found:
                continue
            if try_merge(e1, edition_key, thing):
                return edition_key
    return None
Esempio n. 7
0
def load():
    """Scan approved, non-Google scanned texts and yield unmatched records.

    Walks every identifier returned by the metadata query, skips items that
    are already loaded or unusable, and yields ``(loc, ia)`` for each record
    that has no matching edition in the pool.

    Yields:
        tuple: (loc, ia) — MARC location and archive.org identifier.
    """
    global rec_no, t_prev
    skipping = False  # NOTE(review): never read in the visible code — confirm before removing
    cur.execute(
        "select identifier from metadata where scanner is not null and scanner != 'google' and noindex is null and mediatype='texts' and curatestate='approved'"
    )  # order by curatedate")
    for ia, in cur.fetchall():
        rec_no += 1
        if rec_no % chunk == 0:
            # Progress report every `chunk` records: current/overall rate
            # and a rough time-remaining estimate.
            t = time() - t_prev
            t_prev = time()
            t1 = time() - t0
            rec_per_sec = chunk / t
            rec_per_sec_total = rec_no / t1
            remaining = total - rec_no
            sec = remaining / rec_per_sec_total
            print("%8d current: %9.3f overall: %9.3f" %
                  (rec_no, rec_per_sec, rec_per_sec_total),
                  end=' ')
            hours = sec / 3600
            print("%6.3f hours" % hours)

        print(ia)
        if get_things({'type': '/type/edition', 'ocaid': ia}):
            print('already loaded')
            continue
        try:
            loc, rec = get_ia(ia)
        except (KeyboardInterrupt, NameError):
            raise
        # NOTE(review): urllib2 is Python 2 only; the rest of this function
        # uses print() — confirm which interpreter this targets.
        except urllib2.HTTPError:
            continue
        if loc is None:
            continue
        print(loc, rec)

        if not loc.endswith('.xml'):
            print("not XML")
            continue
        if 'full_title' not in rec:
            print("full_title missing")
            continue
        index_fields = make_index_fields(rec)
        if not index_fields:
            print("no index_fields")
            continue

        edition_pool = pool.build(index_fields)
        print(edition_pool)

        if not edition_pool:
            # Nothing similar exists: this is a new record.
            yield loc, ia
            continue

        e1 = build_marc(rec)

        match = False
        # .items() instead of Python-2-only .iteritems().
        for k, v in edition_pool.items():
            # Too many title-only candidates is noise, not signal — skip.
            if k == 'title' and len(v) > 50:
                continue
            for edition_key in v:
                # '\\/' (escaped slash in the stored key) — the original
                # '\/' was an invalid escape that only worked by accident.
                if try_merge(e1, edition_key.replace('\\/', '/')):
                    match = True
                    break
            if match:
                break
        if not match:
            yield loc, ia
 def test(self):
   # Looks like a 2048-style list merge: adjacent equal values combine
   # ([8, 8] -> [16]) while unequal neighbours pass through unchanged.
   # NOTE(review): this try_merge takes a single list argument and is
   # unrelated to the 3-argument edition try_merge in the other examples
   # on this page — confirm which function this test targets.
   self.assertEqual(try_merge([8,8]), [16])
   self.assertEqual(try_merge([2,4]), [2, 4])
   self.assertEqual(try_merge([2, 2, 4, 8]), [4, 4, 8])
Esempio n. 9
0
def load():
    """Scan approved, non-Google scanned texts and yield unmatched records.

    Walks every identifier returned by the metadata query, skips items that
    are already loaded or unusable, and yields ``(loc, ia)`` for each record
    that has no matching edition in the pool.

    Yields:
        tuple: (loc, ia) — MARC location and archive.org identifier.
    """
    global rec_no, t_prev
    skipping = False  # NOTE(review): never read in the visible code — confirm before removing
    cur.execute("select identifier from metadata where scanner is not null and scanner != 'google' and noindex is null and mediatype='texts' and curatestate='approved'") # order by curatedate")
    for ia, in cur.fetchall():
        rec_no += 1
        if rec_no % chunk == 0:
            # Progress report every `chunk` records: current/overall rate
            # and a rough time-remaining estimate.
            t = time() - t_prev
            t_prev = time()
            t1 = time() - t0
            rec_per_sec = chunk / t
            rec_per_sec_total = rec_no / t1
            remaining = total - rec_no
            sec = remaining / rec_per_sec_total
            print("%8d current: %9.3f overall: %9.3f" % (rec_no, rec_per_sec, rec_per_sec_total), end=' ')
            hours = sec / 3600
            print("%6.3f hours" % hours)

        print(ia)
        if get_things({'type': '/type/edition', 'ocaid': ia}):
            print('already loaded')
            continue
        try:
            loc, rec = get_ia(ia)
        except (KeyboardInterrupt, NameError):
            raise
        # NOTE(review): urllib2 is Python 2 only; the rest of this function
        # uses print() — confirm which interpreter this targets.
        except urllib2.HTTPError:
            continue
        if loc is None:
            continue
        print(loc, rec)

        if not loc.endswith('.xml'):
            print("not XML")
            continue
        if 'full_title' not in rec:
            print("full_title missing")
            continue
        index_fields = make_index_fields(rec)
        if not index_fields:
            print("no index_fields")
            continue

        edition_pool = pool.build(index_fields)
        print(edition_pool)

        if not edition_pool:
            # Nothing similar exists: this is a new record.
            yield loc, ia
            continue

        e1 = build_marc(rec)

        match = False
        # .items() instead of Python-2-only .iteritems().
        for k, v in edition_pool.items():
            # Too many title-only candidates is noise, not signal — skip.
            if k == 'title' and len(v) > 50:
                continue
            for edition_key in v:
                # '\\/' (escaped slash in the stored key) — the original
                # '\/' was an invalid escape that only worked by accident.
                if try_merge(e1, edition_key.replace('\\/', '/')):
                    match = True
                    break
            if match:
                break
        if not match:
            yield loc, ia