Esempio n. 1
0
def build_titles(title):
    """Expand a full title into the variants used for title comparison.

    The variants are, in order: the title as given, its normalized
    lower-case form, "&" spelled out as "and" (raw and normalized), copies
    with a leading "the "/"a " removed, and whatever
    ``re_amazon_title_paren`` captures in group 1 (raw and normalized).

    Returns a dict with the keys ``full_title``, ``normalized_title``,
    ``titles`` (the variant list) and ``short_title`` (the first 25
    characters of the normalized title).
    """
    normalized_title = normalize(title).lower()
    variants = [title, normalized_title]

    if ' & ' in title:
        spelled_out = title.replace(" & ", " and ")
        variants += [spelled_out, normalize(spelled_out)]

    # Collect article-free copies separately so the list is not extended
    # while it is being walked.
    without_article = []
    for candidate in variants:
        lowered = candidate.lower()
        if lowered.startswith('the '):
            without_article.append(candidate[4:])
        elif lowered.startswith('a '):
            without_article.append(candidate[2:])
    variants.extend(without_article)

    # Only look inside the variants when the original title itself matches.
    if re_amazon_title_paren.match(title):
        captured = []
        for candidate in variants:
            hit = re_amazon_title_paren.match(candidate)
            if hit:
                head = hit.group(1)
                captured.extend((head, normalize(head)))
        variants.extend(captured)

    return {
        'full_title': title,
        'normalized_title': normalized_title,
        'titles': variants,
        'short_title': normalized_title[:25],
    }
Esempio n. 2
0
def marc_title(amazon_first_parts, marc_first_parts):
    """Compare name parts when the MARC name ends with an honorific title.

    :param list amazon_first_parts: name parts from the Amazon record
    :param list marc_first_parts: name parts from the MARC record; the last
        element is tested against the module-level ``titles`` set
    :rtype: bool
    :return: True when the parts match once the trailing MARC title (and,
        if present, a leading Amazon title) is ignored; False otherwise,
        including when ``marc_first_parts`` is empty.
    """
    # Guard: indexing [-1] on an empty list would raise IndexError.
    if not marc_first_parts:
        return False
    if normalize(marc_first_parts[-1]) not in titles:
        return False

    marc_without_title = marc_first_parts[:-1]
    if compare_parts(marc_without_title, amazon_first_parts):
        if verbose:
            print("match with MARC end title")
        return True

    # Guard: only inspect the first Amazon part when there is one.
    if amazon_first_parts and normalize(amazon_first_parts[0]) in titles:
        amazon_without_title = amazon_first_parts[1:]
        if compare_parts(marc_without_title, amazon_without_title):
            if verbose:
                print("match, both with titles")
            return True
        if match_seq(marc_without_title, amazon_without_title):
            if verbose:
                print("partial match, both with titles")
            return True

    if match_seq(marc_without_title, amazon_first_parts):
        if verbose:
            print("partial match with MARC end title")
        return True

    if match_seq(marc_first_parts, amazon_first_parts):
        # NOTE(review): this branch reports a partial match but the function
        # still returns False below. Behaviour is kept as-is; confirm whether
        # a `return True` was intended here.
        if verbose:
            print("partial match with MARC end title")
    return False
Esempio n. 3
0
def authority_lookup(to_check, found, marc_alt):
    """Merge name variants that an authority search pins to a single record.

    :param dict to_check: maps a person key (an iterable of ``(k, v)``
        subtag pairs) to the collection of candidate MARC names for it
    :param dict found: per-name values; the values of merged-away names are
        added onto the authority match with ``+=`` and their keys removed
    :param dict marc_alt: updated in place, mapping each merged-away name to
        the name it was merged into
    :rtype: bool
    :return: True if at least one name was merged.
    """
    found_matches = False
    for person_key, match in to_check.items():
        if len(match) == 1:
            continue
        if len(match) == 2:
            d1, d2 = [get_marc_date(p) for p in match]
            # Two candidates whose dates are neither close nor transposed
            # are skipped without consulting the authority index.
            if dates_not_close(d1, d2) and not is_date_transposed(d1, d2):
                continue

        # Search on every subtag value except 'd'.
        name = ' '.join(v.strip() for k, v in person_key if k != 'd')
        search_results = authority.search(name)
        match_dates = {get_marc_date(p): p for p in match}
        norm_name = normalize(name)
        authority_match = None
        for i in search_results:
            if i['type'] != 'personal name' or i['a'] == 'reference':
                continue
            if norm_name not in normalize(i['heading']):
                continue
            for d, p in match_dates.items():
                if i['heading'].endswith(d):
                    if authority_match:  # more than one match
                        # print() call form so the module parses on Python 3.
                        print('dups:', list(match_dates.items()))
                        authority_match = None
                        # NOTE(review): this only leaves the loop over dates;
                        # a later search result can set authority_match
                        # again. Confirm that is intended.
                        break
                    authority_match = p
        if authority_match:
            for p in match:
                if p == authority_match:
                    continue
                found[authority_match] += found.pop(p)
                marc_alt[p] = authority_match
                found_matches = True
    return found_matches
Esempio n. 4
0
def build_titles(title):
    """Deprecated: expand a full title into its comparison variants.

    Emits a ``DeprecationWarning`` and then builds the variant list: the
    title as given, its normalized lower-case form, "&" spelled out as
    "and" (raw and normalized), copies with a leading "the "/"a " removed,
    and whatever ``re_amazon_title_paren`` captures in group 1 (raw and
    normalized).

    Returns a dict with the keys ``full_title``, ``normalized_title``,
    ``titles`` and ``short_title`` (first 25 characters of the normalized
    title).
    """
    warnings.warn(
        'Deprecated, use openlibrary.catalog.merge.merge_marc.build_titles() instead.',
        DeprecationWarning,
        # Attribute the warning to the caller, not to this shim, so the
        # report points at the code that needs to be migrated.
        stacklevel=2,
    )

    normalized_title = normalize(title).lower()
    titles = [title, normalized_title]
    if ' & ' in title:
        t = title.replace(" & ", " and ")
        titles.append(t)
        titles.append(normalize(t))

    # Article-free copies are collected separately so the list is not
    # extended while it is being iterated.
    t2 = []
    for t in titles:
        lowered = t.lower()
        if lowered.startswith('the '):
            t2.append(t[4:])
        elif lowered.startswith('a '):
            t2.append(t[2:])
    titles += t2

    # Only inspect the variants when the original title itself matches.
    if re_amazon_title_paren.match(title):
        t2 = []
        for t in titles:
            m = re_amazon_title_paren.match(t)
            if m:
                t2.append(m.group(1))
                t2.append(normalize(m.group(1)))
        titles += t2

    return {
        'full_title': title,
        'normalized_title': normalized_title,
        'titles': titles,
        'short_title': normalized_title[:25],
    }
Esempio n. 5
0
def compare_author_fields(e1_authors, e2_authors):
    """Return True if any author in one list matches any author in the other.

    Two author dicts match when their normalized ``db_name`` values are
    equal, or when their normalized ``name`` values are equal after
    stripping leading/trailing dots.
    """

    def same_author(first, second):
        if normalize(first['db_name']) == normalize(second['db_name']):
            return True
        first_name = normalize(first['name']).strip('.')
        return first_name == normalize(second['name']).strip('.')

    return any(
        same_author(first, second)
        for first in e1_authors
        for second in e2_authors
    )
def test_normalize_replace_MARCMaker_mnemonics():
    """Mnemonic-encoded and character-encoded strings normalize alike."""
    # see http://www.loc.gov/marc/mnemonics.html
    cases = [
        (
            "The La{dotb}t{macr}a{mlrhring}if al-ma{mllhring}{macr}arif of Tha{mllhring} {macr}alibi. The book of curious and entertaining information",
            u"The La\xf2t\xe5a\xaeif al-ma\xb0\xe5arif of Tha\xb0 \xe5alibi. The book of curious and entertaining information",
        ),
        (
            "Tha{mllhring}{macr}alib{macr}i, {mllhring}Abd al-Malik ibn Mu{dotb}hammad 961 or 2-1037 or 8.",
            u"Tha\xb0\xe5alib\xe5i, \xb0Abd al-Malik ibn Mu\xf2hammad 961 or 2-1037 or 8.",
        ),
    ]
    for mnemonic_form, encoded_form in cases:
        assert normalize(mnemonic_form) == normalize(encoded_form)
Esempio n. 7
0
def flip_marc_name(marc):
    """Turn a "surname, rest" MARC name into natural reading order.

    Names that do not match ``re_marc_name`` are returned with only the
    trailing dot removed.  When the last part of "rest" is a known title
    (see the module-level ``titles`` set) the title is dropped.
    """
    parsed = re_marc_name.match(marc)
    if parsed is None:
        return remove_trailing_dot(marc)

    surname = parsed.group(1)
    rest = parsed.group(2)
    parts = split_parts(rest)

    if normalize(parts[-1]) not in titles:
        # No trailing title: simply put the surname last.
        return remove_trailing_dot(rest) + ' ' + surname

    untitled = parts[:-1]
    # If the part just before the title already repeats the surname, do not
    # append the surname a second time.
    if len(parts) > 2 and normalize(parts[-2]) == normalize(surname):
        return u' '.join(untitled)
    return u' '.join(untitled + [surname])
Esempio n. 8
0
def compare_title(amazon, marc):
    """Score how well the titles of two records agree.

    Both arguments are dicts carrying a ``titles`` list; ``amazon`` also
    supplies ``normalized_title`` and ``marc`` supplies ``full_title``.

    Returns a ``('full-title', description, score)`` tuple.  Exact and
    substring checks are skipped when either normalized title is shorter
    than 9 characters; the keyword score is always computed.
    """
    normalized_amazon = amazon['normalized_title'].lower()
    normalized_marc = normalize(marc['full_title']).lower()
    short = len(normalized_amazon) < 9 or len(normalized_marc) < 9

    if not short:
        if any(a == m for a in amazon['titles'] for m in marc['titles']):
            return ('full-title', 'exact match', 600)
        if any(
            substr_match(a, m)
            for a in amazon['titles']
            for m in marc['titles']
        ):
            return ('full-title', 'containted within other title', 350)

    # Best keyword score over every pair of title variants.
    best = 0
    for a in amazon['titles']:
        for m in marc['titles']:
            percent, ordered = keyword_match(a, m)
            score = percent * 450
            if ordered:
                score += 50
            if score and score > best:
                best = score

    if best:
        return ('full-title', 'keyword match', best)
    if short:
        return ('full-title', 'shorter than 9 characters', 0)
    return ('full-title', 'mismatch', -600)
Esempio n. 9
0
def compare_publisher(amazon, marc):
    """Score the Amazon publisher against the MARC publishers.

    Returns a ``('publisher', description, score)`` tuple: score 100 on the
    first MARC publisher that matches, -25 when none does, and 0 when
    ``amazon`` has no 'publisher' or ``marc`` has no 'publishers'.
    """
    if 'publisher' not in amazon or 'publishers' not in marc:
        return ('publisher', 'either missing', 0)

    norm_amazon = normalize(amazon['publisher'])
    for candidate in marc['publishers']:
        norm_marc = normalize(candidate)
        if norm_amazon == norm_marc:
            return ('publisher', 'match', 100)
        # Substring test, first as-is and then with all spaces removed.
        if substr_match(norm_amazon, norm_marc) or substr_match(
                norm_amazon.replace(' ', ''), norm_marc.replace(' ', '')):
            return ('publisher', 'occur within the other', 100)
        if short_part_publisher_match(norm_amazon, norm_marc):
            return ('publisher', 'match', 100)
    return ('publisher', 'mismatch', -25)
Esempio n. 10
0
def compare_publisher(e1, e2):
    """Score the publishers of two edition dicts against each other.

    Returns a ``('publisher', description, score)`` tuple: score 100 on the
    first pair of publishers that matches, -25 when no pair does, and 0
    when either dict lacks a 'publishers' key.
    """
    if not ('publishers' in e1 and 'publishers' in e2):
        return ('publisher', 'either missing', 0)

    for left in e1['publishers']:
        left_norm = normalize(left)
        for right in e2['publishers']:
            right_norm = normalize(right)
            if left_norm == right_norm:
                return ('publisher', 'match', 100)
            # Substring test, first as-is and then with all spaces removed.
            if substr_match(left_norm, right_norm) or substr_match(
                    left_norm.replace(' ', ''), right_norm.replace(' ', '')):
                return ('publisher', 'occur within the other', 100)
            if short_part_publisher_match(left_norm, right_norm):
                return ('publisher', 'match', 100)
    return ('publisher', 'mismatch', -25)
Esempio n. 11
0
def match_name(amazon, marc, last_name_only_ok=True):
    """Decide whether an Amazon author name and a MARC name are the same.

    Checks run from cheapest to most permissive: the spaced-name special
    case, normalized equality (with and without spaces), surname-only
    equality, flipped "given surname" equality, and finally part-by-part
    comparison of the given names.

    :param amazon: name as found in the Amazon record
    :param marc: name as found in the MARC record
    :param last_name_only_ok: value returned when the Amazon name equals
        just the MARC surname
    :return: True on a match, False otherwise (or ``last_name_only_ok`` in
        the surname-only case)
    """

    def say(message):
        # Diagnostics are printed only while the module-level flag is set.
        if verbose:
            print(message)

    if amazon_spaced_name(amazon, marc):
        return True

    norm_amazon = normalize(amazon)
    norm_amazon_joined = normalize(amazon).replace(' ', '')
    norm_marc = normalize(marc)

    # catches events and organizations
    if norm_amazon == norm_marc:
        say('normalized names match')
        return True
    if norm_amazon_joined == norm_marc.replace(' ', ''):
        say('normalized, spaces removed, names match')
        return True

    # split MARC name into surname and the remainder
    parsed = re_marc_name.match(marc)
    if not parsed:
        return False
    surname = parsed.group(1)
    given = parsed.group(2)

    surname_only = (
        norm_amazon == normalize(surname)
        or norm_amazon_joined == normalize(surname.replace(' ', ''))
    )
    if surname_only:
        say('Amazon only has a last name, it matches MARC')
        return last_name_only_ok

    if norm_amazon == normalize(given + ' ' + surname):
        say('match')
        return True
    if norm_amazon_joined == normalize(given + surname).replace(' ', ''):
        say('match when spaces removed')
        return True
    if not match_surname(surname, amazon):
        say('Last name mismatch')
        return False

    marc_first_parts = split_parts(given)
    # Drop the surname and the character before it from the Amazon name.
    amazon_first_parts = split_parts(amazon[:-(len(surname) + 1)])
    if compare_parts(marc_first_parts, amazon_first_parts):
        say("match")
        return True
    if marc_title(amazon_first_parts, marc_first_parts):
        return True
    if amazon_title(amazon_first_parts, marc_first_parts):
        return True
    if match_seq(amazon_first_parts, marc_first_parts):
        say("partial match")
        return True
    say("no match")
    return False
Esempio n. 12
0
def mk_norm(s):
    """Reduce a title to a compact comparison key.

    If ``re_brackets`` matches, only its first group is kept.  The text is
    normalized, " and " is collapsed to a space, one leading "the " or
    "a " is dropped, and finally every space is removed.
    """
    bracketed = re_brackets.match(s)
    text = bracketed.group(1) if bracketed else s
    norm = merge.normalize(text).strip(' ').replace(' and ', ' ')
    # At most one leading article is removed; "the " is tried first.
    for article in ('the ', 'a '):
        if norm.startswith(article):
            norm = norm[len(article):]
            break
    return norm.replace(' ', '')
Esempio n. 13
0
def amazon_title(amazon_first_parts, marc_first_parts):
    """Compare name parts when the Amazon name starts with an honorific title.

    :param list amazon_first_parts: name parts from the Amazon record; the
        first element is tested against the module-level ``titles`` set
    :param list marc_first_parts: name parts from the MARC record
    :rtype: bool
    :return: True when the MARC parts match the Amazon parts after the
        leading title is skipped; False otherwise, including when
        ``amazon_first_parts`` is empty.
    """
    # Guard: indexing [0] on an empty list would raise IndexError.
    if not amazon_first_parts:
        return False
    if normalize(amazon_first_parts[0]) not in titles:
        return False

    amazon_without_title = amazon_first_parts[1:]
    if compare_parts(marc_first_parts, amazon_without_title):
        if verbose:
            print("match with Amazon title")
        return True
    if match_seq(marc_first_parts, amazon_without_title):
        if verbose:
            print("partial match, with Amazon title")
        return True
    return False
Esempio n. 14
0
def missing_subtag(found, marc_alt):
    """Fold names that differ only by missing subtags into the fuller name.

    Every pair of keys in ``found`` (each an iterable of ``(subtag, value)``
    pairs) is compared.  Pairs with different a/b/c/d/q subtag lists whose
    joined names are judged equal are recorded, the key with fewer subtags
    being replaced by ``just_abcdq`` of the other.

    Both arguments are updated in place: the values in ``found`` are merged
    with ``+=`` and ``marc_alt`` maps each replaced key to its replacement.
    Returns None.
    """
    replacements = defaultdict(set)
    for left, right in combinations(found, 2):
        left_tags = [tag for tag, _ in left if tag in 'abcdq']
        right_tags = [tag for tag, _ in right if tag in 'abcdq']
        if left_tags == right_tags:
            continue

        left_name = ' '.join(value.strip() for _, value in left)
        right_name = ' '.join(value.strip() for _, value in right)

        # The names count as equal if any of these increasingly lenient
        # comparisons succeeds.
        same_name = (
            match_with_bad_chars(left_name, right_name)
            or normalize(left_name) == normalize(right_name)
            or normalize(remove_bad_marc_subtag(left_name)) == normalize(remove_bad_marc_subtag(right_name))
            or normalize(left_name.lower().replace(' the', '')) == normalize(right_name.lower().replace(' the', ''))
        )
        if not same_name:
            continue

        if len(left_tags) > len(right_tags):
            replacements[right].add(just_abcdq(left))
        else:
            replacements[left].add(just_abcdq(right))

    for old, candidates in replacements.items():
        # Rank candidates by the length of their space-joined values.
        ranked = sorted(
            (len(' '.join(value for _, value in candidate)), candidate)
            for candidate in candidates
        )
        if len(ranked) > 1:
            assert ranked[-1][0] > ranked[-2][0]
        new_marc = ranked[-1][1]

        found[new_marc] += found.pop(old)
        marc_alt[old] = new_marc
Esempio n. 15
0
def match_name2(name1, name2):
    """Return True if two names are equal or match part by part.

    Identical or identically-normalized names match immediately.  Otherwise
    both names are split into parts and compared directly, as a partial
    sequence, and with a title allowed on either side.
    """
    if name1 == name2 or normalize(name1) == normalize(name2):
        return True

    parts1 = split_parts(name1)
    parts2 = split_parts(name2)
    if compare_parts(parts1, parts2) or match_seq(parts1, parts2):
        return True
    # Title-aware checks are tried in both directions.
    if marc_title(parts1, parts2) or marc_title(parts2, parts1):
        return True
    if amazon_title(parts1, parts2) or amazon_title(parts2, parts1):
        return True
    return False
Esempio n. 16
0
def build_titles(title):
    """
    Derives the title variations used for comparison from a full title.

    :param str title: Full title of an edition
    :rtype: dict
    :return: dict with 'full_title', 'normalized_title', 'titles' (list of
        variations) and 'short_title' (first 25 characters of the
        normalized title)
    """
    normalized_title = normalize(title).lower()
    all_titles = [title, normalized_title]

    if ' & ' in title:
        with_and = title.replace(" & ", " and ")
        all_titles.extend([with_and, normalize(with_and)])

    # Add a copy without the leading article for every variation that has
    # one; "the " is tried before "a ".
    article_free = []
    for variation in all_titles:
        lowered = variation.lower()
        for article in ('the ', 'a '):
            if lowered.startswith(article):
                article_free.append(variation[len(article):])
                break
    all_titles += article_free

    # Group 1 of re_amazon_title_paren is added (raw and normalized) for
    # every variation, but only if the original title matches at all.
    if re_amazon_title_paren.match(title):
        from_paren = []
        for variation in all_titles:
            hit = re_amazon_title_paren.match(variation)
            if not hit:
                continue
            from_paren.append(hit.group(1))
            from_paren.append(normalize(hit.group(1)))
        all_titles += from_paren

    return {
        'full_title': title,
        'normalized_title': normalized_title,
        'titles': all_titles,
        'short_title': normalized_title[:25],
    }
Esempio n. 17
0
def amazon_spaced_name(amazon, marc):
    """Match an Amazon name of the form "surname <spaces> initials".

    Only Amazon names that are exactly 30 or 31 characters long and match
    ``re_amazon_space_name`` are considered.  The surname must equal either
    the whole MARC name or the MARC surname (after normalization); in the
    latter case the initials are compared with the MARC given-name parts.
    """
    if len(amazon) not in (30, 31):
        return False

    spaced = re_amazon_space_name.search(amazon)
    if not spaced:
        return False
    amazon_surname = spaced.group(1)
    amazon_initials = spaced.group(2)

    if normalize(amazon_surname) == normalize(marc):
        return True

    parsed = re_marc_name.match(marc)
    if not parsed:
        return False
    if normalize(amazon_surname) != normalize(parsed.group(1)):
        return False

    marc_first_parts = split_parts(parsed.group(2))
    # Each character of the initials group is treated as its own part.
    amazon_first_parts = list(amazon_initials)
    if compare_parts(marc_first_parts, amazon_first_parts):
        return True
    if match_seq(amazon_first_parts, marc_first_parts):
        return True
    return False
Esempio n. 18
0
def mk_norm(s):
    """
    Builds a compact key from a title so two titles can be compared as
    plain strings.

    :param str s: A book title to normalize and strip.
    :rtype: str
    :return: the normalized title with " and ", one leading "the "/"a "
        and all spaces removed.
    """
    m = re_brackets.match(s)
    if m:
        # Keep only what the bracket pattern captured.
        s = m.group(1)

    words = merge.normalize(s).strip(' ').replace(' and ', ' ')
    if words.startswith('the '):
        words = words[len('the '):]
    elif words.startswith('a '):
        words = words[len('a '):]
    return ''.join(words.split(' '))
Esempio n. 19
0
def match_marc_name(marc1, marc2, last_name_only_ok):
    """Decide whether two MARC-style names refer to the same person.

    :param marc1: first name, usually of the form "surname, given names"
    :param marc2: second name, same form
    :param last_name_only_ok: value returned when one name is only a
        surname and it equals the other name's surname
    :return: True on a match, False otherwise (or ``last_name_only_ok`` in
        the surname-only case)
    """
    m1_normalized = normalize(marc1)
    m2_normalized = normalize(marc2)
    if m1_normalized == m2_normalized:
        return True

    m1 = re_marc_name.match(marc1)
    m2 = re_marc_name.match(marc2)
    if not m1:
        # marc1 has no "surname, rest" shape: it can only match a surname.
        if m2 and m1_normalized == normalize(m2.group(1)):
            return last_name_only_ok
        return False
    if not m2:
        if m2_normalized == normalize(m1.group(1)):
            return last_name_only_ok
        return False

    surname1, given1 = m1.group(1), m1.group(2)
    surname2, given2 = m2.group(1), m2.group(2)

    # One name may simply be the other written in "given surname" order.
    if (m1_normalized == normalize(given2 + ' ' + surname2)
            or m2_normalized == normalize(given1 + ' ' + surname1)):
        return True

    # The surnames must be identical, or one must end with the other after
    # a space or a dot.  The suffix tests run on the surname strings: the
    # previous code called .endswith() on the regex match objects, which
    # raises AttributeError, and it rejected identical surnames outright.
    surnames_compatible = (
        surname1 == surname2
        or surname1.endswith((' ' + surname2, '.' + surname2))
        or surname2.endswith((' ' + surname1, '.' + surname1))
    )
    if not surnames_compatible:
        return False  # Last name mismatch

    marc1_first_parts = split_parts(given1)
    marc2_first_parts = split_parts(given2)
    if compare_parts(marc1_first_parts, marc2_first_parts):
        return True
    if match_seq(marc1_first_parts, marc2_first_parts):
        return True
    # Title-aware checks are tried in both directions.
    if marc_title(marc1_first_parts, marc2_first_parts):
        return True
    if marc_title(marc2_first_parts, marc1_first_parts):
        return True
    if amazon_title(marc1_first_parts, marc2_first_parts):
        return True
    if amazon_title(marc2_first_parts, marc1_first_parts):
        return True
    return False
Esempio n. 20
0
def test_normalize_titles_with_and():
    """'and' and '&' spellings normalize to the same lower-case title."""
    expected = "this and that"
    spelled_out = normalize('This and That')
    with_ampersand = normalize('This & that')
    assert spelled_out == with_ampersand
    assert with_ampersand == expected
Esempio n. 21
0
def test_normalize_unicode():
    """Accented letters are kept; case and the hyphen are normalized."""
    source = u'Kitāb Yatīmat ud-Dahr'
    expected = u'kitāb yatīmat ud dahr'
    assert normalize(source) == expected
Esempio n. 22
0
def test_normalize():
    """Runs of spaces collapse to one and the text is lower-cased."""
    spaced_out = 'Hello this is a           Title'
    expected = 'hello this is a title'
    assert normalize(spaced_out) == expected
Esempio n. 23
0
def test_find_works():
    """Walk the work-finding pipeline through progressively larger inputs.

    Covers an empty list, a single edition, two editions sharing a title,
    and four editions where an accented title carries a 'work_title'.
    """
    # No editions in, no works out.
    works = list(find_works([]))
    assert works == []

    # A single edition: get_books is expected to add 'norm_title'.
    books = [{'title': 'Magic', 'key': '/books/OL1M'}]
    book_iter = get_books('', books, do_get_mc=False)

    books2 = list(book_iter)
    assert books2 == [{'key': '/books/OL1M', 'norm_title': 'magic', 'title': 'Magic'}]

    var = find_works2(books2)
    assert var['equiv'] == {}
    assert var['norm_titles'] == {'magic': 1}
    assert var['books_by_key'] == {'/books/OL1M': books2[0]}
    assert var['books'] == books2
    assert var['rev_wt'] == {}

    # Without equivalences the title map is expected to stay empty.
    assert build_work_title_map({}, {'magic': 1}) == {}
    assert build_work_title_map({}, {'magic': 2, 'test': 0}) == {}

    works = list(find_works(books2, do_get_mc=False))
    expect = [
    { 'title': 'Magic',
        'editions': [{
        'key': '/books/OL1M',
        'norm_title': 'magic',
        'title': 'Magic'}],
    }]
    assert works == expect


    # Two editions with the same title are expected to form one work.
    books = [
        {'title': 'Magic', 'key': '/books/OL1M'},
        {'title': 'Magic', 'key': '/books/OL2M'},
    ]
    book_iter = get_books('', books, do_get_mc=False)
    books2 = list(book_iter)

    var = find_works2(books2)
    assert var['equiv'] == {}
    assert var['norm_titles'] == {'magic': 2}
    assert var['books_by_key'] == {'/books/OL1M': books2[0], '/books/OL2M': books2[1]}
    assert var['books'] == books2
    assert var['rev_wt'] == {}

    works = list(find_works(books2, do_get_mc=False))
    expect = [
    { 'title': 'Magic',
        'editions': [
            { 'key': '/books/OL1M', 'norm_title': 'magic', 'title': 'Magic'},
            { 'key': '/books/OL2M', 'norm_title': 'magic', 'title': 'Magic'},
        ],
    }]
    assert works == expect

    # Accented title: normalize is expected to leave it unchanged.
    magico = u'm\xe1gico'

    assert normalize(magico) == magico

    # Four editions; the accented ones name 'magic' as their work title.
    books = [
        {'title': magico, 'work_title': ['magic'], 'key': '/books/OL1M'},
        {'title': 'magic', 'key': '/books/OL2M'},
        {'title': magico, 'work_title': ['magic'], 'key': '/books/OL3M'},
        {'title': 'magic', 'key': '/books/OL4M'},
    ]
    expect_keys = sorted(e['key'] for e in books)
    book_iter = get_books('', books, do_get_mc=False)
    books2 = list(book_iter)

    # 'work_title' is expected to come back as a plain string, with a
    # matching 'norm_wt' added.
    expect = [
        {'key': '/books/OL1M', 'norm_title': magico, 'work_title': 'magic', 'norm_wt': 'magic', 'title': magico},
        {'key': '/books/OL2M', 'norm_title': 'magic', 'title': 'magic'},
        {'key': '/books/OL3M', 'norm_title': magico, 'work_title': 'magic', 'norm_wt': 'magic', 'title': magico},
        {'key': '/books/OL4M', 'norm_title': 'magic', 'title': 'magic'},
    ]

    assert len(books2) == 4
    for i in range(4):
        assert books2[i] == expect[i]

    var = find_works2(books2)
    assert var['equiv'] == {(magico, 'magic'): 2}
    assert var['norm_titles'] == {magico: 2, 'magic': 2}
    assert len(var['books_by_key']) == 4
    bk = var['books_by_key']
    assert bk['/books/OL1M'] == books2[0]
    assert bk['/books/OL2M'] == books2[1]
    assert bk['/books/OL3M'] == books2[2]
    assert bk['/books/OL4M'] == books2[3]
    assert var['books'] == books2
    assert var['rev_wt'] == {'magic': {'magic': 2}}

    title_map = build_work_title_map(var['equiv'], var['norm_titles'])

    assert title_map == {magico: 'magic'}

    # find_works3 is called for its effect on `var`; its return is unused.
    find_works3(var)
    assert var['works'] == {'magic': {'magic': expect_keys}}
    assert var['work_titles'] == {'magic': ['/books/OL1M', '/books/OL3M']}

    sorted_works = find_work_sort(var)
    assert sorted_works == [(6, 'magic', {'magic': expect_keys})]

    works = list(find_works(books2, do_get_mc=False))
    # NOTE(review): this `expect` value is assigned but never compared
    # against; the assertions below check counts, keys and the title only.
    expect = [{
        'title': u'Magic',
        'editions': [
            {'key': '/books/OL2M', 'norm_title': 'magic', 'title': 'magic'},
            {'key': '/books/OL1M', 'norm_title': u'mágico', 'norm_wt': 'magic', 'title': u'Mágico'},
        ], 
    }]

    work_count = len(works)
    assert work_count == 1
    editions = works[0]['editions']
    edition_count = len(works[0]['editions'])
    edition_keys = sorted(e['key'] for e in editions) 
    assert edition_keys == expect_keys
    assert edition_count == 4
    # Drop the editions so the remaining work dict can be compared whole.
    del works[0]['editions']
    assert works[0] == {'title': 'magic'}
Esempio n. 24
0
def title_replace_amp(amazon):
    """Return the normalized, lower-cased title with " & " spelled " and ".

    Reads the title from the 'full-title' key of ``amazon``.
    """
    full_title = amazon['full-title']
    spelled_out = full_title.replace(" & ", " and ")
    return normalize(spelled_out).lower()
Esempio n. 25
0
from __future__ import print_function
import re
from openlibrary.catalog.merge.normalize import normalize

# One name part: the shortest run of text up to and including the dots
# and/or spaces that follow it.
re_split_parts = re.compile('(.*?[. ]+)')
# "surname, rest": group 1 is everything before the last ", ".
re_marc_name = re.compile('^(.*), (.*)$')
# Text, then one or more spaces, then a capital letter optionally followed
# by a single lower-case letter at the very end.
re_amazon_space_name = re.compile('^(.+?[^ ]) +([A-Z][a-z]?)$')

# When true, the matching functions print why they matched or not.
verbose = False

# Normalized honorifics and descriptors recognised as name titles.
titles = frozenset({
    normalize(honorific)
    for honorific in (
        'Mrs',
        'Sir',
        'pseud',
        'Lady',
        'Baron',
        'lawyer',
        'Lord',
        'actress',
        'Dame',
        'Mr',
        'Viscount',
        'professeur',
        'Graf',
        'Dr',
        'Countess',
        'Ministerialrat',
        'Oberamtsrat',
        'Rechtsanwalt',
    )
})


def flip_name(name):
    """Turn "surname, rest" into "rest surname".

    Returns None when ``name`` does not match ``re_marc_name``.
    """
    parsed = re_marc_name.match(name)
    if parsed:
        return parsed.group(2) + ' ' + parsed.group(1)
    return None


def match_seq(parts1, parts2):
    if len(parts1) == len(parts2):
        return False
    if len(parts1) > len(parts2):
        longer, shorter = parts1, parts2
    else:
Esempio n. 26
0
def test_find_works():
    """Walk the work-finding pipeline through progressively larger inputs.

    Covers an empty list, a single edition, two editions sharing a title,
    and four editions where an accented title carries a 'work_title'.
    """
    # No editions in, no works out.
    works = list(find_works([]))
    assert works == []

    # A single edition: get_books is expected to add 'norm_title'.
    books = [{'title': 'Magic', 'key': '/books/OL1M'}]
    book_iter = get_books('', books, do_get_mc=False)

    books2 = list(book_iter)
    assert books2 == [{
        'key': '/books/OL1M',
        'norm_title': 'magic',
        'title': 'Magic'
    }]

    var = find_works2(books2)
    assert var['equiv'] == {}
    assert var['norm_titles'] == {'magic': 1}
    assert var['books_by_key'] == {'/books/OL1M': books2[0]}
    assert var['books'] == books2
    assert var['rev_wt'] == {}

    # Without equivalences the title map is expected to stay empty.
    assert build_work_title_map({}, {'magic': 1}) == {}
    assert build_work_title_map({}, {'magic': 2, 'test': 0}) == {}

    works = list(find_works(books2, do_get_mc=False))
    expect = [{
        'title':
        'Magic',
        'editions': [{
            'key': '/books/OL1M',
            'norm_title': 'magic',
            'title': 'Magic'
        }],
    }]
    assert works == expect

    # Two editions with the same title are expected to form one work.
    books = [
        {
            'title': 'Magic',
            'key': '/books/OL1M'
        },
        {
            'title': 'Magic',
            'key': '/books/OL2M'
        },
    ]
    book_iter = get_books('', books, do_get_mc=False)
    books2 = list(book_iter)

    var = find_works2(books2)
    assert var['equiv'] == {}
    assert var['norm_titles'] == {'magic': 2}
    assert var['books_by_key'] == {
        '/books/OL1M': books2[0],
        '/books/OL2M': books2[1]
    }
    assert var['books'] == books2
    assert var['rev_wt'] == {}

    works = list(find_works(books2, do_get_mc=False))
    expect = [{
        'title':
        'Magic',
        'editions': [
            {
                'key': '/books/OL1M',
                'norm_title': 'magic',
                'title': 'Magic'
            },
            {
                'key': '/books/OL2M',
                'norm_title': 'magic',
                'title': 'Magic'
            },
        ],
    }]
    assert works == expect

    # Accented title: normalize is expected to leave it unchanged.
    magico = u'm\xe1gico'

    assert normalize(magico) == magico

    # Four editions; the accented ones name 'magic' as their work title.
    books = [
        {
            'title': magico,
            'work_title': ['magic'],
            'key': '/books/OL1M'
        },
        {
            'title': 'magic',
            'key': '/books/OL2M'
        },
        {
            'title': magico,
            'work_title': ['magic'],
            'key': '/books/OL3M'
        },
        {
            'title': 'magic',
            'key': '/books/OL4M'
        },
    ]
    expect_keys = sorted(e['key'] for e in books)
    book_iter = get_books('', books, do_get_mc=False)
    books2 = list(book_iter)

    # 'work_title' is expected to come back as a plain string, with a
    # matching 'norm_wt' added.
    expect = [
        {
            'key': '/books/OL1M',
            'norm_title': magico,
            'work_title': 'magic',
            'norm_wt': 'magic',
            'title': magico
        },
        {
            'key': '/books/OL2M',
            'norm_title': 'magic',
            'title': 'magic'
        },
        {
            'key': '/books/OL3M',
            'norm_title': magico,
            'work_title': 'magic',
            'norm_wt': 'magic',
            'title': magico
        },
        {
            'key': '/books/OL4M',
            'norm_title': 'magic',
            'title': 'magic'
        },
    ]

    assert len(books2) == 4
    for i in range(4):
        assert books2[i] == expect[i]

    var = find_works2(books2)
    assert var['equiv'] == {(magico, 'magic'): 2}
    assert var['norm_titles'] == {magico: 2, 'magic': 2}
    assert len(var['books_by_key']) == 4
    bk = var['books_by_key']
    assert bk['/books/OL1M'] == books2[0]
    assert bk['/books/OL2M'] == books2[1]
    assert bk['/books/OL3M'] == books2[2]
    assert bk['/books/OL4M'] == books2[3]
    assert var['books'] == books2
    assert var['rev_wt'] == {'magic': {'magic': 2}}

    title_map = build_work_title_map(var['equiv'], var['norm_titles'])

    assert title_map == {magico: 'magic'}

    # find_works3 is called for its effect on `var`; its return is unused.
    find_works3(var)
    assert var['works'] == {'magic': {'magic': expect_keys}}
    assert var['work_titles'] == {'magic': ['/books/OL1M', '/books/OL3M']}

    sorted_works = find_work_sort(var)
    assert sorted_works == [(6, 'magic', {'magic': expect_keys})]

    works = list(find_works(books2, do_get_mc=False))
    # NOTE(review): this `expect` value is assigned but never compared
    # against; the assertions below check counts, keys and the title only.
    expect = [{
        'title':
        u'Magic',
        'editions': [
            {
                'key': '/books/OL2M',
                'norm_title': 'magic',
                'title': 'magic'
            },
            {
                'key': '/books/OL1M',
                'norm_title': u'mágico',
                'norm_wt': 'magic',
                'title': u'Mágico'
            },
        ],
    }]

    work_count = len(works)
    assert work_count == 1
    editions = works[0]['editions']
    edition_count = len(works[0]['editions'])
    edition_keys = sorted(e['key'] for e in editions)
    assert edition_keys == expect_keys
    assert edition_count == 4
    # Drop the editions so the remaining work dict can be compared whole.
    del works[0]['editions']
    assert works[0] == {'title': 'magic'}
Esempio n. 27
0
def compare_part(p1, p2):
    """Return True if either normalized part is a prefix of the other."""
    first, second = normalize(p1), normalize(p2)
    if first.startswith(second):
        return True
    return second.startswith(first)
Esempio n. 28
0
def short_title(s):
    """Return the first 25 characters of the normalized form of ``s``."""
    normalized = normalize(s)
    return normalized[:25]
Esempio n. 29
0
import re
from openlibrary.catalog.merge.normalize import normalize

# One name part: the shortest run of text up to and including the dots
# and/or spaces that follow it.
re_split_parts = re.compile('(.*?[. ]+)')
# "surname, rest": group 1 is everything before the last ", ".
re_marc_name = re.compile('^(.*), (.*)$')
# Text, then one or more spaces, then a capital letter optionally followed
# by a single lower-case letter at the very end.
re_amazon_space_name = re.compile('^(.+?[^ ]) +([A-Z][a-z]?)$')

# When true, the matching functions print why they matched or not.
verbose = False

# Normalized honorifics and descriptors recognised as name titles.
titles = frozenset(map(normalize, (
    'Mrs', 'Sir', 'pseud', 'Lady', 'Baron', 'lawyer',
    'Lord', 'actress', 'Dame', 'Mr', 'Viscount', 'professeur',
    'Graf', 'Dr', 'Countess', 'Ministerialrat', 'Oberamtsrat', 'Rechtsanwalt',
)))