Ejemplo n.º 1
0
def missing_subtag(found, marc_alt):
    """Merge entries of ``found`` that name the same thing with different subtags.

    Each key of ``found`` is iterated as ``(subtag, value)`` pairs. Two keys
    are candidates for merging when their lists of subtags drawn from
    ``'abcdq'`` differ, yet their space-joined values match under at least one
    of: ``match_with_bad_chars``, ``normalize``, ``normalize`` after
    ``remove_bad_marc_subtag``, or ``normalize`` after lower-casing and
    dropping ``' the'``.

    For such a pair, the key with more ``abcdq`` subtags (the second key when
    the counts are equal) is passed through ``just_abcdq`` and recorded as a
    replacement for the other key. Each replaced key is then popped from
    ``found`` and its value added onto the longest replacement, and
    ``marc_alt`` maps the replaced key to that replacement.

    Both ``found`` and ``marc_alt`` are modified in place; nothing is returned.
    An ``AssertionError`` is raised when the two longest replacements for a
    key have the same flattened length.
    """
    wanted = 'abcdq'
    candidates = defaultdict(set)

    for left, right in combinations(found, 2):
        left_tags = [code for code, _ in left if code in wanted]
        right_tags = [code for code, _ in right if code in wanted]
        if left_tags == right_tags:
            # Same subtag layout: nothing is "missing" on either side.
            continue

        left_name = ' '.join(value.strip() for _, value in left)
        right_name = ' '.join(value.strip() for _, value in right)

        # Progressively looser comparisons; the first hit is enough.
        same_name = (
            match_with_bad_chars(left_name, right_name)
            or normalize(left_name) == normalize(right_name)
            or normalize(remove_bad_marc_subtag(left_name))
            == normalize(remove_bad_marc_subtag(right_name))
            or normalize(left_name.lower().replace(' the', ''))
            == normalize(right_name.lower().replace(' the', ''))
        )
        if not same_name:
            continue

        # The side with fewer subtags is the one to be replaced.
        if len(left_tags) > len(right_tags):
            candidates[right].add(just_abcdq(left))
        else:
            candidates[left].add(just_abcdq(right))

    for stale, replacements in candidates.items():
        # Rank replacements by the length of their space-joined values.
        ranked = sorted(
            (len(' '.join(value for _, value in fields)), fields)
            for fields in replacements
        )
        if len(ranked) > 1:
            # The longest replacement must be strictly longer than the runner-up.
            assert ranked[-1][0] > ranked[-2][0]
        keeper = ranked[-1][1]

        found[keeper] += found.pop(stale)
        marc_alt[stale] = keeper
Ejemplo n.º 2
0
def test_match_with_bad_chars():
    """Check that match_with_bad_chars accepts every pair within each group.

    Each group holds alternative spellings of one string: plain ASCII,
    precomposed accented characters, combining marks, or ``?`` placeholders.
    """
    equivalent_spellings = (
        (
            'Machiavelli, Niccolo, 1469-1527',
            'Machiavelli, Niccol\xf2 1469-1527',
        ),
        (
            'Humanitas Publica\xe7\xf5es',
            'Humanitas Publicac?o?es',
        ),
        (
            'A pesquisa ling\xfc\xedstica no Brasil',
            'A pesquisa lingu?i?stica no Brasil',
        ),
        (
            'S\xe3o Paulo',
            'Sa?o Paulo',
        ),
        (
            'Diccionario espa\xf1ol-ingl\xe9s de bienes ra\xedces',
            'Diccionario Espan\u0303ol-Ingle\u0301s de bienes rai\u0301ces',
        ),
        (
            'Konfliktunterdru?ckung in O?sterreich seit 1918',
            'Konfliktunterdru\u0308ckung in O\u0308sterreich seit 1918',
            'Konfliktunterdr\xfcckung in \xd6sterreich seit 1918',
        ),
        (
            'Soi\ufe20u\ufe21z khudozhnikov SSSR.',
            'Soi?u?z khudozhnikov SSSR.',
            'Soi\u0361uz khudozhnikov SSSR.',
        ),
        (
            'Andrzej Weronski',
            'Andrzej Wero\u0144ski',
            'Andrzej Weron\u0301ski',
        ),
    )
    for variants in equivalent_spellings:
        # Every unordered pair inside a group must be reported as a match.
        for first, second in combinations(variants, 2):
            assert match_with_bad_chars(first, second)
Ejemplo n.º 3
0
def merge_authors(ol, keys, debug=False):
    """Merge the author records named by ``keys`` into one surviving record.

    Records loaded via ``withKey`` whose type is ``/type/redirect`` are
    ignored when choosing the survivor. The surviving key is:

    * the single author for which ``has_image`` is true, if there is exactly
      one (``/a/OL2688880A`` is never considered; the reason is not visible
      here), or
    * ``"/a/OL%dA"`` built from the smallest ``key_int`` value when no author
      has an image.

    When more than one author has an image the merge is skipped and the
    function returns early. Otherwise ``do_normalize`` is called, and every
    other key in ``keys`` is passed to ``switch_author`` and, if it was not
    already a redirect, to ``make_redirect``. ``switch_author`` is run a
    second time for a key if ``get_things`` still reports editions
    referencing it.

    Progress is printed to stdout throughout.

    Args:
        ol: handle passed through to ``switch_author`` and ``make_redirect``.
        keys: author keys to merge; iterated more than once.
        debug: when true, print every loaded (non-redirect) author record.

    Returns:
        None.

    Raises:
        IndexError: if none of ``keys`` refers to a non-redirect record.
        AssertionError: if a loaded record is not ``/type/author``, or an
            author name does not match the first author's name according to
            ``match_with_bad_chars``.
    """
    authors = [
        a for a in (withKey(k) for k in keys)
        if a['type']['key'] != '/type/redirect'
    ]
    not_redirect = {a['key'] for a in authors}
    if debug:
        for a in authors:
            print(a)

    assert all(a['type']['key'] == '/type/author' for a in authors)
    name1 = authors[0]['name']
    for a in authors:
        # repr() accepts exactly one argument; the former two-argument call
        # raised TypeError before any merging could happen.
        print(repr(a['key']), repr(a['name']))
    assert all(match_with_bad_chars(a['name'], name1) for a in authors[1:])

    best_key = pick_best_author(authors)['key']

    imgs = [
        a['key'] for a in authors
        if a['key'] != '/a/OL2688880A' and has_image(a['key'])
    ]
    if len(imgs) == 1:
        new_key = imgs[0]
    else:
        new_key = "/a/OL%dA" % min(key_int(a) for a in authors)
        # Original note: "Molière and O. J. O. Ferreira".
        if imgs:
            # More than one candidate has an image: do not merge.
            print('imgs:', imgs)
            return  # skip
        # imgs is necessarily empty here. A whitelist of specific image-key
        # pairs used to be checked at this point but could never match after
        # the early return above; only its (always executed) print is kept.
        print(imgs)

    print(new_key)
    print(best_key)

    do_normalize(new_key, best_key, authors)
    old_keys = {k for k in keys if k != new_key}
    print('old keys:', old_keys)

    for old in old_keys:
        # /b/OL21291659M
        switch_author(ol, old, new_key, old_keys, debug=True)
        if old in not_redirect:
            make_redirect(ol, old, new_key)
        q = {
            'authors': old,
            'type': '/type/edition',
        }
        # Run the switch again if any edition still lists the old author.
        if list(get_things(q)):
            switch_author(ol, old, new_key, old_keys, debug=True)
Ejemplo n.º 4
0
def merge_authors(ol, keys, debug=False):
    """Collapse several author records into one and redirect the rest.

    Each key is loaded with ``withKey``; records of type ``/type/redirect``
    are left out of the candidate list. The key that survives the merge is
    chosen as follows:

    * exactly one candidate has an image (per ``has_image``, with
      ``/a/OL2688880A`` always excluded for a reason not visible here):
      that candidate's key;
    * no candidate has an image: ``"/a/OL%dA"`` formatted with the minimum
      ``key_int`` value over the candidates;
    * several candidates have an image: nothing is merged and the function
      returns early.

    After ``do_normalize`` runs, each remaining key from ``keys`` is handed
    to ``switch_author``, turned into a redirect with ``make_redirect``
    unless it already was one, and handed to ``switch_author`` once more if
    ``get_things`` still finds editions that reference it.

    Status lines are printed to stdout along the way.

    Args:
        ol: handle forwarded to ``switch_author`` and ``make_redirect``.
        keys: author keys to merge; iterated more than once.
        debug: when true, print each loaded non-redirect author record.

    Returns:
        None.

    Raises:
        IndexError: if every key resolves to a redirect (no candidates).
        AssertionError: if a candidate is not of type ``/type/author``, or
            its name fails ``match_with_bad_chars`` against the first
            candidate's name.
    """
    authors = [a for a in (withKey(k) for k in keys) if a['type']['key'] != '/type/redirect']
    not_redirect = {a['key'] for a in authors}
    if debug:
        for a in authors:
            print(a)

    assert all(a['type']['key'] == '/type/author' for a in authors)
    name1 = authors[0]['name']
    for a in authors:
        # repr() takes a single argument; calling it with two raised
        # TypeError on the first author, so key and name are repr'd separately.
        print(repr(a['key']), repr(a['name']))
    assert all(match_with_bad_chars(a['name'], name1) for a in authors[1:])

    best_key = pick_best_author(authors)['key']

    imgs = [a['key'] for a in authors if a['key'] != '/a/OL2688880A' and has_image(a['key'])]
    if len(imgs) == 1:
        new_key = imgs[0]
    else:
        new_key = "/a/OL%dA" % min(key_int(a) for a in authors)
        # Original note: "Molière and O. J. O. Ferreira".
        if imgs:
            # Several candidates carry an image: leave them unmerged.
            print('imgs:', imgs)
            return  # skip
        # From here on imgs is always empty. The earlier whitelist of
        # specific image-key pairs was unreachable after the return above,
        # so only the print it always triggered is retained.
        print(imgs)

    print(new_key)
    print(best_key)

    do_normalize(new_key, best_key, authors)
    old_keys = {k for k in keys if k != new_key}
    print('old keys:', old_keys)

    for old in old_keys:
        # /b/OL21291659M
        switch_author(ol, old, new_key, old_keys, debug=True)
        if old in not_redirect:
            make_redirect(ol, old, new_key)
        q = {'authors': old, 'type': '/type/edition'}
        # Repeat the switch if editions still point at the old author key.
        if list(get_things(q)):
            switch_author(ol, old, new_key, old_keys, debug=True)
Ejemplo n.º 5
0
def bad_char_name_match(found, marc_alt):
    """Merge entries of ``found`` whose names differ only by bad characters.

    Each key of ``found`` is iterated as ``(subtag, value)`` pairs. Two keys
    are merged when ``get_marc_date`` gives the same result for both, their
    values in ``found`` differ, and the space-joined values of their
    ``'abc'`` subtags match according to ``match_with_bad_chars``. The key
    with the smaller value is removed from ``found``, its value is added to
    the other key, and ``marc_alt`` maps the removed key to the kept one.

    Both arguments are modified in place; nothing is returned.

    Args:
        found: mapping from a key to a value supporting ``+=`` and ordering
            (presumably an occurrence count; confirm against the caller).
        marc_alt: mapping that receives ``removed_key -> kept_key`` entries.
    """
    for p1, p2 in combinations(found, 2):
        # combinations() copies the keys before iterating, so a key merged
        # away earlier in this loop still appears in later pairs. Looking it
        # up again would raise KeyError on a plain dict (or silently
        # re-create it on a defaultdict), so such pairs are skipped.
        if p1 not in found or p2 not in found:
            continue
        if get_marc_date(p1) != get_marc_date(p2):
            continue
        if found[p1] == found[p2]:
            # Equal values give no basis for picking which key to keep.
            continue

        # Stable sort: ``minor`` is the key with the smaller value.
        minor, major = sorted([p1, p2], key=lambda i: found[i])
        name1 = ' '.join(v for k, v in minor if k in 'abc')
        name2 = ' '.join(v for k, v in major if k in 'abc')
        if match_with_bad_chars(name1, name2):
            found[major] += found.pop(minor)
            marc_alt[minor] = major
Ejemplo n.º 6
0
def test_match_with_bad_chars():
    """Assert that match_with_bad_chars pairs up every spelling in each group.

    A group lists renderings of the same text: unaccented ASCII, precomposed
    accented characters, base letters followed by combining marks, or ``?``
    standing in for a mark.
    """
    spelling_groups = (
        (
            u'Machiavelli, Niccolo, 1469-1527',
            u'Machiavelli, Niccol\xf2 1469-1527',
        ),
        (
            u'Humanitas Publica\xe7\xf5es',
            'Humanitas Publicac?o?es',
        ),
        (
            u'A pesquisa ling\xfc\xedstica no Brasil',
            'A pesquisa lingu?i?stica no Brasil',
        ),
        (
            u'S\xe3o Paulo',
            'Sa?o Paulo',
        ),
        (
            u'Diccionario espa\xf1ol-ingl\xe9s de bienes ra\xedces',
            u'Diccionario Espan\u0303ol-Ingle\u0301s de bienes rai\u0301ces',
        ),
        (
            u'Konfliktunterdru?ckung in O?sterreich seit 1918',
            u'Konfliktunterdru\u0308ckung in O\u0308sterreich seit 1918',
            u'Konfliktunterdr\xfcckung in \xd6sterreich seit 1918',
        ),
        (
            u'Soi\ufe20u\ufe21z khudozhnikov SSSR.',
            u'Soi?u?z khudozhnikov SSSR.',
            u'Soi\u0361uz khudozhnikov SSSR.',
        ),
        (
            u'Andrzej Weronski',
            u'Andrzej Wero\u0144ski',
            u'Andrzej Weron\u0301ski',
        ),
    )
    for group in spelling_groups:
        # Check all unordered pairs, not just neighbours, within the group.
        for left, right in combinations(group, 2):
            assert match_with_bad_chars(left, right)