Exemple #1
0
def east_in_by_statement(rec):
    """
    Report whether the first author's name, with dots and ', ' removed,
    occurs in the record's by-statement.

    :param dict rec: record; the keys read are 'by_statement' and 'authors'
    :rtype: bool
    :return: False when the record has no by-statement, has no (or an empty)
        author list, or when flip_name() leaves the name unchanged;
        otherwise whether the stripped name is found in the by-statement.
    """
    if 'by_statement' not in rec:
        return False
    # An empty author list used to raise IndexError on the [0] lookup below;
    # treat it the same as a missing 'authors' key.
    if 'authors' not in rec or not rec['authors']:
        return False
    name = rec['authors'][0]['name']
    flipped = flip_name(name)
    name = name.replace('.', '').replace(', ', '')
    if name == flipped.replace('.', ''):
        # flipping changed nothing, so there is no inverted form to look for
        return False
    return name in rec['by_statement']
Exemple #2
0
def tidy_name(s):
    """
    Produce a display form of a personal name.

    None yields an HTML "name missing" placeholder and 'Mao, Zedong' is
    special-cased.  A name matching ``re_baron`` is rebuilt as the pattern's
    second group followed by its first.  Any other name has a ' Baron ...'
    tail or a trailing ' Sir' cut off and is then passed to flip_name().
    """
    if s is None:
        return '<em>name missing</em>'
    if s == 'Mao, Zedong':
        return 'Mao Zedong'
    baron = re_baron.match(s)
    if baron:
        first_group, second_group = baron.group(1, 2)
        return second_group + ' ' + first_group
    head, separator, _tail = s.partition(' Baron ')
    if separator:
        # keep only what precedes the first ' Baron '
        s = head
    elif s.endswith(' Sir'):
        s = s[:-len(' Sir')]
    return flip_name(s)
Exemple #3
0
def find_entity(author):
    """
    Look for an existing author record matching an import author dict.

    Candidate keys come from find_author(name) and, for names containing
    ', ', also from find_author(flip_name(name)).  Each key is loaded with
    withKey(); one level of '/type/redirect' is followed and '/type/delete'
    records are skipped.

    :param dict author: import author; 'name' and 'entity_type' are required,
        'birth_date' is consulted when present
    :return: the matching record, or None when nothing matches
    :raises AssertionError: if a candidate is not of type '/type/author'
    :raises ValueError: propagated from pick_from_matches()
    """
    name = author['name']
    things = find_author(name)
    if author['entity_type'] != 'person':
        # Non-person entities: take the first hit without date checks.
        if not things:
            return None
        db_entity = withKey(things[0])
        if db_entity['type']['key'] == '/type/redirect':
            db_entity = withKey(db_entity['location'])
        assert db_entity['type']['key'] == '/type/author'
        return db_entity
    if ', ' in name:
        things += find_author(flip_name(name))
    match = []
    seen = set()
    for key in things:
        if key in seen:
            continue
        seen.add(key)
        db_entity = withKey(key)
        if db_entity['type']['key'] == '/type/redirect':
            key = db_entity['location']
            if key in seen:
                continue
            seen.add(key)
            db_entity = withKey(key)
        if db_entity['type']['key'] == '/type/delete':
            continue
        try:
            assert db_entity['type']['key'] == '/type/author'
        except Exception:
            # Dump the offending record before re-raising.  A single
            # pre-formatted argument prints the same text under both the
            # Python 2 print statement and the print function.
            print('%s %s %s' % (name, key, db_entity))
            raise
        # A birth date present on only one side counts as a mismatch.
        if 'birth_date' in author and 'birth_date' not in db_entity:
            continue
        if 'birth_date' not in author and 'birth_date' in db_entity:
            continue
        if not author_dates_match(author, db_entity):
            continue
        match.append(db_entity)
    if not match:
        return None
    if len(match) == 1:
        return match[0]
    try:
        return pick_from_matches(author, match)
    except ValueError:
        print('author: %s' % (author,))
        print('match: %s' % (match,))
        raise
Exemple #4
0
def do_flip(author):
    """
    Flip an author's name in place.

    The dict is left untouched unless it has a 'personal_name' equal to its
    'name', the name contains ', ' with no further comma after it, and the
    name contains neither 'i.e.' nor 'i. e.'.  Otherwise both 'name' and
    'personal_name' are set to flip_name(name).

    :param dict author: author dict, modified in place
    """
    if 'personal_name' not in author:
        return
    full = author['name']
    if author['personal_name'] != full:
        return
    _before, separator, after = full.partition(', ')
    # e.g: Harper, John Murdoch, 1845-  (a second comma means more than a name)
    if not separator or ',' in after:
        return
    if any(marker in full for marker in ('i.e.', 'i. e.')):
        return
    author['name'] = author['personal_name'] = flip_name(full)
Exemple #5
0
def build_person_object(p, marc_alt):
    """
    Build a person dict from a sequence of (subfield code, value) pairs.

    :param p: re-iterable sequence of (code, value) pairs; codes 'a', 'b',
        'c' and 'd' are read
    :param marc_alt: iterable appended after ``p`` in the 'marc' entry
    :rtype: dict
    :return: dict with 'name', 'sort' and 'marc', plus 'enumeration' when a
        'b' subfield exists and 'title' when the joined 'c' text is
        non-empty; when 'd' subfields exist the dict starts out as the
        result of pick_first_date()
    """
    ab = [(k, v.strip(' /,;:')) for k, v in p if k in 'ab']
    has_b = any(k == 'b' for k, v in p)

    orig_name = ' '.join(v for k, v in ab)
    c = ' '.join(v for k, v in p if k == 'c')
    name = flip_name(orig_name)
    if name[0].isdigit():
        name = orig_name
    else:
        of_count = c.count('of ')
        if of_count == 1 and 'of the ' not in c and 'Emperor of ' not in c:
            # append the "of ..." part of the title to the name
            name += ' ' + c[c.find('of '):]
        elif ' ' not in name and of_count > 1:
            name += ', ' + c
        elif c.endswith((' of', ' de')) and any(
                k == 'a' and ', ' in v for k, v in p):
            # The comma test must guard BOTH endings.  The original
            # "x or y and z" applied it only to ' de', so a title ending in
            # ' of' with no comma in the name was sliced at find() == -1.
            comma = orig_name.find(', ')
            c += ' ' + orig_name[:comma]
            name = orig_name[comma + 2:] + ', ' + c

    person = {}
    d = [v for k, v in p if k == 'd']
    if d:
        person = pick_first_date(d)
    person['name'] = name
    person['sort'] = orig_name

    if has_b:
        person['enumeration'] = ' '.join(v for k, v in p if k == 'b')

    if c:
        person['title'] = c
    person['marc'] = [p] + list(marc_alt)

    return person
Exemple #6
0
def do_flip(author):
    """
    Rewrite an author's name in flipped form, in place.

    Applies only when 'personal_name' exists and equals 'name', the name
    contains ', ' with no later comma, and neither 'i.e.' nor 'i. e.'
    appears in it.  In that case 'name' and 'personal_name' both become
    flip_name(name); in every other case the dict is not modified.

    :param dict author: author dict, modified in place
    """
    if 'personal_name' not in author:
        return
    if author['personal_name'] != author['name']:
        return
    current = author['name']
    comma_at = current.find(', ')
    if comma_at == -1:
        return
    # e.g: Harper, John Murdoch, 1845-
    if ',' in current[comma_at + 1:]:
        return
    if 'i.e.' in current or 'i. e.' in current:
        return
    flipped = flip_name(current)
    author['name'] = flipped
    author['personal_name'] = flipped
Exemple #7
0
def build_person_object(p, marc_alt):
    """
    Build a person dict from a sequence of (subfield code, value) pairs.

    :param p: re-iterable sequence of (code, value) pairs; codes 'a', 'b',
        'c' and 'd' are read
    :param marc_alt: iterable appended after ``p`` in the 'marc' entry
    :rtype: dict
    :return: dict with 'name', 'sort' and 'marc', plus 'enumeration' when a
        'b' subfield exists and 'title' when the joined 'c' text is
        non-empty; when 'd' subfields exist the dict starts out as the
        result of pick_first_date()
    """
    ab = [(k, v.strip(' /,;:')) for k, v in p if k in 'ab']
    has_b = any(k == 'b' for k, v in p)

    orig_name = ' '.join(v for k, v in ab)
    c = ' '.join(v for k, v in p if k == 'c')
    name = flip_name(orig_name)
    if name[0].isdigit():
        name = orig_name
    else:
        of_count = c.count('of ')
        if of_count == 1 and 'of the ' not in c and 'Emperor of ' not in c:
            # append the "of ..." part of the title to the name
            name += ' ' + c[c.find('of '):]
        elif ' ' not in name and of_count > 1:
            name += ', ' + c
        elif c.endswith((' of', ' de')) and any(
                k == 'a' and ', ' in v for k, v in p):
            # The comma test must guard BOTH endings.  The original
            # "x or y and z" applied it only to ' de', so a title ending in
            # ' of' with no comma in the name was sliced at find() == -1.
            comma = orig_name.find(', ')
            c += ' ' + orig_name[:comma]
            name = orig_name[comma + 2:] + ', ' + c

    person = {}
    d = [v for k, v in p if k == 'd']
    if d:
        person = pick_first_date(d)
    person['name'] = name
    person['sort'] = orig_name

    if has_b:
        person['enumeration'] = ' '.join(v for k, v in p if k == 'b')

    if c:
        person['title'] = c
    person['marc'] = [p] + list(marc_alt)

    return person
Exemple #8
0
def find_entity(author):
    """
    Looks for an existing Author record in OL by name
    and returns it if found.

    Candidates come from find_author(name) and, when the name contains
    ', ', also from find_author(flip_name(name)).  For a person (or when
    'entity_type' is absent) a candidate is kept only if 'birth_date' is
    present on both sides or on neither, and author_dates_match() agrees.

    :param dict author: Author import dict {"name": "Some One"}
    :rtype: dict|None
    :return: Existing Author record, if one is found
    :raises AssertionError: if a candidate is not of type '/type/author'
    """
    name = author['name']
    things = find_author(name)
    et = author.get('entity_type')
    if et and et != 'person':
        # Non-person entities: take the first hit without date checks.
        if not things:
            return None
        db_entity = things[0]
        assert db_entity['type']['key'] == '/type/author'
        return db_entity
    if ', ' in name:
        things += find_author(flip_name(name))
    match = []
    seen = set()
    for a in things:
        key = a['key']
        if key in seen:
            continue
        seen.add(key)
        assert a.type.key == '/type/author'
        if 'birth_date' in author and 'birth_date' not in a:
            continue
        if 'birth_date' not in author and 'birth_date' in a:
            continue
        if not author_dates_match(author, a):
            continue
        match.append(a)
    if not match:
        return None
    if len(match) == 1:
        return match[0]
    return pick_from_matches(author, match)
Exemple #9
0
def find_entity(author):
    """
    Look for an existing author record matching an import author dict.

    Candidates come from find_author(name) and, when the name contains
    ', ', also from find_author(flip_name(name)).  For a person, a candidate
    is kept only if 'birth_date' is present on both sides or on neither, and
    author_dates_match() agrees.

    :param dict author: import author; 'name' and 'entity_type' are required
    :return: the matching record, or None when nothing matches
    :raises AssertionError: if a candidate's 'type' is not '/type/author'
    :raises ValueError: propagated from pick_from_matches()
    """
    name = author['name']
    things = find_author(name)
    if author['entity_type'] != 'person':
        # Non-person entities: take the first hit without date checks.
        if not things:
            return None
        db_entity = things[0]
        assert db_entity['type'] == '/type/author'
        return db_entity
    if ', ' in name:
        things += find_author(flip_name(name))
    match = []
    seen = set()
    for a in things:
        key = a['key']
        if key in seen:
            continue
        seen.add(key)
        assert a['type'] == '/type/author'
        if 'birth_date' in author and 'birth_date' not in a:
            continue
        if 'birth_date' not in author and 'birth_date' in a:
            continue
        if not author_dates_match(author, a):
            continue
        match.append(a)
    if not match:
        return None
    if len(match) == 1:
        return match[0]
    try:
        return pick_from_matches(author, match)
    except ValueError:
        # show what could not be disambiguated before re-raising
        print('author:', author)
        print('match:', match)
        raise
Exemple #10
0
def find_entity(author):
    """
    Look for an existing author record matching an import author dict.

    Candidates come from find_author(name) and, when the name contains
    ', ', also from find_author(flip_name(name)).  For a person, a candidate
    is kept only if 'birth_date' is present on both sides or on neither, and
    author_dates_match() agrees.

    :param dict author: import author; 'name' and 'entity_type' are required
    :return: the matching record, or None when nothing matches
    :raises AssertionError: if a candidate's 'type' is not '/type/author'
    :raises ValueError: propagated from pick_from_matches()
    """
    name = author['name']
    things = find_author(name)
    if author['entity_type'] != 'person':
        # Non-person entities: take the first hit without date checks.
        if not things:
            return None
        db_entity = things[0]
        assert db_entity['type'] == '/type/author'
        return db_entity
    if ', ' in name:
        things += find_author(flip_name(name))
    match = []
    seen = set()
    for a in things:
        key = a['key']
        if key in seen:
            continue
        seen.add(key)
        assert a['type'] == '/type/author'
        if 'birth_date' in author and 'birth_date' not in a:
            continue
        if 'birth_date' not in author and 'birth_date' in a:
            continue
        if not author_dates_match(author, a):
            continue
        match.append(a)
    if not match:
        return None
    if len(match) == 1:
        return match[0]
    try:
        return pick_from_matches(author, match)
    except ValueError:
        # A single pre-formatted argument prints the same text under both
        # the Python 2 print statement and the print function.
        print('author: %s' % (author,))
        print('match: %s' % (match,))
        raise
Exemple #11
0
def find_entity(author):  # no direct DB calls
    """
    Look for an existing author record matching an import author dict.

    Candidates come from find_author(name) and, when the name contains
    ', ', also from find_author(flip_name(name)).  For a person (or when
    'entity_type' is absent) a candidate is kept only if 'birth_date' is
    present on both sides or on neither, and author_dates_match() agrees.

    :param dict author: import author; 'name' is required, 'entity_type'
        and 'birth_date' are optional
    :return: the matching record, or None when nothing matches
    :raises AssertionError: if a candidate is not of type '/type/author'
    :raises ValueError: propagated from pick_from_matches()
    """
    name = author['name']
    things = find_author(name)
    et = author.get('entity_type')
    if et and et != 'person':
        # Non-person entities: take the first hit without date checks.
        if not things:
            return None
        db_entity = things[0]
        assert db_entity['type']['key'] == '/type/author'
        return db_entity
    if ', ' in name:
        things += find_author(flip_name(name))
    match = []
    seen = set()
    for a in things:
        key = a['key']
        if key in seen:
            continue
        seen.add(key)
        assert a.type.key == '/type/author'
        if 'birth_date' in author and 'birth_date' not in a:
            continue
        if 'birth_date' not in author and 'birth_date' in a:
            continue
        if not author_dates_match(author, a):
            continue
        match.append(a)
    if not match:
        return None
    if len(match) == 1:
        return match[0]
    try:
        return pick_from_matches(author, match)
    except ValueError:
        # A single pre-formatted argument prints the same text under both
        # the Python 2 print statement and the print function.
        print('author: %s' % (author,))
        print('match: %s' % (match,))
        raise
Exemple #12
0
def east_in_by_statement(rec, author):
    """
    Check whether the author's name, with dots and ', ' removed, occurs in
    the record's by-statement.

    Returns False when rec lacks 'by_statement' or 'authors', and also when
    flip_name() leaves the name unchanged.

    :param dict rec: import source edition record
    :param dict author: import source author dict: {"name": "Some One"}
    :rtype: bool
    """
    if not ('by_statement' in rec and 'authors' in rec):
        return False
    original = author['name']
    flipped_plain = flip_name(original).replace('.', '')
    squeezed = original.replace('.', '').replace(', ', '')
    if squeezed == flipped_plain:
        # name was not flipped
        return False
    return squeezed in rec['by_statement']
Exemple #13
0
def do_flip(author):
    """
    Given an author import dict, flip its name in place
    i.e. Smith, John => John Smith

    The dict is only changed when 'personal_name' exists and equals 'name',
    the name contains ', ' with no comma after it, and neither 'i.e.' nor
    'i. e.' occurs in the name.

    :param dict author:
    """
    if 'personal_name' not in author:
        return
    original = author['name']
    if author['personal_name'] != original:
        return
    _surname, separator, remainder = original.partition(', ')
    if not separator:
        return
    # e.g: Harper, John Murdoch, 1845-
    if ',' in remainder:
        return
    for marker in ('i.e.', 'i. e.'):
        if marker in original:
            return
    author['name'] = author['personal_name'] = flip_name(original)
def read_subjects(rec):
    """
    Count the subject headings carried by a record's subject fields.

    Each field yielded by rec.read_fields(subject_fields) is decoded and
    dispatched on its tag: 600 -> 'person', 610 -> 'org', 611 -> 'event',
    630 -> 'work', 650 -> 'subject', 651 -> 'place'.  Regardless of tag,
    subfields y, v, z and x feed 'time', 'subject', 'place' and 'subject'.

    :param rec: record object providing read_fields() and decode_field()
    :rtype: dict
    :return: {category: {heading: count}} holding only the categories that
        were counted at least once
    """
    tally = defaultdict(lambda: defaultdict(int))
    for tag, raw in rec.read_fields(subject_fields):
        field = rec.decode_field(raw)
        aspects = find_aspects(field)

        if tag == '600':  # people
            pieces = []
            for code, text in field.get_subfields(['a', 'b', 'c', 'd']):
                if code == 'd':
                    # dates go in parentheses
                    text = '(' + text.strip('.() ') + ')'
                else:
                    text = text.strip(' /,;:')
                if code == 'a' and re_flip_name.match(text):
                    text = flip_name(text)
                pieces.append(text)
            person = remove_trailing_dot(' '.join(pieces)).strip()
            if person:
                tally['person'][person] += 1
        elif tag == '610':  # org
            # first the a-d subfields joined into one heading ...
            whole = ' '.join(field.get_subfield_values('abcd')).strip()
            if whole:
                whole = remove_trailing_dot(whole).strip()
                if whole:
                    whole = tidy_subject(whole)
                    if whole:
                        tally['org'][whole] += 1

            # ... then every 'a' subfield on its own
            for text in field.get_subfield_values('a'):
                text = text.strip()
                if not text:
                    continue
                text = remove_trailing_dot(text).strip()
                if not text:
                    continue
                text = tidy_subject(text)
                if text:
                    tally['org'][text] += 1
        elif tag == '611':  # event
            joined = ' '.join(
                text.strip()
                for code, text in field.get_all_subfields()
                if code not in 'vxyz'
            )
            event = tidy_subject(joined.strip())
            if event:
                tally['event'][event] += 1
        elif tag == '630':  # work
            for text in field.get_subfield_values(['a']):
                text = text.strip()
                if not text:
                    continue
                text = remove_trailing_dot(text).strip()
                if not text:
                    continue
                text = tidy_subject(text)
                if text:
                    tally['work'][text] += 1
        elif tag == '650':  # topical
            for text in field.get_subfield_values(['a']):
                topic = tidy_subject(text.strip() if text else text)
                if topic:
                    tally['subject'][topic] += 1
        elif tag == '651':  # geo
            for text in field.get_subfield_values(['a']):
                if text:
                    tally['place'][flip_place(text).strip()] += 1

        for text in field.get_subfield_values(['y']):
            text = text.strip()
            if text:
                tally['time'][remove_trailing_dot(text).strip()] += 1
        for text in field.get_subfield_values(['v']):
            text = text.strip()
            if text:
                text = remove_trailing_dot(text).strip()
            text = tidy_subject(text)
            if text:
                tally['subject'][text] += 1
        for text in field.get_subfield_values(['z']):
            text = text.strip()
            if text:
                tally['place'][flip_place(text).strip()] += 1
        for text in field.get_subfield_values(['x']):
            text = text.strip()
            if not text or (aspects and re_aspects.search(text)):
                continue
            text = tidy_subject(text)
            if text:
                tally['subject'][text] += 1

    return {category: dict(counts) for category, counts in tally.items()}
Exemple #15
0
def read_subjects(rec):
    """
    Count the subject headings carried by a record's subject fields.

    Each field yielded by rec.read_fields(subject_fields) is decoded and
    dispatched on its tag: 600 -> 'person', 610 -> 'org', 611 -> 'event',
    630 -> 'work', 650 -> 'subject', 651 -> 'place'.  Regardless of tag,
    subfields y, v, z and x feed 'time', 'subject', 'place' and 'subject'.

    :param rec: record object providing read_fields() and decode_field()
    :rtype: dict
    :return: {category: {heading: count}} holding only the categories that
        were counted at least once
    """
    tally = defaultdict(lambda: defaultdict(int))
    for tag, raw in rec.read_fields(subject_fields):
        field = rec.decode_field(raw)
        aspects = find_aspects(field)

        if tag == '600':  # people
            pieces = []
            for code, text in field.get_subfields(['a', 'b', 'c', 'd']):
                if code == 'd':
                    # dates go in parentheses
                    text = '(' + text.strip('.() ') + ')'
                else:
                    text = text.strip(' /,;:')
                if code == 'a' and re_flip_name.match(text):
                    text = flip_name(text)
                pieces.append(text)
            person = remove_trailing_dot(' '.join(pieces)).strip()
            if person:
                tally['person'][person] += 1
        elif tag == '610':  # org
            # first the a-d subfields joined into one heading ...
            whole = ' '.join(field.get_subfield_values('abcd')).strip()
            if whole:
                whole = remove_trailing_dot(whole).strip()
                if whole:
                    whole = tidy_subject(whole)
                    if whole:
                        tally['org'][whole] += 1

            # ... then every 'a' subfield on its own
            for text in field.get_subfield_values('a'):
                text = text.strip()
                if not text:
                    continue
                text = remove_trailing_dot(text).strip()
                if not text:
                    continue
                text = tidy_subject(text)
                if text:
                    tally['org'][text] += 1
        elif tag == '611':  # event
            joined = ' '.join(
                text.strip()
                for code, text in field.get_all_subfields()
                if code not in 'vxyz'
            )
            event = tidy_subject(joined.strip())
            if event:
                tally['event'][event] += 1
        elif tag == '630':  # work
            for text in field.get_subfield_values(['a']):
                text = text.strip()
                if not text:
                    continue
                text = remove_trailing_dot(text).strip()
                if not text:
                    continue
                text = tidy_subject(text)
                if text:
                    tally['work'][text] += 1
        elif tag == '650':  # topical
            for text in field.get_subfield_values(['a']):
                topic = tidy_subject(text.strip() if text else text)
                if topic:
                    tally['subject'][topic] += 1
        elif tag == '651':  # geo
            for text in field.get_subfield_values(['a']):
                if text:
                    tally['place'][flip_place(text).strip()] += 1

        for text in field.get_subfield_values(['y']):
            text = text.strip()
            if text:
                tally['time'][remove_trailing_dot(text).strip()] += 1
        for text in field.get_subfield_values(['v']):
            text = text.strip()
            if text:
                text = remove_trailing_dot(text).strip()
            text = tidy_subject(text)
            if text:
                tally['subject'][text] += 1
        for text in field.get_subfield_values(['z']):
            text = text.strip()
            if text:
                tally['place'][flip_place(text).strip()] += 1
        for text in field.get_subfield_values(['x']):
            text = text.strip()
            if not text or (aspects and re_aspects.search(text)):
                continue
            text = tidy_subject(text)
            if text:
                tally['subject'][text] += 1

    return {category: dict(counts) for category, counts in tally.items()}
Exemple #16
0
def test_flip_name():
    """flip_name() turns 'Surname, Forename' into 'Forename Surname'."""
    cases = (
        ('Smith, John.', 'John Smith'),
        ('Smith, J.', 'J. Smith'),
    )
    for given, expected in cases:
        assert flip_name(given) == expected
def find_subjects(marc_subjects):
    """
    Tally the subject headings found in MARC subject lines.

    Tags are dispatched as 600 -> person, 610 -> org, 611 -> event,
    630 -> work, 650 -> subject, 651 -> place; any other tag is printed.
    Regardless of tag, subfields y, v, z and x feed time, subject, place
    and subject.  Lines matching ``re_large_book`` are skipped after their
    aspects (if any) have been counted.

    :param marc_subjects: iterable of iterables of (tag, line) pairs
    :rtype: dict
    :return: {category: {heading: count}} for the non-empty categories among
        'person', 'time', 'place', 'subject', 'event', 'org' and 'work'
    """
    person = defaultdict(int)
    event = defaultdict(int)
    work = defaultdict(int)
    org = defaultdict(int)
    time = defaultdict(int)
    place = defaultdict(int)
    subject = defaultdict(int)
    for lines in marc_subjects:
        for tag, line in lines:
            aspects = find_aspects(line)
            if aspects:
                subject[aspects] += 1
            if re_large_book.search(line):
                continue
            if tag == '600':  # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    if k == 'd':
                        # dates go in parentheses
                        v = '(' + v.strip('.() ') + ')'
                    else:
                        v = v.strip(' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            v = 'Mao Zedong'
                        elif re_flip_name.match(v):
                            v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                if name != '':
                    person[name] += 1
            elif tag == '610':  # org
                # the a-d subfields joined into one heading ...
                v = ' '.join(get_subfield_values(line, 'abcd'))
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    org[v] += 1

                # ... and every 'a' subfield on its own
                for v in get_subfield_values(line, 'a'):
                    v = v.strip()
                    if v:
                        v = remove_trailing_dot(v).strip()
                    if v:
                        v = tidy_subject(v)
                    if v:
                        org[v] += 1
            elif tag == '611':  # event
                v = ' '.join(
                    j.strip() for i, j in get_all_subfields(line)
                    if i not in 'vxyz'
                )
                if v:
                    v = v.strip()
                v = tidy_subject(v)
                if v:
                    event[v] += 1
            elif tag == '630':  # work
                for v in get_subfield_values(line, ['a']):
                    v = v.strip()
                    if v:
                        v = remove_trailing_dot(v).strip()
                    if v:
                        v = tidy_subject(v)
                    if v:
                        work[v] += 1
            elif tag == '650':  # topical
                for v in get_subfield_values(line, ['a']):
                    if v:
                        v = v.strip()
                    v = tidy_subject(v)
                    if v:
                        subject[v] += 1
            elif tag == '651':  # geo
                for v in get_subfield_values(line, ['a']):
                    if v:
                        place[flip_place(v).strip()] += 1
            else:
                # A single pre-formatted argument prints the same text under
                # both the Python 2 print statement and the print function.
                print('other %s %s' % (tag, list(get_all_subfields(line))))

            cur = [
                v for k, v in get_all_subfields(line)
                if k == 'a' or v.strip('. ').lower() == 'fiction'
            ]

            # skip: 'Good, Sally (Fictitious character) in fiction'
            if (len(cur) > 1
                    and cur[-1].strip('. ').lower() == 'fiction'
                    and ')' not in cur[-2]):
                subject[flip_subject(cur[-2]) + ' in fiction'] += 1

            for v in get_subfield_values(line, ['y']):
                v = v.strip()
                if v:
                    time[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['v']):
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                v = tidy_subject(v)
                if v:
                    subject[v] += 1
            for v in get_subfield_values(line, ['z']):
                v = v.strip()
                if v:
                    place[flip_place(v).strip()] += 1
            for v in get_subfield_values(line, ['x']):
                v = v.strip()
                if not v:
                    continue
                if aspects and re_aspects.search(v):
                    continue
                v = tidy_subject(v)
                if v:
                    subject[v] += 1

    # Only categories that actually received a heading are returned.
    ret = {}
    if person:
        ret['person'] = dict(person)
    if time:
        ret['time'] = dict(time)
    if place:
        ret['place'] = dict(place)
    if subject:
        ret['subject'] = dict(subject)
    if event:
        ret['event'] = dict(event)
    if org:
        ret['org'] = dict(org)
    if work:
        ret['work'] = dict(work)
    return ret
Exemple #18
0
def find_subjects(marc_subjects):
    """
    Tally the subject headings found in MARC subject lines.

    Tags are dispatched as 600 -> person, 610 -> org, 611 -> event,
    630 -> work, 650 -> subject, 651 -> place; any other tag is printed.
    Regardless of tag, subfields y, v, z and x feed time, subject, place
    and subject.  Lines matching ``re_large_book`` are skipped after their
    aspects (if any) have been counted.

    :param marc_subjects: iterable of iterables of (tag, line) pairs
    :rtype: dict
    :return: {category: {heading: count}} for the non-empty categories among
        'person', 'time', 'place', 'subject', 'event', 'org' and 'work'
    """
    person = defaultdict(int)
    event = defaultdict(int)
    work = defaultdict(int)
    org = defaultdict(int)
    time = defaultdict(int)
    place = defaultdict(int)
    subject = defaultdict(int)
    for lines in marc_subjects:
        for tag, line in lines:
            aspects = find_aspects(line)
            if aspects:
                subject[aspects] += 1
            if re_large_book.search(line):
                continue
            if tag == '600':  # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    if k == 'd':
                        # dates go in parentheses
                        v = '(' + v.strip('.() ') + ')'
                    else:
                        v = v.strip(' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            v = 'Mao Zedong'
                        elif re_flip_name.match(v):
                            v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                if name != '':
                    person[name] += 1
            elif tag == '610':  # org
                # the a-d subfields joined into one heading ...
                v = ' '.join(get_subfield_values(line, 'abcd'))
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    org[v] += 1

                # ... and every 'a' subfield on its own
                for v in get_subfield_values(line, 'a'):
                    v = v.strip()
                    if v:
                        v = remove_trailing_dot(v).strip()
                    if v:
                        v = tidy_subject(v)
                    if v:
                        org[v] += 1
            elif tag == '611':  # event
                v = ' '.join(
                    j.strip() for i, j in get_all_subfields(line)
                    if i not in 'vxyz'
                )
                if v:
                    v = v.strip()
                v = tidy_subject(v)
                if v:
                    event[v] += 1
            elif tag == '630':  # work
                for v in get_subfield_values(line, ['a']):
                    v = v.strip()
                    if v:
                        v = remove_trailing_dot(v).strip()
                    if v:
                        v = tidy_subject(v)
                    if v:
                        work[v] += 1
            elif tag == '650':  # topical
                for v in get_subfield_values(line, ['a']):
                    if v:
                        v = v.strip()
                    v = tidy_subject(v)
                    if v:
                        subject[v] += 1
            elif tag == '651':  # geo
                for v in get_subfield_values(line, ['a']):
                    if v:
                        place[flip_place(v).strip()] += 1
            else:
                # A single pre-formatted argument prints the same text under
                # both the Python 2 print statement and the print function.
                print('other %s %s' % (tag, list(get_all_subfields(line))))

            cur = [
                v for k, v in get_all_subfields(line)
                if k == 'a' or v.strip('. ').lower() == 'fiction'
            ]

            # skip: 'Good, Sally (Fictitious character) in fiction'
            if (len(cur) > 1
                    and cur[-1].strip('. ').lower() == 'fiction'
                    and ')' not in cur[-2]):
                subject[flip_subject(cur[-2]) + ' in fiction'] += 1

            for v in get_subfield_values(line, ['y']):
                v = v.strip()
                if v:
                    time[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['v']):
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                v = tidy_subject(v)
                if v:
                    subject[v] += 1
            for v in get_subfield_values(line, ['z']):
                v = v.strip()
                if v:
                    place[flip_place(v).strip()] += 1
            for v in get_subfield_values(line, ['x']):
                v = v.strip()
                if not v:
                    continue
                if aspects and re_aspects.search(v):
                    continue
                v = tidy_subject(v)
                if v:
                    subject[v] += 1

    # Only categories that actually received a heading are returned.
    ret = {}
    if person:
        ret['person'] = dict(person)
    if time:
        ret['time'] = dict(time)
    if place:
        ret['place'] = dict(place)
    if subject:
        ret['subject'] = dict(subject)
    if event:
        ret['event'] = dict(event)
    if org:
        ret['org'] = dict(org)
    if work:
        ret['work'] = dict(work)
    return ret
Exemple #19
0
def find_subjects(w, marc_subjects=None):
    """
    Tally people, times, places and subjects from MARC subject lines.

    Tags are dispatched as 600 -> people, 650 -> subjects, 651 -> places.
    Regardless of tag, subfields y, v, z and x feed times, subjects, places
    and subjects.  Lines matching ``re_large_book`` are skipped.

    :param w: passed to get_marc_subjects() when ``marc_subjects`` is falsy
    :param marc_subjects: optional iterable of iterables of (tag, line) pairs
    :rtype: dict
    :return: {category: {heading: count}} for the non-empty categories among
        'people', 'times', 'places' and 'subjects'
    """
    people = defaultdict(int)
    when = defaultdict(int)
    place = defaultdict(int)
    subject = defaultdict(int)
    for lines in marc_subjects or get_marc_subjects(w):
        for tag, line in lines:
            if re_large_book.match(line):
                continue
            if tag == '600':  # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    if k == 'd':
                        # dates go in parentheses
                        v = '(' + v.strip('.() ') + ')'
                    else:
                        v = v.strip(' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            v = 'Mao Zedong'
                        elif re_flip_name.match(v):
                            v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                if name != '':
                    people[name] += 1
            elif tag == '650':  # topical
                for v in get_subfield_values(line, ['a']):
                    if v:
                        v = v.strip()
                    v = tidy_subject(v)
                    if v:
                        subject[v] += 1
            elif tag == '651':  # geo
                for v in get_subfield_values(line, ['a']):
                    if v:
                        place[flip_place(v).strip()] += 1

            for v in get_subfield_values(line, ['y']):
                v = v.strip()
                if v:
                    when[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['v']):
                v = v.strip()
                if v:
                    subject[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['z']):
                v = v.strip()
                if v:
                    place[flip_place(v).strip()] += 1
            for v in get_subfield_values(line, ['x']):
                v = v.strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    subject[v] += 1

    # Only categories that actually received a heading are returned.
    ret = {}
    if people:
        ret['people'] = dict(people)
    if when:
        ret['times'] = dict(when)
    if place:
        ret['places'] = dict(place)
    if subject:
        ret['subjects'] = dict(subject)
    return ret