コード例 #1
0
ファイル: parse.py プロジェクト: artmedlar/openlibrary
def read_author_person(f):
    f.remove_brackets()
    author = {}
    contents = f.get_contents(['a', 'b', 'c', 'd', 'e'])
    if 'a' not in contents and 'c' not in contents:
        return # should at least be a name or title
    name = [v.strip(' /,;:') for v in f.get_subfield_values(['a', 'b', 'c'])]
    if 'd' in contents:
        author = pick_first_date(strip_foc(d).strip(',') for d in contents['d'])
        if 'death_date' in author and author['death_date']:
            death_date = author['death_date']
            if re_number_dot.search(death_date):
                author['death_date'] = death_date[:-1]

    author['name'] = ' '.join(name)
    author['entity_type'] = 'person'
    subfields = [
        ('a', 'personal_name'),
        ('b', 'numeration'),
        ('c', 'title'),
        ('e', 'role')
    ]
    for subfield, field_name in subfields:
        if subfield in contents:
            author[field_name] = remove_trailing_dot(' '.join([x.strip(' /,;:') for x in contents[subfield]]))
    if 'q' in contents:
        author['fuller_name'] = ' '.join(contents['q'])
    for f in 'name', 'personal_name':
        author[f] = remove_trailing_dot(strip_foc(author[f]))
    return author
コード例 #2
0
ファイル: parse.py プロジェクト: artmedlar/openlibrary
def read_work_titles(rec):
    found = []
    tag_240 = rec.get_fields('240')
    if tag_240:
        for f in tag_240:
            title = f.get_subfield_values(['a', 'm', 'n', 'p', 'r'])
            found.append(remove_trailing_dot(' '.join(title).strip(',')))

    tag_130 = rec.get_fields('130')
    if tag_130:
        for f in tag_130:
            title = ' '.join(v for k, v in f.get_all_subfields() if k.islower() and k != 'n')
            found.append(remove_trailing_dot(title.strip(',')))

    return remove_duplicates(found)
コード例 #3
0
ファイル: parse.py プロジェクト: mikemaehr/openlibrary
def read_title(rec):
    fields = rec.get_fields('245')
    if not fields:
        fields = rec.get_fields('740')
    if not fields:
        raise NoTitle

#   example MARC record with multiple titles:
#   http://openlibrary.org/show-marc/marc_western_washington_univ/wwu_bibs.mrc_revrev.mrc:299505697:862
    contents = fields[0].get_contents(['a', 'b', 'c', 'h', 'p'])

    b_and_p = [i for i in fields[0].get_subfield_values(['b', 'p']) if i]

    ret = {}
    title = None

#   MARC record with 245a missing:
#   http://openlibrary.org/show-marc/marc_western_washington_univ/wwu_bibs.mrc_revrev.mrc:516779055:1304
    if 'a' in contents:
        title = ' '.join(x.strip(' /,;:') for x in contents['a'])
    elif b_and_p:
        title = b_and_p.pop(0).strip(' /,;:')
# talis_openlibrary_contribution/talis-openlibrary-contribution.mrc:183427199:255
    if title in ('See.', 'See also.'):
        raise SeeAlsoAsTitle
# talis_openlibrary_contribution/talis-openlibrary-contribution.mrc:5654086:483
# scrapbooksofmoun03tupp
    if title is None:
        subfields = list(fields[0].get_all_subfields())
        title = ' '.join(v for k, v in subfields)
        if not title: # ia:scrapbooksofmoun03tupp
            raise NoTitle
    ret['title'] = remove_trailing_dot(title)
    if b_and_p:
        ret["subtitle"] = ' : '.join(remove_trailing_dot(x.strip(' /,;:')) for x in b_and_p)
    if 'c' in contents:
        ret["by_statement"] = remove_trailing_dot(' '.join(contents['c']))
    if 'h' in contents:
        h = ' '.join(contents['h']).strip(' ')
        m = re_bracket_field.match(h)
        if m:
            h = m.group(1)
        assert h
        ret["physical_format"] = h
    return ret
コード例 #4
0
ファイル: parse.py プロジェクト: randomecho/openlibrary
def read_contributions(rec):
    want = dict((
        ('700', 'abcdeq'),
        ('710', 'ab'),
        ('711', 'acdn'),
        ('720', 'a'),
    ))

    ret = {}
    skip_authors = set()
    for tag in ('100', '110', '111'):
        fields = rec.get_fields(tag)
        for f in fields:
            skip_authors.add(tuple(f.get_all_subfields()))

    if not skip_authors:
        for tag, f in rec.read_fields(['700', '710', '711', '720']):
            f = rec.decode_field(f)
            if tag in ('700', '720'):
                if 'authors' not in ret or last_name_in_245c(rec, f):
                    ret.setdefault('authors', []).append(read_author_person(f))
                    skip_authors.add(tuple(f.get_subfields(want[tag])))
                continue
            elif 'authors' in ret:
                break
            if tag == '710':
                name = [v.strip(' /,;:') for v in f.get_subfield_values(want[tag])]
                ret['authors'] = [{ 'entity_type': 'org', 'name': remove_trailing_dot(' '.join(name))}]
                skip_authors.add(tuple(f.get_subfields(want[tag])))
                break
            if tag == '711':
                name = [v.strip(' /,;:') for v in f.get_subfield_values(want[tag])]
                ret['authors'] = [{ 'entity_type': 'event', 'name': remove_trailing_dot(' '.join(name))}]
                skip_authors.add(tuple(f.get_subfields(want[tag])))
                break

    for tag, f in rec.read_fields(['700', '710', '711', '720']):
        sub = want[tag]
        cur = tuple(rec.decode_field(f).get_subfields(sub))
        if tuple(cur) in skip_authors:
            continue
        name = remove_trailing_dot(' '.join(strip_foc(i[1]) for i in cur).strip(','))
        ret.setdefault('contributions', []).append(name) # need to add flip_name

    return ret
コード例 #5
0
def flip_place(s):
    s = remove_trailing_dot(s)
    # Whitechapel (London, England)
    # East End (London, England)
    # Whitechapel (Londres, Inglaterra)
    if re_paren.search(s):
        return s
    m = re_place_comma.match(s)
    return m.group(2) + ' ' + m.group(1) if m else s
コード例 #6
0
ファイル: test_utils.py プロジェクト: hornc/openlibrary-1
def test_remove_trailing_dot():
    data = [
        ('Test', 'Test'),
        ('Test.', 'Test'),
        ('Test J.', 'Test J.'),
        ('Test...', 'Test...')
    ]
    for input, expect in data:
        output = remove_trailing_dot(input)
        assert output == expect
コード例 #7
0
ファイル: work_subject.py プロジェクト: sribanta/openlibrary
def tidy_subject(s):
    s = s.strip()
    if len(s) < 2:
        print 'short subject:', `s`
    else:
        s = s[0].upper() + s[1:]
    m = re_etc.search(s)
    if m:
        return m.group(1)
    s = remove_trailing_dot(s)
    m = re_fictitious_character.match(s)
    return m.group(2) + ' ' + m.group(1) + m.group(3) if m else s
コード例 #8
0
ファイル: parse.py プロジェクト: artmedlar/openlibrary
def read_authors(rec):
    count = 0
    fields_100 = rec.get_fields('100')
    fields_110 = rec.get_fields('110')
    fields_111 = rec.get_fields('111')
    count = len(fields_100) + len(fields_110) + len(fields_111)
    if count == 0:
        return
    # talis_openlibrary_contribution/talis-openlibrary-contribution.mrc:11601515:773 has two authors:
    # 100 1  $aDowling, James Walter Frederick.
    # 111 2  $aConference on Civil Engineering Problems Overseas.

    found = [read_author_person(f) for f in fields_100]
    for f in fields_110:
        f.remove_brackets()
        name = [v.strip(' /,;:') for v in f.get_subfield_values(['a', 'b'])]
        found.append({ 'entity_type': 'org', 'name': remove_trailing_dot(' '.join(name))})
    for f in fields_111:
        f.remove_brackets()
        name = [v.strip(' /,;:') for v in f.get_subfield_values(['a', 'c', 'd', 'n'])]
        found.append({ 'entity_type': 'event', 'name': remove_trailing_dot(' '.join(name))})
    if found:
        return found
コード例 #9
0
def read_subjects(rec):
    subjects = defaultdict(lambda: defaultdict(int))
    for tag, field in rec.read_fields(subject_fields):
        f = rec.decode_field(field)
        aspects = find_aspects(f)

        if tag == '600': # people
            name_and_date = []
            for k, v in f.get_subfields(['a', 'b', 'c', 'd']):
                v = '(' + v.strip('.() ') + ')' if k == 'd' else v.strip(' /,;:')
                if k == 'a':
                    m = re_flip_name.match(v)
                    if m:
                        v = flip_name(v)
                name_and_date.append(v)
            name = remove_trailing_dot(' '.join(name_and_date)).strip()
            if name != '':
                subjects['person'][name] += 1
        elif tag == '610': # org
            v = ' '.join(f.get_subfield_values('abcd'))
            v = v.strip()
            if v:
                v = remove_trailing_dot(v).strip()
            if v:
                v = tidy_subject(v)
            if v:
                subjects['org'][v] += 1

            for v in f.get_subfield_values('a'):
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    subjects['org'][v] += 1
        elif tag == '611': # event
            v = ' '.join(j.strip() for i, j in f.get_all_subfields() if i not in 'vxyz')
            if v:
                v = v.strip()
            v = tidy_subject(v)
            if v:
                subjects['event'][v] += 1
        elif tag == '630': # work
            for v in f.get_subfield_values(['a']):
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    subjects['work'][v] += 1
        elif tag == '650': # topical
            for v in f.get_subfield_values(['a']):
                if v:
                    v = v.strip()
                v = tidy_subject(v)
                if v:
                    subjects['subject'][v] += 1
        elif tag == '651': # geo
            for v in f.get_subfield_values(['a']):
                if v:
                    subjects['place'][flip_place(v).strip()] += 1

        for v in f.get_subfield_values(['y']):
            v = v.strip()
            if v:
                subjects['time'][remove_trailing_dot(v).strip()] += 1
        for v in f.get_subfield_values(['v']):
            v = v.strip()
            if v:
                v = remove_trailing_dot(v).strip()
            v = tidy_subject(v)
            if v:
                subjects['subject'][v] += 1
        for v in f.get_subfield_values(['z']):
            v = v.strip()
            if v:
                subjects['place'][flip_place(v).strip()] += 1
        for v in f.get_subfield_values(['x']):
            v = v.strip()
            if not v:
                continue
            if aspects and re_aspects.search(v):
                continue
            v = tidy_subject(v)
            if v:
                subjects['subject'][v] += 1

    return dict((k, dict(v)) for k, v in subjects.items())
コード例 #10
0
def tidy_subfield(v):
    return remove_trailing_dot(v.strip(' /,;:'))
コード例 #11
0
def find_subjects(marc_subjects):
    person = defaultdict(int)
    event = defaultdict(int)
    work = defaultdict(int)
    org = defaultdict(int)
    time = defaultdict(int)
    place = defaultdict(int)
    subject = defaultdict(int)
    #fiction = False
    for lines in marc_subjects:
        for tag, line in lines:
            aspects = find_aspects(line)
            if aspects:
                subject[aspects] += 1
            if re_large_book.search(line):
                continue
            if tag == '600': # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    v = '(' + v.strip('.() ') + ')' if k == 'd' else v.strip(' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            v = 'Mao Zedong'
                        else:
                            m = re_flip_name.match(v)
                            if m:
                                v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                if name != '':
                    person[name] += 1
            elif tag == '610': # org
                v = ' '.join(get_subfield_values(line, 'abcd'))
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    org[v] += 1

                for v in get_subfield_values(line, 'a'):
                    v = v.strip()
                    if v:
                        v = remove_trailing_dot(v).strip()
                    if v:
                        v = tidy_subject(v)
                    if v:
                        org[v] += 1
            elif tag == '611': # event
                v = ' '.join(j.strip() for i, j in get_all_subfields(line) if i not in 'vxyz')
                if v:
                    v = v.strip()
                v = tidy_subject(v)
                if v:
                    event[v] += 1
            elif tag == '630': # work
                for v in get_subfield_values(line, ['a']):
                    v = v.strip()
                    if v:
                        v = remove_trailing_dot(v).strip()
                    if v:
                        v = tidy_subject(v)
                    if v:
                        work[v] += 1
            elif tag == '650': # topical
                for v in get_subfield_values(line, ['a']):
                    if v:
                        v = v.strip()
                    v = tidy_subject(v)
                    if v:
                        subject[v] += 1
            elif tag == '651': # geo
                for v in get_subfield_values(line, ['a']):
                    if v:
                        place[flip_place(v).strip()] += 1
            else:
                print 'other', tag, list(get_all_subfields(line))

            cur = [v for k, v in get_all_subfields(line) if k=='a' or v.strip('. ').lower() == 'fiction']

            # skip: 'Good, Sally (Fictitious character) in fiction'
            if len(cur) > 1 and cur[-1].strip('. ').lower() == 'fiction' and ')' not in cur[-2]:
                subject[flip_subject(cur[-2]) + ' in fiction'] += 1

            for v in get_subfield_values(line, ['y']):
                v = v.strip()
                if v:
                    time[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['v']):
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                v = tidy_subject(v)
                if v:
                    subject[v] += 1
            for v in get_subfield_values(line, ['z']):
                v = v.strip()
                if v:
                    place[flip_place(v).strip()] += 1
            for v in get_subfield_values(line, ['x']):
                v = v.strip()
                if not v:
                    continue
                if aspects and re_aspects.search(v):
                    continue
                v = tidy_subject(v)
                if v:
                    subject[v] += 1

            v_and_x = get_subfield_values(line, ['v', 'x'])
            #if 'Fiction' in v_and_x or 'Fiction.' in v_and_x:
            #    fiction = True
    #if 'Fiction' in subject:
    #    del subject['Fiction']
    ret = {}
    if person:
        ret['person'] = dict(person)
    if time:
        ret['time'] = dict(time)
    if place:
        ret['place'] = dict(place)
    if subject:
        ret['subject'] = dict(subject)
    if event:
        ret['event'] = dict(event)
    if org:
        ret['org'] = dict(org)
    if work:
        ret['work'] = dict(work)
    return ret
コード例 #12
0
def find_subjects(marc_subjects):
    person = defaultdict(int)
    event = defaultdict(int)
    work = defaultdict(int)
    org = defaultdict(int)
    time = defaultdict(int)
    place = defaultdict(int)
    subject = defaultdict(int)
    #fiction = False
    for lines in marc_subjects:
        for tag, line in lines:
            aspects = find_aspects(line)
            if aspects:
                subject[aspects] += 1
            if re_large_book.search(line):
                continue
            if tag == '600': # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    v = '(' + v.strip('.() ') + ')' if k == 'd' else v.strip(' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            v = 'Mao Zedong'
                        else:
                            m = re_flip_name.match(v)
                            if m:
                                v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                if name != '':
                    person[name] += 1
            elif tag == '610': # org
                v = ' '.join(get_subfield_values(line, 'abcd'))
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    org[v] += 1

                for v in get_subfield_values(line, 'a'):
                    v = v.strip()
                    if v:
                        v = remove_trailing_dot(v).strip()
                    if v:
                        v = tidy_subject(v)
                    if v:
                        org[v] += 1
            elif tag == '611': # event
                v = ' '.join(j.strip() for i, j in get_all_subfields(line) if i not in 'vxyz')
                if v:
                    v = v.strip()
                v = tidy_subject(v)
                if v:
                    event[v] += 1
            elif tag == '630': # work
                for v in get_subfield_values(line, ['a']):
                    v = v.strip()
                    if v:
                        v = remove_trailing_dot(v).strip()
                    if v:
                        v = tidy_subject(v)
                    if v:
                        work[v] += 1
            elif tag == '650': # topical
                for v in get_subfield_values(line, ['a']):
                    if v:
                        v = v.strip()
                    v = tidy_subject(v)
                    if v:
                        subject[v] += 1
            elif tag == '651': # geo
                for v in get_subfield_values(line, ['a']):
                    if v:
                        place[flip_place(v).strip()] += 1
            else:
                print('other', tag, list(get_all_subfields(line)))

            cur = [v for k, v in get_all_subfields(line) if k=='a' or v.strip('. ').lower() == 'fiction']

            # skip: 'Good, Sally (Fictitious character) in fiction'
            if len(cur) > 1 and cur[-1].strip('. ').lower() == 'fiction' and ')' not in cur[-2]:
                subject[flip_subject(cur[-2]) + ' in fiction'] += 1

            for v in get_subfield_values(line, ['y']):
                v = v.strip()
                if v:
                    time[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['v']):
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                v = tidy_subject(v)
                if v:
                    subject[v] += 1
            for v in get_subfield_values(line, ['z']):
                v = v.strip()
                if v:
                    place[flip_place(v).strip()] += 1
            for v in get_subfield_values(line, ['x']):
                v = v.strip()
                if not v:
                    continue
                if aspects and re_aspects.search(v):
                    continue
                v = tidy_subject(v)
                if v:
                    subject[v] += 1

            v_and_x = get_subfield_values(line, ['v', 'x'])
            #if 'Fiction' in v_and_x or 'Fiction.' in v_and_x:
            #    fiction = True
    #if 'Fiction' in subject:
    #    del subject['Fiction']
    ret = {}
    if person:
        ret['person'] = dict(person)
    if time:
        ret['time'] = dict(time)
    if place:
        ret['place'] = dict(place)
    if subject:
        ret['subject'] = dict(subject)
    if event:
        ret['event'] = dict(event)
    if org:
        ret['org'] = dict(org)
    if work:
        ret['work'] = dict(work)
    return ret
コード例 #13
0
ファイル: parse.py プロジェクト: user404d/openlibrary
def read_contributions(rec):
    """ Reads contributors from a MARC record
    and use values in 7xx fields to set 'authors'
    if the 1xx fields do not exist. Otherwise set
    additional 'contributions'

    :param (MarcBinary | MarcXml) rec:
    :rtype: dict
    """
    want = dict((
        ('700', 'abcdeq'),
        ('710', 'ab'),
        ('711', 'acdn'),
        ('720', 'a'),
    ))

    ret = {}
    skip_authors = set()
    for tag in ('100', '110', '111'):
        fields = rec.get_fields(tag)
        for f in fields:
            skip_authors.add(tuple(f.get_all_subfields()))

    if not skip_authors:
        for tag, f in rec.read_fields(['700', '710', '711', '720']):
            f = rec.decode_field(f)
            if tag in ('700', '720'):
                if 'authors' not in ret or last_name_in_245c(rec, f):
                    ret.setdefault('authors', []).append(read_author_person(f))
                    skip_authors.add(tuple(f.get_subfields(want[tag])))
                continue
            elif 'authors' in ret:
                break
            if tag == '710':
                name = [
                    v.strip(' /,;:') for v in f.get_subfield_values(want[tag])
                ]
                ret['authors'] = [{
                    'entity_type': 'org',
                    'name': remove_trailing_dot(' '.join(name))
                }]
                skip_authors.add(tuple(f.get_subfields(want[tag])))
                break
            if tag == '711':
                name = [
                    v.strip(' /,;:') for v in f.get_subfield_values(want[tag])
                ]
                ret['authors'] = [{
                    'entity_type': 'event',
                    'name': remove_trailing_dot(' '.join(name))
                }]
                skip_authors.add(tuple(f.get_subfields(want[tag])))
                break

    for tag, f in rec.read_fields(['700', '710', '711', '720']):
        sub = want[tag]
        cur = tuple(rec.decode_field(f).get_subfields(sub))
        if tuple(cur) in skip_authors:
            continue
        name = remove_trailing_dot(' '.join(strip_foc(i[1])
                                            for i in cur).strip(','))
        ret.setdefault('contributions',
                       []).append(name)  # need to add flip_name

    return ret
コード例 #14
0
ファイル: marc_subject.py プロジェクト: yzou/openlibrary
def read_subjects(rec):
    subjects = defaultdict(lambda: defaultdict(int))
    for tag, field in rec.read_fields(subject_fields):
        f = rec.decode_field(field)
        aspects = find_aspects(f)

        if tag == '600': # people
            name_and_date = []
            for k, v in f.get_subfields(['a', 'b', 'c', 'd']):
                v = '(' + v.strip('.() ') + ')' if k == 'd' else v.strip(' /,;:')
                if k == 'a':
                    m = re_flip_name.match(v)
                    if m:
                        v = flip_name(v)
                name_and_date.append(v)
            name = remove_trailing_dot(' '.join(name_and_date)).strip()
            if name != '':
                subjects['person'][name] += 1
        elif tag == '610': # org
            v = ' '.join(f.get_subfield_values('abcd'))
            v = v.strip()
            if v:
                v = remove_trailing_dot(v).strip()
            if v:
                v = tidy_subject(v)
            if v:
                subjects['org'][v] += 1

            for v in f.get_subfield_values('a'):
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    subjects['org'][v] += 1
        elif tag == '611': # event
            v = ' '.join(j.strip() for i, j in f.get_all_subfields() if i not in 'vxyz')
            if v:
                v = v.strip()
            v = tidy_subject(v)
            if v:
                subjects['event'][v] += 1
        elif tag == '630': # work
            for v in f.get_subfield_values(['a']):
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    subjects['work'][v] += 1
        elif tag == '650': # topical
            for v in f.get_subfield_values(['a']):
                if v:
                    v = v.strip()
                v = tidy_subject(v)
                if v:
                    subjects['subject'][v] += 1
        elif tag == '651': # geo
            for v in f.get_subfield_values(['a']):
                if v:
                    subjects['place'][flip_place(v).strip()] += 1

        for v in f.get_subfield_values(['y']):
            v = v.strip()
            if v:
                subjects['time'][remove_trailing_dot(v).strip()] += 1
        for v in f.get_subfield_values(['v']):
            v = v.strip()
            if v:
                v = remove_trailing_dot(v).strip()
            v = tidy_subject(v)
            if v:
                subjects['subject'][v] += 1
        for v in f.get_subfield_values(['z']):
            v = v.strip()
            if v:
                subjects['place'][flip_place(v).strip()] += 1
        for v in f.get_subfield_values(['x']):
            v = v.strip()
            if not v:
                continue
            if aspects and re_aspects.search(v):
                continue
            v = tidy_subject(v)
            if v:
                subjects['subject'][v] += 1

    return dict((k, dict(v)) for k, v in subjects.items())
コード例 #15
0
ファイル: work_subject.py プロジェクト: sribanta/openlibrary
def find_subjects(w, marc_subjects=None):
    people = defaultdict(int)
    genres = defaultdict(int)
    when = defaultdict(int)
    place = defaultdict(int)
    subject = defaultdict(int)
    #fiction = False
    for lines in marc_subjects or get_marc_subjects(w):
        for tag, line in lines:
            if re_large_book.match(line):
                continue
            if tag == '600': # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    v = '(' + v.strip('.() ') + ')' if k == 'd' else v.strip(' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            v = 'Mao Zedong'
                        else:
                            m = re_flip_name.match(v)
                            if m:
                                v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                if name != '':
                    people[name] += 1
            if tag == '650':
                for v in get_subfield_values(line, ['a']):
                    if v:
                        v = v.strip()
                    v = tidy_subject(v)
                    if v:
                        subject[v] += 1
            if tag == '651':
                for v in get_subfield_values(line, ['a']):
                    if v:
                        place[flip_place(v).strip()] += 1

            for v in get_subfield_values(line, ['y']):
                v = v.strip()
                if v:
                    when[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['v']):
                v = v.strip()
                if v:
                    subject[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['z']):
                v = v.strip()
                if v:
                    place[flip_place(v).strip()] += 1
            for v in get_subfield_values(line, ['x']):
                v = v.strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    subject[v] += 1

            v_and_x = get_subfield_values(line, ['v', 'x'])
            #if 'Fiction' in v_and_x or 'Fiction.' in v_and_x:
            #    fiction = True
    #if 'Fiction' in subject:
    #    del subject['Fiction']
    ret = {}
    if people:
        ret['people'] = dict(people)
    if when:
        ret['times'] = dict(when)
    if place:
        ret['places'] = dict(place)
    if subject:
        ret['subjects'] = dict(subject)
    return ret
コード例 #16
0
ファイル: identify_people.py プロジェクト: zgruza/openlibrary
def tidy_subfield(v):
    return remove_trailing_dot(v.strip(' /,;:'))