def read_author_person(line):
    """Build a person-author dict from a MARC field line.

    Returns None when the field carries neither a name ($a) nor a
    title ($c); otherwise a dict with name parts, entity_type and any
    dates parsed from $d.
    """
    contents = get_contents(line, ['a', 'b', 'c', 'd'])
    if 'a' not in contents and 'c' not in contents:
        return None  # should at least be a name or title
    name_parts = [v.strip(' /,;:') for v in get_subfield_values(line, ['a', 'b', 'c'])]
    # $d seeds the dict with birth/death/date entries when present
    author = pick_first_date(contents['d']) if 'd' in contents else {}
    death = author.get('death_date')
    if death and re_number_dot.search(death):
        author['death_date'] = death[:-1]  # drop trailing dot after a year
    author['name'] = ' '.join(name_parts)
    author['entity_type'] = 'person'
    for code, key in (('a', 'personal_name'), ('b', 'numeration'), ('c', 'title')):
        if code in contents:
            author[key] = ' '.join(v.strip(' /,;:') for v in contents[code])
    if 'q' in contents:
        author['fuller_name'] = ' '.join(contents['q'])
    return author
def test_lookup4(): fields = (('a', 'Forbes, George'), ('d', '1849-1936.')) found = name_lookup(fields) dates = pick_first_date(v for k, v in fields if k == 'd') match = look_for_match(found, dates, False) for k, v in match.iteritems(): print k, v match = pick_from_match(match) pprint(match)
def read_line(line, name):
    """Return (marc_name, flipped, (birth, death, date)) for a MARC line
    whose $a/$b/$c name equals *name* either directly or flipped.

    Returns None when the line is empty, lacks a $d (date) subfield, or
    the name does not match.
    """
    if not line or '\x1fd' not in line:
        return
    subfields = tuple((k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd'))
    marc_name = ' '.join(v for k, v in subfields if k in 'abc')
    flipped = flip_name(marc_name)
    if name not in (marc_name, flipped):
        return
    parsed = pick_first_date(v for k, v in subfields if k in 'abcd')
    dates = tuple(parsed.get(key) for key in ('birth_date', 'death_date', 'date'))
    return marc_name, flipped, dates
def test_lookup3(): line = '00\x1faJohn,\x1fcof Paris,\x1fd1240?-1306.\x1e' print fmt_line(get_subfields(line, 'abcd')) fields = tuple((k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd')) print fields found = name_lookup(fields) # print [i for i in found if 'Paris' in i[0]] # found = [(u'John of Paris', [u'Christian philosophers', u'Dominicans', u'Roman Catholic theologians', u'13th-century Latin writers', u'1255 births', u'1306 deaths'], u'john of paris', None)] dates = pick_first_date(v for k, v in fields if k == 'd') match = look_for_match(found, dates, False) match = pick_from_match(match) pprint(match)
def test_lookup(): line = '00\x1faEgeria,\x1fd4th/5th cent.\x1e' # count=3 wiki = 'Egeria (pilgrim)' print fmt_line(get_subfields(line, 'abcd')) fields = tuple((k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd')) print fields found = name_lookup(fields) print found dates = pick_first_date(v for k, v in fields if k == 'd') assert dates.items()[0] != ('date', '') print dates print print look_for_match(found, dates, True)
def test_lookup2(): line = '00\x1faRichard,\x1fcof St. Victor,\x1fdd. 1173.\x1e' print fmt_line(get_subfields(line, 'abcd')) fields = tuple((k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd')) print fields found = name_lookup(fields) dates = pick_first_date(v for k, v in fields if k == 'd') assert dates.items()[0] != ('date', '') print dates print match = look_for_match(found, dates, False) pprint(match) print match = pick_from_match(match) pprint(match)
def parse_person(line):
    """Parse a MARC person field into (marc_orig, author_dict).

    Returns the raw subfields plus a dict holding name parts, sort key
    and any dates parsed from $d; the dict is empty when the field has
    neither a name ($a) nor a title ($c).
    """
    contents = get_person_content(line)
    # NOTE(review): the trailing comma makes marc_orig a 1-tuple wrapping
    # the subfield list; kept as-is because callers may depend on that
    # shape — confirm before removing.
    marc_orig = list(get_all_subfields(line)),
    if not ('a' in contents or 'c' in contents):
        return marc_orig, {}
    # $d seeds the dict with birth/death/date entries when present
    author = pick_first_date(contents['d']) if 'd' in contents else {}
    for tag, field in (('b', 'numeration'), ('c', 'title')):
        if tag in contents:
            author[field] = ' '.join(x.strip(' /,;:') for x in contents[tag])
    if 'a' in contents:
        name = ' '.join(x.strip(' /,;:') for x in contents['a'])
        name = remove_trailing_dot(name)
        m = re_marc_name.match(name)
        if m:
            # "Family, Given" -> split parts and store the flipped form
            author['family_name'] = m.group(1)
            author['given_names'] = m.group(2)
            author['name'] = m.group(2) + ' ' + m.group(1)
        else:
            author['name'] = name
    name_subfields = get_subfield_values(line, ['a', 'b', 'c'])
    author['sort'] = ' '.join(v.strip(' /,;:') for v in name_subfields)
    if 'q' in contents:
        if len(contents['q']) != 1:
            print(marc_orig)
        assert len(contents['q']) == 1
        q = strip_q(contents['q'][0])
        # BUG FIX: original tested `'given_names' in authors` — `authors`
        # is undefined here (NameError); `author` was clearly intended.
        if 'given_names' in author:
            # $q spells the fuller form; it must be consistent with $a
            assert (initials(q) == initials(author['given_names'])
                    or q.startswith(author['given_names']))
            author['given_names'] = q
    return marc_orig, author
def db_marc_lookup():
    """Scan marc_authors.bz2, match MARC author fields against Wikipedia
    lookup results by name and birth/death categories, and write each
    unambiguous match to the file 'matches3'.

    NOTE(review): mixes py2 idioms (dates.items()[0], match.keys()[0])
    with py3-style print calls — presumably run under Python 2 with a
    print_function future import; confirm.
    """
    verbose = False
    c = get_cursor()
    articles = set()  # not populated here; leftover from an earlier revision
    count = 0             # records read
    count_with_date = 0   # records with a usable $d date
    t0 = time()
    match_count = 0
    total = 3596802  # known record count, used only for the ETA estimate
    prev_fields = None
    fh = open('matches3', 'w')
    for line in bz2.BZ2File('marc_authors.bz2'):
        count+=1
        # each line is a repr of a MARC field; eval restores it
        # (acceptable only because the file is generated locally)
        line = eval(line)
        line = strip_brackets(line)
        if count % 5000 == 0:
            # periodic progress report: current record, match rate, ETA
            t1 = time() - t0
            rec_per_sec = count / t1
            time_left = (total - count) / rec_per_sec
            print(fmt_line(get_subfields(line, 'abcd')))
            print(count, count_with_date, match_count, "%.2f%% %.2f mins left" % (float(match_count * 100.0) / float(count_with_date), time_left / 60))
        fields = tuple((k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd'))
        # skip consecutive duplicate records
        if prev_fields == fields:
            continue
        prev_fields = fields
        dates = pick_first_date(v for k, v in fields if k == 'd')
        if dates.items()[0] == ('date', ''):
            continue  # no parseable date: cannot match on dates
        count_with_date += 1
        if verbose:
            print(fmt_line(get_subfields(line, 'abcd')))
            print(dates)
        # nobility/clergy titles in $c get reported instead of matched
        is_noble_or_clergy = any(re_noble_or_clergy.search(v) \
            for v in get_subfield_values(line, 'c'))
        found = name_lookup(c, fields)
        if not found:
            continue
        if is_noble_or_clergy:
            print('noble or clergy not found:')
            print(fmt_line(get_subfields(line, 'abcd')))
            print()
            continue
        match = {}
        seen = set()  # dedupe candidate article names
        for name, cats, match_name, pd in found:
            if name in seen:
                continue
            seen.add(name)
            cats = eval(cats)  # categories stored as a repr'd list
            # only consider articles carrying birth/death year categories
            if not any(cat.endswith(' births') or cat.endswith(' deaths') for cat in cats):
                continue
            dm = date_match(dates, cats)
            if dm:
                match[name] = (cats, match_name)
            if not verbose:
                continue
            # verbose: show each candidate and its birth/death categories
            print((name, match_name))
            print("cats =", cats)
            print(('match' if dm else 'no match'))
            for field in ['birth', 'death']:
                print(field + 's:', [i[:-(len(field)+2)] for i in cats if i.endswith(' %ss' % field)], end=' ')
            print()
        if verbose:
            print('---')
        if not match:
            continue
        if is_noble_or_clergy:
            # NOTE(review): unreachable — the earlier is_noble_or_clergy
            # branch has already continued; kept as in the original.
            print('noble or clergy not found:')
            print(fmt_line(get_subfields(line, 'abcd')))
            print(found)
            print()
            continue
        match_count+=1
        # articles.add(match.keys()[0])
        if len(match) != 1:
            match = pick_from_match(match)
        if len(match) != 1:
            # still ambiguous after tie-breaking: report instead of writing
            print(count, match_count)
            print(fmt_line(get_subfields(line, 'abcd')))
            more_than_one_match(match)
        else:
            #print (list(get_subfields(line, 'abcd')), match.keys()[0])
            print((match.keys()[0], fields), file=fh)
        continue
        # print len(articles), match[0][0], fmt_line(get_subfields(line, 'abcd'))
        assert len(match) == 1  # NOTE(review): dead code after `continue` above
    print(match_count)
    fh.close()
def db_marc_lookup():
    """Match MARC author records from marc_authors.bz2 against Wikipedia
    name lookups, printing progress, failed date matches and ambiguous
    results as it goes."""
    cursor = get_cursor()
    articles = set()  # retained from an earlier revision; not populated here
    record_no = 0
    started = time()
    matched = 0
    total = 3596802  # known record count, used only for the ETA estimate
    for raw in bz2.BZ2File('marc_authors.bz2'):
        record_no += 1
        if record_no % 1000 == 0:
            # periodic progress line with match rate and time remaining
            elapsed = time() - started
            remaining = (total - record_no) / (record_no / elapsed)
            print(
                record_no, matched,
                "%.2f%% %.2f mins left" % ((matched * 100) / record_no,
                                           remaining / 60))
        # each line is a repr of the MARC field; eval restores it
        marc = strip_brackets(eval(raw))
        fields = [(k, v.strip(' /,;:')) for k, v in get_subfields(marc, 'abcd')]
        dates = pick_first_date(v for k, v in fields if k == 'd')
        if dates.items()[0] == ('date', ''):
            continue  # no parseable date: skip the record
        found = name_lookup(cursor, fields)
        if not found:
            continue
        candidates = {}
        seen_names = set()  # dedupe candidate article names
        for name, cats, match_name, pd in found:
            if name in seen_names:
                continue
            seen_names.add(name)
            cats = eval(cats)  # categories stored as a repr'd list
            # only articles with birth/death year categories are usable
            if not any(c.endswith(' births') or c.endswith(' deaths')
                       for c in cats):
                continue
            date_ok = date_match(dates, cats)
            if date_ok:
                candidates[name] = (cats, match_name)
                continue
            # debug output for candidates whose dates did not line up
            print((name, match_name))
            print("cats =", cats)
            print('match' if date_ok else 'no match')
            for kind in ('birth', 'death'):
                print(kind + 's:',
                      [c[:-(len(kind) + 2)] for c in cats
                       if c.endswith(' %ss' % kind)],
                      end=' ')
            print()
        if not candidates:
            continue
        matched += 1
        if len(candidates) != 1:
            # ambiguous: dump every candidate with its Wikipedia URL
            print(record_no, matched)
            print(fmt_line(get_subfields(marc, 'abcd')))
            for name, (cats, match_name) in candidates.items():
                print(name, cats, match_name)
                print("http://en.wikipedia.org/wiki/" + name.replace(' ', '_'))
            print()
            continue
        assert len(candidates) == 1
    print(matched)
def db_marc_lookup():
    # Match MARC author records from marc_authors.bz2 against Wikipedia
    # name lookups; Python 2 print-statement variant of the same routine.
    c = get_cursor()
    articles = set()  # not populated here; leftover accumulator
    count = 0
    t0 = time()
    match_count = 0
    total = 3596802  # known record count, used only for the ETA estimate
    for line in bz2.BZ2File('marc_authors.bz2'):
        count+=1
        if count % 1000 == 0:
            # periodic progress line with match rate and time remaining
            t1 = time() - t0
            rec_per_sec = count / t1
            time_left = (total - count) / rec_per_sec
            print count, match_count, "%.2f%% %.2f mins left" % ((match_count * 100) / count, time_left / 60)
        # each line is a repr of a MARC field; eval restores it
        # (acceptable only because the file is generated locally)
        line = eval(line)
        line = strip_brackets(line)
        fields = [(k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd')]
        dates = pick_first_date(v for k, v in fields if k == 'd')
        if dates.items()[0] == ('date', ''):
            continue  # no parseable date: skip the record
        found = name_lookup(c, fields)
        if not found:
            continue
        match = {}
        seen = set()  # dedupe candidate article names
        # print fmt_line(get_subfields(line, 'abcd'))
        # print dates
        for name, cats, match_name, pd in found:
            if name in seen:
                continue
            seen.add(name)
            cats = eval(cats)  # categories stored as a repr'd list
            # only articles with birth/death year categories are usable
            if not any(cat.endswith(' births') or cat.endswith(' deaths') for cat in cats):
                continue
            dm = date_match(dates, cats)
            if dm:
                match[name] = (cats, match_name)
                continue
            # debug output for candidates whose dates did not line up
            print (name, match_name)
            print "cats =", cats
            print ('match' if dm else 'no match')
            for field in ['birth', 'death']:
                print field + 's:', [i[:-(len(field)+2)] for i in cats if i.endswith(' %ss' % field)],
            print
            # print '---'
        if not match:
            continue
        match_count+=1
        # articles.add(match.keys()[0])
        if len(match) != 1:
            # ambiguous: dump every candidate with its Wikipedia URL
            print count, match_count
            print fmt_line(get_subfields(line, 'abcd'))
            for name, (cats, match_name) in match.items():
                print name, cats, match_name
                print "http://en.wikipedia.org/wiki/" + name.replace(' ', '_')
            print
            continue
        # print len(articles), match[0][0], fmt_line(get_subfields(line, 'abcd'))
        assert len(match) == 1
    print match_count
def db_marc_lookup():
    # Match pre-parsed MARC author fields (marc_authors2, one repr'd
    # (fields, author_count) tuple per line) against Wikipedia lookups;
    # single matches go to 'matches', ambiguous ones to
    # 'more_than_one_match'. Python 2 variant.
    verbose = False
    articles = set()  # not populated here; leftover accumulator
    count = 0             # records read
    count_with_date = 0   # records with a usable $d date
    t0 = time()
    match_count = 0
    total = 3596802  # known record count, used only for the ETA estimate
    prev_fields = None
    fh = open('matches', 'w')
    bad = codecs.open('more_than_one_match', 'w', 'utf8')
    for line in open('/1/edward/wikipedia/marc_authors2'):
        count += 1
        # (author_count, line) = eval(line)
        (line, author_count) = eval(line)  # each line is a repr'd tuple
        # line = strip_brackets(line)
        if count % 5000 == 0:
            # periodic progress report: current record, match rate, ETA
            t1 = time() - t0
            rec_per_sec = count / t1
            time_left = (total - count) / rec_per_sec
            #print fmt_line(get_subfields(line, 'abcd'))
            # print list(get_subfields(line, 'abcd'))
            print line
            print count, count_with_date, match_count, "%.2f%% %.2f mins left" % (
                float(match_count * 100.0) / float(count_with_date), time_left / 60)
        fields = tuple((k, v.strip(' /,;:')) for k, v in line)
        # skip consecutive duplicate records
        if prev_fields == fields:
            continue
        prev_fields = fields
        dates = pick_first_date(v for k, v in fields if k == 'd')
        if dates.items()[0] == ('date', ''):
            continue  # no parseable date: cannot match on dates
        count_with_date += 1
        if verbose:
            print line
            print dates
        # nobility/clergy titles in $c get reported instead of matched
        is_noble_or_clergy = any(k == 'c' and re_noble_or_clergy.search(v)
                                 for k, v in fields)
        found = name_lookup(fields)
        if not found:
            continue
        if is_noble_or_clergy:
            print 'noble or clergy not found:', line
            print
            continue
        match = look_for_match(found, dates, verbose)
        if not match:
            continue
        if is_noble_or_clergy:
            # NOTE(review): unreachable — the earlier is_noble_or_clergy
            # branch has already continued; kept as in the original.
            print 'noble or clergy not found:'
            print fmt_line(line)
            print found
            print
            continue
        match_count += 1
        # articles.add(match.keys()[0])
        if len(match) != 1:
            match = pick_from_match(match)
        if len(match) != 1:
            # still ambiguous after tie-breaking: log to the 'bad' file
            print >> bad, "\n" + fmt_line(line)
            for i in more_than_one_match(match):
                print >> bad, i
        else:
            #print (list(get_subfields(line, 'abcd')), match.keys()[0])
            cats = match.values()[0]['cats']
            exact = match.values()[0]['exact_dates']
            # keep only the date-bearing categories for the output record
            dc = [i for i in cats if any(i.endswith(j) for j in date_cats)]
            print >> fh, (match.keys()[0], fields, author_count, dc, exact,
                          'Living people' in cats)
    print match_count
    fh.close()
def db_marc_lookup():
    # Near-duplicate revision of the marc_authors2 matcher: reads repr'd
    # (fields, author_count) tuples, matches them against Wikipedia
    # lookups, writes single matches to 'matches' and ambiguous ones to
    # 'more_than_one_match'. Python 2 variant.
    verbose = False
    articles = set()  # not populated here; leftover accumulator
    count = 0             # records read
    count_with_date = 0   # records with a usable $d date
    t0 = time()
    match_count = 0
    total = 3596802  # known record count, used only for the ETA estimate
    prev_fields = None
    fh = open('matches', 'w')
    bad = codecs.open('more_than_one_match', 'w', 'utf8')
    for line in open('/1/edward/wikipedia/marc_authors2'):
        count+=1
        # (author_count, line) = eval(line)
        (line, author_count) = eval(line)  # each line is a repr'd tuple
        # line = strip_brackets(line)
        if count % 5000 == 0:
            # periodic progress report: current record, match rate, ETA
            t1 = time() - t0
            rec_per_sec = count / t1
            time_left = (total - count) / rec_per_sec
            #print fmt_line(get_subfields(line, 'abcd'))
            # print list(get_subfields(line, 'abcd'))
            print line
            print count, count_with_date, match_count, "%.2f%% %.2f mins left" % (float(match_count * 100.0) / float(count_with_date), time_left / 60)
        fields = tuple((k, v.strip(' /,;:')) for k, v in line)
        # skip consecutive duplicate records
        if prev_fields == fields:
            continue
        prev_fields = fields
        dates = pick_first_date(v for k, v in fields if k == 'd')
        if dates.items()[0] == ('date', ''):
            continue  # no parseable date: cannot match on dates
        count_with_date += 1
        if verbose:
            print line
            print dates
        # nobility/clergy titles in $c get reported instead of matched
        is_noble_or_clergy = any(k =='c' and re_noble_or_clergy.search(v)
                                 for k, v in fields)
        found = name_lookup(fields)
        if not found:
            continue
        if is_noble_or_clergy:
            print 'noble or clergy not found:', line
            print
            continue
        match = look_for_match(found, dates, verbose)
        if not match:
            continue
        if is_noble_or_clergy:
            # NOTE(review): unreachable — the earlier is_noble_or_clergy
            # branch has already continued; kept as in the original.
            print 'noble or clergy not found:'
            print fmt_line(line)
            print found
            print
            continue
        match_count+=1
        # articles.add(match.keys()[0])
        if len(match) != 1:
            match = pick_from_match(match)
        if len(match) != 1:
            # still ambiguous after tie-breaking: log to the 'bad' file
            print >> bad, "\n" + fmt_line(line)
            for i in more_than_one_match(match):
                print >> bad, i
        else:
            #print (list(get_subfields(line, 'abcd')), match.keys()[0])
            cats = match.values()[0]['cats']
            exact = match.values()[0]['exact_dates']
            # keep only the date-bearing categories for the output record
            dc = [i for i in cats if any(i.endswith(j) for j in date_cats)]
            print >> fh, (match.keys()[0], fields, author_count, dc, exact,
                          'Living people' in cats)
    print match_count
    fh.close()
from __future__ import print_function
from catalog.utils import pick_first_date
import web
import re
import sys
import codecs

# emit UTF-8 regardless of the attached terminal's encoding
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

re_marc_name = re.compile('^(.*), (.*)$')
re_end_dot = re.compile('[^ ][^ ]\.$', re.UNICODE)


def flip_name(name):
    """Turn 'Family, Given' into 'Given Family'.

    A trailing dot is dropped when it follows a full word
    ("Smith, John." -> "Smith, John") but kept after an initial
    ("Smith, J." stays intact).
    """
    if re_end_dot.search(name):
        name = name[:-1]
    parts = re_marc_name.match(name)
    return parts.group(2) + ' ' + parts.group(1)


# Each line of matches4 is a repr of (wikipedia_article, marc_fields).
for wikipedia, marc in (eval(row) for row in open("matches4")):
    dates = pick_first_date(v for k, v in marc if k == 'd')
    full_name = ' '.join(v for k, v in marc if k in 'abc')
    print(full_name)
    if ', ' in full_name:
        print(flip_name(full_name))
    print(dates)
from catalog.utils import pick_first_date import web, re, sys, codecs sys.stdout = codecs.getwriter('utf-8')(sys.stdout) re_marc_name = re.compile('^(.*), (.*)$') re_end_dot = re.compile('[^ ][^ ]\.$', re.UNICODE) def flip_name(name): # strip end dots like this: "Smith, John." but not like this: "Smith, J." m = re_end_dot.search(name) if m: name = name[:-1] m = re_marc_name.match(name) return m.group(2) + ' ' + m.group(1) for wikipedia, marc in (eval(i) for i in open("matches4")): dates = pick_first_date(v for k, v in marc if k == 'd') name = ' '.join(v for k, v in marc if k in 'abc') print name if ', ' in name: print flip_name(name) print dates