def east_in_by_statement(rec):
    """
    Report whether the first author's name, in its stored (unflipped) order,
    occurs in the record's by statement.

    :param dict rec: edition record; 'by_statement' and 'authors' are read
    :rtype: bool
    :return: False when 'by_statement' is missing, when 'authors' is missing
        or empty, or when flip_name() leaves the name unchanged; otherwise
        whether the normalised name occurs in rec['by_statement']
    """
    if 'by_statement' not in rec:
        return False
    # An empty author list previously raised IndexError on the [0] lookup.
    if 'authors' not in rec or not rec['authors']:
        return False
    name = rec['authors'][0]['name']
    flipped = flip_name(name)
    # Dots and the ', ' separator are removed outright before comparing,
    # e.g. 'Mao, Zedong' becomes 'MaoZedong'.
    name = name.replace('.', '').replace(', ', '')
    if name == flipped.replace('.', ''):
        # flip_name() did not change the name
        return False
    return rec['by_statement'].find(name) != -1
def tidy_name(s):
    """
    Turn a catalogue-style name into display order.

    None yields an HTML placeholder, 'Mao, Zedong' is special-cased, a
    re_baron match is rebuilt from its two groups, and anything else has a
    ' Baron ...' tail or a trailing ' Sir' cut off before flip_name().
    """
    if s is None:
        return '<em>name missing</em>'
    if s == 'Mao, Zedong':
        return 'Mao Zedong'

    baron = re_baron.match(s)
    if baron:
        return baron.group(2) + ' ' + baron.group(1)

    baron_at = s.find(' Baron ')
    if baron_at != -1:
        # keep only what precedes the title
        s = s[:baron_at]
    elif s.endswith(' Sir'):
        s = s[:-len(' Sir')]
    return flip_name(s)
def find_entity(author):
    """
    Find the existing author record that matches an import author dict.

    Candidate keys come from find_author() for the name as given and, when
    the name contains ', ', for flip_name(name) as well.  Each candidate is
    loaded with withKey(); a redirect is followed once and deleted records
    are skipped.

    :param dict author: reads 'name' and, when present, 'entity_type' and
        'birth_date'; also passed to author_dates_match()
    :return: the matching record, or None when nothing matches
    :raises AssertionError: a candidate is not of type '/type/author'
    :raises ValueError: re-raised from pick_from_matches()
    """
    name = author['name']
    things = find_author(name)
    # A missing 'entity_type' is treated as 'person' instead of raising
    # KeyError.
    if author.get('entity_type', 'person') != 'person':
        if not things:
            return None
        db_entity = withKey(things[0])
        if db_entity['type']['key'] == '/type/redirect':
            db_entity = withKey(db_entity['location'])
        assert db_entity['type']['key'] == '/type/author'
        return db_entity
    if ', ' in name:
        things += find_author(flip_name(name))
    match = []
    seen = set()
    for key in things:
        if key in seen:
            continue
        seen.add(key)
        db_entity = withKey(key)
        if db_entity['type']['key'] == '/type/redirect':
            key = db_entity['location']
            if key in seen:
                continue
            seen.add(key)
            db_entity = withKey(key)
        if db_entity['type']['key'] == '/type/delete':
            continue
        try:
            assert db_entity['type']['key'] == '/type/author'
        except Exception:
            # Single-argument print() emits the same text on Python 2 and 3.
            print('%s %s %s' % (name, key, db_entity))
            raise
        # Skip when only one side carries a birth date.
        if ('birth_date' in author) != ('birth_date' in db_entity):
            continue
        if not author_dates_match(author, db_entity):
            continue
        match.append(db_entity)
    if not match:
        return None
    if len(match) == 1:
        return match[0]
    try:
        return pick_from_matches(author, match)
    except ValueError:
        print('author: %s' % (author,))
        print('match: %s' % (match,))
        raise
def do_flip(author):
    """
    Flip an author's name in place.

    Nothing happens unless 'personal_name' is present and equal to 'name',
    the name contains ', ' with no further comma after it, and neither
    'i.e.' nor 'i. e.' appears in it.  Otherwise both keys are set to
    flip_name(name).
    """
    if 'personal_name' not in author:
        return
    if author['personal_name'] != author['name']:
        return

    full_name = author['name']
    _surname, separator, remainder = full_name.partition(', ')
    if not separator:
        return
    # e.g: Harper, John Murdoch, 1845-
    if ',' in remainder:
        return
    if 'i.e.' in full_name or 'i. e.' in full_name:
        return

    flipped = flip_name(full_name)
    author['name'] = flipped
    author['personal_name'] = flipped
def build_person_object(p, marc_alt):
    """
    Build a person dict from the subfields of a personal-name field.

    :param p: re-iterable sequence of (subfield code, value) pairs; codes
        'a', 'b', 'c' and 'd' are read
    :param marc_alt: iterable of alternative fields, stored after p under
        'marc'
    :rtype: dict
    :return: whatever pick_first_date() returns for the 'd' values (or an
        empty dict), extended with 'name', 'sort' and 'marc', plus
        'enumeration' when a 'b' subfield exists and 'title' when the
        joined 'c' text is non-empty
    """
    ab = [(k, v.strip(' /,;:')) for k, v in p if k in 'ab']
    has_b = any(k == 'b' for k, v in p)

    orig_name = ' '.join(v for k, v in ab)
    c = ' '.join(v for k, v in p if k == 'c')
    name = flip_name(orig_name)
    # Slice rather than index so an empty name cannot raise IndexError.
    if name[:1].isdigit():
        name = orig_name
    else:
        of_count = c.count('of ')
        if of_count == 1 and 'of the ' not in c and 'Emperor of ' not in c:
            name += ' ' + c[c.find('of '):]
        elif ' ' not in name and of_count > 1:
            name += ', ' + c
        elif c.endswith((' of', ' de')) and any(
                k == 'a' and ', ' in v for k, v in p):
            # Both endings now require the comma guard.  Previously
            # `A or B and C` let ' of' skip it, and the slices below then
            # ran with find() == -1 and mangled the name.
            before, sep, after = orig_name.partition(', ')
            if sep:
                c += ' ' + before
                name = after + ', ' + c

    person = {}
    d = [v for k, v in p if k == 'd']
    if d:
        person = pick_first_date(d)
    person['name'] = name
    person['sort'] = orig_name

    if has_b:
        person['enumeration'] = ' '.join(v for k, v in p if k == 'b')

    if c:
        person['title'] = c
    person['marc'] = [p] + list(marc_alt)

    return person
def build_person_object(p, marc_alt):
    """
    Build a person dict from the subfields of a personal-name field.

    :param p: re-iterable sequence of (subfield code, value) pairs; codes
        'a', 'b', 'c' and 'd' are read
    :param marc_alt: iterable of alternative fields, stored after p under
        'marc'
    :rtype: dict
    :return: whatever pick_first_date() returns for the 'd' values (or an
        empty dict), extended with 'name', 'sort' and 'marc', plus
        'enumeration' when a 'b' subfield exists and 'title' when the
        joined 'c' text is non-empty
    """
    ab = [(k, v.strip(' /,;:')) for k, v in p if k in 'ab']
    has_b = any(k == 'b' for k, v in p)

    orig_name = ' '.join(v for k, v in ab)
    c = ' '.join(v for k, v in p if k == 'c')
    name = flip_name(orig_name)
    # Slice rather than index so an empty name cannot raise IndexError.
    if name[:1].isdigit():
        name = orig_name
    else:
        of_count = c.count('of ')
        if of_count == 1 and 'of the ' not in c and 'Emperor of ' not in c:
            name += ' ' + c[c.find('of '):]
        elif ' ' not in name and of_count > 1:
            name += ', ' + c
        elif c.endswith((' of', ' de')) and any(
                k == 'a' and ', ' in v for k, v in p):
            # Both endings now require the comma guard.  Previously
            # `A or B and C` let ' of' skip it, and the slices below then
            # ran with find() == -1 and mangled the name.
            before, sep, after = orig_name.partition(', ')
            if sep:
                c += ' ' + before
                name = after + ', ' + c

    person = {}
    d = [v for k, v in p if k == 'd']
    if d:
        person = pick_first_date(d)
    person['name'] = name
    person['sort'] = orig_name

    if has_b:
        person['enumeration'] = ' '.join(v for k, v in p if k == 'b')

    if c:
        person['title'] = c
    person['marc'] = [p] + list(marc_alt)

    return person
def find_entity(author):
    """
    Looks for an existing Author record in OL by name
    and returns it if found.

    Candidates come from find_author() for the name as given and, when the
    name contains ', ', for flip_name(name) as well.  A candidate is kept
    only when both sides agree on having a birth date and
    author_dates_match() accepts the pair.

    :param dict author: Author import dict {"name": "Some One"}
    :rtype: dict|None
    :return: Existing Author record, if one is found
    :raises AssertionError: a candidate is not of type '/type/author'
    """
    name = author['name']
    things = find_author(name)
    et = author.get('entity_type')
    if et and et != 'person':
        # Non-person entities: take the first hit as-is.
        if not things:
            return None
        db_entity = things[0]
        assert db_entity['type']['key'] == '/type/author'
        return db_entity
    if ', ' in name:
        things += find_author(flip_name(name))
    match = []
    seen = set()
    for a in things:
        key = a['key']
        if key in seen:
            continue
        seen.add(key)
        assert a.type.key == '/type/author'
        # Skip when only one side carries a birth date.
        if ('birth_date' in author) != ('birth_date' in a):
            continue
        if not author_dates_match(author, a):
            continue
        match.append(a)
    if not match:
        return None
    if len(match) == 1:
        return match[0]
    return pick_from_matches(author, match)
def find_entity(author):
    """
    Find the existing author record that matches an import author dict.

    Candidates come from find_author() for the name as given and, when the
    name contains ', ', for flip_name(name) as well.

    :param dict author: reads 'name' and, when present, 'entity_type' and
        'birth_date'; also passed to author_dates_match()
    :return: the matching record, or None when nothing matches
    :raises AssertionError: a candidate's 'type' is not '/type/author'
    :raises ValueError: re-raised from pick_from_matches()
    """
    name = author['name']
    things = find_author(name)
    # A missing 'entity_type' is treated as 'person' instead of raising
    # KeyError.
    if author.get('entity_type', 'person') != 'person':
        if not things:
            return None
        db_entity = things[0]
        # Redirects are not followed here; the first hit must already be
        # an author record.
        assert db_entity['type'] == '/type/author'
        return db_entity
    if ', ' in name:
        things += find_author(flip_name(name))
    match = []
    seen = set()
    for a in things:
        key = a['key']
        if key in seen:
            continue
        seen.add(key)
        assert a['type'] == '/type/author'
        # Skip when only one side carries a birth date.
        if ('birth_date' in author) != ('birth_date' in a):
            continue
        if not author_dates_match(author, a):
            continue
        match.append(a)
    if not match:
        return None
    if len(match) == 1:
        return match[0]
    try:
        return pick_from_matches(author, match)
    except ValueError:
        print('author:', author)
        print('match:', match)
        raise
def find_entity(author):
    """
    Find the existing author record that matches an import author dict.

    Candidates come from find_author() for the name as given and, when the
    name contains ', ', for flip_name(name) as well.

    :param dict author: reads 'name' and, when present, 'entity_type' and
        'birth_date'; also passed to author_dates_match()
    :return: the matching record, or None when nothing matches
    :raises AssertionError: a candidate's 'type' is not '/type/author'
    :raises ValueError: re-raised from pick_from_matches()
    """
    name = author['name']
    things = find_author(name)
    # A missing 'entity_type' is treated as 'person' instead of raising
    # KeyError.
    if author.get('entity_type', 'person') != 'person':
        if not things:
            return None
        db_entity = things[0]
        # Redirects are not followed here; the first hit must already be
        # an author record.
        assert db_entity['type'] == '/type/author'
        return db_entity
    if ', ' in name:
        things += find_author(flip_name(name))
    match = []
    seen = set()
    for a in things:
        key = a['key']
        if key in seen:
            continue
        seen.add(key)
        assert a['type'] == '/type/author'
        # Skip when only one side carries a birth date.
        if ('birth_date' in author) != ('birth_date' in a):
            continue
        if not author_dates_match(author, a):
            continue
        match.append(a)
    if not match:
        return None
    if len(match) == 1:
        return match[0]
    try:
        return pick_from_matches(author, match)
    except ValueError:
        # Single-argument print() emits the same text on Python 2 and 3.
        print('author: %s' % (author,))
        print('match: %s' % (match,))
        raise
def find_entity(author):
    """
    Find the existing author record that matches an import author dict.

    Makes no direct DB calls: candidates are the records returned by
    find_author() for the name as given and, when the name contains ', ',
    for flip_name(name) as well.

    :param dict author: reads 'name' and, when present, 'entity_type' and
        'birth_date'; also passed to author_dates_match()
    :return: the matching record, or None when nothing matches
    :raises AssertionError: a candidate is not of type '/type/author'
    :raises ValueError: re-raised from pick_from_matches()
    """
    name = author['name']
    things = find_author(name)
    et = author.get('entity_type')
    if et and et != 'person':
        # Non-person entities: take the first hit as-is.
        if not things:
            return None
        db_entity = things[0]
        assert db_entity['type']['key'] == '/type/author'
        return db_entity
    if ', ' in name:
        things += find_author(flip_name(name))
    match = []
    seen = set()
    for a in things:
        key = a['key']
        if key in seen:
            continue
        seen.add(key)
        assert a.type.key == '/type/author'
        # Skip when only one side carries a birth date.
        if ('birth_date' in author) != ('birth_date' in a):
            continue
        if not author_dates_match(author, a):
            continue
        match.append(a)
    if not match:
        return None
    if len(match) == 1:
        return match[0]
    try:
        return pick_from_matches(author, match)
    except ValueError:
        # Single-argument print() emits the same text on Python 2 and 3.
        print('author: %s' % (author,))
        print('match: %s' % (match,))
        raise
def east_in_by_statement(rec, author):
    """
    Check whether the author's name, in the order it is stored, shows up in
    the record's by statement.

    The name has its dots and its ', ' separator removed before the search.
    If that leaves the same text as the dot-less flip_name() result, the
    flip changed nothing and the answer is False.

    :param dict rec: import source edition record
    :param dict author: import source author dict: {"name": "Some One"}
    :rtype: bool
    :return: False when rec lacks 'by_statement' or 'authors', or when the
        name was not flipped; otherwise whether the squashed name occurs in
        rec['by_statement']
    """
    if 'by_statement' not in rec or 'authors' not in rec:
        return False

    stored = author['name']
    flipped_no_dots = flip_name(stored).replace('.', '')
    squashed = stored.replace('.', '').replace(', ', '')
    if squashed == flipped_no_dots:
        # name was not flipped
        return False
    return rec['by_statement'].find(squashed) >= 0
def do_flip(author):
    """
    Flip the name of an author import dict in place,
    i.e. Smith, John => John Smith

    The dict is left alone unless 'personal_name' exists and equals 'name',
    the name holds ', ' with no later comma, and it contains neither
    'i.e.' nor 'i. e.'.

    :param dict author:
    """
    if 'personal_name' not in author or author['personal_name'] != author['name']:
        return

    current = author['name']
    comma_at = current.find(', ')
    if comma_at < 0:
        return
    # e.g: Harper, John Murdoch, 1845-
    if current.find(',', comma_at + 1) >= 0:
        return
    for marker in ('i.e.', 'i. e.'):
        if current.find(marker) >= 0:
            return

    swapped = flip_name(current)
    author['name'] = swapped
    author['personal_name'] = swapped
def read_subjects(rec):
    """
    Tally the subject headings of a record, grouped by category.

    Every field yielded by rec.read_fields(subject_fields) is decoded and
    counted under a category chosen by its tag: '600' -> 'person',
    '610' -> 'org', '611' -> 'event', '630' -> 'work', '650' -> 'subject',
    '651' -> 'place'.  Whatever the tag, subfield 'y' is counted under
    'time', 'z' under 'place', and 'v' and 'x' under 'subject'.

    NOTE(review): this block was recovered from whitespace-collapsed text.
    The nesting of a few `if v:` / `tidy_subject(v)` pairs and of the
    trailing y/v/z/x loops could not be read off the source -- confirm
    against version control.

    :param rec: object providing read_fields() and decode_field()
    :rtype: dict
    :return: {category: {heading: count}}; only categories that received at
        least one heading are present
    """
    subjects = defaultdict(lambda: defaultdict(int))
    for tag, field in rec.read_fields(subject_fields):
        f = rec.decode_field(field)
        aspects = find_aspects(f)

        if tag == '600':  # people
            name_and_date = []
            for k, v in f.get_subfields(['a', 'b', 'c', 'd']):
                # 'd' values are wrapped in parentheses; the rest only lose
                # surrounding punctuation
                v = '(' + v.strip('.() ') + ')' if k == 'd' else v.strip(' /,;:')
                if k == 'a':
                    m = re_flip_name.match(v)
                    if m:
                        v = flip_name(v)
                name_and_date.append(v)
            name = remove_trailing_dot(' '.join(name_and_date)).strip()
            if name != '':
                subjects['person'][name] += 1
        elif tag == '610':  # org
            # counted once for the joined a/b/c/d text ...
            v = ' '.join(f.get_subfield_values('abcd'))
            v = v.strip()
            if v:
                v = remove_trailing_dot(v).strip()
            if v:
                v = tidy_subject(v)
            if v:
                subjects['org'][v] += 1

            # ... and once more for each 'a' value on its own
            for v in f.get_subfield_values('a'):
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    subjects['org'][v] += 1
        elif tag == '611':  # event
            # every subfield except v, x, y and z contributes to the name
            v = ' '.join(j.strip() for i, j in f.get_all_subfields() if i not in 'vxyz')
            if v:
                v = v.strip()
            v = tidy_subject(v)
            if v:
                subjects['event'][v] += 1
        elif tag == '630':  # work
            for v in f.get_subfield_values(['a']):
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    subjects['work'][v] += 1
        elif tag == '650':  # topical
            for v in f.get_subfield_values(['a']):
                if v:
                    v = v.strip()
                v = tidy_subject(v)
                if v:
                    subjects['subject'][v] += 1
        elif tag == '651':  # geo
            for v in f.get_subfield_values(['a']):
                if v:
                    subjects['place'][flip_place(v).strip()] += 1

        # subdivision subfields, handled the same way for every tag
        for v in f.get_subfield_values(['y']):
            v = v.strip()
            if v:
                subjects['time'][remove_trailing_dot(v).strip()] += 1
        for v in f.get_subfield_values(['v']):
            v = v.strip()
            if v:
                v = remove_trailing_dot(v).strip()
            v = tidy_subject(v)
            if v:
                subjects['subject'][v] += 1
        for v in f.get_subfield_values(['z']):
            v = v.strip()
            if v:
                subjects['place'][flip_place(v).strip()] += 1
        for v in f.get_subfield_values(['x']):
            v = v.strip()
            if not v:
                continue
            # skipped when find_aspects() found something for this field
            # and the value matches re_aspects
            if aspects and re_aspects.search(v):
                continue
            v = tidy_subject(v)
            if v:
                subjects['subject'][v] += 1
    # convert the nested defaultdicts into plain dicts
    return dict((k, dict(v)) for k, v in subjects.items())
def read_subjects(rec):
    """
    Tally the subject headings of a record, grouped by category.

    Every field yielded by rec.read_fields(subject_fields) is decoded and
    counted under a category chosen by its tag: '600' -> 'person',
    '610' -> 'org', '611' -> 'event', '630' -> 'work', '650' -> 'subject',
    '651' -> 'place'.  Whatever the tag, subfield 'y' is counted under
    'time', 'z' under 'place', and 'v' and 'x' under 'subject'.

    NOTE(review): this block was recovered from whitespace-collapsed text.
    The nesting of a few `if v:` / `tidy_subject(v)` pairs and of the
    trailing y/v/z/x loops could not be read off the source -- confirm
    against version control.

    :param rec: object providing read_fields() and decode_field()
    :rtype: dict
    :return: {category: {heading: count}}; only categories that received at
        least one heading are present
    """
    subjects = defaultdict(lambda: defaultdict(int))
    for tag, field in rec.read_fields(subject_fields):
        f = rec.decode_field(field)
        aspects = find_aspects(f)

        if tag == '600':  # people
            name_and_date = []
            for k, v in f.get_subfields(['a', 'b', 'c', 'd']):
                # 'd' values are wrapped in parentheses; the rest only lose
                # surrounding punctuation
                v = '(' + v.strip('.() ') + ')' if k == 'd' else v.strip(
                    ' /,;:')
                if k == 'a':
                    m = re_flip_name.match(v)
                    if m:
                        v = flip_name(v)
                name_and_date.append(v)
            name = remove_trailing_dot(' '.join(name_and_date)).strip()
            if name != '':
                subjects['person'][name] += 1
        elif tag == '610':  # org
            # counted once for the joined a/b/c/d text ...
            v = ' '.join(f.get_subfield_values('abcd'))
            v = v.strip()
            if v:
                v = remove_trailing_dot(v).strip()
            if v:
                v = tidy_subject(v)
            if v:
                subjects['org'][v] += 1

            # ... and once more for each 'a' value on its own
            for v in f.get_subfield_values('a'):
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    subjects['org'][v] += 1
        elif tag == '611':  # event
            # every subfield except v, x, y and z contributes to the name
            v = ' '.join(j.strip() for i, j in f.get_all_subfields()
                         if i not in 'vxyz')
            if v:
                v = v.strip()
            v = tidy_subject(v)
            if v:
                subjects['event'][v] += 1
        elif tag == '630':  # work
            for v in f.get_subfield_values(['a']):
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    subjects['work'][v] += 1
        elif tag == '650':  # topical
            for v in f.get_subfield_values(['a']):
                if v:
                    v = v.strip()
                v = tidy_subject(v)
                if v:
                    subjects['subject'][v] += 1
        elif tag == '651':  # geo
            for v in f.get_subfield_values(['a']):
                if v:
                    subjects['place'][flip_place(v).strip()] += 1

        # subdivision subfields, handled the same way for every tag
        for v in f.get_subfield_values(['y']):
            v = v.strip()
            if v:
                subjects['time'][remove_trailing_dot(v).strip()] += 1
        for v in f.get_subfield_values(['v']):
            v = v.strip()
            if v:
                v = remove_trailing_dot(v).strip()
            v = tidy_subject(v)
            if v:
                subjects['subject'][v] += 1
        for v in f.get_subfield_values(['z']):
            v = v.strip()
            if v:
                subjects['place'][flip_place(v).strip()] += 1
        for v in f.get_subfield_values(['x']):
            v = v.strip()
            if not v:
                continue
            # skipped when find_aspects() found something for this field
            # and the value matches re_aspects
            if aspects and re_aspects.search(v):
                continue
            v = tidy_subject(v)
            if v:
                subjects['subject'][v] += 1
    # convert the nested defaultdicts into plain dicts
    return dict((k, dict(v)) for k, v in subjects.items())
def test_flip_name():
    """flip_name() puts the surname last; the dot after 'John' goes, the one after 'J' stays."""
    expectations = (
        ('Smith, John.', 'John Smith'),
        ('Smith, J.', 'J. Smith'),
    )
    for catalogued, displayed in expectations:
        assert flip_name(catalogued) == displayed
def find_subjects(marc_subjects):
    """
    Tally subject headings from raw MARC subject lines, grouped by category.

    Lines are counted under a category chosen by tag: '600' -> 'person',
    '610' -> 'org', '611' -> 'event', '630' -> 'work', '650' -> 'subject',
    '651' -> 'place'; any other tag is reported on stdout.  Whatever the
    tag, subfield 'y' is counted under 'time', 'z' under 'place', and 'v'
    and 'x' under 'subject'.  Lines matching re_large_book are skipped
    after their aspects have been counted.

    NOTE(review): recovered from whitespace-collapsed text; the nesting of
    a few `if v:` / `tidy_subject(v)` pairs could not be read off the
    source -- confirm against version control.

    :param marc_subjects: iterable of iterables of (tag, line) pairs
    :rtype: dict
    :return: {category: {heading: count}} holding only non-empty categories
    """
    person = defaultdict(int)
    event = defaultdict(int)
    work = defaultdict(int)
    org = defaultdict(int)
    time = defaultdict(int)
    place = defaultdict(int)
    subject = defaultdict(int)
    for lines in marc_subjects:
        for tag, line in lines:
            aspects = find_aspects(line)
            if aspects:
                subject[aspects] += 1
            if re_large_book.search(line):
                continue
            if tag == '600':  # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    # 'd' values are wrapped in parentheses; the rest only
                    # lose surrounding punctuation
                    v = '(' + v.strip('.() ') + ')' if k == 'd' else v.strip(' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            v = 'Mao Zedong'
                        else:
                            m = re_flip_name.match(v)
                            if m:
                                v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                if name != '':
                    person[name] += 1
            elif tag == '610':  # org
                # counted once for the joined a/b/c/d text ...
                v = ' '.join(get_subfield_values(line, 'abcd'))
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    org[v] += 1

                # ... and once more for each 'a' value on its own
                for v in get_subfield_values(line, 'a'):
                    v = v.strip()
                    if v:
                        v = remove_trailing_dot(v).strip()
                    if v:
                        v = tidy_subject(v)
                    if v:
                        org[v] += 1
            elif tag == '611':  # event
                v = ' '.join(j.strip() for i, j in get_all_subfields(line) if i not in 'vxyz')
                if v:
                    v = v.strip()
                v = tidy_subject(v)
                if v:
                    event[v] += 1
            elif tag == '630':  # work
                for v in get_subfield_values(line, ['a']):
                    v = v.strip()
                    if v:
                        v = remove_trailing_dot(v).strip()
                    if v:
                        v = tidy_subject(v)
                    if v:
                        work[v] += 1
            elif tag == '650':  # topical
                for v in get_subfield_values(line, ['a']):
                    if v:
                        v = v.strip()
                    v = tidy_subject(v)
                    if v:
                        subject[v] += 1
            elif tag == '651':  # geo
                for v in get_subfield_values(line, ['a']):
                    if v:
                        place[flip_place(v).strip()] += 1
            else:
                # Single-argument print() emits the same text on Python 2
                # and 3.
                print('other %s %s' % (tag, list(get_all_subfields(line))))

            cur = [v for k, v in get_all_subfields(line)
                   if k == 'a' or v.strip('. ').lower() == 'fiction']

            # skip: 'Good, Sally (Fictitious character) in fiction'
            if len(cur) > 1 and cur[-1].strip('. ').lower() == 'fiction' and ')' not in cur[-2]:
                subject[flip_subject(cur[-2]) + ' in fiction'] += 1

            # subdivision subfields, handled the same way for every tag
            for v in get_subfield_values(line, ['y']):
                v = v.strip()
                if v:
                    time[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['v']):
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                v = tidy_subject(v)
                if v:
                    subject[v] += 1
            for v in get_subfield_values(line, ['z']):
                v = v.strip()
                if v:
                    place[flip_place(v).strip()] += 1
            for v in get_subfield_values(line, ['x']):
                v = v.strip()
                if not v:
                    continue
                if aspects and re_aspects.search(v):
                    continue
                v = tidy_subject(v)
                if v:
                    subject[v] += 1

    # Only non-empty categories are returned, in this fixed order.
    ret = {}
    if person:
        ret['person'] = dict(person)
    if time:
        ret['time'] = dict(time)
    if place:
        ret['place'] = dict(place)
    if subject:
        ret['subject'] = dict(subject)
    if event:
        ret['event'] = dict(event)
    if org:
        ret['org'] = dict(org)
    if work:
        ret['work'] = dict(work)
    return ret
def find_subjects(marc_subjects):
    """
    Tally subject headings from raw MARC subject lines, grouped by category.

    Lines are counted under a category chosen by tag: '600' -> 'person',
    '610' -> 'org', '611' -> 'event', '630' -> 'work', '650' -> 'subject',
    '651' -> 'place'; any other tag is reported on stdout.  Whatever the
    tag, subfield 'y' is counted under 'time', 'z' under 'place', and 'v'
    and 'x' under 'subject'.  Lines matching re_large_book are skipped
    after their aspects have been counted.

    NOTE(review): recovered from whitespace-collapsed text; the nesting of
    a few `if v:` / `tidy_subject(v)` pairs could not be read off the
    source -- confirm against version control.

    :param marc_subjects: iterable of iterables of (tag, line) pairs
    :rtype: dict
    :return: {category: {heading: count}} holding only non-empty categories
    """
    person = defaultdict(int)
    event = defaultdict(int)
    work = defaultdict(int)
    org = defaultdict(int)
    time = defaultdict(int)
    place = defaultdict(int)
    subject = defaultdict(int)
    for lines in marc_subjects:
        for tag, line in lines:
            aspects = find_aspects(line)
            if aspects:
                subject[aspects] += 1
            if re_large_book.search(line):
                continue
            if tag == '600':  # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    # 'd' values are wrapped in parentheses; the rest only
                    # lose surrounding punctuation
                    v = '(' + v.strip('.() ') + ')' if k == 'd' else v.strip(
                        ' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            v = 'Mao Zedong'
                        else:
                            m = re_flip_name.match(v)
                            if m:
                                v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                if name != '':
                    person[name] += 1
            elif tag == '610':  # org
                # counted once for the joined a/b/c/d text ...
                v = ' '.join(get_subfield_values(line, 'abcd'))
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    org[v] += 1

                # ... and once more for each 'a' value on its own
                for v in get_subfield_values(line, 'a'):
                    v = v.strip()
                    if v:
                        v = remove_trailing_dot(v).strip()
                    if v:
                        v = tidy_subject(v)
                    if v:
                        org[v] += 1
            elif tag == '611':  # event
                v = ' '.join(j.strip() for i, j in get_all_subfields(line)
                             if i not in 'vxyz')
                if v:
                    v = v.strip()
                v = tidy_subject(v)
                if v:
                    event[v] += 1
            elif tag == '630':  # work
                for v in get_subfield_values(line, ['a']):
                    v = v.strip()
                    if v:
                        v = remove_trailing_dot(v).strip()
                    if v:
                        v = tidy_subject(v)
                    if v:
                        work[v] += 1
            elif tag == '650':  # topical
                for v in get_subfield_values(line, ['a']):
                    if v:
                        v = v.strip()
                    v = tidy_subject(v)
                    if v:
                        subject[v] += 1
            elif tag == '651':  # geo
                for v in get_subfield_values(line, ['a']):
                    if v:
                        place[flip_place(v).strip()] += 1
            else:
                # Single-argument print() emits the same text on Python 2
                # and 3.
                print('other %s %s' % (tag, list(get_all_subfields(line))))

            cur = [
                v for k, v in get_all_subfields(line)
                if k == 'a' or v.strip('. ').lower() == 'fiction'
            ]

            # skip: 'Good, Sally (Fictitious character) in fiction'
            if len(cur) > 1 and cur[-1].strip(
                    '. ').lower() == 'fiction' and ')' not in cur[-2]:
                subject[flip_subject(cur[-2]) + ' in fiction'] += 1

            # subdivision subfields, handled the same way for every tag
            for v in get_subfield_values(line, ['y']):
                v = v.strip()
                if v:
                    time[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['v']):
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                v = tidy_subject(v)
                if v:
                    subject[v] += 1
            for v in get_subfield_values(line, ['z']):
                v = v.strip()
                if v:
                    place[flip_place(v).strip()] += 1
            for v in get_subfield_values(line, ['x']):
                v = v.strip()
                if not v:
                    continue
                if aspects and re_aspects.search(v):
                    continue
                v = tidy_subject(v)
                if v:
                    subject[v] += 1

    # Only non-empty categories are returned, in this fixed order.
    ret = {}
    if person:
        ret['person'] = dict(person)
    if time:
        ret['time'] = dict(time)
    if place:
        ret['place'] = dict(place)
    if subject:
        ret['subject'] = dict(subject)
    if event:
        ret['event'] = dict(event)
    if org:
        ret['org'] = dict(org)
    if work:
        ret['work'] = dict(work)
    return ret
def find_subjects(w, marc_subjects=None):
    """
    Tally people, times, places and subjects from MARC subject lines.

    Tag '600' lines are counted under 'people', '650' subfield 'a' under
    'subjects' and '651' subfield 'a' under 'places'.  Whatever the tag,
    subfield 'y' is counted under 'times', 'z' under 'places', and 'v' and
    'x' under 'subjects'.  Lines matching re_large_book are skipped.

    NOTE(review): recovered from whitespace-collapsed text; the nesting of
    a few `if v:` / `tidy_subject(v)` pairs and the position of the
    `v_and_x` assignment could not be read off the source -- confirm
    against version control.

    :param w: passed to get_marc_subjects() when marc_subjects is falsy
    :param marc_subjects: optional iterable of iterables of (tag, line)
        pairs
    :rtype: dict
    :return: dict with any of 'people', 'times', 'places', 'subjects', each
        mapping heading -> count; empty categories are left out
    """
    people = defaultdict(int)
    # NOTE(review): genres is never read or written below
    genres = defaultdict(int)
    when = defaultdict(int)
    place = defaultdict(int)
    subject = defaultdict(int)
    #fiction = False
    # NOTE(review): `or` also falls back to get_marc_subjects(w) when an
    # empty sequence is passed explicitly -- confirm that is intended
    for lines in marc_subjects or get_marc_subjects(w):
        for tag, line in lines:
            if re_large_book.match(line):
                continue
            if tag == '600':  # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    # 'd' values are wrapped in parentheses; the rest only
                    # lose surrounding punctuation
                    v = '(' + v.strip('.() ') + ')' if k == 'd' else v.strip(' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            v = 'Mao Zedong'
                        else:
                            m = re_flip_name.match(v)
                            if m:
                                v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                if name != '':
                    people[name] += 1
            if tag == '650':
                for v in get_subfield_values(line, ['a']):
                    if v:
                        v = v.strip()
                    v = tidy_subject(v)
                    if v:
                        subject[v] += 1
            if tag == '651':
                for v in get_subfield_values(line, ['a']):
                    if v:
                        place[flip_place(v).strip()] += 1

            # subdivision subfields, handled the same way for every tag
            for v in get_subfield_values(line, ['y']):
                v = v.strip()
                if v:
                    when[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['v']):
                v = v.strip()
                if v:
                    subject[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['z']):
                v = v.strip()
                if v:
                    place[flip_place(v).strip()] += 1
            for v in get_subfield_values(line, ['x']):
                v = v.strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    subject[v] += 1
            # only referenced by the commented-out code below
            v_and_x = get_subfield_values(line, ['v', 'x'])
            #if 'Fiction' in v_and_x or 'Fiction.' in v_and_x:
            #    fiction = True
    #if 'Fiction' in subject:
    #    del subject['Fiction']
    # only non-empty categories are returned
    ret = {}
    if people:
        ret['people'] = dict(people)
    if when:
        ret['times'] = dict(when)
    if place:
        ret['places'] = dict(place)
    if subject:
        ret['subjects'] = dict(subject)
    return ret