def get_work_title(e):
    """Return a work title for edition ``e``, or None when none is found.

    The sources yielded by ``get_marc_src(e)`` are tried in order and the
    first non-empty title wins:

    * ``'ia'`` sources are resolved with ``get_ia_work_title``;
    * ``'marc'`` sources are fetched with ``get_from_archive`` and the ``a``
      subfields of the first ``240`` field are joined with spaces and
      stripped of surrounding dots and spaces.

    If no source provides a title, the first entry of ``e['work_titles']``
    is returned (after printing a diagnostic), or None when that list is
    missing or empty.
    """
    # use first work title we find in source MARC records
    wt = None
    for src_type, src in get_marc_src(e):
        if src_type == 'ia':
            wt = get_ia_work_title(src)
            if wt:
                break
            continue
        assert src_type == 'marc'
        try:
            data = get_from_archive(src)
        except ValueError:
            print('bad record source:', src)
            print('http://openlibrary.org' + e['key'])
            continue
        if not data:
            continue
        try:
            line = get_first_tag(data, set(['240']))
        except BadDictionary:
            print('bad dictionary:', src)
            print('http://openlibrary.org' + e['key'])
            continue
        if line:
            wt = ' '.join(get_subfield_values(line, ['a'])).strip('. ')
            # A 240 field whose $a is only dots/spaces is not a usable title;
            # keep looking at the remaining sources instead of giving up.
            if wt:
                break
    if wt:
        return wt
    if not e.get('work_titles', []):
        return None
    # NOTE(review): this branch runs when the edition has work_titles but no
    # source record supplied one, so the message text looks inverted; it is
    # left unchanged in case anything searches the output for it.
    print('work title in MARC, but not in OL')
    print('http://openlibrary.org' + e['key'])
    return e['work_titles'][0]
def get_work_title(e, mc):
    """Return the first work title found in the source records of ``e``.

    The sources yielded by ``get_marc_src(e, mc)`` are tried in order:

    * ``'ia'`` sources are resolved with ``get_ia_work_title`` and stripped
      of surrounding dots and spaces;
    * ``'marc'`` sources are fetched with ``get_data`` and the ``a``
      subfields of the first ``240`` field are joined with spaces and
      stripped the same way.

    Fetch and parse failures are printed and the source is skipped.
    Returns the title string, or None when no source provides one.  (The
    title was previously computed but never returned.)
    """
    # use first work title we find in source MARC records
    wt = None
    for src_type, src in get_marc_src(e, mc):
        if src_type == 'ia':
            wt = get_ia_work_title(src)
            if wt:
                wt = wt.strip('. ')
            if wt:
                break
            continue
        assert src_type == 'marc'
        data = None
        try:
            data = get_data(src)
        except ValueError:
            print('bad record source:', src)
            print('http://openlibrary.org' + e['key'])
            continue
        except urllib2.HTTPError as error:
            print('HTTP error:', error.code, error.msg)
            print(e['key'])
            continue
        if not data:
            continue
        # presumably position 9 is the record's character-coding flag and
        # anything other than 'a' means MARC-8 -- TODO confirm
        is_marc8 = data[9] != 'a'
        try:
            line = get_first_tag(data, set(['240']))
        except BadDictionary:
            print('bad dictionary:', src)
            print('http://openlibrary.org' + e['key'])
            continue
        if line:
            wt = ' '.join(get_subfield_values(line, ['a'], is_marc8)).strip('. ')
            # An $a made only of dots/spaces is not a usable title; keep
            # looking at the remaining sources.
            if wt:
                break
    return wt or None
def get_work_title(e, mc):
    """Return the first work title found in the source records of ``e``.

    Each ``(src_type, src)`` pair from ``get_marc_src(e, mc)`` is examined
    in turn.  For ``'ia'`` sources the title comes from
    ``get_ia_work_title``; for ``'marc'`` sources the record is loaded with
    ``get_data`` and the ``a`` subfields of its first ``240`` field are
    joined with spaces.  Titles are stripped of surrounding dots and spaces.

    Sources that cannot be fetched or parsed are reported with ``print`` and
    skipped.  Returns the title, or None if nothing usable was found.  (The
    title was previously computed but never returned.)
    """
    # use first work title we find in source MARC records
    wt = None
    for src_type, src in get_marc_src(e, mc):
        if src_type == 'ia':
            wt = get_ia_work_title(src)
            if wt:
                wt = wt.strip('. ')
            if wt:
                break
            continue
        assert src_type == 'marc'
        data = None
        try:
            data = get_data(src)
        except ValueError:
            print('bad record source:', src)
            print('http://openlibrary.org' + e['key'])
            continue
        except urllib2.HTTPError as error:
            print('HTTP error:', error.code, error.msg)
            print(e['key'])
            continue
        if not data:
            continue
        # presumably position 9 is the record's character-coding flag and
        # anything other than 'a' means MARC-8 -- TODO confirm
        is_marc8 = data[9] != 'a'
        try:
            line = get_first_tag(data, set(['240']))
        except BadDictionary:
            print('bad dictionary:', src)
            print('http://openlibrary.org' + e['key'])
            continue
        if line:
            wt = ' '.join(get_subfield_values(line, ['a'], is_marc8)).strip('. ')
            # Skip titles that are empty once punctuation is stripped.
            if wt:
                break
    return wt or None
def get_work_title(e):
    """Return the work title taken from the cached MARC record of ``e``.

    The record is looked up in the module-level ``marc`` mapping under
    ``e['key']``; element 1 of the stored value is passed to
    ``get_first_tag`` to find the first ``240`` field.  The field's ``a``
    subfields are joined with spaces and stripped of surrounding dots and
    spaces.

    Returns None when there is no cached record or no ``240`` field; in both
    cases the edition is asserted to carry no ``work_titles`` of its own.
    """
    key = e['key']
    if key not in marc:
        assert not e.get('work_titles', [])
        return None
    record = marc[key][1]
    field_240 = get_first_tag(record, {'240'})
    if field_240:
        pieces = get_subfield_values(field_240, ['a'])
        return ' '.join(pieces).strip('. ')
    assert not e.get('work_titles', [])
    return None
def get_work_title(e):
    """Look up the work title of ``e`` in the module-level ``marc`` mapping.

    ``marc[e['key']][1]`` is handed to ``get_first_tag`` to obtain the first
    ``240`` field; its ``a`` subfield values are space-joined and trimmed of
    leading/trailing dots and spaces.

    Returns None if ``e['key']`` has no entry in ``marc`` or the record has
    no ``240`` field.  Both of those paths assert that ``e`` has no
    ``work_titles``.
    """
    entry = marc.get(e['key']) if e['key'] in marc else None
    if e['key'] not in marc:
        assert not e.get('work_titles', [])
        return None
    title_field = get_first_tag(entry[1], {'240'})
    if not title_field:
        assert not e.get('work_titles', [])
        return None
    joined = ' '.join(get_subfield_values(title_field, ['a']))
    return joined.strip('. ')
def get_work_title(e, mc):
    """Return a work title for edition ``e``, or None if none is available.

    Sources from ``get_marc_src(e, mc)`` are tried in order.  An ``'ia'``
    source is resolved with ``get_ia_work_title``; a ``'marc'`` source is
    loaded with ``get_data`` and the ``a`` subfields of its first ``240``
    field are joined with spaces.  Titles are stripped of surrounding dots
    and spaces.  Failures to fetch or parse a source are printed and the
    source is skipped.

    When the sources give nothing, the first element of ``e['work_titles']``
    or, failing that, ``e['work_title']`` is returned (stripped the same
    way).
    """
    for kind, source in get_marc_src(e, mc):
        if kind == 'ia':
            title = get_ia_work_title(source)
            if title:
                title = title.strip('. ')
            if title:
                return title
            continue

        assert kind == 'marc'
        record = None
        try:
            record = get_data(source)
        except ValueError:
            print('bad record source:', source)
            print('http://openlibrary.org' + e['key'])
            continue
        except urllib.error.HTTPError as error:
            # record stays None, so the emptiness check below skips it
            print('HTTP error:', error.code, error.msg)
            print(e['key'])
        if not record:
            continue

        # NOTE(review): if get_data returns bytes, record[9] is an int on
        # Python 3 and this comparison is always True -- confirm the type.
        is_marc8 = record[9] != 'a'
        try:
            field_240 = get_first_tag(record, {'240'})
        except BadDictionary:
            print('bad dictionary:', source)
            print('http://openlibrary.org' + e['key'])
            continue
        if not field_240:
            continue

        values = get_subfield_values(field_240, ['a'], is_marc8)
        title = ' '.join(values).strip('. ')
        if title:
            return title
        # The first 240 field found ends the search even when its title is
        # empty; fall through to the edition's own fields.
        break

    for field_name in ('work_titles', 'work_title'):
        stored = e.get(field_name, [])
        if stored:
            assert isinstance(stored, list)
            return stored[0].strip('. ')
    return None
def get_work_title(e, mc):
    """Find a work title for edition ``e`` from its source records.

    Iterates over ``get_marc_src(e, mc)``.  ``'ia'`` sources are resolved
    through ``get_ia_work_title``; ``'marc'`` sources are fetched with
    ``get_data`` and the ``a`` subfields of the first ``240`` field are
    space-joined.  Surrounding dots and spaces are stripped from every
    title.  Problems fetching or parsing a source are printed and that
    source is skipped.

    Falls back to the first element of ``e['work_titles']`` and then
    ``e['work_title']``.  Returns None when nothing is found.
    """
    found = None
    for origin, ref in get_marc_src(e, mc):
        if origin == 'ia':
            found = get_ia_work_title(ref)
            if found:
                found = found.strip('. ')
            if not found:
                continue
            break

        assert origin == 'marc'
        raw = None
        try:
            raw = get_data(ref)
        except ValueError:
            print('bad record source:', ref)
            print('http://openlibrary.org' + e['key'])
            continue
        except urllib2.HTTPError as error:
            # raw is still None here; the check below moves to the next source
            print('HTTP error:', error.code, error.msg)
            print(e['key'])
        if not raw:
            continue

        # presumably position 9 flags the character coding, with anything
        # other than 'a' meaning MARC-8 -- TODO confirm
        marc8 = raw[9] != 'a'
        try:
            field_240 = get_first_tag(raw, {'240'})
        except BadDictionary:
            print('bad dictionary:', ref)
            print('http://openlibrary.org' + e['key'])
            continue
        if not field_240:
            continue
        subfield_a = get_subfield_values(field_240, ['a'], marc8)
        found = ' '.join(subfield_a).strip('. ')
        break

    if found:
        return found

    for field_name in ('work_titles', 'work_title'):
        candidates = e.get(field_name, [])
        if not candidates:
            continue
        assert isinstance(candidates, list)
        return candidates[0].strip('. ')
    return None
def get_work_title(e):
    """Return a work title for edition ``e``, or None when none is found.

    Sources from ``get_marc_src(e)`` are tried in order: ``'ia'`` sources
    via ``get_ia_work_title``, ``'marc'`` sources via ``get_data`` followed
    by the space-joined ``a`` subfields of the first ``240`` field (stripped
    of surrounding dots and spaces).  Sources that fail to load or parse are
    reported with ``print`` and skipped.

    If the sources yield nothing and ``e['work_titles']`` is non-empty, a
    diagnostic is printed and its first element is returned.
    """
    title = None
    for kind, source in get_marc_src(e):
        if kind == 'ia':
            title = get_ia_work_title(source)
            if title:
                break
            continue

        assert kind == 'marc'
        record = None
        try:
            record = get_data(source)
        except ValueError:
            print('bad record source:', source)
            print('http://openlibrary.org' + e['key'])
            continue
        except urllib2.HTTPError as error:
            # record stays None, so the emptiness check below skips it
            print('HTTP error:', error.code, error.msg)
            print(e['key'])
        if not record:
            continue

        try:
            field_240 = get_first_tag(record, {'240'})
        except BadDictionary:
            print('bad dictionary:', source)
            print('http://openlibrary.org' + e['key'])
            continue
        if field_240:
            subfield_a = get_subfield_values(field_240, ['a'])
            title = ' '.join(subfield_a).strip('. ')
            break

    if title:
        return title
    if e.get('work_titles', []):
        print('work title in MARC, but not in OL')
        print('http://openlibrary.org' + e['key'])
        return e['work_titles'][0]
    return None
def find_subjects(marc_subjects):
    """Tally the headings found in MARC subject fields, grouped by kind.

    ``marc_subjects`` is iterated as a collection of records, each of which
    yields ``(tag, line)`` pairs.  Lines matching ``re_large_book`` are
    ignored (apart from any aspect reported by ``find_aspects``).  The main
    heading is filed according to the tag:

    * ``600`` -> person (subfields a-d, name flipped via ``flip_name``)
    * ``610`` -> org (subfields a-d joined, plus each ``a`` on its own)
    * ``611`` -> event (all subfields except v/x/y/z)
    * ``630`` -> work
    * ``650`` -> subject
    * ``651`` -> place

    Any other tag is printed for inspection.  For every line the ``y``
    subfields are then counted as times, ``z`` as places, and ``v``/``x`` as
    subjects; an ``a`` heading followed by a "fiction" subfield adds an
    "<heading> in fiction" subject.

    Returns a dict with the keys 'person', 'time', 'place', 'subject',
    'event', 'org' and 'work', each mapping a heading to its count.
    Categories with no entries are left out.
    """
    person = defaultdict(int)
    event = defaultdict(int)
    work = defaultdict(int)
    org = defaultdict(int)
    times = defaultdict(int)  # not named ``time``: avoids shadowing the module
    place = defaultdict(int)
    subject = defaultdict(int)

    def _tidy(value, drop_dot=False):
        """Strip ``value``, optionally drop a trailing dot, then tidy it."""
        value = value.strip() if value else ''
        if value and drop_dot:
            value = remove_trailing_dot(value).strip()
        return tidy_subject(value) if value else ''

    def _count(counter, value):
        """Increment ``counter[value]`` unless ``value`` is empty."""
        if value:
            counter[value] += 1

    for lines in marc_subjects:
        for tag, line in lines:
            aspects = find_aspects(line)
            if aspects:
                subject[aspects] += 1
            if re_large_book.search(line):
                continue

            if tag == '600':  # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    if k == 'd':
                        # dates are rendered in parentheses
                        v = '(' + v.strip('.() ') + ')'
                    else:
                        v = v.strip(' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            v = 'Mao Zedong'
                        elif re_flip_name.match(v):
                            v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                _count(person, name)
            elif tag == '610':  # org
                # counted once as the full a-d heading and once per $a
                whole = ' '.join(get_subfield_values(line, 'abcd'))
                _count(org, _tidy(whole, drop_dot=True))
                for v in get_subfield_values(line, 'a'):
                    _count(org, _tidy(v, drop_dot=True))
            elif tag == '611':  # event
                v = ' '.join(
                    j.strip()
                    for i, j in get_all_subfields(line)
                    if i not in 'vxyz'
                )
                _count(event, _tidy(v))
            elif tag == '630':  # work
                for v in get_subfield_values(line, ['a']):
                    _count(work, _tidy(v, drop_dot=True))
            elif tag == '650':  # topical
                for v in get_subfield_values(line, ['a']):
                    _count(subject, _tidy(v))
            elif tag == '651':  # geo
                for v in get_subfield_values(line, ['a']):
                    if v:
                        _count(place, flip_place(v).strip())
            else:
                print('other', tag, list(get_all_subfields(line)))

            cur = [
                v
                for k, v in get_all_subfields(line)
                if k == 'a' or v.strip('. ').lower() == 'fiction'
            ]
            # skip: 'Good, Sally (Fictitious character) in fiction'
            if (
                len(cur) > 1
                and cur[-1].strip('. ').lower() == 'fiction'
                and ')' not in cur[-2]
            ):
                subject[flip_subject(cur[-2]) + ' in fiction'] += 1

            for v in get_subfield_values(line, ['y']):
                v = v.strip()
                if v:
                    _count(times, remove_trailing_dot(v).strip())
            for v in get_subfield_values(line, ['v']):
                _count(subject, _tidy(v, drop_dot=True))
            for v in get_subfield_values(line, ['z']):
                v = v.strip()
                if v:
                    _count(place, flip_place(v).strip())
            for v in get_subfield_values(line, ['x']):
                v = v.strip()
                if not v:
                    continue
                # the aspect was already counted above for this line
                if aspects and re_aspects.search(v):
                    continue
                _count(subject, tidy_subject(v))

    ret = {}
    for key, counts in (
        ('person', person),
        ('time', times),
        ('place', place),
        ('subject', subject),
        ('event', event),
        ('org', org),
        ('work', work),
    ):
        if counts:
            ret[key] = dict(counts)
    return ret
def find_subjects(marc_subjects):
    """Count the headings in MARC subject fields, grouped by category.

    ``marc_subjects`` is iterated as a collection of records, each yielding
    ``(tag, line)`` pairs.  A line matching ``re_large_book`` contributes
    only the aspect reported by ``find_aspects`` (if any).  Otherwise the
    main heading is filed by tag: ``600`` person, ``610`` org, ``611``
    event, ``630`` work, ``650`` subject, ``651`` place; other tags are
    printed for inspection.  Regardless of tag, ``y`` subfields are counted
    as times, ``z`` as places and ``v``/``x`` as subjects, and an ``a``
    heading followed by a "fiction" subfield adds "<heading> in fiction".

    Returns a dict keyed by 'person', 'time', 'place', 'subject', 'event',
    'org' and 'work' (empty categories omitted), each value mapping a
    heading to the number of times it was seen.
    """
    person = defaultdict(int)
    event = defaultdict(int)
    work = defaultdict(int)
    org = defaultdict(int)
    times = defaultdict(int)  # not named ``time``: avoids shadowing the module
    place = defaultdict(int)
    subject = defaultdict(int)

    def _tidy(value, drop_dot=False):
        """Strip ``value``, optionally drop a trailing dot, then tidy it."""
        value = value.strip() if value else ''
        if value and drop_dot:
            value = remove_trailing_dot(value).strip()
        return tidy_subject(value) if value else ''

    def _count(counter, value):
        """Increment ``counter[value]`` unless ``value`` is empty."""
        if value:
            counter[value] += 1

    for lines in marc_subjects:
        for tag, line in lines:
            aspects = find_aspects(line)
            if aspects:
                subject[aspects] += 1
            if re_large_book.search(line):
                continue

            if tag == '600':  # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    if k == 'd':
                        # dates are rendered in parentheses
                        v = '(' + v.strip('.() ') + ')'
                    else:
                        v = v.strip(' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            v = 'Mao Zedong'
                        elif re_flip_name.match(v):
                            v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                _count(person, name)
            elif tag == '610':  # org
                # counted once as the full a-d heading and once per $a
                whole = ' '.join(get_subfield_values(line, 'abcd'))
                _count(org, _tidy(whole, drop_dot=True))
                for v in get_subfield_values(line, 'a'):
                    _count(org, _tidy(v, drop_dot=True))
            elif tag == '611':  # event
                v = ' '.join(
                    j.strip()
                    for i, j in get_all_subfields(line)
                    if i not in 'vxyz'
                )
                _count(event, _tidy(v))
            elif tag == '630':  # work
                for v in get_subfield_values(line, ['a']):
                    _count(work, _tidy(v, drop_dot=True))
            elif tag == '650':  # topical
                for v in get_subfield_values(line, ['a']):
                    _count(subject, _tidy(v))
            elif tag == '651':  # geo
                for v in get_subfield_values(line, ['a']):
                    if v:
                        _count(place, flip_place(v).strip())
            else:
                print('other', tag, list(get_all_subfields(line)))

            cur = [
                v
                for k, v in get_all_subfields(line)
                if k == 'a' or v.strip('. ').lower() == 'fiction'
            ]
            # skip: 'Good, Sally (Fictitious character) in fiction'
            if (
                len(cur) > 1
                and cur[-1].strip('. ').lower() == 'fiction'
                and ')' not in cur[-2]
            ):
                subject[flip_subject(cur[-2]) + ' in fiction'] += 1

            for v in get_subfield_values(line, ['y']):
                v = v.strip()
                if v:
                    _count(times, remove_trailing_dot(v).strip())
            for v in get_subfield_values(line, ['v']):
                _count(subject, _tidy(v, drop_dot=True))
            for v in get_subfield_values(line, ['z']):
                v = v.strip()
                if v:
                    _count(place, flip_place(v).strip())
            for v in get_subfield_values(line, ['x']):
                v = v.strip()
                if not v:
                    continue
                # the aspect was already counted above for this line
                if aspects and re_aspects.search(v):
                    continue
                _count(subject, tidy_subject(v))

    ret = {}
    for key, counts in (
        ('person', person),
        ('time', times),
        ('place', place),
        ('subject', subject),
        ('event', event),
        ('org', org),
        ('work', work),
    ):
        if counts:
            ret[key] = dict(counts)
    return ret
continue title_seen = True if line[1] == '0': # no prefix continue contents = get_contents(line, ['a', 'b']) if 'a' in contents: rec['title'] = ' '.join(x.strip(' /,;:') for x in contents['a']) elif 'b' in contents: rec['title'] = contents['b'][0].strip(' /,;:') if 'title' in rec and has_dot(rec['title']): rec['title'] = rec['title'][:-1] continue if tag == '300': if 'accompanying_material' in rec: continue subtag_e = ' '.join(i.strip('. ') for i in get_subfield_values(line, set(['e']))) if subtag_e: if subtag_e.lower() in ('list', 'notes', 'book'): continue rec['accompanying_material'] = subtag_e continue fields.setdefault(tag, []).append(line) for line in fields.get('041', []): found = [] marc_h = list(get_subfield_values(line, 'h')) if not marc_h: continue for h in marc_h: if len(h) % 3 != 0: print 'bad:', list(get_all_subfields(line))
if line[1] == '0': # no prefix continue contents = get_contents(line, ['a', 'b']) if 'a' in contents: rec['title'] = ' '.join( x.strip(' /,;:') for x in contents['a']) elif 'b' in contents: rec['title'] = contents['b'][0].strip(' /,;:') if 'title' in rec and has_dot(rec['title']): rec['title'] = rec['title'][:-1] continue if tag == '300': if 'accompanying_material' in rec: continue subtag_e = ' '.join( i.strip('. ') for i in get_subfield_values(line, set(['e']))) if subtag_e: if subtag_e.lower() in ('list', 'notes', 'book'): continue rec['accompanying_material'] = subtag_e continue fields.setdefault(tag, []).append(line) for line in fields.get('041', []): found = [] marc_h = list(get_subfield_values(line, 'h')) if not marc_h: continue for h in marc_h: if len(h) % 3 != 0: print 'bad:', list(get_all_subfields(line))
def find_subjects(w, marc_subjects=None):
    """Tally people, times, places and subjects from MARC subject fields.

    ``marc_subjects`` is iterated as a collection of records, each yielding
    ``(tag, line)`` pairs.  When it is None the records are obtained from
    ``get_marc_subjects(w)``; an explicitly supplied empty collection is
    honoured and simply produces an empty result.

    Lines matching ``re_large_book`` at their start are ignored.  ``600``
    fields give people (subfields a-d, names flipped via ``flip_name``),
    ``650`` fields give subjects and ``651`` fields give places.  For every
    line the ``y`` subfields are counted as times, ``z`` as places and
    ``v``/``x`` as subjects.

    Returns a dict with any of the keys 'people', 'times', 'places' and
    'subjects', each mapping a heading to its count; empty categories are
    omitted.
    """
    people = defaultdict(int)
    when = defaultdict(int)
    place = defaultdict(int)
    subject = defaultdict(int)

    def _count(counter, value):
        """Increment ``counter[value]`` unless ``value`` is empty."""
        if value:
            counter[value] += 1

    if marc_subjects is None:
        marc_subjects = get_marc_subjects(w)

    for lines in marc_subjects:
        for tag, line in lines:
            if re_large_book.match(line):
                continue

            if tag == '600':  # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    if k == 'd':
                        # dates are rendered in parentheses
                        v = '(' + v.strip('.() ') + ')'
                    else:
                        v = v.strip(' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            v = 'Mao Zedong'
                        elif re_flip_name.match(v):
                            v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                _count(people, name)
            elif tag == '650':
                for v in get_subfield_values(line, ['a']):
                    v = v.strip() if v else ''
                    if v:
                        _count(subject, tidy_subject(v))
            elif tag == '651':
                for v in get_subfield_values(line, ['a']):
                    if v:
                        _count(place, flip_place(v).strip())

            for v in get_subfield_values(line, ['y']):
                v = v.strip()
                if v:
                    _count(when, remove_trailing_dot(v).strip())
            for v in get_subfield_values(line, ['v']):
                v = v.strip()
                if v:
                    _count(subject, remove_trailing_dot(v).strip())
            for v in get_subfield_values(line, ['z']):
                v = v.strip()
                if v:
                    _count(place, flip_place(v).strip())
            for v in get_subfield_values(line, ['x']):
                v = v.strip()
                if v:
                    _count(subject, tidy_subject(v))

    ret = {}
    for key, counts in (
        ('people', people),
        ('times', when),
        ('places', place),
        ('subjects', subject),
    ):
        if counts:
            ret[key] = dict(counts)
    return ret