def manual_fix_individual(book_summaries):
    """Apply hand-written, per-title repairs to scraped book summaries.

    Note we do not manually fix the plays, since we do not use them in the
    literature dataset.

    Every summary whose title is in ``NON_NOVEL_TITLES`` is dropped. For the
    remaining ones a title-specific rule rewrites, filters, merges or renames
    the ``(section_title, summary_lines)`` pairs. When a rule produces a
    non-empty section list, each section title is passed through
    ``standardize_sect_title`` and the book title through
    ``standardize_title``.

    Args:
        book_summaries: iterable of records exposing ``title``,
            ``section_summaries`` (a list of ``(title, lines)`` pairs) and a
            namedtuple-style ``_replace`` method.

    Returns:
        list: the repaired records, in input order.

    Note:
        Several rules edit ``section_summaries`` in place, so the input
        records may be modified by this call.
    """

    def fix_north(title):
        """Rewrite one 'Northanger Abbey' heading into 'Book N: Chapter M' form."""
        return (title.replace(',', ':', 1)
                .replace('Vol.', 'Book', 1)
                .replace('Volume', 'Book', 1)
                .replace('of ', '')
                .replace('Chaper', 'Chapter'))

    # Flip to True (optionally only from a given idx) to step through the
    # results book by book on the console.
    start = False
    book_summaries_new = []
    for idx, book_summ in enumerate(book_summaries):
        sect_summs_new = []
        sect_summs_old = book_summ.section_summaries
        title = book_summ.title
        # Default to the current record. Without this, a rule that yields no
        # sections would re-append the previous book's result (or raise
        # NameError on the very first book).
        book_summ_new = book_summ

        if title in NON_NOVEL_TITLES:
            continue
        elif title in {"Connecticut Yankee in King Arthur's Court",
                       "Little Women", "Walden"}:
            # Keep only the first two words of each non-empty heading.
            sect_summs_new = [(' '.join(chap_title.split(' ', 2)[0:2]), sect_summ)
                              for chap_title, sect_summ in sect_summs_old
                              if chap_title]
        elif title in {'Germinal', "Little Dorrit", "Our Mutual Friend",
                       "The War of the Worlds"}:
            sect_summs_new = [(x[0].replace(',', ':', 1), x[1])
                              for x in sect_summs_old]
        elif title == 'The Adventures of Huckleberry Finn':
            sect_summs_new = [x for x in sect_summs_old if x[0]]
        elif title == 'The Age of Innocence':
            for chap_title, sect_summ in sect_summs_old:
                arr = chap_title.split(':', 1)
                if len(arr) == 2:
                    # Text after the colon belongs to the summary, not the heading.
                    chap_title = clean_title(arr[0])
                    sect_summ = [arr[1].strip()] + sect_summ
                if not chap_title.startswith('Chapter'):
                    continue
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'Alice in Wonderland':
            for i, (chap_title, sect_summ) in enumerate(sect_summs_old, 1):
                if not chap_title:
                    chap_title = 'Chapter {}'.format(i)
                else:
                    chap_title = chap_title.split(':', 1)[0]
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'The Ambassadors':
            book_idx = 'O'
            for chap_title, sect_summ in sect_summs_old:
                if chap_title.startswith('Volume'):
                    continue
                if chap_title.startswith('Book'):
                    # A 'Book' heading only updates the prefix for later chapters.
                    book_idx = chap_title.split(' ', 1)[-1]
                    continue
                sect_idx = chap_title.split(' ', 1)[-1]
                chap_title = 'Book {}: Chapter {}'.format(book_idx, sect_idx)
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'Black Beauty':
            sect_summs_new = [(chap_title.split(', ', 1)[-1], sect_summ)
                              for chap_title, sect_summ in sect_summs_old]
        elif title == 'Bleak House':
            for chap_title, sect_summ in sect_summs_old:
                if not chap_title:
                    # Guard the look-back so an untitled first section cannot
                    # raise IndexError.
                    prev_title = sect_summs_new[-1][0] if sect_summs_new else ''
                    if prev_title == 'Chapters 60-63':
                        chap_title = 'Chapter 64-67'
                elif not chap_title.startswith('Chapter'):
                    continue
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'The Count of Monte Cristo':
            sect_summs_new = [x for x in sect_summs_old
                              if not x[0].startswith('The book has')]
        elif title == 'Emma':
            for chap_title, sect_summ in sect_summs_old:
                if chap_title.startswith('Chapter Eighteen:'):
                    # The heading swallowed summary text. Move it back while
                    # keeping the summary a list of lines (it used to be
                    # replaced by a bare string), as 'The Age of Innocence'
                    # does above.
                    chap_title, leading_text = chap_title.split(':', 1)
                    sect_summ = [leading_text.strip()] + sect_summ
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'Ethan Frome':
            assert sect_summs_old[0][0] == sect_summs_old[1][0] == ''
            book_summ.section_summaries[0] = (
                'Prologue', book_summ.section_summaries[1][1])
            book_summ.section_summaries[-1] = (
                'Epilogue', book_summ.section_summaries[-1][1])
            del book_summ.section_summaries[1]
            book_summ_new = book_summ
        elif title == 'Far from the Madding Crowd':
            for chap_title, sect_summ in sect_summs_old:
                if chap_title == '':
                    chap_title = 'Chapter 38-45'
                elif not chap_title.startswith('Chapter'):
                    continue
                elif chap_title == 'Chapters 54-Conclusion':
                    chap_title = 'Chapter 54-57'
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'Frankenstein':
            sect_summs_new = sect_summs_old
            sect_summs_new[-1] = ('Final Letters', sect_summs_new[-1][1])
        elif title == 'The American':
            sect_summs_new = [
                ('Chapter {}'.format(chap_title)
                 if not chap_title.startswith('Ch') else chap_title, sect_summ)
                for chap_title, sect_summ in sect_summs_old
            ]
        elif title == 'Great Expectations':
            i = 1
            for chap_title, sect_summ in sect_summs_old:
                if not chap_title.startswith(('Part', 'Chapter')):
                    # Stray heading: fold it and its text into the previous chapter.
                    addtl_text = [chap_title] + sect_summ
                    sect_summs_new[-1] = (sect_summs_new[-1][0],
                                          sect_summs_new[-1][1] + addtl_text)
                else:
                    chap_title = 'Chapter {}'.format(i)
                    sect_summs_new.append((chap_title, sect_summ))
                    i += 1
        elif title == 'The Hound of the Baskervilles':
            for i, (chap_title, sect_summ) in enumerate(sect_summs_old):
                chap_title = chap_title.split(':', 1)[0]
                if not chap_title.startswith('Chapter'):
                    continue
                if not sect_summ:
                    # The summary was scraped as the next entry's heading + body.
                    assert sect_summs_old[i + 1][0].startswith(
                        ('This chapter', 'In this final'))
                    sect_summ = ([sect_summs_old[i + 1][0]]
                                 + sect_summs_old[i + 1][1])
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'Howards End':
            for chap_title, sect_summ in sect_summs_old:
                if not chap_title and len(sect_summ) == 2:
                    continue
                elif not chap_title:
                    chap_title = 'Chapter 16-19'
                sect_summs_new.append((chap_title, sect_summ))
        elif title == "Lady Audley's Secret":
            for chap_title, sect_summ in sect_summs_old:
                if chap_title == "Volume 3, Chapter 1":
                    chap_title = 'Chapter 1'
                sect_summs_new.append((chap_title, sect_summ))
        elif title == "Mary Barton":
            for chap_title, sect_summ in sect_summs_old:
                if not chap_title:
                    chap_title = 'Chapters XVI-XX'
                elif chap_title == 'Chapters XXI-XV':
                    chap_title = 'Chapters XXI-XXV'
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'Moby Dick':
            for chap_title, sect_summ in sect_summs_old:
                if not chap_title.startswith('Chapter'):
                    continue
                chap_title = (chap_title.split(':', 1)[0]
                              .replace('One Hundred and ', 'One-Hundred-', 1)
                              .replace('One Hundred', 'One-Hundred', 1))
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'Northanger Abbey':
            sect_summs_new = [(fix_north(x[0]), x[1]) for x in sect_summs_old]
        elif title in {"The Vicar of Wakefield", "Uncle Tom's Cabin"}:
            for i, (chap_title, sect_summ) in enumerate(sect_summs_old):
                if not chap_title.startswith('Chapter'):
                    continue
                if not sect_summ:
                    sect_summ = ([sect_summs_old[i + 1][0]]
                                 + sect_summs_old[i + 1][1])
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'Persuasion':
            for chap_title, sect_summ in sect_summs_old:
                if not chap_title:
                    chap_title = 'Chapter 22-24'
                elif chap_title.startswith('The final chapter'):
                    continue
                sect_summs_new.append((chap_title, sect_summ))
        elif title in {"The Scarlet Letter", "The Blithedale Romance"}:
            sect_summs_new = [x for x in sect_summs_old
                              if x[0].startswith('Chapter')]
        elif title == 'Siddhartha':
            # Chapter headings are embedded in the summary lines; split on them.
            for _, lines in sect_summs_old:
                sect_summ_curr = []
                for line in lines:
                    if line in SIDDHARTHA_TITLES:
                        if sect_summ_curr:
                            sect_summs_new.append((chap_title, sect_summ_curr))
                            sect_summ_curr = []
                        chap_title = line.replace(
                            "The Brahmins Son", "The Brahmin's Son").replace(
                                'Goatama', 'Gotama')
                    else:
                        sect_summ_curr.append(line)
                if sect_summ_curr:
                    sect_summs_new.append((chap_title, sect_summ_curr))
                    sect_summ_curr = []
        elif title == 'A Study in Scarlet':
            # NOTE(review): a second, unreachable 'A Study in Scarlet' branch
            # (it did x[0].replace('of ', '', 1)) used to follow
            # 'Treasure Island'; it was dead code and has been removed.
            # Confirm whether it was meant for a different title.
            sect_summs_new = [(x[0].split(':', 1)[0].strip().replace(',', ':', 1),
                               x[1]) for x in sect_summs_old]
        elif title == 'Treasure Island':
            for i, (chap_title, sect_summ) in enumerate(sect_summs_old):
                if not sect_summ:
                    sect_summ = sect_summs_old[i + 1][1]
                elif not chap_title:
                    continue
                sect_summs_new.append((chap_title, sect_summ))
        elif title == "The Valley of Fear":
            sect_summs_new = [x for x in sect_summs_old if x[1]]
        elif title == "Villette":
            for chap_title, sect_summ in sect_summs_old:
                prev_chap = sect_summs_new[-1][0] if sect_summs_new else ''
                if prev_chap.endswith('XIII'):
                    sect_summ = [chap_title] + sect_summ
                    chap_title = "Chapter 14-16"
                elif prev_chap.endswith('XXV'):
                    sect_summ = [chap_title] + sect_summ
                    chap_title = "Chapter 26-28"
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'What Maisie Knew':
            book_summ.section_summaries[0] = (
                'Introduction', book_summ.section_summaries[0][1])
            book_summ_new = book_summ
        elif title == 'Winesburg, Ohio':
            for chap_title, sect_summ in sect_summs_old:
                chap_title = chap_title.replace('"', '').replace("'", '').replace(
                    ' Summary ', ' ')
                if chap_title.startswith("Surrender"):
                    chap_title = "Godliness Part 3"
                elif chap_title.startswith("Terror"):
                    chap_title = "Godliness Part 4"
                elif chap_title.startswith("Prologue"):
                    chap_title = "The Book of the Grotesque"
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'Wuthering Heights':
            for i, (chap_title, sect_summ) in enumerate(sect_summs_old):
                if not sect_summ:
                    next_chap, next_summ = sect_summs_old[i + 1]
                    # Chapter 25 section has a typo https://www.gradesaver.com/wuthering-heights/study-guide/summary-chapters-21-25
                    if not next_summ and not chap_title == 'Chapter 25':
                        print("need to update Wuthering Heights")
                    sect_summ = next_summ
                elif not chap_title:
                    continue
                sect_summs_new.append((chap_title, sect_summ))
        elif title == "The Yellow Wallpaper":
            # Collapse every section into a single pseudo-section.
            all_sects = [line for x in sect_summs_old for line in x[1]]
            sect_summs_new = [('book', all_sects)]
        elif title in {"The Mill on the Floss", 'My Antonia',
                       "A Tale of Two Cities", 'War and Peace'}:
            # multibook
            book_count = 0
            seen = set()
            for i, (chap_title, sect_summ) in enumerate(sect_summs_old):
                if (not chap_title.startswith("Chapter")
                        or chap_title.endswith('.')):
                    continue
                chap_title, book_count = fix_multibook(chap_title, book_count)
                if not sect_summ:
                    sect_summ = ([sect_summs_old[i + 1][0]]
                                 + sect_summs_old[i + 1][1])
                if chap_title == "Book 2: Chapter 4 -":
                    chap_title = "Book 2: Chapter 4"
                if title == 'A Tale of Two Cities':
                    if chap_title in seen:
                        continue
                    seen.add(chap_title)
                    # NOTE(review): confirm this trimming is meant for
                    # 'A Tale of Two Cities' only.
                    arr = chap_title.split(':')
                    if len(arr) == 3:
                        chap_title = ':'.join(arr[0:2])
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'Hard Times':
            for chap_title, sect_summ in sect_summs_old:
                if chap_title.startswith('Book the First'):
                    book_num = 1
                elif chap_title == 'Book II':
                    book_num = 2
                elif chap_title.startswith('Book III'):
                    book_num = 3
                else:
                    # NOTE(review): book headings themselves are not emitted;
                    # confirm against the scraped data.
                    chap_title = 'Book {}: {}'.format(
                        book_num, chap_title.split(':', 1)[0])
                    sect_summs_new.append((chap_title, sect_summ))
        elif title in {"Gulliver's Travels", "Jude the Obscure",
                       "Madame Bovary", 'Crime and Punishment'}:
            # multipart
            book_count = 0
            for i, (chap_title, sect_summ) in enumerate(sect_summs_old):
                if (title in {"Gulliver's Travels", 'Crime and Punishment'}
                        and not sect_summ):
                    sect_summ = ([sect_summs_old[i + 1][0]]
                                 + sect_summs_old[i + 1][1])
                if not chap_title.startswith("Chapter"):
                    continue
                elif title == "Madame Bovary" and "-" in chap_title:
                    continue
                chap_title, book_count = fix_multipart(chap_title, book_count)
                sect_summs_new.append((chap_title, sect_summ))
        elif title in {'Pride and Prejudice', 'Jane Eyre'}:
            for i, (chap_title, sect_summ) in enumerate(sect_summs_old, 1):
                chap_title = 'Chapter {}'.format(i)
                if title == 'Pride and Prejudice' and chap_title == 'Chapter 60':
                    # The 60th entry holds two chapters; split it after the first line.
                    sect_summs_new.append((chap_title, sect_summ[0:1]))
                    sect_summs_new.append(('Chapter 61', sect_summ[1:]))
                    continue
                sect_summs_new.append((chap_title, sect_summ))
        elif title == "The Phantom of the Opera":
            book_summ.section_summaries[-1] = (
                'Chapter 21-Epilogue', book_summ.section_summaries[-1][1])
            book_summ_new = book_summ
        elif title == "The Picture of Dorian Gray":
            sect_summs_new = sect_summs_old
            sect_summs_new[0] = ('Preface-Chapter 2', sect_summs_new[0][1])
        elif title == "Tess of the D'Urbervilles":
            sect_summs_new = [(x[0], x[1]) for x in sect_summs_old
                              if x[0].startswith('Chapter')]
        elif title == 'Washington Square':
            sect_summs_new = [(x[0].replace(' Summaries', '', 1), x[1])
                              for x in sect_summs_old
                              if x[0].startswith('Chapter')]
        elif title == "The Wind in the Willows":
            sect_summs_new = sect_summs_old
            # The last entry's heading is really the tail of the previous summary.
            sect_summs_new[-2] = (sect_summs_new[-2][0],
                                  sect_summs_new[-2][1] + [sect_summs_new[-1][0]])
            sect_summs_new.pop(-1)
        elif title == "The Brothers Karamazov":
            sect_summs_new = sect_summs_old
            sect_summs_new[-1] = ('Book 13', sect_summs_old[-1][1])
        elif title == "The Metamorphosis":
            sect_summs_new = [(x[0].replace('Chapter', 'Part', 1), x[1])
                              for x in sect_summs_old]
        elif title == 'The Secret Garden':
            assert sect_summs_old[1][0] == 'Chapters 5-19'
            sect_summs_new = sect_summs_old
            sect_summs_new[1] = ('Chapters 5-9', sect_summs_old[1][1])
        else:
            sect_summs_new = sect_summs_old

        if sect_summs_new:
            sect_summs_new = [(standardize_sect_title(x[0]), x[1])
                              for x in sect_summs_new]
            title_new = standardize_title(title)
            if title_new != title:
                print('renamed {} -> {}'.format(title, title_new))
                title = title_new
            book_summ_new = book_summ._replace(
                section_summaries=sect_summs_new, title=title_new)
        book_summaries_new.append(book_summ_new)

        if start:  # for debugging
            print(title, idx)
            assert title == book_summaries_new[-1].title
            for i, x in enumerate(book_summaries_new[-1].section_summaries, 1):
                print(x[0] or x[1][0][0:100] + ' index ' + str(i))
            input()
    return book_summaries_new
def manual_fix_individual(book_summaries):
    """Apply hand-written, per-title repairs to scraped book summaries.

    Note we do not manually fix the plays, since we do not use them in the
    literature dataset.

    Summaries whose title is in ``NON_NOVEL_TITLES`` are dropped. For the
    rest, a title-specific rule rewrites the ``(section_title, summary)``
    pairs. When a rule produces a non-empty section list, section titles are
    passed through ``standardize_sect_title`` and the book title through
    ``standardize_title``.

    Args:
        book_summaries: iterable of records exposing ``title``,
            ``section_summaries`` (a list of ``(title, summary)`` pairs) and a
            namedtuple-style ``_replace`` method.

    Returns:
        list: the repaired records, in input order.

    Note:
        Some rules edit ``section_summaries`` in place, so input records may
        be modified by this call.
    """
    # Flip to True (optionally only from a given idx) to step through the
    # results book by book on the console.
    start = False  # to debug
    # Heading renames applied to 'Siddhartha' after the common prefix strip.
    siddhartha_renames = {
        'Sansara': 'Samsara',
        'With the Childlike People': 'Amongst the People',
    }
    book_summaries_new = []
    for idx, book_summ in enumerate(book_summaries):
        sect_summs_new = []
        sect_summs_old = book_summ.section_summaries
        title = book_summ.title
        # Default to the current record. Without this, a rule that yields no
        # sections would re-append the previous book's result (or raise
        # NameError on the very first book).
        book_summ_new = book_summ

        if title in NON_NOVEL_TITLES:
            continue

        if title in {'Adam Bede', 'The Brothers Karamazov',
                     'The Age of Innocence', 'Siddhartha', 'Silas Marner',
                     'The Three Musketeers'}:
            # Keep only what follows the first colon of each heading.
            sect_summs_new = [(x[0].split(':', 1)[1].strip(), x[1])
                              for x in sect_summs_old]
            if title == 'The Brothers Karamazov':
                assert sect_summs_new[-1][0] == 'Epilogue'
                sect_summs_new[-1] = ('Book 13', sect_summs_new[-1][1])
            if title == 'Siddhartha':
                sect_summs_new = [
                    (siddhartha_renames.get(chap_title, chap_title), chap_summ)
                    for chap_title, chap_summ in sect_summs_new
                ]
        elif title == "Tess of the d'Urbervilles":
            sect_summs_new = [(x[0].rsplit(':', 1)[1].strip(), x[1])
                              for x in sect_summs_old]
        elif title == 'White Fang':
            sect_summs_new = [(x[0].split('(', 1)[0].strip(), x[1])
                              for x in sect_summs_old]
        elif title == 'The Adventures of Huckleberry Finn':
            sect_summs_new = sect_summs_old
            sect_summs_new[-1] = ('Chapter 43', sect_summs_new[-1][1])
            assert sect_summs_new[0][0] == 'Notice; Explanatory'
            sect_summs_new = sect_summs_new[1:]
        elif title == "A Connecticut Yankee in King Arthur's Court":
            sect_summs_new = sect_summs_old
            sect_summs_new[-1] = ('Chapter 39-45', sect_summs_new[-1][1])
        elif title == 'Jane Eyre':
            sect_summs_new = sect_summs_old
            sect_summs_new[-1] = ('Chapter 38', sect_summs_new[-1][1])
        elif title in {'The Mill on the Floss', 'My Ántonia'}:
            for sect_title, summ in sect_summs_old:
                if sect_title.startswith('Introduction'):
                    pass
                elif sect_title.endswith('Conclusion'):
                    sect_title = 'Book 7: Conclusion'
                else:
                    # Keep the first two and the last two words of the heading.
                    arr = sect_title.split()
                    sect_title = '{} {} {} {}'.format(arr[0], arr[1], arr[-2],
                                                      arr[-1])
                sect_summs_new.append((sect_title, summ))
        elif title == "The Turn of the Screw":
            sect_summs_new = [(x[0].replace('Section', 'Chapter').replace('"', ''),
                               x[1]) for x in sect_summs_old]
        elif title == 'The Way of All Flesh':
            for sect_title, summ in sect_summs_old:
                if '(' in sect_title:
                    # Chapter numbers are given in parentheses after the heading.
                    chapter_nums = sect_title.split('(', 1)[1].split(
                        ' ', 1)[0].replace(')', '')
                    sect_title = 'Chapter {}'.format(chapter_nums)
                    # NOTE(review): confirm the typo fix applies only to
                    # parenthesised headings.
                    sect_title = sect_title.replace('87', '86', 1)  # fix typo
                sect_summs_new.append((sect_title, summ))
        elif title == 'The Secret Sharer':
            sect_summs_new = [(x[0].replace('Part', 'Chapter'), x[1])
                              for x in sect_summs_old]
        elif title == 'Winesburg, Ohio':
            sect_summs_new = [(x[0].replace('"', ''), x[1])
                              for x in sect_summs_old]
        elif title == 'Treasure Island':
            treasure_chapters = [
                "1-6", "7-12", "13-15", "16-21", "22-27", "28-34"
            ]
            # Headings are replaced positionally; zip stops at the shorter input.
            for chap, sect_summ in zip(treasure_chapters, sect_summs_old):
                sect_summs_new.append(('Chapter ' + chap, sect_summ[1]))
        elif title == 'Emma':
            for sect_title, summ in sect_summs_old:
                # Convert per-volume roman numerals into absolute chapter numbers.
                if 'Volume 1' in sect_title:
                    offset = 0
                elif 'Volume 2' in sect_title:
                    offset = 18
                else:
                    offset = 36
                last = sect_title.rsplit(' ', 1)[-1]
                if '-' in last:
                    first, last = [
                        roman_to_int(x) + offset for x in last.split('-', 1)
                    ]
                    sect_title = 'Chapter {}-{}'.format(first, last)
                else:
                    sect_title = 'Chapter {}'.format(
                        roman_to_int(last) + offset)
                sect_summs_new.append((sect_title, summ))
        elif title == "War and Peace":
            # Built directly so the generic standardisation below is skipped.
            ssn = [(standardize_sect_title(x[0], False), x[1])
                   for x in sect_summs_old]
            book_summ_new = book_summ._replace(section_summaries=ssn)
        else:
            sect_summs_new = sect_summs_old

        if sect_summs_new:
            sect_summs_new = [(standardize_sect_title(x[0]), x[1])
                              for x in sect_summs_new]
            title_new = standardize_title(title)
            if title_new != title:
                print('renamed {} -> {}'.format(title, title_new))
                title = title_new
            book_summ_new = book_summ._replace(
                section_summaries=sect_summs_new, title=title_new)
        book_summaries_new.append(book_summ_new)

        if start:  # for debugging
            print(title, idx)
            assert title == book_summaries_new[-1].title
            for i, x in enumerate(book_summaries_new[-1].section_summaries, 1):
                print(x[0] or x[1][0][0:100] + ' index ' + str(i))
            input()
    return book_summaries_new
def manual_fix_individual(book_summaries):
    """Apply hand-written, per-(title, source) repairs to scraped summaries.

    Summaries whose title is in ``NON_NOVEL_TITLES`` are dropped, as are
    records whose ``(title, source)`` pair was already emitted and a few
    title/source combinations that are skipped explicitly. For the rest, a
    title-specific rule (often conditioned on ``source`` being
    ``'monkeynotes'`` or ``'barrons'``) rewrites the
    ``(section_title, summary)`` pairs. When a rule produces a non-empty
    section list, section titles are passed through
    ``standardize_sect_title`` and the book title through
    ``standardize_title``.

    Args:
        book_summaries: iterable of records exposing ``title``, ``source``,
            ``section_summaries`` (a list of ``(title, summary)`` pairs) and a
            namedtuple-style ``_replace`` method.

    Returns:
        list: the repaired records, in input order.

    Note:
        Some rules edit ``section_summaries`` in place, so input records may
        be modified by this call.
    """
    # Flip to True (optionally only from a given idx) to step through the
    # results book by book on the console.
    start = False  # to debug
    book_summaries_new = []
    seen = set()
    for idx, book_summ in enumerate(book_summaries):
        sect_summs_new = []
        sect_summs_old = book_summ.section_summaries
        title = book_summ.title
        source = book_summ.source
        # Default to the current record. Without this, a rule that yields no
        # sections would re-append the previous book's result (or raise
        # NameError on the very first book).
        book_summ_new = book_summ

        if (title, source) in seen:
            continue
        if title in NON_NOVEL_TITLES:
            continue

        if ((title in {'Kidnapped', 'Treasure Island', 'The Call of the Wild',
                       'Dr. Jekyll and Mr. Hyde',
                       'The House of the Seven Gables',
                       'The Prince and the Pauper', 'Huckleberry Finn',
                       'Tom Sawyer'} and source == 'monkeynotes')
                or (title == "The Mayor of Casterbridge"
                    and source == 'barrons')):
            sect_summs_new = [(x[0].split(':', 1)[0].strip(), x[1])
                              for x in sect_summs_old]
        elif ((title in {"Oliver Twist", 'Candide',
                         "Alice's Adventures in Wonderland"}
               and source == 'monkeynotes')
              or (title == 'The Scarlet Letter' and source == 'barrons')):
            sect_summs_new = [(x[0].split('-', 1)[0].strip(), x[1])
                              for x in sect_summs_old]
        elif title == "Uncle Tom's Cabin" and source == 'barrons':
            sect_summs_new = [(x[0].split('.', 1)[0], x[1])
                              for x in sect_summs_old]
        elif ((title in {"Emma", "Pride and Prejudice"}
               and source == 'monkeynotes')
              or (title in {"Huckleberry Finn", "Great Expectations"}
                  and source == 'barrons')):
            sect_summs_new = [(x[0].replace('&', '-', 1).strip(), x[1])
                              for x in sect_summs_old]
        elif (title in {'War and Peace', 'The Time Machine'}
              and source == 'monkeynotes'):
            # skip below, as inconsistent sections vs Gutenberg book
            continue
        elif title in {'Anna Karenina', 'Don Quixote', "Crime and Punishment",
                       'Madame Bovary', 'White Fang', 'Jude the Obscure',
                       "Gulliver's Travels", "The Idiot",
                       'Under the Greenwood Tree'}:
            # multipart
            book_count = 0
            if title == 'Madame Bovary' and source == 'barrons':
                assert sect_summs_old[26][0] == 'Chapter 4'
                assert sect_summs_old[31][0] == 'Chapter 10'
                sect_summs_old[26] = ('Chapter 3-4', sect_summs_old[26][1])
                sect_summs_old[31] = ('Chapter 9-10', sect_summs_old[31][1])
            if title == 'The Idiot':
                sect_summs_old[1] = ('Chapter 2-3', sect_summs_old[1][1])
            for chap_title, sect_summ in sect_summs_old:
                if title == 'Crime and Punishment':
                    chap_title = chap_title.replace('PART VI, ', '', 1)
                    # NOTE(review): confirm the 'Part' -> 'Chapter' rewrite is
                    # meant for 'Crime and Punishment' only.
                    if chap_title.startswith('Part'):
                        chap_title = 'Chapter {}'.format(
                            chap_title.split(' ', 1)[-1])
                chap_title = re.sub(RE_CHAPTER_ONLY, 'Chapter', chap_title)
                if not chap_title.startswith("Chapter"):
                    continue
                chap_title, book_count = fix_multipart(chap_title, book_count)
                sect_summs_new.append((chap_title, sect_summ))
        elif title in {'My Antonia', "A Tale of Two Cities",
                       "The War of the Worlds", 'The House of Mirth',
                       'Hard Times'}:
            # multibook
            book_count = 0
            for chap_title, sect_summ in sect_summs_old:
                if chap_title.endswith('CHAPTER I'):  # for barrons
                    chap_title = 'CHAPTER I'
                chap_title = re.sub(RE_CHAPTER_ONLY, 'Chapter', chap_title)
                if not chap_title.startswith("Chapter"):
                    continue
                if title == 'A Tale of Two Cities':
                    chap_title = chap_title.split(':')[0]
                chap_title, book_count = fix_multibook(chap_title, book_count)
                sect_summs_new.append((chap_title, sect_summ))
        elif (title in {'The House of the Seven Gables', 'Walden'}
              and source == 'barrons'):
            # Headings starting with 'P' are kept verbatim; the rest become
            # 'Chapter <text before the first dot>'.
            sect_summs_new = [
                ('Chapter {}'.format(x[0].split('.', 1)[0])
                 if not x[0].startswith('P') else x[0], x[1])
                for x in sect_summs_old
            ]
        elif title == 'Heart of Darkness':
            sect_summs_new = [(x[0].replace('Chapter', 'Part'), x[1])
                              for x in sect_summs_old]
        elif (title == 'The Hound of the Baskervilles'
              and source == 'monkeynotes'):
            assert book_summ.section_summaries[1][0] == 'Chapter Summary'
            book_summ.section_summaries[1] = (
                'Chapter 2', book_summ.section_summaries[2][1])
            book_summ_new = book_summ
        elif title == 'Tess of the D\'Urbervilles' and source == 'barrons':
            sect_summs_new = [
                (x[0].replace('AND', '-')
                 .replace(', 14, - ', ' - ')
                 .replace(', 27, - ', ' - ')
                 .replace(', 57, - ', ' - '), x[1])
                for x in sect_summs_old
            ]
        elif title == 'The Prince':
            sect_summs_new = [(x[0].replace('AND', '-'), x[1])
                              for x in book_summ.section_summaries]
        elif title == 'Ivanhoe':
            # Drop the first entry, then relabel the rest positionally.
            sect_summs_old.pop(0)
            chapters = [
                'Chapter 1', 'Chapter 2', 'Chapter 3', 'Chapter 4',
                'Chapter 5', 'Chapter 6', 'Chapter 7-9', 'Chapter 10',
                'Chapter 11', 'Chapter 12', 'Chapter 13-15', 'Chapter 16-17',
                'Chapter 18-19', 'Chapter 20-21', 'Chapter 22', 'Chapter 23',
                'Chapter 24', 'Chapter 25-27', 'Chapter 28', 'Chapter 29',
                'Chapter 30-31', 'Chapter 32', 'Chapter 33-34', 'Chapter 35',
                'Chapter 37-39', 'Chapter 40-42', 'Chapter 43', 'Chapter 44'
            ]
            sect_summs_new = list(
                zip(chapters, (x[1] for x in sect_summs_old)))
        elif title == 'Winesburg, Ohio':
            for sect_title, sect_summ in sect_summs_old:
                if sect_title == 'Story 13 -':
                    sect_title = 'The Strength of God'
                elif sect_title == 'PART I - SUMMARY':
                    sect_title = 'Godliness Part I'
                elif sect_title == 'PART II - SUMMARY':
                    sect_title = 'Godliness Part II'
                elif sect_title == 'PART III - Surrender':
                    sect_title = 'Godliness Part III'
                elif sect_title == 'PART IV - Terror':
                    sect_title = 'Godliness Part IV'
                else:
                    sect_title = sect_title.split('-', 1)[-1].strip()
                sect_summs_new.append((sect_title, sect_summ))
        elif ((title == 'Silas Marner' and source == 'barrons')
              or (title == 'Looking Backward: 2000-1887'
                  and source == 'monkeynotes')):
            sect_summs_new = [x for x in sect_summs_old
                              if x[0].startswith('C')]
        elif title == 'Turn of the Screw':
            sect_summs_new = [(x[0].replace('SECTION', 'Chapter'), x[1])
                              for x in sect_summs_old]
            sect_summs_new = sect_summs_new[4:]
            assert sect_summs_new[0][0] == 'PROLOGUE'
        elif title == 'The Metamorphosis' and source == 'monkeynotes':
            sect_summs_new = [(x[0].replace('Section', 'Part'), x[1])
                              for x in sect_summs_old]
        elif title == 'Sons and Lovers' and source == 'barrons':
            for sect_title, sect_summ in sect_summs_old:
                sect_title = sect_title.replace('PART TWO - ', '', 1)
                if not sect_title.startswith('CHAPTER'):
                    continue
                sect_summs_new.append((sect_title, sect_summ))
        elif title == 'Moby Dick' and source == 'monkeynotes':
            sect_summs_new = [x for x in sect_summs_old if x[0] != 'Notes']
        elif title == 'Moby Dick' and source == 'barrons':
            chap_nums_re = r'\d+'
            for sect_title, sect_summ in sect_summs_old:
                chaps = re.findall(chap_nums_re, sect_title)
                if sect_title == 'Epilogue':
                    pass
                elif len(chaps) == 1:
                    sect_title = 'Chapter {}'.format(chaps[0])
                else:
                    # Several numbers: keep only the first and last as a range.
                    sect_title = 'Chapter {}-{}'.format(chaps[0], chaps[-1])
                sect_summs_new.append((sect_title, sect_summ))
        elif title == 'Siddhartha' and source == 'monkeynotes':
            for sect_title, sect_summ in sect_summs_old:
                sect_title = sect_title.split(':', 1)[1].strip()
                sect_summs_new.append((sect_title, sect_summ))
        elif title == 'Siddhartha' and source == 'barrons':
            assert len(sect_summs_old) == 1
            continue
        elif title == "Walden":
            sect_summs_new = [(x[0].replace('Chapter1', 'Chapter 1', 1), x[1])
                              for x in sect_summs_old]
        elif title == "Ethan Frome" and source == 'monkeynotes':
            for sect_title, sect_summ in sect_summs_old:
                if sect_title == 'Opening':
                    sect_title = 'Prologue'
                elif sect_title.startswith('Chapter 10'):
                    sect_title = 'Epilogue'
                sect_summs_new.append((sect_title, sect_summ))
        elif title == 'Typee' and source == 'monkeynotes':
            sect_summs_new = [(x[0].replace('Chapter 1Summary', 'Chapter 1', 1),
                               x[1]) for x in sect_summs_old]
        elif title == 'Typee' and source == 'barrons':
            sect_summs_new = [(x[0].replace('PREFACE AND CHAPTERS 1 TO 5',
                                            'Preface to Chapter 5'), x[1])
                              for x in sect_summs_old]
        elif title == "A Connecticut Yankee in King Arthur's Court":
            sect_summs_new = [
                (x[0].split('"', 1)[0].split(':', 1)[0].strip(), x[1])
                for x in sect_summs_old if x[0].startswith('CHAPTER')
            ]
        elif title == 'The Count of Monte Cristo' and source == 'monkeynotes':
            sect_summs_new = [
                (x[0].split(':', 1)[0].split('-', 1)[0].strip(), x[1])
                for x in sect_summs_old
            ]
        elif title == 'The Secret Sharer' and source == 'monkeynotes':
            # Merge the eight scraped sections into the two chapters.
            chap1 = {'Section 1', 'Section 2', 'Section 3', 'Section 4'}
            chap2 = {'Section 5', 'Section 6', 'Section 7', 'Section 8'}
            chap1_text, chap2_text = [], []
            for sect_title, sect_summ in sect_summs_old:
                if sect_title in chap1:
                    chap1_text.extend(sect_summ)
                elif sect_title in chap2:
                    chap2_text.extend(sect_summ)
            sect_summs_new = [('Chapter 1', chap1_text),
                              ('Chapter 2', chap2_text)]
        else:
            sect_summs_new = sect_summs_old

        if sect_summs_new:
            sect_summs_new = [(standardize_sect_title(x[0]), x[1])
                              for x in sect_summs_new]
            title_new = standardize_title(title)
            if title_new != title:
                print('renamed {} -> {}'.format(title, title_new))
                title = title_new
            book_summ_new = book_summ._replace(
                section_summaries=sect_summs_new, title=title_new)
        book_summaries_new.append(book_summ_new)
        # NOTE(review): `title` may already be the standardised title here,
        # while the duplicate check at the top of the loop compares the raw
        # title; confirm that asymmetry is intended.
        seen.add((title, source))

        if start:  # for debugging
            print(title, source, idx)
            assert title == book_summaries_new[-1].title
            for i, x in enumerate(book_summaries_new[-1].section_summaries, 1):
                print(x[0] or x[1][0][0:100] + ' index ' + str(i))
            input()
    return book_summaries_new
def manual_fix_individual(book_summaries):
    """Apply hand-written, per-title repairs to scraped book summaries.

    Summaries whose title is in ``NON_NOVEL_TITLES``, and the title
    'Homecoming', are dropped. For the rest, a title-specific rule rewrites,
    filters or merges the ``(section_title, summary_lines)`` pairs. When a
    rule produces a non-empty section list, section titles are passed through
    ``standardize_sect_title`` and the book title through
    ``standardize_title``.

    Args:
        book_summaries: iterable of records exposing ``title``,
            ``section_summaries`` (a list of ``(title, lines)`` pairs) and a
            namedtuple-style ``_replace`` method.

    Returns:
        list: the repaired records, in input order.

    Note:
        Some rules edit ``section_summaries`` in place, so input records may
        be modified by this call.
    """
    # Flip to True (optionally only from a given idx) to step through the
    # results book by book on the console.
    start = False
    book_summaries_new = []
    for idx, book_summ in enumerate(book_summaries):
        sect_summs_old = book_summ.section_summaries
        sect_summs_new = []
        title = book_summ.title
        # Default to the current record. Without this, a rule that yields no
        # sections would re-append the previous book's result (or raise
        # NameError on the very first book).
        book_summ_new = book_summ

        if title in NON_NOVEL_TITLES:
            continue
        elif title == 'Homecoming':
            # not the same as one in Gutenberg
            continue
        elif title == 'A Christmas Carol':
            sect_summs_new = [(x[0].split(':', 1)[0], x[1])
                              for x in sect_summs_old]
        elif title == 'Bleak House':
            sect_summs_new = [(x[0].split(',', 1)[0], x[1])
                              for x in sect_summs_old]
        elif title == 'David Copperfield':
            sect_summs_new = [(x[0].split('.', 1)[0], x[1])
                              for x in sect_summs_old]
        elif title == 'Moll Flanders':
            sect_summs_new = [(x[0].split(' (', 1)[0], x[1])
                              for x in sect_summs_old]
        elif title in {'The Idiot', "The Good Soldier", "Anna Karenina",
                       'The Return of the Native'}:
            sect_summs_new = [(x[0].replace(',', ':'), x[1])
                              for x in sect_summs_old]
        elif title in {'Crime and Punishment', 'Don Quixote',
                       'Madame Bovary'}:
            # multipart
            book_count = 0
            for chap_title, sect_summ in sect_summs_old:
                if not chap_title.startswith("Chapter"):
                    continue
                chap_title, book_count = fix_multipart(chap_title, book_count)
                sect_summs_new.append((chap_title, sect_summ))
        elif title in {'The Brothers Karamazov', 'Hard Times',
                       'The Mill on the Floss', 'My Ántonia',
                       'Northanger Abbey', 'A Tale of Two Cities',
                       "The House of Mirth"}:
            # multibook
            book_count = 0
            for chap_title, sect_summ in sect_summs_old:
                if not chap_title.startswith("Chapter"):
                    continue
                chap_title, book_count = fix_multibook(chap_title, book_count)
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'Ethan Frome':
            for chap_title, sect_summ in sect_summs_old:
                arr = chap_title.split(' ', 1)
                if len(arr) == 2:
                    # Upper-case everything after the first word.
                    chap_title = '{} {}'.format(arr[0], arr[1].upper())
                elif chap_title == 'Introduction':
                    chap_title = 'Prologue'
                elif chap_title == 'Conclusion':
                    chap_title = 'Epilogue'
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'O Pioneers!':
            sect_summs_new = [(x[0].replace(',', ':'), x[1])
                              for x in sect_summs_old
                              if x[0].startswith('Part')]
        elif title == 'The Age of Innocence':
            sect_summs_new = [
                (x[0].replace('Book One ', '', 1).replace('Book Two ', ''),
                 x[1]) for x in sect_summs_old
            ]
        elif title == 'The Three Musketeers':
            offset = 0
            for chap_title, sect_summ in sect_summs_old:
                if not chap_title.startswith(('Chapter', 'Part')):
                    continue
                if chap_title.startswith('Part II'):
                    offset = 37
                if offset:
                    # From 'Part II' on, shift the chapter range by the offset
                    # so the numbering continues across parts. Raw string: the
                    # non-raw '\d' is an invalid escape.
                    chapter_range = re.search(r'\d+-.+', chap_title)
                    beg, end = chapter_range[0].split('-')
                    beg = int(beg) + offset
                    end = end if end == 'Epilogue' else int(end) + offset
                    chap_title = 'Chapter {}-{}'.format(beg, end)
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'Frankenstein':
            for sect_title, sect_summ in sect_summs_old:
                sect_title = sect_title.replace('Letters', 'Letter', 1)
                if sect_title.startswith("Walton"):
                    sect_title = "Final Letters"
                sect_summs_new.append((sect_title, sect_summ))
        elif title in {'Kidnapped', 'The House of the Seven Gables',
                       "The Scarlet Letter", 'Typee', "Anthem", "Ivanhoe",
                       'The Adventures of Huckleberry Finn',
                       'Maggie: A Girl of the Streets'}:
            # 'Northanger Abbey' used to be listed here too, but it is always
            # caught by the multibook branch above, so the entry was dead.
            sect_summs_new = [
                x for x in sect_summs_old
                if x[0].startswith(('Chapter', 'Preface'))
            ]
        elif title == 'Siddhartha':
            # Rename the first section, keeping its summary. This previously
            # read book_summ[0][1], which indexes the record's first field
            # instead of the first section.
            book_summ.section_summaries[0] = (
                "The Brahmin's Son", book_summ.section_summaries[0][1])
            book_summ_new = book_summ
        elif title == 'Heart of Darkness':
            # Merge consecutive entries that share a heading into one part.
            part = 'Part 1'
            curr_summ = []
            for sect_title, sect_summ in sect_summs_old:
                if sect_title != part:
                    sect_summs_new.append((part, curr_summ))
                    part = sect_title
                    curr_summ = []
                curr_summ.extend(sect_summ)
            if curr_summ:
                sect_summs_new.append((part, curr_summ))
        elif title == 'Jane Eyre':
            sect_summs_new = [(x[0].replace('Summary', 'Chapter 26'), x[1])
                              for x in sect_summs_old]
        elif title == 'Moby-Dick':
            for sect_title, sect_summ in sect_summs_old:
                if sect_title in {'Etymology', 'Extracts'}:
                    continue
                sect_summs_new.append((sect_title, sect_summ))
        elif title == 'This Side of Paradise':
            # NOTE(review): a second, unreachable 'This Side of Paradise'
            # branch (it kept headings starting with 'Chapter' and stripped
            # commas) used to sit further down; it was dead code and has been
            # removed.
            sect_summs_new = [(x[0].split(':', 1)[0].replace(',', ':'), x[1])
                              for x in sect_summs_old
                              if x[0].startswith('Book')]
        elif title == 'A Portrait of the Artist as a Young Man':
            # Merge sub-sections ('Chapter N, ...') into one entry per chapter.
            curr_chap = 'Chapter 1'
            curr_summ = []
            for sect_title, sect_summ in sect_summs_old:
                chap_title = sect_title.split(',', 1)[0]
                if chap_title != curr_chap:
                    sect_summs_new.append((curr_chap, curr_summ))
                    curr_chap = chap_title
                    curr_summ = []
                curr_summ.extend(sect_summ)
            if curr_summ and curr_chap.startswith('Chapter'):
                sect_summs_new.append((curr_chap, curr_summ))
        elif title == "Dubliners":
            sect_summs_new = [(x[0].replace('“', '').replace('”', ''), x[1])
                              for x in sect_summs_old]
        elif title == "Jude the Obscure":
            sect_summs_new = [(x[0].split(':', 1)[0], x[1])
                              for x in sect_summs_old
                              if x[0].startswith('Part')]
        elif title == "Lord Jim":
            sect_summs_new = [(x[0].replace('and', '-'), x[1])
                              for x in sect_summs_old]
        elif title == "Middlemarch":
            sect_summs_new = [(x[0].split(': ', 1)[1], x[1])
                              for x in sect_summs_old]
        elif title in {"Sense and Sensibility", "Dracula"}:
            # 'Don Quixote' used to be listed here too, but it is always
            # caught by the multipart branch above, so the entry was dead.
            sect_summs_new = [(x[0], x[1]) for x in sect_summs_old
                              if x[0].startswith('Chapter')]
        elif title == 'The Time Machine':
            sect_summs_new = [(x[0].replace('and', '-'), x[1])
                              for x in sect_summs_old]
            sect_summs_new[-1] = ('Chapter 11 - Epilogue',
                                  sect_summs_new[-1][1])
        elif title == 'Ulysses':
            sect_summs_new = [
                (x[0].replace('Episode', 'Chapter').split(':', 1)[0], x[1])
                for x in sect_summs_old
            ]
        elif title == 'Walden':
            sect_summs_new = [('Chapter {}'.format(i), x[1])
                              for i, x in enumerate(sect_summs_old, 1)]
        elif title == 'Winesburg, Ohio':
            for sect_title, sect_summ in sect_summs_old:
                sect_title = sect_title.replace('"', '')
                if sect_title == "Godliness, Parts I-II":
                    sect_title = "Godliness Part I, Godliness Part II"
                elif sect_title.startswith("Godliness, Parts III"):
                    sect_title = "Godliness Part III, Godliness Part IV, A Man of Ideas"
                elif sect_title.startswith("Analytical"):
                    continue
                sect_summs_new.append((sect_title, sect_summ))
        elif title == "White Fang":
            sect_summs_new = [
                (x[0].replace(',', ':').replace('and', '-', 1), x[1])
                for x in sect_summs_old
            ]
        elif title == 'The Picture of Dorian Gray':
            sect_summs_new = sect_summs_old
            sect_summs_new[0] = ('Preface', sect_summs_new[0][1])
        elif title == "War and Peace":
            # Built directly so the generic standardisation below is skipped.
            ssn = [(standardize_sect_title(x[0].replace(',', ':'), False), x[1])
                   for x in sect_summs_old]
            book_summ_new = book_summ._replace(section_summaries=ssn)
        else:
            sect_summs_new = sect_summs_old

        if sect_summs_new:
            sect_summs_new = [(standardize_sect_title(x[0]), x[1])
                              for x in sect_summs_new]
            title_new = standardize_title(title)
            if title_new != title:
                print('renamed {} -> {}'.format(title, title_new))
                title = title_new
            book_summ_new = book_summ._replace(
                section_summaries=sect_summs_new, title=title_new)
        book_summaries_new.append(book_summ_new)

        if start:  # for debugging
            print(title, idx)
            assert title == book_summaries_new[-1].title
            for i, x in enumerate(book_summaries_new[-1].section_summaries, 1):
                print(x[0] or x[1][0][0:100] + ' index ' + str(i))
            input()
    return book_summaries_new
def manual_fix_individual(book_summaries):
    """Apply per-title manual repairs to scraped book summaries.

    Note we do not manually fix the plays, since we do not use them in the
    literature dataset.

    Each record must expose ``title``, ``section_summaries`` and
    ``_replace`` (namedtuple-style).  ``section_summaries`` is treated as a
    list of ``(section_title, summary)`` pairs, where ``summary`` is indexed,
    sliced and extended like a list of paragraph strings.

    Titles in ``NON_NOVEL_TITLES`` (and a few explicitly skipped titles) are
    dropped.  For every other book the section titles are repaired by a
    title-specific branch, then passed through ``standardize_sect_title`` and
    the book title through ``standardize_title``.  'War and Peace' builds its
    record directly, calling ``standardize_sect_title`` with an extra
    ``False`` argument and leaving the book title untouched.

    The input records and their lists are not modified.  A book that ends up
    with no sections is skipped.

    Args:
        book_summaries: iterable of book summary records.

    Returns:
        A new list with the repaired records, in input order.

    Raises:
        AssertionError: if a title-specific sanity check on the scraped
            layout fails.
    """

    def remove_duplicates(sect_summs_old):
        """Keep only the first section seen for each section title."""
        seen = set()
        sect_summs = []
        for sect_title, sect_summ in sect_summs_old:
            if sect_title in seen:
                continue
            sect_summs.append((sect_title, sect_summ))
            seen.add(sect_title)
        return sect_summs

    def add_dash_numwords(sect_summs_old):
        """Hyphenate spelled-out numbers and collapse titles to a range.

        A title with more than two words becomes
        '<word 1> <word 2> - <last word>', otherwise '<word 1> <word 2>'.
        """
        sect_summs_new = []
        tens = ['Twenty', 'Thirty', 'Forty', 'Fifty', 'Sixty']
        for sect_title, sect_summ in sect_summs_old:
            sect_title = sect_title.replace('Sixty Two',
                                            'Sixty Two and Sixty Three')
            for t in tens:
                sect_title = sect_title.replace(t + ' ', t + '-')
            sect_title = sect_title.replace(',', '')
            words = sect_title.split()
            if len(words) > 2:
                sect_title = '{} {} - {}'.format(words[0], words[1],
                                                 words[-1])
            else:
                sect_title = '{} {}'.format(words[0], words[1])
            sect_summs_new.append((sect_title, sect_summ))
        return sect_summs_new

    def greenwood_fix(sect_summs_old):
        """Split the non-chapter sections into per-'Summary' sections."""
        sect_summs_new = []
        sect_summs_old = remove_duplicates(sect_summs_old)
        for sect_title, sect_summ in sect_summs_old:
            if sect_title.startswith('C'):
                sect_title = get_first_last_chapter(sect_title)
                sect_summs_new.append((sect_title, sect_summ))
            else:  # Part Two
                # Lines between a 'Summary...' line and the next
                # 'Analysis...' line form one section; the 'Summary...' line
                # itself supplies the section title.
                ss, st = [], ''
                curr_summ = []
                write = False
                for line in sect_summ:
                    if line.startswith('Analysis'):
                        ss.append((st, curr_summ))
                        curr_summ = []
                        write = False
                    elif line.startswith('Summary'):
                        st = get_first_last_chapter(line)
                        write = True
                    elif write:
                        curr_summ.append(line)
                sect_summs_new.extend(ss)
        return sect_summs_new

    def ambass_fix(sect_summs_old):
        """Renumber deduplicated sections with a fixed chapter sequence."""
        sect_summs_old = remove_duplicates(sect_summs_old)
        # The number restarts at 1 wherever a new book begins.
        numbers = [
            1, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1,
            2, 3, 1, 2, 3, 1, 2, 3, 4
        ]
        sect_summs_new = [('Chapter {}'.format(num), summ)
                          for num, (_, summ) in zip(numbers, sect_summs_old)]
        return sect_summs_new

    def mirth_fix(sect_summs_old):
        """Drop the prefix before ' – ' and turn lists into dash ranges."""
        sect_summs_new = []
        for sect_title, sect_summ in sect_summs_old:
            sect_title = sect_title.split(' – ', 1)[-1]
            sect_title = sect_title.replace('6,7,8', '6-8').replace(',', '-')
            sect_title = sect_title.replace('and', '-')
            sect_summs_new.append((sect_title, sect_summ))
        return sect_summs_new

    def bovary_fix(sect_summs_old):
        """Merge chapters 8 and 9 of the last five sections and renumber."""
        sect_summs_old = deepcopy(sect_summs_old)
        sect_summs_new = sect_summs_old[0:-5]
        chap_8 = []
        for chap_title, chap_summ in sect_summs_old[-5:]:
            if chap_title.endswith('8'):
                chap_8.extend(chap_summ)
            elif chap_title.endswith('9'):
                chap_8.extend(chap_summ)
                sect_summs_new.append(('Chapter 8', chap_8))
            else:
                # Every other chapter in the tail shifts down by one.
                orig = int(chap_title.rsplit(' ', 1)[-1])
                sect_summs_new.append(
                    ('Chapter {}'.format(orig - 1), chap_summ))
        return sect_summs_new

    # Set to True (optionally only from a chosen idx inside the loop) to
    # print each repaired book and wait for <Enter> before continuing.
    start = False  # True to debug
    book_summaries_new = []
    for idx, book_summ in enumerate(book_summaries):
        sect_summs_new = []
        sect_summs_old = book_summ.section_summaries
        title = book_summ.title
        # Reset per book so a book that yields no sections can never re-use
        # the record built for the previous book.
        book_summ_new = None

        if title in NON_NOVEL_TITLES:
            continue
        elif title == 'Don Quixote':
            # not the same chapter numbering as Gutenberg
            continue
        elif title in {
                'My Antonia', "The House of Mirth", 'The Ambassadors',
                'War of the Worlds', 'Hard Times'
        }:  # multibook
            book_count = 0
            if title == 'The House of Mirth':
                sect_summs_old = mirth_fix(sect_summs_old)
            elif title == 'The Ambassadors':
                sect_summs_old = ambass_fix(sect_summs_old)
            elif title == 'War of the Worlds':
                sect_summs_old = [('Chapter {}'.format(x[0].split('.', 1)[0]),
                                   x[1])
                                  for x in sect_summs_old
                                  if '.' in x[0]]
            elif title == 'My Antonia':
                assert sect_summs_old[3][0] == 'Part IV'
                assert sect_summs_old[4][0] == 'Part I'
                sect_summs_old = sect_summs_old[4:]
            for chap_title, sect_summ in sect_summs_old:
                chap_title = chap_title.replace("Part", 'Chapter')
                if not chap_title.startswith("Chapter"):
                    continue
                chap_title, book_count = fix_multibook(chap_title, book_count)
                sect_summs_new.append((chap_title, sect_summ))
        elif title in {
                "Madame Bovary", "Gulliver's Travels",
                "Under the Greenwood Tree"
        }:  # multipart
            # NOTE: "Gulliver's Travels" is handled here.  A later dedicated
            # branch for the same title could never be reached and has been
            # removed.
            book_count = 0
            if title == "Under the Greenwood Tree":
                sect_summs_old = greenwood_fix(sect_summs_old)
            elif title == 'Madame Bovary':
                sect_summs_old = bovary_fix(sect_summs_old)
            for chap_title, sect_summ in sect_summs_old:
                if not chap_title.startswith("Chapter"):
                    continue
                chap_title, book_count = fix_multipart(chap_title, book_count)
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'Crime and Punishment':
            sect_summs_new = [(chap.replace(',', ':', 1), summ)
                              for chap, summ in sect_summs_old]
        elif title in {'Treasure Island', 'Kidnapped'}:
            sect_summs_old = remove_duplicates(sect_summs_old)
            sect_summs_new = [
                x for x in sect_summs_old if x[0].startswith('Chapter')
            ]
        elif title in {
                'Main Street', 'The Scarlet Letter',
                'The Beast in the Jungle', "The Age of Innocence",
                "The Call of the Wild", 'Ivanhoe'
        }:
            sect_summs_new = remove_duplicates(sect_summs_old)
        elif title == 'Great Expectations':
            sect_summs_new = [('Chapter {}'.format(i), summ)
                              for i, (_, summ) in enumerate(sect_summs_old, 1)]
        elif title == 'Babbitt':
            sect_summs_old = remove_duplicates(sect_summs_old)
            for sect_title, sect_summ in sect_summs_old:
                sect_title = sect_title.replace(',', '')
                # Raw string: '\d' in a plain literal is an invalid escape.
                nums = re.findall(r'\d+', sect_title)
                sect_title = 'Chapter {}-{}'.format(nums[0], nums[-1])
                sect_summs_new.append((sect_title, sect_summ))
        elif title == 'Adam Bede':
            for sect_title, sect_summ in sect_summs_old:
                if sect_summ[0].startswith('George Eliot, Adam Bede. Edited'):
                    continue
                sect_summs_new.append((sect_title, sect_summ))
            # The first two remaining sections are both 'Chapter 1'; keep
            # only the second.
            assert sect_summs_new[0][0] == sect_summs_new[1][0] == 'Chapter 1'
            sect_summs_new = sect_summs_new[1:]
        elif title == 'Dracula':
            assert sect_summs_old[0][0] == 'Summary'
            sect_summs_new = sect_summs_old[1:]
        elif title == 'Lord Jim':
            chapters = [
                '1 - 2', '3 - 5', '6 - 8', '9 - 11', '12 - 13', '14 - 16',
                '17 - 18', '19 - 21', '22 - 23', '24 - 26', '27 - 29',
                '30 - 32', '33 - 35', '36 - 37', '38 - 40', '41 - 43',
                '44 - 45'
            ]
            chapters = ['Chapter {}'.format(x) for x in chapters]
            # Scraped titles are replaced positionally by the ranges above.
            sect_summs_new = [(chap, old_sect[1])
                              for chap, old_sect in zip(chapters,
                                                        sect_summs_old)]
        elif title == 'Ethan Frome':
            # Copy first so the caller's section list is not edited in place.
            sect_summs_new = list(sect_summs_old)
            sect_summs_new[0] = ('Prologue', sect_summs_new[0][1])
            sect_summs_new[-1] = ('Epilogue', sect_summs_new[-1][1])
        elif title == "A Connecticut Yankee in King Arthur's Court":
            for sect_title, sect_summ in sect_summs_old:
                if not sect_title.startswith("Chapter"):
                    continue
                sect_summs_new.append((sect_title, sect_summ))
            sect_summs_new[-1] = ('Chapter 36-45', sect_summs_new[-1][1])
        elif title == 'The Adventures of Tom Sawyer':
            # chapter 16 is split into 16 and 17
            sect_summs_old = deepcopy(sect_summs_old)
            sect_summs_new = sect_summs_old[0:15]
            chap_16 = []
            for chap_title, chap_summ in sect_summs_old[15:]:
                if chap_title.endswith('16'):
                    chap_16.extend(chap_summ)
                elif chap_title.endswith('17'):
                    chap_16.extend(chap_summ)
                    sect_summs_new.append(('Chapter 16', chap_16))
                else:
                    orig = int(chap_title.rsplit(' ', 1)[-1])
                    sect_summs_new.append(
                        ('Chapter {}'.format(orig - 1), chap_summ))
        elif title == "Tess of the d'Urbervilles":
            # Everything before the first 'Phase...' heading is skipped, as
            # are the headings themselves.
            phase_found = False
            for sect_title, sect_summ in sect_summs_old:
                if sect_title.startswith('Phase'):
                    phase_found = True
                    continue
                if not phase_found:
                    continue
                if sect_title == 'Chapters I–XI':
                    continue
                sect_summs_new.append((sect_title, sect_summ))
        elif title == 'A Portrait of the Artist as a Young Man':
            sect_summs_old = remove_duplicates(sect_summs_old)
            # Consecutive 'Part' sections are merged into a single
            # 'Chapter 1', emitted when the next non-'Part' section arrives.
            chap1_summ = []
            for sect_title, sect_summ in sect_summs_old:
                if "Part" in sect_title:
                    chap1_summ.extend(sect_summ)
                    continue
                elif chap1_summ:
                    sect_summs_new.append(('Chapter 1', chap1_summ))
                    chap1_summ = []
                sect_summs_new.append((sect_title, sect_summ))
        elif title == 'Of Human Bondage':
            sect_summs_old = remove_duplicates(sect_summs_old)
            sect_summs_new = [(chap.replace(' and ', '-'), summ)
                              for chap, summ in sect_summs_old]
        elif title == 'The Secret Sharer':
            sect_summs_new = [(x[0].replace('Part', 'Chapter'), x[1])
                              for x in sect_summs_old
                              if x[0].startswith('Part')]
        elif title == "Moby Dick":
            # TODO: scrape with less manual fixing
            for sect_title, sect_summs in sect_summs_old:
                summs_new = []
                sect_title = sect_title.replace('\xa0', '').replace(
                    ' and ', ' - ').split(", “", 1)[0].split(",“",
                                                             1)[0].strip()
                if sect_title.startswith('hapter'):
                    sect_title = 'C' + sect_title
                elif sect_title == 'Chatper 39':
                    sect_title = 'Chapter 39'
                elif (sect_title == 'Chapter 50'
                      and sect_summs_new[-1][0] == 'Chapter 50'):
                    # A second consecutive 'Chapter 50' is really chapter 51.
                    sect_title = 'Chapter 51'
                elif (sect_title == 'Chapter 72'
                      and sect_summs_new[-1][0] == 'Chapter 72'):
                    sect_title = 'Chapter 73'
                elif sect_title.startswith('Chapters 95'):
                    sect_title = 'Chapters 95-98'
                elif sect_title.startswith('Chapters 101'):
                    sect_title = 'Chapters 101-105'
                elif sect_title.startswith('Chapters 120'):
                    sect_title = 'Chapters 120-124'
                elif sect_title == 'Chapters 10, 11, - 12':
                    continue
                elif sect_title.startswith('Chapters 26 - 27'):
                    sect_title = 'Chapters 26-27'
                elif sect_title == 'The Epilogue':
                    sect_title = 'Epilogue'
                # Keep paragraphs up to the first 'Analysis...' paragraph,
                # dropping bare 'Summary' markers.
                for p in sect_summs:
                    if p == 'Summary':
                        continue
                    elif p.startswith('Analysis'):
                        break
                    else:
                        summs_new.append(p)
                if not sect_title.startswith(('C', 'Epilogue')):
                    continue
                sect_summs_new.append((sect_title, summs_new))
            sect_summs_new = remove_duplicates(sect_summs_new)
        elif title == "Siddhartha":
            for sect_title, sect_summ in sect_summs_old:
                if '-' in sect_title:
                    sect_title = sect_title.split('-', 1)[-1].strip()
                else:
                    # No dash: the first summary entry is used as the title.
                    sect_title, sect_summ = sect_summ[0], sect_summ[1:]
                sect_summs_new.append((sect_title, sect_summ))
        elif title == 'Sense and Sensibility':
            sect_summs_old = sect_summs_old[0:11] + sect_summs_old[21:]
            offset = 0
            for i, (sect_title, sect_summ) in enumerate(sect_summs_old, 1):
                if sect_title == 'Chapter XIII':
                    # chapter 12 is missing
                    offset = 1
                sect_title = 'Chapter {}'.format(i + offset)
                sect_summs_new.append((sect_title, sect_summ))
        elif title == 'White Fang':
            for sect_title, sect_summ in sect_summs_old:
                # Titles start with '<part>.<chapter>' before the first space.
                nums = sect_title.split(' ', 1)[0]
                part, chapter = nums.split('.', 1)
                sect_title = 'Part {}: Chapter {}'.format(part, chapter)
                sect_summs_new.append((sect_title, sect_summ))
        elif title == 'Bleak House':
            sect_summs_new = remove_duplicates(sect_summs_old)
            assert sect_summs_new[0][0] == 'Author’s Preface'
            sect_summs_new[0] = ('Preface', sect_summs_new[0][1])
            assert sect_summs_new[19][0] == 'Chapter XIX'
            # The entry after 'Chapter XIX' holds that chapter's text with
            # its leading 'I' missing.  Copy before patching so the caller's
            # paragraph list is left untouched.
            text = list(sect_summs_new[20][1])
            text[0] = 'I' + text[0]
            sect_summs_new[19] = (sect_summs_new[19][0], text)
            del sect_summs_new[20]
        elif title == 'Notes from the Underground':
            sect_summs_new = [(x[0].replace(' C', ': C'), x[1])
                              for x in sect_summs_old]
        elif title == "Middlemarch":
            sect_summs_new = remove_duplicates(sect_summs_old)
            sect_summs_new = [(chap.split('(', 1)[0].strip(), summ)
                              for chap, summ in sect_summs_new]
        elif title == "Walden":
            sect_summs_new = remove_duplicates(sect_summs_old)
            sect_summs_new = [(chap.split('‘', 1)[0].strip(), summ)
                              for chap, summ in sect_summs_new]
            sect_summs_new[-1] = ('Chapter 17-18', sect_summs_new[-1][1])
        elif title == 'A Tale of Two Cities':
            sect_summs_new = []
            for sect_title, sect_summ in sect_summs_old:
                sect_title = re.sub(r' ?C', ': C', sect_title)
                sect_summs_new.append((sect_title, sect_summ))
            sect_summs_new = remove_duplicates(sect_summs_new)
        elif title == 'A Christmas Carol':
            sect_summs_new = remove_duplicates(sect_summs_old)
            sect_summs_new = [(chap.split(':', 1)[0], summ)
                              for chap, summ in sect_summs_new
                              if not chap.startswith('Stave 1')]
        elif title == 'The Awakening':
            sect_summs_new = [(chap.replace('Part', 'Chapter', 1), summ)
                              for chap, summ in sect_summs_old]
        elif title == 'Around the World in Eighty Days':
            sect_summs_old = remove_duplicates(sect_summs_old)
            sect_summs_new = [(chap.split(':', 1)[0], summ)
                              for chap, summ in sect_summs_old
                              if chap.startswith('Chapter')]
            assert sect_summs_new[1][0] == 'Chapter 1'
            del sect_summs_new[1]
        elif title == "Fathers and Sons":
            sect_summs_old = remove_duplicates(sect_summs_old)
            for sect_title, sect_summ in sect_summs_old:
                if sect_title.endswith('Analysis'):
                    continue
                elif sect_title == 'Chapter 16':
                    # this one is analysis
                    continue
                elif sect_title == 'Chapters 16':
                    sect_title = 'Chapter 16'
                sect_summs_new.append((sect_title, sect_summ))
        elif title == "The Yellow Wallpaper":
            # Flatten every section's summary into one section named 'book'.
            all_sects = [x[1] for x in book_summ.section_summaries]
            all_sects = [para for summ in all_sects for para in summ]
            sect_summs_new = [('book', all_sects)]
        elif title == 'Anna Karenina':
            sect_summs_new = [(chap.replace(' section', ': Chapter'), summ)
                              for chap, summ in sect_summs_old]
        elif title == 'The Metamorphosis':
            sect_summs_new = [(chap.replace('Section', 'Part'), summ)
                              for chap, summ in sect_summs_old]
        elif title in {
                'Vanity Fair', 'Mansfield Park', 'Washington Square',
                'The Deerslayer'
        }:
            sect_summs_new = add_dash_numwords(sect_summs_old)
            if title == 'The Deerslayer':
                assert sect_summs_new[0][0] == sect_summs_new[1][0]
                sect_summs_new.pop(0)
                assert sect_summs_new[-6][
                    0] == 'Chapters Twenty-and - Twenty-One'
                sect_summs_new[-6] = ('Chapter 20-21', sect_summs_new[-6][1])
        elif title == "The Jungle":
            sect_summs_new = [(chap.replace('Twenty ', 'Twenty-'), summ)
                              for chap, summ in sect_summs_old]
        elif title == 'The Mayor of Casterbridge':
            sect_summs_new = add_dash_numwords(sect_summs_old)
            assert sect_summs_new[4][0] == 'Twelve Thirteen - Fourteen'
            sect_summs_new[4] = ('Chapter 12-14', sect_summs_new[4][1])
        elif title == 'Persuasion':
            assert sect_summs_old[0][0].startswith('Volume')
            sect_summs_old = sect_summs_old[1:]
            sect_summs_old = sorted(
                sect_summs_old,
                key=lambda x: int(x[0].rsplit('-', 1)[-1]))  # sort by page number
            offset = 0
            for sect_title, sect_summ in sect_summs_old:
                if sect_title == "Chapter I, pages 115-122":
                    # From this section on, chapter numbers restart at I and
                    # are shifted by 12.
                    offset = 12
                sect_title = sect_title.split(',', 1)[0]
                if offset:
                    chap = roman_to_int(sect_title.rsplit(' ', 1)[-1])
                    sect_title = 'Chapter {}'.format(chap + offset)
                sect_summs_new.append((sect_title, sect_summ))
        elif title == 'Far from the Madding Crowd':
            sect_summs_new = [
                (x[0].replace('Ch.', 'Chapter').split(':', 1)[0], x[1])
                for x in sect_summs_old
            ]
        elif title == 'The Turn of the Screw':
            # novelguide has 2 books with same title, use the other one
            continue
        elif title == 'Turn of the Screw':
            sect_summs_new = [(x[0].replace('Section', 'Chapter'), x[1])
                              for x in sect_summs_old]
        elif title == "The Adventures of Huckleberry Finn":
            for sect_title, sect_summ in sect_summs_old:
                if sect_title == 'Chapter 1-3':
                    # The first paragraph packs several
                    # 'Chapter <n>: <text>' pieces; split them apart.
                    # NOTE(review): each split-out summary is a plain string,
                    # whereas other sections carry a list -- confirm that
                    # downstream consumers accept both.
                    sects = sect_summ[0].split("Chapter")
                    for sect in sects:
                        if not sect:
                            continue
                        st, ss = sect.split(':', 1)
                        sect_summs_new.append(
                            ('Chapter {}'.format(st.strip()), ss))
                else:
                    sect_summs_new.append((sect_title, sect_summ))
        elif title == "The Picture of Dorian Gray":
            # Copy first so the caller's section list is not edited in place.
            sect_summs_new = list(sect_summs_old)
            sect_summs_new[0] = ('Chapters 1-3', sect_summs_new[0][1])
        elif title == 'The Scarlet Pimpernel':
            # Copy first: the pop() below would otherwise shorten the
            # caller's list on every call.
            sect_summs_new = list(sect_summs_old)
            sect_summs_new[3] = ('Chapter III', sect_summs_new[3][1])
            sect_summs_new[7] = ('Chapter VII', sect_summs_new[7][1])
            sect_summs_new.pop(0)
        elif title == 'Jude the Obscure':
            for sect_title, sect_summ in sect_summs_old:
                if not sect_summ:
                    continue
                if sect_title == 'At Marygreen':
                    sect_title = 'I–1'
                elif sect_title == 'At Melchester':
                    sect_title = 'III–1'
                elif sect_title == 'At Christminster Again':
                    sect_title = 'VI–1'
                part_, chap = sect_title.split('–', 1)
                if chap == '1':
                    # the roman numerals are inaccurate on the original
                    # pages, so the part number is only taken from each
                    # part's first chapter and carried forward
                    part = part_
                sect_title = 'Part {}: Chapter {}'.format(part, chap)
                sect_summs_new.append((sect_title, sect_summ))
        elif title == 'Ulysses':
            sect_summs_new = [('Chapter {}'.format(x[0].rsplit(' ', 1)[-1]),
                               x[1])
                              for x in sect_summs_old]
        elif title == 'The American':
            sect_summs_new = [(x[0].replace('Book', 'Chapter'), x[1])
                              for x in sect_summs_old]
        elif title == 'The Brothers Karamazov':
            # Each 'Chapter 1' starts a new book.
            book = 0
            for sect_title, sect_summ in sect_summs_old:
                if not sect_title.startswith('Chapter'):
                    continue
                if sect_title == 'Chapter 1':
                    book += 1
                sect_title = "Book {}: {}".format(book, sect_title)
                sect_summs_new.append((sect_title, sect_summ))
        elif title == "Winesburg, Ohio":
            sect_summs_new = [
                (x[0].replace('&', ',').replace('VI', 'IV').replace(
                    'Godliness', 'Godliness Part'), x[1])
                for x in sect_summs_old
            ]
        elif title == "War and Peace":
            # Built directly: section titles get the extra False argument and
            # must not go through the generic standardization below.
            ssn = [(standardize_sect_title(x[0], False), x[1])
                   for x in sect_summs_old]
            book_summ_new = book_summ._replace(section_summaries=ssn)
        elif title == 'The Hound of the Baskervilles':
            for sect_title, sect_summ in sect_summs_old:
                if ' - ' in sect_title:
                    sect_title = sect_title.split(' - ')[0]
                elif (sect_title == 'Chapter 10'
                      and sect_summs_new[-1][0] == 'Chapter 15'):
                    continue
                elif not sect_title.startswith('C'):
                    continue
                sect_summs_new.append((sect_title, sect_summ))
        else:
            sect_summs_new = sect_summs_old

        if sect_summs_new:
            sect_summs_new = [(standardize_sect_title(x[0]), x[1])
                              for x in sect_summs_new]
            title_new = standardize_title(title)
            if title_new != title:
                print('renamed {} -> {}'.format(title, title_new))
                title = title_new
            book_summ_new = book_summ._replace(
                section_summaries=sect_summs_new, title=title_new)

        if book_summ_new is None:
            # No sections survived the fixes; there is nothing to emit.
            continue
        book_summaries_new.append(book_summ_new)

        if start:  # for debugging
            print(title, idx)
            assert title == book_summaries_new[-1].title
            for i, x in enumerate(book_summaries_new[-1].section_summaries,
                                  1):
                print(x[0] or x[1][0][0:100] + ' index ' + str(i))
            input()
    return book_summaries_new