Python standardize_title Exemples, scrape_lib.standardize_title Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : bookwolf_scrape.py Projet : manestay/novel-chapter-dataset

def manual_fix(book_summaries):
    """Normalize book titles and section headings for every book summary.

    Each book title is passed through ``standardize_title`` (a rename is
    reported on stdout). Each section heading gets a fixed series of textual
    substitutions and is stripped of surrounding whitespace; the section
    summaries themselves are carried over untouched.

    Args:
        book_summaries: iterable of records exposing ``title``,
            ``section_summaries`` (pairs of heading and summary) and
            ``_replace``.

    Returns:
        A new list of records with the cleaned title and section headings.
    """
    fixed_books = []
    for book in book_summaries:
        old_title = book.title
        new_title = standardize_title(old_title)
        if new_title != old_title:
            print('renamed {} -> {}'.format(old_title, new_title))

        fixed_sections = []
        for heading, summary in book.section_summaries:
            # Plural headings and '&' ranges are rewritten to one uniform form.
            heading = heading.replace('Chapters', 'Chapter')
            heading = heading.replace('Letters', 'Letter ')
            heading = heading.replace('&', '-')
            # Expand the abbreviated "Chap <n>" form.
            if re.match(r'Chap \d+', heading):
                heading = heading.replace('Chap', 'Chapter')
            # Headings matched by RE_CHAPTER_NOSPACE get a space after "Chapter".
            if re.search(RE_CHAPTER_NOSPACE, heading):
                heading = re.sub('Chapter', 'Chapter ', heading)
            fixed_sections.append((heading.strip(), summary))

        fixed_books.append(
            book._replace(section_summaries=fixed_sections, title=new_title))
    return fixed_books

Exemple #2

0

Afficher le fichier

def manual_fix_individual(book_summaries):
    """Apply hand-written, per-title clean-ups to scraped section summaries.

    Note we do not manually fix the plays, since we do not use them in the
    literature dataset.

    Every book is expected to expose ``title``, ``section_summaries`` (a list
    of ``(section_title, summary_lines)`` pairs) and ``_replace``. Books whose
    title is in ``NON_NOVEL_TITLES`` are dropped. For the remaining books the
    branch matching the title rebuilds the section list; a non-empty rebuilt
    list is then passed through ``standardize_sect_title`` and the title
    through ``standardize_title``. A few branches (Ethan Frome, What Maisie
    Knew, The Phantom of the Opera) deliberately bypass that standardization
    step and only patch individual section titles.

    The input records and their section lists are not modified in place, so
    calling the function twice on the same data gives the same result.

    Args:
        book_summaries: iterable of book summary records.

    Returns:
        A new list with one cleaned record per retained book, in input order.
        A book whose rebuilt section list is empty is returned unchanged.

    Raises:
        AssertionError: if the scraped layout that a title-specific fix relies
            on is not what the fix expects.
    """
    def fix_north(title):
        return title.replace(',', ':', 1).replace('Vol.', 'Book', 1).replace('Volume', 'Book', 1) \
                    .replace('of ', '').replace('Chaper', 'Chapter')

    start = False  # set via the commented lines below to step through books
    book_summaries_new = []
    for idx, book_summ in enumerate(book_summaries):
        sect_summs_new = []
        sect_summs_old = book_summ.section_summaries
        title = book_summ.title
        # Per-book default. Without it, a book that ends up with no sections
        # would re-append the previous book's record (or raise NameError on
        # the very first book).
        book_summ_new = book_summ
        # if idx == 125:
        #     start = True
        if title in NON_NOVEL_TITLES:
            continue
        elif title in {
                "Connecticut Yankee in King Arthur's Court", "Little Women",
                "Walden"
        }:
            # Keep only the first two words of each non-empty title.
            sect_summs_new = [(' '.join(chap_title.split(' ', 2)[0:2]),
                               sect_summ)
                              for chap_title, sect_summ in sect_summs_old
                              if chap_title]
        elif title in {
                'Germinal', "Little Dorrit", "Our Mutual Friend",
                "The War of the Worlds"
        }:
            sect_summs_new = [(x[0].replace(',', ':', 1), x[1])
                              for x in sect_summs_old]
        elif title == 'The Adventures of Huckleberry Finn':
            sect_summs_new = [x for x in sect_summs_old if x[0]]
        elif title == 'The Age of Innocence':
            for chap_title, sect_summ in sect_summs_old:
                arr = chap_title.split(':', 1)
                if len(arr) == 2:
                    # Text after the colon belongs to the summary.
                    chap_title = clean_title(arr[0])
                    sect_summ = [arr[1].strip()] + sect_summ
                if not chap_title.startswith('Chapter'):
                    continue
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'Alice in Wonderland':
            for i, (chap_title, sect_summ) in enumerate(sect_summs_old, 1):
                if not chap_title:
                    chap_title = 'Chapter {}'.format(i)
                else:
                    chap_title = chap_title.split(':', 1)[0]
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'The Ambassadors':
            book_idx = 'O'
            for chap_title, sect_summ in sect_summs_old:
                if chap_title.startswith('Volume'):
                    continue
                if chap_title.startswith('Book'):
                    # A "Book ..." row only sets the prefix for later rows.
                    book_idx = chap_title.split(' ', 1)[-1]
                    continue
                sect_idx = chap_title.split(' ', 1)[-1]
                chap_title = 'Book {}: Chapter {}'.format(book_idx, sect_idx)
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'Black Beauty':
            sect_summs_new = [(chap_title.split(', ', 1)[-1], sect_summ)
                              for chap_title, sect_summ in sect_summs_old]
        elif title == 'Bleak House':
            for chap_title, sect_summ in sect_summs_old:
                if not chap_title:
                    prev_title = sect_summs_new[-1][0]
                    if prev_title == 'Chapters 60-63':
                        chap_title = 'Chapter 64-67'
                elif not chap_title.startswith('Chapter'):
                    continue
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'The Count of Monte Cristo':
            sect_summs_new = [
                x for x in sect_summs_old
                if not x[0].startswith('The book has')
            ]
        elif title == 'Emma':
            for chap_title, sect_summ in sect_summs_old:
                if chap_title.startswith('Chapter Eighteen:'):
                    # The text after the colon is summary text. Keep the
                    # summary a list of lines like every other section (the
                    # original replaced it with a bare string).
                    chap_title, lead_text = chap_title.split(':', 1)
                    sect_summ = [lead_text.strip()] + sect_summ
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'Ethan Frome':
            assert sect_summs_old[0][0] == sect_summs_old[1][0] == ''
            # Work on a copy so the caller's list is left untouched.
            patched = list(sect_summs_old)
            patched[0] = ('Prologue', patched[1][1])
            patched[-1] = ('Epilogue', patched[-1][1])
            del patched[1]
            # sect_summs_new stays empty, so standardization is skipped.
            book_summ_new = book_summ._replace(section_summaries=patched)
        elif title == 'Far from the Madding Crowd':
            for chap_title, sect_summ in sect_summs_old:
                if chap_title == '':
                    chap_title = 'Chapter 38-45'
                elif not chap_title.startswith('Chapter'):
                    continue
                elif chap_title == 'Chapters 54-Conclusion':
                    chap_title = 'Chapter 54-57'
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'Frankenstein':
            sect_summs_new = list(sect_summs_old)
            sect_summs_new[-1] = ('Final Letters', sect_summs_new[-1][1])
        elif title == 'The American':
            sect_summs_new = [
                ('Chapter {}'.format(chap_title)
                 if not chap_title.startswith('Ch') else chap_title, sect_summ)
                for chap_title, sect_summ in sect_summs_old
            ]
        elif title == 'Great Expectations':
            chap_num = 1
            for chap_title, sect_summ in sect_summs_old:
                if not chap_title.startswith(('Part', 'Chapter')):
                    # Stray row: fold its title and lines into the previous
                    # chapter's summary.
                    addtl_text = [chap_title] + sect_summ
                    sect_summs_new[-1] = (sect_summs_new[-1][0],
                                          sect_summs_new[-1][1] + addtl_text)
                else:
                    chap_title = 'Chapter {}'.format(chap_num)
                    sect_summs_new.append((chap_title, sect_summ))
                    chap_num += 1
        elif title == 'The Hound of the Baskervilles':
            for i, (chap_title, sect_summ) in enumerate(sect_summs_old):
                chap_title = chap_title.split(':', 1)[0]
                if not chap_title.startswith('Chapter'):
                    continue
                if not sect_summ:
                    # The summary was scraped into the following row.
                    assert sect_summs_old[i + 1][0].startswith(
                        ('This chapter', 'In this final'))
                    sect_summ = [sect_summs_old[i + 1][0]
                                 ] + sect_summs_old[i + 1][1]
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'Howards End':
            for chap_title, sect_summ in sect_summs_old:
                if not chap_title and len(sect_summ) == 2:
                    continue
                elif not chap_title:
                    chap_title = 'Chapter 16-19'
                sect_summs_new.append((chap_title, sect_summ))
        elif title == "Lady Audley's Secret":
            for chap_title, sect_summ in sect_summs_old:
                if chap_title == "Volume 3, Chapter 1":
                    chap_title = 'Chapter 1'
                sect_summs_new.append((chap_title, sect_summ))
        elif title == "Mary Barton":
            for chap_title, sect_summ in sect_summs_old:
                if not chap_title:
                    chap_title = 'Chapters XVI-XX'
                elif chap_title == 'Chapters XXI-XV':
                    chap_title = 'Chapters XXI-XXV'
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'Moby Dick':
            for chap_title, sect_summ in sect_summs_old:
                if not chap_title.startswith('Chapter'):
                    continue
                chap_title = chap_title.split(':', 1)[0].replace('One Hundred and ', 'One-Hundred-', 1) \
                                       .replace('One Hundred', 'One-Hundred', 1)
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'Northanger Abbey':
            sect_summs_new = [(fix_north(x[0]), x[1]) for x in sect_summs_old]
        elif title in {"The Vicar of Wakefield", 'Uncle Tom\'s Cabin'}:
            for i, (chap_title, sect_summ) in enumerate(sect_summs_old):
                if not chap_title.startswith('Chapter'):
                    continue
                if not sect_summ:
                    sect_summ = [sect_summs_old[i + 1][0]
                                 ] + sect_summs_old[i + 1][1]
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'Persuasion':
            for chap_title, sect_summ in sect_summs_old:
                if not chap_title:
                    chap_title = 'Chapter 22-24'
                elif chap_title.startswith('The final chapter'):
                    continue
                sect_summs_new.append((chap_title, sect_summ))
        elif title in {"The Scarlet Letter", "The Blithedale Romance"}:
            sect_summs_new = [
                x for x in sect_summs_old if x[0].startswith('Chapter')
            ]
        elif title == 'Siddhartha':
            # Chapter titles appear as lines inside the summaries; split on
            # them.
            # NOTE(review): chap_title is not initialised in this branch, so
            # lines that precede the first recognised title rely on whatever
            # an earlier book left in that name (or raise NameError) - confirm
            # the data always starts with a title.
            for _, lines in sect_summs_old:
                sect_summ_curr = []
                for line in lines:
                    if line in SIDDHARTHA_TITLES:
                        if sect_summ_curr:
                            sect_summs_new.append((chap_title, sect_summ_curr))
                            sect_summ_curr = []
                        chap_title = line.replace("The Brahmins Son",
                                                  "The Brahmin's Son").replace(
                                                      'Goatama', 'Gotama')
                    else:
                        sect_summ_curr.append(line)
                if sect_summ_curr:
                    sect_summs_new.append((chap_title, sect_summ_curr))
                    sect_summ_curr = []
        elif title == 'A Study in Scarlet':
            sect_summs_new = [(x[0].split(':', 1)[0].strip().replace(',', ':',
                                                                     1), x[1])
                              for x in sect_summs_old]
        elif title == 'Treasure Island':
            for i, (chap_title, sect_summ) in enumerate(sect_summs_old):
                if not sect_summ:
                    sect_summ = sect_summs_old[i + 1][1]
                elif not chap_title:
                    continue
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'A Study in Scarlet':
            # NOTE(review): unreachable - this title is already handled by an
            # earlier branch. Presumably meant for a different book; left in
            # place until the intended title is confirmed.
            sect_summs_new = [(x[0].replace('of ', '', 1), x[1])
                              for x in sect_summs_old]
        elif title == "The Valley of Fear":
            sect_summs_new = [x for x in sect_summs_old if x[1]]
        elif title == "Villette":
            for chap_title, sect_summ in sect_summs_old:
                prev_chap = sect_summs_new[-1][0] if sect_summs_new else ''
                if prev_chap.endswith('XIII'):
                    sect_summ = [chap_title] + sect_summ
                    chap_title = "Chapter 14-16"
                elif prev_chap.endswith('XXV'):
                    sect_summ = [chap_title] + sect_summ
                    chap_title = "Chapter 26-28"
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'What Maisie Knew':
            patched = list(sect_summs_old)
            patched[0] = ('Introduction', patched[0][1])
            book_summ_new = book_summ._replace(section_summaries=patched)
        elif title == 'Winesburg, Ohio':
            for chap_title, sect_summ in sect_summs_old:
                chap_title = chap_title.replace('"',
                                                '').replace("'", '').replace(
                                                    ' Summary ', ' ')
                if chap_title.startswith("Surrender"):
                    chap_title = "Godliness Part 3"
                elif chap_title.startswith("Terror"):
                    chap_title = "Godliness Part 4"
                elif chap_title.startswith("Prologue"):
                    chap_title = "The Book of the Grotesque"
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'Wuthering Heights':
            for i, (chap_title, sect_summ) in enumerate(sect_summs_old):
                if not sect_summ:
                    next_chap, next_summ = sect_summs_old[i + 1]
                    # Chapter 25 section has a typo https://www.gradesaver.com/wuthering-heights/study-guide/summary-chapters-21-25
                    if not next_summ and not chap_title == 'Chapter 25':
                        print("need to update Wuthering Heights")
                    sect_summ = next_summ
                elif not chap_title:
                    continue
                sect_summs_new.append((chap_title, sect_summ))
        elif title == "The Yellow Wallpaper":
            # Flatten every section's lines into a single 'book' section.
            all_sects = [x[1] for x in sect_summs_old]
            all_sects = [sublist for l in all_sects for sublist in l]
            sect_summs_new = [('book', all_sects)]
        elif title in {
                "The Mill on the Floss", 'My Antonia', "A Tale of Two Cities",
                'War and Peace'
        }:  # multibook
            book_count = 0
            seen = set()
            for i, (chap_title, sect_summ) in enumerate(sect_summs_old):
                if not chap_title.startswith("Chapter") or chap_title.endswith(
                        '.'):
                    continue
                chap_title, book_count = fix_multibook(chap_title, book_count)
                if not sect_summ:
                    sect_summ = [sect_summs_old[i + 1][0]
                                 ] + sect_summs_old[i + 1][1]
                if chap_title == "Book 2: Chapter 4 -":
                    chap_title = "Book 2: Chapter 4"
                if title == 'A Tale of Two Cities':
                    # Drop repeated chapters and any third ':'-separated part.
                    if chap_title in seen:
                        continue
                    seen.add(chap_title)
                    arr = chap_title.split(':')
                    if len(arr) == 3:
                        chap_title = ':'.join(arr[0:2])
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'Hard Times':
            # NOTE(review): book_num is only bound once a "Book ..." row has
            # been seen; a chapter row before that raises NameError.
            for chap_title, sect_summ in sect_summs_old:
                if chap_title.startswith('Book the First'):
                    book_num = 1
                elif chap_title == 'Book II':
                    book_num = 2
                elif chap_title.startswith('Book III'):
                    book_num = 3
                else:
                    chap_title = 'Book {}: {}'.format(
                        book_num,
                        chap_title.split(':', 1)[0])
                    sect_summs_new.append((chap_title, sect_summ))
        elif title in {
                "Gulliver's Travels", "Jude the Obscure", "Madame Bovary",
                'Crime and Punishment'
        }:  # multipart
            book_count = 0
            # For these two titles an empty summary is taken from the next row.
            borrow_next = title in {"Gulliver's Travels",
                                    'Crime and Punishment'}
            for i, (chap_title, sect_summ) in enumerate(sect_summs_old):
                if borrow_next and not sect_summ:
                    sect_summ = [sect_summs_old[i + 1][0]
                                 ] + sect_summs_old[i + 1][1]
                if not chap_title.startswith("Chapter"):
                    continue
                elif title == "Madame Bovary" and "-" in chap_title:
                    continue
                chap_title, book_count = fix_multipart(chap_title, book_count)
                sect_summs_new.append((chap_title, sect_summ))
        elif title in {'Pride and Prejudice', 'Jane Eyre'}:
            # Sections are renumbered sequentially from 1.
            for i, (chap_title, sect_summ) in enumerate(sect_summs_old, 1):
                chap_title = 'Chapter {}'.format(i)
                if title == 'Pride and Prejudice' and chap_title == 'Chapter 60':
                    # The 60th row is split: first line is chapter 60, the
                    # remaining lines are chapter 61.
                    sect_summs_new.append((chap_title, sect_summ[0:1]))
                    sect_summs_new.append(('Chapter 61', sect_summ[1:]))
                    continue
                sect_summs_new.append((chap_title, sect_summ))
        elif title == "The Phantom of the Opera":
            patched = list(sect_summs_old)
            patched[-1] = ('Chapter 21-Epilogue', patched[-1][1])
            book_summ_new = book_summ._replace(section_summaries=patched)
        elif title == "The Picture of Dorian Gray":
            sect_summs_new = list(sect_summs_old)
            sect_summs_new[0] = ('Preface-Chapter 2', sect_summs_new[0][1])
        elif title == "Tess of the D'Urbervilles":
            sect_summs_new = [(x[0], x[1]) for x in sect_summs_old
                              if x[0].startswith('Chapter')]
        elif title == 'Washington Square':
            sect_summs_new = [(x[0].replace(' Summaries', '', 1), x[1])
                              for x in sect_summs_old
                              if x[0].startswith('Chapter')]

        elif title == "The Wind in the Willows":
            # The last row's title is really the tail of the previous summary.
            sect_summs_new = list(sect_summs_old)
            sect_summs_new[-2] = (sect_summs_new[-2][0],
                                  sect_summs_new[-2][1] +
                                  [sect_summs_new[-1][0]])
            sect_summs_new.pop(-1)
        elif title == "The Brothers Karamazov":
            sect_summs_new = list(sect_summs_old)
            sect_summs_new[-1] = ('Book 13', sect_summs_old[-1][1])
        elif title == "The Metamorphosis":
            sect_summs_new = [(x[0].replace('Chapter', 'Part', 1), x[1])
                              for x in sect_summs_old]
        elif title == 'The Secret Garden':
            assert sect_summs_old[1][0] == 'Chapters 5-19'
            sect_summs_new = list(sect_summs_old)
            sect_summs_new[1] = ('Chapters 5-9', sect_summs_old[1][1])

        else:
            sect_summs_new = sect_summs_old

        if sect_summs_new:
            sect_summs_new = [(standardize_sect_title(x[0]), x[1])
                              for x in sect_summs_new]
            title_new = standardize_title(title)
            if title_new != title:
                print('renamed {} -> {}'.format(title, title_new))
                title = title_new
            book_summ_new = book_summ._replace(
                section_summaries=sect_summs_new, title=title_new)

        book_summaries_new.append(book_summ_new)
        if start:  # for debugging
            # if title == 'Frankenstein':
            print(title, idx)
            assert title == book_summaries_new[-1].title
            for i, x in enumerate(book_summaries_new[-1].section_summaries, 1):
                print(x[0] or x[1][0][0:100] + ' index ' + str(i))
            input()

    return book_summaries_new

Exemple #3

0

Afficher le fichier

Fichier : cliffsnotes_scrape.py Projet : manestay/novel-chapter-dataset

def manual_fix_individual(book_summaries):
    """Apply hand-written, per-title clean-ups to scraped section summaries.

    Note we do not manually fix the plays, since we do not use them in the
    literature dataset.

    Every book is expected to expose ``title``, ``section_summaries`` (a list
    of ``(section_title, summary)`` pairs) and ``_replace``. Books whose title
    is in ``NON_NOVEL_TITLES`` are dropped. For the remaining books the branch
    matching the title rebuilds the section list; a non-empty rebuilt list is
    then passed through ``standardize_sect_title`` and the title through
    ``standardize_title``. "War and Peace" is handled separately: its section
    titles go through ``standardize_sect_title(title, False)`` and its book
    title is left as scraped.

    The input records and their section lists are not modified in place.

    Args:
        book_summaries: iterable of book summary records.

    Returns:
        A new list with one cleaned record per retained book, in input order.
        A book whose rebuilt section list is empty is returned unchanged.

    Raises:
        AssertionError: if the scraped layout that a title-specific fix relies
            on is not what the fix expects.
    """
    start = False  # to debug
    book_summaries_new = []
    for idx, book_summ in enumerate(book_summaries):
        sect_summs_new = []
        sect_summs_old = book_summ.section_summaries
        title = book_summ.title
        # Per-book default. Without it, a book that ends up with no sections
        # would re-append the previous book's record (or raise NameError on
        # the very first book).
        book_summ_new = book_summ
        # if idx == 79:
        #     start = True
        if title in NON_NOVEL_TITLES:
            continue
        if title in {
                'Adam Bede', 'The Brothers Karamazov', 'The Age of Innocence',
                'Siddhartha', 'Silas Marner', 'The Three Musketeers'
        }:
            # Keep only the text after the first colon of each title.
            sect_summs_new = [(x[0].split(':', 1)[1].strip(), x[1])
                              for x in sect_summs_old]
            if title == 'The Brothers Karamazov':
                assert sect_summs_new[-1][0] == 'Epilogue'
                sect_summs_new[-1] = ('Book 13', sect_summs_new[-1][1])
            if title == 'Siddhartha':
                renames = {
                    'Sansara': 'Samsara',
                    'With the Childlike People': 'Amongst the People',
                }
                sect_summs_new = [
                    (renames.get(chap_title, chap_title), chap_summ)
                    for chap_title, chap_summ in sect_summs_new
                ]
        elif title in {"Tess of the d'Urbervilles"}:
            sect_summs_new = [(x[0].rsplit(':', 1)[1].strip(), x[1])
                              for x in sect_summs_old]
        elif title == 'White Fang':
            sect_summs_new = [(x[0].split('(', 1)[0].strip(), x[1])
                              for x in sect_summs_old]
        elif title == 'The Adventures of Huckleberry Finn':
            # Copy first so the caller's list is left untouched.
            sect_summs_new = list(sect_summs_old)
            sect_summs_new[-1] = ('Chapter 43', sect_summs_new[-1][1])
            assert sect_summs_new[0][0] == 'Notice; Explanatory'
            sect_summs_new = sect_summs_new[1:]
        elif title == "A Connecticut Yankee in King Arthur's Court":
            sect_summs_new = list(sect_summs_old)
            sect_summs_new[-1] = ('Chapter 39-45', sect_summs_new[-1][1])
        elif title == 'Jane Eyre':
            sect_summs_new = list(sect_summs_old)
            sect_summs_new[-1] = ('Chapter 38', sect_summs_new[-1][1])
        elif title in {'The Mill on the Floss', 'My Ántonia'}:
            for sect_title, summ in sect_summs_old:
                if sect_title.startswith('Introduction'):
                    pass
                elif sect_title.endswith('Conclusion'):
                    sect_title = 'Book 7: Conclusion'
                else:
                    # Keep the first two and the last two words of the title.
                    arr = sect_title.split()
                    sect_title = '{} {} {} {}'.format(arr[0], arr[1], arr[-2],
                                                      arr[-1])
                sect_summs_new.append((sect_title, summ))
        elif title == "The Turn of the Screw":
            sect_summs_new = [(x[0].replace('Section',
                                            'Chapter').replace('"', ''), x[1])
                              for x in sect_summs_old]
        elif title == 'The Way of All Flesh':
            for sect_title, summ in sect_summs_old:
                if '(' in sect_title:
                    # Take the first token inside the parentheses as the
                    # chapter number(s).
                    chapter_nums = sect_title.split('(', 1)[1].split(
                        ' ', 1)[0].replace(')', '')
                    sect_title = 'Chapter {}'.format(chapter_nums)
                sect_title = sect_title.replace('87', '86', 1)  # fix typo
                sect_summs_new.append((sect_title, summ))
        elif title == 'The Secret Sharer':
            sect_summs_new = [(x[0].replace('Part', 'Chapter'), x[1])
                              for x in sect_summs_old]
        elif title == 'Winesburg, Ohio':
            sect_summs_new = [(x[0].replace('"', ''), x[1])
                              for x in sect_summs_old]
        elif title == 'Treasure Island':
            # Scraped titles are replaced positionally by fixed chapter
            # ranges; zip stops at the shorter of the two sequences.
            treasure_chapters = [
                "1-6", "7-12", "13-15", "16-21", "22-27", "28-34"
            ]
            sect_summs_new = [
                ('Chapter ' + chap, old_sect[1])
                for chap, old_sect in zip(treasure_chapters, sect_summs_old)
            ]
        elif title == 'Emma':
            for sect_title, summ in sect_summs_old:
                # Roman chapter numbers are shifted by a per-volume offset:
                # 0 for Volume 1, 18 for Volume 2, 36 otherwise.
                if 'Volume 1' in sect_title:
                    offset = 0
                elif 'Volume 2' in sect_title:
                    offset = 18
                else:
                    offset = 36
                last = sect_title.rsplit(' ', 1)[-1]

                if '-' in last:
                    first, last = [
                        roman_to_int(x) + offset for x in last.split('-', 1)
                    ]
                    sect_title = 'Chapter {}-{}'.format(first, last)
                else:
                    sect_title = 'Chapter {}'.format(
                        roman_to_int(last) + offset)
                sect_summs_new.append((sect_title, summ))
        elif title == "War and Peace":
            # sect_summs_new stays empty, so the generic standardization
            # below is skipped for this book.
            ssn = [(standardize_sect_title(x[0], False), x[1])
                   for x in sect_summs_old]
            book_summ_new = book_summ._replace(section_summaries=ssn)
        else:
            sect_summs_new = sect_summs_old

        if sect_summs_new:
            sect_summs_new = [(standardize_sect_title(x[0]), x[1])
                              for x in sect_summs_new]
            title_new = standardize_title(title)
            if title_new != title:
                print('renamed {} -> {}'.format(title, title_new))
                title = title_new
            book_summ_new = book_summ._replace(
                section_summaries=sect_summs_new, title=title_new)

        book_summaries_new.append(book_summ_new)
        if start:  # for debugging
            print(title, idx)
            assert title == book_summaries_new[-1].title
            for i, x in enumerate(book_summaries_new[-1].section_summaries, 1):
                print(x[0] or x[1][0][0:100] + ' index ' + str(i))
            input()

    return book_summaries_new

Exemple #4

0

Afficher le fichier

def manual_fix_individual(book_summaries):
    start = False
    book_summaries_new = []
    for idx, book_summ in enumerate(book_summaries):
        sect_summs_old = book_summ.section_summaries
        sect_summs_new = []
        title = book_summ.title
        # if idx == 73:
        #     start = True
        if title in NON_NOVEL_TITLES:
            continue
        elif title == 'Homecoming':  # not the same as one in Gutenberg
            continue
        elif title in set(['A Christmas Carol']):
            sect_summs_new = [(x[0].split(':', 1)[0], x[1])
                              for x in sect_summs_old]
            # sect_summs_new[0] = ('Stave One', sect_summs_new[0][1])
        elif title in set(['Bleak House']):
            sect_summs_new = [(x[0].split(',', 1)[0], x[1])
                              for x in sect_summs_old]
        elif title in set(['David Copperfield']):
            sect_summs_new = [(x[0].split('.', 1)[0], x[1])
                              for x in sect_summs_old]
        elif title in set(['Moll Flanders']):
            sect_summs_new = [(x[0].split(' (', 1)[0], x[1])
                              for x in sect_summs_old]
        elif title in set([
                'The Idiot', "The Good Soldier", "Anna Karenina",
                'The Return of the Native'
        ]):
            sect_summs_new = [(x[0].replace(',', ':'), x[1])
                              for x in sect_summs_old]
        elif title in set(
            ['Crime and Punishment', 'Don Quixote',
             'Madame Bovary']):  # multipart
            book_count = 0
            for i, (chap_title, sect_summ) in enumerate(sect_summs_old):
                if not chap_title.startswith("Chapter"):
                    continue
                chap_title, book_count = fix_multipart(chap_title, book_count)
                sect_summs_new.append((chap_title, sect_summ))
        elif title in set([
                'The Brothers Karamazov', 'Hard Times',
                'The Mill on the Floss', 'My Ántonia', 'Northanger Abbey',
                'A Tale of Two Cities', "The House of Mirth"
        ]):  # multibook
            book_count = 0
            for i, (chap_title, sect_summ) in enumerate(sect_summs_old):
                if not chap_title.startswith("Chapter"):
                    continue
                chap_title, book_count = fix_multibook(chap_title, book_count)
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'Ethan Frome':
            for i, (chap_title, sect_summ) in enumerate(sect_summs_old):
                arr = chap_title.split(' ', 1)
                if len(arr) == 2:
                    chap_title = '{} {}'.format(arr[0], arr[1].upper())
                elif chap_title == 'Introduction':
                    chap_title = 'Prologue'
                elif chap_title == 'Conclusion':
                    chap_title = 'Epilogue'
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'O Pioneers!':
            sect_summs_new = [(x[0].replace(',', ':'), x[1])
                              for x in sect_summs_old
                              if x[0].startswith('Part')]
        elif title == 'The Age of Innocence':
            sect_summs_new = [(x[0].replace('Book One ', '',
                                            1).replace('Book Two ', ''), x[1])
                              for x in sect_summs_old]
        elif title == 'The Three Musketeers':
            offset = 0
            for i, (chap_title, sect_summ) in enumerate(sect_summs_old):
                if not chap_title.startswith(('Chapter', 'Part')):
                    continue
                if chap_title.startswith('Part II'):
                    offset = 37
                if offset:
                    chapter_range = re.search(('\d+-.+'), chap_title)
                    beg, end = chapter_range[0].split('-')
                    beg = int(beg) + offset
                    end = end if end == 'Epilogue' else int(end) + offset
                    chap_title = 'Chapter {}-{}'.format(beg, end)
                sect_summs_new.append((chap_title, sect_summ))
        elif title == 'Frankenstein':
            for sect_title, sect_summ in sect_summs_old:
                sect_title = sect_title.replace('Letters', 'Letter', 1)
                if sect_title.startswith("Walton"):
                    sect_title = "Final Letters"
                sect_summs_new.append((sect_title, sect_summ))
        elif title in set([
                'Kidnapped', 'The House of the Seven Gables',
                "Northanger Abbey", "The Scarlet Letter", 'Typee', "Anthem",
                "Ivanhoe", 'The Adventures of Huckleberry Finn',
                'Maggie: A Girl of the Streets'
        ]):
            sect_summs_new = [
                x for x in sect_summs_old
                if x[0].startswith(('Chapter', 'Preface'))
            ]
        elif title == 'Siddhartha':
            book_summ.section_summaries[0] = ("The Brahmin's Son",
                                              book_summ[0][1])
            book_summ_new = book_summ
        elif title == 'Heart of Darkness':
            part = 'Part 1'
            curr_summ = []
            for sect_title, sect_summ in sect_summs_old:
                if sect_title != part:
                    sect_summs_new.append((part, curr_summ))
                    part = sect_title
                    curr_summ = []
                curr_summ.extend(sect_summ)
            if curr_summ:
                sect_summs_new.append((part, curr_summ))
        elif title == 'Jane Eyre':
            sect_summs_new = [(x[0].replace('Summary', 'Chapter 26'), x[1])
                              for x in sect_summs_old]
        elif title == 'Moby-Dick':
            for sect_title, sect_summ in sect_summs_old:
                if sect_title in set(['Etymology', 'Extracts']):
                    continue
                sect_summs_new.append((sect_title, sect_summ))
        elif title == 'This Side of Paradise':
            sect_summs_new = [(x[0].split(':', 1)[0].replace(',', ':'), x[1])
                              for x in sect_summs_old
                              if x[0].startswith('Book')]
        elif title == 'A Portrait of the Artist as a Young Man':
            curr_chap = 'Chapter 1'
            curr_summ = []
            for sect_title, sect_summ in sect_summs_old:
                chap_title = sect_title.split(',', 1)[0]
                if chap_title != curr_chap:
                    sect_summs_new.append((curr_chap, curr_summ))
                    curr_chap = chap_title
                    curr_summ = []
                curr_summ.extend(sect_summ)
            if curr_summ and curr_chap.startswith('Chapter'):
                sect_summs_new.append((curr_chap, curr_summ))
        elif title == "Dubliners":
            sect_summs_new = [(x[0].replace('“', '').replace('”', ''), x[1])
                              for x in sect_summs_old]
        elif title == "Jude the Obscure":
            sect_summs_new = [(x[0].split(':', 1)[0], x[1])
                              for x in sect_summs_old
                              if x[0].startswith('Part')]
        elif title == "Lord Jim":
            sect_summs_new = [(x[0].replace('and', '-'), x[1])
                              for x in sect_summs_old]
        elif title == "Middlemarch":
            sect_summs_new = [(x[0].split(': ', 1)[1], x[1])
                              for x in sect_summs_old]
        elif title in set(["Sense and Sensibility", "Dracula", "Don Quixote"]):
            sect_summs_new = [(x[0], x[1]) for x in sect_summs_old
                              if x[0].startswith('Chapter')]
        elif title == "This Side of Paradise":
            sect_summs_new = [(x[0].split(':', 1)[0].replace(',', ''), x[1])
                              for x in sect_summs_old
                              if x[0].startswith('Chapter')]
            # start = True
        elif title == 'The Time Machine':
            sect_summs_new = [(x[0].replace('and', '-'), x[1])
                              for x in sect_summs_old]
            sect_summs_new[-1] = ('Chapter 11 - Epilogue',
                                  sect_summs_new[-1][1])
        elif title == 'Ulysses':
            sect_summs_new = [(x[0].replace('Episode',
                                            'Chapter').split(':', 1)[0], x[1])
                              for x in sect_summs_old]
        elif title == 'Walden':
            sect_summs_new = [('Chapter {}'.format(i), x[1])
                              for i, x in enumerate(sect_summs_old, 1)]
        elif title == 'Winesburg, Ohio':
            for sect_title, sect_summ in sect_summs_old:
                sect_title = sect_title.replace('"', '')
                if sect_title == "Godliness, Parts I-II":
                    sect_title = "Godliness Part I, Godliness Part II"
                elif sect_title.startswith("Godliness, Parts III"):
                    sect_title = "Godliness Part III, Godliness Part IV, A Man of Ideas"
                elif sect_title.startswith("Analytical"):
                    continue
                sect_summs_new.append((sect_title, sect_summ))
        elif title == "White Fang":
            sect_summs_new = [(x[0].replace(',', ':').replace('and', '-',
                                                              1), x[1])
                              for x in sect_summs_old]
        elif title == 'The Picture of Dorian Gray':
            sect_summs_new = sect_summs_old
            sect_summs_new[0] = ('Preface', sect_summs_new[0][1])
        elif title == "War and Peace":
            ssn = [(standardize_sect_title(x[0].replace(',', ':'),
                                           False), x[1])
                   for x in sect_summs_old]
            book_summ_new = book_summ._replace(section_summaries=ssn)
        else:
            sect_summs_new = sect_summs_old

        if sect_summs_new:
            sect_summs_new = [(standardize_sect_title(x[0]), x[1])
                              for x in sect_summs_new]
            title_new = standardize_title(title)
            if title_new != title:
                print('renamed {} -> {}'.format(title, title_new))
                title = title_new
            book_summ_new = book_summ._replace(
                section_summaries=sect_summs_new, title=title_new)

        book_summaries_new.append(book_summ_new)
        if start:  # for debugging
            print(title, idx)
            assert title == book_summaries_new[-1].title
            for i, x in enumerate(book_summaries_new[-1].section_summaries, 1):
                print(x[0] or x[1][0][0:100] + ' index ' + str(i))
            input()

    return book_summaries_new

Exemple #5

0

Afficher le fichier

def manual_fix_individual(book_summaries):
    """Apply hand-written, per-book (and per-source) fixes to section titles.

    Each record is dispatched on its ``title`` and ``source`` to a branch that
    renames, filters or merges its ``(section title, summary)`` pairs. When a
    branch produces sections, their titles are passed through
    ``standardize_sect_title`` and the book title through
    ``standardize_title``; a rename is reported with ``print``.

    Args:
        book_summaries: iterable of records exposing ``title``, ``source``,
            ``section_summaries`` (a list of ``(title, summary)`` pairs) and
            ``_replace`` (namedtuple-style).

    Returns:
        A new list of fixed records. Skipped are: titles in
        ``NON_NOVEL_TITLES``, repeated ``(title, source)`` pairs, the
        explicitly excluded title/source combinations below, and books for
        which no section survives the fix.

    Note:
        Some branches (Madame Bovary, The Idiot, The Hound of the
        Baskervilles, Ivanhoe) modify the input record's section list in
        place.
    """
    start = False  # to debug
    book_summaries_new = []
    seen = set()
    for idx, book_summ in enumerate(book_summaries):
        sect_summs_new = []
        # Reset per book: without this, a branch that yields no sections would
        # re-append the previous book's result (or hit an unbound local on the
        # first book).
        book_summ_new = None
        sect_summs_old = book_summ.section_summaries
        title = book_summ.title
        source = book_summ.source
        if (title, source) in seen:
            continue
        # if idx == 0:
        #     start = True
        if title in NON_NOVEL_TITLES:
            continue
        if title in set(['Kidnapped', 'Treasure Island', 'The Call of the Wild',
                         'Dr. Jekyll and Mr. Hyde', 'The House of the Seven Gables', 'The Prince and the Pauper',
                         'Huckleberry Finn', 'Tom Sawyer']) and source == 'monkeynotes' or \
                title == "The Mayor of Casterbridge" and source == 'barrons':
            # keep only the part of the title before the first colon
            sect_summs_new = [(x[0].split(':', 1)[0].strip(), x[1])
                              for x in sect_summs_old]
        elif title in set(["Oliver Twist", 'Candide', "Alice's Adventures in Wonderland"]) and source == 'monkeynotes' or \
                title == 'The Scarlet Letter' and source == 'barrons':
            # keep only the part of the title before the first dash
            sect_summs_new = [(x[0].split('-', 1)[0].strip(), x[1])
                              for x in sect_summs_old]
        elif title in set(["Uncle Tom's Cabin"]) and source == 'barrons':
            sect_summs_new = [(x[0].split('.', 1)[0], x[1])
                              for x in sect_summs_old]
        elif title in set(["Emma", "Pride and Prejudice"]) and source == 'monkeynotes' or \
                title in set(["Huckleberry Finn", "Great Expectations"]) and source == 'barrons':
            sect_summs_new = [(x[0].replace('&', '-', 1).strip(), x[1])
                              for x in sect_summs_old]
        # skip below, as inconsistent sections vs Gutenberg book
        elif title in set(['War and Peace', 'The Time Machine'
                           ]) and source == 'monkeynotes':
            continue
        elif title in set([
                'Anna Karenina', 'Don Quixote', "Crime and Punishment",
                'Madame Bovary', 'White Fang', 'Jude the Obscure',
                "Gulliver's Travels", "The Idiot", 'Under the Greenwood Tree'
        ]):  # multipart
            book_count = 0
            # NOTE: the two fix-ups below overwrite entries of the input list
            # in place.
            if title == 'Madame Bovary' and source == 'barrons':
                assert sect_summs_old[26][0] == 'Chapter 4'
                assert sect_summs_old[31][0] == 'Chapter 10'
                sect_summs_old[26] = ('Chapter 3-4', sect_summs_old[26][1])
                sect_summs_old[31] = ('Chapter 9-10', sect_summs_old[31][1])
            if title == 'The Idiot':
                sect_summs_old[1] = ('Chapter 2-3', sect_summs_old[1][1])
            for chap_title, sect_summ in sect_summs_old:
                if title == 'Crime and Punishment':
                    chap_title = chap_title.replace('PART VI, ', '', 1)
                    if chap_title.startswith('Part'):
                        chap_title = 'Chapter {}'.format(
                            chap_title.split(' ', 1)[-1])
                chap_title = re.sub(RE_CHAPTER_ONLY, 'Chapter', chap_title)
                if not chap_title.startswith("Chapter"):
                    continue
                chap_title, book_count = fix_multipart(chap_title, book_count)
                sect_summs_new.append((chap_title, sect_summ))
        elif title in set([
                'My Antonia', "A Tale of Two Cities", "The War of the Worlds",
                'The House of Mirth', 'Hard Times'
        ]):  # multibook
            book_count = 0
            for chap_title, sect_summ in sect_summs_old:
                if chap_title.endswith('CHAPTER I'):  # for barrons
                    chap_title = 'CHAPTER I'
                chap_title = re.sub(RE_CHAPTER_ONLY, 'Chapter', chap_title)
                if not chap_title.startswith("Chapter"):
                    continue
                if title == 'A Tale of Two Cities':
                    chap_title = chap_title.split(':')[0]
                chap_title, book_count = fix_multibook(chap_title, book_count)
                sect_summs_new.append((chap_title, sect_summ))
        elif title in set(['The House of the Seven Gables', 'Walden'
                           ]) and source == 'barrons':
            # titles starting with 'P' are kept as-is; the others become
            # 'Chapter <text before the first period>'
            sect_summs_new = [
                ('Chapter {}'.format(x[0].split('.', 1)[0])
                 if not x[0].startswith('P') else x[0], x[1])
                for x in sect_summs_old
            ]
        elif title == 'Heart of Darkness':
            sect_summs_new = [(x[0].replace('Chapter', 'Part'), x[1])
                              for x in sect_summs_old]
        elif title == 'The Hound of the Baskervilles' and source == 'monkeynotes':
            # In-place edit of the input record; sect_summs_new stays empty so
            # the record is appended as-is, without title standardization.
            assert book_summ.section_summaries[1][0] == 'Chapter Summary'
            book_summ.section_summaries[1] = (
                'Chapter 2', book_summ.section_summaries[2][1])
            book_summ_new = book_summ
        elif title == 'Tess of the D\'Urbervilles' and source == 'barrons':
            sect_summs_new = [(x[0].replace('AND', '-').replace(
                ', 14, - ', ' - ').replace(', 27, - ',
                                           ' - ').replace(', 57, - ',
                                                          ' - '), x[1])
                              for x in sect_summs_old]
        elif title == 'The Prince':
            sect_summs_new = [(x[0].replace('AND', '-'), x[1])
                              for x in book_summ.section_summaries]
        elif title == 'Ivanhoe':
            # NOTE: drops the first section from the input list in place.
            sect_summs_old.pop(0)
            # Replacement titles, paired positionally with the remaining
            # summaries; zip stops at the shorter of the two sequences.
            chapters = [
                'Chapter 1', 'Chapter 2', 'Chapter 3', 'Chapter 4',
                'Chapter 5', 'Chapter 6', 'Chapter 7-9', 'Chapter 10',
                'Chapter 11', 'Chapter 12', 'Chapter 13-15', 'Chapter 16-17',
                'Chapter 18-19', 'Chapter 20-21', 'Chapter 22', 'Chapter 23',
                'Chapter 24', 'Chapter 25-27', 'Chapter 28', 'Chapter 29',
                'Chapter 30-31', 'Chapter 32', 'Chapter 33-34', 'Chapter 35',
                'Chapter 37-39', 'Chapter 40-42', 'Chapter 43', 'Chapter 44'
            ]
            sect_summs_new = [
                (chap, summ)
                for chap, summ in zip(chapters, [x[1] for x in sect_summs_old])
            ]
        elif title == 'Winesburg, Ohio':
            for sect_title, sect_summ in sect_summs_old:
                if sect_title == 'Story 13 -':
                    sect_title = 'The Strength of God'
                elif sect_title == 'PART I - SUMMARY':
                    sect_title = 'Godliness Part I'
                elif sect_title == 'PART II - SUMMARY':
                    sect_title = 'Godliness Part II'
                elif sect_title == 'PART III - Surrender':
                    sect_title = 'Godliness Part III'
                elif sect_title == 'PART IV - Terror':
                    sect_title = 'Godliness Part IV'
                else:
                    # keep the text after the first dash
                    sect_title = sect_title.split('-', 1)[-1].strip()
                sect_summs_new.append((sect_title, sect_summ))
        elif (title == 'Silas Marner' and source == 'barrons') or \
             (title == 'Looking Backward: 2000-1887' and source == 'monkeynotes'):
            sect_summs_new = [
                x for x in sect_summs_old if x[0].startswith('C')
            ]
        elif title == 'Turn of the Screw':
            sect_summs_new = [(x[0].replace('SECTION', 'Chapter'), x[1])
                              for x in sect_summs_old]
            # the first four entries are dropped; the remainder must begin
            # with the prologue
            sect_summs_new = sect_summs_new[4:]
            assert sect_summs_new[0][0] == 'PROLOGUE'
        elif title == 'The Metamorphosis' and source == 'monkeynotes':
            sect_summs_new = [(x[0].replace('Section', 'Part'), x[1])
                              for x in sect_summs_old]
        elif title == 'Sons and Lovers' and source == 'barrons':
            for sect_title, sect_summ in sect_summs_old:
                sect_title = sect_title.replace('PART TWO - ', '', 1)
                if not sect_title.startswith('CHAPTER'):
                    continue
                sect_summs_new.append((sect_title, sect_summ))
        elif title == 'Moby Dick' and source == 'monkeynotes':
            sect_summs_new = [x for x in sect_summs_old if not x[0] == 'Notes']
        elif title == 'Moby Dick' and source == 'barrons':
            chap_nums_re = r'\d+'
            for sect_title, sect_summ in sect_summs_old:
                chaps = re.findall(chap_nums_re, sect_title)
                if sect_title == 'Epilogue':
                    pass
                elif len(chaps) == 1:
                    sect_title = 'Chapter {}'.format(chaps[0])
                else:
                    # range from the first to the last number in the title
                    sect_title = 'Chapter {}-{}'.format(chaps[0], chaps[-1])
                sect_summs_new.append((sect_title, sect_summ))
        elif title == 'Siddhartha' and source == 'monkeynotes':
            for sect_title, sect_summ in sect_summs_old:
                sect_title = sect_title.split(':', 1)[1].strip()
                sect_summs_new.append((sect_title, sect_summ))
        elif title == 'Siddhartha' and source == 'barrons':
            assert len(sect_summs_old) == 1
            continue
        elif title == "Walden":
            sect_summs_new = [(x[0].replace('Chapter1', 'Chapter 1', 1), x[1])
                              for x in sect_summs_old]
        elif title == "Ethan Frome" and source == 'monkeynotes':
            for sect_title, sect_summ in sect_summs_old:
                if sect_title == 'Opening':
                    sect_title = 'Prologue'
                elif sect_title.startswith('Chapter 10'):
                    sect_title = 'Epilogue'
                sect_summs_new.append((sect_title, sect_summ))
        elif title == 'Typee' and source == 'monkeynotes':
            sect_summs_new = [(x[0].replace('Chapter 1Summary', 'Chapter 1',
                                            1), x[1]) for x in sect_summs_old]
        elif title == 'Typee' and source == 'barrons':
            sect_summs_new = [(x[0].replace('PREFACE AND CHAPTERS 1 TO 5',
                                            'Preface to Chapter 5'), x[1])
                              for x in sect_summs_old]
        elif title == "A Connecticut Yankee in King Arthur's Court":
            sect_summs_new = [(x[0].split('"', 1)[0].split(':',
                                                           1)[0].strip(), x[1])
                              for x in sect_summs_old
                              if x[0].startswith('CHAPTER')]
        elif title == 'The Count of Monte Cristo' and source == 'monkeynotes':
            sect_summs_new = [(x[0].split(':', 1)[0].split('-',
                                                           1)[0].strip(), x[1])
                              for x in sect_summs_old]
        elif title == 'The Secret Sharer' and source == 'monkeynotes':
            # merge sections 1-4 into chapter 1 and sections 5-8 into chapter 2
            chap1 = set(['Section 1', 'Section 2', 'Section 3', 'Section 4'])
            chap2 = set(['Section 5', 'Section 6', 'Section 7', 'Section 8'])
            chap1_text, chap2_text = [], []
            for sect_title, sect_summ in sect_summs_old:
                if sect_title in chap1:
                    chap1_text.extend(sect_summ)
                elif sect_title in chap2:
                    chap2_text.extend(sect_summ)
            sect_summs_new = [('Chapter 1', chap1_text),
                              ('Chapter 2', chap2_text)]
        else:
            sect_summs_new = sect_summs_old

        if sect_summs_new:
            sect_summs_new = [(standardize_sect_title(x[0]), x[1])
                              for x in sect_summs_new]
            title_new = standardize_title(title)
            if title_new != title:
                print('renamed {} -> {}'.format(title, title_new))
                title = title_new
            book_summ_new = book_summ._replace(
                section_summaries=sect_summs_new, title=title_new)
        elif book_summ_new is None:
            # No branch produced a record for this book: skip it rather than
            # appending a result left over from an earlier iteration.
            print('skipped {} ({}): no section summaries'.format(title, source))
            continue
        book_summaries_new.append(book_summ_new)
        # Record both the raw and the (possibly renamed) title, because the
        # duplicate check at the top of the loop sees the raw title.
        seen.add((book_summ.title, source))
        seen.add((title, source))
        if start:  # for debugging
            print(title, source, idx)
            assert title == book_summaries_new[-1].title
            for i, x in enumerate(book_summaries_new[-1].section_summaries, 1):
                print(x[0] or x[1][0][0:100] + ' index ' + str(i))
            input()
    return book_summaries_new

Exemple #6

0

Afficher le fichier

Fichier : novelguide_scrape.py Projet : manestay/novel-chapter-dataset

def manual_fix_individual(book_summaries):
    """
    Note we do not manually fix the plays, since we do not use them in the literature dataset.
    """
    def remove_duplicates(sect_summs_old):
        """Drop sections whose title was already seen, keeping first occurrences.

        Takes and returns a list of ``(title, summary)`` pairs; the relative
        order of the surviving pairs is unchanged.
        """
        kept = []
        known_titles = set()
        for entry_title, entry_summ in sect_summs_old:
            if entry_title not in known_titles:
                known_titles.add(entry_title)
                kept.append((entry_title, entry_summ))
        return kept

    def add_dash_numwords(sect_summs_old):
        """Hyphenate spelled-out numbers in section titles and collapse ranges.

        For each ``(title, summary)`` pair:
          * 'Sixty Two' is first expanded to 'Sixty Two and Sixty Three'
            (hard-coded special case);
          * a tens word (Twenty..Sixty) followed by a space is joined to the
            next word with a dash, e.g. 'Twenty One' -> 'Twenty-One';
          * commas are removed;
          * titles of more than two words are reduced to
            '<first> <second> - <last>'; shorter titles are kept, with
            whitespace normalized.

        Titles with fewer than two words (e.g. 'Epilogue' or an empty title)
        are passed through instead of raising ``IndexError``.

        Returns a new list of ``(title, summary)`` pairs.
        """
        sect_summs_new = []
        tens = ['Twenty', 'Thirty', 'Forty', 'Fifty', 'Sixty']
        for sect_title, sect_summ in sect_summs_old:
            sect_title = sect_title.replace('Sixty Two',
                                            'Sixty Two and Sixty Three')
            for t in tens:
                sect_title = sect_title.replace(t + ' ', t + '-')
            words = sect_title.replace(',', '').split()
            if len(words) > 2:
                # keep the leading label plus the first and last number words
                sect_title = '{} {} - {}'.format(words[0], words[1], words[-1])
            else:
                # zero, one or two words: nothing to collapse
                sect_title = ' '.join(words)

            sect_summs_new.append((sect_title, sect_summ))
        return sect_summs_new

    def greenwood_fix(sect_summs_old):
        """Normalize sections after removing duplicate titles.

        Sections whose title starts with 'C' keep their summary and get the
        title returned by ``get_first_last_chapter``. Any other section (the
        original comment calls it "Part Two") is split into sub-sections by
        scanning its summary lines: a line starting with 'Summary' supplies
        the next sub-section title (via ``get_first_last_chapter``) and turns
        collection on; a line starting with 'Analysis' emits the sub-section
        collected so far and turns collection off. Lines collected after the
        last 'Analysis' marker are not emitted.
        """
        fixed = []
        for entry_title, entry_lines in remove_duplicates(sect_summs_old):
            if entry_title.startswith('C'):
                fixed.append((get_first_last_chapter(entry_title), entry_lines))
                continue

            # Part Two
            pieces = []
            piece_title = ''
            piece_lines = []
            collecting = False
            for text in entry_lines:
                if text.startswith('Analysis'):
                    pieces.append((piece_title, piece_lines))
                    piece_lines = []
                    collecting = False
                elif text.startswith('Summary'):
                    piece_title = get_first_last_chapter(text)
                    collecting = True
                elif collecting:
                    piece_lines.append(text)
            fixed.extend(pieces)
        return fixed

    def ambass_fix(sect_summs_old):
        """Retitle the de-duplicated sections as 'Chapter <n>' from a fixed list.

        The chapter numbers restart at 1 several times in the list below.
        Sections and numbers are paired positionally, so pairing stops at the
        shorter of the two sequences.
        """
        chapter_numbers = [
            1, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1,
            2, 3, 1, 2, 3, 1, 2, 3, 4
        ]
        deduped = remove_duplicates(sect_summs_old)
        renamed = []
        for number, entry in zip(chapter_numbers, deduped):
            renamed.append(('Chapter {}'.format(number), entry[1]))
        return renamed

    def mirth_fix(sect_summs_old):
        """Clean up section titles, returning new ``(title, summary)`` pairs.

        Only the text after the first ' – ' separator is kept (the whole title
        when there is none); '6,7,8' becomes '6-8', any remaining comma
        becomes '-', and every 'and' becomes '-'.
        """
        def _clean(raw_title):
            tail = raw_title.split(' – ', 1)[-1]
            dashed = tail.replace('6,7,8', '6-8').replace(',', '-')
            return dashed.replace('and', '-')

        return [(_clean(raw_title), summ) for raw_title, summ in sect_summs_old]

    def bovary_fix(sect_summs_old):
        """Merge chapters 8 and 9 among the last five sections and renumber.

        Works on a deep copy, so the input is left untouched. All but the last
        five sections are kept as they are. Within the last five, a section
        whose title ends in '8' or '9' contributes its summary to a single
        merged 'Chapter 8', which is emitted when the title ending in '9' is
        reached; every other section is renumbered to its trailing number
        minus one.
        """
        sections = deepcopy(sect_summs_old)
        result = sections[0:-5]
        merged_summary = []
        for title_text, summary in sections[-5:]:
            ends_with_8 = title_text.endswith('8')
            if ends_with_8 or title_text.endswith('9'):
                merged_summary.extend(summary)
                if not ends_with_8:
                    result.append(('Chapter 8', merged_summary))
                continue
            number = int(title_text.rsplit(' ', 1)[-1])
            result.append(('Chapter {}'.format(number - 1), summary))
        return result

    start = False  # True to debug
    book_summaries_new = []
    for idx, book_summ in enumerate(book_summaries):
        sect_summs_new = []
        sect_summs_old = book_summ.section_summaries
        title = book_summ.title
        # if idx == 0:
        #     start = True
        if title in NON_NOVEL_TITLES:
            continue
        elif title == 'Don Quixote':  # not the same chapter numbering as Gutenberg
            continue
        elif title in set([
                'My Antonia', "The House of Mirth", 'The Ambassadors',
                'War of the Worlds', 'Hard Times'
        ]):  # multibook
            book_count = 0
            if title == 'The House of Mirth':
                sect_summs_old = mirth_fix(sect_summs_old)
            elif title == 'The Ambassadors':
                sect_summs_old = ambass_fix(sect_summs_old)
            elif title == 'War of the Worlds':
                sect_summs_old = [('Chapter {}'.format(x[0].split('.',
                                                                  1)[0]), x[1])
                                  for x in sect_summs_old if '.' in x[0]]
            elif title == 'My Antonia':
                assert sect_summs_old[3][0] == 'Part IV'
                assert sect_summs_old[4][0] == 'Part I'
                sect_summs_old = sect_summs_old[4:]
            for i, (chap_title, sect_summ) in enumerate(sect_summs_old):
                chap_title = chap_title.replace("Part", 'Chapter')
                if not chap_title.startswith("Chapter"):
                    continue
                chap_title, book_count = fix_multibook(chap_title, book_count)
                sect_summs_new.append((chap_title, sect_summ))
        elif title in set([
                "Madame Bovary", "Gulliver's Travels",
                "Under the Greenwood Tree"
        ]):  # multipart
            book_count = 0
            if title == "Under the Greenwood Tree":
                sect_summs_old = greenwood_fix(sect_summs_old)
            elif title == 'Madame Bovary':
                sect_summs_old = bovary_fix(sect_summs_old)
            for i, (chap_title, sect_summ) in enumerate(sect_summs_old):
                if not chap_title.startswith("Chapter"):
                    continue
                chap_title, book_count = fix_multipart(chap_title, book_count)
                sect_summs_new.append((chap_title, sect_summ))
        elif title in set(['Crime and Punishment']):
            sect_summs_new = [(chap.replace(',', ':', 1), summ)
                              for chap, summ in sect_summs_old]
        elif title in set(['Treasure Island', 'Kidnapped']):
            sect_summs_old = remove_duplicates(sect_summs_old)
            sect_summs_new = [
                x for x in sect_summs_old if x[0].startswith('Chapter')
            ]
        elif title in set([
                'Main Street', 'The Scarlet Letter', 'The Beast in the Jungle',
                "The Age of Innocence", "The Call of the Wild", 'Ivanhoe'
        ]):
            sect_summs_new = remove_duplicates(sect_summs_old)
        elif title == 'Great Expectations':
            sect_summs_new = [('Chapter {}'.format(i), summ)
                              for i, (_, summ) in enumerate(sect_summs_old, 1)]
        elif title == 'Babbitt':
            sect_summs_old = remove_duplicates(sect_summs_old)
            for sect_title, sect_summ in sect_summs_old:
                sect_title = sect_title.replace(',', '')
                nums = re.findall('\d+', sect_title)
                sect_title = 'Chapter {}-{}'.format(nums[0], nums[-1])
                sect_summs_new.append((sect_title, sect_summ))
        elif title == 'Adam Bede':
            for sect_title, sect_summ in sect_summs_old:
                if sect_summ[0].startswith('George Eliot, Adam Bede. Edited'):
                    continue
                sect_summs_new.append((sect_title, sect_summ))
            assert sect_summs_new[0][0] == sect_summs_new[1][0] == 'Chapter 1'
            sect_summs_new = sect_summs_new[1:]
        elif title == 'Dracula':
            assert sect_summs_old[0][0] == 'Summary'
            sect_summs_new = sect_summs_old[1:]
        elif title == 'Lord Jim':
            chapters = [
                '1 - 2', '3 - 5', '6 - 8', '9 - 11', '12 - 13', '14 - 16',
                '17 - 18', '19 - 21', '22 - 23', '24 - 26', '27 - 29',
                '30 - 32', '33 - 35', '36 - 37', '38 - 40', '41 - 43',
                '44 - 45'
            ]
            chapters = ['Chapter {}'.format(x) for x in chapters]
            sect_summs_new = [(chap, summ[1])
                              for chap, summ in zip(chapters, sect_summs_old)]
        elif title == 'Ethan Frome':
            sect_summs_new = sect_summs_old
            sect_summs_new[0] = ('Prologue', sect_summs_new[0][1])
            sect_summs_new[-1] = ('Epilogue', sect_summs_new[-1][1])
        elif title == "A Connecticut Yankee in King Arthur's Court":
            for sect_title, sect_summ in sect_summs_old:
                if not sect_title.startswith("Chapter"):
                    continue
                sect_summs_new.append((sect_title, sect_summ))
            sect_summs_new[-1] = ('Chapter 36-45', sect_summs_new[-1][1])
        elif title == 'The Adventures of Tom Sawyer':  # chapter 16 is split into 16 and 17
            sect_summs_old = deepcopy(sect_summs_old)
            sect_summs_new = sect_summs_old[0:15]
            chap_16 = []
            for chap_title, chap_summ in sect_summs_old[15:]:
                if chap_title.endswith('16'):
                    chap_16.extend(chap_summ)
                elif chap_title.endswith('17'):
                    chap_16.extend(chap_summ)
                    sect_summs_new.append(('Chapter 16', chap_16))
                else:
                    orig = int(chap_title.rsplit(' ', 1)[-1])
                    sect_summs_new.append(
                        ('Chapter {}'.format(orig - 1), chap_summ))
        elif title == "Tess of the d'Urbervilles":
            phase_found = False
            for sect_title, sect_summ in sect_summs_old:
                if sect_title.startswith('Phase'):
                    phase_found = True
                    continue
                if not phase_found:
                    continue
                if sect_title == 'Chapters I–XI':
                    continue
                sect_summs_new.append((sect_title, sect_summ))
        elif title == 'A Portrait of the Artist as a Young Man':
            sect_summs_old = remove_duplicates(sect_summs_old)
            chap1_summ = []
            for sect_title, sect_summ in sect_summs_old:
                if "Part" in sect_title:
                    chap1_summ.extend(sect_summ)
                    continue
                elif chap1_summ:
                    sect_summs_new.append(('Chapter 1', chap1_summ))
                    chap1_summ = []
                sect_summs_new.append((sect_title, sect_summ))
        elif title == 'Of Human Bondage':
            sect_summs_old = remove_duplicates(sect_summs_old)
            sect_summs_new = [(chap.replace(' and ', '-'), summ)
                              for chap, summ in sect_summs_old]
        elif title == 'The Secret Sharer':
            sect_summs_new = [(x[0].replace('Part', 'Chapter'), x[1])
                              for x in sect_summs_old
                              if x[0].startswith('Part')]
        elif title == "Moby Dick":  # TODO: scrape with less manual fixing
            for sect_title, sect_summs in sect_summs_old:
                summs_new = []
                sect_title = sect_title.replace('\xa0', '').replace(
                    ' and ', ' - ').split(", “", 1)[0].split(",“",
                                                             1)[0].strip()
                if sect_title.startswith('hapter'):
                    sect_title = 'C' + sect_title
                elif sect_title == 'Chatper 39':
                    sect_title = 'Chapter 39'
                elif sect_title == 'Chapter 50' and sect_summs_new[-1][
                        0] == 'Chapter 50':
                    sect_title = 'Chapter 51'
                elif sect_title == 'Chapter 72' and sect_summs_new[-1][
                        0] == 'Chapter 72':
                    sect_title = 'Chapter 73'
                elif sect_title.startswith('Chapters 95'):
                    sect_title = 'Chapters 95-98'
                elif sect_title.startswith('Chapters 101'):
                    sect_title = 'Chapters 101-105'
                elif sect_title.startswith('Chapters 120'):
                    sect_title = 'Chapters 120-124'
                elif sect_title == 'Chapters 10, 11, - 12':
                    continue
                elif sect_title.startswith('Chapters 26 - 27'):
                    sect_title = 'Chapters 26-27'
                elif sect_title == 'The Epilogue':
                    sect_title = 'Epilogue'

                for p in sect_summs:
                    if p == 'Summary':
                        continue
                    elif p.startswith('Analysis'):
                        break
                    else:
                        summs_new.append(p)
                if not sect_title.startswith(('C', 'Epilogue')):
                    continue
                sect_summs_new.append((sect_title, summs_new))
            sect_summs_new = remove_duplicates(sect_summs_new)
        elif title == "Gulliver's Travels":
            for sect_title, sect_summ in sect_summs_old:
                if not sect_summ:
                    continue
                if sect_title.startswith("Part I"):
                    continue
                sect_summs_new.append((sect_title, sect_summ))
        elif title == "Siddhartha":
            for sect_title, sect_summ in sect_summs_old:
                if '-' in sect_title:
                    sect_title = sect_title.split('-', 1)[-1].strip()
                else:
                    sect_title, sect_summ = sect_summ[0], sect_summ[1:]
                sect_summs_new.append((sect_title, sect_summ))
        elif title == 'Sense and Sensibility':
            sect_summs_old = sect_summs_old[0:11] + sect_summs_old[21:]
            offset = 0
            for i, (sect_title, sect_summ) in enumerate(sect_summs_old, 1):
                if sect_title == 'Chapter XIII':  # chapter 12 is missing
                    offset = 1
                sect_title = 'Chapter {}'.format(i + offset)
                sect_summs_new.append((sect_title, sect_summ))
        elif title == 'White Fang':
            for sect_title, sect_summ in sect_summs_old:
                nums = sect_title.split(' ', 1)[0]
                part, chapter = nums.split('.', 1)
                sect_title = 'Part {}: Chapter {}'.format(part, chapter)
                sect_summs_new.append((sect_title, sect_summ))
        elif title == 'Bleak House':
            sect_summs_new = remove_duplicates(sect_summs_old)
            assert sect_summs_new[0][0] == 'Author’s Preface'
            sect_summs_new[0] = ('Preface', sect_summs_new[0][1])
            assert sect_summs_new[19][0] == 'Chapter XIX'
            text = sect_summs_new[20][1]
            text[0] = 'I' + text[0]
            XIX_new = (sect_summs_new[19][0], text)
            sect_summs_new[19] = XIX_new
            del sect_summs_new[20]
        elif title == 'Notes from the Underground':
            sect_summs_new = [(x[0].replace(' C', ': C'), x[1])
                              for x in sect_summs_old]
        elif title == "Middlemarch":
            sect_summs_new = remove_duplicates(sect_summs_old)
            sect_summs_new = [(chap.split('(', 1)[0].strip(), summ)
                              for chap, summ in sect_summs_new]
        elif title == "Walden":
            sect_summs_new = remove_duplicates(sect_summs_old)
            sect_summs_new = [(chap.split('‘', 1)[0].strip(), summ)
                              for chap, summ in sect_summs_new]
            sect_summs_new[-1] = ('Chapter 17-18', sect_summs_new[-1][1])
        elif title == 'A Tale of Two Cities':
            sect_summs_new = []
            for sect_title, sect_summ in sect_summs_old:
                sect_title = re.sub(r' ?C', ': C', sect_title)
                sect_summs_new.append((sect_title, sect_summ))
            sect_summs_new = remove_duplicates(sect_summs_new)
        elif title == 'A Christmas Carol':
            sect_summs_new = remove_duplicates(sect_summs_old)
            sect_summs_new = [(chap.split(':', 1)[0], summ)
                              for chap, summ in sect_summs_new
                              if not chap.startswith('Stave 1')]
        elif title == 'The Awakening':
            sect_summs_new = [(chap.replace('Part', 'Chapter', 1), summ)
                              for chap, summ in sect_summs_old]
        elif title == 'Around the World in Eighty Days':
            sect_summs_old = remove_duplicates(sect_summs_old)
            sect_summs_new = [(chap.split(':', 1)[0], summ)
                              for chap, summ in sect_summs_old
                              if chap.startswith('Chapter')]
            assert sect_summs_new[1][0] == 'Chapter 1'
            del sect_summs_new[1]
        elif title == "Fathers and Sons":
            sect_summs_old = remove_duplicates(sect_summs_old)
            for sect_title, sect_summ in sect_summs_old:
                if sect_title.endswith('Analysis'):
                    continue
                elif sect_title == 'Chapter 16':  # this one is analysis
                    continue
                elif sect_title == 'Chapters 16':
                    sect_title = 'Chapter 16'
                sect_summs_new.append((sect_title, sect_summ))
        elif title == "The Yellow Wallpaper":
            all_sects = [x[1] for x in book_summ.section_summaries]
            all_sects = [sublist for l in all_sects for sublist in l]
            sect_summs_new = [('book', all_sects)]
        elif title == 'Anna Karenina':
            sect_summs_new = [(chap.replace(' section', ': Chapter'), summ)
                              for chap, summ in sect_summs_old]
        elif title == 'The Metamorphosis':
            sect_summs_new = [(chap.replace('Section', 'Part'), summ)
                              for chap, summ in sect_summs_old]
        elif title in set([
                'Vanity Fair', 'Mansfield Park', 'Washington Square',
                'The Deerslayer'
        ]):
            sect_summs_new = add_dash_numwords(sect_summs_old)
            if title == 'The Deerslayer':
                assert sect_summs_new[0][0] == sect_summs_new[1][0]
                sect_summs_new.pop(0)
                assert sect_summs_new[-6][
                    0] == 'Chapters Twenty-and - Twenty-One'
                sect_summs_new[-6] = ('Chapter 20-21', sect_summs_new[-6][1])
        elif title == "The Jungle":
            sect_summs_new = [(chap.replace('Twenty ', 'Twenty-'), summ)
                              for chap, summ in sect_summs_old]
        elif title == 'The Mayor of Casterbridge':
            sect_summs_new = add_dash_numwords(sect_summs_old)
            assert sect_summs_new[4][0] == 'Twelve Thirteen - Fourteen'
            sect_summs_new[4] = ('Chapter 12-14', sect_summs_new[4][1])
        elif title == 'Persuasion':
            assert sect_summs_old[0][0].startswith('Volume')
            sect_summs_old = sect_summs_old[1:]
            sect_summs_old = sorted(sect_summs_old,
                                    key=lambda x: int(x[0].rsplit('-', 1)[-1])
                                    )  # sort by page number
            offset = 0
            for sect_title, sect_summ in sect_summs_old:
                if sect_title == "Chapter I, pages 115-122":
                    offset = 12
                sect_title = sect_title.split(',', 1)[0]
                if offset:
                    chap = roman_to_int(sect_title.rsplit(' ', 1)[-1])
                    sect_title = 'Chapter {}'.format(chap + offset)
                sect_summs_new.append((sect_title, sect_summ))
        elif title == 'Far from the Madding Crowd':
            sect_summs_new = [(x[0].replace('Ch.', 'Chapter').split(':', 1)[0], x[1]) \
                              for x in sect_summs_old]
        elif title == 'The Turn of the Screw':  # novelguide has 2 books with same title, use the other one
            continue
        elif title == 'Turn of the Screw':
            sect_summs_new = [(x[0].replace('Section', 'Chapter'), x[1])
                              for x in sect_summs_old]
        elif title == "The Adventures of Huckleberry Finn":
            for sect_title, sect_summ in sect_summs_old:
                if sect_title == 'Chapter 1-3':
                    sects = sect_summ[0].split("Chapter")
                    for sect in sects:
                        if not sect:
                            continue
                        st, ss = sect.split(':', 1)
                        sect_summs_new.append(
                            ('Chapter {}'.format(st.strip()), ss))
                else:
                    sect_summs_new.append((sect_title, sect_summ))
        elif title == "The Picture of Dorian Gray":
            sect_summs_new = sect_summs_old
            sect_summs_new[0] = ('Chapters 1-3', sect_summs_new[0][1])
        elif title == 'The Scarlet Pimpernel':
            sect_summs_new = sect_summs_old
            sect_summs_new[3] = ('Chapter III', sect_summs_new[3][1])
            sect_summs_new[7] = ('Chapter VII', sect_summs_new[7][1])
            sect_summs_new.pop(0)
        elif title == 'Jude the Obscure':
            for sect_title, sect_summ in sect_summs_old:
                if not sect_summ:
                    continue
                if sect_title == 'At Marygreen':
                    sect_title = 'I–1'
                elif sect_title == 'At Melchester':
                    sect_title = 'III–1'
                elif sect_title == 'At Christminster Again':
                    sect_title = 'VI–1'

                part_, chap = sect_title.split('–', 1)
                if chap == '1':  # the roman numerals are inaccurate on the original pages
                    part = part_
                sect_title = 'Part {}: Chapter {}'.format(part, chap)
                sect_summs_new.append((sect_title, sect_summ))
        elif title == 'Ulysses':
            sect_summs_new = [('Chapter {}'.format(x[0].rsplit(' ',
                                                               1)[-1]), x[1])
                              for x in sect_summs_old]
        elif title == 'The American':
            sect_summs_new = [(x[0].replace('Book', 'Chapter'), x[1])
                              for x in sect_summs_old]
        elif title == 'The Brothers Karamazov':
            book = 0
            for sect_title, sect_summ in sect_summs_old:
                if not sect_title.startswith('Chapter'):
                    continue
                if sect_title == 'Chapter 1':
                    book += 1
                sect_title = "Book {}: {}".format(book, sect_title)
                sect_summs_new.append((sect_title, sect_summ))
        elif title == "Winesburg, Ohio":
            sect_summs_new = [(x[0].replace('&', ',').replace('VI', 'IV').replace('Godliness', 'Godliness Part'), \
                               x[1]) for x in sect_summs_old]
        elif title == "War and Peace":
            ssn = [(standardize_sect_title(x[0], False), x[1])
                   for x in sect_summs_old]
            book_summ_new = book_summ._replace(section_summaries=ssn)
        elif title == 'The Hound of the Baskervilles':
            for sect_title, sect_summ in sect_summs_old:
                if ' - ' in sect_title:
                    sect_title = sect_title.split(' - ')[0]
                elif sect_title == 'Chapter 10' and sect_summs_new[-1][
                        0] == 'Chapter 15':
                    continue
                elif not sect_title.startswith('C'):
                    continue
                sect_summs_new.append((sect_title, sect_summ))
        else:
            sect_summs_new = sect_summs_old

        if sect_summs_new:
            sect_summs_new = [(standardize_sect_title(x[0]), x[1])
                              for x in sect_summs_new]
            title_new = standardize_title(title)
            if title_new != title:
                print('renamed {} -> {}'.format(title, title_new))
                title = title_new
            book_summ_new = book_summ._replace(
                section_summaries=sect_summs_new, title=title_new)
        book_summaries_new.append(book_summ_new)
        if start:  # for debugging
            print(title, idx)
            assert title == book_summaries_new[-1].title
            for i, x in enumerate(book_summaries_new[-1].section_summaries, 1):
                print(x[0] or x[1][0][0:100] + ' index ' + str(i))
            input()
    return book_summaries_new