Example #1
0
def test_get_toc_subchapters(doc1_with_toc_path, downloads_dir):
    """Page ranges must match title lengths for chapters and their subchapters."""
    with PDFParser(doc1_with_toc_path, directory=downloads_dir) as pdfparser:
        toc = pdfparser.get_toc(subchapters=True)
        for chapter in toc:
            children = chapter.get('children')
            if children:
                # Chapter has subchapters: check each child instead.
                for subchapter in children:
                    _check_pagerange_matches_title_len(subchapter)
            else:
                _check_pagerange_matches_title_len(chapter)
Example #2
0
def test_split_chapters(doc1_with_toc_path, downloads_dir):
    """Each split chapter is an existing .pdf whose length matches its title."""
    with PDFParser(doc1_with_toc_path, directory=downloads_dir) as pdfparser:
        for chapter in pdfparser.split_chapters():
            path = chapter['path']
            assert path.endswith('.pdf'), 'wrong extension -- expected .pdf'
            assert os.path.exists(path), 'missing split PDF file'
            _check_path_matches_title_len(chapter)
def split_pdfs(spec_lang_code):
    """Split the converted 21CSGuide PDF for `spec_lang_code` into subchapters.

    Looks up the language code and page ranges in CHANNEL_SPEC, then splits
    the converted PDF found in DOWNLOADS_FOLDER using those page ranges.

    Returns the list of chapter dicts produced by the parser, or an empty
    list when the source PDF is missing.
    """
    language_code = CHANNEL_SPEC[spec_lang_code]['language_code']
    page_ranges = CHANNEL_SPEC[spec_lang_code]['page_ranges']
    file_path = os.path.join(DOWNLOADS_FOLDER,
                             f'21CSGuide_{language_code}_converted.pdf')

    # BUG FIX: `chapters` was previously only assigned inside the if-branch,
    # so a missing PDF raised UnboundLocalError at the return statement.
    chapters = []
    if os.path.exists(file_path):
        print(f'found file at {file_path}, splitting pdf')
        with PDFParser(file_path) as pdfparser:
            chapters = pdfparser.split_subchapters(jsondata=page_ranges)
    else:
        print(f'pdf not found at {file_path}')
    return chapters
Example #4
0
def test_split_chapters3(doc3_with_toc_path, downloads_dir):
    """Each split chapter exists as a .pdf and has the expected page count."""
    # print(doc3_with_toc_path)
    with PDFParser(doc3_with_toc_path, directory=downloads_dir) as pdfparser:
        chapters = pdfparser.split_chapters()
        # pprint(chapters)
        for chapter in chapters:
            chapter_path = chapter['path']
            assert chapter_path.endswith('.pdf'), 'wrong extension -- expected .pdf'
            assert os.path.exists(chapter_path), 'missing split PDF file'
        # BUG FIX: these fixed-index length checks used to sit inside the loop
        # above, re-running identically once per chapter; run each check once.
        expected_lengths = [1, 1, 2, 206, 9, 9]
        assert len(chapters) == len(expected_lengths), 'unexpected number of chapters'
        for chapter, expected in zip(chapters, expected_lengths):
            assert _get_pdf_len(chapter) == expected, \
                'wrong length for ch ' + str(chapter)
Example #5
0
def split_chapters(lang_code):
    """
    Splits the chapters for the PDFs.

    Looks up the cropped PDF path, split output directory, and page ranges
    for `lang_code` in DATA, then splits the PDF into subchapter PDFs.

    Returns the list of chapter dicts produced by the parser.
    """
    pdf = DATA[lang_code]['pdf_info']
    page_ranges = pdf['page_ranges']
    pdf_path_cropped = pdf['pdf_path_cropped']
    pdf_split_path = pdf['pdf_split_path']

    print('==> Splitting chapters for', pdf_path_cropped)
    print('====> PDF_PATH_CROPPED', pdf_path_cropped, 'PDF_SPLIT_PATH',
          pdf_split_path)
    with PDFParser(pdf_path_cropped, directory=pdf_split_path) as pdfparser:
        chapters = pdfparser.split_subchapters(jsondata=page_ranges)
        # for chapter in chapters:
        #     print(chapter)
    # Use an f-string for consistency with the rest of the file.
    print(f'==> DONE splitting chapters for {lang_code} PDF.')
    return chapters
Example #6
0
def test_split_subchapters3(doc3_with_toc_path, downloads_dir):
    """Chapter 3 must split into 17 subchapters with the expected page counts."""
    with PDFParser(doc3_with_toc_path, directory=downloads_dir) as pdfparser:
        chapters = pdfparser.split_subchapters()
        ch3 = chapters[3]
        assert 'children' in ch3, 'no subchapters found in  ch3'
        assert len(ch3['children']) == 17, 'wrong number of subchapters'
        subchs = ch3['children']
        # Expected page count for each of the 17 subchapters, in order.
        expected_lengths = [6, 8, 14, 14, 11, 13, 13, 10, 13,
                            15, 16, 7, 18, 20, 15, 8, 5]
        for subch, expected in zip(subchs, expected_lengths):
            assert _get_pdf_len(subch) == expected, \
                'wrong length for subch ' + str(subch)
Example #7
0
def test_split_subchapters(doc1_with_toc_path, downloads_dir):
    """First four chapters are flat; chapters 4 and 5 split into children."""
    with PDFParser(doc1_with_toc_path, directory=downloads_dir) as pdfparser:
        chapters = pdfparser.split_subchapters()

        for ch in chapters[:4]:
            assert 'children' not in ch, 'first four chapters have no subchapters...'
        # Expected page counts for the four flat chapters, in order.
        for idx, num_pages in enumerate([1, 1, 2, 3]):
            assert _get_pdf_len(chapters[idx]) == num_pages, \
                'wrong num pages in ' + str(chapters[idx])

        ch4 = chapters[4]
        assert 'children' in ch4, 'no children'
        assert len(ch4['children']) == 2
        for child in ch4['children']:
            assert _get_pdf_len(child) == 1, 'wrong num pages in ' + str(child)

        ch5 = chapters[5]
        assert 'children' in ch5, 'no children'
        assert len(ch5['children']) == 3
        for child in ch5['children']:
            assert _get_pdf_len(child) == 1, 'wrong num pages in ' + str(child)
Example #8
0
def test_get_toc(doc1_with_toc_path, downloads_dir):
    """Every top-level TOC entry's page range must match its title length."""
    with PDFParser(doc1_with_toc_path, directory=downloads_dir) as pdfparser:
        for entry in pdfparser.get_toc():
            _check_pagerange_matches_title_len(entry)
    def construct_channel(self, **kwargs):
        """Build the Goalkicker channel tree.

        Scrapes the Goalkicker index page for book pages, splits each book's
        PDF into per-chapter PDFs, and adds one TopicNode per book containing
        one DocumentNode per chapter.

        Returns the populated channel object from `self.get_channel`.
        """
        channel = self.get_channel(**kwargs)

        # Soupify goalkicker main page
        gk_url = 'https://' + self.channel_info['CHANNEL_SOURCE_DOMAIN'] + '/'
        gk_soup = get_soup(gk_url)

        # Get urls for each goalkicker book
        els_with_page_urls = gk_soup.find_all(class_='bookContainer')
        page_urls = [
            gk_url + el.find('a')['href'] for el in els_with_page_urls
        ]

        for book_counter, page_url in enumerate(page_urls):
            # Soupify book page
            page_soup = get_soup(page_url)

            # Extract and construct book info
            book_info = parse_book_info(page_soup)
            book_info['absolute_url'] = page_url + book_info['relative_url']

            # Add book to channel tree
            book_node_source_id = 'topic/' + book_info['subject']
            book_node = TopicNode(title=book_info['subject'],
                                  source_id=book_node_source_id)
            channel.add_child(book_node)

            # Use separate download directory for each book's pdf chunks.
            # Avoids name conflicts between books.
            download_dir = 'downloads/book_' + str(book_counter).rjust(
                2, '0') + '--' + book_info['subject']
            # Get chapters info
            pdf_path = book_info['absolute_url']
            with PDFParser(pdf_path, directory=download_dir) as pdfparser:
                chapters = pdfparser.split_chapters()

            # Add chapter nodes (loop index was unused, so plain iteration)
            for chapter in chapters:
                chapter_node_source_id = book_info[
                    'source_id'] + '/' + chapter['title']

                if chapter['title'].startswith('Chapter'):
                    # BUG FIX: raw string for the regex -- '\d' in a non-raw
                    # literal is an invalid escape (SyntaxWarning on 3.12+).
                    chapter_num = re.search(r'Chapter (\d+)',
                                            chapter['title']).group(1)
                    chapter_description = 'Chapter ' + chapter_num + ' of the Goalkicker book on ' + book_info[
                        'subject']
                else:
                    chapter_description = '"' + chapter[
                        'title'] + '" section of the Goalkicker book on ' + book_info[
                            'subject']

                chapter_node = DocumentNode(
                    title=chapter['title'],
                    description=chapter_description,
                    source_id=chapter_node_source_id,
                    license=get_license('CC BY-SA',
                                        copyright_holder='Stack Overflow'),
                    language='en',
                    files=[DocumentFile(path=chapter['path'], language='en')],
                )
                book_node.add_child(chapter_node)

        return channel