def test_get_toc_subchapters(doc1_with_toc_path, downloads_dir):
    with PDFParser(doc1_with_toc_path, directory=downloads_dir) as pdfparser:
        chapters_toc = pdfparser.get_toc(subchapters=True)
        for chapter_dict in chapters_toc:
            if 'children' in chapter_dict and chapter_dict['children']:
                for subchapter_dict in chapter_dict['children']:
                    _check_pagerange_matches_title_len(subchapter_dict)
            else:
                _check_pagerange_matches_title_len(chapter_dict)
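# _check_pagerange_matches_title_len is referenced by the tests but not defined
# in this file. A minimal sketch of what it is assumed to do: the test fixtures
# appear to encode the expected page count in each chapter title (the format
# "... (N pages)" used below is an assumption), so the helper parses that number
# and compares it against the TOC entry's page range. The page_start/page_end
# keys match the dicts returned by PDFParser.get_toc().
import re

def _check_pagerange_matches_title_len(chapter_dict):
    match = re.search(r'\((\d+)\s+pages?\)', chapter_dict['title'])
    if match:
        expected_len = int(match.group(1))
        actual_len = chapter_dict['page_end'] - chapter_dict['page_start']
        assert actual_len == expected_len, \
            'page range does not match title for ' + str(chapter_dict)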
def test_split_chapters(doc1_with_toc_path, downloads_dir):
    with PDFParser(doc1_with_toc_path, directory=downloads_dir) as pdfparser:
        chapters = pdfparser.split_chapters()
        # pprint(chapters)
        for chapter in chapters:
            chapter_path = chapter['path']
            assert chapter_path.endswith('.pdf'), 'wrong extension -- expected .pdf'
            assert os.path.exists(chapter_path), 'missing split PDF file'
            _check_path_matches_title_len(chapter)
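# _get_pdf_len and _check_path_matches_title_len are also assumed rather than
# shown. Plausible sketches: open the split PDF on disk and count its pages,
# then (for the title check) compare against the "(N pages)" count assumed to
# be embedded in the fixture's chapter titles. The PyPDF2 usage below is an
# assumption; the actual helpers may use a different PDF reader.
from PyPDF2 import PdfFileReader

def _get_pdf_len(chapter_dict):
    with open(chapter_dict['path'], 'rb') as pdf_file:
        return PdfFileReader(pdf_file).getNumPages()

def _check_path_matches_title_len(chapter_dict):
    match = re.search(r'\((\d+)\s+pages?\)', chapter_dict['title'])
    if match:
        assert _get_pdf_len(chapter_dict) == int(match.group(1)), \
            'wrong number of pages in ' + str(chapter_dict)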
def split_pdfs(spec_lang_code):
    language_code = CHANNEL_SPEC[spec_lang_code]['language_code']
    page_ranges = CHANNEL_SPEC[spec_lang_code]['page_ranges']
    file_path = os.path.join(DOWNLOADS_FOLDER,
                             f'21CSGuide_{language_code}_converted.pdf')
    if os.path.exists(file_path):
        print(f'found file at {file_path}, splitting pdf')
        with PDFParser(file_path) as pdfparser:
            chapters = pdfparser.split_subchapters(jsondata=page_ranges)
        return chapters
    else:
        # Returning None explicitly: the original returned `chapters` even when
        # the file was missing, which would raise UnboundLocalError.
        print(f'pdf not found at {file_path}')
        return None
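# The page_ranges value passed as jsondata is assumed to have the same shape as
# the TOC returned by get_toc(subchapters=True): a list of chapter dicts with
# title / page_start / page_end keys and optional nested children. A sketch of
# what one CHANNEL_SPEC entry might contain (titles and page numbers are made
# up for illustration):
SAMPLE_PAGE_RANGES = [
    {'title': 'Introduction', 'page_start': 0, 'page_end': 3},
    {'title': 'Chapter 1', 'page_start': 3, 'page_end': 20,
     'children': [
         {'title': 'Section 1.1', 'page_start': 3, 'page_end': 10},
         {'title': 'Section 1.2', 'page_start': 10, 'page_end': 20},
     ]},
]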
def test_split_chapters3(doc3_with_toc_path, downloads_dir):
    # print(doc3_with_toc_path)
    with PDFParser(doc3_with_toc_path, directory=downloads_dir) as pdfparser:
        chapters = pdfparser.split_chapters()
        # pprint(chapters)
        for chapter in chapters:
            chapter_path = chapter['path']
            assert chapter_path.endswith('.pdf'), 'wrong extension -- expected .pdf'
            assert os.path.exists(chapter_path), 'missing split PDF file'
        assert _get_pdf_len(chapters[0]) == 1, 'wrong length for ch ' + str(chapters[0])
        assert _get_pdf_len(chapters[1]) == 1, 'wrong length for ch ' + str(chapters[1])
        assert _get_pdf_len(chapters[2]) == 2, 'wrong length for ch ' + str(chapters[2])
        assert _get_pdf_len(chapters[3]) == 206, 'wrong length for ch ' + str(chapters[3])
        assert _get_pdf_len(chapters[4]) == 9, 'wrong length for ch ' + str(chapters[4])
        assert _get_pdf_len(chapters[5]) == 9, 'wrong length for ch ' + str(chapters[5])
def split_chapters(lang_code):
    """
    Splits the chapters for the PDFs.
    """
    pdf = DATA[lang_code]['pdf_info']
    page_ranges = pdf['page_ranges']
    pdf_path_cropped = pdf['pdf_path_cropped']
    pdf_split_path = pdf['pdf_split_path']
    print('==> Splitting chapters for', pdf_path_cropped)
    print('====> PDF_PATH_CROPPED', pdf_path_cropped, 'PDF_SPLIT_PATH', pdf_split_path)
    with PDFParser(pdf_path_cropped, directory=pdf_split_path) as pdfparser:
        chapters = pdfparser.split_subchapters(jsondata=page_ranges)
        # for chapter in chapters:
        #     print(chapter)
    print('==> DONE splitting chapters for {} PDF.'.format(lang_code))
    return chapters
def test_split_subchapters3(doc3_with_toc_path, downloads_dir):
    with PDFParser(doc3_with_toc_path, directory=downloads_dir) as pdfparser:
        chapters = pdfparser.split_subchapters()
        ch3 = chapters[3]
        assert 'children' in ch3, 'no subchapters found in ch3'
        assert len(ch3['children']) == 17, 'wrong number of subchapters'
        subchs = ch3['children']
        expected_lens = [6, 8, 14, 14, 11, 13, 13, 10, 13, 15, 16, 7, 18, 20, 15, 8, 5]
        for subch, expected_len in zip(subchs, expected_lens):
            assert _get_pdf_len(subch) == expected_len, \
                'wrong length for subch ' + str(subch)
def test_split_subchapters(doc1_with_toc_path, downloads_dir):
    with PDFParser(doc1_with_toc_path, directory=downloads_dir) as pdfparser:
        chapters = pdfparser.split_subchapters()
        # pprint(chapters)
        for ch in chapters[0:4]:
            assert 'children' not in ch, 'first four chapters have no subchapters'
        assert _get_pdf_len(chapters[0]) == 1, 'wrong num pages in ' + str(chapters[0])
        assert _get_pdf_len(chapters[1]) == 1, 'wrong num pages in ' + str(chapters[1])
        assert _get_pdf_len(chapters[2]) == 2, 'wrong num pages in ' + str(chapters[2])
        assert _get_pdf_len(chapters[3]) == 3, 'wrong num pages in ' + str(chapters[3])
        ch4 = chapters[4]
        assert 'children' in ch4, 'no children'
        assert len(ch4['children']) == 2
        assert _get_pdf_len(ch4['children'][0]) == 1, 'wrong num pages in ' + str(ch4['children'][0])
        assert _get_pdf_len(ch4['children'][1]) == 1, 'wrong num pages in ' + str(ch4['children'][1])
        ch5 = chapters[5]
        assert 'children' in ch5, 'no children'
        assert len(ch5['children']) == 3
        assert _get_pdf_len(ch5['children'][0]) == 1, 'wrong num pages in ' + str(ch5['children'][0])
        assert _get_pdf_len(ch5['children'][1]) == 1, 'wrong num pages in ' + str(ch5['children'][1])
        assert _get_pdf_len(ch5['children'][2]) == 1, 'wrong num pages in ' + str(ch5['children'][2])
def test_get_toc(doc1_with_toc_path, downloads_dir):
    with PDFParser(doc1_with_toc_path, directory=downloads_dir) as pdfparser:
        chapters_toc = pdfparser.get_toc()
        for chapter_dict in chapters_toc:
            _check_pagerange_matches_title_len(chapter_dict)
def construct_channel(self, **kwargs):
    channel = self.get_channel(**kwargs)

    # Soupify goalkicker main page
    gk_url = 'https://' + self.channel_info['CHANNEL_SOURCE_DOMAIN'] + '/'
    gk_soup = get_soup(gk_url)

    # Get urls for each goalkicker book
    els_with_page_urls = gk_soup.find_all(class_='bookContainer')
    page_urls = [gk_url + el.find('a')['href'] for el in els_with_page_urls]

    for book_counter, page_url in enumerate(page_urls):
        # Soupify book page
        page_soup = get_soup(page_url)

        # Extract and construct book info
        book_info = parse_book_info(page_soup)
        book_info['absolute_url'] = page_url + book_info['relative_url']

        # Add book to channel tree
        book_node_source_id = 'topic/' + book_info['subject']
        book_node = TopicNode(title=book_info['subject'], source_id=book_node_source_id)
        channel.add_child(book_node)

        # Use a separate download directory for each book's PDF chunks
        # to avoid name conflicts between books.
        download_dir = 'downloads/book_' + str(book_counter).rjust(2, '0') \
                       + '--' + book_info['subject']

        # Get chapters info by splitting the book PDF
        pdf_path = book_info['absolute_url']
        with PDFParser(pdf_path, directory=download_dir) as pdfparser:
            chapters = pdfparser.split_chapters()

        # Add chapter nodes
        for chapter in chapters:
            chapter_node_source_id = book_info['source_id'] + '/' + chapter['title']
            if chapter['title'].startswith('Chapter'):
                chapter_num = re.search(r'Chapter (\d+)', chapter['title']).group(1)
                chapter_description = ('Chapter ' + chapter_num
                                       + ' of the Goalkicker book on ' + book_info['subject'])
            else:
                chapter_description = ('"' + chapter['title']
                                       + '" section of the Goalkicker book on ' + book_info['subject'])
            chapter_node = DocumentNode(
                title=chapter['title'],
                description=chapter_description,
                source_id=chapter_node_source_id,
                license=get_license('CC BY-SA', copyright_holder='Stack Overflow'),
                language='en',
                files=[DocumentFile(path=chapter['path'], language='en')],
            )
            book_node.add_child(chapter_node)

    return channel
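# get_soup is used above but not defined in this file. A minimal sketch that
# matches its usage (fetch a URL, return parsed HTML), assuming the common
# requests + BeautifulSoup pattern; the actual helper may differ (caching,
# retries, session reuse, etc.):
import requests
from bs4 import BeautifulSoup

def get_soup(url):
    response = requests.get(url)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')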