def postprocess_page(self, page: Page):
    """Run HTML post-processing over a page's main HTML and any extra HTML parts.

    Stores the results on ``page.postprocessed_html`` and
    ``page.additional_postprocessed_html``.
    """
    # Main page HTML
    raw_html = page.original_html.decode()
    page.postprocessed_html = self.postprocess_page_html(page, raw_html)
    # Any additional HTML pages attached to this page, postprocessed the same way
    page.additional_postprocessed_html = [
        self.postprocess_page_html(page, extra_html.decode())
        for extra_html in page.additional_html_pages
    ]
def _process_page(self, page: Page, local_journal_handler, single=False):
    """Parse a page, optionally rewrite its HTML, persist the originals, and free memory.

    :param page: the page to process (with its pictures and maps)
    :param local_journal_handler: handler used to save the original sources
    :param single: whether this is a single-page journal
    :return: the processed page
    """
    parsed = self.page_crawler.parse_page(page, single=single)
    # Rewrite the HTML for local browsability when the option is enabled
    if self.postprocess_html:
        self.html_postprocessor.postprocess_page(parsed)
    # Persist the original source and the freshly formatted resources
    parsed.save_originals(local_journal_handler)
    # Drop the in-memory resources now that they have been saved
    parsed.clear_resources()
    return parsed
def process_title(self, title, page: Page, include_metadata=True, include_title=False):
    """Extract metadata and an optional heading from the title element, then
    consume the title's main content block by block.

    :param title: the title element to process
    :param page: the page receiving the extracted content
    :param include_metadata: set the page title and distance statement
    :param include_title: add the title text as a Heading content block
    """
    if include_metadata:
        page.title = self.get_title(title)
        page.parse_distance_statement(self.get_distance_statement(title))
    if include_title:
        heading_text = self.get_title(title)
        if heading_text:
            page.add_content(Heading(heading_text))
    # Keep consuming blocks while content remains and processing succeeds
    main_content = self.get_main_content(title)
    while main_content and self.process_next_bloc(main_content, page):
        pass
def _process_journal(self, journal: Journal):
    """Process every page of a journal, then serialize and save it.

    Multi-page journals (those with a ToC) have each ToC entry processed
    individually; journals without a ToC are treated as a single page built
    from the title-page HTML.

    :param journal: the journal to process
    :return: the processed journal
    """
    journal_id = journal.journal_id
    # Handle standard multi-page journals with a ToC
    if journal.toc:
        # info, not warning: multi-page processing is the normal path
        # (the single-page fallback below is the unusual case)
        log_handler.log.info(
            'Processing multiple pages for {0}'.format(journal_id))
        # Iterate over all the retrieved pages and pull them separately.
        page_count = 0
        for toc in journal.toc:
            if toc.url:
                # NOTE(review): the single-page branch below passes a Page
                # instance to _process_page, while this call passes only the
                # original id — confirm this class's _process_page accepts
                # an id here.
                page = self._process_page(toc.original_id)
                toc.set_page(page)
                # Calculate percentage per page, to keep consumers updated
                self.progress_update(
                    ((page_count / len(journal.toc)) * 80) + 10)
            page_count += 1
    else:
        log_handler.log.warning(
            'Processing single page for {0}'.format(journal_id))
        # Handle single-page journals/articles that have all the content
        # on the title page
        journal.single_page = True
        # Create a single new page and set with the title page html
        content_page = Page(journal_id=journal_id,
                            original_id=journal_id,
                            original_html=journal.original_html)
        # Process it as a normal page and add it to the ToC
        content_page = self._process_page(content_page, single=True)
        journal.add_single_page(content_page)
        self.progress_update(percent=90)
    # Save and clear any resources not associated with pages
    journal.save_resources(self.output_handler)
    journal.clear_resources()
    # Finally serialize the parsed data structure and output
    log_handler.log.info('Serializing data for {0}'.format(journal_id),
                         extra={'journal_id': journal_id})
    self.output_handler.serialize_and_save_journal(journal)
    self.progress_update(percent=100)
    log_handler.log.info('Completed {0}'.format(journal_id),
                         extra={'journal_id': journal_id})
    return journal
def retrieve_page(self, journal_id, original_id, url):
    """Download a journal page's HTML (plus any additional part pages) and
    return the resulting Page.

    :param journal_id: id of the journal the page belongs to
    :param original_id: the page's original identifier
    :param url: URL of the main page to download
    :return: a Page populated with the main HTML and any additional HTML parts
    """
    log_handler.log.info('Retrieving page {0} for journal {1}'.format(
        original_id, journal_id), extra={'journal_id': journal_id})
    self.update_progress()
    # Download the page HTML
    page_html = self.retriever.retrieve_page(
        url, original_id,
        error_message="Error code in retrieving journal page html: {0}")
    self.update_progress()
    page = Page(journal_id, original_id, page_html)
    # Download any additional other pages; use a distinct loop name so the
    # `url` parameter is not shadowed (the original code rebound it)
    for additional_url, part_id in self.find_additional_pages(page_html):
        log_handler.log.info(
            'Additional page for {0} detected: {1}'.format(
                original_id, additional_url),
            extra={'journal_id': journal_id})
        self.update_progress()
        # Lack of a parsed part number means this has already been
        # postprocessed (and then just use the whole path)
        additional_filename = '{0}_{1}'.format(
            original_id,
            part_id) if part_id else additional_url.replace('.html', '')
        additional_html = self.retriever.retrieve_page(
            additional_url, additional_filename,
            error_message=
            "Error code in retrieving additional page html: {0}")
        page.add_additional_html(additional_html)
        self.update_progress()
    return page
def process_next_bloc(self, main_content, page: Page):
    """Consume the next content block (leading text plus the next element)
    from main_content and append it to the page.

    :return: False once no further element remains, True otherwise
    """
    leading_text, next_elem = self.get_text_before_next_element(main_content)
    if leading_text:
        page.add_content(TextBlock(leading_text))
    if not next_elem:
        # Nothing left to consume
        return False
    if self.is_picture_or_map(next_elem):
        # Maps and pictures are parsed into their dedicated content blocks
        media_block = (self.process_map(next_elem)
                       if self.is_map(next_elem)
                       else self.process_pic(next_elem))
        if media_block:
            page.add_content(media_block)
    # Drop everything consumed so far, including the element itself
    self.remove_everything_before(next_elem, including=True)
    self.update_progress()
    return True
def _process_journal(self, journal: Journal, local_journal_handler, from_page):
    """Retrieve and process all pages of a journal, resuming at ``from_page``.

    Progress is reported on a 10-90 scale while pages are processed (one
    update before and one halfway through each page), ending at 90 in both
    the multi-page and single-page branches.

    :param journal: the journal to process
    :param local_journal_handler: handler used to save page originals
    :param from_page: 1-based page number to resume from (earlier pages skipped)
    :return: the processed journal
    """
    journal_id = journal.journal_id
    # Handle standard multi-page journals with a ToC
    if journal.toc:
        log_handler.log.info(
            'Processing multiple pages for {0}'.format(journal_id))
        total_pages = len(journal.toc)
        # Iterate over all the retrieved pages and pull them separately.
        page_num = 1
        for toc in journal.toc:
            # Skip pages previously downloaded if specified
            if page_num < from_page:
                log_handler.log.info('Skipping page {0}'.format(page_num))
                page_num += 1
                continue
            if toc.url:
                log_handler.log.info(
                    'Processing content {0} of {1}'.format(
                        page_num, total_pages))
                page = self.page_crawler.retrieve_page(
                    journal_id, toc.original_id, toc.url)
                toc.set_page(page)
                # Report pages completed so far.  Using (page_num - 1) and
                # (page_num - 0.5) keeps progress strictly within 10-90; the
                # previous page_num-based formula reported up to
                # 90 + 40/total, exceeding 100 for small ToCs.
                self.progress_update(
                    (((page_num - 1) / total_pages) * 80) + 10)
                self._process_page(toc.page, local_journal_handler)
                # Half-page update, to keep consumers updated mid-page
                self.progress_update(
                    (((page_num - 0.5) / total_pages) * 80) + 10)
            page_num += 1
    else:
        log_handler.log.warning(
            'Processing single page for {0}'.format(journal_id))
        # Handle single-page journals/articles that have all the content
        # on the title page
        journal.single_page = True
        # Create a single new page and set with the title page html
        content_page = Page(journal_id=journal_id,
                            original_id=journal_id,
                            original_html=journal.original_html)
        # Process it as a normal page and add it to the ToC
        content_page = self._process_page(content_page,
                                          local_journal_handler,
                                          single=True)
        journal.add_single_page(content_page)
    # Page processing is complete in either branch: report the 90% milestone
    # (previously only the single-page branch ever reported it)
    self.progress_update(90)
    log_handler.log.info('Completed {0}'.format(journal_id),
                         extra={'journal_id': journal_id})
    return journal