def postprocess_page(self, page: Page):
        """Run HTML post-processing over a page and its extra HTML parts.

        Decodes the raw HTML bytes held on *page*, pushes them through
        postprocess_page_html, and stores the results back on the page
        (postprocessed_html / additional_postprocessed_html).
        """
        # Main page HTML
        raw_html = page.original_html.decode()
        page.postprocessed_html = self.postprocess_page_html(page, raw_html)

        # Any additional HTML parts attached to the page
        page.additional_postprocessed_html = [
            self.postprocess_page_html(page, extra.decode())
            for extra in page.additional_html_pages
        ]
# --- Scraped snippet marker: "Example #2" (site vote count: 0) ---
    def _process_page(self, page: Page, local_journal_handler, single=False):
        """Parse a page (with its pics and maps), optionally post-process
        its HTML for local browsability, then persist originals and drop
        in-memory resources.

        Returns the processed Page.
        """
        # Parse the page content plus associated pics and maps
        parsed = self.page_crawler.parse_page(page, single=single)

        # Optional HTML post-processing for local browsability
        if self.postprocess_html:
            self.html_postprocessor.postprocess_page(parsed)

        # Persist original source and newly formatted resources, then
        # release the resources loaded into the page
        parsed.save_originals(local_journal_handler)
        parsed.clear_resources()
        return parsed
    def process_title(self,
                      title,
                      page: Page,
                      include_metadata=True,
                      include_title=False):
        """Extract metadata, an optional heading, and content blocs from a
        journal title element into *page*.
        """
        # Title text and distance statement feed the page metadata
        if include_metadata:
            page.title = self.get_title(title)
            page.parse_distance_statement(self.get_distance_statement(title))

        # Optionally emit the title itself as a heading content block
        if include_title:
            heading_text = self.get_title(title)
            if heading_text:
                page.add_content(Heading(heading_text))

        # main_content is never reassigned, so the original loop could only
        # exit when process_next_bloc reported no more content; express that
        # directly as a guard plus a draining while-loop.
        main_content = self.get_main_content(title)
        if main_content:
            while self.process_next_bloc(main_content, page):
                pass
    def _process_journal(self, journal: Journal):
        """Crawl and persist every page of *journal*, then serialize it.

        Journals with a ToC are processed page by page; journals without
        one are treated as single-page (all content on the title page).
        Emits progress callbacks throughout and returns the journal.
        """
        journal_id = journal.journal_id

        # Handle standard multi-page journals with a ToC
        if journal.toc:
            # NOTE(review): this routine event is logged at warning level
            # (both branches are) - confirm that is intentional.
            log_handler.log.warning(
                'Processing multiple pages for {0}'.format(journal_id))

            # Iterate over all the retrieved pages and pull them separately.
            page_count = 0
            for toc in journal.toc:
                if toc.url:
                    # NOTE(review): _process_page receives toc.original_id
                    # here, but the single-page branch below passes a Page
                    # object - one of the two call sites looks wrong; verify
                    # against _process_page's signature.
                    page = self._process_page(toc.original_id)
                    toc.set_page(page)

                # Calculate percentage per page, to keep consumers updated
                # (progress spans 10-90% across the ToC pages; page_count
                # advances even for entries with no URL)
                self.progress_update(((page_count / len(journal.toc)) * 80) +
                                     10)
                page_count += 1

        else:
            log_handler.log.warning(
                'Processing single page for {0}'.format(journal_id))

            # Handle single-page journals/articles that have all the content on the title page
            journal.single_page = True

            # Create a single new page and set with the title page html
            content_page = Page(journal_id=journal_id,
                                original_id=journal_id,
                                original_html=journal.original_html)

            # Process it as a normal page and add it to the ToC
            content_page = self._process_page(content_page, single=True)
            journal.add_single_page(content_page)
            self.progress_update(percent=90)

        # Save and clear any resources not associated with pages
        journal.save_resources(self.output_handler)
        journal.clear_resources()

        # Finally serialize the parsed data structure and output
        log_handler.log.info('Serializing data for {0}'.format(journal_id),
                             extra={'journal_id': journal_id})
        self.output_handler.serialize_and_save_journal(journal)

        self.progress_update(percent=100)
        log_handler.log.info('Completed {0}'.format(journal_id),
                             extra={'journal_id': journal_id})
        return journal
    def retrieve_page(self, journal_id, original_id, url):
        """Download a journal page's HTML, plus any additional HTML parts
        it references, and return them wrapped in a Page.

        Parameters
        ----------
        journal_id : identifier of the owning journal (used for logging).
        original_id : identifier of the page on the source site.
        url : location of the page HTML to download.
        """
        log_handler.log.info('Retrieving page {0} for journal {1}'.format(
            original_id, journal_id),
                             extra={'journal_id': journal_id})
        self.update_progress()

        # Download the page HTML
        page_html = self.retriever.retrieve_page(
            url,
            original_id,
            error_message="Error code in retrieving journal page html: {0}")
        self.update_progress()

        page = Page(journal_id, original_id, page_html)

        # Download any additional other pages. The loop variable is named
        # additional_url so it no longer shadows the `url` parameter (the
        # original code reused `url` here, hiding the main-page URL).
        for additional_url, part_id in self.find_additional_pages(page_html):
            log_handler.log.info(
                'Additional page for {0} detected: {1}'.format(
                    original_id, additional_url),
                extra={'journal_id': journal_id})
            self.update_progress()

            # Lack of a parsed part number means this has already been
            # postprocessed (and then just use the whole path)
            if part_id:
                additional_filename = '{0}_{1}'.format(original_id, part_id)
            else:
                additional_filename = additional_url.replace('.html', '')

            additional_html = self.retriever.retrieve_page(
                additional_url,
                additional_filename,
                error_message=
                "Error code in retrieving additional page html: {0}")
            page.add_additional_html(additional_html)
            self.update_progress()

        return page
    def process_next_bloc(self, main_content, page: Page):
        """Consume the next content bloc (leading text plus an optional
        picture/map element) from main_content into *page*.

        Returns True when an element was consumed (more content may
        follow), False when the content stream is exhausted.
        """
        text, element = self.get_text_before_next_element(main_content)
        if text:
            page.add_content(TextBlock(text))

        # No trailing element means there is nothing left to consume
        if not element:
            return False

        if self.is_picture_or_map(element):
            # Maps and pictures produce different block types; either may
            # legitimately yield nothing, in which case no block is added.
            block = (self.process_map(element) if self.is_map(element)
                     else self.process_pic(element))
            if block:
                page.add_content(block)

        self.remove_everything_before(element, including=True)
        self.update_progress()
        return True
# --- Scraped snippet marker: "Example #7" (site vote count: 0) ---
    def _process_journal(self, journal: Journal, local_journal_handler,
                         from_page):
        """Retrieve and process every page of *journal*, honouring a resume
        point (*from_page*) and emitting progress updates throughout.

        Multi-page journals are driven by their ToC; journals without one
        are handled as single-page. Returns the processed journal.
        """
        journal_id = journal.journal_id

        if journal.toc:
            # Standard multi-page journal with a table of contents
            log_handler.log.info(
                'Processing multiple pages for {0}'.format(journal_id))

            total = len(journal.toc)
            # Iterate over all the retrieved pages and pull them separately.
            for page_num, toc in enumerate(journal.toc, start=1):
                # Skip pages previously downloaded if specified
                if page_num < from_page:
                    log_handler.log.info('Skipping page {0}'.format(page_num))
                    continue

                # Entries without a URL carry no content to fetch
                if not toc.url:
                    continue

                log_handler.log.info(
                    'Processing content {0} of {1}'.format(page_num, total))

                toc.set_page(self.page_crawler.retrieve_page(
                    journal_id, toc.original_id, toc.url))

                # Progress spans 10-90%; report in half-page steps so
                # consumers see movement between retrieve and process
                self.progress_update(((page_num / total) * 80) + 10)
                self._process_page(toc.page, local_journal_handler)
                self.progress_update((((page_num + 0.5) / total) * 80) + 10)
        else:
            log_handler.log.warning(
                'Processing single page for {0}'.format(journal_id))

            # Single-page journals/articles keep all content on the title page
            journal.single_page = True

            # Wrap the title-page HTML in a fresh Page and process it as a
            # normal page, then attach it to the journal
            content_page = Page(journal_id=journal_id,
                                original_id=journal_id,
                                original_html=journal.original_html)
            journal.add_single_page(
                self._process_page(content_page,
                                   local_journal_handler,
                                   single=True))

            # Jump progress near completion for the lone page
            self.progress_update(90)

        log_handler.log.info('Completed {0}'.format(journal_id),
                             extra={'journal_id': journal_id})
        return journal